#!/usr/bin/env python3
# coding: utf-8
# File: data_spider.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-10-3

import urllib.request

from lxml import etree
import pymongo

'''Disease-information collection from jib.xywy.com (the CrimeSpider name is a holdover from an earlier crime-case scraper)'''
class CrimeSpider:
    def __init__(self):
        # Connect to a local MongoDB instance; documents go into medical.data
        self.conn = pymongo.MongoClient()
        self.db = self.conn['medical']
        self.col = self.db['data']

    '''Fetch the HTML for a given url'''
    def get_html(self, url):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/51.0.2704.63 Safari/537.36'}
        req = urllib.request.Request(url=url, headers=headers)
        res = urllib.request.urlopen(req)
        # xywy.com serves GBK-encoded pages
        html = res.read().decode('gbk')
        return html

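    # A minimal sketch, not part of the original script: a retrying fetch,
    # since the crawls below touch thousands of pages and transient network
    # errors are likely. The method name and retry count are assumptions.
    def get_html_with_retry(self, url, retries=3):
        last_err = None
        for _ in range(retries):
            try:
                return self.get_html(url)
            except OSError as e:  # urllib's URLError is an OSError subclass
                last_err = e
        raise last_err
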
    '''Parse a listing page into absolute article urls'''
    def url_parser(self, content):
        selector = etree.HTML(content)
        urls = ['http://www.anliguan.com' + i for i in selector.xpath('//h2[@class="item-title"]/a/@href')]
        return urls

    '''Main crawl loop over the disease pages'''
    def spider_main(self):
        for page in range(1, 11000):
            try:
                # Each disease id exposes several sub-pages (overview, cause, ...)
                basic_url = 'http://jib.xywy.com/il_sii/gaishu/%s.htm' % page
                cause_url = 'http://jib.xywy.com/il_sii/cause/%s.htm' % page
                prevent_url = 'http://jib.xywy.com/il_sii/prevent/%s.htm' % page
                symptom_url = 'http://jib.xywy.com/il_sii/symptom/%s.htm' % page
                inspect_url = 'http://jib.xywy.com/il_sii/inspect/%s.htm' % page
                treat_url = 'http://jib.xywy.com/il_sii/treat/%s.htm' % page
                food_url = 'http://jib.xywy.com/il_sii/food/%s.htm' % page
                drug_url = 'http://jib.xywy.com/il_sii/drug/%s.htm' % page
                data = {}
                data['url'] = basic_url
                data['basic_info'] = self.basicinfo_spider(basic_url)
                data['cause_info'] = self.common_spider(cause_url)
                data['prevent_info'] = self.common_spider(prevent_url)
                data['symptom_info'] = self.symptom_spider(symptom_url)
                data['inspect_info'] = self.inspect_spider(inspect_url)
                data['treat_info'] = self.treat_spider(treat_url)
                data['food_info'] = self.food_spider(food_url)
                data['drug_info'] = self.drug_spider(drug_url)
                print(page, basic_url)
                self.col.insert_one(data)
            except Exception as e:
                # Log and skip pages that fail to download or parse
                print(e, page)

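    # For reference, a document stored by spider_main has roughly this shape
    # (list and string contents depend on the page):
    # {
    #     'url': 'http://jib.xywy.com/il_sii/gaishu/1.htm',
    #     'basic_info': {'category': [...], 'name': '...', 'desc': [...], 'attributes': [...]},
    #     'cause_info': '...',       # plain text from common_spider
    #     'prevent_info': '...',     # plain text from common_spider
    #     'symptom_info': {'symptoms': [...], 'symptoms_detail': [...]},
    #     'inspect_info': [...],     # hrefs of related check pages
    #     'treat_info': [...],
    #     'food_info': {'good': [...], 'bad': [...], 'recommand': [...]},
    #     'drug_info': [...],
    # }
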
    '''Parse the basic-information (overview) page'''
    def basicinfo_spider(self, url):
        html = self.get_html(url)
        selector = etree.HTML(html)
        title = selector.xpath('//title/text()')[0]
        category = selector.xpath('//div[@class="wrap mt10 nav-bar"]/a/text()')
        desc = selector.xpath('//div[@class="jib-articl-con jib-lh-articl"]/p/text()')
        ps = selector.xpath('//div[@class="mt20 articl-know"]/p')
        infobox = []
        for p in ps:
            info = p.xpath('string(.)').replace('\r', '').replace('\n', '').replace('\xa0', '').replace(' ', '').replace('\t', '')
            infobox.append(info)
        basic_data = {}
        basic_data['category'] = category
        # Titles look like "<disease>的简介"; keep only the disease name
        basic_data['name'] = title.split('的简介')[0]
        basic_data['desc'] = desc
        basic_data['attributes'] = infobox
        return basic_data

    '''Parse the treatment (treat_infobox) page'''
    def treat_spider(self, url):
        html = self.get_html(url)
        selector = etree.HTML(html)
        ps = selector.xpath('//div[starts-with(@class,"mt20 articl-know")]/p')
        infobox = []
        for p in ps:
            info = p.xpath('string(.)').replace('\r', '').replace('\n', '').replace('\xa0', '').replace(' ', '').replace('\t', '')
            infobox.append(info)
        return infobox

    '''Parse the recommended-drug page'''
    def drug_spider(self, url):
        html = self.get_html(url)
        selector = etree.HTML(html)
        drugs = [i.replace('\n', '').replace('\t', '').replace(' ', '') for i in selector.xpath('//div[@class="fl drug-pic-rec mr30"]/p/a/text()')]
        return drugs

    '''Parse the diet (food) page'''
    def food_spider(self, url):
        html = self.get_html(url)
        selector = etree.HTML(html)
        divs = selector.xpath('//div[@class="diet-img clearfix mt20"]')
        try:
            food_data = {}
            food_data['good'] = divs[0].xpath('./div/p/text()')
            food_data['bad'] = divs[1].xpath('./div/p/text()')
            food_data['recommand'] = divs[2].xpath('./div/p/text()')
        except IndexError:
            # The page lists fewer than three diet sections
            return {}

        return food_data

    '''Parse the symptom page'''
    def symptom_spider(self, url):
        html = self.get_html(url)
        selector = etree.HTML(html)
        symptoms = selector.xpath('//a[@class="gre"]/text()')
        ps = selector.xpath('//p')
        detail = []
        for p in ps:
            info = p.xpath('string(.)').replace('\r', '').replace('\n', '').replace('\xa0', '').replace(' ', '').replace('\t', '')
            detail.append(info)
        symptoms_data = {}
        symptoms_data['symptoms'] = symptoms
        symptoms_data['symptoms_detail'] = detail
        return symptoms_data

    '''Parse the medical-check (inspect) page'''
    def inspect_spider(self, url):
        html = self.get_html(url)
        selector = etree.HTML(html)
        inspects = selector.xpath('//li[@class="check-item"]/a/@href')
        return inspects

    '''Generic parser: join the text of every <p> node'''
    def common_spider(self, url):
        html = self.get_html(url)
        selector = etree.HTML(html)
        ps = selector.xpath('//p')
        infobox = []
        for p in ps:
            info = p.xpath('string(.)').replace('\r', '').replace('\n', '').replace('\xa0', '').replace(' ', '').replace('\t', '')
            if info:
                infobox.append(info)
        return '\n'.join(infobox)

    '''Crawl the raw HTML of the medical-check pages'''
    def inspect_crawl(self):
        for page in range(1, 3685):
            try:
                url = 'http://jck.xywy.com/jc_%s.html' % page
                html = self.get_html(url)
                data = {}
                data['url'] = url
                data['html'] = html
                # Store the raw page in the 'jc' collection for later parsing
                self.db['jc'].insert_one(data)
                print(url)
            except Exception as e:
                print(e)


if __name__ == '__main__':
    handler = CrimeSpider()
    handler.inspect_crawl()
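
# A minimal usage sketch, assuming a MongoDB server on localhost: run the
# disease crawl as well, then read one stored document back.
#
#     handler.spider_main()
#     doc = pymongo.MongoClient()['medical']['data'].find_one()
#     print(doc['basic_info']['name'])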