# QAonMilitaryKG/collect_data.py
# Listing metadata: 172 lines, 6.3 KiB, Python; last changed 2019-05-02 00:44:38 +08:00.
#coding = utf-8
import os
from urllib import request
from lxml import etree
import gzip
import pymongo
import datetime
class NewspaperSpider:
    """Crawl weapon pages from weapon.huanqiu.com and store them in MongoDB.

    ``spider_main`` walks every top-level category, its sub-categories and
    their listing pages, and writes one document per weapon page into
    ``military.knowledge_base``. ``modify_data`` is a separate pass that reads
    previously stored raw pages from ``military.kb`` and writes flattened
    records into ``military.graph_data``.
    """

    _BASE_URL = 'http://weapon.huanqiu.com'

    # Full-width punctuation used by the scraped pages. The literals are
    # written as escapes so they survive tools that drop non-ASCII
    # punctuation.
    # NOTE(review): the reviewed copy of this file had these separators
    # reduced to empty strings (which makes ``str.split`` raise ValueError);
    # the full-width forms below are the reconstruction. The colon and
    # parentheses follow from the ASCII replacements next to them; the hyphen
    # is the most likely candidate -- confirm against real page titles.
    _FW_HYPHEN = '\uff0d'
    _FW_LPAREN = '\uff08'
    _FW_RPAREN = '\uff09'
    _FW_COLON = '\uff1a'

    # First two bytes of every gzip stream.
    _GZIP_MAGIC = b'\x1f\x8b'

    _REQUEST_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'Hm_lvt_1fc983b4c305d209e7e05d96e713939f=1552034977; Hm_lpvt_1fc983b4c305d209e7e05d96e713939f=1552036141',
        'Host': 'weapon.huanqiu.com',
    }

    def __init__(self):
        """Set up the category-name mapping and the MongoDB client."""
        # URL slug of each top-level category -> label stored with a record.
        self.term_dict = {
            'aircraft': "飞行器",
            'warship': "舰船舰艇",
            'guns': "枪械与单兵",
            'tank': "坦克装甲车辆",
            'artillery': "火炮",
            'missile': "导弹武器",
            'spaceship': "太空装备",
            'explosive': "爆炸物",
        }
        self.conn = pymongo.MongoClient()

    # ------------------------------------------------------------------
    # Pure helpers (no network, no database)
    # ------------------------------------------------------------------
    @staticmethod
    def _first(values, default=''):
        """Return the first element of *values*, or *default* when empty."""
        return values[0] if values else default

    @staticmethod
    def _clean_title(raw_title):
        """Normalise a page title and split it on ``'_'``.

        Spaces are removed and full-width hyphen/parentheses are converted to
        their ASCII forms. The first element of the returned list is used as
        the weapon name by the callers.
        """
        cleaned = (
            raw_title.replace(' ', '')
            .replace(NewspaperSpider._FW_HYPHEN, '-')
            .replace(NewspaperSpider._FW_LPAREN, '(')
            .replace(NewspaperSpider._FW_RPAREN, ')')
        )
        return cleaned.split('_')

    @staticmethod
    def _parse_attrs(attrs):
        """Turn ``"key<full-width colon>value"`` strings into a dict.

        Entries without a separator are skipped, as are entries whose key
        starts with ``'('`` or is longer than six characters. Only the text
        between the first and second separator is kept as the value; tabs,
        newlines and ASCII commas are removed from it.
        """
        parsed = {}
        for attr in attrs:
            pieces = attr.split(NewspaperSpider._FW_COLON)
            if len(pieces) < 2:
                continue
            key = (
                pieces[0]
                .replace(NewspaperSpider._FW_LPAREN, '(')
                .replace(' ', '')
                .replace('\t', '')
            )
            if key.startswith('(') or len(key) > 6:
                continue
            parsed[key] = pieces[1].replace('\t', '').replace('\n', '').replace(',', '')
        return parsed

    # ------------------------------------------------------------------
    # Fetching
    # ------------------------------------------------------------------
    def get_html(self, url):
        """Download *url* and return the body decoded as UTF-8 text.

        The body is gunzipped only when it actually starts with the gzip
        magic number, so an uncompressed reply no longer raises.
        """
        req = request.Request(url, headers=self._REQUEST_HEADERS)
        # The context manager closes the connection even if read() fails.
        with request.urlopen(req) as response:
            payload = response.read()
        if payload[:2] == self._GZIP_MAGIC:
            payload = gzip.decompress(payload)
        return payload.decode('utf-8')

    def get_urllist(self, url):
        """Return the de-duplicated weapon-page URLs linked from a listing page."""
        selector = etree.HTML(self.get_html(url))
        hrefs = selector.xpath('//li/span[@class="pic"]/a/@href')
        return list({self._BASE_URL + href for href in hrefs})

    def html_parser(self, url):
        """Fetch a weapon page and return ``[html, title, attr_text, ...]``.

        ``attr_text`` entries are the text content of each ``<li>`` inside
        ``div.dataInfo``. A page without a ``<title>`` yields an empty title
        instead of raising IndexError.
        """
        html = self.get_html(url)
        selector = etree.HTML(html)
        title = self._first(selector.xpath('//title/text()'))
        contents = [html, title]
        for item in selector.xpath('//div[@class="dataInfo"]/ul/li'):
            contents.append(item.xpath('string(.)'))
        return contents

    def get_maxpage(self, url):
        """Return the number of listing pages for a sub-category URL.

        The second-to-last pagination link text is read as the last page
        number (presumably the final link is a "next page" control -- confirm
        against the site). With fewer than two pagination links the listing
        is treated as a single page.
        """
        selector = etree.HTML(self.get_html(url))
        labels = selector.xpath('//div[@class="pages"]/a/text()')
        if len(labels) < 2:
            return 1
        return int(labels[-2])

    # ------------------------------------------------------------------
    # Record building
    # ------------------------------------------------------------------
    def extract_data(self, content):
        """Build a record dict from the list returned by ``html_parser``.

        The record holds the name (first part of the title), country of
        origin, image URL, introduction text and every parsed attribute.
        Missing country or image elements yield empty strings.
        """
        html, raw_title = content[0], content[1]
        selector = etree.HTML(html)
        intro = ''.join(selector.xpath('//div[@class="module"]/p/text()'))
        data = {
            '名称': self._clean_title(raw_title)[0],
            '产国': self._first(selector.xpath('//span[@class="country"]/b/a/text()')),
            '图片': self._first(selector.xpath('//div[@class="maxPic"]/img/@src')),
            '简介': intro.replace('\xa0', '').replace('\u3000', '').replace('\t', ''),
        }
        # Attributes are applied last so they can override the fields above,
        # matching the original assignment order.
        data.update(self._parse_attrs(content[2:]))
        return data

    def get_info(self, url, big_cate, second_cate):
        """Scrape one weapon page and store it in ``military.knowledge_base``."""
        data = self.extract_data(self.html_parser(url))
        # Check before adding the category fields; afterwards the dict can
        # never be empty and the test would be meaningless.
        if not data:
            return
        data['大类'] = self.term_dict.get(big_cate)
        data['类型'] = second_cate
        print(data)
        # Collection.insert() was removed in PyMongo 4; insert_one() exists
        # since PyMongo 3.0.
        self.conn['military']['knowledge_base'].insert_one(data)

    def modify_data(self):
        """Re-parse raw pages stored in ``military.kb`` into ``military.graph_data``.

        Each source document is expected to carry a ``contents`` list shaped
        like the return value of ``html_parser``. The title is split on
        ``'_'`` into name and category; a title without a second part gets an
        empty category instead of raising IndexError.
        """
        database = self.conn['military']
        for item in database['kb'].find():
            body = item['contents']
            title_parts = self._clean_title(body[1])
            selector = etree.HTML(body[0])
            data = {
                '名称': title_parts[0],
                '类别': title_parts[1] if len(title_parts) > 1 else '',
                '产国': self._first(selector.xpath('//span[@class="country"]/b/a/text()')),
            }
            data.update(self._parse_attrs(body[2:]))
            database['graph_data'].insert_one(data)

    # ------------------------------------------------------------------
    # Crawl driver
    # ------------------------------------------------------------------
    def spider_main(self):
        """Crawl every category, sub-category and listing page."""
        # term_dict's keys are exactly the category slugs, in crawl order.
        for big_cate in self.term_dict:
            big_url = 'http://weapon.huanqiu.com/weaponlist/%s' % big_cate
            selector = etree.HTML(self.get_html(big_url))
            spans = selector.xpath('//span[@class="list"]')
            if not spans:
                # No sub-category list on this page: nothing to crawl here.
                continue
            span = spans[0]
            second_urls = [self._BASE_URL + href for href in span.xpath('./a/@href')]
            second_cates = span.xpath('./a/text()')
            # Keyed by label, so a repeated label keeps only its last URL.
            second_dict = dict(zip(second_cates, second_urls))
            for second_cate, second_url in second_dict.items():
                max_pages = self.get_maxpage(second_url)
                for page in range(1, max_pages + 1):
                    url = second_url + '_0_0_%s' % page
                    for seed in self.get_urllist(url):
                        self.get_info(seed, big_cate, second_cate)
# Script entry point: run the full crawl when executed directly.
if __name__ == '__main__':
    NewspaperSpider().spider_main()