QAonMilitaryKG/collect_data.py

# -*- coding: utf-8 -*-
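"""
Spider for weapon.huanqiu.com: walks every top-level weapon category,
every sub-category and every list page, parses each equipment detail
page, and stores the structured records in a local MongoDB instance
(database 'military'). A sketch of one stored record appears after
extract_data() below.
"""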
import gzip
from urllib import request

import pymongo
from lxml import etree


class NewspaperSpider:
    def __init__(self):
        # map each url category slug to its Chinese display name
        self.term_dict = {
            'aircraft': "飞行器",
            'warship': "舰船舰艇",
            'guns': "枪械与单兵",
            'tank': "坦克装甲车辆",
            'artillery': "火炮",
            'missile': "导弹武器",
            'spaceship': "太空装备",
            'explosive': "爆炸物",
        }
        # local MongoDB on the default host/port; collections are created lazily
        self.conn = pymongo.MongoClient()

    def get_html(self, url):
        '''fetch a page and return its decoded html'''
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': 'Hm_lvt_1fc983b4c305d209e7e05d96e713939f=1552034977; Hm_lpvt_1fc983b4c305d209e7e05d96e713939f=1552036141',
            'Host': 'weapon.huanqiu.com'
        }
        req = request.Request(url, headers=headers)
        page = request.urlopen(req).read()
        # gzip is requested via Accept-Encoding, so the body arrives compressed
        page = gzip.decompress(page).decode('utf-8')
        return page

    def get_urllist(self, url):
        '''collect the detail-page links from one list page'''
        html = self.get_html(url)
        selector = etree.HTML(html)
        papers = ['http://weapon.huanqiu.com' + i
                  for i in selector.xpath('//li/span[@class="pic"]/a/@href')]
        # deduplicate: the same entry may be linked more than once on a page
        return list(set(papers))

    def html_parser(self, url):
        '''parse one detail page into [raw html, title, attr line, attr line, ...]'''
        html = self.get_html(url)
        selector = etree.HTML(html)
        title = selector.xpath('//title/text()')[0]
        attrs = selector.xpath('//div[@class="dataInfo"]/ul/li')
        contents = [html, title]
        for article in attrs:
            # string(.) flattens each <li> into its plain-text content
            content = article.xpath('string(.)')
            contents.append(content)
        return contents
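
    # What html_parser() returns, sketched with hypothetical values (real
    # pages list "键：值" pairs inside div.dataInfo):
    #   ['<html>…</html>', 'AK-47突击步枪_枪械与单兵_环球军事',
    #    '口径：7.62毫米', '全长：870毫米', ...]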

    def modify_data(self):
        '''re-parse the raw records in military.kb into military.graph_data'''
        keys = []
        for item in self.conn['military']['kb'].find():
            body = item['contents']
            # normalise full-width punctuation in the page title, then split
            # on underscores: the first field is the name, the second the category
            title = body[1].replace(' ', '').replace('－', '-').replace('（', '(').replace('）', ')')
            title = title.split('_')
            data = {}
            name = title[0]
            category = title[1]
            data['名称'] = name
            data['类别'] = category
            attrs = body[2:]
            html = body[0]
            selector = etree.HTML(html)
            country = selector.xpath('//span[@class="country"]/b/a/text()')[0]
            data['产国'] = country
            for attr in attrs:
                # attribute lines look like "键：值"; skip anything else
                if len(attr.split('：')) < 2:
                    continue
                key = attr.split('：')[0].replace('（', '(').replace(' ', '').replace('\t', '')
                # drop parenthesised notes and over-long keys
                if key.startswith('(') or len(key) > 6:
                    continue
                value = attr.split('：')[1]
                data[key] = value.replace('\t', '').replace('\n', '').replace(',', '')
                keys.append(key)
            self.conn['military']['graph_data'].insert_one(data)
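
    # Note: modify_data() is a one-off migration helper; spider_main() below
    # writes fresh records to military.knowledge_base and never calls it.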

    def spider_main(self):
        '''main spider routine: walk every category, sub-category and list page'''
        big_cates = ['aircraft', 'warship',
                     'guns', 'tank',
                     'artillery', 'missile',
                     'spaceship', 'explosive']
        for big_cate in big_cates:
            big_url = 'http://weapon.huanqiu.com/weaponlist/%s' % big_cate
            html = self.get_html(big_url)
            selector = etree.HTML(html)
            # the sub-category links live in the first span.list element
            span = selector.xpath('//span[@class="list"]')[0]
            second_urls = ['http://weapon.huanqiu.com' + i for i in span.xpath('./a/@href')]
            second_cates = [i for i in span.xpath('./a/text()')]
            second_dict = {}
            for indx, second_cate in enumerate(second_cates):
                second_dict[second_cate] = second_urls[indx]
            for second_cate, second_url in second_dict.items():
                max_pages = self.get_maxpage(second_url)
                for page in range(1, max_pages + 1):
                    url = second_url + '_0_0_%s' % page
                    seed_urls = self.get_urllist(url)
                    for seed in seed_urls:
                        self.get_info(seed, big_cate, second_cate)
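
    # Each list page is addressed as <sub-category url> + '_0_0_<page>', e.g.
    # http://weapon.huanqiu.com/weaponlist/aircraft/fighter_0_0_2 for page 2
    # (the 'fighter' slug is hypothetical; real slugs come from the page links).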

    def get_info(self, url, big_cate, second_cate):
        '''fetch one weapon entry and store it'''
        content = self.html_parser(url)
        data = self.extract_data(content)
        data['大类'] = self.term_dict.get(big_cate)
        data['类型'] = second_cate
        if data:
            print(data)
            # insert_one replaces Collection.insert, which newer pymongo removed
            self.conn['military']['knowledge_base'].insert_one(data)

    def extract_data(self, content):
        '''turn the parsed page content into a structured record'''
        # normalise full-width punctuation in the page title, then split on
        # underscores: the first field is the entry name
        title = content[1].replace(' ', '').replace('－', '-').replace('（', '(').replace('）', ')')
        title = title.split('_')
        data = {}
        name = title[0]
        data['名称'] = name
        attrs = content[2:]
        html = content[0]
        selector = etree.HTML(html)
        country = selector.xpath('//span[@class="country"]/b/a/text()')[0]
        image = selector.xpath('//div[@class="maxPic"]/img/@src')
        if not image:
            image = ''
        else:
            image = image[0]
        data['产国'] = country
        data['图片'] = image
        data['简介'] = ''.join(selector.xpath('//div[@class="module"]/p/text()')).replace('\xa0', '').replace('\u3000', '').replace('\t', '')
        for attr in attrs:
            # attribute lines look like "键：值"; skip anything else
            if len(attr.split('：')) < 2:
                continue
            key = attr.split('：')[0].replace('（', '(').replace(' ', '').replace('\t', '')
            # drop parenthesised notes and over-long keys
            if key.startswith('(') or len(key) > 6:
                continue
            value = attr.split('：')[1]
            data[key] = value.replace('\t', '').replace('\n', '').replace(',', '')
        return data
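
    # A sketch of one stored record (all field values are hypothetical):
    #   {'名称': 'AK-47突击步枪', '产国': '苏/俄', '图片': 'http://…/ak47.jpg',
    #    '简介': '…', '口径': '7.62毫米', '大类': '枪械与单兵', '类型': '步枪'}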

    def get_maxpage(self, url):
        '''read the number of list pages from the pagination bar'''
        html = self.get_html(url)
        selector = etree.HTML(html)
        max_pages = selector.xpath('//div[@class="pages"]/a/text()')
        if not max_pages:
            # no pagination bar means a single page of results
            max_page = 1
        else:
            # the second-to-last pagination link carries the largest page number
            max_page = int(max_pages[-2])
        return max_page


if __name__ == '__main__':
    handler = NewspaperSpider()
    handler.spider_main()
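
# A minimal smoke test before a full crawl (assumes the 2019 layout of
# weapon.huanqiu.com is still reachable):
#   spider = NewspaperSpider()
#   print(spider.get_maxpage('http://weapon.huanqiu.com/weaponlist/aircraft'))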