# coding: utf-8
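"""Spider for weapon.huanqiu.com: walks the weapon listing pages by
category, extracts each weapon's attributes from its detail page, and
stores the records in a local MongoDB instance."""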
import gzip

from lxml import etree
from urllib import request

import pymongo

class NewspaperSpider:
    def __init__(self):
        # Map URL category slugs to their Chinese display names.
        self.term_dict = {
            'aircraft': "飞行器",
            'warship': "舰船舰艇",
            'guns': "枪械与单兵",
            'tank': "坦克装甲车辆",
            'artillery': "火炮",
            'missile': "导弹武器",
            'spaceship': "太空装备",
            'explosive': "爆炸物",
        }
        # Connects to MongoDB on localhost:27017 by default.
        self.conn = pymongo.MongoClient()

    def get_html(self, url):
        '''Fetch a URL and return the decoded HTML.'''
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            # Session-specific cookie captured when the spider was written;
            # it may need refreshing if the site starts rejecting requests.
            'Cookie': 'Hm_lvt_1fc983b4c305d209e7e05d96e713939f=1552034977; Hm_lpvt_1fc983b4c305d209e7e05d96e713939f=1552036141',
            'Host': 'weapon.huanqiu.com'
        }
        req = request.Request(url, headers=headers)
        resp = request.urlopen(req)
        page = resp.read()
        # Only gunzip the body when the server actually compressed it;
        # an unconditional gzip.decompress() raises on a plain response.
        if resp.info().get('Content-Encoding') == 'gzip':
            page = gzip.decompress(page)
        return page.decode('utf-8')

    def get_urllist(self, url):
        '''Collect the detail-page URLs from one listing page.'''
        html = self.get_html(url)
        selector = etree.HTML(html)
        papers = ['http://weapon.huanqiu.com' + i
                  for i in selector.xpath('//li/span[@class="pic"]/a/@href')]
        # Deduplicate; note this does not preserve page order.
        return list(set(papers))

    def html_parser(self, url):
        '''Parse a detail page into [raw html, title, attr1, attr2, ...].'''
        html = self.get_html(url)
        selector = etree.HTML(html)
        title = selector.xpath('//title/text()')[0]
        attrs = selector.xpath('//div[@class="dataInfo"]/ul/li')
        contents = [html, title]
        for article in attrs:
            # string(.) flattens each <li> node to its full text content.
            content = article.xpath('string(.)')
            contents.append(content)
        return contents

    def modify_data(self):
        '''Re-process raw records from the 'kb' collection into clean
        key/value documents in 'graph_data'.'''
        keys = []
        # 'kb' holds documents shaped {'contents': [html, title, attrs...]},
        # presumably written by an earlier crawl run.
        for item in self.conn['military']['kb'].find():
            body = item['contents']
            # Normalize full-width punctuation in the page title.
            title = body[1].replace(' ', '').replace('-', '-').replace('(', '(').replace(')', ')')
            title = title.split('_')
            data = {}
            name = title[0]
            category = title[1]
            data['名称'] = name
            data['类别'] = category
            attrs = body[2:]
            html = body[0]
            selector = etree.HTML(html)
            country = selector.xpath('//span[@class="country"]/b/a/text()')[0]
            data['产国'] = country
            for attr in attrs:
                if len(attr.split(':')) < 2:
                    continue
                key = attr.split(':')[0].replace('(', '(').replace(' ', '').replace('\t', '')
                # Skip parenthesized notes and over-long keys; they are
                # stray fragments rather than attribute names.
                if key.startswith('(') or len(key) > 6:
                    continue
                value = attr.split(':')[1]
                data[key] = value.replace('\t', '').replace('\n', '').replace(',', '')
                keys.append(key)
            self.conn['military']['graph_data'].insert_one(data)

    def spider_main(self):
        '''Main crawl entry point: walk every category and sub-category.'''
        big_cates = ['aircraft', 'warship',
                     'guns', 'tank',
                     'artillery', 'missile',
                     'spaceship', 'explosive']
        for big_cate in big_cates:
            big_url = 'http://weapon.huanqiu.com/weaponlist/%s' % big_cate
            html = self.get_html(big_url)
            selector = etree.HTML(html)
            span = selector.xpath('//span[@class="list"]')[0]
            second_urls = ['http://weapon.huanqiu.com' + i for i in span.xpath('./a/@href')]
            second_cates = span.xpath('./a/text()')
            # Map each sub-category name to its listing URL.
            second_dict = dict(zip(second_cates, second_urls))
            for second_cate, second_url in second_dict.items():
                max_pages = self.get_maxpage(second_url)
                for page in range(1, max_pages + 1):
                    # Listing pages are paginated as <url>_0_0_<page>.
                    url = second_url + '_0_0_%s' % page
                    seed_urls = self.get_urllist(url)
                    for seed in seed_urls:
                        self.get_info(seed, big_cate, second_cate)

    def get_info(self, url, big_cate, second_cate):
        '''Fetch a single detail page and store its structured record.'''
        content = self.html_parser(url)
        data = self.extract_data(content)
        # Check before tagging, so an empty extraction is not masked by
        # the category keys added below.
        if data:
            data['大类'] = self.term_dict.get(big_cate)
            data['类型'] = second_cate
            print(data)
            self.conn['military']['knowledge_base'].insert_one(data)

    def extract_data(self, content):
        '''Extract a structured record from a parsed detail page.'''
        # Normalize full-width punctuation in the page title.
        title = content[1].replace(' ', '').replace('-', '-').replace('(', '(').replace(')', ')')
        title = title.split('_')
        data = {}
        name = title[0]
        data['名称'] = name
        attrs = content[2:]
        html = content[0]
        selector = etree.HTML(html)
        country = selector.xpath('//span[@class="country"]/b/a/text()')[0]
        # Not every page carries a lead image.
        image = selector.xpath('//div[@class="maxPic"]/img/@src')
        image = image[0] if image else ''
        data['产国'] = country
        data['图片'] = image
        # Join the intro paragraphs and strip layout whitespace.
        data['简介'] = ''.join(selector.xpath('//div[@class="module"]/p/text()')) \
            .replace('\xa0', '').replace('\u3000', '').replace('\t', '')
        for attr in attrs:
            if len(attr.split(':')) < 2:
                continue
            key = attr.split(':')[0].replace('(', '(').replace(' ', '').replace('\t', '')
            # Skip parenthesized notes and over-long keys; they are stray
            # fragments rather than attribute names.
            if key.startswith('(') or len(key) > 6:
                continue
            value = attr.split(':')[1]
            data[key] = value.replace('\t', '').replace('\n', '').replace(',', '')
        return data

    def get_maxpage(self, url):
        '''Read the pagination bar and return the last page number.'''
        html = self.get_html(url)
        selector = etree.HTML(html)
        max_pages = selector.xpath('//div[@class="pages"]/a/text()')
        if not max_pages:
            max_page = 1
        else:
            # On this site's pagination bar the second-to-last link text
            # is the highest page number.
            max_page = int(max_pages[-2])
        return max_page


if __name__ == '__main__':
    handler = NewspaperSpider()
    handler.spider_main()
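    # Optional post-processing pass: modify_data() rewrites the raw
    # records in the 'kb' collection into 'graph_data'. It assumes that
    # collection was populated by an earlier run, so it is left
    # commented out here.
    # handler.modify_data()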