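# Third-party dependencies: lxml and pymongo (e.g. pip install lxml pymongo);
# everything else below is from the standard library.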
from urllib import request, parse
import gzip
import json
from lxml import etree
import pymongo
from collections import Counter

class PersonSpider:
    def __init__(self):
        # MongoClient() with no arguments connects to the default local
        # MongoDB instance (mongodb://localhost:27017).
        self.conn = pymongo.MongoClient()
        return
    '''fetch the html'''
    def get_html(self, word):
        url = 'https://www.sogou.com/kmap?query=' + parse.quote(word) + '&from=relation&id='
        print(word, url)
        headers = {
            'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
            'Connection': 'keep-alive',
            'Accept': '*/*',
            # 'br' is advertised here, but only gzip is actually handled below.
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.8',
            'Cookie': 'GOTO=Af11458; SUV=00EF7C967C10886459C27A09D07E2879; LSTMV=186%2C155; LCLKINT=2115; SUID=6488107C7C20940A0000000059C27A09; ABTEST=7|1543156540|v17; IPLOC=CN1100',
            'Host': 'www.sogou.com',
            'Referer': 'https://www.sogou.com/tupu/person.html?q=' + parse.quote(word),
        }
        # Request the page with urllib; the request pattern is the same as the
        # browser version of the page.
        req = request.Request(url, headers=headers)
        page = request.urlopen(req).read()
        html = gzip.decompress(page)
        try:
            # The endpoint responds in UTF-16LE with one trailing character to strip.
            data = html.decode('UTF-16LE')[:-1]
            data = json.loads(data)
        except Exception:
            return {}
        return data
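
    # A sketch of the JSON shape this endpoint appears to return, inferred
    # from how the fields are consumed in the methods below (not an official
    # schema):
    #   {'nodes': [{'id': ..., 'name': ...}, ...],
    #    'links': [{'from': ..., 'to': ..., 'name': ..., 'type': ...}, ...]}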

    '''main crawl function'''
    def spider_person(self, person):
        # By default the service returns person relations within three degrees
        # of the queried entity.
        data = self.get_html(person)
        if not data:
            return
        nodes = data['nodes']
        if not nodes:
            return
        item = {}
        item['nodes'] = nodes
        item['links'] = data['links']
        try:
            # insert() was removed from modern pymongo; insert_one() is the
            # current single-document equivalent.
            self.conn['person_rel']['data2'].insert_one(item)
        except Exception as e:
            print(e)

    '''collect person names'''
    def collect_names_star(self):
        f = open('korea_star_person_names.txt', 'w+')
        for page in range(1, 11):
            # url = 'http://g.manmankan.com/dy2013/mingxing/fenlei/china/index_%s.shtml' % page
            url = 'http://g.manmankan.com/dy2013/mingxing/fenlei/hanguo/index_%s.shtml' % page
            req = request.Request(url)
            # Keep the decoded page in its own variable rather than shadowing
            # the loop counter `page`; the site serves GBK-encoded pages.
            html = request.urlopen(req).read().decode('gbk')
            selector = etree.HTML(html)
            names = selector.xpath('//li/a/@title')
            f.write('\n'.join(list(names)) + '\n')
        f.close()

    '''collect historical figures'''
    def collect_names_history(self):
        f = open('history_person_names2.txt', 'w+')
        content = open('history_names2.html').read()
        selector = etree.HTML(content)
        names = [i.replace(' ', '') for i in selector.xpath('//li/a/text()')]
        f.write('\n'.join(names) + '\n')
        f.close()
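
    # Note: spider_main() below reads 'star_person_names.txt' and
    # 'history_person_names.txt', while the two collectors above write
    # 'korea_star_person_names.txt' and 'history_person_names2.txt'; the
    # name files were presumably renamed or merged by hand in between.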

    '''crawl entry point'''
    def spider_main(self):
        history_names = [i.strip() for i in open('history_person_names.txt') if len(i.strip()) > 1]
        star_names = [i.strip() for i in open('star_person_names.txt') if len(i.strip()) > 1]
        name_dict = {
            'star': star_names,
            'history': history_names,
        }
        for label, names in name_dict.items():
            for name in names:
                # spider_person() stores results in MongoDB and returns nothing,
                # so there is no return value to capture here.
                self.spider_person(name)

    '''read person names and crawl the newly seen ones'''
    def update_data(self):
        names_all_has = []
        names_all_add = []
        for item in self.conn['person_rel']['data'].find():
            nodes = item['nodes']
            names = [node['name'] for node in nodes]
            names_all_has += names

        for item in self.conn['person_rel']['data2'].find():
            nodes = item['nodes']
            names = [node['name'] for node in nodes]
            names_all_add += names

        # Crawl only the names present in data2 but not yet in data.
        for name in set(names_all_add).difference(set(names_all_has)):
            self.spider_person(name)
        return
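
    # Example of the set difference above: if collection `data` already
    # contains {A, B} and `data2` contains {B, C}, only C is crawled in
    # this pass.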

    '''count how many persons there are'''
    def read_persons(self):
        f = open('person.txt', 'w+')
        names_all = []
        links_all = []
        for item in self.conn['person_rel']['data2'].find():
            nodes = item['nodes']
            links = item['links']
            link_names = [link['name'] for link in links]
            links_all += link_names
            names = [node['name'] for node in nodes]
            names_all += names
        # Print unique vs. total counts for node names and link names.
        print(len(set(names_all)), len(names_all))
        print(len(set(links_all)), len(links_all))
        f.write('\n'.join(list(set(names_all))))
        f.close()

    '''organize the person data'''
    def modify_data(self):
        f_rel = open('rel_data.txt', 'w+')
        f_reltype = open('rel_type.txt', 'w+')
        f_person = open('person2id.txt', 'w+')
        person_dict = {}
        rel_dict = {}
        rel_list = set()
        rel_types = []

        # First pass: map every node id to its name.
        for item in self.conn['person_rel']['data2'].find():
            nodes = item['nodes']
            for node in nodes:
                # Avoid shadowing the builtin `id`.
                person_id = node['id']
                name = node['name']
                person_dict[person_id] = name

        # Second pass: resolve each link's endpoints to names and dedupe.
        for item in self.conn['person_rel']['data2'].find():
            links = item['links']
            for link in links:
                from_person = person_dict.get(link['from'], '')
                to_person = person_dict.get(link['to'], '')
                if not from_person or not to_person:
                    continue
                rel_name = link['name']
                rel_type = link['type']

                rel_dict[rel_name] = rel_type
                data = [from_person, to_person, rel_name, str(rel_type)]
                rel_list.add('###'.join(data))

        rels_num = len(rel_list)
        persons_num = len(person_dict.keys())

        for rel in rel_list:
            if len(rel.split('###')) != 4:
                continue
            rel_name = rel.split('###')[2]
            rel_types.append(rel_name)

        for person_id, name in person_dict.items():
            f_person.write(str(person_id) + '\t' + name + '\n')

        # most_common() yields (rel_name, frequency) pairs, most frequent first.
        reltype_dict = Counter(rel_types).most_common()

        # Avoid shadowing the builtin `sum`; this accumulates the running
        # share of all relations covered so far.
        cum_percent = 0.0
        for rel_name, rel_freq in reltype_dict:
            rel_percent = rel_freq / rels_num
            cum_percent += rel_percent

            f_reltype.write(rel_name + '\t' + str(rel_freq) + '\t' + str(rel_percent) + '\t' + str(cum_percent) + '\n')

        f_rel.write('\n'.join(list(rel_list)))
        f_person.close()
        f_rel.close()
        f_reltype.close()

        print('rels_num', rels_num)
        print('persons_num', persons_num)
        return
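
    # Output formats produced by modify_data (read off the writes above):
    #   person2id.txt : <id>\t<name>
    #   rel_data.txt  : <from>###<to>###<rel_name>###<rel_type>
    #   rel_type.txt  : <rel_name>\t<freq>\t<share>\t<cumulative share>
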
if __name__ == '__main__':
    handler = PersonSpider()
    handler.spider_main()
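
    # The remaining stages could be invoked here as well; an assumed
    # end-to-end order (the source itself does not document one):
    #   handler.collect_names_star()
    #   handler.collect_names_history()
    #   handler.update_data()
    #   handler.read_persons()
    #   handler.modify_data()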