# PersonRelationKnowledgeGraph/collect_person_rel.py

from urllib import request, parse
import gzip
import json
from lxml import etree
import pymongo
from collections import Counter
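
# Pipeline overview: collect seed person names (stars scraped from
# manmankan.com, historical figures parsed from a saved HTML page), query
# Sogou's kmap endpoint for each name's relation graph, store the raw
# nodes/links documents in MongoDB (person_rel.data2), then export relation
# triples and frequency statistics to plain-text files.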
class PersonSpider:
    def __init__(self):
        # One MongoDB client for the whole run; collections are created lazily on first insert.
        self.conn = pymongo.MongoClient()
    def get_html(self, word):
        '''Fetch and parse the relation-graph JSON for a person from Sogou's kmap endpoint.'''
        url = 'https://www.sogou.com/kmap?query=' + parse.quote(word) + '&from=relation&id='
        print(word, url)
        headers = {
            'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
            'Connection': 'keep-alive',
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.8',
            # These cookie values are tied to one browsing session and will likely need refreshing.
            'Cookie': 'GOTO=Af11458; SUV=00EF7C967C10886459C27A09D07E2879; LSTMV=186%2C155; LCLKINT=2115; SUID=6488107C7C20940A0000000059C27A09; ABTEST=7|1543156540|v17; IPLOC=CN1100',
            'Host': 'www.sogou.com',
            'Referer': 'https://www.sogou.com/tupu/person.html?q=' + parse.quote(word),
        }
        # Request the page with urllib; the request pattern matches what the web interface sends.
        req = request.Request(url, headers=headers)
        page = request.urlopen(req).read()
        # The response is gzip-compressed UTF-16LE text with one trailing character to drop.
        html = gzip.decompress(page)
        try:
            data = html.decode('UTF-16LE')[:-1]
            data = json.loads(data)
        except Exception:
            return {}
        return data
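    # Shape of the parsed response, as inferred from how it is consumed below
    # (an assumption, not a documented API):
    #   {'nodes': [{'id': ..., 'name': ...}, ...],
    #    'links': [{'from': <node id>, 'to': <node id>, 'name': <relation>, 'type': <int>}, ...]}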
    def spider_person(self, person):
        '''Crawl one person and store the raw result in MongoDB.'''
        # By default the endpoint returns person relations up to three degrees from the entity.
        data = self.get_html(person)
        if not data:
            return
        nodes = data['nodes']
        if not nodes:
            return
        item = {'nodes': nodes, 'links': data['links']}
        try:
            self.conn['person_rel']['data2'].insert_one(item)
        except Exception as e:
            print(e)
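    # Each successful call stores one raw document per queried person in
    # person_rel.data2; there is no duplicate check, so re-crawling the same
    # name adds a second document.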
    def collect_names_star(self):
        '''Collect star names from manmankan.com listing pages.'''
        f = open('korea_star_person_names.txt', 'w+')
        for page_num in range(1, 11):
            # Swap in the 'china' path below to collect Chinese stars instead.
            # url = 'http://g.manmankan.com/dy2013/mingxing/fenlei/china/index_%s.shtml' % page_num
            url = 'http://g.manmankan.com/dy2013/mingxing/fenlei/hanguo/index_%s.shtml' % page_num
            req = request.Request(url)
            html = request.urlopen(req).read().decode('gbk')  # the listing pages are GBK-encoded
            selector = etree.HTML(html)
            names = selector.xpath('//li/a/@title')
            f.write('\n'.join(list(names)) + '\n')
        f.close()
    def collect_names_history(self):
        '''Collect historical person names from a locally saved HTML page.'''
        f = open('history_person_names2.txt', 'w+')
        content = open('history_names2.html').read()
        selector = etree.HTML(content)
        names = [i.replace(' ', '') for i in selector.xpath('//li/a/text()')]
        f.write('\n'.join(names) + '\n')
        f.close()
    def spider_main(self):
        '''Crawl every collected name, grouped by label.'''
        history_names = [i.strip() for i in open('history_person_names.txt') if len(i.strip()) > 1]
        star_names = [i.strip() for i in open('star_person_names.txt') if len(i.strip()) > 1]
        name_dict = {
            'star': star_names,
            'history': history_names,
        }
        for label, names in name_dict.items():
            for name in names:
                self.spider_person(name)
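    # Note: 'data' appears to hold an earlier crawl round and 'data2' the
    # current one (an assumption from usage); update_data() below queries every
    # name seen in 'data2' that was never fetched into 'data'.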
    def update_data(self):
        '''Crawl the names present in data2 that have not yet been crawled into data.'''
        names_all_has = []
        names_all_add = []
        for item in self.conn['person_rel']['data'].find():
            names_all_has += [node['name'] for node in item['nodes']]
        for item in self.conn['person_rel']['data2'].find():
            names_all_add += [node['name'] for node in item['nodes']]
        for name in set(names_all_add).difference(set(names_all_has)):
            self.spider_person(name)
        return
    def read_persons(self):
        '''Count how many distinct persons and relation mentions were collected.'''
        f = open('person.txt', 'w+')
        names_all = []
        links_all = []
        for item in self.conn['person_rel']['data2'].find():
            links_all += [link['name'] for link in item['links']]
            names_all += [node['name'] for node in item['nodes']]
        print(len(set(names_all)), len(names_all))
        print(len(set(links_all)), len(links_all))
        f.write('\n'.join(list(set(names_all))))
        f.close()
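    # Files written by modify_data() below:
    #   rel_data.txt  - one 'from###to###relation###type' line per deduped triple
    #   person2id.txt - one '<node id>\t<name>' line per person
    #   rel_type.txt  - '<relation>\t<count>\t<share>\t<cumulative share>' per
    #                   relation name, most frequent first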
    def modify_data(self):
        '''Organize the collected person data into relation triples and export files.'''
        f_rel = open('rel_data.txt', 'w+')
        f_reltype = open('rel_type.txt', 'w+')
        f_person = open('person2id.txt', 'w+')
        person_dict = {}
        rel_dict = {}
        rel_list = set()
        rel_types = []
        # First pass: map node ids to person names.
        for item in self.conn['person_rel']['data2'].find():
            for node in item['nodes']:
                person_dict[node['id']] = node['name']
        # Second pass: resolve each link's endpoints to names and dedupe the triples.
        for item in self.conn['person_rel']['data2'].find():
            for link in item['links']:
                from_person = person_dict.get(link['from'], '')
                to_person = person_dict.get(link['to'], '')
                if not from_person or not to_person:
                    continue
                rel_name = link['name']
                rel_type = link['type']
                rel_dict[rel_name] = rel_type
                data = [from_person, to_person, rel_name, str(rel_type)]
                rel_list.add('###'.join(data))
        rels_num = len(rel_list)
        persons_num = len(person_dict.keys())
        for rel in rel_list:
            if len(rel.split('###')) != 4:
                continue
            rel_types.append(rel.split('###')[2])
        for person_id, name in person_dict.items():
            f_person.write(str(person_id) + '\t' + name + '\n')
        # Relation-name frequencies, most common first, with a running cumulative share.
        reltype_counts = Counter(rel_types).most_common()
        cumulative = 0.0
        for rel_name, rel_freq in reltype_counts:
            rel_percent = rel_freq / rels_num
            cumulative += rel_percent
            f_reltype.write(rel_name + '\t' + str(rel_freq) + '\t' + str(rel_percent) + '\t' + str(cumulative) + '\n')
        f_rel.write('\n'.join(list(rel_list)))
        f_person.close()
        f_rel.close()
        f_reltype.close()
        print('rels_num', rels_num)
        print('persons_num', persons_num)
        return
if __name__ == '__main__':
    handler = PersonSpider()
    handler.spider_main()
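    # Other entry points, run manually as needed:
    # handler.collect_names_star()     # scrape star names into korea_star_person_names.txt
    # handler.collect_names_history()  # extract names from the saved history_names2.html
    # handler.update_data()            # crawl names present in data2 but missing from data
    # handler.read_persons()           # print person/link counts and dump names to person.txt
    # handler.modify_data()            # export triples and relation-type statistics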