diff --git a/.gitignore b/.gitignore
index 192dba4..3153580 100755
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 __pycache__/
 webpages/*
 *.html
-*.txt
\ No newline at end of file
+*.txt
+*.log
\ No newline at end of file
diff --git a/README.md b/README.md
index 14af145..b1e28ff 100755
--- a/README.md
+++ b/README.md
@@ -1,47 +1,59 @@
-#### Tips
+# Open-source web knowledge graph project
+- Crawl Chinese pages from Baidu Baike
+- Parse triples and page content
+- Build a Chinese knowledge graph
+- Build a Baike bot (in progress)
+
+##### Update 20191121
+
+- Migrated the code to the scrapy crawler framework
+- Streamlined the extraction code
+- Moved data persistence to mongodb
+- Fixed the broken chatbot
+- Opened the neo4j web console so the assembled knowledge graph can be browsed
+
+##### Tips
 
 - For project problems, please open an issue.
 - For anything not suitable for public discussion, please send an email.
 - For the ChatBot, visit [this link](http://bot.rubenxiao.com/)
-
-# Open-source web knowledge graph project
-
-- Crawl Chinese pages from Baidu Baike
-- Extract [1M+ triples](https://raw.githubusercontent.com/lixiang0/WEB_KG/master/kg/triples.txt)
-- Build a Chinese knowledge graph
+- The assembled Baike knowledge graph is available at [this link](http://kg.rubenxiao.com/), username: neo4j, password: 123.
 
 ### Environment
 
 - python 3.6
-- requests: HTTP requests
 - re: regex matching for urls
-- bs4: web page parsing
-- pickle: progress saving
-- threading: multithreading
+- scrapy: web crawling and page parsing
 - neo4j: graph database for the knowledge graph; installation notes at [this link](http://blog.rubenxiao.com/posts/install-neo4j.html)
 - pip install neo4j-driver: neo4j python driver
+- pip install pymongo: python support for mongodb
+- mongodb database: installation notes at [this link](https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/)
 
-### Code layout
+### Running the code:
 
-- spider/ crawls the raw web pages
-- ie/ parses body text from the pages and extracts structured information from it
-- kg/ extracts triples and stores them in the neo4j database
+```
+cd WEB_KG/baike
+scrapy crawl baike
+```
-
-### Execution order:
-
-
-- 1. in the spider directory, run: python spider_main.py
-- 2. in the ie directory, run: python extract-para.py
-- 3. in the ie directory, run: python extract-table.py
-- 4. in the kg directory, run: python build-triple-from-table.py
-- 5. in the kg directory, run: python insert_to_neo4j.py
-
-Step 2 can be skipped in this project.
+Runtime view (press ctrl+c to stop):
+![](./imgs/kg5.png)
 
 ### Knowledge graph visualization
 
-![](./kg/kg.png)
+![](./imgs/kg.png)
+
+### Page content stored in mongodb
+
+![](./imgs/kg3.png)
+
+### Triples stored in mongodb
+
+![](./imgs/kg4.png)
+
+### neo4j web console
+
+![](./imgs/kg2.png)
diff --git a/baike/spiders/baike.py b/baike/spiders/baike.py
index 092c171..f284d71 100644
--- a/baike/spiders/baike.py
+++ b/baike/spiders/baike.py
@@ -8,8 +8,13 @@ import re
 import pymongo
 from scrapy.selector import Selector
 from neo4j.v1 import GraphDatabase
-
-
+import logging
+import time
+logfile_name = time.ctime(time.time()).replace(' ', '_')
+if not os.path.exists('logs/'):
+    os.mkdir('logs/')
+logging.basicConfig(filename=f'logs/{logfile_name}.log', filemode='a+',
+                    format='%(levelname)s - %(asctime)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S')
 class BaikeSpider(scrapy.Spider):
     name = 'baike'
     allowed_domains = ['baike.baidu.com']
@@ -59,29 +64,35 @@
 
         # process the triples
         entity = ''.join(response.xpath(
-            '//h1/text()').extract()).replace('/', '')
+            '//h1/text()').getall()).replace('/', '')
         attrs = response.xpath(
-            '//dt[contains(@class,"basicInfo-item name")]').extract()
+            '//dt[contains(@class,"basicInfo-item name")]').getall()
         values = response.xpath(
-            '//dd[contains(@class,"basicInfo-item value")]').extract()
+            '//dd[contains(@class,"basicInfo-item value")]').getall()
+        if len(attrs) != len(values):
+            return
         with self.driver.session() as session:
-            for i, attr in enumerate(attrs):
+            try:
+                for attr, value in zip(attrs, values):
                     # attr
-                temp = Selector(text=attr).xpath(
-                    '//dt/text()|//dt/a/text()').extract()
-                attr = ''.join(temp).replace('\n', '').replace(':', '').replace(
-                    ':', '').replace('\xa0', '').replace(' ', '').replace('【', '').replace('】', '')
-                # value
-                temp = Selector(text=values[i]).xpath(
-                    '//dd/text()|//dd/a/text()').extract()
-                value = ''.join(temp).replace('\n', '')
-                try:
-                    self.db_triples.insert_one({
-                        "_id": entity+'_'+attr+'_'+value,
-                        "item_name": entity,
-                        "attr": attr,
-                        "value": value, }
-                    )
-                except pymongo.errors.DuplicateKeyError:
-                    pass
-                session.write_transaction(self.add_node, entity, attr, value)
+                    temp = Selector(text=attr).xpath(
+                        '//dt//text()').getall()
+                    attr = ''.join(temp).replace('\xa0', '')
+                    # value
+                    values = Selector(text=value).xpath(
+                        '//dd/text()|//dd/a//text()').getall()
+                    for value in values:
+                        try:
+                            value = value.replace('\n', '')
+                            logging.warning(entity+'_'+attr+'_'+value)
+                            self.db_triples.insert_one({
+                                "_id": entity+'_'+attr+'_'+value,
+                                "item_name": entity,
+                                "attr": attr,
+                                "value": value, }
+                            )
+                        except pymongo.errors.DuplicateKeyError:
+                            pass
+                        session.write_transaction(self.add_node, entity, attr, value)
+            except Exception:
+                logging.error('\n---'.join(attrs)+'\n_________________'+'\n---'.join(values))
\ No newline at end of file
diff --git a/imgs/kg.png b/imgs/kg.png
new file mode 100644
index 0000000..0372e42
Binary files /dev/null and b/imgs/kg.png differ
diff --git a/imgs/kg2.png b/imgs/kg2.png
new file mode 100644
index 0000000..58af85e
Binary files /dev/null and b/imgs/kg2.png differ
diff --git a/imgs/kg3.png b/imgs/kg3.png
new file mode 100644
index 0000000..d4df25c
Binary files /dev/null and b/imgs/kg3.png differ
diff --git a/imgs/kg4.png b/imgs/kg4.png
new file mode 100644
index 0000000..add1510
Binary files /dev/null and b/imgs/kg4.png differ
diff --git a/imgs/kg5.png b/imgs/kg5.png
new file mode 100644
index 0000000..6510d03
Binary files /dev/null and b/imgs/kg5.png differ