1.增加日志；2.增加部分图片；3.优化部分逻辑；4.修改README.md

2019-11-21 23:46:16 +08:00 · 2019-11-21 23:46:16 +08:00 · 97252ff6b6
commit 97252ff6b6
parent 4492395a51
8 changed files with 76 additions and 52 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 __pycache__/
 webpages/*
 *.html
-*.txt
+*.txt
 *.log
--- a/README.md
+++ b/README.md
@@ -1,47 +1,59 @@
-#### Tips
+# 开源web知识图谱项目
 - 爬取百度百科中文页面
 - 解析三元组和网页内容
 - 构建中文知识图谱
 - 构建百科bot（构建中）
 ##### update 20191121
 - 迁移代码到爬虫框架scrapy
 - 优化了抽取部分代码
 - 数据持久化迁移到mongodb
 - 修复chatbot失效问题
 - 开放neo4j后台界面，可以查看知识图谱成型效果
 ##### Tips
 - 如果是项目问题，请提issue。
 - 如果涉及到不方便公开的，请发邮件。
 - ChatBot请访问[链接](http://bot.rubenxiao.com/)
-
+- 成型的百科知识图谱访问[链接](http://kg.rubenxiao.com/)，用户名：neo4j,密码：123。
 # 开源web知识图谱项目
 - 爬取百度百科中文页面
 - 抽取[100W+个三元组](https://raw.githubusercontent.com/lixiang0/WEB_KG/master/kg/triples.txt)
 - 构建中文知识图谱
 ### 环境
 - python 3.6
 - requests:网络请求
 - re:url正则匹配
- bs4:网页解析
+- scrapy:网页爬虫和网页解析
 - pickle:进度保存
 - threading:多线程
 - neo4j:知识图谱图数据库,安装可以参考[链接](http://blog.rubenxiao.com/posts/install-neo4j.html)
 - pip install neo4j-driver：neo4j python驱动
 - pip install pymongo：mongodb的python支持
 - mongodb数据库：安装参考[链接](https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/)
-### 代码目錄
+### 代码执行：
- spider/ 抓取原始网页
+```
- ie/ 从网页中解析正文，从正文中抽取结构化信息
+cd WEB_KG/baike
- kg/ 抽取三元組，存入neo4j数据库
+scrapy crawl baike
 ```
-
+执行界面(按ctrl+c停止)：
-### 代码执行顺序：
+![](./imgs/kg5.png)
 - 1.spider目录下执行：python spider_main.py
 - 2.ie目录下执行：python extract-para.py
 - 3.ie目录下执行：python extract-table.py
 - 4.kg目录下执行：python build-triple-from-table.py
 - 5.kg目录下执行：python insert_to_neo4j.py
 第二步本项目可以不执行。
 ### 知识图谱效果图
-![](./kg/kg.png)
+![](./imgs/kg.png)
 ### mongodb存储的网页内容
 ![](./imgs/kg3.png)
 ### mongodb存储的三元组
 ![](./imgs/kg4.png)
 ### neo4j后台界面
 ![](./imgs/kg2.png)
--- a/baike/spiders/baike.py
+++ b/baike/spiders/baike.py
@@ -8,8 +8,13 @@ import re
 import pymongo
 from scrapy.selector import Selector
 from neo4j.v1 import GraphDatabase
-
+import logging
-
+import time
 logfile_name = time.ctime(time.time()).replace(' ', '_')
 if not os.path.exists('logs/'):
    os.mkdir('logs/')
 logging.basicConfig(filename=f'logs/{logfile_name}.log', filemode='a+',
                    format='%(levelname)s - %(asctime)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S')
 class BaikeSpider(scrapy.Spider):
    name = 'baike'
    allowed_domains = ['baike.baidu.com']
@@ -59,29 +64,35 @@ class BaikeSpider(scrapy.Spider):
        # 处理三元组
        entity = ''.join(response.xpath(
-            '//h1/text()').extract()).replace('/', '')
+            '//h1/text()').getall()).replace('/', '')
        attrs = response.xpath(
-            '//dt[contains(@class,"basicInfo-item name")]').extract()
+            '//dt[contains(@class,"basicInfo-item name")]').getall()
        values = response.xpath(
-            '//dd[contains(@class,"basicInfo-item value")]').extract()
+            '//dd[contains(@class,"basicInfo-item value")]').getall()
        if len(attrs)!= len(values):
            return
        with self.driver.session() as session:
-            for i, attr in enumerate(attrs):
+            try:
                for attr,value in zip(attrs,values):
                    # attr
-                temp = Selector(text=attr).xpath(
+                    temp = Selector(text=attr).xpath(
-                    '//dt/text()|//dt/a/text()').extract()
+                        '//dt//text()').getall()
-                attr = ''.join(temp).replace('\n', '').replace('：', '').replace(
+                    attr = ''.join(temp).replace('\xa0', '')
-                    ':', '').replace('\xa0', '').replace(' ', '').replace('【', '').replace('】', '')
+                    # value
-                # value
+                    values = Selector(text=value).xpath(
-                temp = Selector(text=values[i]).xpath(
+                        '//dd/text()|//dd/a//text()').getall()
-                    '//dd/text()|//dd/a/text()').extract()
+                    for value in values:
-                value = ''.join(temp).replace('\n', '')
+                        try:
-                try:
+                            value=value.replace('\n','')
-                    self.db_triples.insert_one({
+                            logging.warning(entity+'_'+attr+'_'+value)
-                        "_id": entity+'_'+attr+'_'+value,
+                            self.db_triples.insert_one({
-                        "item_name": entity,
+                                "_id": entity+'_'+attr+'_'+value,
-                        "attr": attr,
+                                "item_name": entity,
-                        "value": value, }
+                                "attr": attr,
-                    )
+                                "value": value, }
-                except pymongo.errors.DuplicateKeyError:
+                            )
-                    pass
+                        except pymongo.errors.DuplicateKeyError:
-                session.write_transaction(self.add_node, entity, attr, value)
+                            pass
                        session.write_transaction(self.add_node, entity, attr, value)
            except Exception:
                logging.error('\n---'.join(attrs)+'\n_________________'+'\n---'.join(values))
--- a/imgs/kg.png
+++ b/imgs/kg.png
--- a/imgs/kg2.png
+++ b/imgs/kg2.png
--- a/imgs/kg3.png
+++ b/imgs/kg3.png
--- a/imgs/kg4.png
+++ b/imgs/kg4.png
--- a/imgs/kg5.png
+++ b/imgs/kg5.png