修改部分代码

This commit is contained in:
Your Name 2019-05-12 18:57:20 +08:00
parent 558ae5d005
commit 6a1f1ccdc6
3 changed files with 26 additions and 30 deletions

6
README.md Normal file → Executable file
View File

@ -5,11 +5,6 @@
- 如果涉及到不方便公开的,请发邮件。
- ChatBot请访问[链接](http://bot.rubenxiao.com)
### update 0907
- 1.修改网页保存路径为相对路径
- 2.删除多余的文件,只保留代码文件
# 开源web知识图谱项目
- 爬取百度百科中文页面
@ -44,6 +39,7 @@
- 4.kg目录下执行python build-triple-from-table.py
- 5.kg目录下执行python insert_to_neo4j.py
第二步本项目可以不执行。
### 知识图谱效果图

View File

@ -35,10 +35,10 @@ class HtmlParser(object):
except:
title_sub__text=''
filename = title_node.get_text() + title_sub__text
path='../webpages/'#custom diectory for webpages
path=os.path.join('.','webpages')#custom diectory for webpages
if not os.path.exists(path):
os.mkdir(path)
with open(path + filename.replace('/',''), 'w') as f:
with open(os.path.join(path ,filename.replace('/','')), 'w') as f:
f.write(html_cont.decode('utf-8'))
print('Save to disk filename:'+f.name+"")
return res_data

View File

@ -5,9 +5,6 @@ import sys
import threading
class MyThread(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
@ -17,43 +14,46 @@ class MyThread(threading.Thread):
def run(self):
    """Worker loop: repeatedly take a URL from the shared manager, download
    and parse it, and feed newly discovered URLs back into the manager.

    Relies on module-level globals set up in the ``__main__`` block:
    ``urls`` (URL manager), ``LOCK`` (threading.Lock), ``downloader``,
    and ``parser``.  Stops when no new URLs remain or ``self._running``
    is cleared (presumably by ``terminate()`` — confirm in MyThread).
    """
    try:
        while urls.has_new_url() and self._running:
            # `with LOCK:` guarantees release even if the manager raises,
            # unlike the manual acquire()/release() pair it replaces.
            with LOCK:
                new_url = urls.get_new_url()
            print('craw %d' % (len(urls.old_urls)))
            html_cont = downloader.download(new_url)
            new_urls, _ = parser.parse(new_url, html_cont)
            with LOCK:
                urls.add_new_urls(new_urls)
    except Exception:
        # Best-effort checkpoint so the crawl can resume after a crash;
        # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # still propagate.
        print('save state')
        with open('urls.bin', 'wb') as f:
            pickle.dump(urls, f)
if __name__ == '__main__':
    # Crawl state (the UrlManager) is checkpointed to PATH so an
    # interrupted run can be resumed.
    PATH = 'urls.pkl'
    root_url = 'http://baike.baidu.com'
    LOCK = threading.Lock()
    urls = url_manager.UrlManager()
    downloader = html_downloader.HtmlDownloader()
    parser = html_parser.HtmlParser()
    threads = []
    count_thread = 1

    # Resume from a previous checkpoint when one exists; otherwise seed
    # the frontier with the root URL.
    if os.path.exists(PATH):
        with open(PATH, 'rb') as f:
            urls = pickle.load(f)
    else:
        urls.add_new_url(root_url)
    length = len(urls.new_urls)
    print(f'build urls,length={length}')

    for i in range(count_thread):
        print(f'build thread {i}...')
        threads.append(MyThread())

    try:
        # Start every worker first, then join: starting and joining in the
        # same loop would run the threads one at a time.
        for t in threads:
            t.start()
        for t in threads:
            t.join()
    except BaseException:
        # BaseException so Ctrl-C (KeyboardInterrupt) also triggers a
        # graceful shutdown.  NOTE(review): threading.Thread has no
        # terminate(); assumes MyThread defines one that clears
        # self._running — confirm.
        for t in threads:
            t.terminate()
        print('error!', sys.exc_info()[0])
    finally:
        print('finished,saving state')
        with open(PATH, 'wb') as f:
            pickle.dump(urls, f)