修改部分代码

This commit is contained in:
Your Name 2019-05-12 18:57:20 +08:00
parent 558ae5d005
commit 6a1f1ccdc6
3 changed files with 26 additions and 30 deletions

6
README.md Normal file → Executable file
View File

@ -5,11 +5,6 @@
- 如果涉及到不方便公开的,请发邮件。
- ChatBot请访问[链接](http://bot.rubenxiao.com)
### update 0907
- 1.修改网页保存路径为相对路径
- 2.删除多余的文件,只保留代码文件
# 开源web知识图谱项目
- 爬取百度百科中文页面
@ -44,6 +39,7 @@
- 4.kg目录下执行python build-triple-from-table.py
- 5.kg目录下执行python insert_to_neo4j.py
第二步本项目可以不执行。
### 知识图谱效果图

View File

@ -35,10 +35,10 @@ class HtmlParser(object):
except:
title_sub__text=''
filename = title_node.get_text() + title_sub__text
path='../webpages/'#custom diectory for webpages
path=os.path.join('.','webpages')#custom diectory for webpages
if not os.path.exists(path):
os.mkdir(path)
with open(path + filename.replace('/',''), 'w') as f:
with open(os.path.join(path ,filename.replace('/','')), 'w') as f:
f.write(html_cont.decode('utf-8'))
print('Save to disk filename:'+f.name+"")
return res_data

View File

@ -5,9 +5,6 @@ import sys
import threading
class MyThread(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
@ -17,43 +14,46 @@ class MyThread(threading.Thread):
def run(self):
    """Worker loop: repeatedly take a URL from the shared manager, download
    and parse it, and feed newly discovered URLs back into the manager.

    Relies on module-level globals set up in the ``__main__`` block:
    ``urls`` (URL manager), ``LOCK`` (threading.Lock), ``downloader``,
    and ``parser``.  Stops when no new URLs remain or ``self._running``
    is cleared (presumably by ``terminate()`` — confirm in MyThread).
    """
    try:
        while urls.has_new_url() and self._running:
            # `with LOCK:` guarantees release even if the manager raises,
            # unlike the manual acquire()/release() pair it replaces.
            with LOCK:
                new_url = urls.get_new_url()
            print('craw %d' % (len(urls.old_urls)))
            html_cont = downloader.download(new_url)
            new_urls, _ = parser.parse(new_url, html_cont)
            with LOCK:
                urls.add_new_urls(new_urls)
    except Exception:
        # Best-effort checkpoint so the crawl can resume after a crash;
        # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # still propagate.
        print('save state')
        with open('urls.bin', 'wb') as f:
            pickle.dump(urls, f)
if __name__ == '__main__':
    # Crawl state (the UrlManager) is checkpointed to PATH so an
    # interrupted run can be resumed.
    PATH = 'urls.pkl'
    root_url = 'http://baike.baidu.com'
    LOCK = threading.Lock()
    urls = url_manager.UrlManager()
    downloader = html_downloader.HtmlDownloader()
    parser = html_parser.HtmlParser()
    threads = []
    count_thread = 1

    # Resume from a previous checkpoint when one exists; otherwise seed
    # the frontier with the root URL.
    if os.path.exists(PATH):
        with open(PATH, 'rb') as f:
            urls = pickle.load(f)
    else:
        urls.add_new_url(root_url)
    length = len(urls.new_urls)
    print(f'build urls,length={length}')

    for i in range(count_thread):
        print(f'build thread {i}...')
        threads.append(MyThread())

    try:
        # Start every worker first, then join: starting and joining in the
        # same loop would run the threads one at a time.
        for t in threads:
            t.start()
        for t in threads:
            t.join()
    except BaseException:
        # BaseException so Ctrl-C (KeyboardInterrupt) also triggers a
        # graceful shutdown.  NOTE(review): threading.Thread has no
        # terminate(); assumes MyThread defines one that clears
        # self._running — confirm.
        for t in threads:
            t.terminate()
        print('error!', sys.exc_info()[0])
    finally:
        print('finished,saving state')
        with open(PATH, 'wb') as f:
            pickle.dump(urls, f)