修改部分代码
This commit is contained in:
parent
558ae5d005
commit
6a1f1ccdc6
6
README.md
Normal file → Executable file
6
README.md
Normal file → Executable file
@ -5,11 +5,6 @@
|
||||
- 如果涉及到不方便公开的,请发邮件。
|
||||
- ChatBot请访问[链接](http://bot.rubenxiao.com)
|
||||
|
||||
### update 0907
|
||||
|
||||
- 1.修改网页保存路径为相对路径
|
||||
- 2.删除多余的文件,只保留代码文件
|
||||
|
||||
# 开源web知识图谱项目
|
||||
|
||||
- 爬取百度百科中文页面
|
||||
@ -44,6 +39,7 @@
|
||||
- 4.kg目录下执行:python build-triple-from-table.py
|
||||
- 5.kg目录下执行:python insert_to_neo4j.py
|
||||
|
||||
第二步本项目可以不执行。
|
||||
|
||||
### 知识图谱效果图
|
||||
|
||||
|
@ -35,10 +35,10 @@ class HtmlParser(object):
|
||||
except:
|
||||
title_sub__text=''
|
||||
filename = title_node.get_text() + title_sub__text
|
||||
path='../webpages/'#custom directory for webpages
|
||||
path=os.path.join('.','webpages')#custom directory for webpages
|
||||
if not os.path.exists(path):
|
||||
os.mkdir(path)
|
||||
with open(path + filename.replace('/',''), 'w') as f:
|
||||
with open(os.path.join(path ,filename.replace('/','')), 'w') as f:
|
||||
f.write(html_cont.decode('utf-8'))
|
||||
print('Save to disk filename:'+f.name+"")
|
||||
return res_data
|
||||
|
@ -5,9 +5,6 @@ import sys
|
||||
import threading
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class MyThread(threading.Thread):
|
||||
def __init__(self):
|
||||
threading.Thread.__init__(self)
|
||||
@ -17,43 +14,46 @@ class MyThread(threading.Thread):
|
||||
def run(self):
|
||||
try:
|
||||
while urls.has_new_url() and self._running:
|
||||
global i
|
||||
lock.acquire()
|
||||
LOCK.acquire()
|
||||
new_url = urls.get_new_url()
|
||||
lock.release()
|
||||
LOCK.release()
|
||||
print('craw %d' % (len(urls.old_urls)))
|
||||
html_cont = downloader.download(new_url)
|
||||
new_urls, _ = parser.parse(new_url, html_cont)
|
||||
lock.acquire()
|
||||
LOCK.acquire()
|
||||
urls.add_new_urls(new_urls)
|
||||
lock.release()
|
||||
LOCK.release()
|
||||
except:
|
||||
print('save state')
|
||||
pickle.dump(urls, open('urls.bin', 'wb'))
|
||||
|
||||
|
||||
if __name__=='__main__':
|
||||
PATH='urls.pkl'
|
||||
root_url = 'http://baike.baidu.com'
|
||||
lock=threading.Lock()
|
||||
LOCK=threading.Lock()
|
||||
urls = url_manager.UrlManager()
|
||||
downloader = html_downloader.HtmlDownloader()
|
||||
parser = html_parser.HtmlParser()
|
||||
list_thread=[]
|
||||
count_thread=12
|
||||
threads=[]
|
||||
count_thread=1
|
||||
if os.path.exists(PATH):
|
||||
urls=pickle.load(open(PATH,'rb'))
|
||||
else:
|
||||
urls.add_new_url(root_url)
|
||||
length=len(urls.new_urls)
|
||||
print(f'build urls,length={length}')
|
||||
for i in range(count_thread):
|
||||
list_thread.append(MyThread())
|
||||
print(f'build thread {i}...')
|
||||
threads.append(MyThread())
|
||||
try:
|
||||
if os.path.exists('urls.bin'):
|
||||
urls=pickle.load(open('urls.bin','rb'))
|
||||
else:
|
||||
urls.add_new_url(root_url)
|
||||
for th in list_thread:
|
||||
th.start()
|
||||
th.join()
|
||||
for t in threads:
|
||||
t.start()
|
||||
t.join()
|
||||
except:
|
||||
for th in list_thread:
|
||||
th.terminate()
|
||||
for t in threads:
|
||||
t.terminate()
|
||||
print('error!', sys.exc_info()[0])
|
||||
finally:
|
||||
print('save state')
|
||||
pickle.dump(urls, open('urls.bin', 'wb'))
|
||||
print('finished,saving state')
|
||||
pickle.dump(urls, open(PATH, 'wb'))
|
||||
|
Loading…
Reference in New Issue
Block a user