Optimize the way Baike entry links are extracted

This commit is contained in:
Your Name 2019-06-05 11:46:01 +08:00
parent ba29936d21
commit 3479b8b1fe
4 changed files with 47 additions and 21 deletions

.gitignore (vendored): Normal file → Executable file

@@ -1,13 +1,28 @@
import requests
# Purpose: download the web page for the given URL
class HtmlDownloader(object):
def download(self, url):
if url is None:
return None
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
headers_pc = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
'Accept': 'text / html, application / xhtml + xml, application / xml;q = 0.9,image/webp, * / *;q = 0.8'}
response = requests.get(url,headers=headers,timeout=10)
# headers_mobile={'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Mobile Safari/537.36',
# "Accept":"text / html, application / xhtml + xml, application / xml;q = 0.9,image/webp, * / *;q = 0.8"}
response = requests.get(url,headers=headers_pc,timeout=10)
if response.status_code != 200:
return None
return response.content
if __name__ == "__main__":
#https://baike.baidu.com/item/%E6%96%87%E6%B1%87%E6%8A%A5?bk_fr=chain_bottom&timestamp=1559566601712
downloader=HtmlDownloader()
htm=downloader.download('https://baike.baidu.com/item/文汇报').decode('utf-8')
content=open('temp.html','w')#wpf=3&ldr=1&page=1&insf=1&_=1559569199226
content.write(htm)
# 1559566851.4734867
# 1559569199226
# 1559567213000

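The commented-out headers_mobile dict suggests the downloader may later switch between desktop and mobile user agents. As a point of reference only, here is a minimal sketch of how download() could take that choice as a parameter; this is an illustration, not code from the commit, though both UA strings are the ones shown above:

import requests

# Illustrative sketch: a download() variant that picks the user agent per call.
def download(url, mobile=False, timeout=10):
    if url is None:
        return None
    ua_pc = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 '
             '(KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36')
    ua_mobile = ('Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 '
                 '(KHTML, like Gecko) Chrome/74.0.3729.169 Mobile Safari/537.36')
    headers = {'User-Agent': ua_mobile if mobile else ua_pc}
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.content  # bytes, or None on failure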
@@ -4,17 +4,24 @@ import re
from urllib.parse import urljoin
import os
import urllib
class HtmlParser(object):
def _get_new_urls(self, soup):
maps = dict()
sets = set()
# /view/123.htm
#<a target="_blank" href="/item/%E6%9D%8E%C2%B7%E5%A1%94%E7%8E%9B%E9%9C%8D%E7%91%9E/5486870" data-lemmaid="5486870">李·塔玛霍瑞</a>
links = soup.find_all('a',href=re.compile('/item/[\u4E00-\u9FA5]+'))
links = soup.find_all('a',href=re.compile('/item/[%A-Z\u4E00-\u9FA5]+'))
for link in links:
temp=BeautifulSoup(str(link), 'lxml')
maps[temp.find('a').contents[0]]=urljoin('https://baike.baidu.com', temp.find('a')['href'])
return maps
temp=BeautifulSoup(str(link), 'lxml').find('a')['href'].replace('https://baike.baidu.com','')
result=urllib.parse.unquote(temp)
# print(result)
result=re.findall('/[@#¥$%^&*()_+-=·A-Za-z\'.:\u4E00-\u9FA50-9]+[/?#]?', result)
# print(result)
item=result[0].replace('/[0-9]+','').split('/')[2].replace('#hotspotmining','')
# print(item)
sets.add(urljoin('https://baike.baidu.com/item/',item))
# print(urllib.parse.unquote(urljoin('https://baike.baidu.com', '/'.join(temp.find('a')['href'].split('/')[:5]))) )
# maps[temp.find('a').contents[0]]=urllib.parse.unquote(urljoin('https://baike.baidu.com', '/'.join(temp.find('a')['href'].split('/'))))
return sets
def _save_new_data(self, soup,html_cont):
is_saved = False
@@ -23,19 +30,18 @@ class HtmlParser(object):
path=os.path.join('.','webpages')  # custom directory for webpages
if not os.path.exists(path):
os.mkdir(path)
with open(os.path.join(path ,title), 'w') as f:
with open(os.path.join(path ,title+'.html'), 'w') as f:
f.write(html_cont.decode('utf-8'))
print('Save to disk filename:'+f.name+"")
return is_saved
def parse(self, html_cont):
if html_cont is None:
return
soup = BeautifulSoup(html_cont, 'lxml')
# print(soup.prettify())
maps = self._get_new_urls( soup)
soup = BeautifulSoup((html_cont), 'lxml')
sets = self._get_new_urls( soup)
# print(sets)
is_saved = self._save_new_data( soup,html_cont)
return list(maps.values()), is_saved
return sets, is_saved
if __name__ == "__main__":

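To see what the reworked _get_new_urls produces, here is a standalone sketch of the normalization idea applied to the sample href quoted in the comment above. The regex is a simplified stand-in for the character-class pattern used in the commit, so treat it as an illustration rather than the committed logic:

import re
import urllib.parse
from urllib.parse import urljoin

# Sample percent-encoded href taken from the comment in _get_new_urls.
href = '/item/%E6%9D%8E%C2%B7%E5%A1%94%E7%8E%9B%E9%9C%8D%E7%91%9E/5486870'

# 1. Decode the percent-encoding so the entry name becomes readable.
decoded = urllib.parse.unquote(href)          # '/item/李·塔玛霍瑞/5486870'

# 2. Keep only the entry name, dropping the numeric lemma id and any fragment.
match = re.match(r'/item/([^/?#]+)', decoded)
if match:
    canonical = urljoin('https://baike.baidu.com/item/', match.group(1))
    print(canonical)                          # https://baike.baidu.com/item/李·塔玛霍瑞

Collecting these canonical URLs in a set, as the commit does, de-duplicates links that differ only in their numeric lemma id or a #hotspotmining fragment.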
@@ -3,40 +3,45 @@ import pickle
import os
import sys
import threading
import time
import urllib
class MyThread(threading.Thread):
def __init__(self):
def __init__(self,name):
threading.Thread.__init__(self)
self._running = True
self.name=name
# print(self.name)
def terminate(self):
self._running = False
def run(self):
try:
while urls.has_new_url() and self._running:
start=time.time()
LOCK.acquire()
new_url = urls.get_new_url()
LOCK.release()
print('craw %d' % (len(urls.old_urls)),new_url)
html_cont = downloader.download(new_url)
new_urls, _ = parser.parse(html_cont)
LOCK.acquire()
urls.add_new_urls(new_urls)
spend=time.time()-start
LOCK.release()
print(f"Thread:{self.name} craw id:{len(urls.old_urls)} URL:{urllib.parse.unquote(new_url).split('/')[-1]} spend:{str(spend)}")
except:
print('save state',sys.exc_info())
pickle.dump(urls, open('urls.bin', 'wb'))
if __name__=='__main__':
PATH='urls.pkl'
root_url = 'http://baike.baidu.com'
root_url = 'https://baike.baidu.com/item/%E6%96%87%E6%B1%87%E6%8A%A5'
LOCK=threading.Lock()
urls = url_manager.UrlManager()
downloader = html_downloader.HtmlDownloader()
parser = html_parser.HtmlParser()
threads=[]
count_thread=36
count_thread=12
if os.path.exists(PATH):
urls=pickle.load(open(PATH,'rb'))
else:
@@ -45,7 +50,7 @@ if __name__=='__main__':
print(f'build urls,length={length}')
for i in range(count_thread):
print(f'build thread {i}...')
threads.append(MyThread())
threads.append(MyThread(str(i)))
try:
for t in threads:
t.start()
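
The hunk stops just after the worker threads are started, so the shutdown path is not visible here. For orientation only, a hedged sketch of how a crawl like this is commonly wound down, reusing the urls, threads, PATH, and pickle names from above; it is not necessarily the repository's actual code:

try:
    for t in threads:
        t.start()
    for t in threads:
        t.join()                    # wait until every worker runs out of URLs
except KeyboardInterrupt:
    for t in threads:
        t.terminate()               # workers exit at their next check of self._running
    for t in threads:
        t.join()
finally:
    # Persist crawl state so the next run picks up where this one stopped.
    pickle.dump(urls, open(PATH, 'wb'))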