Optimize how Baidu Baike entries are extracted
This commit is contained in:
parent ba29936d21
commit 3479b8b1fe

.gitignore (vendored): Normal file → Executable file
@@ -1,13 +1,28 @@
 import requests
 
 # Function: fetch the web page for the given URL
 class HtmlDownloader(object):
     def download(self, url):
         if url is None:
             return None
-        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
+        headers_pc = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
                    'Accept': 'text / html, application / xhtml + xml, application / xml;q = 0.9,image/webp, * / *;q = 0.8'}
-        response = requests.get(url,headers=headers,timeout=10)
+        # headers_mobile={'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Mobile Safari/537.36',
+        #                 "Accept":"text / html, application / xhtml + xml, application / xml;q = 0.9,image/webp, * / *;q = 0.8"}
+        response = requests.get(url,headers=headers_pc,timeout=10)
         if response.status_code != 200:
             return None
         return response.content
+
+
+if __name__ == "__main__":
+    # https://baike.baidu.com/item/%E6%96%87%E6%B1%87%E6%8A%A5?bk_fr=chain_bottom&timestamp=1559566601712
+    downloader=HtmlDownloader()
+    htm=downloader.download('https://baike.baidu.com/item/文汇报').decode('utf-8')
+    content=open('temp.html','w')#wpf=3&ldr=1&page=1&insf=1&_=1559569199226
+    content.write(htm)
+    # 1559566851.4734867
+    # 1559569199226
+    # 1559567213000
+
+
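A minimal usage sketch for the updated downloader, kept separate from the diff above: it assumes the hunk lives in html_downloader.py (the module name used by the crawler's main script further down); the target lemma and output filename are illustrative only.

# Hedged sketch, not part of the commit: exercises HtmlDownloader.download(),
# which returns the raw bytes of the page (response.content) or None on failure.
from html_downloader import HtmlDownloader

downloader = HtmlDownloader()
raw = downloader.download('https://baike.baidu.com/item/文汇报')
if raw is not None:
    # download() returns bytes, so decode before writing as text.
    with open('temp.html', 'w', encoding='utf-8') as f:
        f.write(raw.decode('utf-8'))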
@@ -4,17 +4,24 @@ import re
 from urllib.parse import urljoin
 import os
 import urllib
 
 class HtmlParser(object):
     def _get_new_urls(self, soup):
-        maps = dict()
+        sets = set()
         # /view/123.htm
         #<a target="_blank" href="/item/%E6%9D%8E%C2%B7%E5%A1%94%E7%8E%9B%E9%9C%8D%E7%91%9E/5486870" data-lemmaid="5486870">李·塔玛霍瑞</a>
-        links = soup.find_all('a',href=re.compile('/item/[\u4E00-\u9FA5]+'))
+        links = soup.find_all('a',href=re.compile('/item/[%A-Z\u4E00-\u9FA5]+'))
         for link in links:
-            temp=BeautifulSoup(str(link), 'lxml')
-            maps[temp.find('a').contents[0]]=urljoin('https://baike.baidu.com', temp.find('a')['href'])
-        return maps
+            temp=BeautifulSoup(str(link), 'lxml').find('a')['href'].replace('https://baike.baidu.com','')
+            result=urllib.parse.unquote(temp)
+            # print(result)
+            result=re.findall('/[!@#¥$%^&*()_+-=·A-Za-z\'.:~:\u4E00-\u9FA50-9]+[/?#]?', result)
+            # print(result)
+            item=result[0].replace('/[0-9]+','').split('/')[2].replace('#hotspotmining','')
+            # print(item)
+            sets.add(urljoin('https://baike.baidu.com/item/',item))
+            # print(urllib.parse.unquote(urljoin('https://baike.baidu.com', '/'.join(temp.find('a')['href'].split('/')[:5]))) )
+            # maps[temp.find('a').contents[0]]=urllib.parse.unquote(urljoin('https://baike.baidu.com', '/'.join(temp.find('a')['href'].split('/'))))
+        return sets
 
     def _save_new_data(self, soup,html_cont):
         is_saved = False
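To make the reworked _get_new_urls easier to follow, here is a stand-alone sketch of the same unquote-then-regex extraction applied to one percent-encoded href (the sample href comes from the commented-out example in the hunk above); it is illustrative and not part of the commit.

import re
import urllib.parse
from urllib.parse import urljoin

# Sample href taken from the commented-out example in the hunk above.
href = '/item/%E6%9D%8E%C2%B7%E5%A1%94%E7%8E%9B%E9%9C%8D%E7%91%9E/5486870'

decoded = urllib.parse.unquote(href)   # '/item/李·塔玛霍瑞/5486870'
# Same pattern as the new loop body: grab the /item/... segment, then take the lemma name.
parts = re.findall('/[!@#¥$%^&*()_+-=·A-Za-z\'.:~:\u4E00-\u9FA50-9]+[/?#]?', decoded)
item = parts[0].split('/')[2].replace('#hotspotmining', '')
print(urljoin('https://baike.baidu.com/item/', item))   # absolute lemma URL rebuilt from the href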
@@ -23,19 +30,18 @@ class HtmlParser(object):
         path=os.path.join('.','webpages')#custom directory for webpages
         if not os.path.exists(path):
             os.mkdir(path)
-        with open(os.path.join(path ,title), 'w') as f:
+        with open(os.path.join(path ,title+'.html'), 'w') as f:
             f.write(html_cont.decode('utf-8'))
             print('Save to disk filename:'+f.name+"")
         return is_saved
 
     def parse(self, html_cont):
         if html_cont is None:
             return
-        soup = BeautifulSoup(html_cont, 'lxml')
-        # print(soup.prettify())
-        maps = self._get_new_urls( soup)
+        soup = BeautifulSoup((html_cont), 'lxml')
+        sets = self._get_new_urls( soup)
+        # print(sets)
         is_saved = self._save_new_data( soup,html_cont)
-        return list(maps.values()), is_saved
+        return sets, is_saved
 
 
 if __name__ == "__main__":
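A short sketch of how the two changed classes are wired together, mirroring the thread loop in the next hunk; the module names follow the imports used by the main script (html_downloader, html_parser), everything else is illustrative.

# Hedged sketch, not part of the commit.
from html_downloader import HtmlDownloader
from html_parser import HtmlParser

downloader = HtmlDownloader()
parser = HtmlParser()

html_cont = downloader.download('https://baike.baidu.com/item/%E6%96%87%E6%B1%87%E6%8A%A5')
if html_cont is not None:
    # parse() now returns a set of absolute /item/ URLs plus a saved-to-disk flag;
    # _save_new_data() also writes the page to ./webpages/<title>.html as a side effect.
    new_urls, is_saved = parser.parse(html_cont)
    print(len(new_urls), is_saved)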
@@ -3,40 +3,45 @@ import pickle
 import os
 import sys
 import threading
 
 
 import time
 import urllib
 class MyThread(threading.Thread):
-    def __init__(self):
+    def __init__(self,name):
         threading.Thread.__init__(self)
         self._running = True
+        self.name=name
+        # print(self.name)
     def terminate(self):
         self._running = False
     def run(self):
         try:
             while urls.has_new_url() and self._running:
+                start=time.time()
                 LOCK.acquire()
                 new_url = urls.get_new_url()
                 LOCK.release()
-                print('craw %d' % (len(urls.old_urls)),new_url)
                 html_cont = downloader.download(new_url)
                 new_urls, _ = parser.parse(html_cont)
                 LOCK.acquire()
                 urls.add_new_urls(new_urls)
+                spend=time.time()-start
                 LOCK.release()
+                print(f"Thread:{self.name} craw id:{len(urls.old_urls)} URL:{urllib.parse.unquote(new_url).split('/')[-1]} spend:{str(spend)}")
         except:
             print('save state',sys.exc_info())
             pickle.dump(urls, open('urls.bin', 'wb'))
 
 
 if __name__=='__main__':
 
     PATH='urls.pkl'
-    root_url = 'http://baike.baidu.com'
+    root_url = 'https://baike.baidu.com/item/%E6%96%87%E6%B1%87%E6%8A%A5'
     LOCK=threading.Lock()
     urls = url_manager.UrlManager()
     downloader = html_downloader.HtmlDownloader()
     parser = html_parser.HtmlParser()
     threads=[]
-    count_thread=36
+    count_thread=12
     if os.path.exists(PATH):
         urls=pickle.load(open(PATH,'rb'))
     else:
@@ -45,7 +50,7 @@ if __name__=='__main__':
     print(f'build urls,length={length}')
     for i in range(count_thread):
         print(f'build thread {i}...')
-        threads.append(MyThread())
+        threads.append(MyThread(str(i)))
     try:
         for t in threads:
             t.start()
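The worker loop above shares one UrlManager across threads and serializes access with a single Lock; below is a self-contained sketch of that pattern with a plain list standing in for url_manager.UrlManager (which is not part of this diff), so it can run on its own.

import threading
import time

LOCK = threading.Lock()
pending = ['https://baike.baidu.com/item/%E6%96%87%E6%B1%87%E6%8A%A5']   # stand-in for the URL manager
seen = set()

class Worker(threading.Thread):
    def __init__(self, name):
        threading.Thread.__init__(self)
        self._running = True           # same cooperative-stop flag as MyThread above
        self.name = name

    def terminate(self):
        self._running = False

    def run(self):
        while self._running:
            with LOCK:                 # take work only while holding the lock, as in run() above
                if not pending:
                    return
                url = pending.pop()
                seen.add(url)
            start = time.time()
            # downloading and parsing would happen here, outside the lock
            print(f'Thread:{self.name} crawled {url} in {time.time() - start:.3f}s')

threads = [Worker(str(i)) for i in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()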