Your Name 2019-06-10 10:25:41 +08:00
parent d0c836be14
commit a58941e571
2 changed files with 9 additions and 8 deletions


@@ -9,16 +9,16 @@ class HtmlParser(object):
         sets = set()
         # /view/123.htm
         #<a target="_blank" href="/item/%E6%9D%8E%C2%B7%E5%A1%94%E7%8E%9B%E9%9C%8D%E7%91%9E/5486870" data-lemmaid="5486870">李·塔玛霍瑞</a>
-        links = soup.find_all('a',href=re.compile('/item/[%A-Z\u4E00-\u9FA5]+'))
+        links = soup.find_all('a',href=re.compile('/item/*'))
         for link in links:
-            temp=BeautifulSoup(str(link), 'lxml').find('a')['href'].replace('https://baike.baidu.com','')
+            temp=BeautifulSoup(str(link), 'lxml').find('a')['href']#.replace('https://baike.baidu.com','')
             result=urllib.parse.unquote(temp)
-            #print(result)
-            #result=re.findall('/item/*', result)
+            # print(result)
+            result=re.findall('/[@#¥$%^&*()_+-=·A-Za-z\'.:\u4E00-\u9FA50-9]+[/?#]?', result)
             # print(result)
-            item=result[0]#.replace('/[0-9]+','').split('/')[2].replace('#hotspotmining','')
+            item=result#[0]#.replace('/[0-9]+','').split('/')[2].replace('#hotspotmining','')
             # print(item)
-            sets.add(urljoin('https://baike.baidu.com/item/',item))
+            sets.add(urljoin('https://baike.baidu.com/item',item))
             # print(urllib.parse.unquote(urljoin('https://baike.baidu.com', '/'.join(temp.find('a')['href'].split('/')[:5]))) )
             # maps[temp.find('a').contents[0]]=urllib.parse.unquote(urljoin('https://baike.baidu.com', '/'.join(temp.find('a')['href'].split('/'))))
         return sets
@@ -54,3 +54,4 @@ if __name__ == "__main__":
     new_urls, _ = parser.parse(content)
     cost=time.time()-start
     # print('\n'.join(new_urls),str(cost))
+    print(new_urls)
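For context, the updated parse path boils down to: collect every anchor whose href matches /item/..., percent-decode it, and normalize it back onto the baike.baidu.com host. A simplified, self-contained sketch of that flow (not the file's exact code: sample_html is a made-up snippet mirroring the anchor quoted in the comment above, it assumes bs4 and lxml are installed, and it skips the intermediate re.findall step):

# Simplified sketch of the new link extraction; sample_html is fabricated.
import re
import urllib.parse
from urllib.parse import urljoin

from bs4 import BeautifulSoup

sample_html = ('<a target="_blank" href="/item/%E6%9D%8E%C2%B7%E5%A1%94'
               '%E7%8E%9B%E9%9C%8D%E7%91%9E/5486870" '
               'data-lemmaid="5486870">李·塔玛霍瑞</a>')

soup = BeautifulSoup(sample_html, 'lxml')
sets = set()
for link in soup.find_all('a', href=re.compile('/item/')):
    # The href is already on the parsed tag; re-parsing each link with a
    # fresh BeautifulSoup, as the diff does, is not required.
    result = urllib.parse.unquote(link['href'])      # decode %XX escapes
    sets.add(urljoin('https://baike.baidu.com/item', result))

print(sets)  # {'https://baike.baidu.com/item/李·塔玛霍瑞/5486870'}

Note that because the decoded href starts with '/', urljoin replaces the base path entirely, so the '/item/' vs '/item' change in the base URL only matters for relative hrefs.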


@@ -31,7 +31,7 @@ class MyThread(threading.Thread):
                 pages+=1
                 spendtime+=time.time()-start
                 cost=spendtime/pages
-                print(f"Thread:{self.name} id:{len(urls.old_urls)} URL:{urllib.parse.unquote(new_url).split('/')[-1]} {str(cost)[:4]}:sec/page")
+                print(f"Thread:{self.name} id:{len(urls.old_urls)} URL:{urllib.parse.unquote(new_url).replace('https://baike.baidu.com/item/','')} {str(cost)[:4]}:sec/page")
         except KeyboardInterrupt:
             print('save state',sys.exc_info())
             pickle.dump(urls, open('urls.bin', 'wb'))
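The change above only affects how the crawled URL is displayed: stripping the fixed prefix keeps multi-segment lemma paths (e.g. /item/name/12345) visible, where split('/')[-1] showed only the trailing segment. A runnable stand-in for the progress line (the thread name, counts, and timings here are fabricated):

# Stand-in for the per-page progress line; all values are fabricated.
import urllib.parse

spendtime = 0.0
pages = 0
new_url = 'https://baike.baidu.com/item/%E6%96%87%E6%B1%87%E6%8A%A5'

for page_seconds in (0.40, 0.55, 0.35):    # pretend per-page download times
    pages += 1
    spendtime += page_seconds
    cost = spendtime / pages                # running average, sec/page
    shown = urllib.parse.unquote(new_url).replace(
        'https://baike.baidu.com/item/', '')
    print(f"Thread:demo id:{pages} URL:{shown} {str(cost)[:4]}:sec/page")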
@@ -42,7 +42,7 @@ class MyThread(threading.Thread):
 if __name__=='__main__':
     PATH='urls.pkl'
-    root_url = 'https://baike.baidu.com/item/%E6%96%87%E6%B1%87%E6%8A%A5'
+    root_url = 'https://baike.baidu.com'
     LOCK=threading.Lock()
     urls = url_manager.UrlManager()
     downloader = html_downloader.HtmlDownloader()
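This hunk widens the seed from a single lemma page to the site root. The url_manager and html_downloader modules are not part of this diff; below is a hypothetical minimal stand-in for the manager, assuming only the new_urls/old_urls sets that the thread loop and the urls.bin pickle checkpoint rely on:

# Hypothetical minimal UrlManager; the real module is not shown in this diff.
class UrlManager:
    def __init__(self):
        self.new_urls = set()   # discovered but not yet crawled
        self.old_urls = set()   # already crawled (len() feeds the id: field)

    def add_new_urls(self, urls):
        for url in urls:
            if url not in self.new_urls and url not in self.old_urls:
                self.new_urls.add(url)

    def has_new_url(self):
        return bool(self.new_urls)

    def get_new_url(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url


manager = UrlManager()
manager.add_new_urls({'https://baike.baidu.com'})   # the new root_url
print(manager.get_new_url(), len(manager.old_urls))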