update
This commit is contained in:
parent
d0c836be14
commit
a58941e571
@ -9,16 +9,16 @@ class HtmlParser(object):
|
||||
sets = set()
|
||||
# /view/123.htm
|
||||
#<a target="_blank" href="/item/%E6%9D%8E%C2%B7%E5%A1%94%E7%8E%9B%E9%9C%8D%E7%91%9E/5486870" data-lemmaid="5486870">李·塔玛霍瑞</a>
|
||||
links = soup.find_all('a',href=re.compile('/item/[%A-Z\u4E00-\u9FA5]+'))
|
||||
links = soup.find_all('a',href=re.compile('/item/*'))
|
||||
for link in links:
|
||||
temp=BeautifulSoup(str(link), 'lxml').find('a')['href'].replace('https://baike.baidu.com','')
|
||||
temp=BeautifulSoup(str(link), 'lxml').find('a')['href']#.replace('https://baike.baidu.com','')
|
||||
result=urllib.parse.unquote(temp)
|
||||
#print(result)
|
||||
#result=re.findall('/item/*', result)
|
||||
# print(result)
|
||||
result=re.findall('/[!@#¥$%^&*()_+-=·A-Za-z\'.:~:\u4E00-\u9FA50-9]+[/?#]?', result)
|
||||
# print(result)
|
||||
item=result[0]#.replace('/[0-9]+','').split('/')[2].replace('#hotspotmining','')
|
||||
item=result#[0]#.replace('/[0-9]+','').split('/')[2].replace('#hotspotmining','')
|
||||
# print(item)
|
||||
sets.add(urljoin('https://baike.baidu.com/item/',item))
|
||||
sets.add(urljoin('https://baike.baidu.com/item',item))
|
||||
# print(urllib.parse.unquote(urljoin('https://baike.baidu.com', '/'.join(temp.find('a')['href'].split('/')[:5]))) )
|
||||
# maps[temp.find('a').contents[0]]=urllib.parse.unquote(urljoin('https://baike.baidu.com', '/'.join(temp.find('a')['href'].split('/'))))
|
||||
return sets
|
||||
@ -54,3 +54,4 @@ if __name__ == "__main__":
|
||||
new_urls, _ = parser.parse(content)
|
||||
cost=time.time()-start
|
||||
# print('\n'.join(new_urls),str(cost))
|
||||
print(new_urls)
|
||||
|
@ -31,7 +31,7 @@ class MyThread(threading.Thread):
|
||||
pages+=1
|
||||
spendtime+=time.time()-start
|
||||
cost=spendtime/pages
|
||||
print(f"Thread:{self.name} id:{len(urls.old_urls)} URL:{urllib.parse.unquote(new_url).split('/')[-1]} {str(cost)[:4]}:sec/page")
|
||||
print(f"Thread:{self.name} id:{len(urls.old_urls)} URL:{urllib.parse.unquote(new_url).replace('https://baike.baidu.com/item/','')} {str(cost)[:4]}:sec/page")
|
||||
except KeyboardInterrupt:
|
||||
print('save state',sys.exc_info())
|
||||
pickle.dump(urls, open('urls.bin', 'wb'))
|
||||
@ -42,7 +42,7 @@ class MyThread(threading.Thread):
|
||||
if __name__=='__main__':
|
||||
|
||||
PATH='urls.pkl'
|
||||
root_url = 'https://baike.baidu.com/item/%E6%96%87%E6%B1%87%E6%8A%A5'
|
||||
root_url = 'https://baike.baidu.com'
|
||||
LOCK=threading.Lock()
|
||||
urls = url_manager.UrlManager()
|
||||
downloader = html_downloader.HtmlDownloader()
|
||||
|
Loading…
Reference in New Issue
Block a user