修改item的解析方式,解决一个词条多个页面的问题
This commit is contained in:
parent
aab330c932
commit
d0c836be14
@@ -16,7 +16,7 @@ class HtmlParser(object):
|
||||
# print(result)
|
||||
result=re.findall('/[!@#¥$%^&*()_+-=·A-Za-z\'.:~:\u4E00-\u9FA50-9]+[/?#]?', result)
|
||||
# print(result)
|
||||
item=result[0].replace('/[0-9]+','').split('/')[2].replace('#hotspotmining','')
|
||||
item=result[0]#.replace('/[0-9]+','').split('/')[2].replace('#hotspotmining','')
|
||||
# print(item)
|
||||
sets.add(urljoin('https://baike.baidu.com/item/',item))
|
||||
# print(urllib.parse.unquote(urljoin('https://baike.baidu.com', '/'.join(temp.find('a')['href'].split('/')[:5]))) )
|
||||
@@ -53,4 +53,4 @@ if __name__ == "__main__":
|
||||
start=time.time()
|
||||
new_urls, _ = parser.parse(content)
|
||||
cost=time.time()-start
|
||||
# print('\n'.join(new_urls),str(cost))
|
||||
# print('\n'.join(new_urls),str(cost))
|
||||
|
Loading…
Reference in New Issue
Block a user