修改item的解析方式,解决一个词条多个页面的问题

This commit is contained in:
Your Name 2019-06-10 10:11:05 +08:00
parent aab330c932
commit d0c836be14

View File

@@ -16,7 +16,7 @@ class HtmlParser(object):
# print(result)
result=re.findall('/[@#¥$%^&*()_+-=·A-Za-z\'.:\u4E00-\u9FA50-9]+[/?#]?', result)
# print(result)
item=result[0].replace('/[0-9]+','').split('/')[2].replace('#hotspotmining','')
item=result[0]#.replace('/[0-9]+','').split('/')[2].replace('#hotspotmining','')
# print(item)
sets.add(urljoin('https://baike.baidu.com/item/',item))
# print(urllib.parse.unquote(urljoin('https://baike.baidu.com', '/'.join(temp.find('a')['href'].split('/')[:5]))) )
@@ -53,4 +53,4 @@ if __name__ == "__main__":
start=time.time()
new_urls, _ = parser.parse(content)
cost=time.time()-start
# print('\n'.join(new_urls),str(cost))
# print('\n'.join(new_urls),str(cost))