From a58941e5719d19e67f64e0494a4a0f62ac0fa0de Mon Sep 17 00:00:00 2001
From: Your Name
Date: Mon, 10 Jun 2019 10:25:41 +0800
Subject: [PATCH] Loosen /item/ link matching; start crawl from the site root

---
 spider/html_parser.py | 13 +++++++------
 spider/spider_main.py |  4 ++--
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/spider/html_parser.py b/spider/html_parser.py
index d7dccf0..a0d9dfa 100755
--- a/spider/html_parser.py
+++ b/spider/html_parser.py
@@ -9,16 +9,16 @@ class HtmlParser(object):
         sets = set()
         # /view/123.htm
         # 李·塔玛霍瑞 (Lee Tamahori)
-        links = soup.find_all('a',href=re.compile('/item/[%A-Z\u4E00-\u9FA5]+'))
+        links = soup.find_all('a',href=re.compile('/item/*'))
         for link in links:
-            temp=BeautifulSoup(str(link), 'lxml').find('a')['href'].replace('https://baike.baidu.com','')
+            temp=BeautifulSoup(str(link), 'lxml').find('a')['href']#.replace('https://baike.baidu.com','')
             result=urllib.parse.unquote(temp)
+            #print(result)
+            #result=re.findall('/item/*', result)
             # print(result)
-            result=re.findall('/[!@#¥$%^&*()_+-=·A-Za-z\'.:~:\u4E00-\u9FA50-9]+[/?#]?', result)
-            # print(result)
-            item=result[0]#.replace('/[0-9]+','').split('/')[2].replace('#hotspotmining','')
+            item=result#[0]#.replace('/[0-9]+','').split('/')[2].replace('#hotspotmining','')
             # print(item)
-            sets.add(urljoin('https://baike.baidu.com/item/',item))
+            sets.add(urljoin('https://baike.baidu.com/item',item))
             # print(urllib.parse.unquote(urljoin('https://baike.baidu.com', '/'.join(temp.find('a')['href'].split('/')[:5]))) )
             # maps[temp.find('a').contents[0]]=urllib.parse.unquote(urljoin('https://baike.baidu.com', '/'.join(temp.find('a')['href'].split('/'))))
         return sets
@@ -54,3 +54,4 @@ if __name__ == "__main__":
     new_urls, _ = parser.parse(content)
     cost=time.time()-start
     # print('\n'.join(new_urls),str(cost))
+    print(new_urls)
diff --git a/spider/spider_main.py b/spider/spider_main.py
index ff4e8d6..da788db 100755
--- a/spider/spider_main.py
+++ b/spider/spider_main.py
@@ -31,7 +31,7 @@ class MyThread(threading.Thread):
                 pages+=1
                 spendtime+=time.time()-start
                 cost=spendtime/pages
-                print(f"Thread:{self.name} id:{len(urls.old_urls)} URL:{urllib.parse.unquote(new_url).split('/')[-1]} {str(cost)[:4]}:sec/page")
+                print(f"Thread:{self.name} id:{len(urls.old_urls)} URL:{urllib.parse.unquote(new_url).replace('https://baike.baidu.com/item/','')} {str(cost)[:4]}:sec/page")
         except KeyboardInterrupt:
             print('save state',sys.exc_info())
             pickle.dump(urls, open('urls.bin', 'wb'))
@@ -42,7 +42,7 @@ class MyThread(threading.Thread):
 
 if __name__=='__main__':
     PATH='urls.pkl'
-    root_url = 'https://baike.baidu.com/item/%E6%96%87%E6%B1%87%E6%8A%A5'
+    root_url = 'https://baike.baidu.com'
     LOCK=threading.Lock()
     urls = url_manager.UrlManager()
    downloader = html_downloader.HtmlDownloader()
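
Note on the loosened pattern (a minimal standalone sketch, not part of the
patch; the sample hrefs are hypothetical): in re.compile('/item/*') the '*'
quantifies the preceding '/', so the pattern effectively matches any href
containing '/item', whereas the old pattern required a '%', an uppercase
letter, or a Chinese character right after '/item/'. A stricter spelling of
the new intent would be '/item/.+'.

    import re
    import urllib.parse
    from urllib.parse import urljoin

    old_pat = re.compile('/item/[%A-Z\u4E00-\u9FA5]+')  # old: '%', A-Z, or CJK after '/item/'
    new_pat = re.compile('/item/*')                     # new: '*' applies to '/', so bare '/item' matches too

    hrefs = ['/item/%E6%96%87%E6%B1%87%E6%8A%A5',  # percent-encoded item link
             '/item/abc123',                       # lowercase ASCII item link
             '/view/123.htm']                      # non-item link
    for h in hrefs:
        print(h, bool(old_pat.search(h)), bool(new_pat.search(h)))
    # -> True True / False True / False False

    # item is now the full unquoted href, an absolute path, so urljoin
    # replaces the base path even without a trailing slash on the base:
    print(urljoin('https://baike.baidu.com/item', urllib.parse.unquote(hrefs[0])))
    # -> https://baike.baidu.com/item/文汇报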