update

2019-06-10 10:25:41 +08:00 · 2019-06-10 10:25:41 +08:00 · a58941e571
commit a58941e571
parent d0c836be14
2 changed files with 9 additions and 8 deletions
--- a/spider/html_parser.py
+++ b/spider/html_parser.py
@@ -9,16 +9,16 @@ class HtmlParser(object):
        sets = set()
        # /view/123.htm
        #<a target="_blank" href="/item/%E6%9D%8E%C2%B7%E5%A1%94%E7%8E%9B%E9%9C%8D%E7%91%9E/5486870" data-lemmaid="5486870">李·塔玛霍瑞</a>
-        links = soup.find_all('a',href=re.compile('/item/[%A-Z\u4E00-\u9FA5]+'))
+        links = soup.find_all('a',href=re.compile('/item/*'))
        for link in links:
-            temp=BeautifulSoup(str(link), 'lxml').find('a')['href'].replace('https://baike.baidu.com','')
+            temp=BeautifulSoup(str(link), 'lxml').find('a')['href']#.replace('https://baike.baidu.com','')
            result=urllib.parse.unquote(temp)
+            #print(result)
+            #result=re.findall('/item/*', result)
            # print(result)
-            result=re.findall('/[！@#￥$%^&*()_+-=·A-Za-z\'.：～:\u4E00-\u9FA50-9]+[/?#]?', result)
-            # print(result)
-            item=result[0]#.replace('/[0-9]+','').split('/')[2].replace('#hotspotmining','')
+            item=result#[0]#.replace('/[0-9]+','').split('/')[2].replace('#hotspotmining','')
            # print(item)
-            sets.add(urljoin('https://baike.baidu.com/item/',item))
+            sets.add(urljoin('https://baike.baidu.com/item',item))
            # print(urllib.parse.unquote(urljoin('https://baike.baidu.com', '/'.join(temp.find('a')['href'].split('/')[:5]))) )
            # maps[temp.find('a').contents[0]]=urllib.parse.unquote(urljoin('https://baike.baidu.com', '/'.join(temp.find('a')['href'].split('/')))) 
        return sets
@@ -54,3 +54,4 @@ if __name__ == "__main__":
    new_urls, _ = parser.parse(content)
    cost=time.time()-start
    # print('\n'.join(new_urls),str(cost))
+    print(new_urls)
--- a/spider/spider_main.py
+++ b/spider/spider_main.py
@@ -31,7 +31,7 @@ class MyThread(threading.Thread):
                pages+=1
                spendtime+=time.time()-start
                cost=spendtime/pages
-                print(f"Thread:{self.name} id:{len(urls.old_urls)} URL:{urllib.parse.unquote(new_url).split('/')[-1]} {str(cost)[:4]}:sec/page")
+                print(f"Thread:{self.name} id:{len(urls.old_urls)} URL:{urllib.parse.unquote(new_url).replace('https://baike.baidu.com/item/','')} {str(cost)[:4]}:sec/page")
            except KeyboardInterrupt:
                print('save state',sys.exc_info())
                pickle.dump(urls, open('urls.bin', 'wb'))
@@ -42,7 +42,7 @@ class MyThread(threading.Thread):
 if __name__=='__main__':

    PATH='urls.pkl'
-    root_url = 'https://baike.baidu.com/item/%E6%96%87%E6%B1%87%E6%8A%A5'
+    root_url = 'https://baike.baidu.com'
    LOCK=threading.Lock()
    urls = url_manager.UrlManager()
    downloader = html_downloader.HtmlDownloader()