From a58941e5719d19e67f64e0494a4a0f62ac0fa0de Mon Sep 17 00:00:00 2001
From: Your Name
Date: Mon, 10 Jun 2019 10:25:41 +0800
Subject: [PATCH] Loosen /item/ link matching; start crawl from the site root

---
 spider/html_parser.py | 13 +++++++------
 spider/spider_main.py |  4 ++--
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/spider/html_parser.py b/spider/html_parser.py
index d7dccf0..a0d9dfa 100755
--- a/spider/html_parser.py
+++ b/spider/html_parser.py
@@ -9,16 +9,16 @@ class HtmlParser(object):
         sets = set()
         # /view/123.htm
         # 李·塔玛霍瑞 (Lee Tamahori)
-        links = soup.find_all('a',href=re.compile('/item/[%A-Z\u4E00-\u9FA5]+'))
+        links = soup.find_all('a',href=re.compile('/item/*'))
         for link in links:
-            temp=BeautifulSoup(str(link), 'lxml').find('a')['href'].replace('https://baike.baidu.com','')
+            temp=BeautifulSoup(str(link), 'lxml').find('a')['href']#.replace('https://baike.baidu.com','')
             result=urllib.parse.unquote(temp)
+            #print(result)
+            #result=re.findall('/item/*', result)
             # print(result)
-            result=re.findall('/[!@#¥$%^&*()_+-=·A-Za-z\'.:~:\u4E00-\u9FA50-9]+[/?#]?', result)
-            # print(result)
-            item=result[0]#.replace('/[0-9]+','').split('/')[2].replace('#hotspotmining','')
+            item=result#[0]#.replace('/[0-9]+','').split('/')[2].replace('#hotspotmining','')
             # print(item)
-            sets.add(urljoin('https://baike.baidu.com/item/',item))
+            sets.add(urljoin('https://baike.baidu.com/item',item))
             # print(urllib.parse.unquote(urljoin('https://baike.baidu.com', '/'.join(temp.find('a')['href'].split('/')[:5]))) )
             # maps[temp.find('a').contents[0]]=urllib.parse.unquote(urljoin('https://baike.baidu.com', '/'.join(temp.find('a')['href'].split('/'))))
         return sets
@@ -54,3 +54,4 @@ if __name__ == "__main__":
     new_urls, _ = parser.parse(content)
     cost=time.time()-start
     # print('\n'.join(new_urls),str(cost))
+    print(new_urls)
diff --git a/spider/spider_main.py b/spider/spider_main.py
index ff4e8d6..da788db 100755
--- a/spider/spider_main.py
+++ b/spider/spider_main.py
@@ -31,7 +31,7 @@ class MyThread(threading.Thread):
                 pages+=1
                 spendtime+=time.time()-start
                 cost=spendtime/pages
-                print(f"Thread:{self.name} id:{len(urls.old_urls)} URL:{urllib.parse.unquote(new_url).split('/')[-1]} {str(cost)[:4]}:sec/page")
+                print(f"Thread:{self.name} id:{len(urls.old_urls)} URL:{urllib.parse.unquote(new_url).replace('https://baike.baidu.com/item/','')} {str(cost)[:4]}:sec/page")
         except KeyboardInterrupt:
             print('save state',sys.exc_info())
             pickle.dump(urls, open('urls.bin', 'wb'))
@@ -42,7 +42,7 @@ class MyThread(threading.Thread):
 
 if __name__=='__main__':
     PATH='urls.pkl'
-    root_url = 'https://baike.baidu.com/item/%E6%96%87%E6%B1%87%E6%8A%A5'
+    root_url = 'https://baike.baidu.com'
     LOCK=threading.Lock()
     urls = url_manager.UrlManager()
    downloader = html_downloader.HtmlDownloader()
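
Note on the loosened pattern (a minimal standalone sketch, not part of the
patch; the sample hrefs are hypothetical): in re.compile('/item/*') the '*'
quantifies the preceding '/', so the pattern effectively matches any href
containing '/item', whereas the old pattern required a '%', an uppercase
letter, or a Chinese character right after '/item/'. A stricter spelling of
the new intent would be '/item/.+'.

    import re
    import urllib.parse
    from urllib.parse import urljoin

    old_pat = re.compile('/item/[%A-Z\u4E00-\u9FA5]+')  # old: '%', A-Z, or CJK after '/item/'
    new_pat = re.compile('/item/*')                     # new: '*' applies to '/', so bare '/item' matches too

    hrefs = ['/item/%E6%96%87%E6%B1%87%E6%8A%A5',  # percent-encoded item link
             '/item/abc123',                       # lowercase ASCII item link
             '/view/123.htm']                      # non-item link
    for h in hrefs:
        print(h, bool(old_pat.search(h)), bool(new_pat.search(h)))
    # -> True True / False True / False False

    # item is now the full unquoted href, an absolute path, so urljoin
    # replaces the base path even without a trailing slash on the base:
    print(urljoin('https://baike.baidu.com/item', urllib.parse.unquote(hrefs[0])))
    # -> https://baike.baidu.com/item/文汇报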