1. Add a 10-second timeout; 2. Check that the filename is valid; 3. Fix the crawl state not being saved when the program is aborted

ruben 2018-01-18 20:33:54 +08:00
parent 4035aa2625
commit 122a696e09
3 changed files with 4 additions and 4 deletions

@@ -5,7 +5,7 @@ class HtmlDownloader(object):
             return None
         headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
                    'Accept': 'text / html, application / xhtml + xml, application / xml;q = 0.9,image/webp, * / *;q = 0.8'}
-        response = requests.get(url,headers=headers)
+        response = requests.get(url,headers=headers,timeout=10)
         if response.status_code != 200:
             return None
         return response.content
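
Note: in requests, timeout=10 is a connect/read timeout, not a cap on total download time; it raises requests.exceptions.Timeout when the server takes more than 10 seconds to respond or to send the next chunk. An unhandled timeout would kill the worker thread, so a caller would typically catch it. A minimal sketch of that handling, assuming a download wrapper shaped like the method in this diff:

import requests

def download(url):
    if url is None:
        return None
    headers = {'User-Agent': 'Mozilla/5.0'}  # trimmed; the full headers appear in the diff above
    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.exceptions.Timeout:
        return None  # a slow host is treated like any other failed download
    except requests.exceptions.RequestException:
        return None  # DNS failures, refused connections, etc.
    if response.status_code != 200:
        return None
    return response.content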

@@ -38,7 +38,7 @@ class HtmlParser(object):
         filename = title_node.get_text() + title_sub__text
         if not os.path.exists('webpages/'):
             os.mkdir('webpages/')
-        with open('webpages/' + filename, 'w') as f:
+        with open('webpages/' + filename.replace('/',''), 'w') as f:
             f.write(html_cont.decode('utf-8'))
         print('Save to disk filename:'+f.name+"")
         return res_data
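
Note: filename.replace('/','') fixes the immediate crash, since '/' is the path separator and, besides NUL, the only byte Linux forbids in a filename; a title consisting only of slashes would still yield an empty name, though. A slightly more defensive sketch (safe_filename is a hypothetical helper, not part of this commit):

def safe_filename(name, fallback='untitled'):
    # strip the path separator and NUL, the two bytes a Linux filename cannot contain
    cleaned = name.replace('/', '').replace('\x00', '').strip()
    # '.' and '..' are directory references, not usable filenames
    if cleaned in ('', '.', '..'):
        return fallback
    return cleaned

The open call would then read open('webpages/' + safe_filename(filename), 'w').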

@@ -21,7 +21,7 @@ class MyThread(threading.Thread):
             lock.acquire()
             new_url = urls.get_new_url()
             lock.release()
-            print('craw %d : %s' % (len(urls.old_urls), new_url))
+            print('craw %d' % (len(urls.old_urls)))
             html_cont = downloader.download(new_url)
             new_urls, _ = parser.parse(new_url, html_cont)
             lock.acquire()
@@ -63,4 +63,4 @@ if __name__=='__main__':
         print('error!', sys.exc_info()[0])
     finally:
         print('save state')
-        pickle.dump(urls, open('urls.bin', 'wb'))
+        pickle.dump(urls, open('urls.bin', 'wb'))
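
Note: the finally block is what lets the state survive an abort: Ctrl-C raises KeyboardInterrupt inside the try block, and finally still runs before the process exits. The one-liner relies on CPython closing the file when the object is garbage collected; a with block flushes and closes deterministically. A sketch of the save/restore pair, assuming the urls.bin path from this diff (save_state/load_state are hypothetical names):

import os
import pickle

def save_state(urls, path='urls.bin'):
    with open(path, 'wb') as f:  # deterministic flush/close, even on abnormal exits
        pickle.dump(urls, f)

def load_state(path='urls.bin'):
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)  # resume the previous crawl's URL manager
    return None  # caller falls back to a fresh manager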