1. Add a 10-second timeout; 2. Check that the filename is valid; 3. Fix the crawl state not being saved when the program is aborted

ruben 2018-01-18 20:33:54 +08:00
parent 4035aa2625
commit 122a696e09
3 changed files with 4 additions and 4 deletions

@@ -5,7 +5,7 @@ class HtmlDownloader(object):
             return None
         headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
                    'Accept': 'text / html, application / xhtml + xml, application / xml;q = 0.9,image/webp, * / *;q = 0.8'}
-        response = requests.get(url,headers=headers)
+        response = requests.get(url,headers=headers,timeout=10)
         if response.status_code != 200:
             return None
         return response.content
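
Note: in requests, timeout=10 is a connect/read timeout, not a cap on total download time; it raises requests.exceptions.Timeout when the server takes more than 10 seconds to respond or to send the next chunk. An unhandled timeout would kill the worker thread, so a caller would typically catch it. A minimal sketch of that handling, assuming a download wrapper shaped like the method in this diff:

import requests

def download(url):
    if url is None:
        return None
    headers = {'User-Agent': 'Mozilla/5.0'}  # trimmed; the full headers appear in the diff above
    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.exceptions.Timeout:
        return None  # a slow host is treated like any other failed download
    except requests.exceptions.RequestException:
        return None  # DNS failures, refused connections, etc.
    if response.status_code != 200:
        return None
    return response.content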

@@ -38,7 +38,7 @@ class HtmlParser(object):
         filename = title_node.get_text() + title_sub__text
         if not os.path.exists('webpages/'):
             os.mkdir('webpages/')
-        with open('webpages/' + filename, 'w') as f:
+        with open('webpages/' + filename.replace('/',''), 'w') as f:
             f.write(html_cont.decode('utf-8'))
         print('Save to disk filename:'+f.name+"")
         return res_data
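
Note: filename.replace('/','') fixes the immediate crash, since '/' is the path separator and, besides NUL, the only byte Linux forbids in a filename; a title consisting only of slashes would still yield an empty name, though. A slightly more defensive sketch (safe_filename is a hypothetical helper, not part of this commit):

def safe_filename(name, fallback='untitled'):
    # strip the path separator and NUL, the two bytes a Linux filename cannot contain
    cleaned = name.replace('/', '').replace('\x00', '').strip()
    # '.' and '..' are directory references, not usable filenames
    if cleaned in ('', '.', '..'):
        return fallback
    return cleaned

The open call would then read open('webpages/' + safe_filename(filename), 'w').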

@@ -21,7 +21,7 @@ class MyThread(threading.Thread):
             lock.acquire()
             new_url = urls.get_new_url()
             lock.release()
-            print('craw %d : %s' % (len(urls.old_urls), new_url))
+            print('craw %d' % (len(urls.old_urls)))
             html_cont = downloader.download(new_url)
             new_urls, _ = parser.parse(new_url, html_cont)
             lock.acquire()
@@ -63,4 +63,4 @@ if __name__=='__main__':
         print('error!', sys.exc_info()[0])
     finally:
         print('save state')
-        pickle.dump(urls, open('urls.bin', 'wb'))
+        pickle.dump(urls, open('urls.bin', 'wb'))
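
Note: the finally block is what lets the state survive an abort: Ctrl-C raises KeyboardInterrupt inside the try block, and finally still runs before the process exits. The one-liner relies on CPython closing the file when the object is garbage collected; a with block flushes and closes deterministically. A sketch of the save/restore pair, assuming the urls.bin path from this diff (save_state/load_state are hypothetical names):

import os
import pickle

def save_state(urls, path='urls.bin'):
    with open(path, 'wb') as f:  # deterministic flush/close, even on abnormal exits
        pickle.dump(urls, f)

def load_state(path='urls.bin'):
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)  # resume the previous crawl's URL manager
    return None  # caller falls back to a fresh manager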