1.添加10秒超时;2.检查文件名是否合法;3.修复中止程序时无法保存爬取状态的问题
This commit is contained in:
parent
4035aa2625
commit
122a696e09
@@ -5,7 +5,7 @@ class HtmlDownloader(object):
|
||||
return None
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
|
||||
'Accept': 'text / html, application / xhtml + xml, application / xml;q = 0.9,image/webp, * / *;q = 0.8'}
|
||||
response = requests.get(url,headers=headers)
|
||||
response = requests.get(url,headers=headers,timeout=10)
|
||||
if response.status_code != 200:
|
||||
return None
|
||||
return response.content
|
||||
|
@@ -38,7 +38,7 @@ class HtmlParser(object):
|
||||
filename = title_node.get_text() + title_sub__text
|
||||
if not os.path.exists('webpages/'):
|
||||
os.mkdir('webpages/')
|
||||
with open('webpages/' + filename, 'w') as f:
|
||||
with open('webpages/' + filename.replace('/',''), 'w') as f:
|
||||
f.write(html_cont.decode('utf-8'))
|
||||
print('Save to disk filename:'+f.name+"")
|
||||
return res_data
|
||||
|
@@ -21,7 +21,7 @@ class MyThread(threading.Thread):
|
||||
lock.acquire()
|
||||
new_url = urls.get_new_url()
|
||||
lock.release()
|
||||
print('craw %d : %s' % (len(urls.old_urls), new_url))
|
||||
print('craw %d' % (len(urls.old_urls)))
|
||||
html_cont = downloader.download(new_url)
|
||||
new_urls, _ = parser.parse(new_url, html_cont)
|
||||
lock.acquire()
|
||||
@@ -63,4 +63,4 @@ if __name__=='__main__':
|
||||
print('error!', sys.exc_info()[0])
|
||||
finally:
|
||||
print('save state')
|
||||
pickle.dump(urls, open('urls.bin', 'wb'))
|
||||
pickle.dump(urls, open('urls.bin', 'wb'))
|
||||
|
Loading…
Reference in New Issue
Block a user