from scrapy.selector import Selector
import pickle as pk
import glob
import os
# Example of the basic-info box scraped from one page (托普卡帕故宫 / Topkapı Palace):
#
#   中文名 (Chinese name)              托普卡帕故宫
#   外文名 (Foreign name)              Topkapı Sarayı
#   灭亡时间 (Year of dissolution)     1921年 (1921)
#   文物 (Artifacts)                   瓷器、官服、武器、盾牌 (porcelain, official robes, weapons, shields)
#   建议游玩时长 (Suggested visit)     1-2天 (1-2 days)
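# The info box above is assumed to follow Baidu-Baike-style markup, roughly:
#   <dt class="basicInfo-item name">中文名</dt>
#   <dd class="basicInfo-item value">托普卡帕故宫</dd>
# which is what the dt/dd XPath expressions below rely on.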
pages = glob.glob('../webpages/*')
savepath = './paged-table.bin'
print(len(pages))
print(pages[0])

# Resume from the list of already-processed pages if a previous run saved it.
paged = []
if os.path.exists(savepath):
    with open(savepath, 'rb') as f:
        paged = pk.load(f)
    print('load state')

# Make sure the output directory exists before writing any info tables.
os.makedirs('./info-table', exist_ok=True)

try:
    for page in pages:
        print('page:', page)
        if page in paged:
            continue
        with open(page, 'r', encoding='utf-8') as f:
            contents = f.read()
        # Extract the page title and all basic-info name/value pairs with XPath.
        selector = Selector(text=contents)
        title = ''.join(selector.xpath('//h1/text()').extract()).replace('/', '')
        names = selector.xpath('//dt[contains(@class,"basicInfo-item name")]').extract()
        values = selector.xpath('//dd[contains(@class,"basicInfo-item value")]').extract()
        print(len(names), len(values))
        lines = ''
        for i, name in enumerate(names):
            # Field name: text of the <dt>, including text inside <a> links.
            temp = Selector(text=name).xpath('//dt/text()|//dt/a/text()').extract()
            name = ''.join(temp).replace('\n', '')
            # Field value: text of the matching <dd>, including text inside <a> links.
            temp = Selector(text=values[i]).xpath('//dd/text()|//dd/a/text()').extract()
            value = ''.join(temp).replace('\n', '')
            lines += name + '$$' + value + '\n'
            print(name, value)
        print('process file:' + str(title))
        with open('./info-table/' + title + '.txt', 'w', encoding='utf-8') as output:
            output.write(lines)
        paged.append(page)
except Exception as e:
    # Save progress so an interrupted run can resume where it stopped.
    print('exception:', str(e))
    with open(savepath, 'wb') as f:
        pk.dump(paged, f)
    print('save state done')

with open(savepath, 'wb') as f:
    pk.dump(paged, f)
print('save state done')
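
# Each output file ./info-table/<title>.txt holds one "name$$value" pair per line, e.g.:
#   中文名$$托普卡帕故宫
#   外文名$$Topkapı Sarayı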