Add paragraph extraction and table extraction scripts
This commit is contained in:
parent
86f6f51763
commit
eff2867efc
36
extract-para.py
Normal file
36
extract-para.py
Normal file
@@ -0,0 +1,36 @@
|
||||
import re
|
||||
from scrapy.selector import Selector
|
||||
|
||||
import pickle as pk
|
||||
import glob
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
pages=glob.glob('/data/ruben/data/webpages/*')
|
||||
savepath='./paged.bin'
|
||||
print(len(pages))
|
||||
print(pages[0])
|
||||
paged=[]
|
||||
if os.path.exists(savepath):
|
||||
paged=pk.load(open(savepath,'rb'))
|
||||
print('load state')
|
||||
try:
|
||||
for page in pages:
|
||||
if page in paged:
|
||||
continue
|
||||
contents = open(page,'r').read()
|
||||
info_data = {}
|
||||
# print(contents)
|
||||
#用Xpath提取出<div class="para"></div>中的所有内容
|
||||
selector=Selector(text=contents)
|
||||
line=selector.xpath('//div[contains(@class, "main-content")]')
|
||||
title=line.xpath('//h1/text()').extract()
|
||||
para=''.join(word for word in line.xpath('//div[contains(@class, "para")]/text()').extract() if len(word)>1)
|
||||
print('process file:'+str(title))
|
||||
output = open('./info-para/'+''.join(title)+'.txt','w')
|
||||
output.write(para)
|
||||
output.close()
|
||||
paged.append(page)
|
||||
except:
|
||||
pk.dump(paged,open(savepath,'wb'))
|
||||
print('save state done')
|
81
extract-table.py
Executable file
81
extract-table.py
Executable file
@@ -0,0 +1,81 @@
|
||||
import re
|
||||
from scrapy.selector import Selector
|
||||
|
||||
import pickle as pk
|
||||
import glob
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
|
||||
# <div class="basic-info cmn-clearfix">
|
||||
# <dl class="basicInfo-block basicInfo-left">
|
||||
# <dt class="basicInfo-item name">中文名</dt>
|
||||
# <dd class="basicInfo-item value">
|
||||
# 托普卡帕故宫
|
||||
# </dd>
|
||||
# <dt class="basicInfo-item name">外文名</dt>
|
||||
# <dd class="basicInfo-item value">
|
||||
# Topkapı Sarayı
|
||||
# </dd>
|
||||
# </dl>
|
||||
|
||||
#<dl class="basicInfo-block basicInfo-right">
|
||||
# <dt class="basicInfo-item name">灭亡时间</dt>
|
||||
# <dd class="basicInfo-item value">
|
||||
# 1921年
|
||||
# </dd>
|
||||
# <dt class="basicInfo-item name">文 物</dt>
|
||||
# <dd class="basicInfo-item value">
|
||||
# <a target="_blank" href="/item/%E7%93%B7%E5%99%A8">瓷器</a>、官服、武器、盾牌
|
||||
# </dd>
|
||||
# <dt class="basicInfo-item name">建议游玩时长</dt>
|
||||
# <dd class="basicInfo-item value">
|
||||
# 1-2天
|
||||
# </dd>
|
||||
# </dl></div>
|
||||
|
||||
pages=glob.glob('/data/ruben/data/webpages/*')
|
||||
savepath='./paged-table.bin'
|
||||
print(len(pages))
|
||||
print(pages[0])
|
||||
paged=[]
|
||||
if os.path.exists(savepath):
|
||||
paged=pk.load(open(savepath,'rb'))
|
||||
print('load state')
|
||||
try:
|
||||
for page in pages:
|
||||
if page in paged:
|
||||
continue
|
||||
contents = open(page,'r').read()
|
||||
info_data = {}
|
||||
print(page)
|
||||
#用Xpath提取出<div class="para"></div>中的所有内容
|
||||
selector=Selector(text=contents)
|
||||
title=''.join(selector.xpath('//h1/text()').extract()).replace('/','')
|
||||
names=selector.xpath('//dt[contains(@class,"basicInfo-item name")]').extract()
|
||||
values=selector.xpath('//dd[contains(@class,"basicInfo-item value")]').extract()
|
||||
print(len(names),len(values))
|
||||
lines=''
|
||||
for i,name in enumerate(names):
|
||||
#name
|
||||
temp=Selector(text=name).xpath('//dt/text()|//dt/a/text()').extract()
|
||||
name=''.join(temp).replace('\n','')
|
||||
#value
|
||||
temp=Selector(text=values[i]).xpath('//dd/text()|//dd/a/text()').extract()
|
||||
value=''.join(temp).replace('\n','')
|
||||
|
||||
lines+=name+'$$'+value+'\n'
|
||||
print(name,value)
|
||||
print('process file:'+str(title))
|
||||
output = open('./info-table/'+title+'.txt','w')
|
||||
output.write(lines)
|
||||
output.close()
|
||||
paged.append(page)
|
||||
except Exception as e:
|
||||
print('exception:',str(e))
|
||||
pk.dump(paged,open(savepath,'wb'))
|
||||
print('save state done')
|
||||
pk.dump(paged,open(savepath,'wb'))
|
||||
print('save state done')
|
||||
|
||||
|
@ -1,25 +0,0 @@
|
||||
import re
|
||||
from scrapy.selector import Selector
|
||||
|
||||
|
||||
import glob
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
pages=glob.glob('/data/ruben/data/webpages/*')
|
||||
print(len(pages))
|
||||
print(pages[0])
|
||||
|
||||
for page in pages:
|
||||
contents = open(page,'r').read()
|
||||
info_data = {}
|
||||
# print(contents)
|
||||
#用Xpath提取出<div class="para"></div>中的所有内容
|
||||
selector=Selector(text=contents)
|
||||
line=selector.xpath('//div[contains(@class, "main-content")]')
|
||||
title=line.xpath('//h1/text()').extract()
|
||||
para=''.join(word for word in line.xpath('//div[contains(@class, "para")]/text()').extract() if len(word)>1)
|
||||
output = open('./info/'+''.join(title)+'.txt','w')
|
||||
output.write(para)
|
||||
output.close()
|
||||
break
|
Loading…
Reference in New Issue
Block a user