add page extract
This commit is contained in:
parent
122a696e09
commit
328ea2657b
25
info-extract.py
Normal file
25
info-extract.py
Normal file
@ -0,0 +1,25 @@
|
||||
import re
|
||||
from scrapy.selector import Selector
|
||||
|
||||
|
||||
import glob
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Collect every saved web page; the source directory is hard-coded.
pages = glob.glob('/data/ruben/data/webpages/*')
print(len(pages))
# Only show a sample path when there is one: indexing an empty glob result
# would raise IndexError.
if pages:
    print(pages[0])

for page in pages:
    # NOTE(review): the platform default text encoding is used for both
    # reading and writing, as before -- confirm the saved pages match it.
    with open(page, 'r') as source:
        contents = source.read()

    # Use XPath to extract the text of the <div class="para"> elements that
    # live inside the main-content container.
    selector = Selector(text=contents)
    line = selector.xpath('//div[contains(@class, "main-content")]')
    # The leading "." keeps each query relative to the main-content node(s).
    # A bare "//" restarts from the document root, which ignores the scoping
    # above and repeats the results once per matched container.
    title = line.xpath('.//h1/text()').extract()
    para = ''.join(
        word
        for word in line.xpath('.//div[contains(@class, "para")]/text()').extract()
        if len(word) > 1  # drop empty / single-character text fragments
    )

    # Build a usable file name: trim surrounding whitespace, neutralise path
    # separators, and fall back to the input file's stem when no <h1> text
    # was found (otherwise the output would be written to "./info/.txt").
    name = ''.join(title).strip().replace('/', '_') or Path(page).stem

    # Create the output directory on demand so the write cannot fail merely
    # because "./info" does not exist yet.
    Path('./info').mkdir(parents=True, exist_ok=True)
    with open('./info/' + name + '.txt', 'w') as output:
        output.write(para)

    # NOTE(review): only the first page is processed because of this break;
    # it looks like a debugging leftover -- remove it to process all pages.
    break
|
4
info/海陵区.txt
Normal file
4
info/海陵区.txt
Normal file
File diff suppressed because one or more lines are too long
1
output.txt
Normal file
1
output.txt
Normal file
@ -0,0 +1 @@
|
||||
中文名称海陵区外文名称Hailing District别 名海阳行政区类别市辖区所属地区中国华东、苏中地区下辖地区6个街道办事处、3镇政府驻地城中街道电话区号0523邮政区码225300地理位置江苏省泰州市中西部面 积300.5平方公里人 口47.76万人(截止2012年末)方 言江淮官话泰如片泰州小片气候条件亚热带季风气候著名景点凤城河风景区、望海楼、泰山公园机 场扬州泰州国际机场火车站泰州火车站车牌代码苏M行政代码321202GDP456.78亿人民币(2015年)
|
Loading…
Reference in New Issue
Block a user