add page extract

This commit is contained in:
Your Name 2018-04-02 22:01:08 +08:00
parent 122a696e09
commit 328ea2657b
3 changed files with 30 additions and 0 deletions

25
info-extract.py Normal file
View File

@ -0,0 +1,25 @@
import re
from scrapy.selector import Selector
import glob
from pathlib import Path
# Extract the title and paragraph text of saved web pages and write the text
# to ./info/<title>.txt.

# glob returns the saved pages in arbitrary (filesystem) order.
pages = glob.glob('/data/ruben/data/webpages/*')
print(len(pages))
if pages:
    # Guarded so that an empty input directory does not raise IndexError.
    print(pages[0])

# The output directory must exist before any file can be written into it.
Path('./info').mkdir(parents=True, exist_ok=True)

for page in pages:
    # Explicit encoding keeps decoding independent of the machine's locale.
    # NOTE(review): assumes the pages were saved as UTF-8 -- confirm.
    with open(page, 'r', encoding='utf-8') as page_file:
        contents = page_file.read()

    selector = Selector(text=contents)
    main_content = selector.xpath('//div[contains(@class, "main-content")]')

    # The leading "." makes these queries relative to the main-content node.
    # A bare "//" would search the whole document again and ignore the
    # scoping done above.
    title = ''.join(main_content.xpath('.//h1/text()').extract())

    # Use XPath to pull the text out of every <div class="para"> element,
    # dropping text nodes of at most one character.
    para = ''.join(
        text
        for text in main_content.xpath(
            './/div[contains(@class, "para")]/text()'
        ).extract()
        if len(text) > 1
    )

    # A "/" in the title would be read as a directory separator, and an empty
    # title would produce "./info/.txt"; fall back to the input file's name.
    file_stem = title.replace('/', '_') or Path(page).stem
    with open('./info/' + file_stem + '.txt', 'w', encoding='utf-8') as output:
        output.write(para)

    # NOTE(review): only the first page is processed; this looks like a
    # leftover from debugging -- remove the break to extract every page.
    break

4
info/海陵区.txt Normal file

File diff suppressed because one or more lines are too long

1
output.txt Normal file
View File

@ -0,0 +1 @@
中文名称海陵区外文名称Hailing District别    名海阳行政区类别市辖区所属地区中国华东、苏中地区下辖地区6个街道办事处、3镇政府驻地城中街道电话区号0523邮政区码225300地理位置江苏省泰州市中西部面    积300.5平方公里人    口47.76万人截止2012年末    言江淮官话泰如片泰州小片气候条件亚热带季风气候著名景点凤城河风景区、望海楼、泰山公园机    场扬州泰州国际机场火车站泰州火车站车牌代码苏M行政代码321202GDP456.78亿人民币2015年