commit 4f628191f9 (parent b0164d0e14)
Author: Your Name
Date: 2018-04-20 11:42:21 +08:00
13 changed files with 84 additions and 5 deletions

build-triple-from-table.py  (new executable file)
@@ -0,0 +1,34 @@
import glob
import re
import pickle as pkl

# Build entity -> attribute/value records from the scraped info tables,
# one file per entity under ./info-table/.
pages = glob.glob('./info-table/*')
# pattern = re.compile(r'[\u4e00-\u9fa5]+')  # unused CJK-only filter
print(len(pages), pages[0])

class entity:
    def __init__(self):
        self.name = ''
        self.attr = dict()
    def set_name(self, name):
        self.name = name
    def add_attr(self, attr, value):
        self.attr[attr] = value

attrs = []
entities = []
for page in pages:
    # The file name without its extension is the entity name.
    name = page.split('/')[-1][:-4]
    lines = open(page).readlines()
    ent = entity()
    ent.set_name(name)
    for line in lines:
        # Each line holds one '$$'-separated attribute/value pair.
        arrs = line.split('$$')
        attrs.append(arrs[0])
        ent.add_attr(arrs[0], arrs[1].strip())  # strip the trailing newline
    entities.append(ent)
    # break
print(len(attrs), len(entities))
pkl.dump(attrs, open('./attrs.bin', 'wb'))
pkl.dump(entities, open('./entities.bin', 'wb'))
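
A quick way to sanity-check the dumped binaries (a minimal sketch; note that unpickling entities.bin needs the same entity class definition in scope, since pickle stores instances by reference to their class):

import pickle as pkl
from collections import Counter

# Must match the class used when dumping (pickle looks it up by name).
class entity:
    def __init__(self):
        self.name = ''
        self.attr = dict()

attrs = pkl.load(open('./attrs.bin', 'rb'))
entities = pkl.load(open('./entities.bin', 'rb'))
print(Counter(attrs).most_common(10))      # most frequent attribute keys
print(entities[0].name, entities[0].attr)  # first entity and its attributes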

dict.txt  (new executable file)
@@ -0,0 +1,10 @@
中国包装总公司 org
了解 v
国家新闻出版总署 org
专家 n
中国学术期刊综合评价数据库
中国核心期刊(遴选)数据库
中国期刊全文数据库
中国学术发展中心
国家级 adj
综合类 adj
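
The entries follow the word [freq] [tag] layout of a jieba user dictionary (an assumption; both the frequency and POS-tag columns are optional in that format, which is why some lines carry no tag). A minimal loading sketch:

import jieba
import jieba.posseg as pseg

# Assumes dict.txt is a jieba-style user dictionary: word [freq] [tag].
jieba.load_userdict('dict.txt')

# Custom entries now segment as single tokens, carrying their tags.
for word, flag in pseg.cut('国家新闻出版总署的专家'):
    print(word, flag)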

extract-para.py → ie/extract-para.py  (renamed, mode changed to executable)
@@ -44,11 +44,12 @@ if os.path.exists(savepath):
     print('load state')
 try:
     for page in pages:
+        print('page:', page)
         if page in paged:
             continue
         contents = open(page, 'r').read()
         info_data = {}
         print(page)
         # Use XPath to pull out everything inside <div class="para"></div>
         selector = Selector(text=contents)
         title = ''.join(selector.xpath('//h1/text()').extract()).replace('/', '')
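
The extraction here is Selector-based XPath in the scrapy/parsel style. A minimal self-contained sketch of the same pattern, assuming the parsel package is available:

from parsel import Selector

html = '<h1>Demo</h1><div class="para">First.</div><div class="para">Second.</div>'
selector = Selector(text=html)

title = ''.join(selector.xpath('//h1/text()').extract()).replace('/', '')
# string(.) flattens each <div class="para"> to its full text content.
paras = selector.xpath('//div[@class="para"]').xpath('string(.)').extract()
print(title, paras)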

ie/paged-table.bin  (new executable binary file, not shown)

ie/paged.bin  (new executable binary file, not shown)

kg/attrs.bin  (new executable binary file, not shown)

kg/build-triple-from-table.py  (new executable file)
@@ -0,0 +1,34 @@
import glob
import re
import pickle as pkl

# Same triple builder as the top-level script, run from kg/ against ../info-table/.
pages = glob.glob('../info-table/*')
# pattern = re.compile(r'[\u4e00-\u9fa5]+')  # unused CJK-only filter
print(len(pages), pages[0])

class entity:
    def __init__(self):
        self.name = ''
        self.attr = dict()
    def set_name(self, name):
        self.name = name
    def add_attr(self, attr, value):
        self.attr[attr] = value

attrs = []
entities = []
for page in pages:
    name = page.split('/')[-1][:-4]
    lines = open(page).readlines()
    ent = entity()
    ent.set_name(name)
    for line in lines:
        arrs = line.split('$$')
        attrs.append(arrs[0])
        ent.add_attr(arrs[0], arrs[1].strip())  # strip the trailing newline
    entities.append(ent)
    break  # debugging: process only the first page
print(len(attrs), len(entities))
pkl.dump(attrs, open('./attrs.bin', 'wb'))
pkl.dump(entities, open('./entities.bin', 'wb'))

kg/entities.bin  (new executable binary file, not shown)

html_downloader.py → spider/html_downloader.py  (renamed, mode changed to executable)

html_parser.py → spider/html_parser.py  (renamed, mode changed to executable)
@@ -34,11 +34,11 @@ class HtmlParser(object):
         title_sub__text = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h2').get_text()
     except:
         title_sub__text = ''
     filename = title_node.get_text() + title_sub__text
-    if not os.path.exists('/data/ruben/data/webpages/'):
-        os.mkdir('/data/ruben/data/webpages/')
-    with open('/data/ruben/data/webpages/' + filename.replace('/',''), 'w') as f:
+    path = '/data/ruben/data/webpages/'  # custom directory for webpages
+    if not os.path.exists(path):
+        os.mkdir(path)
+    with open(path + filename.replace('/',''), 'w') as f:
         f.write(html_cont.decode('utf-8'))
         print('Save to disk filename:' + f.name)
     return res_data
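
The refactor still creates only one directory level, and os.mkdir raises if the parent is missing or if a racing run created the directory first. A hedged alternative sketch (not the committed code) using os.makedirs:

import os

path = '/data/ruben/data/webpages/'  # custom directory for webpages
# makedirs creates intermediate directories; exist_ok skips the existence check.
os.makedirs(path, exist_ok=True)

filename = 'some/title'.replace('/', '')  # placeholder for the parsed page title
with open(os.path.join(path, filename), 'w') as f:
    f.write('<html>...</html>')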

spider_main.py → spider/spider_main.py  (renamed, mode changed to executable)

url_manager.py → spider/url_manager.py  (renamed, mode changed to executable)