update
This commit is contained in:
parent
b0164d0e14
commit
4f628191f9
34
build-triple-from-table.py
Executable file
34
build-triple-from-table.py
Executable file
@ -0,0 +1,34 @@
|
||||
import glob
|
||||
import os
|
||||
import re
|
||||
import pickle as pkl
|
||||
# Collect every info-table file under ./info-table (relative to the current
# working directory).
pages=glob.glob('./info-table/*')

# pattern=re.compile(r'[\u4e00-\u9fa5]+')
# Progress output: number of files found and the first path. glob returns an
# empty list when nothing matches, so guard the [0] access instead of
# crashing with IndexError.
print(len(pages),pages[0] if pages else None)
|
||||
|
||||
|
||||
class entity:
    """A named record holding a mapping from attribute keys to values."""

    def __init__(self):
        """Start with an empty name and no attributes."""
        self.attr = {}
        self.name = ''

    def set_name(self, name):
        """Replace the entity's name with *name*."""
        self.name = name

    def add_attr(self, attr, name):
        """Store *name* as the value for key *attr*, overwriting any old value."""
        self.attr[attr] = name
|
||||
# Build one `entity` per info-table file and collect every attribute key seen.
# Each line of a table file is split on '$$': field 0 is the attribute key,
# field 1 is stored as its value.
attrs=[]
entities=[]
for page in pages:
    # Entity name = last '/'-separated path component minus its final 4 characters.
    name=page.split('/')[-1][:-4]
    ent=entity()
    ent.name=name
    # Context manager so the file handle is closed after each page.
    with open(page) as table_file:
        for line in table_file:
            arrs=line.split('$$')
            if len(arrs)<2:
                # No '$$' separator on this line: skip it instead of
                # raising IndexError on arrs[1].
                continue
            attrs.append(arrs[0])
            # NOTE(review): arrs[1] keeps the line's trailing newline, as in
            # the original; confirm whether consumers expect it stripped.
            ent.add_attr(arrs[0],arrs[1])
    entities.append(ent)
print(len(attrs),len(entities))
# Persist both results; `with` guarantees the pickles are flushed and closed.
with open('./attrs.bin','wb') as attrs_file:
    pkl.dump(attrs,attrs_file)
with open('./entities.bin','wb') as entities_file:
    pkl.dump(entities,entities_file)
|
10
dict.txt
Executable file
10
dict.txt
Executable file
@ -0,0 +1,10 @@
|
||||
中国包装总公司 org
|
||||
了解 v
|
||||
国家新闻出版总署 org
|
||||
专家 n
|
||||
中国学术期刊综合评价数据库
|
||||
中国核心期刊(遴选)数据库
|
||||
中国期刊全文数据库
|
||||
中国学术发展中心
|
||||
国家级 adj
|
||||
综合类 adj
|
0
extract-para.py → ie/extract-para.py
Normal file → Executable file
0
extract-para.py → ie/extract-para.py
Normal file → Executable file
@ -44,11 +44,12 @@ if os.path.exists(savepath):
|
||||
print('load state')
|
||||
try:
|
||||
for page in pages:
|
||||
print('page:',page)
|
||||
if page in paged:
|
||||
continue
|
||||
contents = open(page,'r').read()
|
||||
info_data = {}
|
||||
print(page)
|
||||
|
||||
#用Xpath提取出<div class="para"></div>中的所有内容
|
||||
selector=Selector(text=contents)
|
||||
title=''.join(selector.xpath('//h1/text()').extract()).replace('/','')
|
BIN
ie/paged-table.bin
Executable file
BIN
ie/paged-table.bin
Executable file
Binary file not shown.
BIN
ie/paged.bin
Executable file
BIN
ie/paged.bin
Executable file
Binary file not shown.
BIN
kg/attrs.bin
Executable file
BIN
kg/attrs.bin
Executable file
Binary file not shown.
34
kg/build-triple-from-table.py
Executable file
34
kg/build-triple-from-table.py
Executable file
@ -0,0 +1,34 @@
|
||||
import glob
|
||||
import os
|
||||
import re
|
||||
import pickle as pkl
|
||||
# Collect every info-table file under ../info-table (relative to the current
# working directory).
pages=glob.glob('../info-table/*')

# pattern=re.compile(r'[\u4e00-\u9fa5]+')
# Progress output: number of files found and the first path. glob returns an
# empty list when nothing matches, so guard the [0] access instead of
# crashing with IndexError.
print(len(pages),pages[0] if pages else None)
|
||||
|
||||
|
||||
class entity:
    """A named record holding a mapping from attribute keys to values."""

    def __init__(self):
        """Start with an empty name and no attributes."""
        self.attr = {}
        self.name = ''

    def set_name(self, name):
        """Replace the entity's name with *name*."""
        self.name = name

    def add_attr(self, attr, name):
        """Store *name* as the value for key *attr*, overwriting any old value."""
        self.attr[attr] = name
|
||||
# Build one `entity` per info-table file and collect every attribute key seen.
# Each line of a table file is split on '$$': field 0 is the attribute key,
# field 1 is stored as its value.
attrs=[]
entities=[]
for page in pages:
    # Entity name = last '/'-separated path component minus its final 4 characters.
    name=page.split('/')[-1][:-4]
    ent=entity()
    ent.name=name
    # Context manager so the file handle is closed after each page.
    with open(page) as table_file:
        for line in table_file:
            arrs=line.split('$$')
            if len(arrs)<2:
                # No '$$' separator on this line: skip it instead of
                # raising IndexError on arrs[1].
                continue
            attrs.append(arrs[0])
            # NOTE(review): arrs[1] keeps the line's trailing newline, as in
            # the original; confirm whether consumers expect it stripped.
            ent.add_attr(arrs[0],arrs[1])
    entities.append(ent)
    # The unconditional `break` that followed here ended the loop after the
    # first page, so only one entity was ever written; removed so that every
    # page is processed.
print(len(attrs),len(entities))
# Persist both results; `with` guarantees the pickles are flushed and closed.
with open('./attrs.bin','wb') as attrs_file:
    pkl.dump(attrs,attrs_file)
with open('./entities.bin','wb') as entities_file:
    pkl.dump(entities,entities_file)
|
BIN
kg/entities.bin
Executable file
BIN
kg/entities.bin
Executable file
Binary file not shown.
0
html_downloader.py → spider/html_downloader.py
Normal file → Executable file
0
html_downloader.py → spider/html_downloader.py
Normal file → Executable file
8
html_parser.py → spider/html_parser.py
Normal file → Executable file
8
html_parser.py → spider/html_parser.py
Normal file → Executable file
@ -34,11 +34,11 @@ class HtmlParser(object):
|
||||
title_sub__text = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h2').get_text()
|
||||
except:
|
||||
title_sub__text=''
|
||||
|
||||
filename = title_node.get_text() + title_sub__text
|
||||
if not os.path.exists('/data/ruben/data/webpages/'):
|
||||
os.mkdir('/data/ruben/data/webpages/')
|
||||
with open('/data/ruben/data/webpages/' + filename.replace('/',''), 'w') as f:
|
||||
path='/data/ruben/data/webpages/'#custom diectory for webpages
|
||||
if not os.path.exists(path):
|
||||
os.mkdir(path)
|
||||
with open(path + filename.replace('/',''), 'w') as f:
|
||||
f.write(html_cont.decode('utf-8'))
|
||||
print('Save to disk filename:'+f.name+"")
|
||||
return res_data
|
0
spider_main.py → spider/spider_main.py
Normal file → Executable file
0
spider_main.py → spider/spider_main.py
Normal file → Executable file
0
url_manager.py → spider/url_manager.py
Normal file → Executable file
0
url_manager.py → spider/url_manager.py
Normal file → Executable file
Loading…
Reference in New Issue
Block a user