157 lines
5.4 KiB
Python
157 lines
5.4 KiB
Python
|
|
||
|
# entities list for umls, YAGO3-10, FB15k-237, WN18RR, WN11, FB13
|
||
|
with open('data/WN18RR/train.tsv', 'r') as f, open('data/WN18RR/test.tsv', 'r') as f1, open('data/WN18RR/dev.tsv', 'r') as f2, open('data/WN18RR/entities.txt', 'w') as f3:
|
||
|
lines = f.readlines() + f1.readlines() + f2.readlines()
|
||
|
entities = set()
|
||
|
for line in lines:
|
||
|
line = line.strip()
|
||
|
temp = line.split('\t')
|
||
|
entities.add(temp[0])
|
||
|
entities.add(temp[2])
|
||
|
entities_str = '\n'.join(list(entities))
|
||
|
f3.write(entities_str)
|
||
|
|
||
|
# relations list for FB15k-237, umls, WN18RR and YAGO3-10, WN11, FB13
|
||
|
|
||
|
with open('data/WN18RR/train.tsv', 'r') as f, open('data/WN18RR/test.tsv', 'r') as f1, open('data/WN18RR/dev.tsv', 'r') as f2, open('data/WN18RR/relations.txt', 'w') as f3:
|
||
|
relations = set()
|
||
|
lines = f.readlines() + f1.readlines() + f2.readlines()
|
||
|
for line in lines:
|
||
|
line = line.strip()
|
||
|
temp = line.split('\t')
|
||
|
relations.add(temp[1])
|
||
|
relations_str = '\n'.join(relations)
|
||
|
f3.write(relations_str)
|
||
|
|
||
|
# entities and relation list for WN18 and FB15K
|
||
|
with open('data/FB15K/entity2id.txt', 'r') as f, open('data/FB15K/relation2id.txt', 'r') as f1:
|
||
|
lines = f.readlines()
|
||
|
ent_id_dict = {}
|
||
|
entities = []
|
||
|
for line in lines:
|
||
|
line = line.strip()
|
||
|
temp = line.split()
|
||
|
if len(temp) == 2:
|
||
|
ent_id_dict[temp[1]] = temp[0]
|
||
|
entities.append(temp[0])
|
||
|
|
||
|
lines = f1.readlines()
|
||
|
rel_id_dict = {}
|
||
|
relations = []
|
||
|
for line in lines:
|
||
|
line = line.strip()
|
||
|
temp = line.split()
|
||
|
if len(temp) == 2:
|
||
|
rel_id_dict[temp[1]] = temp[0]
|
||
|
relations.append(temp[0])
|
||
|
|
||
|
with open('data/FB15K/entities.txt', 'w') as f, open('data/FB15K/relations.txt', 'w') as f1:
|
||
|
f.write('\n'.join(entities))
|
||
|
f1.write('\n'.join(relations))
|
||
|
|
||
|
# train.tsv, dev.tsv and test.tsv for WN18 and FB15K
|
||
|
with open('data/FB15K/valid2id.txt', 'r') as f, open('data/FB15K/dev.tsv', 'w') as f1:
|
||
|
lines = f.readlines()
|
||
|
text_lines = []
|
||
|
for line in lines:
|
||
|
line = line.strip()
|
||
|
temp = line.split()
|
||
|
|
||
|
if len(temp) == 3:
|
||
|
head_entity = ent_id_dict[temp[0]]
|
||
|
tail_entity = ent_id_dict[temp[1]]
|
||
|
relation = rel_id_dict[temp[2]]
|
||
|
text_lines.append(head_entity + '\t' + relation + '\t'+ tail_entity)
|
||
|
text_lines_str = '\n'.join(text_lines)
|
||
|
f1.write(text_lines_str)
|
||
|
|
||
|
# entity to text for WN18RR
|
||
|
with open('data/WN18RR/wordnet-mlj12-definitions.txt', 'r') as f, open('data/WN18RR/entity2text.txt', 'w') as f1:
|
||
|
lines = f.readlines()
|
||
|
ent2texts = []
|
||
|
count = 0
|
||
|
|
||
|
for line in lines:
|
||
|
line = line.strip()
|
||
|
temp = line.split('\t')
|
||
|
if len(temp) == 3:
|
||
|
if temp[1].find('_NN_') != -1 or temp[1].find('_JJ_') != -1 or temp[1].find('_VB_') != -1 or temp[1].find('_RB_') != -1:
|
||
|
wordnet_str = temp[1][2:]
|
||
|
num_start = wordnet_str.rfind('_')
|
||
|
pos_start = wordnet_str[:num_start].rfind('_')
|
||
|
name = wordnet_str[:pos_start]
|
||
|
name = name.replace('_', ' ')
|
||
|
ent2texts.append(temp[0] + '\t' + name + ', ' + temp[2])
|
||
|
count += 1
|
||
|
print(count)
|
||
|
f1.write('\n'.join(ent2texts))
|
||
|
|
||
|
# entity to text for FB, names
|
||
|
with open('data/FB15k-237/FB15k_mid2name.txt', 'r') as f, open('data/FB15k-237/entity2text.txt', 'w') as f1:
|
||
|
lines = f.readlines()
|
||
|
ent2texts = []
|
||
|
|
||
|
for line in lines:
|
||
|
line = line.strip()
|
||
|
temp = line.split('\t')
|
||
|
temp[1] = temp[1].replace('_', ' ')
|
||
|
ent2texts.append(temp[0] + '\t' + temp[1])
|
||
|
f1.write('\n'.join(ent2texts))
|
||
|
|
||
|
# entity to text for UMLS and YAGO3-10
|
||
|
with open('data/YAGO3-10/entities.txt', 'r') as f, open('data/YAGO3-10/entity2text.txt', 'w') as f1:
|
||
|
lines = f.readlines()
|
||
|
ent2texts = []
|
||
|
|
||
|
for line in lines:
|
||
|
line = line.strip()
|
||
|
ent2texts.append(line + '\t' + line.replace('_', ' '))
|
||
|
f1.write('\n'.join(ent2texts))
|
||
|
|
||
|
# relations to texts for Freebase, umls, WordNet
|
||
|
with open('data/WN18/relations.txt', 'r') as f, open('data/WN18/relation2text.txt', 'w') as f1:
|
||
|
lines = f.readlines()
|
||
|
relation_texts = []
|
||
|
for line in lines:
|
||
|
line = line.strip()
|
||
|
text = line.replace('/' , ' ')
|
||
|
text = text.replace('_', ' ').strip()
|
||
|
#print(line, text)
|
||
|
relation_texts.append(line + '\t' + text)
|
||
|
f1.write('\n'.join(relation_texts))
|
||
|
|
||
|
|
||
|
# entity to text for FB, descriptions
|
||
|
with open('data/FB15K/FB15k_mid2description.txt', 'r') as f, open('data/FB15K/entity2textlong.txt', 'w') as f1:
|
||
|
lines = f.readlines()
|
||
|
ent2texts = []
|
||
|
|
||
|
for line in lines:
|
||
|
line = line.strip()
|
||
|
temp = line.split('\t')
|
||
|
temp[1] = temp[1][1:-4]
|
||
|
temp[1] = temp[1].replace('_', ' ')
|
||
|
ent2texts.append(temp[0] + '\t' + temp[1])
|
||
|
f1.write('\n'.join(ent2texts))
|
||
|
|
||
|
|
||
|
# entity text length
|
||
|
max_len = 0
|
||
|
min_len = 10000
|
||
|
avg_len = 0
|
||
|
with open('data/WN18RR/entity2text.txt', 'r') as f1:
|
||
|
lines = f1.readlines()
|
||
|
for line in lines:
|
||
|
temp = line.strip().split("\t")
|
||
|
words = temp[1].split(" ")
|
||
|
length = len(words)
|
||
|
print(length)
|
||
|
if length < min_len:
|
||
|
min_len = length
|
||
|
if length > max_len:
|
||
|
max_len = length
|
||
|
avg_len += length
|
||
|
print("max:", max_len)
|
||
|
print("min:", min_len)
|
||
|
print("avg:", avg_len * 1.0 / len(lines))
|
||
|
|