QAonMilitaryKG/insert_data.py
lhy_in_blcu@126.com 46e8b676e6 create new project
2019-05-02 00:44:38 +08:00

138 lines
4.8 KiB
Python

import os
import json
import re
import pymongo
class InsertData:
    """Import the records of ``data/military.json`` into MongoDB.

    Every non-blank line of the data file is parsed as one JSON object.  Short
    string values that contain a number and a unit found in ``unit_dict`` are
    replaced by a float scaled with that unit's factor (the target unit is
    stored under ``<key>_单位``).  Other string values that contain a
    four-digit year are replaced by a ``YYYYMMDD`` string (the raw text is
    kept under ``<key>_详细``).  The resulting document is inserted into the
    ``data`` collection of the ``military_qa`` database.
    """

    # Keys whose values are always stored untouched.
    _SKIP_KEYS = ('简介', '_id')

    # Characters that belong to the numeric part when a value is split into
    # number and unit.
    _NUMERIC_CHARS = frozenset('0123456789.')

    # NOTE(review): in the reviewed file every suffix literal of this check is
    # the empty string (six identical ``value.endswith('')`` calls), so the
    # check currently accepts any string.  The literals look like single unit
    # characters that were lost from the file -- restore them from the
    # upstream source.  Until then the effective behaviour is preserved.
    _UNIT_SUFFIXES = ('',)

    # Compiled once; raw strings avoid the invalid-escape warning of '\d+'.
    _NUM_PATTERN = re.compile(r'\d+')
    _YEAR_PATTERN = re.compile(r'[0-9]{4}')
    # Capture groups return only the digits, so no suffix stripping is needed.
    _MONTH_PATTERN = re.compile(r'([0-9]{1,4})月')
    _DAY_PATTERN = re.compile(r'([0-9]{1,4})日')

    def __init__(self):
        """Resolve the data file path, open the MongoDB handles, build the unit table."""
        # dirname() instead of splitting on '/' so the path also works on Windows.
        module_dir = os.path.dirname(os.path.abspath(__file__))
        self.datapath = os.path.join(module_dir, 'data/military.json')
        # MongoClient() without arguments uses PyMongo's default connection settings.
        self.conn = pymongo.MongoClient()
        self.db = self.conn['military_qa']
        self.collection = self.db['data']
        # Maps the unit text of a value to [multiplication factor, target unit].
        #
        # NOTE(review): in the reviewed file seven entries of this table had
        # the empty string as key, with the values [1000, '千克'], [1852, ''],
        # [0.001, '千克'], [1852, ''], [1, ''], [1, ''] and [1, ''].  A dict
        # literal keeps only the last duplicate, so the effective entry was
        # '' -> [1, '']; exactly that entry is kept below to preserve
        # behaviour.  The empty keys and the empty target units are presumably
        # characters lost from the file -- TODO restore from the upstream source.
        self.unit_dict = {
            '海里': [1852, ''],
            '英里': [1610, ''],
            '/节': [1852, ''],
            'km/节': [1000, ''],
            '-吨': [1000, '千克'],
            '公里': [1000, ''],
            '公里/节': [1000, ''],
            '公里/小时': [1000, ''],
            '海里节': [1852, ''],
            '海里,节': [1852, ''],
            '海里/节': [1852, ''],
            '海哩/节': [1852, ''],
            '海浬/节': [1852, ''],
            '毫米': [0.001, ''],
            '节/海里': [1852, ''],
            '节海里': [1852, ''],
            '节行驶英里': [1852, ''],
            '节下海里': [1852, ''],
            '里/节': [1852, ''],
            '千克': [1, ''],
            '千米': [1000, ''],
            '千米/节': [1000, ''],
            '千米/时': [1000, ''],
            '千米/小时': [1000, ''],
            '千米每小时': [1000, ''],
            '万海里/节': [18520000, ''],
            '英里,节': [1610, ''],
            '英里/节': [1610, ''],
            '余英里': [1610, ''],
            '约海里': [1852, ''],
            '最大海里': [1852, ''],
            '': [1, ''],
        }

    def insert_main(self):
        """Read the data file line by line and insert every record into MongoDB.

        Blank lines are skipped.  The ``_id`` field of each input record is
        dropped before insertion.  Prints each inserted document and a final
        count.

        Raises:
            OSError: if the data file cannot be opened.
            json.JSONDecodeError: if a non-blank line is not valid JSON.
        """
        count = 0
        # Context manager closes the file; explicit UTF-8 because the data is
        # Chinese text and the platform default encoding may differ.
        with open(self.datapath, encoding='utf-8') as source:
            for line in source:
                if not line.strip():
                    continue
                record = {
                    key: value
                    for key, value in json.loads(line).items()
                    if key != '_id'
                }
                document = self._normalize_record(record)
                print(document)
                # Collection.insert() was removed in PyMongo 4; insert_one()
                # is the single-document replacement.
                self.collection.insert_one(document)
                count += 1
        print('finished insert into database with %s records!' % count)

    def _normalize_record(self, data):
        """Return a copy of *data* with unit values and date values normalised."""
        normalized = dict(data)
        for key, value in data.items():
            # Non-string JSON values (numbers, lists, ...) cannot be parsed
            # with the string methods below, so they are stored unchanged.
            if key in self._SKIP_KEYS or not isinstance(value, str):
                continue
            if (self.check_num(value)
                    and value.endswith(self._UNIT_SUFFIXES)
                    and len(value) < 11):
                converted = self._convert_unit(value)
                # On failure the raw text stays in place (deliberate fallback).
                if converted is not None:
                    amount, unit = converted
                    normalized[key] = amount
                    normalized[key + '_单位'] = unit
            elif len(value) <= 15:
                date = self.check_year(value)
                if date:
                    normalized[key + '_详细'] = value
                    normalized[key] = date
        return normalized

    def _convert_unit(self, value):
        """Return ``(scaled number, target unit)`` for *value*, or None if it cannot be converted."""
        # Everything that is not a digit or '.' (spaces removed) is the unit text.
        unit_text = ''.join(
            ch for ch in value if ch not in self._NUMERIC_CHARS
        ).replace(' ', '')
        unit_info = self.unit_dict.get(unit_text)
        if unit_info is None:
            return None
        try:
            # Only parses when the unit text is one contiguous run in *value*.
            amount = float(value.replace(unit_text, ''))
        except ValueError:
            return None
        factor, target_unit = unit_info
        return amount * factor, target_unit

    def check_num(self, sent):
        """Return the list of digit runs found in ``str(sent)`` (empty if none)."""
        return self._NUM_PATTERN.findall(str(sent))

    def check_year(self, sent):
        """Extract a ``YYYYMMDD`` date string from *sent*.

        Spaces are ignored.  The first four-digit run is taken as the year;
        the digits before the first ``月`` and the first ``日`` are taken as
        month and day, each defaulting to ``01`` when absent.

        Returns:
            The date string, or ``''`` when *sent* has no four-digit run.
        """
        text = str(sent).replace(' ', '')
        year = self._YEAR_PATTERN.search(text)
        if year is None:
            return ''
        month = self._MONTH_PATTERN.search(text)
        day = self._DAY_PATTERN.search(text)
        month_digits = month.group(1) if month else ''
        day_digits = day.group(1) if day else ''
        return year.group(0) + self.full_date(month_digits) + self.full_date(day_digits)

    def full_date(self, date):
        """Pad a month or day number to two characters; an empty value becomes ``'01'``.

        Raises:
            ValueError: if *date* is non-empty and not an integer literal.
        """
        if not date:
            date = '01'
        if int(date) < 10 and len(date) < 2:
            date = '0' + date
        return date
if __name__ == '__main__':
    # Script entry point: build the importer and load the data file into MongoDB.
    InsertData().insert_main()