#coding = utf-8
import os
from urllib import request
from lxml import etree
import gzip
import pymongo
import datetime
class NewspaperSpider:
    """Crawl weapon listing pages and turn each weapon page into a flat record.

    The crawl walks: top-level category -> sub-category links -> paginated
    listing pages -> individual weapon pages.  Every weapon page is parsed
    with lxml and reduced to a ``dict`` keyed by the (Chinese) attribute
    names found on the page.
    """

    # NOTE(review): the site root was an empty string in the previous
    # revision, which made ``'' % big_cate`` raise TypeError and produced
    # relative URLs everywhere.  This value is a reconstruction -- confirm it
    # against the site that is actually being crawled.
    BASE_URL = 'http://weapon.huanqiu.com'

    # Full-width punctuation -> ASCII.  The previous revision replaced ASCII
    # characters with themselves (a no-op); presumably the full-width forms
    # were intended -- TODO confirm against real page content.
    _PUNCT_TABLE = str.maketrans({'－': '-', '（': '(', '）': ')'})

    def __init__(self, base_url=BASE_URL):
        """Set up the category translation table and the MongoDB client.

        Args:
            base_url: Scheme + host that every relative link is joined to.
        """
        self.base_url = base_url
        # URL slug of each top-level category -> display name stored in '大类'.
        self.term_dict = {
            'aircraft': "飞行器",
            'warship': "舰船舰艇",
            'guns': "枪械与单兵",
            'tank': "坦克装甲车辆",
            'artillery': "火炮",
            'missile': "导弹武器",
            'spaceship': "太空装备",
            'explosive': "爆炸物",
        }
        self.conn = pymongo.MongoClient()

    def get_html(self, url):
        """Download ``url`` and return the body decoded as UTF-8 text."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ',
            'Accept-Encoding':'gzip, deflate',
            'Cookie':'Hm_lvt_1fc983b4c305d209e7e05d96e713939f=1552034977; Hm_lpvt_1fc983b4c305d209e7e05d96e713939f=1552036141',
        }
        req = request.Request(url, headers=headers)
        # Close the connection deterministically instead of leaking it.
        with request.urlopen(req) as resp:
            page = resp.read()
        # Only gunzip when the payload really is gzip (magic bytes 1f 8b);
        # an uncompressed response used to crash gzip.decompress().
        if page[:2] == b'\x1f\x8b':
            page = gzip.decompress(page)
        return page.decode('utf-8')

    def get_urllist(self, url):
        """Return the de-duplicated absolute weapon-page URLs on a listing page."""
        html = self.get_html(url)
        selector = etree.HTML(html)
        hrefs = selector.xpath('//li/span[@class="pic"]/a/@href')
        return list({self.base_url + href for href in hrefs})

    def html_parser(self, url):
        """Fetch a weapon page and return ``[html, title, attr_1, attr_2, ...]``.

        Each ``attr_i`` is the text content of one ``<li>`` of the
        ``dataInfo`` block; ``extract_data`` consumes them as ``content[2:]``.
        """
        html = self.get_html(url)
        selector = etree.HTML(html)
        titles = selector.xpath('//title/text()')
        title = titles[0] if titles else ''
        contents = [html, title]
        for article in selector.xpath('//div[@class="dataInfo"]/ul/li'):
            # The previous revision computed this text and dropped it.
            contents.append(article.xpath('string(.)'))
        return contents

    @classmethod
    def _split_title(cls, title):
        """Strip spaces, normalise punctuation and split a page title on '_'."""
        return title.replace(' ', '').translate(cls._PUNCT_TABLE).split('_')

    @classmethod
    def _parse_attrs(cls, attrs):
        """Turn ``'key:value'`` strings into a dict, skipping unusable entries.

        Entries are skipped when they contain no colon, when the key starts
        with '(' or when the key is longer than six characters.
        """
        data = {}
        for attr in attrs:
            # Accept both the ASCII and the full-width colon as separator.
            parts = attr.replace('：', ':').split(':')
            if len(parts) < 2:
                continue
            key = parts[0].translate(cls._PUNCT_TABLE).replace(' ', '').replace('\t', '')
            if key.startswith('(') or len(key) > 6:
                continue
            data[key] = parts[1].replace('\t', '').replace('\n', '').replace(',', '')
        return data

    @staticmethod
    def _first(values, default=''):
        """Return the first xpath hit, or ``default`` when there is none."""
        return values[0] if values else default

    def modify_data(self):
        """Rebuild flat records from the raw pages stored in ``military.kb``.

        Each stored item is expected to carry a ``contents`` list shaped like
        the return value of ``html_parser``.

        Returns:
            list[dict]: one normalised record per stored item.  (The previous
            revision built every record and then discarded it.)
        """
        records = []
        for item in self.conn['military']['kb'].find():
            body = item['contents']
            title = self._split_title(body[1])
            selector = etree.HTML(body[0])
            data = {
                '名称': title[0],
                # A title without '_' has no category part.
                '类别': title[1] if len(title) > 1 else '',
                '产国': self._first(selector.xpath('//span[@class="country"]/b/a/text()')),
            }
            data.update(self._parse_attrs(body[2:]))
            records.append(data)
        return records

    def spider_main(self):
        """Crawl every category, sub-category and listing page in turn."""
        big_cates = ['aircraft', 'warship',
                     'guns', 'tank',
                     'artillery', 'missile',
                     'spaceship', 'explosive']
        for big_cate in big_cates:
            # NOTE(review): the path template was blank in the previous
            # revision; '/weaponlist/<category>' is a reconstruction.
            big_url = self.base_url + '/weaponlist/%s' % big_cate
            selector = etree.HTML(self.get_html(big_url))
            span = selector.xpath('//span[@class="list"]')[0]
            second_urls = [self.base_url + href for href in span.xpath('./a/@href')]
            second_cates = span.xpath('./a/text()')
            # Sub-category name -> listing URL (a repeated name keeps the last URL).
            second_dict = dict(zip(second_cates, second_urls))
            for second_cate, second_url in second_dict.items():
                max_pages = self.get_maxpage(second_url)
                for page in range(1, max_pages + 1):
                    url = second_url + '_0_0_%s' % page
                    for seed in self.get_urllist(url):
                        self.get_info(seed, big_cate, second_cate)

    def get_info(self, url, big_cate, second_cate):
        """Parse one weapon page, tag it with its categories and store it.

        Returns:
            dict: the stored record.
        """
        content = self.html_parser(url)
        data = self.extract_data(content)
        data['大类'] = self.term_dict.get(big_cate)
        data['类型'] = second_cate
        if data:
            # NOTE(review): the body of this branch was missing in the
            # previous revision.  Persisting the record is the presumed
            # intent and the collection name is a reconstruction -- confirm.
            self.conn['military']['knowledge_base'].insert_one(data)
        return data

    def extract_data(self, content):
        """Build a flat record from the output of ``html_parser``.

        Args:
            content: ``[html, title, attr_1, ...]``.

        Returns:
            dict with '名称', '产国', '图片', '简介' plus one entry per parsed
            attribute line.
        """
        title = self._split_title(content[1])
        selector = etree.HTML(content[0])
        intro = ''.join(selector.xpath('//div[@class="module"]/p/text()'))
        data = {
            '名称': title[0],
            '产国': self._first(selector.xpath('//span[@class="country"]/b/a/text()')),
            # Empty string when the page has no main picture (the previous
            # revision indexed into the empty string and crashed).
            '图片': self._first(selector.xpath('//div[@class="maxPic"]/img/@src')),
            '简介': intro.replace('\xa0', '').replace('\u3000', '').replace('\t', ''),
        }
        data.update(self._parse_attrs(content[2:]))
        return data

    def get_maxpage(self, url):
        """Return the number of listing pages, defaulting to 1.

        The page count is read from the second-to-last pager link.
        """
        selector = etree.HTML(self.get_html(url))
        pages = selector.xpath('//div[@class="pages"]/a/text()')
        if len(pages) < 2:
            return 1
        try:
            return int(pages[-2])
        except ValueError:
            # Pager link is not numeric; treat as a single page.
            return 1
if __name__ == '__main__':
    # Script entry point: build the crawler and actually start the crawl
    # (the previous revision created the handler and then did nothing).
    handler = NewspaperSpider()
    handler.spider_main()
import os
import json
import re
import pymongo
class InsertData:
    """Normalise the crawled JSON-lines dump and load it into MongoDB.

    Quantities such as ``'3.5公里'`` are converted to a number in a base unit
    (with the unit stored under ``'<key>_单位'``) and dates such as
    ``'2011年1月11日'`` are converted to ``'YYYYMMDD'`` strings (with the
    original text kept under ``'<key>_详细'``).
    """

    # Keys that are never normalised.
    _SKIP_KEYS = ('简介', '_id')
    # Only values ending in one of these characters are treated as quantities.
    _UNIT_SUFFIXES = ('米', '里', '克', '吨', '时', '节')

    _YEAR_RE = re.compile(r'[0-9]{4}年')
    _MONTH_RE = re.compile(r'[0-9]{1,4}月')
    _DAY_RE = re.compile(r'[0-9]{1,4}日')

    def __init__(self):
        """Resolve the data file path, connect to MongoDB and build the unit table."""
        # os.path.dirname is portable; splitting on '/' breaks on Windows.
        cur = os.path.dirname(os.path.abspath(__file__))
        self.datapath = os.path.join(cur, 'data', 'military.json')
        self.conn = pymongo.MongoClient()
        self.db = self.conn['military_qa']
        self.collection = self.db['data']
        # unit text -> [multiplier, base unit].  The previous revision only
        # held '人' and '位', neither of which can pass the suffix filter in
        # insert_main, so no value was ever converted.  The table below
        # mirrors MilitaryGraph.unit_dict from the same code base, with the
        # '千克' label corrected to '千克'.
        self.unit_dict = {
            '海里': [1852, '米'],
            '英里': [1610, '米'],
            '/节': [1852, '米'],
            'km/节': [1000, '米'],
            '吨': [1000, '千克'],
            '-吨': [1000, '千克'],
            '公里': [1000, '米'],
            '公里/节': [1000, '米'],
            '公里/小时': [1000, '米'],
            '海里节': [1852, '米'],
            '海里,节': [1852, '米'],
            '海里/节': [1852, '米'],
            '海哩/节': [1852, '米'],
            '海浬/节': [1852, '米'],
            '毫米': [0.001, '米'],
            '节': [1852, '米'],
            '节/海里': [1852, '米'],
            '节海里': [1852, '米'],
            '节行驶英里': [1852, '米'],
            '节下海里': [1852, '米'],
            '克': [0.001, '千克'],
            '里': [1852, '米'],
            '里/节': [1852, '米'],
            '米': [1, '米'],
            '千克': [1, '千克'],
            '千米': [1000, '米'],
            '千米/节': [1000, '米'],
            '千米/时': [1000, '米'],
            '千米/小时': [1000, '米'],
            '千米每小时': [1000, '米'],
            '万海里/节': [18520000, '米'],
            '英里,节': [1610, '米'],
            '英里/节': [1610, '米'],
            '余英里': [1610, '米'],
            '约海里': [1852, '米'],
            '最大海里': [1852, '米'],
            '厘米': [0.01, '米'],
            '分米': [0.1, '米'],
            '人': [1, '人'],
            '位': [1, '位']}

    def insert_main(self):
        """Read the dump line by line, normalise each record and insert it.

        Returns:
            int: number of records inserted.
        """
        count = 0
        with open(self.datapath, encoding='utf-8') as handle:
            for record in handle:
                if not record.strip():
                    continue
                data = {i: j for i, j in json.loads(record).items() if i != '_id'}
                # The previous revision built the record but never stored it.
                self.collection.insert_one(self._standardize_record(data))
                count += 1
        print('finished insert into database with %s records!'%count)
        return count

    def _standardize_record(self, data):
        """Return a copy of ``data`` with quantities and dates normalised."""
        data_new = data.copy()
        for key, value in data.items():
            # Non-string values (numbers, lists) used to crash on .endswith.
            if key in self._SKIP_KEYS or not isinstance(value, str):
                continue
            if self.check_num(value) and value.endswith(self._UNIT_SUFFIXES) and len(value) < 11:
                converted = self._convert_quantity(value)
                if converted is not None:
                    data_new[key], data_new[key + '_单位'] = converted
            elif len(value) <= 15:
                date = self.check_year(value)
                if date:
                    data_new[key + '_详细'] = value
                    data_new[key] = date
        return data_new

    def _convert_quantity(self, value):
        """Return ``(number_in_base_unit, base_unit)`` or ``None`` if unparsable.

        Everything that is not a digit or '.' is taken as the unit text; a
        value is left untouched when that text is not in the unit table or
        when the remainder is not a valid number.
        """
        unit = ''.join(ch for ch in value if ch not in '0123456789.').replace(' ', '')
        unit_info = self.unit_dict.get(unit)
        if unit_info is None:
            return None
        try:
            num = float(value.replace(unit, ''))
        except ValueError:
            return None
        return num * unit_info[0], unit_info[1]

    def check_num(self, sent):
        """Return every run of digits found in ``sent`` (empty list if none)."""
        return re.findall(r'\d+', str(sent))

    def check_year(self, sent):
        """Convert ``'YYYY年[M月][D日]'`` text to ``'YYYYMMDD'``.

        Missing month/day default to ``'01'``.  Returns ``''`` when ``sent``
        is not a string or holds no four-digit year.
        """
        if not isinstance(sent, str):
            return ''
        sent = sent.replace(' ', '')
        year = self._YEAR_RE.search(sent)
        if not year:
            return ''
        month = self._MONTH_RE.search(sent)
        day = self._DAY_RE.search(sent)
        return (year.group().replace('年', '')
                + self.full_date(month.group().replace('月', '') if month else '')
                + self.full_date(day.group().replace('日', '') if day else ''))

    def full_date(self, date):
        """Left-pad a month/day to two digits; an empty value becomes ``'01'``."""
        if not date:
            return '01'
        return date.zfill(2)
if __name__ == '__main__':
    # Script entry point: build the loader and actually run the import
    # (the previous revision created the handler and then did nothing).
    handler = InsertData()
    handler.insert_main()
#!/usr/bin/env python3
# coding: utf-8
# File:
# Author: lhy<,>
# Date: 19-3-11
import os
import re
import json
import jieba
import jieba.posseg as pseg
import pymongo
class MilitaryGraph:
    """Rule-based question answering over the weapon records in MongoDB.

    A question is segmented with jieba using custom tags (country, category,
    attribute, comparison word, superlative, weapon name, quantity, date).
    The sequence of tags (the "pattern") selects a query template, which is
    then run against the ``military_qa.data`` collection.
    """

    _TIME_REGEX = '[0-9]{4}年[0-9]{0,4}月?[0-9]{0,4}日?'
    # One-letter codes used to recognise interleaved entity/attribute patterns.
    _FLAG_CODES = {'n_country': 'C', 'n_weapon': 'W', 'n_attr': 'A'}

    def __init__(self):
        """Load vocabularies, register them with jieba and connect to MongoDB."""
        # os.path.dirname is portable; splitting on '/' breaks on Windows.
        cur = os.path.dirname(os.path.abspath(__file__))
        self.datapath = os.path.join(cur, 'data', 'military.json')
        self.conn = pymongo.MongoClient()
        db_name = 'military_qa'
        col_name = 'data'
        self.col = self.conn[db_name][col_name]
        # Canonical attribute name -> wordings that refer to it.  Order
        # matters: build_dict lets a later entry override an earlier one
        # when the same wording appears twice.
        self.attributes ={'同型': ['同型'], '机高': ['机高'],
                          '战斗全重': ['战斗全重'], '水下排水量': ['水下排水量'],
                          '处理器': ['处理器'], '主炮': ['主炮'],
                          '制导系统': ['制导系统'], '全重': ['全重'],
                          '纬度': ['纬度'], '炮口初速': ['炮口初速'],
                          '发射性能': ['发射性能'], '兵装': ['兵装'],
                          '型号': ['型号'],
                          '长度': ['长度', '全长', '多长'], '翼展': ['翼展', '翼长'],
                          '全枪长': ['全枪长', '枪长'], '射程': ['射程'],
                          '前型': ['前型'],
                          '发射地点': ['发射地点', '发射地点'], '首飞时间': ['首飞时间', '首飞', '初次飞行', '首次飞行'],
                          '发动机数量': ['发动机数量', '几个发动机', '多少个发动机', '发动机个数', '发动机数目', '发动机个','发动机数'], '乘员': ['乘员'],
                          '战斗射速': ['战斗射速'], '生产单位': ['生产单位', '产商', '制造商', '厂家', '制造机构'],
                          '最大行程': ['最大行程', '最常距离'], '炮管长度': ['炮管长度', '炮管长', '炮管全长'],
                          '气动布局': ['气动布局'], '武备': ['武备'],
                          '武器装备': ['武器装备'], '引信': ['引信'],
                          '参战情况': ['参战情况'],
                          '动力装置': ['动力装置'], '飞行速度': ['飞行速度'],
                          '服役时间': ['服役时间'], '新造时': ['新造时'],
                          '活动范围': ['活动范围'], '弹匣容弹量': ['弹匣容弹量'],
                          '编制': ['编制'], '高度': ['高度'],
                          '制造厂': ['制造厂'], '口径': ['口径'],
                          '鱼雷': ['鱼雷'], '经度': ['经度'],
                          '研发时间': ['研发时间'], '简介': ['简介'],
                          '首次轨道发射': ['首次轨道发射'],
                          '挂载点': ['挂载点'], '刀锋宽度': ['刀锋宽度'],
                          '续航距离': ['续航距离'], '枪械': ['枪械'],
                          '最大速度': ['最大速度'], '运载火箭': ['运载火箭'],
                          '生产年限': ['生产年限'], '全枪重': ['全枪重'],
                          '空重': ['空重'], '水雷': ['水雷'],
                          '枪炮': ['枪炮'], '水上排水量': ['水上排水量', '排水量'],
                          '诞生时间': ['诞生时间'], '内置武器': ['内置武器'],
                          '机长': ['机长'], '中心直径': ['中心直径', '直径'],
                          '装药类型': ['装药类型'], '最大起飞重量': ['最大起飞重量', '起飞重量'],
                          '有效射程': ['有效射程'], '现状': ['现状'],
                          '研制时间': ['研制时间'], '舰舰导弹': ['舰舰导弹'],
                          '下水时间': ['下水时间', '下水'], '机炮': ['机炮'],
                          '弹长': ['弹长'], '退役时间': ['退役时间', '退役'],
                          '最大射程': ['最大射程'], '改装时': ['改装时'],
                          '刀重': ['刀重'], '自持力': ['自持力'],
                          '产国': ['产国'], '航速': ['航速'],
                          '制造商': ['制造商'], '型宽': ['型宽'],
                          '弹重': ['弹重'], '刀长': ['刀长'],
                          '舰长': ['舰长'], '研发厂商': ['研发厂商'],
                          '旋翼直径': ['旋翼直径'], '导弹': ['导弹'],
                          '满排吨位': ['满排吨位'], '底盘类型': ['底盘类型'],
                          '刀锋长度': ['刀锋长度'], '弹径': ['弹径'],
                          '全长': ['全长'], '竣工时': ['竣工时'],
                          '发射日期': ['发射日期'], '宽度': ['宽度'],
                          '总重': ['总重'], '建造时间': ['建造时间'],
                          '射控装置': ['射控装置'], '图片': ['图片'],
                          '轨道': ['轨道'], '改装前': ['改装前'],
                          '发动机': ['发动机'], '最大航程': ['最大航程'],
                          '研发单位': ['研发单位'], '大类': ['大类'],
                          '关注度': ['关注度'], '最大飞行速度': ['最大飞行速度'],
                          '火炮': ['火炮'], '战地机型': ['战地机型'],
                          '防空兵器': ['防空兵器'], '潜航深度': ['潜航深度'],
                          '轨道卫星': ['轨道卫星'], '尾翼装置': ['尾翼装置'],
                          '乘员与载员': ['乘员与载员'], '名称': ['名称'],
                          '引信装置': ['引信装置'], '次型': ['次型'],
                          '车长': ['车长'], '武装': ['武装'],"航长":['航长'],
                          '反舰导弹': ['反舰导弹'],
                          '满载排水量': ['满载排水量'], '装备': ['装备']}
        # Top-level category (stored in '大类') -> wordings.
        self.big_cates ={'火炮': ['火炮'], '飞行器': ['飞行器'],
                         '舰船舰艇': ['舰船舰艇'], '坦克装甲车辆': ['坦克装甲车辆'],
                         '太空装备': ['太空装备'], '爆炸物': ['爆炸物'],
                         '导弹武器': ['导弹武器'], '枪械与单兵': ['枪械与单兵', '枪械', '枪', '单兵']}
        # Sub-category (stored in '类型') -> wordings.
        self.second_cates = {'榴弹发射器': ['榴弹发射器'], '炸弹': ['炸弹', '炸药'],
                             '手榴弹': ['手榴弹'], '电子战机': ['电子战机'],
                             '机枪': ['机枪'], '宇宙飞船': ['宇宙飞船', '飞船'],
                             '加农炮': ['加农炮'], '救护车': ['救护车'],
                             '攻击机': ['攻击机'], '非自动步枪': ['非自动步枪', '步枪'],
                             '火箭弹': ['火箭弹'], '地雷': ['地雷'],
                             '高射炮': ['高射炮'], '航天飞机': ['航天飞机'],
                             '航天机构': ['航天机构', '航天局', '航天部门'], '舰舰导弹': ['舰舰导弹'],
                             '通用飞机': ['通用飞机'], '岸舰导弹': ['岸舰导弹', '导弹'],
                             '舰炮': ['舰炮'], '巡洋舰': ['巡洋舰'],
                             '气垫艇/气垫船': ['气垫艇/气垫船','气垫艇','气垫船'], '装甲指挥车': ['装甲指挥车', '装甲车', '指挥车'],
                             '无人机': ['无人机'], '氢弹': ['氢弹'],
                             '坦克炮': ['坦克炮'], '干线': ['干线'],
                             '原子弹': ['原子弹'], '冲锋枪': ['冲锋枪'],
                             '导弹艇': ['导弹艇'], '水雷战舰艇': ['水雷战舰艇'],
                             '侦察机': ['侦察机'], '试验机': ['试验机'],
                             '舰地(潜地)导弹': ['舰地(潜地)导弹','舰地导弹','潜地导弹', '导弹'],
                             '支线': ['支线'], '军事卫星': ['军事卫星'],
                             '地空导弹': ['地空导弹'], '航空炮': ['航空炮'],
                             '战列舰': ['战列舰'], '无后坐炮': ['无后坐炮'],
                             '空地导弹': ['空地导弹'], '加农榴弹炮': ['加农榴弹炮'],
                             '运输机': ['运输机'], '自行火炮': ['自行火炮'],
                             '地地导弹': ['地地导弹'], '空舰导弹': ['空舰导弹'],
                             '教练机': ['教练机'], '其他特种装甲车辆': ['其他特种装甲车辆'],
                             '火箭筒': ['火箭筒'], '空间探测器': ['空间探测器', '探测器'],
                             '预警机': ['预警机'], '航空母舰': ['航空母舰', '航母'],
                             '迷彩服': ['迷彩服'],'弹炮结合系统': ['弹炮结合系统'],
                             '科学卫星': ['科学卫星'], '空空导弹': ['空空导弹','导弹'],
                             '迫击炮': ['迫击炮'],
                             '应用卫星': ['应用卫星', '卫星'], '保障辅助舰艇': ['保障辅助舰艇'],
                             '刀具': ['刀具'], '霰弹枪': ['霰弹枪'],
                             '自动步枪': ['自动步枪'], '手枪': ['手枪'],
                             '反弹道导弹': ['反弹道导弹'], '两栖作战舰艇': ['两栖作战舰艇'],
                             '特种坦克': ['特种坦克', '坦克'], '运输直升机': ['运输直升机', '直升机'],
                             '巡逻舰/艇': ['巡逻舰/艇', '巡逻舰', '巡逻舰艇', '巡逻舰艇'], '加油机': ['加油机'],
                             '反坦克炮': ['反坦克炮'],
                             '越野车': ['越野车'], '步兵战车': ['步兵战车'],
                             '战斗机': ['战斗机'], '护卫舰': ['护卫舰'],
                             '工程抢修车': ['工程抢修车'],'反潜机': ['反潜机'],
                             '常规潜艇': ['常规潜艇'], '装甲侦察车': ['装甲侦察车'],
                             '舰空导弹': ['舰空导弹'], '运载火箭': ['运载火箭'],
                             '中子弹': ['中子弹'], '飞艇': ['飞艇'],
                             '航天基地': ['航天基地'], '鱼雷': ['鱼雷'],
                             '轰炸机': ['轰炸机'], '技术试验卫星': ['技术试验卫星', '卫星'],
                             '狙击枪': ['狙击枪'], '水雷': ['水雷'],
                             '装甲车载炮': ['装甲车载炮'], '榴弹炮': ['榴弹炮'],
                             '驱逐舰': ['驱逐舰'], '装甲运兵车': ['装甲运兵车'],
                             '火箭炮': ['火箭炮'], '多用途直升机': ['多用途直升机', '直升机'],
                             '核潜艇': ['核潜艇'], '武装直升机': ['武装直升机', '直升机'],
                             '布/扫雷车': ['布/扫雷车', '扫雷车', '扫雷车'], '潜舰导弹': ['潜舰导弹', '导弹'],
                             '主战坦克': ['主战坦克', '坦克']}
        self.weapons = self.load_weapons()
        self.weapon_dict = {i:i for i in self.weapons}
        # Country of origin (stored in '产国') -> wordings.
        self.countries = {'荷兰': ['荷兰'], '阿根廷': ['阿根廷'], '瑞士': ['瑞士'],
                          '伊朗': ['伊朗'], '以色列': ['以色列'], '前南斯拉夫': ['前南斯拉夫'],
                          '越南': ['越南'], '葡萄牙': ['葡萄牙'], '乌克兰': ['乌克兰'],
                          '新西兰': ['新西兰'], '奥地利': ['奥地利'], '希腊': ['希腊'],
                          '塞尔维亚': ['塞尔维亚'], '比利时': ['比利时'],
                          '俄罗斯': ['俄罗斯'], '前捷克斯洛伐克': ['前捷克斯洛伐克'],
                          '捷克': ['捷克'], '土耳其': ['土耳其'], '缅甸': ['缅甸'],
                          '美国': ['美国'], '德国': ['德国'], '巴西': ['巴西'],
                          '印度尼西亚': ['印度尼西亚'], '法国': ['法国'],
                          '瑞典': ['瑞典'], '前苏联': ['前苏联'],
                          '朝鲜': ['朝鲜'],
                          '埃及': ['埃及'], '墨西哥': ['墨西哥'], '巴基斯坦': ['巴基斯坦'],
                          '马来西亚': ['马来西亚'], '澳大利亚': ['澳大利亚'], '泰国': ['泰国'],
                          '欧盟': ['欧盟'], '波兰': ['波兰'],
                          '韩国': ['韩国'], '日本': ['日本'],
                          '罗马尼亚': ['罗马尼亚'], '克罗地亚': ['克罗地亚'], '智利': ['智利'],
                          '匈牙利': ['匈牙利'], '意大利': ['意大利'], '英国': ['英国'],
                          '丹麦': ['丹麦'], '挪威': ['挪威'], '哈萨克斯坦': ['哈萨克斯坦'],
                          '爱尔兰': ['爱尔兰'], '伊拉克': ['伊拉克'],
                          '中国': ['中国','中华人民共和国'], '印度': ['印度'],
                          '保加利亚': ['保加利亚'], '斯洛伐克': ['斯洛伐克'],
                          '西班牙': ['西班牙'], '秘鲁': ['秘鲁'],
                          '阿联酋': ['阿联酋'], '卢森堡': ['卢森堡'],
                          '巴拿马': ['巴拿马'], '新加坡': ['新加坡'],
                          '波黑': ['波黑'], '南非': ['南非'],
                          '苏/俄': ['苏/俄', '苏联', '俄罗斯'], '加拿大': ['加拿大'], '芬兰': ['芬兰']}
        # MongoDB comparison operator -> wordings.
        self.compares = {
            '$gt': ['高于','大于','长于','高过','大过','长过','多于', '远于', '远过', '之后', '晚于', '后于'],
            '$lt': ['低于', '小于', '短于', '低过', '短过', '少于', '近于', '近过', '未达到', '没达到', '之前', '先于', '早于'],
            '$lte': ['不高于','不大于','不长于','不高过','不大过','不长过','不多于', '不远于', '不远过'],
            '$gte': ['不低于', '不小于', '不短于', '不低过', '不短过', '不少于', '不近于', '不近过', '达到'],
            '$eq': ['等于', '差不多'],
            '$ne': ['不等于', '不是']}
        self.counts = ['多少', '几', '几多']
        # MongoDB sort direction (-1 descending, 1 ascending) -> superlatives.
        self.mosts = {
            -1:['最大', '最远', '最长', '最高', '最久', '最快', '最多', '最强'],
            1:['最小', '最短', '最近', '最低', '最矮', '最慢', '最少', '最弱'],
        }
        # unit text -> [multiplier, base unit].  Only the multiplier is read
        # in this class; the '千克' label was '克' and is corrected here.
        self.unit_dict = {
            '海里': [1852, '米'],
            '英里': [1610, '米'],
            '/节': [1852, '米'],
            'km/节': [1000, '米'],
            '吨': [1000, '千克'],
            '-吨': [1000, '千克'],
            '公里': [1000, '米'],
            '公里/节': [1000, '米'],
            '公里/小时': [1000, '米'],
            '海里节': [1852, '米'],
            '海里,节': [1852, '米'],
            '海里/节': [1852, '米'],
            '海哩/节': [1852, '米'],
            '海浬/节': [1852, '米'],
            '毫米': [0.001, '米'],
            '节': [1852, '米'],
            '节/海里': [1852, '米'],
            '节海里': [1852, '米'],
            '节行驶英里': [1852, '米'],
            '节下海里': [1852, '米'],
            '克': [0.001, '千克'],
            '里': [1852, '米'],
            '里/节': [1852, '米'],
            '米': [1, '米'],
            '千克': [1, '千克'],
            '千米': [1000, '米'],
            '千米/节': [1000, '米'],
            '千米/时': [1000, '米'],
            '千米/小时': [1000, '米'],
            '千米每小时': [1000, '米'],
            '万海里/节': [18520000, '米'],
            '英里,节': [1610, '米'],
            '英里/节': [1610, '米'],
            '余英里': [1610, '米'],
            '约海里': [1852, '米'],
            '最大海里': [1852, '米'],
            '厘米': [0.01, '米'],
            '分米': [0.1, '米'],
            '人': [1, '人'],
            '位': [1, '位']}
        self.unit_pattern = self._build_unit_pattern(self.unit_dict)
        self.time_pattern = re.compile(self._TIME_REGEX)
        self.country_dict = self.build_dict(self.countries)
        self.big_dict = self.build_dict(self.big_cates)
        self.small_dict = self.build_dict(self.second_cates)
        self.attribute_dict = self.build_dict(self.attributes)
        self.compare_dict = self.build_dict(self.compares)
        self.most_dict = self.build_dict(self.mosts)
        self.add_jieba(self.country_dict, 'n_country')
        self.add_jieba(self.big_dict, 'n_big')
        self.add_jieba(self.small_dict, 'n_small')
        self.add_jieba(self.attribute_dict, 'n_attr')
        self.add_jieba(self.compare_dict, 'n_compare')
        self.add_jieba(self.most_dict, 'n_most')
        self.add_jieba(self.weapons, 'n_weapon')

    @staticmethod
    def _build_unit_pattern(unit_dict):
        """Compile the ``<number><unit>`` regex for the given unit table.

        Units are tried longest first so that e.g. '公里/小时' wins over
        '公里'.  The number part accepts a single digit and an optional
        decimal fraction; the previous expression required at least two
        digits and matched any character in place of the decimal point.
        """
        unit_wds = sorted(unit_dict, key=len, reverse=True)
        alternation = '|'.join(re.escape(wd) for wd in unit_wds)
        return re.compile(r'([0-9]+(?:\.[0-9]+)?)(%s)' % alternation)

    def load_weapons(self):
        """Return the distinct weapon names ('名称') found in the data file."""
        weapons = set()
        with open(self.datapath, encoding='utf-8') as handle:
            for record in handle:
                if not record.strip():
                    continue
                # The previous revision parsed each line and collected nothing.
                name = json.loads(record).get('名称')
                if name:
                    weapons.add(name)
        return list(weapons)

    def build_dict(self, dict):
        """Invert ``{category: [words]}`` into ``{word: category}``.

        When a word is listed under several categories the last one wins.
        (The parameter name shadows the builtin but is kept for callers
        that pass it by keyword.)
        """
        wd_dict = {}
        for cate, wds in dict.items():
            for wd in wds:
                wd_dict[wd] = cate
        return wd_dict

    def detect_entity(self, question):
        """Return ``(times, units)``: date and quantity substrings of ``question``."""
        units = [num + unit for num, unit in self.unit_pattern.findall(question)]
        times = self.time_pattern.findall(question)
        return times, units

    def standard_year(self, sent):
        """Convert ``'YYYY年[M月][D日]'`` text to ``'YYYYMMDD'``.

        Missing month/day default to ``'01'``; returns ``''`` when no
        four-digit year is present.
        """
        sent = sent.replace(' ', '')
        year = re.search(r'[0-9]{4}年', sent)
        if not year:
            return ''
        month = re.search(r'[0-9]{1,4}月', sent)
        day = re.search(r'[0-9]{1,4}日', sent)
        return (year.group().replace('年', '')
                + self.full_date(month.group().replace('月', '') if month else '')
                + self.full_date(day.group().replace('日', '') if day else ''))

    def full_date(self, date):
        """Left-pad a month/day to two digits; an empty value becomes ``'01'``."""
        if not date:
            return '01'
        return date.zfill(2)

    def check_num(self, sent):
        """Return the first run of digits in ``sent``, or ``''`` if there is none."""
        res = re.findall(r'\d+', str(sent))
        return res[0] if res else ''

    def standard_unit(self, unit_value):
        """Convert ``'<number><unit>'`` text to a float in the base unit.

        Unknown units use a multiplier of 1.  Decimal numbers are honoured
        (the previous revision truncated '3.5公里' to 3).

        Raises:
            ValueError: if ``unit_value`` does not start with a number.
        """
        match = re.match(r'\s*([0-9]+(?:\.[0-9]+)?)', unit_value)
        if match is None:
            raise ValueError('no leading number in %r' % (unit_value,))
        unit = unit_value[match.end():].strip()
        plus = self.unit_dict.get(unit, [1, 'default'])[0]
        return float(match.group(1)) * plus

    def add_jieba(self, wds, tag):
        """Register every word of ``wds`` with jieba under part-of-speech ``tag``."""
        for wd in wds:
            jieba.add_word(wd, tag=tag, freq=300000)

    def question_parser(self, question):
        """Segment ``question`` and bucket the recognised words by tag.

        Returns:
            dict with one list per tag ('n_attrs', 'n_times', ...), the tag
            sequence under 'pattern' and the raw ``(word, tag)`` pairs under
            'wds'.
        """
        times, units = self.detect_entity(question)
        # Dates and quantities are question-specific, so they are registered
        # with jieba just before segmentation.
        self.add_jieba(times, 'n_time')
        self.add_jieba(units, 'n_unit')
        wds = [(i.word, i.flag) for i in pseg.cut(question)]
        slots = {
            'n_attrs': 'n_attr', 'n_times': 'n_time', 'n_bigs': 'n_big',
            'n_smalls': 'n_small', 'n_countries': 'n_country',
            'n_compares': 'n_compare', 'n_mosts': 'n_most',
            'n_units': 'n_unit', 'n_weapons': 'n_weapon',
        }
        parser_dict = {slot: [wd for wd, flag in wds if flag == tag]
                       for slot, tag in slots.items()}
        known = set(slots.values())
        parser_dict['pattern'] = [flag for wd, flag in wds if flag in known]
        parser_dict['wds'] = wds
        return parser_dict

    @staticmethod
    def _single_entity_patterns():
        """[country] weapon followed by one to five attributes."""
        return [prefix + ['n_weapon'] + ['n_attr'] * n_attr
                for prefix in ([], ['n_country'])
                for n_attr in range(1, 6)]

    @staticmethod
    def _multi_entity_patterns():
        """Two to five weapons followed by one to six attributes.

        The previous hand-written list had duplicates and gaps (e.g. four
        weapons with four attributes); the full grid is a superset of it.
        """
        return [['n_weapon'] * n_weapon + ['n_attr'] * n_attr
                for n_weapon in range(2, 6)
                for n_attr in range(1, 7)]

    @staticmethod
    def _compare_patterns():
        """Attribute + one or two (operator, operand) pairs + a category."""
        pairs = (['n_compare', 'n_unit'], ['n_compare', 'n_time'], ['n_time', 'n_compare'])
        patterns = []
        for cate in ('n_small', 'n_big'):
            for pair in pairs:
                for repeat in (1, 2):
                    core = ['n_attr'] + pair * repeat
                    patterns.append(core + [cate])
                    patterns.append([cate] + core)
        return patterns

    @classmethod
    def _interleaved_attr_counts(cls, pattern):
        """Return attribute counts per weapon for '[country] weapon attrs' runs.

        Returns ``None`` unless ``pattern`` is two or more such runs.  This
        covers every pattern of the previous hand-written list (including
        the one that could never match because of a ``' n_weapon'`` typo).
        """
        encoded = ''.join(cls._FLAG_CODES.get(flag, '?') for flag in pattern)
        if not re.fullmatch(r'(?:C?WA+){2,}', encoded):
            return None
        return [len(run) for run in re.findall(r'W(A+)', encoded)]

    def _category_filter(self, parser_dict, pattern):
        """Return the sub-category ('类型') or top-category ('大类') filter."""
        if 'n_small' in pattern:
            return {'类型': self.small_dict.get(parser_dict.get('n_smalls')[0])}
        return {'大类': self.big_dict.get(parser_dict.get('n_bigs')[0])}

    def _build_search(self, parser_dict):
        """Translate a parsed question into ``(search_flag, search_data)``.

        ``search_flag`` is 1 for attribute look-ups and 0 for sorted
        ("largest"/"smallest") look-ups.  ``search_data`` is a list of
        ``{'condition': ..., 'targets': [...]}``; it is empty when the
        pattern is not recognised.
        """
        pattern = parser_dict['pattern']
        search_data = []
        targets = ['名称']
        search_flag = 1

        if pattern in [['n_country', 'n_small'], ['n_small', 'n_country']]:
            country = self.country_dict.get(parser_dict.get('n_countries')[0])
            n_small = self.small_dict.get(parser_dict.get('n_smalls')[0])
            search_data.append({'condition': {'产国': country, '类型': n_small},
                                'targets': targets})

        elif pattern in [['n_country', 'n_big'], ['n_big', 'n_country']]:
            country = self.country_dict.get(parser_dict.get('n_countries')[0])
            n_big = self.big_dict.get(parser_dict.get('n_bigs')[0])
            # Top-level categories live in '大类'; the previous revision
            # filtered on '类型' and therefore never matched.
            search_data.append({'condition': {'产国': country, '大类': n_big},
                                'targets': targets})

        elif pattern in [['n_country', 'n_weapon'], ['n_weapon']]:
            n_weapon = self.weapon_dict.get(parser_dict.get('n_weapons')[0])
            search_data.append({'condition': {'名称': n_weapon}, 'targets': ['简介']})

        # One entity, several attributes.
        elif pattern in self._single_entity_patterns():
            n_weapon = self.weapon_dict.get(parser_dict.get('n_weapons')[0])
            targets = [self.attribute_dict.get(attr) for attr in parser_dict.get('n_attrs')]
            search_data.append({'condition': {'名称': n_weapon}, 'targets': targets})

        # Several entities sharing the same attributes.
        elif pattern in self._multi_entity_patterns():
            n_weapons = [self.weapon_dict.get(weapon) for weapon in parser_dict.get('n_weapons')]
            targets = [self.attribute_dict.get(attr) for attr in parser_dict.get('n_attrs')]
            search_data.append({'condition': {'名称': {"$in": n_weapons}}, 'targets': targets})

        elif self._interleaved_attr_counts(pattern) is not None:
            # Entities interleaved with their own attributes.  Each weapon
            # gets exactly the attributes that follow it; the previous
            # revision split the attribute list at the number of weapons.
            counts = self._interleaved_attr_counts(pattern)
            n_weapons = [self.weapon_dict.get(weapon) for weapon in parser_dict.get('n_weapons')]
            attrs = [self.attribute_dict.get(attr) for attr in parser_dict.get('n_attrs')]
            start = 0
            for n_weapon, count in zip(n_weapons, counts):
                search_data.append({'condition': {'名称': n_weapon},
                                    'targets': attrs[start:start + count]})
                start += count

        # Comparison with one or two operator/operand pairs.
        elif pattern in self._compare_patterns():
            n_attr = self.attribute_dict.get(parser_dict.get('n_attrs')[0])
            n_compares = [self.compare_dict.get(compare) for compare in parser_dict.get('n_compares')]
            if 'n_unit' in pattern:
                operands = [self.standard_unit(unit) for unit in parser_dict.get('n_units')]
            else:
                operands = [self.standard_year(year) for year in parser_dict.get('n_times')]
            condition = {n_attr: dict(zip(n_compares, operands))}
            condition.update(self._category_filter(parser_dict, pattern))
            search_data.append({'condition': condition, 'targets': [n_attr]})

        # Largest / smallest value of an attribute within a category.
        elif pattern in [['n_small', 'n_attr', 'n_most'],
                         ['n_attr', 'n_most', 'n_small'],
                         ['n_big', 'n_attr', 'n_most'],
                         ['n_attr', 'n_most', 'n_big']]:
            search_flag = 0
            n_attr = self.attribute_dict.get(parser_dict.get('n_attrs')[0])
            n_most = self.most_dict.get(parser_dict.get('n_mosts')[0])
            condition = self._category_filter(parser_dict, pattern)
            condition['sort_key'] = {n_attr: n_most}
            search_data.append({'condition': condition, 'targets': targets})

        return search_flag, search_data

    def search_answer(self, parser_dict):
        """Build the query for a parsed question and run it against MongoDB."""
        search_flag, search_data = self._build_search(parser_dict)
        return self.query_mongo(search_flag, search_data)

    def query_mongo(self, search_flag, search_data):
        """Dispatch to the attribute look-up (truthy flag) or the sorted look-up."""
        if search_flag:
            return self.query_mongo_attr(search_data)
        return self.query_mongo_sort(search_data)

    def query_mongo_attr(self, search_data):
        """Return one list of ``'<name><attr>:<value>'`` strings per matching record."""
        result = []
        for search in search_data:
            condition = search['condition']
            targets = search['targets']
            for res in self.col.find(condition):
                name = str(res.get('名称', ''))
                result.append([name + target + ':' + str(res.get(target, 'null'))
                               for target in targets
                               if res.get(target, 'null') != 'null'])
        return result

    def query_mongo_sort(self, search_data):
        """Return the top record of each sorted query as a list of strings."""
        result = []
        for search in search_data:
            sort_spec = search['condition'].get('sort_key') or {}
            condition = {key: value for key, value in search['condition'].items()
                         if key != 'sort_key'}
            for attr in sort_spec:
                # Records without the attribute sort first in ascending
                # order and would be returned as the "smallest".
                condition.setdefault(attr, {'$exists': True})
            targets = search['targets']
            cursor = self.col.find(condition)
            if sort_spec:
                cursor = cursor.sort(list(sort_spec.items()))
            for res in cursor.limit(1):
                name = str(res.get('名称', ''))
                # The previous revision built this list and never kept it.
                result.append([name + target + ':' + str(res.get(target, 'null'))
                               for target in targets])
        return result

    def qa_main(self, question):
        """Answer ``question``, print the outcome and return the raw results."""
        parser_dict = self.question_parser(question)
        results = self.search_answer(parser_dict)
        # Covers both "no record matched" ([]) and "records matched but none
        # carries the requested attributes" ([[]]).
        if not any(results):
            print('sorry, do not know the answer yet...')
            return results
        print('find %s result:'% len(results))
        print('answer detail:')
        for result in results:
            print(result)
        return results
if __name__ == '__main__':
    # Interactive loop: read a question and answer it.  The previous
    # revision read the question and then discarded it.
    handler = MilitaryGraph()
    while True:
        try:
            question = input("enter an question to parser:\n")
        except (EOFError, KeyboardInterrupt):
            # Leave cleanly on Ctrl-D / Ctrl-C instead of dumping a traceback.
            break
        handler.qa_main(question)
