create new project

This commit is contained in:
lhy_in_blcu@126.com 2019-05-02 00:44:38 +08:00
commit 46e8b676e6
10 changed files with 7155 additions and 0 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

View File

@ -0,0 +1,3 @@
<component name="MarkdownNavigator.ProfileManager">
<settings default="" pdf-export="" />
</component>

11
.idea/military_graph.iml Normal file
View File

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.5 (nlp)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="Unittests" />
</component>
</module>

4
.idea/misc.xml Normal file
View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.5 (nlp)" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/military_graph.iml" filepath="$PROJECT_DIR$/.idea/military_graph.iml" />
</modules>
</component>
</project>

427
.idea/workspace.xml Normal file
View File

@ -0,0 +1,427 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ChangeListManager">
<list default="true" id="7d73eeac-bcfb-4439-a5bc-734bf782761e" name="Default" comment="" />
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="CreatePatchCommitExecutor">
<option name="PATCH_PATH" value="" />
</component>
<component name="FUSProjectUsageTrigger">
<session id="-1972844595">
<usages-collector id="statistics.lifecycle.project">
<counts>
<entry key="project.open.time.2" value="1" />
<entry key="project.opened" value="1" />
</counts>
</usages-collector>
<usages-collector id="statistics.file.extensions.edit">
<counts>
<entry key="py" value="14481" />
<entry key="txt" value="14" />
</counts>
</usages-collector>
<usages-collector id="statistics.file.types.edit">
<counts>
<entry key="PLAIN_TEXT" value="14" />
<entry key="Python" value="14481" />
</counts>
</usages-collector>
<usages-collector id="statistics.file.extensions.open">
<counts>
<entry key="py" value="2" />
<entry key="txt" value="1" />
</counts>
</usages-collector>
<usages-collector id="statistics.file.types.open">
<counts>
<entry key="PLAIN_TEXT" value="1" />
<entry key="Python" value="2" />
</counts>
</usages-collector>
</session>
</component>
<component name="FavoritesManager">
<favorites_list name="military_graph" />
</component>
<component name="FileEditorManager">
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/collect_data.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-1087">
<caret line="91" column="8" selection-start-line="91" selection-start-column="8" selection-end-line="91" selection-end-column="8" />
<folding>
<element signature="e#16#25#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/militarygraph.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="2758">
<caret line="567" column="48" lean-forward="true" selection-start-line="567" selection-start-column="48" selection-end-line="567" selection-end-column="48" />
<folding>
<element signature="e#144#153#0" expanded="true" />
<marker date="1556728777011" expanded="true" signature="13604:13634" ph="..." />
<marker date="1556728777011" expanded="true" signature="14921:14926" ph="..." />
<marker date="1556728777011" expanded="true" signature="15186:15279" ph="..." />
<marker date="1556728777011" expanded="true" signature="29015:29043" ph="..." />
<marker date="1556728777011" expanded="true" signature="29132:29246" ph="..." />
<marker date="1556728777011" expanded="true" signature="30060:30088" ph="..." />
</folding>
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/unit.txt">
<provider selected="true" editor-type-id="LargeFileEditor">
<state relative-caret-position="-3353">
<caret line="114" column="1" lean-forward="true" selection-start-line="114" selection-start-column="1" selection-end-line="114" selection-end-column="1" />
</state>
</provider>
</entry>
</file>
<file pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/insert_data.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="79">
<caret line="105" column="13" selection-start-line="105" selection-start-column="13" selection-end-line="105" selection-end-column="13" />
<folding>
<element signature="e#0#9#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
</leaf>
</component>
<component name="FileTemplateManagerImpl">
<option name="RECENT_TEMPLATES">
<list>
<option value="Python Script" />
</list>
</option>
</component>
<component name="FindInProjectRecents">
<findStrings>
<find>class=&quot;pic&quot;</find>
<find>机长</find>
<find>歼-15</find>
<find>类型</find>
<find>航空母舰</find>
<find>坦克</find>
<find>直升机</find>
<find>速度</find>
<find>$</find>
<find>最大航程</find>
<find>n_mos</find>
<find>pattern</find>
<find>数量</find>
<find>print</find>
</findStrings>
</component>
<component name="IdeDocumentHistory">
<option name="CHANGED_PATHS">
<list>
<option value="$PROJECT_DIR$/data.txt" />
<option value="$PROJECT_DIR$/drug_graph.py" />
<option value="$PROJECT_DIR$/co_drug_graph.txt" />
<option value="$PROJECT_DIR$/data.py" />
<option value="$PROJECT_DIR$/insert_data.py" />
<option value="$PROJECT_DIR$/collect_data.py" />
<option value="$PROJECT_DIR$/militarygraph.py" />
</list>
</option>
</component>
<component name="ProjectFrameBounds" fullScreen="true">
<option name="x" value="2" />
<option name="y" value="23" />
<option name="width" value="1680" />
<option name="height" value="971" />
</component>
<component name="ProjectView">
<navigator proportions="" version="1">
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="ProjectPane">
<subPane>
<expand>
<path>
<item name="military_graph" type="b2602c69:ProjectViewProjectNode" />
<item name="military_graph" type="462c0819:PsiDirectoryNode" />
</path>
</expand>
<select />
</subPane>
</pane>
<pane id="Scope" />
</panes>
</component>
<component name="PropertiesComponent">
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
</component>
<component name="RecentsManager">
<key name="MoveFile.RECENT_KEYS">
<recent name="$PROJECT_DIR$/data" />
</key>
</component>
<component name="RunDashboard">
<option name="ruleStates">
<list>
<RuleState>
<option name="name" value="ConfigurationTypeDashboardGroupingRule" />
</RuleState>
<RuleState>
<option name="name" value="StatusDashboardGroupingRule" />
</RuleState>
</list>
</option>
</component>
<component name="RunManager" selected="Python.militarygraph">
<configuration default="true" type="tests" factoryName="Attests">
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs />
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="" />
<option name="IS_MODULE_SDK" value="false" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<module name="military_graph" />
<option name="SCRIPT_NAME" value="" />
<option name="CLASS_NAME" value="" />
<option name="METHOD_NAME" value="" />
<option name="FOLDER_NAME" value="" />
<option name="TEST_TYPE" value="TEST_SCRIPT" />
<option name="PATTERN" value="" />
<option name="USE_PATTERN" value="false" />
<method />
</configuration>
<configuration default="true" type="PythonConfigurationType" factoryName="Python">
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="" />
<option name="IS_MODULE_SDK" value="false" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<module name="" />
<option name="SCRIPT_NAME" value="" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="data" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="military_graph" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/collect_data.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="drug_graph" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="military_graph" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/drug_graph.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="insert_data" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="military_graph" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/insert_data.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="militarygraph" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="military_graph" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/militarygraph.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<list>
<item itemvalue="Python.data" />
<item itemvalue="Python.drug_graph" />
<item itemvalue="Python.militarygraph" />
<item itemvalue="Python.insert_data" />
</list>
<recent_temporary>
<list>
<item itemvalue="Python.militarygraph" />
<item itemvalue="Python.insert_data" />
<item itemvalue="Python.data" />
<item itemvalue="Python.drug_graph" />
</list>
</recent_temporary>
</component>
<component name="SvnConfiguration">
<configuration />
</component>
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="7d73eeac-bcfb-4439-a5bc-734bf782761e" name="Default" comment="" />
<created>1552034938625</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1552034938625</updated>
</task>
<servers />
</component>
<component name="ToolWindowManager">
<frame x="0" y="0" width="1680" height="1050" extended-state="0" />
<editor active="true" />
<layout>
<window_info content_ui="combo" id="Project" order="0" visible="true" weight="0.0596745" />
<window_info id="Structure" order="1" weight="0.25" />
<window_info id="Favorites" order="2" side_tool="true" />
<window_info anchor="bottom" id="Message" order="0" />
<window_info anchor="bottom" id="Find" order="1" />
<window_info anchor="bottom" id="Run" order="2" weight="0.6765306" />
<window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
<window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
<window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
<window_info anchor="bottom" id="TODO" order="6" />
<window_info anchor="bottom" id="Version Control" order="7" show_stripe_button="false" />
<window_info anchor="bottom" id="Terminal" order="8" />
<window_info anchor="bottom" id="Python Console" order="9" />
<window_info anchor="bottom" id="Event Log" order="10" side_tool="true" />
<window_info anchor="right" id="Commander" order="0" weight="0.4" />
<window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
<window_info anchor="right" id="Data View" order="3" />
</layout>
</component>
<component name="Vcs.Log.UiProperties">
<option name="RECENTLY_FILTERED_USER_GROUPS">
<collection />
</option>
<option name="RECENTLY_FILTERED_BRANCH_GROUPS">
<collection />
</option>
</component>
<component name="VcsContentAnnotationSettings">
<option name="myLimit" value="2678400000" />
</component>
<component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/data.txt" />
<entry file="file://$PROJECT_DIR$/drug_graph.py" />
<entry file="file://$PROJECT_DIR$/attributes_all.txt" />
<entry file="file://$PROJECT_DIR$/co_drug_graph.txt" />
<entry file="file://$PROJECT_DIR$/unit.txt">
<provider selected="true" editor-type-id="LargeFileEditor">
<state relative-caret-position="-3353">
<caret line="114" column="1" lean-forward="true" selection-start-line="114" selection-start-column="1" selection-end-line="114" selection-end-column="1" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/collect_data.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-1087">
<caret line="91" column="8" selection-start-line="91" selection-start-column="8" selection-end-line="91" selection-end-column="8" />
<folding>
<element signature="e#16#25#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/insert_data.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="79">
<caret line="105" column="13" selection-start-line="105" selection-start-column="13" selection-end-line="105" selection-end-column="13" />
<folding>
<element signature="e#0#9#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/militarygraph.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="2758">
<caret line="567" column="48" lean-forward="true" selection-start-line="567" selection-start-column="48" selection-end-line="567" selection-end-column="48" />
<folding>
<element signature="e#144#153#0" expanded="true" />
<marker date="1556728777011" expanded="true" signature="13604:13634" ph="..." />
<marker date="1556728777011" expanded="true" signature="14921:14926" ph="..." />
<marker date="1556728777011" expanded="true" signature="15186:15279" ph="..." />
<marker date="1556728777011" expanded="true" signature="29015:29043" ph="..." />
<marker date="1556728777011" expanded="true" signature="29132:29246" ph="..." />
<marker date="1556728777011" expanded="true" signature="30060:30088" ph="..." />
</folding>
</state>
</provider>
</entry>
</component>
</project>

172
collect_data.py Normal file
View File

@ -0,0 +1,172 @@
# coding: utf-8
import os
from urllib import request
from lxml import etree
import gzip
import pymongo
import datetime
class NewspaperSpider:
def __init__(self):
    """Prepare the category label table and open the MongoDB client."""
    # URL slug of each top-level category on the site -> label stored
    # with every scraped record.
    self.term_dict = dict([
        ('aircraft', "飞行器"),
        ('warship', "舰船舰艇"),
        ('guns', "枪械与单兵"),
        ('tank', "坦克装甲车辆"),
        ('artillery', "火炮"),
        ('missile', "导弹武器"),
        ('spaceship', "太空装备"),
        ('explosive', "爆炸物"),
    ])
    # MongoClient() with no arguments uses the driver's default host/port.
    self.conn = pymongo.MongoClient()
'''get html '''
def get_html(self, url, timeout=30):
    """Download ``url`` and return its body as text.

    The request carries a browser-like header set.  The body is gunzipped
    only when it actually is gzip data, so an uncompressed response no
    longer crashes the crawl.

    Args:
        url: Absolute URL to fetch.
        timeout: Socket timeout in seconds handed to ``urlopen``.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        # Only advertise the encoding this method can decode.
        'Accept-Encoding': 'gzip',
        'Accept-Language': 'en-US,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'Hm_lvt_1fc983b4c305d209e7e05d96e713939f=1552034977; Hm_lpvt_1fc983b4c305d209e7e05d96e713939f=1552036141',
        'Host': 'weapon.huanqiu.com'
    }
    req = request.Request(url, headers=headers)
    # The context manager closes the connection even if read() fails.
    with request.urlopen(req, timeout=timeout) as resp:
        raw = resp.read()
        declared = resp.headers.get('Content-Encoding') or ''
    # Trust either the header or the gzip magic number; some servers omit
    # the header.
    if 'gzip' in declared.lower() or raw[:2] == b'\x1f\x8b':
        raw = gzip.decompress(raw)
    return raw.decode('utf-8')
'''get_urllist'''
def get_urllist(self, url):
    """Return the distinct detail-page URLs linked from one listing page.

    Args:
        url: URL of a listing page.

    Returns:
        A list of absolute URLs with duplicates removed (order is not
        defined because a set is used for de-duplication).
    """
    tree = etree.HTML(self.get_html(url))
    hrefs = tree.xpath('//li/span[@class="pic"]/a/@href')
    unique_links = set()
    for href in hrefs:
        unique_links.add('http://weapon.huanqiu.com' + href)
    return list(unique_links)
'''content parser'''
def html_parser(self, url):
    """Fetch a detail page and pull out its title and attribute rows.

    Args:
        url: URL of a detail page.

    Returns:
        A list ``[html, title, row_text, row_text, ...]`` where ``html`` is
        the raw page, ``title`` is the first ``<title>`` text node, and each
        ``row_text`` is the string value of one ``<li>`` under
        ``div.dataInfo``.

    Raises:
        IndexError: If the page has no ``<title>`` text.
    """
    page = self.get_html(url)
    tree = etree.HTML(page)
    heading = tree.xpath('//title/text()')[0]
    rows = tree.xpath('//div[@class="dataInfo"]/ul/li')
    return [page, heading] + [row.xpath('string(.)') for row in rows]
'''modify data'''
def modify_data(self):
# Re-derive attribute records from the raw page lists stored in the
# military.kb collection and write them to military.graph_data.
# Each stored item['contents'] is read as [html, title, attr_text, ...].
# NOTE(review): indentation is missing in this copy of the file, so the
# nesting of the statements below (in particular whether insert() runs once
# per item) cannot be read from here - confirm against the repository.
keys = []
for item in self.conn['military']['kb'].find():
body = item['contents']
# NOTE(review): three of these replace() calls have an empty string as the
# text to find. str.replace('', x) inserts x between every character, which
# is unlikely to be intended; the original characters (presumably non-ASCII
# punctuation) look lost in extraction - TODO confirm.
title = body[1].replace(' ','').replace('','-').replace('','(').replace('',')')
# The title is split on '_' into a name part and a category part.
title = title.split('_')
data = {}
name = title[0]
category = title[1]
data['名称'] = name
data['类别'] = category
attrs = body[2:]
html = body[0]
selector = etree.HTML(html)
# First country link text; raises IndexError when the page has none.
country = selector.xpath('//span[@class="country"]/b/a/text()')[0]
data['产国'] = country
for attr in attrs:
# NOTE(review): split('') raises ValueError("empty separator") as written;
# the separator between key and value appears to have been lost - confirm.
if len(attr.split('')) < 2:
continue
key = attr.split('')[0].replace('','(').replace(' ','').replace('\t','')
# Keys that start with '(' or are longer than six characters are skipped.
if key.startswith('(') or len(key) > 6:
continue
value = attr.split('')[1]
# Tabs, newlines and commas are removed from the stored value.
data[key] = value.replace('\t','').replace('\n','').replace(',','')
# keys is only appended to; nothing in this method reads it.
keys.append(key)
# NOTE(review): Collection.insert() is the legacy PyMongo API - verify the
# installed driver still provides it.
self.conn['military']['graph_data'].insert(data)
return
'''采集主函数'''
def spider_main(self):
    """Crawl every category, sub-category and listing page, storing each item.

    For each top-level category the sub-category links are read from the
    first ``span.list`` element; every listing page of every sub-category
    is then visited and each linked detail page is passed to ``get_info``.
    """
    site = 'http://weapon.huanqiu.com'
    for big_cate in ('aircraft', 'warship', 'guns', 'tank',
                     'artillery', 'missile', 'spaceship', 'explosive'):
        listing = etree.HTML(self.get_html('http://weapon.huanqiu.com/weaponlist/%s' % big_cate))
        nav = listing.xpath('//span[@class="list"]')[0]
        second_urls = [site + href for href in nav.xpath('./a/@href')]
        second_cates = list(nav.xpath('./a/text()'))
        # Sub-category label -> URL; a repeated label keeps its last URL.
        second_dict = {
            label: second_urls[pos] for pos, label in enumerate(second_cates)
        }
        for second_cate, second_url in second_dict.items():
            last_page = self.get_maxpage(second_url)
            for page in range(1, last_page + 1):
                page_url = second_url + '_0_0_%s' % page
                for seed in self.get_urllist(page_url):
                    self.get_info(seed, big_cate, second_cate)
'''根据最大值,获取所有信息'''
def get_info(self, url, big_cate, second_cate):
    """Scrape one detail page and store it in ``military.knowledge_base``.

    Args:
        url: URL of the detail page.
        big_cate: Top-level category slug; translated through
            ``self.term_dict`` (``None`` when the slug is unknown).
        second_cate: Sub-category label stored as-is.

    Returns:
        None.
    """
    content = self.html_parser(url)
    data = self.extract_data(content)
    # Check for an empty extraction *before* adding the category keys;
    # afterwards the dict can never be empty and the check is meaningless.
    if not data:
        return
    data['大类'] = self.term_dict.get(big_cate)
    data['类型'] = second_cate
    print(data)
    # insert_one() replaces Collection.insert(), which was removed in
    # PyMongo 4.
    self.conn['military']['knowledge_base'].insert_one(data)
'''modify data'''
def extract_data(self, content):
# Turn the list produced by html_parser ([html, title, attr_text, ...]) into
# a flat dict of fields for one item.
# Returns the dict; it always contains the name, country, image and summary
# keys set below, plus one key per accepted attribute row.
# NOTE(review): indentation is missing in this copy of the file; the nesting
# is not visible here.
# NOTE(review): three replace() calls below search for an empty string.
# str.replace('', x) inserts x between every character; the original
# characters look lost in extraction - TODO confirm against the repository.
title = content[1].replace(' ', '').replace('', '-').replace('', '(').replace('', ')')
# Only the part of the title before the first '_' is kept as the name.
title = title.split('_')
data = {}
name = title[0]
data['名称'] = name
attrs = content[2:]
html = content[0]
selector = etree.HTML(html)
# First country link text; raises IndexError when the page has none.
country = selector.xpath('//span[@class="country"]/b/a/text()')[0]
image = selector.xpath('//div[@class="maxPic"]/img/@src')
# The image is optional: empty string when no <img> is found.
if not image:
image = ''
else:
image = image[0]
data['产国'] = country
data['图片'] = image
# Summary = all <p> text under div.module joined together, with
# non-breaking spaces, ideographic spaces and tabs removed.
data['简介'] = ''.join(selector.xpath('//div[@class="module"]/p/text()')).replace('\xa0','').replace('\u3000', '').replace('\t', '')
for attr in attrs:
# NOTE(review): split('') raises ValueError("empty separator") as written;
# the key/value separator appears to have been lost - confirm.
if len(attr.split('')) < 2:
continue
key = attr.split('')[0].replace('', '(').replace(' ', '').replace('\t', '')
# Keys that start with '(' or are longer than six characters are skipped.
if key.startswith('(') or len(key) > 6:
continue
value = attr.split('')[1]
# Tabs, newlines and commas are removed from the stored value.
data[key] = value.replace('\t', '').replace('\n', '').replace(',', '')
return data
'''获取最大值'''
def get_maxpage(self, url):
    """Return the number of listing pages for a sub-category.

    Args:
        url: URL of the sub-category's first listing page.

    Returns:
        ``1`` when the page has no pager links; otherwise the integer value
        of the second-to-last pager link text.

    Raises:
        IndexError: If the pager has exactly one link.
        ValueError: If the second-to-last link text is not an integer.
    """
    tree = etree.HTML(self.get_html(url))
    pager_labels = tree.xpath('//div[@class="pages"]/a/text()')
    # NOTE(review): presumably the last pager link is a "next" control,
    # which is why the one before it is read - confirm against the site.
    return int(pager_labels[-2]) if pager_labels else 1
if __name__ == '__main__':
    # Run the full crawl when executed as a script.
    NewspaperSpider().spider_main()

5800
data/military.json Normal file

File diff suppressed because it is too large Load Diff

137
insert_data.py Normal file
View File

@ -0,0 +1,137 @@
import os
import json
import re
import pymongo
class InsertData:
def __init__(self):
# Resolve the data file next to this script and open the MongoDB collection
# military_qa.data that insert_main() writes to.
# The directory is obtained by splitting the absolute path on '/'.
cur = '/'.join(os.path.abspath(__file__).split('/')[:-1])
self.datapath = os.path.join(cur, 'data/military.json')
self.conn = pymongo.MongoClient()
self.db = self.conn['military_qa']
self.collection = self.db['data']
# unit text -> [multiplier, unit label]. insert_main() multiplies the
# parsed number by element 0 and stores element 1 under '<key>_单位'.
# NOTE(review): several keys below are empty strings. In a dict literal a
# repeated key keeps only the last value, so as written every '' entry but
# the final one is dead. These look like single-character unit names lost
# in extraction - TODO confirm against the repository.
self.unit_dict = {
'海里':[1852,''],
'英里':[1610,''],
'/节':[1852,''],
'km/节':[1000,''],
'':[1000,'千克'],
'-吨':[1000,'千克'],
'公里':[1000,''],
'公里/节':[1000,''],
'公里/小时':[1000,''],
'海里节':[1852,''],
'海里,节':[1852,''],
'海里/节':[1852,''],
'海哩/节':[1852,''],
'海浬/节':[1852,''],
'毫米':[0.001,''],
'':[1852,''],
'节/海里':[1852,''],
'节海里':[1852,''],
'节行驶英里':[1852,''],
'节下海里':[1852,''],
'':[0.001,'千克'],
'':[1852,''],
'里/节':[1852,''],
'':[1,''],
'千克':[1,''],
'千米':[1000,''],
'千米/节':[1000,''],
'千米/时':[1000,''],
'千米/小时':[1000,''],
'千米每小时':[1000,''],
'万海里/节':[18520000,''],
'英里,节':[1610,''],
'英里/节':[1610,''],
'余英里':[1610,''],
'约海里':[1852,''],
'最大海里':[1852,''],
'': [1, ''],
'': [1, '']}
return
def insert_main(self):
# Read the file at self.datapath one JSON document per line, normalise
# quantity and date values, and insert each document into self.collection.
# Prints every converted document and a final count.
# NOTE(review): indentation is missing in this copy of the file; the nesting
# is not visible here.
count = 0
# NOTE(review): the file is opened without an explicit encoding and is never
# closed explicitly.
for record in open(self.datapath):
# Drop the stored '_id' field.
data = {i:j for i,j in json.loads(record).items() if i !='_id'}
# Work on a copy so keys can be added while iterating over data.
data_new = data.copy()
for key, value in data.items():
# Quantity branch: the value contains a digit and is shorter than 11
# characters.
# NOTE(review): all six endswith('') tests are always True for any string;
# the suffix characters look lost in extraction - TODO confirm.
if key not in ['简介', '_id'] and self.check_num(value) and (value.endswith('') or value.endswith('') or value.endswith('') or value.endswith('') or value.endswith('') or value.endswith('')) and len(value) < 11:
# value_ is the value with digits, '.' and spaces removed, i.e. the unit text.
value_ = ''.join([i for i in value if i not in ['0','1','2','3','4','5','6','7','8','9','.']]).replace(' ','')
try:
num = float(value.replace(value_,''))
# get() returns None for an unknown unit; indexing None then raises and
# lands in the except branch below.
unit_info = self.unit_dict.get(value_)
plus = unit_info[0]
unit = unit_info[1]
num_standrd = num * plus
value_new = num_standrd
value_unit = unit
# The unit label is stored next to the value under '<key>_单位'.
key_unit = key + '_单位'
data_new[key_unit] = value_unit
except Exception as e:
# Any failure is printed and the raw value is kept unchanged.
print(e)
value_new = value
pass
data_new[key] = value_new
# Date branch: check_year() returned a non-empty string and the value is at
# most 15 characters long.
elif key not in ['简介', '_id'] and self.check_year(value) and len(value) <= 15:
# The original text moves to '<key>_详细'; key holds the normalised date.
new_key = key + '_详细'
new_value = self.check_year(value)
data_new[new_key] = value
data_new[key] = new_value
print(data_new)
# NOTE(review): Collection.insert() is the legacy PyMongo API - verify the
# installed driver still provides it.
self.collection.insert(data_new)
count += 1
print('finished insert into database with %s records!'%count)
return
'检测是否有数字'
def check_num(self, sent):
    """Return every run of digits found in ``sent``.

    Args:
        sent: Any value; it is converted with ``str()`` before matching.

    Returns:
        A list of digit strings in order of appearance; empty (falsy) when
        ``sent`` contains no digit.
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    return re.findall(r'\d+', str(sent))
'''检查年份'''
def check_year(self, sent):
    """Normalise a date expression to a ``YYYYMMDD`` string.

    The first run of four digits is the year.  The first number directly
    followed by ``月`` is the month and the first number directly followed
    by ``日`` is the day; a missing month or day defaults to ``01``.

    Args:
        sent: Text that may contain a date, e.g. ``'1999年5月3日'``.

    Returns:
        ``YYYYMMDD`` with single-digit parts zero-padded, or ``''`` when no
        four-digit year is present.
    """
    sent = sent.replace(' ', '')
    year = re.search(r'[0-9]{4}', sent)
    if year is None:
        return ''
    # Capture only the digits so the 月 / 日 suffix never reaches the
    # padding step (the previous replace('', '') was a no-op and left it in).
    month = re.search(r'([0-9]{1,4})月', sent)
    day = re.search(r'([0-9]{1,4})日', sent)
    parts = [year.group(0)]
    for found in (month, day):
        digits = found.group(1) if found else '01'
        # Same rule as full_date(): pad a single digit to two characters.
        parts.append(digits.zfill(2))
    return ''.join(parts)
'''补全日期'''
def full_date(self, date):
    """Return a month/day string padded to two characters.

    Args:
        date: Digit string for a month or day; a falsy value means
            "unknown".

    Returns:
        ``'01'`` for a falsy input, a single digit prefixed with ``'0'``,
        otherwise the input unchanged.

    Raises:
        ValueError: If a non-empty ``date`` is not an integer literal.
    """
    padded = date if date else '01'
    # int() is evaluated first, exactly as before, so non-numeric text
    # still raises regardless of its length.
    if int(padded) < 10 and len(padded) < 2:
        return '0' + padded
    return padded
if __name__ == '__main__':
    # Load the JSON dump into MongoDB when executed as a script.
    InsertData().insert_main()

593
military_qa.py Normal file
View File

@ -0,0 +1,593 @@
#!/usr/bin/env python3
# coding: utf-8
# File: military_qa.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 19-3-11
import os
import re
import json
import jieba
import jieba.posseg as pseg
import pymongo
class MilitaryGraph:
def __init__(self):
# Build every lookup table the question parser uses and register the
# vocabulary with jieba.
# NOTE(review): indentation is missing in this copy of the file, and several
# one-character string literals appear as '' (see the notes below); both
# look like extraction damage - TODO confirm against the repository.
# The data file is resolved next to this script (path split on '/').
cur = '/'.join(os.path.abspath(__file__).split('/')[:-1])
self.datapath = os.path.join(cur, 'data/military.json')
self.conn = pymongo.MongoClient()
db_name = 'military_qa'
col_name = 'data'
self.col = self.conn[db_name][col_name]
# Attribute name -> list of phrasings that refer to it. build_dict() below
# inverts each of these tables into phrasing -> name.
self.attributes ={'同型': ['同型'], '机高': ['机高'],
'战斗全重': ['战斗全重'], '水下排水量': ['水下排水量'],
'处理器': ['处理器'], '主炮': ['主炮'],
'制导系统': ['制导系统'], '全重': ['全重'],
'纬度': ['纬度'], '炮口初速': ['炮口初速'],
'发射性能': ['发射性能'], '兵装': ['兵装'],
'型号': ['型号'],
'长度': ['长度', '全长', '多长'], '翼展': ['翼展', '翼长'],
'全枪长': ['全枪长', '枪长'], '射程': ['射程'],
'前型': ['前型'],
'发射地点': ['发射地点', '发射地点'], '首飞时间': ['首飞时间', '首飞', '初次飞行', '首次飞行'],
'发动机数量': ['发动机数量', '几个发动机', '多少个发动机', '发动机个数', '发动机数目', '发动机个','发动机数'], '乘员': ['乘员'],
'战斗射速': ['战斗射速'], '生产单位': ['生产单位', '产商', '制造商', '厂家', '制造机构'],
'最大行程': ['最大行程', '最常距离'], '炮管长度': ['炮管长度', '炮管长', '炮管全长'],
'气动布局': ['气动布局'], '武备': ['武备'],
'武器装备': ['武器装备'], '引信': ['引信'],
'参战情况': ['参战情况'],
'动力装置': ['动力装置'], '飞行速度': ['飞行速度'],
'服役时间': ['服役时间'], '新造时': ['新造时'],
'活动范围': ['活动范围'], '弹匣容弹量': ['弹匣容弹量'],
'编制': ['编制'], '高度': ['高度'],
'制造厂': ['制造厂'], '口径': ['口径'],
'鱼雷': ['鱼雷'], '经度': ['经度'],
'研发时间': ['研发时间'], '简介': ['简介'],
'首次轨道发射': ['首次轨道发射'],
'挂载点': ['挂载点'], '刀锋宽度': ['刀锋宽度'],
'续航距离': ['续航距离'], '枪械': ['枪械'],
'最大速度': ['最大速度'], '运载火箭': ['运载火箭'],
'生产年限': ['生产年限'], '全枪重': ['全枪重'],
'空重': ['空重'], '水雷': ['水雷'],
'枪炮': ['枪炮'], '水上排水量': ['水上排水量', '排水量'],
'诞生时间': ['诞生时间'], '内置武器': ['内置武器'],
'机长': ['机长'], '中心直径': ['中心直径', '直径'],
'装药类型': ['装药类型'], '最大起飞重量': ['最大起飞重量', '起飞重量'],
'有效射程': ['有效射程'], '现状': ['现状'],
'研制时间': ['研制时间'], '舰舰导弹': ['舰舰导弹'],
'下水时间': ['下水时间', '下水'], '机炮': ['机炮'],
'弹长': ['弹长'], '退役时间': ['退役时间', '退役'],
'最大射程': ['最大射程'], '改装时': ['改装时'],
'刀重': ['刀重'], '自持力': ['自持力'],
'产国': ['产国'], '航速': ['航速'],
'制造商': ['制造商'], '型宽': ['型宽'],
'弹重': ['弹重'], '刀长': ['刀长'],
'舰长': ['舰长'], '研发厂商': ['研发厂商'],
'旋翼直径': ['旋翼直径'], '导弹': ['导弹'],
'满排吨位': ['满排吨位'], '底盘类型': ['底盘类型'],
'刀锋长度': ['刀锋长度'], '弹径': ['弹径'],
'全长': ['全长'], '竣工时': ['竣工时'],
'发射日期': ['发射日期'], '宽度': ['宽度'],
'总重': ['总重'], '建造时间': ['建造时间'],
'射控装置': ['射控装置'], '图片': ['图片'],
'轨道': ['轨道'], '改装前': ['改装前'],
'发动机': ['发动机'], '最大航程': ['最大航程'],
'研发单位': ['研发单位'], '大类': ['大类'],
'关注度': ['关注度'], '最大飞行速度': ['最大飞行速度'],
'火炮': ['火炮'], '战地机型': ['战地机型'],
'防空兵器': ['防空兵器'], '潜航深度': ['潜航深度'],
'轨道卫星': ['轨道卫星'], '尾翼装置': ['尾翼装置'],
'乘员与载员': ['乘员与载员'], '名称': ['名称'],
'引信装置': ['引信装置'], '次型': ['次型'],
'车长': ['车长'], '武装': ['武装'],"航长":['航长'],
'反舰导弹': ['反舰导弹'],
'满载排水量': ['满载排水量'], '装备': ['装备']}
# Top-level category -> phrasings.
# NOTE(review): the '' entry in the last list looks like a lost character.
self.big_cates ={'火炮': ['火炮'], '飞行器': ['飞行器'],
'舰船舰艇': ['舰船舰艇'], '坦克装甲车辆': ['坦克装甲车辆'],
'太空装备': ['太空装备'], '爆炸物': ['爆炸物'],
'导弹武器': ['导弹武器'], '枪械与单兵': ['枪械与单兵', '枪械', '', '单兵']}
# Sub-category -> phrasings. Some phrasings (e.g. '导弹', '坦克', '直升机',
# '卫星') are listed under several sub-categories; build_dict() keeps only
# the last sub-category it sees for such a phrasing.
self.second_cates = {'榴弹发射器': ['榴弹发射器'], '炸弹': ['炸弹', '炸药'],
'手榴弹': ['手榴弹'], '电子战机': ['电子战机'],
'机枪': ['机枪'], '宇宙飞船': ['宇宙飞船', '飞船'],
'加农炮': ['加农炮'], '救护车': ['救护车'],
'攻击机': ['攻击机'], '非自动步枪': ['非自动步枪', '步枪'],
'火箭弹': ['火箭弹'], '地雷': ['地雷'],
'高射炮': ['高射炮'], '航天飞机': ['航天飞机'],
'航天机构': ['航天机构', '航天局', '航天部门'], '舰舰导弹': ['舰舰导弹'],
'通用飞机': ['通用飞机'], '岸舰导弹': ['岸舰导弹', '导弹'],
'舰炮': ['舰炮'], '巡洋舰': ['巡洋舰'],
'气垫艇/气垫船': ['气垫艇/气垫船','气垫艇','气垫船'], '装甲指挥车': ['装甲指挥车', '装甲车', '指挥车'],
'无人机': ['无人机'], '氢弹': ['氢弹'],
'坦克炮': ['坦克炮'], '干线': ['干线'],
'原子弹': ['原子弹'], '冲锋枪': ['冲锋枪'],
'导弹艇': ['导弹艇'], '水雷战舰艇': ['水雷战舰艇'],
'侦察机': ['侦察机'], '试验机': ['试验机'],
'舰地(潜地)导弹': ['舰地(潜地)导弹','舰地导弹','潜地导弹', '导弹'],
'支线': ['支线'], '军事卫星': ['军事卫星'],
'地空导弹': ['地空导弹'], '航空炮': ['航空炮'],
'战列舰': ['战列舰'], '无后坐炮': ['无后坐炮'],
'空地导弹': ['空地导弹'], '加农榴弹炮': ['加农榴弹炮'],
'运输机': ['运输机'], '自行火炮': ['自行火炮'],
'地地导弹': ['地地导弹'], '空舰导弹': ['空舰导弹'],
'教练机': ['教练机'], '其他特种装甲车辆': ['其他特种装甲车辆'],
'火箭筒': ['火箭筒'], '空间探测器': ['空间探测器', '探测器'],
'预警机': ['预警机'], '航空母舰': ['航空母舰', '航母'],
'迷彩服': ['迷彩服'],'弹炮结合系统': ['弹炮结合系统'],
'科学卫星': ['科学卫星'], '空空导弹': ['空空导弹','导弹'],
'迫击炮': ['迫击炮'],
'应用卫星': ['应用卫星', '卫星'], '保障辅助舰艇': ['保障辅助舰艇'],
'刀具': ['刀具'], '霰弹枪': ['霰弹枪'],
'自动步枪': ['自动步枪'], '手枪': ['手枪'],
'反弹道导弹': ['反弹道导弹'], '两栖作战舰艇': ['两栖作战舰艇'],
'特种坦克': ['特种坦克', '坦克'], '运输直升机': ['运输直升机', '直升机'],
'巡逻舰/艇': ['巡逻舰/艇', '巡逻舰', '巡逻舰艇', '巡逻舰艇'], '加油机': ['加油机'],
'反坦克炮': ['反坦克炮'],
'越野车': ['越野车'], '步兵战车': ['步兵战车'],
'战斗机': ['战斗机'], '护卫舰': ['护卫舰'],
'工程抢修车': ['工程抢修车'],'反潜机': ['反潜机'],
'常规潜艇': ['常规潜艇'], '装甲侦察车': ['装甲侦察车'],
'舰空导弹': ['舰空导弹'], '运载火箭': ['运载火箭'],
'中子弹': ['中子弹'], '飞艇': ['飞艇'],
'航天基地': ['航天基地'], '鱼雷': ['鱼雷'],
'轰炸机': ['轰炸机'], '技术试验卫星': ['技术试验卫星', '卫星'],
'狙击枪': ['狙击枪'], '水雷': ['水雷'],
'装甲车载炮': ['装甲车载炮'], '榴弹炮': ['榴弹炮'],
'驱逐舰': ['驱逐舰'], '装甲运兵车': ['装甲运兵车'],
'火箭炮': ['火箭炮'], '多用途直升机': ['多用途直升机', '直升机'],
'核潜艇': ['核潜艇'], '武装直升机': ['武装直升机', '直升机'],
'布/扫雷车': ['布/扫雷车', '扫雷车', '扫雷车'], '潜舰导弹': ['潜舰导弹', '导弹'],
'主战坦克': ['主战坦克', '坦克']}
# Weapon names come from the data file; weapon_dict maps each name to itself.
self.weapons = self.load_weapons()
self.weapon_dict = {i:i for i in self.weapons}
# Country -> phrasings.
self.countries = {'荷兰': ['荷兰'], '阿根廷': ['阿根廷'], '瑞士': ['瑞士'],
'伊朗': ['伊朗'], '以色列': ['以色列'], '前南斯拉夫': ['前南斯拉夫'],
'越南': ['越南'], '葡萄牙': ['葡萄牙'], '乌克兰': ['乌克兰'],
'新西兰': ['新西兰'], '奥地利': ['奥地利'], '希腊': ['希腊'],
'塞尔维亚': ['塞尔维亚'], '比利时': ['比利时'],
'俄罗斯': ['俄罗斯'], '前捷克斯洛伐克': ['前捷克斯洛伐克'],
'捷克': ['捷克'], '土耳其': ['土耳其'], '缅甸': ['缅甸'],
'美国': ['美国'], '德国': ['德国'], '巴西': ['巴西'],
'印度尼西亚': ['印度尼西亚'], '法国': ['法国'],
'瑞典': ['瑞典'], '前苏联': ['前苏联'],
'朝鲜': ['朝鲜'],
'埃及': ['埃及'], '墨西哥': ['墨西哥'], '巴基斯坦': ['巴基斯坦'],
'马来西亚': ['马来西亚'], '澳大利亚': ['澳大利亚'], '泰国': ['泰国'],
'欧盟': ['欧盟'], '波兰': ['波兰'],
'韩国': ['韩国'], '日本': ['日本'],
'罗马尼亚': ['罗马尼亚'], '克罗地亚': ['克罗地亚'], '智利': ['智利'],
'匈牙利': ['匈牙利'], '意大利': ['意大利'], '英国': ['英国'],
'丹麦': ['丹麦'], '挪威': ['挪威'], '哈萨克斯坦': ['哈萨克斯坦'],
'爱尔兰': ['爱尔兰'], '伊拉克': ['伊拉克'],
'中国': ['中国','中华人民共和国'], '印度': ['印度'],
'保加利亚': ['保加利亚'], '斯洛伐克': ['斯洛伐克'],
'西班牙': ['西班牙'], '秘鲁': ['秘鲁'],
'阿联酋': ['阿联酋'], '卢森堡': ['卢森堡'],
'巴拿马': ['巴拿马'], '新加坡': ['新加坡'],
'波黑': ['波黑'], '南非': ['南非'],
'苏/俄': ['苏/俄', '苏联', '俄罗斯'], '加拿大': ['加拿大'], '芬兰': ['芬兰']}
# Comparison phrasings grouped under keys spelled like MongoDB comparison
# operators ($gt, $lt, ...).
self.compares = {
'$gt': ['高于','大于','长于','高过','大过','长过','多于', '远于', '远过', '之后', '晚于', '后于'],
'$lt': ['低于', '小于', '短于', '低过', '短过', '少于', '近于', '近过', '未达到', '没达到', '之前', '先于', '早于'],
'$lte': ['不高于','不大于','不长于','不高过','不大过','不长过','不多于', '不远于', '不远过'],
'$gte': ['不低于', '不小于', '不短于', '不低过', '不短过', '不少于', '不近于', '不近过', '达到'],
'$eq': ['等于', '差不多'],
'$ne': ['不等于', '不是']}
# Counting phrasings. NOTE(review): the '' entry looks like a lost character.
self.counts = ['多少', '', '几多']
# Superlative phrasings keyed by -1 / 1.
# NOTE(review): presumably a sort direction - confirm where it is consumed.
self.mosts = {
-1:['最大', '最远', '最长', '最高', '最久', '最快', '最多', '最强'],
1:['最小', '最短', '最近', '最低', '最矮', '最慢', '最少', '最弱'],
}
# unit text -> [multiplier, unit label]; standard_unit() multiplies the
# parsed number by element 0.
# NOTE(review): the repeated '' keys collapse to a single entry (the last
# one wins) and look like single-character unit names lost in extraction.
self.unit_dict = {
'海里': [1852, ''],
'英里': [1610, ''],
'/节': [1852, ''],
'km/节': [1000, ''],
'': [1000, '千克'],
'-吨': [1000, '千克'],
'公里': [1000, ''],
'公里/节': [1000, ''],
'公里/小时': [1000, ''],
'海里节': [1852, ''],
'海里,节': [1852, ''],
'海里/节': [1852, ''],
'海哩/节': [1852, ''],
'海浬/节': [1852, ''],
'毫米': [0.001, ''],
'': [1852, ''],
'节/海里': [1852, ''],
'节海里': [1852, ''],
'节行驶英里': [1852, ''],
'节下海里': [1852, ''],
'': [0.001, '千克'],
'': [1852, ''],
'里/节': [1852, ''],
'': [1, ''],
'千克': [1, ''],
'千米': [1000, ''],
'千米/节': [1000, ''],
'千米/时': [1000, ''],
'千米/小时': [1000, ''],
'千米每小时': [1000, ''],
'万海里/节': [18520000, ''],
'英里,节': [1610, ''],
'英里/节': [1610, ''],
'余英里': [1610, ''],
'约海里': [1852, ''],
'最大海里': [1852, ''],
'厘米': [0.01, ''],
'分米': [0.1, ''],
'': [1, ''],
'': [1, '']}
# Unit words are sorted longest first so that, in the regex alternation,
# a longer unit is tried before any shorter unit it contains.
unit_dict = {i:len(i) for i in self.unit_dict}
unit_wds = [i[0] for i in sorted(unit_dict.items(), key = lambda asd: asd[1], reverse=True)]
# NOTE(review): the unit words are joined without re.escape(), and the '.'
# in '.?' matches any character, not only a decimal point. The number group
# also needs at least two digits to match.
unit_regex = '([0-9]+.?[0-9]+)(%s)+' % '|'.join(unit_wds)
time_regex = '[0-9]{4}年[0-9]{0,4}月?[0-9]{0,4}日?'
self.unit_pattern = re.compile(unit_regex)
self.time_pattern = re.compile(time_regex)
# Invert every table: phrasing -> canonical key.
self.country_dict = self.build_dict(self.countries)
self.big_dict = self.build_dict(self.big_cates)
self.small_dict = self.build_dict(self.second_cates)
self.attribute_dict = self.build_dict(self.attributes)
self.compare_dict = self.build_dict(self.compares)
self.most_dict = self.build_dict(self.mosts)
# Register every phrasing with jieba under a custom tag for its table.
self.add_jieba(self.country_dict, 'n_country')
self.add_jieba(self.big_dict, 'n_big')
self.add_jieba(self.small_dict, 'n_small')
self.add_jieba(self.attribute_dict, 'n_attr')
self.add_jieba(self.compare_dict, 'n_compare')
self.add_jieba(self.most_dict, 'n_most')
self.add_jieba(self.weapons, 'n_weapon')
return
'''加载武器实体'''
def load_weapons(self):
    """Load the distinct weapon names from the JSON-lines data file.

    Reads ``self.datapath`` line by line, parses every non-blank line as a
    JSON object and collects the value stored under its ``'名称'`` key.

    Returns:
        list: The weapon names with duplicates removed, in arbitrary order.

    Raises:
        ValueError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``'名称'`` field.
    """
    names = set()
    # The context manager closes the handle even when a record fails to
    # parse.  The encoding is pinned so the Chinese keys decode the same way
    # on every platform instead of depending on the locale default.
    with open(self.datapath, encoding='utf-8') as data_file:
        for record in data_file:
            record = record.strip()
            if not record:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            names.add(json.loads(record)['名称'])
    return list(names)
'''构造映射字典'''
def build_dict(self, dict):
    """Invert a ``{category: [words]}`` mapping into ``{word: category}``.

    Args:
        dict: Mapping from a category to an iterable of words.  (The name
            shadows the builtin; it is kept for interface compatibility.)

    Returns:
        A new dictionary mapping every word to its category.  When a word
        is listed under several categories, the one iterated last wins.
    """
    reverse_index = {}
    for label, words in dict.items():
        reverse_index.update((word, label) for word in words)
    return reverse_index
'''检测单位'''
def detect_entity(self, question):
    """Extract time expressions and quantity-with-unit mentions.

    Args:
        question: The raw question text.

    Returns:
        tuple: ``(times, units)``.  ``times`` holds every match of
        ``self.time_pattern``.  ``units`` holds, for every match of
        ``self.unit_pattern``, its first two captured groups concatenated.
    """
    quantities = []
    for groups in self.unit_pattern.findall(question):
        if groups:
            quantities.append(groups[0] + groups[1])
    dates = self.time_pattern.findall(question)
    return dates, quantities
'''检查年份并统一时间'''
def standard_year(self, sent):
    """Normalise a date expression to a ``YYYYMMDD`` string.

    Spaces are removed first.  The first run of four digits is taken as the
    year, the first ``<digits>月`` as the month and the first ``<digits>日``
    as the day.  Month and day are passed through ``self.full_date``, which
    zero-pads them and substitutes ``'01'`` when they are missing.

    Args:
        sent: Text such as ``'2010年5月3日'``.

    Returns:
        str: The concatenated year, month and day, or ``''`` when no
        four-digit year is present.
    """
    sent = sent.replace(' ', '')
    year_match = re.search('[0-9]{4}', sent)
    if not year_match:
        return ''
    # Capture only the digits.  Keeping the trailing 月/日 marker would make
    # the int() conversion inside full_date raise ValueError.
    month_match = re.search('([0-9]{1,4})月', sent)
    day_match = re.search('([0-9]{1,4})日', sent)
    month = month_match.group(1) if month_match else ''
    day = day_match.group(1) if day_match else ''
    return year_match.group() + self.full_date(month) + self.full_date(day)
'''补全日期'''
def full_date(self, date):
    """Zero-pad a month or day string to two characters.

    Args:
        date: Digit string; a falsy value (e.g. ``''``) stands for
            "not given" and is replaced by ``'01'``.

    Returns:
        str: ``date`` prefixed with ``'0'`` when it is a single character
        whose integer value is below 10, otherwise unchanged.

    Raises:
        ValueError: If a non-empty ``date`` cannot be converted by ``int()``.
    """
    text = date if date else '01'
    if int(text) < 10 and len(text) < 2:
        return '0' + text
    return text
'检测是否有数字'
def check_num(self, sent):
    """Return the first number that appears in *sent*.

    Args:
        sent: Any value; it is converted with ``str()`` before searching.

    Returns:
        str: The first run of digits, including a decimal part when one
        follows directly (``'3.5公里'`` -> ``'3.5'``).

    Raises:
        IndexError: If *sent* contains no digit.
    """
    # Raw string: '\d' inside a normal literal is an invalid escape sequence.
    # The optional fraction keeps decimals intact, so a caller that strips
    # the number from a quantity is left with the bare unit.
    numbers = re.findall(r'\d+(?:\.\d+)?', str(sent))
    return numbers[0]
'''检查单位并统一数量'''
def standard_unit(self, unit_value):
    """Scale a quantity string by the factor registered for its unit.

    The number is obtained from ``self.check_num``.  What remains after
    removing that number from ``unit_value`` is looked up in
    ``self.unit_dict``, whose entries start with the multiplication factor.
    Units that are not registered use a factor of 1.

    Args:
        unit_value: Text such as ``'500公里'``.

    Returns:
        float: The number multiplied by the unit's factor.
    """
    digits = self.check_num(unit_value)
    suffix = unit_value.replace(digits, '')
    factor = self.unit_dict.get(suffix, [1, 'default'])[0]
    return float(digits) * factor
'''将实体标记和实体词加入到jieba当中'''
def add_jieba(self, wds, tag):
    """Register words with jieba under a custom part-of-speech tag.

    Args:
        wds: Iterable of words; passing a dictionary registers its keys.
        tag: Tag attached to every word.

    Every word is added with a frequency of 300000.
    """
    register = jieba.add_word
    for entry in wds:
        register(entry, freq=300000, tag=tag)
    return None
'''问句解析'''
def question_parser(self, question):
    """Segment a question and group its words by entity tag.

    Time and unit expressions found by ``detect_entity`` are registered with
    jieba (tags ``n_time`` / ``n_unit``) before the question is segmented
    with ``pseg.cut``.

    Args:
        question: The raw question text.

    Returns:
        dict: One list of words per entity slot (``'n_attrs'``,
        ``'n_times'``, ``'n_bigs'``, ``'n_smalls'``, ``'n_countries'``,
        ``'n_compares'``, ``'n_mosts'``, ``'n_units'``, ``'n_weapons'``),
        plus ``'pattern'`` (the entity tags in order of appearance) and
        ``'wds'`` (every ``(word, flag)`` pair).
    """
    times, units = self.detect_entity(question)
    self.add_jieba(times, 'n_time')
    self.add_jieba(units, 'n_unit')

    tagged = [(token.word, token.flag) for token in pseg.cut(question)]

    # (result key, jieba tag) in the order the keys are inserted.
    slots = (
        ('n_attrs', 'n_attr'),
        ('n_times', 'n_time'),
        ('n_bigs', 'n_big'),
        ('n_smalls', 'n_small'),
        ('n_countries', 'n_country'),
        ('n_compares', 'n_compare'),
        ('n_mosts', 'n_most'),
        ('n_units', 'n_unit'),
        ('n_weapons', 'n_weapon'),
    )
    parsed = {}
    for key, tag in slots:
        parsed[key] = [word for word, flag in tagged if flag == tag]

    entity_tags = [tag for _, tag in slots]
    parsed['pattern'] = [flag for _, flag in tagged if flag in entity_tags]
    parsed['wds'] = tagged
    return parsed
'''答案搜索'''
def search_answer(self, parser_dict):
    """Build the MongoDB queries implied by a parsed question and run them.

    The tag sequence in ``parser_dict['pattern']`` selects one of the
    supported question shapes (C = country, W = weapon, A = attribute):

    * country + fine or broad category  -> '名称' of matching records
    * ``W`` or ``C W``                    -> the weapon's '简介'
    * ``[C] W A{1,5}``                    -> those attributes of the weapon
    * ``W{2,5} A{1,6}``                   -> those attributes of every weapon
    * interleaved ``W A.. W A..`` shapes  -> each weapon's own attributes
    * category + attribute + one or two comparisons against a unit/time
    * category + attribute + superlative  -> sorted lookup (search_flag 0)

    Args:
        parser_dict: Result of ``question_parser``: the ``'pattern'`` list
            plus the per-slot word lists (``'n_weapons'``, ``'n_attrs'`` ...).

    Returns:
        Whatever ``self.query_mongo(search_flag, search_data)`` returns.
        For an unrecognised pattern ``search_data`` is empty.
    """
    # Debug output kept from the original implementation.
    print(parser_dict)
    pattern = parser_dict['pattern']
    print(pattern)

    def first(slot, lookup):
        """Map the first word collected for *slot* through *lookup*."""
        return lookup.get(parser_dict.get(slot)[0])

    def all_attrs():
        """Canonical names of every attribute mentioned in the question."""
        return [self.attribute_dict.get(attr) for attr in parser_dict.get('n_attrs')]

    def category_filter():
        """Filter on the fine ('类型') or broad ('大类') category asked for."""
        if 'n_small' in pattern:
            return {'类型': first('n_smalls', self.small_dict)}
        return {'大类': first('n_bigs', self.big_dict)}

    def compare_patterns(clause_count):
        """Shapes made of a category, an attribute and N comparison clauses."""
        clauses = (['n_compare', 'n_unit'],
                   ['n_compare', 'n_time'],
                   ['n_time', 'n_compare'])
        shapes = []
        for category in ('n_small', 'n_big'):
            for clause in clauses:
                core = ['n_attr'] + clause * clause_count
                shapes.append(core + [category])
                shapes.append([category] + core)
        return shapes

    # One weapon (optionally preceded by its country) and 1-5 attributes.
    single_entity = [
        prefix + ['n_weapon'] + ['n_attr'] * attr_count
        for prefix in ([], ['n_country'])
        for attr_count in range(1, 6)
    ]
    # 2-5 weapons followed by 1-6 shared attributes.  The full grid is
    # generated so that combinations such as 4 weapons + 4 attributes are
    # not silently left out.
    multi_entity = [
        ['n_weapon'] * weapon_count + ['n_attr'] * attr_count
        for weapon_count in range(2, 6)
        for attr_count in range(1, 7)
    ]
    # Two weapons, each followed by its own attributes.
    tag_of = {'C': 'n_country', 'W': 'n_weapon', 'A': 'n_attr'}
    interleaved = [
        [tag_of[letter] for letter in shape]
        for shape in ('WAWA', 'CWAWA', 'CWACWA', 'WAAWA', 'CWAAWA',
                      'CWACWAA', 'CWAAWAA', 'WAAACWAA', 'CWAAACWAA',
                      'CWAAWAAA', 'WAAACWAAA', 'CWAAACWAAA')
    ]
    superlative = [['n_small', 'n_attr', 'n_most'],
                   ['n_attr', 'n_most', 'n_small'],
                   ['n_big', 'n_attr', 'n_most'],
                   ['n_attr', 'n_most', 'n_big']]

    search_data = []
    search_flag = 1  # truthy: attribute lookup; 0: sorted lookup

    if pattern in [['n_country', 'n_small'], ['n_small', 'n_country']]:
        condition = {'产国': first('n_countries', self.country_dict),
                     '类型': first('n_smalls', self.small_dict)}
        search_data.append({'condition': condition, 'targets': ['名称']})

    elif pattern in [['n_country', 'n_big'], ['n_big', 'n_country']]:
        # Broad categories are filtered on '大类', as in the comparison and
        # superlative branches; '类型' is used for the fine-grained types.
        condition = {'产国': first('n_countries', self.country_dict),
                     '大类': first('n_bigs', self.big_dict)}
        search_data.append({'condition': condition, 'targets': ['名称']})

    elif pattern in [['n_country', 'n_weapon'], ['n_weapon']]:
        condition = {'名称': first('n_weapons', self.weapon_dict)}
        search_data.append({'condition': condition, 'targets': ['简介']})

    elif pattern in single_entity:
        condition = {'名称': first('n_weapons', self.weapon_dict)}
        search_data.append({'condition': condition, 'targets': all_attrs()})

    elif pattern in multi_entity:
        weapons = [self.weapon_dict.get(word) for word in parser_dict.get('n_weapons')]
        condition = {'名称': {"$in": weapons}}
        search_data.append({'condition': condition, 'targets': all_attrs()})

    elif pattern in interleaved:
        weapon_positions = [position for position, tag in enumerate(pattern)
                            if tag == 'n_weapon']
        weapons = [self.weapon_dict.get(word) for word in parser_dict.get('n_weapons')]
        targets = all_attrs()
        # Attributes mentioned between the two weapons belong to the first
        # weapon, the remaining ones to the second.
        split = pattern[weapon_positions[0]:weapon_positions[1]].count('n_attr')
        search_data.append({'condition': {'名称': weapons[0]}, 'targets': targets[:split]})
        search_data.append({'condition': {'名称': weapons[1]}, 'targets': targets[split:]})

    elif pattern in compare_patterns(1) + compare_patterns(2):
        clause_count = pattern.count('n_compare')
        n_attr = first('n_attrs', self.attribute_dict)
        operators = [self.compare_dict.get(word)
                     for word in parser_dict.get('n_compares')[:clause_count]]
        if 'n_unit' in pattern:
            operands = [self.standard_unit(value)
                        for value in parser_dict.get('n_units')[:clause_count]]
        else:
            operands = [self.standard_year(value)
                        for value in parser_dict.get('n_times')[:clause_count]]
        # The i-th operator is paired with the i-th operand.
        condition = {n_attr: dict(zip(operators, operands))}
        condition.update(category_filter())
        search_data.append({'condition': condition, 'targets': [n_attr]})

    elif pattern in superlative:
        search_flag = 0
        n_attr = first('n_attrs', self.attribute_dict)
        n_most = first('n_mosts', self.most_dict)
        condition = category_filter()
        # 'sort_key' is not a filter; query_mongo_sort separates it out.
        condition['sort_key'] = {n_attr: n_most}
        search_data.append({'condition': condition, 'targets': ['名称', n_attr]})

    return self.query_mongo(search_flag, search_data)
'''查询mongo数据库'''
def query_mongo(self, search_flag, search_data):
    """Dispatch the prepared queries to the matching lookup method.

    Args:
        search_flag: Truthy selects ``query_mongo_attr``; falsy selects
            ``query_mongo_sort``.
        search_data: Query descriptions, forwarded unchanged.

    Returns:
        The result of the selected method.
    """
    runner = self.query_mongo_attr if search_flag else self.query_mongo_sort
    return runner(search_data)
'''查询mongo数据库正常'''
def query_mongo_attr(self, search_data):
    """Run plain attribute lookups against ``self.col``.

    Args:
        search_data: List of ``{'condition': ..., 'targets': [...]}``
            dictionaries.

    Returns:
        list: One list per matched record, holding a
        ``'<名称><target>:<value>'`` string for every target the record
        has.  Targets that are missing (or whose value is the string
        ``'null'``) are skipped, so a record can contribute an empty list.
    """
    rows = []
    for query in search_data:
        criteria = query['condition']
        wanted = query['targets']
        for doc in self.col.find(criteria):
            answers = []
            for field in wanted:
                if doc.get(field, 'null') != 'null':
                    answers.append(doc.get('名称') + field + ':' + str(doc.get(field, 'null')))
            rows.append(answers)
    return rows
'''按照最值方法查找mongo数据库'''
def query_mongo_sort(self, search_data):
    """Run "extreme value" lookups: filter, sort, keep the first record.

    Args:
        search_data: List of ``{'condition': ..., 'targets': [...]}``
            dictionaries.  Each condition carries a ``'sort_key'`` entry
            mapping a field to its sort direction; every other entry is
            used as the filter.

    Returns:
        list: One list per returned record with a
        ``'<名称><target>:<value>'`` string for every target
        (``'null'`` when the record lacks the field).
    """
    ranked = []
    for query in search_data:
        full_condition = query['condition']
        filters = {}
        for key, wanted in full_condition.items():
            if key != 'sort_key':
                filters[key] = wanted
        ordering = list(full_condition.get('sort_key').items())
        fields = query['targets']
        cursor = self.col.find(filters).sort(ordering).limit(1)
        for doc in cursor:
            line = []
            for name in fields:
                line.append(doc.get('名称') + name + ':' + str(doc.get(name, 'null')))
            ranked.append(line)
    return ranked
'问答主函数'
def qa_main(self, question):
    """Answer one question and print the outcome to stdout.

    The question is parsed with ``question_parser``, the queries are run by
    ``search_answer`` and every non-empty answer is printed.  When there is
    no usable answer at all, an apology is printed instead.

    Args:
        question: The raw question text.

    Returns:
        None.
    """
    parser_dict = self.question_parser(question)
    results = self.search_answer(parser_dict)
    # A record that has none of the requested attributes is returned as an
    # empty list.  Dropping those makes "no answer" hold for [], [[]] and
    # [[], []] alike, instead of only for exactly [[]].
    answers = [result for result in results if result]
    if not answers:
        print('sorry, do not know the answer yet...')
        return
    print('find %s result:' % len(answers))
    print('answer detail:')
    for answer in answers:
        print(answer)
    return
if __name__ == '__main__':
    # Interactive loop: read a question, print its answer, repeat.
    handler = MilitaryGraph()
    while True:
        try:
            question = input("enter an question to parser:\n")
        except (EOFError, KeyboardInterrupt):
            # End of piped input or Ctrl-D / Ctrl-C: leave the loop cleanly
            # instead of terminating with a traceback.
            break
        handler.qa_main(question)