创建法务智能项目

This commit is contained in:
liu huanyong 2018-11-11 15:49:36 +08:00
commit 90b2429647
18 changed files with 2355 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
embedding/*.bin

View File

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.6.3 (~/anaconda3/envs/py3/bin/python)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="Unittests" />
</component>
</module>

View File

@ -0,0 +1,3 @@
<component name="MarkdownNavigator.ProfileManager">
<settings default="" pdf-export="" />
</component>

83
.idea/misc.xml Normal file
View File

@ -0,0 +1,83 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="MarkdownProjectSettings">
<PreviewSettings splitEditorLayout="SPLIT" splitEditorPreview="PREVIEW" useGrayscaleRendering="false" zoomFactor="1.0" maxImageWidth="0" showGitHubPageIfSynced="false" allowBrowsingInPreview="false" synchronizePreviewPosition="true" highlightPreviewType="NONE" highlightFadeOut="5" highlightOnTyping="true" synchronizeSourcePosition="true" verticallyAlignSourceAndPreviewSyncPosition="true" showSearchHighlightsInPreview="false" showSelectionInPreview="true" openRemoteLinks="true">
<PanelProvider>
<provider providerId="com.vladsch.idea.multimarkdown.editor.swing.html.panel" providerName="Default - Swing" />
</PanelProvider>
</PreviewSettings>
<ParserSettings gitHubSyntaxChange="false">
<PegdownExtensions>
<option name="ABBREVIATIONS" value="false" />
<option name="ANCHORLINKS" value="true" />
<option name="ASIDE" value="false" />
<option name="ATXHEADERSPACE" value="true" />
<option name="AUTOLINKS" value="true" />
<option name="DEFINITIONS" value="false" />
<option name="DEFINITION_BREAK_DOUBLE_BLANK_LINE" value="false" />
<option name="FENCED_CODE_BLOCKS" value="true" />
<option name="FOOTNOTES" value="false" />
<option name="HARDWRAPS" value="false" />
<option name="HTML_DEEP_PARSER" value="false" />
<option name="INSERTED" value="false" />
<option name="QUOTES" value="false" />
<option name="RELAXEDHRULES" value="true" />
<option name="SMARTS" value="false" />
<option name="STRIKETHROUGH" value="true" />
<option name="SUBSCRIPT" value="false" />
<option name="SUPERSCRIPT" value="false" />
<option name="SUPPRESS_HTML_BLOCKS" value="false" />
<option name="SUPPRESS_INLINE_HTML" value="false" />
<option name="TABLES" value="true" />
<option name="TASKLISTITEMS" value="true" />
<option name="TOC" value="false" />
<option name="WIKILINKS" value="true" />
</PegdownExtensions>
<ParserOptions>
<option name="COMMONMARK_LISTS" value="true" />
<option name="DUMMY" value="false" />
<option name="EMOJI_SHORTCUTS" value="true" />
<option name="FLEXMARK_FRONT_MATTER" value="false" />
<option name="GFM_LOOSE_BLANK_LINE_AFTER_ITEM_PARA" value="false" />
<option name="GFM_TABLE_RENDERING" value="true" />
<option name="GITBOOK_URL_ENCODING" value="false" />
<option name="GITHUB_EMOJI_URL" value="false" />
<option name="GITHUB_LISTS" value="false" />
<option name="GITHUB_WIKI_LINKS" value="true" />
<option name="JEKYLL_FRONT_MATTER" value="false" />
<option name="SIM_TOC_BLANK_LINE_SPACER" value="true" />
</ParserOptions>
</ParserSettings>
<HtmlSettings headerTopEnabled="false" headerBottomEnabled="false" bodyTopEnabled="false" bodyBottomEnabled="false" embedUrlContent="false" addPageHeader="true" embedImages="false" embedHttpImages="false">
<GeneratorProvider>
<provider providerId="com.vladsch.idea.multimarkdown.editor.swing.html.generator" providerName="Default Swing HTML Generator" />
</GeneratorProvider>
<headerTop />
<headerBottom />
<bodyTop />
<bodyBottom />
</HtmlSettings>
<CssSettings previewScheme="UI_SCHEME" cssUri="" isCssUriEnabled="false" isCssTextEnabled="false" isDynamicPageWidth="true">
<StylesheetProvider>
<provider providerId="com.vladsch.idea.multimarkdown.editor.swing.html.css" providerName="Default Swing Stylesheet" />
</StylesheetProvider>
<ScriptProviders />
<cssText />
</CssSettings>
<HtmlExportSettings updateOnSave="false" parentDir="$ProjectFileDir$" targetDir="$ProjectFileDir$" cssDir="" scriptDir="" plainHtml="false" imageDir="" copyLinkedImages="false" imageUniquifyType="0" targetExt="" useTargetExt="false" noCssNoScripts="false" linkToExportedHtml="true" exportOnSettingsChange="true" regenerateOnProjectOpen="false" linkFormatType="HTTP_ABSOLUTE" />
<LinkMapSettings>
<textMaps />
</LinkMapSettings>
</component>
<component name="ProjectLevelVcsManager" settingsEditedManually="false">
<OptionsSetting value="true" id="Add" />
<OptionsSetting value="true" id="Remove" />
<OptionsSetting value="true" id="Checkout" />
<OptionsSetting value="true" id="Update" />
<OptionsSetting value="true" id="Status" />
<OptionsSetting value="true" id="Edit" />
<ConfirmationsSetting value="0" id="Add" />
<ConfirmationsSetting value="0" id="Remove" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6.3 (~/anaconda3/envs/py3/bin/python)" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/CrimeKgAssistant.iml" filepath="$PROJECT_DIR$/.idea/CrimeKgAssistant.iml" />
</modules>
</component>
</project>

421
.idea/workspace.xml Normal file
View File

@ -0,0 +1,421 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ChangeListManager">
<list default="true" id="8b1874b6-a1e9-45fd-a2c5-b0b8b2b7649b" name="Default" comment="" />
<ignored path="CrimeKgAssistant.iws" />
<ignored path=".idea/workspace.xml" />
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="TRACKING_ENABLED" value="true" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="CreatePatchCommitExecutor">
<option name="PATCH_PATH" value="" />
</component>
<component name="ExecutionTargetManager" SELECTED_TARGET="default_target" />
<component name="FavoritesManager">
<favorites_list name="CrimeKgAssistant" />
</component>
<component name="FileEditorManager">
<leaf />
</component>
<component name="FileTemplateManagerImpl">
<option name="RECENT_TEMPLATES">
<list>
<option value="Python Script" />
</list>
</option>
</component>
<component name="IdeDocumentHistory">
<option name="CHANGED_PATHS">
<list>
<option value="$PROJECT_DIR$/question_classify_train.py" />
<option value="$PROJECT_DIR$/question_classify.py" />
<option value="$PROJECT_DIR$/crime_classify.py" />
<option value="$PROJECT_DIR$/crime_qa_server.py" />
</list>
</option>
</component>
<component name="ProjectFrameBounds">
<option name="x" value="631" />
<option name="y" value="106" />
<option name="width" value="1261" />
<option name="height" value="1001" />
</component>
<component name="ProjectLevelVcsManager" settingsEditedManually="false">
<OptionsSetting value="true" id="Add" />
<OptionsSetting value="true" id="Remove" />
<OptionsSetting value="true" id="Checkout" />
<OptionsSetting value="true" id="Update" />
<OptionsSetting value="true" id="Status" />
<OptionsSetting value="true" id="Edit" />
<ConfirmationsSetting value="0" id="Add" />
<ConfirmationsSetting value="0" id="Remove" />
</component>
<component name="ProjectView">
<navigator currentView="ProjectPane" proportions="" version="1">
<flattenPackages />
<showMembers />
<showModules />
<showLibraryContents />
<hideEmptyPackages />
<abbreviatePackageNames />
<autoscrollToSource />
<autoscrollFromSource />
<sortByType />
<manualOrder />
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="ProjectPane">
<subPane>
<PATH>
<PATH_ELEMENT>
<option name="myItemId" value="CrimeKgAssistant" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
</PATH_ELEMENT>
</PATH>
<PATH>
<PATH_ELEMENT>
<option name="myItemId" value="CrimeKgAssistant" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="CrimeKgAssistant" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
</PATH>
</subPane>
</pane>
<pane id="Scope" />
<pane id="Scratches" />
</panes>
</component>
<component name="PropertiesComponent">
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
<property name="settings.editor.splitter.proportion" value="0.2" />
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
</component>
<component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS">
<recent name="$PROJECT_DIR$" />
<recent name="$PROJECT_DIR$/data" />
</key>
<key name="MoveFile.RECENT_KEYS">
<recent name="$PROJECT_DIR$/dict" />
<recent name="$PROJECT_DIR$/embedding" />
</key>
</component>
<component name="RunManager" selected="Python.crime_classify">
<configuration default="false" name="question_classify" type="PythonConfigurationType" factoryName="Python" temporary="true">
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<module name="CrimeKgAssistant" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/question_classify.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<method />
</configuration>
<configuration default="false" name="crime_classify" type="PythonConfigurationType" factoryName="Python" temporary="true">
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<module name="CrimeKgAssistant" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/crime_classify.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<method />
</configuration>
<configuration default="true" type="BashConfigurationType" factoryName="Bash">
<option name="INTERPRETER_OPTIONS" value="" />
<option name="INTERPRETER_PATH" value="/bin/bash" />
<option name="WORKING_DIRECTORY" value="" />
<option name="PARENT_ENVS" value="true" />
<option name="SCRIPT_NAME" value="" />
<option name="PARAMETERS" value="" />
<module name="" />
<envs />
<method />
</configuration>
<configuration default="true" type="PythonConfigurationType" factoryName="Python">
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="" />
<option name="IS_MODULE_SDK" value="false" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<module name="CrimeKgAssistant" />
<option name="SCRIPT_NAME" value="" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<method />
</configuration>
<configuration default="true" type="Tox" factoryName="Tox">
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs />
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="" />
<option name="IS_MODULE_SDK" value="false" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<module name="CrimeKgAssistant" />
<method />
</configuration>
<configuration default="true" type="tests" factoryName="Attests">
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs />
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="" />
<option name="IS_MODULE_SDK" value="false" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<module name="CrimeKgAssistant" />
<option name="SCRIPT_NAME" value="" />
<option name="CLASS_NAME" value="" />
<option name="METHOD_NAME" value="" />
<option name="FOLDER_NAME" value="" />
<option name="TEST_TYPE" value="TEST_SCRIPT" />
<option name="PATTERN" value="" />
<option name="USE_PATTERN" value="false" />
<method />
</configuration>
<configuration default="true" type="tests" factoryName="Doctests">
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs />
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="" />
<option name="IS_MODULE_SDK" value="false" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<module name="CrimeKgAssistant" />
<option name="SCRIPT_NAME" value="" />
<option name="CLASS_NAME" value="" />
<option name="METHOD_NAME" value="" />
<option name="FOLDER_NAME" value="" />
<option name="TEST_TYPE" value="TEST_SCRIPT" />
<option name="PATTERN" value="" />
<option name="USE_PATTERN" value="false" />
<method />
</configuration>
<configuration default="true" type="tests" factoryName="Nosetests">
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs />
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="" />
<option name="IS_MODULE_SDK" value="false" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<module name="CrimeKgAssistant" />
<option name="SCRIPT_NAME" value="" />
<option name="CLASS_NAME" value="" />
<option name="METHOD_NAME" value="" />
<option name="FOLDER_NAME" value="" />
<option name="TEST_TYPE" value="TEST_SCRIPT" />
<option name="PATTERN" value="" />
<option name="USE_PATTERN" value="false" />
<option name="PARAMS" value="" />
<option name="USE_PARAM" value="false" />
<method />
</configuration>
<configuration default="true" type="tests" factoryName="Unittests">
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs />
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="" />
<option name="IS_MODULE_SDK" value="false" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<module name="CrimeKgAssistant" />
<option name="SCRIPT_NAME" value="" />
<option name="CLASS_NAME" value="" />
<option name="METHOD_NAME" value="" />
<option name="FOLDER_NAME" value="" />
<option name="TEST_TYPE" value="TEST_SCRIPT" />
<option name="PATTERN" value="" />
<option name="USE_PATTERN" value="false" />
<option name="PUREUNITTEST" value="true" />
<option name="PARAMS" value="" />
<option name="USE_PARAM" value="false" />
<method />
</configuration>
<configuration default="true" type="tests" factoryName="py.test">
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs />
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="" />
<option name="IS_MODULE_SDK" value="false" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<module name="CrimeKgAssistant" />
<option name="SCRIPT_NAME" value="" />
<option name="CLASS_NAME" value="" />
<option name="METHOD_NAME" value="" />
<option name="FOLDER_NAME" value="" />
<option name="TEST_TYPE" value="TEST_SCRIPT" />
<option name="PATTERN" value="" />
<option name="USE_PATTERN" value="false" />
<option name="testToRun" value="" />
<option name="keywords" value="" />
<option name="params" value="" />
<option name="USE_PARAM" value="false" />
<option name="USE_KEYWORD" value="false" />
<method />
</configuration>
<list size="2">
<item index="0" class="java.lang.String" itemvalue="Python.question_classify" />
<item index="1" class="java.lang.String" itemvalue="Python.crime_classify" />
</list>
<recent_temporary>
<list size="2">
<item index="0" class="java.lang.String" itemvalue="Python.crime_classify" />
<item index="1" class="java.lang.String" itemvalue="Python.question_classify" />
</list>
</recent_temporary>
</component>
<component name="ShelveChangesManager" show_recycled="false">
<option name="remove_strategy" value="false" />
</component>
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="8b1874b6-a1e9-45fd-a2c5-b0b8b2b7649b" name="Default" comment="" />
<created>1541920128942</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1541920128942</updated>
</task>
<servers />
</component>
<component name="ToolWindowManager">
<frame x="631" y="106" width="1261" height="1001" extended-state="0" />
<editor active="true" />
<layout>
<window_info id="Project" active="true" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.24959612" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
<window_info id="TODO" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="6" side_tool="false" content_ui="tabs" />
<window_info id="Event Log" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="true" content_ui="tabs" />
<window_info id="Version Control" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
<window_info id="Python Console" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
<window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.6682832" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
<window_info id="Structure" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
<window_info id="Terminal" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
<window_info id="Favorites" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="true" content_ui="tabs" />
<window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<window_info id="Cvs" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="4" side_tool="false" content_ui="tabs" />
<window_info id="Hierarchy" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="2" side_tool="false" content_ui="combo" />
<window_info id="Message" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
<window_info id="Commander" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
<window_info id="Find" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
<window_info id="Inspection" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="5" side_tool="false" content_ui="tabs" />
<window_info id="Ant Build" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
</layout>
</component>
<component name="Vcs.Log.UiProperties">
<option name="RECENTLY_FILTERED_USER_GROUPS">
<collection />
</option>
<option name="RECENTLY_FILTERED_BRANCH_GROUPS">
<collection />
</option>
</component>
<component name="VcsContentAnnotationSettings">
<option name="myLimit" value="2678400000" />
</component>
<component name="XDebuggerManager">
<breakpoint-manager />
<watches-manager />
</component>
<component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/question_classify_train.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="2241">
<caret line="160" column="31" selection-start-line="160" selection-start-column="31" selection-end-line="160" selection-end-column="31" />
<folding>
<element signature="e#149#158#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/question_classify.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="484">
<caret line="40" column="15" selection-start-line="40" selection-start-column="15" selection-end-line="40" selection-end-column="15" />
<folding>
<element signature="e#150#159#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/dict/crime.txt">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crime_classify_train.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="484">
<caret line="22" column="33" selection-start-line="0" selection-start-column="0" selection-end-line="184" selection-end-column="0" />
<folding>
<element signature="e#146#155#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crime_classify.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="275">
<caret line="72" column="13" selection-start-line="72" selection-start-column="13" selection-end-line="72" selection-end-column="13" />
<folding>
<element signature="e#150#159#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/build_qa_database.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/crime_qa.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="352">
<caret line="16" column="15" selection-start-line="16" selection-start-column="15" selection-end-line="16" selection-end-column="15" />
<folding>
<element signature="e#147#156#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
</component>
</project>

98
build_qa_database.py Normal file
View File

@ -0,0 +1,98 @@
#!/usr/bin/env python3
# coding: utf-8
# File: build_qa_database.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-10-10
import os
import time
import json
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import pymongo
class ProcessIntoES:
    """Create the ``crime_data`` Elasticsearch index and bulk-load documents into it.

    Attributes:
        _index: Name of the Elasticsearch index that is created and written to.
        es: Elasticsearch client connected to 127.0.0.1:9200.
        doc_type: Document type used in the mapping and in every bulk action.
        music_file: Path of ``qa_corpus.json`` located next to this source
            file (the attribute keeps its historical name for callers).
    """

    def __init__(self):
        self._index = "crime_data"
        self.es = Elasticsearch([{"host": "127.0.0.1", "port": 9200}])
        self.doc_type = "crime"
        # os.path.dirname is portable, whereas splitting the path on '/'
        # does not work with Windows path separators.
        cur = os.path.dirname(os.path.abspath(__file__))
        self.music_file = os.path.join(cur, 'qa_corpus.json')

    def create_mapping(self):
        """Create the index with its field mapping unless it already exists.

        Both ``question`` and ``answers`` are declared as ``text`` fields
        analyzed with ``ik_max_word`` at index time and ``ik_smart`` at
        search time. A message describing the outcome is printed.
        """
        node_mappings = {
            "mappings": {
                self.doc_type: {  # type
                    "properties": {
                        "question": {  # field: question
                            "type": "text",  # lxw NOTE: cannot be string
                            "analyzer": "ik_max_word",
                            "search_analyzer": "ik_smart",
                            # The index option controls whether field values are indexed.
                            "index": "true",
                        },
                        "answers": {  # field: answers
                            "type": "text",  # lxw NOTE: cannot be string
                            "analyzer": "ik_max_word",
                            "search_analyzer": "ik_smart",
                            # The index option controls whether field values are indexed.
                            "index": "true",
                        },
                    }
                }
            }
        }
        if self.es.indices.exists(index=self._index):
            print("index({}) already exists.".format(self._index))
            return
        self.es.indices.create(index=self._index, body=node_mappings)
        print("Create {} mapping successfully.".format(self._index))

    def insert_data_bulk(self, action_list):
        """Send ``action_list`` to Elasticsearch in one bulk request.

        Args:
            action_list: Iterable of bulk action dicts.

        ``raise_on_error=True`` is passed to the bulk helper, so failures
        surface as exceptions from that helper instead of being ignored.
        """
        success, _ = bulk(self.es, action_list, index=self._index, raise_on_error=True)
        print("Performed {0} actions. _: {1}".format(success, _))
def init_ES(pie=None, bulk_count=1000):
    """Create the index and bulk-insert every record of the QA corpus.

    Each non-blank line of ``pie.music_file`` is parsed as a JSON object that
    must provide a ``question`` value and an ``answers`` list of strings; the
    answers are joined with newlines into a single text field. Documents are
    sent in batches of ``bulk_count``, and the final partial batch is sent as
    well so that no trailing records are lost.

    Args:
        pie: Object exposing ``_index``, ``doc_type``, ``music_file``,
            ``create_mapping()`` and ``insert_data_bulk(action_list=...)``.
            Defaults to a freshly constructed ``ProcessIntoES``.
        bulk_count: Number of documents per bulk request (at least 1).

    Raises:
        ValueError: If ``bulk_count`` is smaller than 1.
    """
    if bulk_count < 1:
        raise ValueError("bulk_count must be at least 1")
    if pie is None:
        pie = ProcessIntoES()
    # Create the ES index before inserting anything.
    pie.create_mapping()
    start_time = time.time()
    count = 0  # number of bulk requests sent so far
    action_list = []
    with open(pie.music_file, encoding='utf-8') as corpus:
        for line in corpus:
            # Lines keep their trailing newline, so strip before testing for
            # blank lines; otherwise json.loads fails on an empty line.
            line = line.strip()
            if not line:
                continue
            item = json.loads(line)
            action_list.append({
                "_index": pie._index,
                "_type": pie.doc_type,
                "_source": {
                    "question": item['question'],
                    "answers": '\n'.join(item['answers']),
                },
            })
            if len(action_list) >= bulk_count:
                pie.insert_data_bulk(action_list=action_list)
                count += 1
                print(count)
                action_list = []
    # Flush whatever is left over after the last full batch.
    if action_list:
        pie.insert_data_bulk(action_list=action_list)
        count += 1
        print(count)
    end_time = time.time()
    print("Time Cost:{0}".format(end_time - start_time))
if __name__ == "__main__":
    # Load the corpus into Elasticsearch (currently disabled).
    # init_ES()
    # Sample question for a query by title; it is assigned here but not used
    # by anything in this script.
    question = '我老公要起诉离婚 我不想离婚怎么办'

99
crime_classify.py Normal file
View File

@ -0,0 +1,99 @@
#!/usr/bin/env python3
# coding: utf-8
# File: crime_classify.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-11-11
import os
import numpy as np
import jieba.posseg as pseg
from sklearn.externals import joblib
class CrimeClassify(object):
    """Predict a crime label for a sentence from averaged word vectors.

    A sentence is represented as the mean of the vectors of its known words,
    and that vector is fed to a classifier loaded from ``model_path`` (the
    original comments describe it as an SVM model).

    Attributes:
        label_dict: Maps crime name to integer label id.
        id_dict: Inverse of ``label_dict``.
        embedding_path: Path of the text-format word vector file.
        embdding_dict: Maps word to its numpy vector (name kept as-is for
            compatibility with existing callers).
        model_path: Path of the serialized classifier, relative to the
            current working directory.
    """

    # Dimensionality of the word vectors. A valid embedding line consists of
    # the word followed by exactly this many numbers.
    embedding_size = 300

    # (model_path, model) of the most recently loaded classifier, or None.
    _model_cache = None

    def __init__(self):
        # os.path.dirname is portable; splitting the path on '/' is not.
        cur = os.path.dirname(os.path.abspath(__file__))
        crime_file = os.path.join(cur, 'dict/crime.txt')
        self.label_dict = self.build_crime_dict(crime_file)
        self.id_dict = {j: i for i, j in self.label_dict.items()}
        self.embedding_path = os.path.join(cur, 'embedding/word_vec_300.bin')
        self.embdding_dict = self.load_embedding(self.embedding_path)
        # NOTE(review): unlike the paths above, this one is resolved against
        # the working directory rather than this file's directory.
        self.model_path = 'model/crime_predict.model'

    def build_crime_dict(self, crimefile):
        """Build the crime-name -> label-id mapping.

        Args:
            crimefile: Text file with one crime name per line.

        Returns:
            dict mapping each stripped, non-blank line to its 0-based
            position among the non-blank lines.
        """
        label_dict = {}
        i = 0
        with open(crimefile, encoding='utf-8') as handle:
            for line in handle:
                crime = line.strip()
                if not crime:
                    continue
                label_dict[crime] = i
                i += 1
        return label_dict

    def load_embedding(self, embedding_path):
        """Load space-separated word vectors from a text file.

        Lines whose token count is not ``embedding_size + 1`` (for example a
        header line or a truncated vector) are skipped, so every returned
        vector has exactly ``embedding_size`` components.

        Args:
            embedding_path: Path of the word vector file.

        Returns:
            dict mapping word to a 1-D numpy float array.
        """
        expected_tokens = self.embedding_size + 1
        embedding_dict = {}
        count = 0
        with open(embedding_path, encoding='utf-8') as handle:
            for line in handle:
                parts = line.strip().split(' ')
                if len(parts) != expected_tokens:
                    continue
                embedding_dict[parts[0]] = np.array([float(v) for v in parts[1:]])
                count += 1
                if count % 10000 == 0:
                    print(count, 'loaded')
        print('loaded %s word embedding, finished' % count, )
        return embedding_dict

    def seg_sent(self, s):
        """Segment ``s`` with jieba and drop words by part-of-speech tag.

        Words whose tag starts with one of 'x', 'u', 'c', 'p', 'm', 't' are
        removed (presumably punctuation and function-word classes in jieba's
        tag set -- TODO confirm).
        """
        skipped_tags = ('x', 'u', 'c', 'p', 'm', 't')
        return [tok.word for tok in pseg.cut(s) if tok.flag[0] not in skipped_tags]

    def rep_sentencevector(self, sentence, flag='seg'):
        """Return the mean word vector of ``sentence`` via table lookup.

        Args:
            sentence: Input text.
            flag: ``'seg'`` if ``sentence`` is already segmented with single
                spaces; any other value segments it with ``seg_sent``.

        Returns:
            1-D numpy array of length ``embedding_size``. When no word of the
            sentence is in the vocabulary the zero vector is returned, which
            avoids the NaN vector a 0/0 division would produce.
        """
        if flag == 'seg':
            word_list = [w for w in sentence.split(' ') if w]
        else:
            word_list = self.seg_sent(sentence)
        embedding = np.zeros(self.embedding_size)
        sent_len = 0
        for wd in word_list:
            vector = self.embdding_dict.get(wd)
            if vector is None:
                continue
            embedding += vector
            sent_len += 1
        if sent_len == 0:
            return embedding
        return embedding / sent_len

    def label_onehot(self, label):
        """Return a one-hot list of length ``len(label_dict)`` for ``label``."""
        one_hot = [0] * len(self.label_dict)
        one_hot[int(label)] = 1
        return one_hot

    def _load_model(self):
        """Return the classifier stored at ``model_path``, loading it at most once per path."""
        cached = self._model_cache
        if cached is None or cached[0] != self.model_path:
            cached = (self.model_path, joblib.load(self.model_path))
            self._model_cache = cached
        return cached[1]

    def predict(self, sent):
        """Predict the crime name for the raw sentence ``sent``.

        Returns:
            The crime name for the predicted label id, or ``None`` when the
            id is not present in ``id_dict``.
        """
        model = self._load_model()
        represent_sent = self.rep_sentencevector(sent, flag='noseg')
        # The model expects a 2-D array: one row per sample.
        text_vector = np.array(represent_sent).reshape(1, -1)
        res = model.predict(text_vector)[0]
        return self.id_dict.get(res)
def test():
    """Interactive loop: read a sentence from stdin and print its predicted label.

    The loop has no exit condition; it ends only when an exception
    (for example from ``input``) propagates.
    """
    classifier = CrimeClassify()
    while True:
        query = input('enter an sent to search:')
        print(classifier.predict(query))
# Start the interactive prediction loop when run as a script.
if __name__ == '__main__':
    test()

184
crime_classify_train.py Normal file
View File

@ -0,0 +1,184 @@
#!/usr/bin/env python3
# coding: utf-8
# File: crime_classify_train.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-11-10
import os
import numpy as np
from sklearn.svm import SVC, LinearSVC
import jieba.posseg as pseg
from collections import Counter
from sklearn.externals import joblib
class CrimeClassify(object):
def __init__(self):
cur = '/'.join(os.path.abspath(__file__).split('/')[:-1])
crime_file = os.path.join(cur, 'crime.txt')
self.label_dict = self.build_crime_dict(crime_file)
self.id_dict = {j:i for i,j in self.label_dict.items()}
self.train_file = os.path.join(cur, 'crime_train_all.txt')
self.embedding_path = os.path.join(cur, 'embedding/word_vec_300.bin')
self.embdding_dict = self.load_embedding(self.embedding_path)
self.embedding_size = 300
self.model_path = 'crime_predict_svm_all.model'
return
'''构建罪名词类型'''
def build_crime_dict(self, crimefile):
label_dict = {}
i = 0
for line in open(crimefile):
crime = line.strip()
if not crime:
continue
label_dict[crime] = i
i +=1
return label_dict
'''加载词向量'''
def load_embedding(self, embedding_path):
embedding_dict = {}
count = 0
for line in open(embedding_path):
line = line.strip().split(' ')
if len(line) < 300:
continue
wd = line[0]
vector = np.array([float(i) for i in line[1:]])
embedding_dict[wd] = vector
count += 1
if count%10000 == 0:
print(count, 'loaded')
print('loaded %s word embedding, finished'%count, )
return embedding_dict
'''对文本进行分词处理'''
def seg_sent(self, s):
wds = [i.word for i in pseg.cut(s) if i.flag[0] not in ['x', 'u', 'c', 'p', 'm', 't']]
return wds
'''基于wordvector通过lookup table的方式找到句子的wordvector的表示'''
def rep_sentencevector(self, sentence, flag='seg'):
if flag == 'seg':
word_list = [i for i in sentence.split(' ') if i]
else:
word_list = self.seg_sent(sentence)
embedding = np.zeros(self.embedding_size)
sent_len = 0
for index, wd in enumerate(word_list):
if wd in self.embdding_dict:
embedding += self.embdding_dict.get(wd)
sent_len += 1
else:
continue
return embedding/sent_len
'''对数据进行onehot映射操作'''
def label_onehot(self, label):
one_hot = [0]*len(self.label_dict)
one_hot[int(label)] = 1
return one_hot
'''加载数据集'''
def load_traindata(self):
train_X = []
train_Y = []
count = 0
for line in open(self.train_file):
line = line.strip().strip().split('\t')
if len(line) < 2:
continue
count += 1
# if count > 1000:
# break
sent = line[1]
label_id = int(line[0])
sent_vector = self.rep_sentencevector(sent, flag='seg')
train_X.append(sent_vector)
train_Y.append(label_id)
if count % 10000 == 0:
print('loaded %s lines'%count)
return np.array(train_X), np.array(train_Y)
'''使用SVM进行分类'''
def train_classifer(self):
    """Train a ``LinearSVC`` on the training data and save it.

    The data comes from ``self.load_traindata()``; the fitted model is dumped
    to ``self.model_path`` with joblib. Afterwards the model is evaluated on
    the same training data and the hit ratio is printed.
    """
    features, labels = self.load_traindata()
    classifier = LinearSVC()
    classifier.fit(features, labels)
    joblib.dump(classifier, self.model_path)
    predicted = classifier.predict(features)
    total = len(predicted)
    # Count positions where prediction and gold label agree.
    right = sum(1 for idx in range(len(labels)) if predicted[idx] == labels[idx])
    print('precision:%s/%s=%s'%(right, total, right/total))
'''使用svm模型进行预测'''
def predict(self, sent):
    """Predict the crime label for a raw (unsegmented) sentence.

    The model stored at ``self.model_path`` is loaded on first use and then
    kept on the instance, so repeated calls (as in an interactive loop) do
    not deserialise it from disk again.

    :param sent: raw sentence text; it is segmented internally because
        ``rep_sentencevector`` is called with ``flag='noseg'``.
    :return: the entry of ``self.id_dict`` for the predicted label id, or
        ``None`` when the id is not in the dict.
    """
    model = getattr(self, '_cached_model', None)
    if model is None:
        # NOTE: joblib.load unpickles the file -- only load trusted models.
        model = joblib.load(self.model_path)
        self._cached_model = model
    represent_sent = self.rep_sentencevector(sent, flag='noseg')
    # The classifier expects a 2-D batch; reshape to one row.
    text_vector = np.array(represent_sent).reshape(1, -1)
    res = model.predict(text_vector)[0]
    return self.id_dict.get(res)
'''检查测试合准确率'''
def check_precision(self):
    """Load the saved model and print its hit ratio on the training data.

    The model is read from ``self.model_path`` with joblib and evaluated on
    whatever ``self.load_traindata()`` returns.
    """
    classifier = joblib.load(self.model_path)
    features, labels = self.load_traindata()
    predicted = classifier.predict(features)
    total = len(predicted)
    # Count positions where prediction and gold label agree.
    right = sum(1 for idx in range(len(labels)) if predicted[idx] == labels[idx])
    print('precision:%s/%s=%s'%(right, total, right/total))
# precision:170231 / 204231 = 0.83352184536138
# precision:2650780 / 2880306 = 0.9203119390786951
def test():
    """Interactive loop: read a sentence from stdin and print its predicted label.

    Runs until interrupted. ``CrimeClassify.predict`` loads the model from
    disk, so a trained model file must already exist (see
    ``CrimeClassify.train_classifer``).
    """
    handler = CrimeClassify()
    while True:
        sent = input('enter an sent to search:')
        print(handler.predict(sent))
def build_data(crime_file='crime.txt', source_file='accu_train.txt',
               output_file='crime_train_all.txt'):
    """Convert the raw accusation corpus into ``<label id>\\t<text>`` lines.

    ``crime_file`` lists one crime name per line; ids are assigned in file
    order starting at 0. Each line of ``source_file`` is split on ``###``;
    lines with fewer than three fields are skipped. The first ``;``-separated
    entry of the second field is the crime name and the last field is the
    text. Records whose crime is not listed in ``crime_file`` are skipped:
    writing them with the label ``None`` would make the training loader fail
    on ``int('None')``.

    :param crime_file: path of the crime-name list (read as UTF-8).
    :param source_file: path of the ``###``-separated corpus (read as UTF-8).
    :param output_file: path of the training file to (over)write.
    """
    label_dict = {}
    next_id = 0
    with open(crime_file, encoding='utf-8') as f:
        for line in f:
            crime = line.strip()
            if not crime:
                continue
            label_dict[crime] = next_id
            next_id += 1

    count = 0
    with open(source_file, encoding='utf-8') as src, \
            open(output_file, 'w', encoding='utf-8') as out:
        for line in src:
            fields = line.strip().split('###')
            if len(fields) < 3:
                continue
            # Only the first accusation of a multi-accusation record is used.
            crime = fields[1].split(';')[0]
            label = label_dict.get(crime)
            if label is None:
                continue
            out.write(str(label) + '\t' + fields[-1] + '\n')
            count += 1
            print(count)
if __name__ == '__main__':
    # Default entry point: interactive prediction.
    # Alternatives kept for reference -- swap one in to rebuild the training
    # file or to re-check accuracy on the training data:
    #   build_data()
    #   handler = CrimeClassify()
    #   handler.check_precision()
    test()

58
crime_qa.py Normal file
View File

@ -0,0 +1,58 @@
#!/usr/bin/env python3
# coding: utf-8
# File: crime_qa.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-11-10
import os
import time
import json
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import pymongo
class CrimeQA:
    """Question-answer lookup backed by an Elasticsearch index.

    The connection target and the index/doc type are constructor parameters;
    their defaults reproduce the previously hard-coded values.
    """

    def __init__(self, host="127.0.0.1", port=9200, index="crime_data", doc_type="crime"):
        """Create the Elasticsearch client.

        :param host: Elasticsearch host name or address.
        :param port: Elasticsearch HTTP port.
        :param index: name of the index that is searched.
        :param doc_type: document type passed along with every search.
        """
        self._index = index
        self.es = Elasticsearch([{"host": host, "port": port}])
        self.doc_type = doc_type

    def search_specific(self, value, key="question", size=20):
        """Run a ``match`` query for ``value`` against field ``key``.

        :param value: text to match.
        :param key: name of the document field to match against.
        :param size: maximum number of hits requested.
        :return: the list found under ``["hits"]["hits"]`` in the response.
        """
        query_body = {
            "query": {
                "match": {
                    key: value,
                }
            }
        }
        searched = self.es.search(index=self._index, doc_type=self.doc_type,
                                  body=query_body, size=size)
        return searched["hits"]["hits"]

    def search_es(self, question):
        """Search for ``question`` and reshape the hits.

        :param question: question text.
        :return: list of dicts with keys ``score`` (the hit's ``_score``),
            ``sim_question`` (the stored question) and ``answers`` (the stored
            answers string split on newlines), in hit order.
        """
        answers = []
        for hit in self.search_specific(question):
            source = hit['_source']
            answers.append({
                'score': hit['_score'],
                'sim_question': source['question'],
                'answers': source['answers'].split('\n'),
            })
        return answers

    def search_main(self, question):
        """Print every candidate answer found for ``question``, one per line."""
        for candi in self.search_es(question):
            print(candi)
if __name__ == "__main__":
    # Smoke run: print the candidates found for one sample question.
    CrimeQA().search_main('最近买了一把枪,会犯什么罪?')

856
data/kg_crime.json Normal file

File diff suppressed because one or more lines are too long

BIN
data/qa_corpus.json.zip Normal file

Binary file not shown.

202
dict/crime.txt Normal file
View File

@ -0,0 +1,202 @@
妨害公务
寻衅滋事
盗窃、侮辱尸体
危险物品肇事
非法采矿
组织、强迫、引诱、容留、介绍卖淫
开设赌场
聚众斗殴
绑架
非法持有毒品
销售假冒注册商标的商品
容留他人吸毒
假冒注册商标
交通肇事
破坏电力设备
组织卖淫
合同诈骗
走私武器、弹药
抢劫
非法处置查封、扣押、冻结的财产
以危险方法危害公共安全
过失投放危险物质
非法制造、买卖、运输、邮寄、储存枪支、弹药、爆炸物
伪造、变造、买卖武装部队公文、证件、印章
持有、使用假币
重婚
聚众冲击国家机关
生产、销售伪劣农药、兽药、化肥、种子
收买被拐卖的妇女、儿童
聚众哄抢
重大劳动安全事故
侵占
包庇毒品犯罪分子
虚报注册资本
违法发放贷款
制造、贩卖、传播淫秽物品
窝藏、包庇
帮助毁灭、伪造证据
放火
强奸
非法携带枪支、弹药、管制刀具、危险物品危及公共安全
伪造、变造金融票证
爆炸
玩忽职守
对非国家工作人员行贿
伪造、倒卖伪造的有价票证
私分国有资产
非法收购、运输、加工、出售国家重点保护植物、国家重点保护植物制品
生产、销售假药
挪用特定款物
过失致人死亡
走私国家禁止进出口的货物、物品
非法制造、买卖、运输、储存危险物质
洗钱
骗取贷款、票据承兑、金融票证
非法买卖制毒物品
非法买卖、运输、携带、持有毒品原植物种子、幼苗
生产、销售有毒、有害食品
滥用职权
招收公务员、学生徇私舞弊
诬告陷害
非法获取国家秘密
非法行医
非法收购、运输、出售珍贵、濒危野生动物、珍贵、濒危野生动物制品
非法出售发票
行贿
高利转贷
非法吸收公众存款
传播淫秽物品
非法进行节育手术
盗伐林木
聚众扰乱社会秩序
走私、贩卖、运输、制造毒品
滥伐林木
赌博
非法经营
生产、销售不符合安全标准的食品
提供侵入、非法控制计算机信息系统程序、工具
倒卖文物
窃取、收买、非法提供信用卡信息
盗掘古文化遗址、古墓葬
协助组织卖淫
破坏广播电视设施、公用电信设施
走私普通货物、物品
逃税
破坏监管秩序
失火
受贿
组织、领导、参加黑社会性质组织
票据诈骗
非法制造、销售非法制造的注册商标标识
侵犯著作权
伪造、变造、买卖国家机关公文、证件、印章
徇私舞弊不征、少征税款
强迫劳动
贷款诈骗
劫持船只、汽车
诈骗
非法种植毒品原植物
非法狩猎
挪用资金
非法收购、运输盗伐、滥伐的林木
出售、购买、运输假币
抢夺
虐待被监管人
窝藏、转移、收购、销售赃物
破坏计算机信息系统
制作、复制、出版、贩卖、传播淫秽物品牟利
拒不支付劳动报酬
盗窃、抢夺枪支、弹药、爆炸物
强迫他人吸毒
走私珍贵动物、珍贵动物制品
虐待
非法获取公民个人信息
破坏交通设施
非法转让、倒卖土地使用权
非法捕捞水产品
非法占用农用地
非法制造、出售非法制造的发票
非法持有、私藏枪支、弹药
集资诈骗
强迫卖淫
伪造公司、企业、事业单位、人民团体印章
利用影响力受贿
编造、故意传播虚假恐怖信息
介绍贿赂
传播性病
拐卖妇女、儿童
倒卖车票、船票
窝藏、转移、隐瞒毒品、毒赃
徇私舞弊不移交刑事案件
过失损坏广播电视设施、公用电信设施
动植物检疫徇私舞弊
破坏交通工具
猥亵儿童
挪用公款
伪造货币
冒充军人招摇撞骗
非法采伐、毁坏国家重点保护植物
故意毁坏财物
非法拘禁
招摇撞骗
伪造、变造居民身份证
徇私枉法
非法生产、买卖警用装备
掩饰、隐瞒犯罪所得、犯罪所得收益
生产、销售伪劣产品
破坏生产经营
帮助犯罪分子逃避处罚
贪污
投放危险物质
持有伪造的发票
危险驾驶
妨害作证
非法猎捕、杀害珍贵、濒危野生动物
重大责任事故
诽谤
虚开发票
引诱、教唆、欺骗他人吸毒
脱逃
扰乱无线电通讯管理秩序
保险诈骗
非法生产、销售间谍专用器材
非法组织卖血
强迫交易
串通投标
破坏易燃易爆设备
传授犯罪方法
妨害信用卡管理
拐骗儿童
单位行贿
打击报复证人
拒不执行判决、裁定
经济犯
金融凭证诈骗
虚开增值税专用发票、用于骗取出口退税、抵扣税款发票
走私废物
组织、领导传销活动
单位受贿
盗窃、抢夺枪支、弹药、爆炸物、危险物质
过失以危险方法危害公共安全
过失致人重伤
引诱、容留、介绍卖淫
遗弃
走私
信用卡诈骗
对单位行贿
故意杀人
聚众扰乱公共场所秩序、交通秩序
盗窃
故意伤害
非法侵入住宅
强制猥亵、侮辱妇女
伪证
污染环境
巨额财产来源不明
非国家工作人员受贿
侮辱
隐匿、故意销毁会计凭证、会计帐簿、财务会计报告
过失损坏武器装备、军事设施、军事通信
敲诈勒索
职务侵占

Binary file not shown.

BIN
model/crime_predict.model Normal file

Binary file not shown.

Binary file not shown.

150
question_classify.py Normal file
View File

@ -0,0 +1,150 @@
#!/usr/bin/env python3
# coding: utf-8
# File: question_classify.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-11-11
import os
import numpy as np
import jieba.posseg as pseg
from keras.models import Sequential, load_model
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D, Dense, Dropout, LSTM, Bidirectional
class QuestionClassify(object):
    """Classify a legal question into one of the categories in ``label_dict``.

    A sentence becomes a ``(max_length, embedding_size)`` matrix of word
    vectors, which is fed to a saved Keras CNN model.
    """

    def __init__(self):
        """Load the word embeddings and set the model paths and sizes."""
        self.label_dict = {
            0: "婚姻家庭",
            1: "劳动纠纷",
            2: "交通事故",
            3: "债权债务",
            4: "刑事辩护",
            5: "合同纠纷",
            6: "房产纠纷",
            7: "侵权",
            8: "公司法",
            9: "医疗纠纷",
            10: "拆迁安置",
            11: "行政诉讼",
            12: "建设工程"
        }
        # Directory of this file; os.path handles the platform separator.
        cur = os.path.dirname(os.path.abspath(__file__))
        self.embedding_path = os.path.join(cur, 'embedding/word_vec_300.bin')
        self.embdding_dict = self.load_embedding(self.embedding_path)
        self.max_length = 60
        self.embedding_size = 300
        # Anchored next to this file so the script also works when it is
        # started from another working directory.
        self.lstm_modelpath = os.path.join(cur, 'model/lstm_question_classify.h5')
        self.cnn_modelpath = os.path.join(cur, 'model/cnn_question_classify.h5')
        # Filled lazily by predict() so the model is read from disk only once.
        self._cnn_model = None

    def load_embedding(self, embedding_path):
        """Load word vectors from a space-separated text file.

        Lines with fewer than 300 space-separated fields are skipped.

        :param embedding_path: path of the embedding file (read as UTF-8).
        :return: dict mapping word (str) to a 1-D ``numpy`` float array.
        """
        embedding_dict = {}
        count = 0
        # ``with`` closes the file; explicit encoding avoids platform defaults.
        with open(embedding_path, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split(' ')
                if len(fields) < 300:
                    continue
                embedding_dict[fields[0]] = np.array([float(v) for v in fields[1:]])
                count += 1
                if count % 10000 == 0:
                    print(count, 'loaded')
        print('loaded %s word embedding, finished' % count, )
        return embedding_dict

    def seg_sent(self, s):
        """Segment ``s`` with jieba, dropping tokens whose POS flag starts with ``w`` or ``x``."""
        wds = [i.word for i in pseg.cut(s) if i.flag[0] not in ['w', 'x']]
        return wds

    def rep_sentencevector(self, sentence):
        """Turn a sentence into a ``(max_length, embedding_size)`` matrix.

        The sentence is segmented and truncated to ``max_length`` tokens. Row
        ``i`` holds the vector of token ``i``; rows past the end of the
        sentence stay zero. Out-of-vocabulary tokens are then filled in by
        ``modify_sentencevector``.
        """
        word_list = self.seg_sent(sentence)[:self.max_length]
        embedding_matrix = np.zeros((self.max_length, self.embedding_size))
        for index, wd in enumerate(word_list):
            vector = self.embdding_dict.get(wd)
            if vector is not None:
                embedding_matrix[index] = vector
        return self.modify_sentencevector(embedding_matrix, len(word_list))

    def modify_sentencevector(self, embedding_matrix, len_sent):
        """Fill all-zero rows inside the sentence with the mean of their context.

        For every row index below ``len_sent`` whose vector is all zeros, the
        row is replaced by the mean of the rows within two positions on either
        side (itself included), clamped to the matrix bounds. Rows are updated
        in place from top to bottom, so a filled row takes part in the context
        of later rows.

        The right edge used to be clamped to ``-2``, which silently dropped the
        last matrix row from the context; it is now clamped to the last valid
        index. Models trained before this fix saw marginally different vectors
        for such tokens near the end of maximum-length sentences.

        :param embedding_matrix: 2-D ``numpy`` array, modified in place.
        :param len_sent: number of real tokens in the sentence.
        :return: the same ``embedding_matrix`` object.
        """
        context_window = 2
        last = len(embedding_matrix) - 1
        for indx in range(min(len_sent, len(embedding_matrix))):
            if embedding_matrix[indx].any():
                continue
            left = max(indx - context_window, 0)
            right = min(indx + context_window, last)
            embedding_matrix[indx] = embedding_matrix[left:right + 1].mean(axis=0)
        return embedding_matrix

    def label_onehot(self, label):
        """Return a one-hot list of length ``len(self.label_dict)`` for ``label``."""
        one_hot = [0] * len(self.label_dict)
        one_hot[int(label)] = 1
        return one_hot

    def build_cnn_model(self):
        """Build and compile the 1-D CNN classifier; prints the model summary."""
        model = Sequential()
        model.add(Conv1D(64, 3, activation='relu', input_shape=(self.max_length, self.embedding_size)))
        model.add(Conv1D(64, 3, activation='relu'))
        model.add(MaxPooling1D(3))
        model.add(Conv1D(128, 3, activation='relu'))
        model.add(Conv1D(128, 3, activation='relu'))
        model.add(GlobalAveragePooling1D())
        model.add(Dropout(0.5))
        # NOTE(review): sigmoid + binary_crossentropy on one-hot single-label
        # targets; the LSTM below uses softmax + categorical_crossentropy.
        # Left unchanged because saved models depend on it -- confirm intent.
        model.add(Dense(len(self.label_dict), activation='sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer='rmsprop',
                      metrics=['accuracy'])
        model.summary()
        return model

    def build_lstm_model(self):
        """Build and compile the three-layer LSTM classifier."""
        model = Sequential()
        model.add(LSTM(32, return_sequences=True,
                       input_shape=(self.max_length, self.embedding_size)))
        model.add(LSTM(32, return_sequences=True))
        model.add(LSTM(32))  # last layer returns a single vector per sample
        model.add(Dense(len(self.label_dict), activation='softmax'))
        model.compile(loss='categorical_crossentropy',
                      optimizer='rmsprop',
                      metrics=['accuracy'])
        return model

    def predict(self, sent):
        """Classify ``sent`` with the saved CNN model.

        The model at ``self.cnn_modelpath`` is loaded on first use and cached
        on the instance.

        :param sent: raw question text.
        :return: tuple ``(label, prob)`` -- the category name of the highest
            scoring output unit and that unit's score.
        """
        model = getattr(self, '_cnn_model', None)
        if model is None:
            model = load_model(self.cnn_modelpath)
            self._cnn_model = model
        sentence_vector = np.array([self.rep_sentencevector(sent)])
        res = model.predict(sentence_vector)[0].tolist()
        prob = max(res)
        label = self.label_dict.get(res.index(prob))
        return label, prob
if __name__ == '__main__':
    # Interactive loop: classify each sentence typed on stdin until interrupted.
    handler = QuestionClassify()
    while True:
        sent = input('enter an sent to search:')
        print(*handler.predict(sent))

181
question_classify_train.py Normal file
View File

@ -0,0 +1,181 @@
#!/usr/bin/env python3
# coding: utf-8
# File: question_classify_train.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-11-10
import os
import numpy as np
import jieba.posseg as pseg
from keras.models import Sequential, load_model
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D, Dense, Dropout, LSTM, Bidirectional
class QuestionClassify(object):
    """Train CNN / LSTM classifiers that map a legal question to a category.

    A sentence becomes a ``(max_length, embedding_size)`` matrix of word
    vectors; labels are one-hot vectors over ``label_dict``.
    """

    def __init__(self):
        """Load the word embeddings and set data/model paths and sizes."""
        self.label_dict = {
            0: "婚姻家庭",
            1: "劳动纠纷",
            2: "交通事故",
            3: "债权债务",
            4: "刑事辩护",
            5: "合同纠纷",
            6: "房产纠纷",
            7: "侵权",
            8: "公司法",
            9: "医疗纠纷",
            10: "拆迁安置",
            11: "行政诉讼",
            12: "建设工程"
        }
        # Directory of this file; os.path handles the platform separator.
        cur = os.path.dirname(os.path.abspath(__file__))
        self.train_file = os.path.join(cur, 'question_train.txt')
        # NOTE(review): other scripts in this project read
        # 'embedding/word_vec_300.bin' -- confirm which location is current.
        self.embedding_path = os.path.join(cur, 'word_vec_300.bin')
        self.embdding_dict = self.load_embedding(self.embedding_path)
        self.max_length = 60
        self.embedding_size = 300
        # Anchored next to this file so training works from any working dir.
        self.lstm_modelpath = os.path.join(cur, 'model/lstm_question_classify.h5')
        self.cnn_modelpath = os.path.join(cur, 'model/cnn_question_classify.h5')

    def load_embedding(self, embedding_path):
        """Load word vectors from a space-separated text file.

        Lines with fewer than 300 space-separated fields are skipped.

        :param embedding_path: path of the embedding file (read as UTF-8).
        :return: dict mapping word (str) to a 1-D ``numpy`` float array.
        """
        embedding_dict = {}
        count = 0
        # ``with`` closes the file; explicit encoding avoids platform defaults.
        with open(embedding_path, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split(' ')
                if len(fields) < 300:
                    continue
                embedding_dict[fields[0]] = np.array([float(v) for v in fields[1:]])
                count += 1
                if count % 10000 == 0:
                    print(count, 'loaded')
        print('loaded %s word embedding, finished' % count, )
        return embedding_dict

    def seg_sent(self, s):
        """Segment ``s`` with jieba, dropping tokens whose POS flag starts with ``w`` or ``x``."""
        wds = [i.word for i in pseg.cut(s) if i.flag[0] not in ['w', 'x']]
        return wds

    def rep_sentencevector(self, sentence):
        """Turn a sentence into a ``(max_length, embedding_size)`` matrix.

        The sentence is segmented and truncated to ``max_length`` tokens. Row
        ``i`` holds the vector of token ``i``; rows past the end of the
        sentence stay zero. Out-of-vocabulary tokens are then filled in by
        ``modify_sentencevector``.
        """
        word_list = self.seg_sent(sentence)[:self.max_length]
        embedding_matrix = np.zeros((self.max_length, self.embedding_size))
        for index, wd in enumerate(word_list):
            vector = self.embdding_dict.get(wd)
            if vector is not None:
                embedding_matrix[index] = vector
        return self.modify_sentencevector(embedding_matrix, len(word_list))

    def modify_sentencevector(self, embedding_matrix, len_sent):
        """Fill all-zero rows inside the sentence with the mean of their context.

        For every row index below ``len_sent`` whose vector is all zeros, the
        row is replaced by the mean of the rows within two positions on either
        side (itself included), clamped to the matrix bounds. Rows are updated
        in place from top to bottom, so a filled row takes part in the context
        of later rows.

        The right edge used to be clamped to ``-2``, which silently dropped the
        last matrix row from the context; it is now clamped to the last valid
        index.

        :param embedding_matrix: 2-D ``numpy`` array, modified in place.
        :param len_sent: number of real tokens in the sentence.
        :return: the same ``embedding_matrix`` object.
        """
        context_window = 2
        last = len(embedding_matrix) - 1
        for indx in range(min(len_sent, len(embedding_matrix))):
            if embedding_matrix[indx].any():
                continue
            left = max(indx - context_window, 0)
            right = min(indx + context_window, last)
            embedding_matrix[indx] = embedding_matrix[left:right + 1].mean(axis=0)
        return embedding_matrix

    def label_onehot(self, label):
        """Return a one-hot list of length ``len(self.label_dict)`` for ``label``."""
        one_hot = [0] * len(self.label_dict)
        one_hot[int(label)] = 1
        return one_hot

    def load_traindata(self):
        """Read ``self.train_file`` into feature and label arrays.

        Each usable line is ``<sentence>\\t<label id>``; lines with fewer than
        two tab-separated fields are skipped. Progress is printed every 10000
        usable lines.

        :return: tuple ``(X, Y)`` of ``numpy`` arrays -- sentence matrices and
            one-hot label vectors, in file order.
        """
        train_X = []
        train_Y = []
        count = 0
        # ``with`` closes the handle; explicit encoding for the Chinese text.
        with open(self.train_file, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) < 2:
                    continue
                count += 1
                train_X.append(self.rep_sentencevector(fields[0]))
                train_Y.append(self.label_onehot(fields[1]))
                if count % 10000 == 0:
                    print('loaded %s lines' % count)
        return np.array(train_X), np.array(train_Y)

    def build_cnn_model(self):
        """Build and compile the 1-D CNN classifier; prints the model summary."""
        model = Sequential()
        model.add(Conv1D(64, 3, activation='relu', input_shape=(self.max_length, self.embedding_size)))
        model.add(Conv1D(64, 3, activation='relu'))
        model.add(MaxPooling1D(3))
        model.add(Conv1D(128, 3, activation='relu'))
        model.add(Conv1D(128, 3, activation='relu'))
        model.add(GlobalAveragePooling1D())
        model.add(Dropout(0.5))
        # NOTE(review): sigmoid + binary_crossentropy on one-hot single-label
        # targets; the LSTM below uses softmax + categorical_crossentropy.
        # Left unchanged so retrained models stay comparable -- confirm intent.
        model.add(Dense(len(self.label_dict), activation='sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer='rmsprop',
                      metrics=['accuracy'])
        model.summary()
        return model

    def build_lstm_model(self):
        """Build and compile the three-layer LSTM classifier."""
        model = Sequential()
        model.add(LSTM(32, return_sequences=True,
                       input_shape=(self.max_length, self.embedding_size)))
        model.add(LSTM(32, return_sequences=True))
        model.add(LSTM(32))  # last layer returns a single vector per sample
        model.add(Dense(len(self.label_dict), activation='softmax'))
        model.compile(loss='categorical_crossentropy',
                      optimizer='rmsprop',
                      metrics=['accuracy'])
        return model

    def _save_model(self, model, path):
        """Save ``model`` to ``path``, creating the target directory if needed."""
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        model.save(path)

    def train_cnn(self):
        """Train the CNN model (20 epochs, batch size 100) and save it to ``self.cnn_modelpath``."""
        X_train, Y_train, X_test, Y_test = self.split_trainset()
        model = self.build_cnn_model()
        model.fit(X_train, Y_train, batch_size=100, epochs=20, validation_data=(X_test, Y_test))
        self._save_model(model, self.cnn_modelpath)

    def train_lstm(self):
        """Train the LSTM model (50 epochs, batch size 100) and save it to ``self.lstm_modelpath``."""
        X_train, Y_train, X_test, Y_test = self.split_trainset()
        model = self.build_lstm_model()
        model.fit(X_train, Y_train, batch_size=100, epochs=50, validation_data=(X_test, Y_test))
        self._save_model(model, self.lstm_modelpath)

    def split_trainset(self):
        """Split the loaded data 80/20 into training and validation parts.

        The split keeps file order: the first 80% of the lines are used for
        training and the rest for validation.

        :return: tuple ``(X_train, Y_train, X_test, Y_test)``.
        """
        X, Y = self.load_traindata()
        split_rate = 0.8
        # NOTE(review): no shuffling -- if the file is grouped by label the
        # validation part will not cover all classes; confirm the file order.
        indx = int(len(X) * split_rate)
        return X[:indx], Y[:indx], X[indx:], Y[indx:]
if __name__ == '__main__':
    # Train both architectures in sequence, CNN first and then LSTM.
    trainer = QuestionClassify()
    for run_training in (trainer.train_cnn, trainer.train_lstm):
        run_training()