添加语料构建项目
This commit is contained in:
parent
119094761b
commit
06a34764d2
12
EventMonitor/.idea/EventMonitor-master.iml
Normal file
12
EventMonitor/.idea/EventMonitor-master.iml
Normal file
@ -0,0 +1,12 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="TestRunnerService">
|
||||
<option name="projectConfiguration" value="Nosetests" />
|
||||
<option name="PROJECT_TEST_RUNNER" value="Nosetests" />
|
||||
</component>
|
||||
</module>
|
4
EventMonitor/.idea/misc.xml
Normal file
4
EventMonitor/.idea/misc.xml
Normal file
@ -0,0 +1,4 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6.3 (~/anaconda3/bin/python)" project-jdk-type="Python SDK" />
|
||||
</project>
|
8
EventMonitor/.idea/modules.xml
Normal file
8
EventMonitor/.idea/modules.xml
Normal file
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/EventMonitor-master.iml" filepath="$PROJECT_DIR$/.idea/EventMonitor-master.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
340
EventMonitor/.idea/workspace.xml
Normal file
340
EventMonitor/.idea/workspace.xml
Normal file
@ -0,0 +1,340 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ChangeListManager">
|
||||
<list default="true" id="fa8272ba-2814-489d-b9dc-db4d932a6359" name="Default" comment="" />
|
||||
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
|
||||
<option name="TRACKING_ENABLED" value="true" />
|
||||
<option name="SHOW_DIALOG" value="false" />
|
||||
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
||||
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
|
||||
<option name="LAST_RESOLUTION" value="IGNORE" />
|
||||
</component>
|
||||
<component name="FileEditorManager">
|
||||
<leaf>
|
||||
<file leaf-file-name="process_redis.py" pinned="false" current-in-tab="true">
|
||||
<entry file="file://$PROJECT_DIR$/process_redis.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="684">
|
||||
<caret line="38" column="11" lean-forward="true" selection-start-line="38" selection-start-column="11" selection-end-line="38" selection-end-column="11" />
|
||||
<folding>
|
||||
<element signature="e#14#26#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file leaf-file-name="news_spider.py" pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/EventMonitor/spiders/news_spider.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="288">
|
||||
<caret line="46" column="21" lean-forward="false" selection-start-line="46" selection-start-column="21" selection-end-line="46" selection-end-column="21" />
|
||||
<folding>
|
||||
<element signature="e#142#155#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file leaf-file-name="items.py" pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/EventMonitor/items.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="90">
|
||||
<caret line="5" column="13" lean-forward="true" selection-start-line="5" selection-start-column="13" selection-end-line="5" selection-end-column="13" />
|
||||
<folding />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
<file leaf-file-name="pipelines.py" pinned="false" current-in-tab="false">
|
||||
<entry file="file://$PROJECT_DIR$/EventMonitor/pipelines.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="414">
|
||||
<caret line="23" column="20" lean-forward="false" selection-start-line="23" selection-start-column="20" selection-end-line="23" selection-end-column="20" />
|
||||
<folding>
|
||||
<element signature="e#194#203#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
</leaf>
|
||||
</component>
|
||||
<component name="FileTemplateManagerImpl">
|
||||
<option name="RECENT_TEMPLATES">
|
||||
<list>
|
||||
<option value="Python Script" />
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
<component name="FindInProjectRecents">
|
||||
<findStrings>
|
||||
<find>prin</find>
|
||||
</findStrings>
|
||||
</component>
|
||||
<component name="IdeDocumentHistory">
|
||||
<option name="CHANGED_PATHS">
|
||||
<list>
|
||||
<option value="$PROJECT_DIR$/EventMonitor/settings.py" />
|
||||
<option value="$PROJECT_DIR$/EventMonitor/pipelines.py" />
|
||||
<option value="$PROJECT_DIR$/EventMonitor/spiders/news_spider.py" />
|
||||
<option value="$PROJECT_DIR$/process_redis.py" />
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
<component name="ProjectFrameBounds" extendedState="7">
|
||||
<option name="y" value="-7" />
|
||||
<option name="width" value="1366" />
|
||||
<option name="height" value="732" />
|
||||
</component>
|
||||
<component name="ProjectView">
|
||||
<navigator currentView="ProjectPane" proportions="" version="1">
|
||||
<flattenPackages />
|
||||
<showMembers />
|
||||
<showModules />
|
||||
<showLibraryContents />
|
||||
<hideEmptyPackages />
|
||||
<abbreviatePackageNames />
|
||||
<autoscrollToSource />
|
||||
<autoscrollFromSource />
|
||||
<sortByType />
|
||||
<manualOrder />
|
||||
<foldersAlwaysOnTop value="true" />
|
||||
</navigator>
|
||||
<panes>
|
||||
<pane id="Scope" />
|
||||
<pane id="ProjectPane">
|
||||
<subPane>
|
||||
<expand>
|
||||
<path>
|
||||
<item name="EventMonitor-master" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="EventMonitor-master" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="EventMonitor-master" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="EventMonitor-master" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="EventMonitor" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
</expand>
|
||||
<select />
|
||||
</subPane>
|
||||
</pane>
|
||||
<pane id="Scratches" />
|
||||
</panes>
|
||||
</component>
|
||||
<component name="PropertiesComponent">
|
||||
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
|
||||
</component>
|
||||
<component name="RecentsManager">
|
||||
<key name="CopyFile.RECENT_KEYS">
|
||||
<recent name="$PROJECT_DIR$/EventMonitor/spiders" />
|
||||
<recent name="$PROJECT_DIR$/EventMonitor" />
|
||||
</key>
|
||||
<key name="MoveFile.RECENT_KEYS">
|
||||
<recent name="$PROJECT_DIR$" />
|
||||
<recent name="$PROJECT_DIR$/EventMonitor" />
|
||||
<recent name="$PROJECT_DIR$/EventMonitor/spiders" />
|
||||
</key>
|
||||
</component>
|
||||
<component name="RunDashboard">
|
||||
<option name="ruleStates">
|
||||
<list>
|
||||
<RuleState>
|
||||
<option name="name" value="ConfigurationTypeDashboardGroupingRule" />
|
||||
</RuleState>
|
||||
<RuleState>
|
||||
<option name="name" value="StatusDashboardGroupingRule" />
|
||||
</RuleState>
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
<component name="RunManager" selected="Python.process_redis">
|
||||
<configuration name="process_redis" type="PythonConfigurationType" factoryName="Python" temporary="true">
|
||||
<option name="INTERPRETER_OPTIONS" value="" />
|
||||
<option name="PARENT_ENVS" value="true" />
|
||||
<envs>
|
||||
<env name="PYTHONUNBUFFERED" value="1" />
|
||||
</envs>
|
||||
<option name="SDK_HOME" value="" />
|
||||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
|
||||
<option name="IS_MODULE_SDK" value="true" />
|
||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<module name="EventMonitor-master" />
|
||||
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/process_redis.py" />
|
||||
<option name="PARAMETERS" value="" />
|
||||
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||
<option name="EMULATE_TERMINAL" value="false" />
|
||||
</configuration>
|
||||
<recent_temporary>
|
||||
<list size="1">
|
||||
<item index="0" class="java.lang.String" itemvalue="Python.process_redis" />
|
||||
</list>
|
||||
</recent_temporary>
|
||||
</component>
|
||||
<component name="ShelveChangesManager" show_recycled="false">
|
||||
<option name="remove_strategy" value="false" />
|
||||
</component>
|
||||
<component name="TaskManager">
|
||||
<task active="true" id="Default" summary="Default task">
|
||||
<changelist id="fa8272ba-2814-489d-b9dc-db4d932a6359" name="Default" comment="" />
|
||||
<created>1543216322930</created>
|
||||
<option name="number" value="Default" />
|
||||
<option name="presentableId" value="Default" />
|
||||
<updated>1543216322930</updated>
|
||||
</task>
|
||||
<servers />
|
||||
</component>
|
||||
<component name="ToolWindowManager">
|
||||
<frame x="0" y="-7" width="1366" height="732" extended-state="7" />
|
||||
<editor active="true" />
|
||||
<layout>
|
||||
<window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.18887262" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
|
||||
<window_info id="TODO" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="6" side_tool="false" content_ui="tabs" />
|
||||
<window_info id="Event Log" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="true" content_ui="tabs" />
|
||||
<window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.3296" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
|
||||
<window_info id="Version Control" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="false" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
|
||||
<window_info id="Python Console" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
|
||||
<window_info id="Structure" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
|
||||
<window_info id="Terminal" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
|
||||
<window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
|
||||
<window_info id="Favorites" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="true" content_ui="tabs" />
|
||||
<window_info id="Data View" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
|
||||
<window_info id="Cvs" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="4" side_tool="false" content_ui="tabs" />
|
||||
<window_info id="Hierarchy" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="2" side_tool="false" content_ui="combo" />
|
||||
<window_info id="Message" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
|
||||
<window_info id="Commander" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
|
||||
<window_info id="Find" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
|
||||
<window_info id="Inspection" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="5" side_tool="false" content_ui="tabs" />
|
||||
<window_info id="Ant Build" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
|
||||
</layout>
|
||||
</component>
|
||||
<component name="VcsContentAnnotationSettings">
|
||||
<option name="myLimit" value="2678400000" />
|
||||
</component>
|
||||
<component name="XDebuggerManager">
|
||||
<breakpoint-manager>
|
||||
<option name="time" value="1" />
|
||||
</breakpoint-manager>
|
||||
<watches-manager />
|
||||
</component>
|
||||
<component name="editorHistoryManager">
|
||||
<entry file="file://$PROJECT_DIR$/EventMonitor/crawl.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="90">
|
||||
<caret line="5" column="1" lean-forward="false" selection-start-line="5" selection-start-column="1" selection-end-line="5" selection-end-column="1" />
|
||||
<folding />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/rel_data.txt">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="198">
|
||||
<caret line="11" column="0" lean-forward="false" selection-start-line="11" selection-start-column="0" selection-end-line="11" selection-end-column="0" />
|
||||
<folding />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/EventMonitor/__init__.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="0">
|
||||
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
|
||||
<folding />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/__init__.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="0">
|
||||
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
|
||||
<folding />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/EventMonitor/spiders/utils.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="0">
|
||||
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
|
||||
<folding />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/EventMonitor/settings.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="-380">
|
||||
<caret line="44" column="137" lean-forward="false" selection-start-line="44" selection-start-column="137" selection-end-line="44" selection-end-column="137" />
|
||||
<folding />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/EventMonitor/middlewares.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="0">
|
||||
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
|
||||
<folding />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/EventMonitor/spiders/handle_html.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="-852">
|
||||
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
|
||||
<folding />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/rel_data.txt">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="0">
|
||||
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
|
||||
<folding />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/EventMonitor/pipelines.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="414">
|
||||
<caret line="23" column="20" lean-forward="false" selection-start-line="23" selection-start-column="20" selection-end-line="23" selection-end-column="20" />
|
||||
<folding>
|
||||
<element signature="e#194#203#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/EventMonitor/items.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="90">
|
||||
<caret line="5" column="13" lean-forward="true" selection-start-line="5" selection-start-column="13" selection-end-line="5" selection-end-column="13" />
|
||||
<folding />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/EventMonitor/spiders/rel_data.txt">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="0">
|
||||
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
|
||||
<folding />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/EventMonitor/spiders/news_spider.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="288">
|
||||
<caret line="46" column="21" lean-forward="false" selection-start-line="46" selection-start-column="21" selection-end-line="46" selection-end-column="21" />
|
||||
<folding>
|
||||
<element signature="e#142#155#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/process_redis.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="684">
|
||||
<caret line="38" column="11" lean-forward="true" selection-start-line="38" selection-start-column="11" selection-end-line="38" selection-end-column="11" />
|
||||
<folding>
|
||||
<element signature="e#14#26#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</component>
|
||||
</project>
|
0
EventMonitor/EventMonitor/__init__.py
Normal file
0
EventMonitor/EventMonitor/__init__.py
Normal file
BIN
EventMonitor/EventMonitor/__init__.pyc
Normal file
BIN
EventMonitor/EventMonitor/__init__.pyc
Normal file
Binary file not shown.
BIN
EventMonitor/EventMonitor/__pycache__/__init__.cpython-35.pyc
Normal file
BIN
EventMonitor/EventMonitor/__pycache__/__init__.cpython-35.pyc
Normal file
Binary file not shown.
BIN
EventMonitor/EventMonitor/__pycache__/items.cpython-35.pyc
Normal file
BIN
EventMonitor/EventMonitor/__pycache__/items.cpython-35.pyc
Normal file
Binary file not shown.
BIN
EventMonitor/EventMonitor/__pycache__/pipelines.cpython-35.pyc
Normal file
BIN
EventMonitor/EventMonitor/__pycache__/pipelines.cpython-35.pyc
Normal file
Binary file not shown.
BIN
EventMonitor/EventMonitor/__pycache__/settings.cpython-35.pyc
Normal file
BIN
EventMonitor/EventMonitor/__pycache__/settings.cpython-35.pyc
Normal file
Binary file not shown.
19
EventMonitor/EventMonitor/items.py
Normal file
19
EventMonitor/EventMonitor/items.py
Normal file
@ -0,0 +1,19 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Define here the models for your scraped items
|
||||
#
|
||||
# See documentation in:
|
||||
# https://doc.scrapy.org/en/latest/topics/items.html
|
||||
|
||||
import scrapy
|
||||
|
||||
|
||||
class EventmonitorItem(scrapy.Item):
    """Container for the fields collected for one scraped record.

    Only the field names are declared here; how each field is filled is
    decided by the spider. The field names suggest one record per news
    article -- NOTE(review): confirm against the spider that populates it.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    # presumably the search keyword that led to this record -- TODO confirm
    keyword = scrapy.Field()
    # presumably the URL of the article page -- TODO confirm
    news_url = scrapy.Field()
    # NOTE(review): the difference between news_time and news_date is not
    # visible in this file; check the spider for the exact formats.
    news_time = scrapy.Field()
    news_date = scrapy.Field()
    # presumably the article headline -- TODO confirm
    news_title = scrapy.Field()
    # presumably the extracted article body text -- TODO confirm
    news_content = scrapy.Field()
|
103
EventMonitor/EventMonitor/middlewares.py
Normal file
103
EventMonitor/EventMonitor/middlewares.py
Normal file
@ -0,0 +1,103 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Define here the models for your spider middleware
|
||||
#
|
||||
# See documentation in:
|
||||
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
|
||||
from scrapy import signals
|
||||
|
||||
|
||||
class EventmonitorSpiderMiddleware(object):
    """Spider middleware that forwards everything it receives unchanged.

    Scrapy treats a hook that is not defined as "does not modify the passed
    objects", so every method below only spells out that pass-through
    behaviour explicitly.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``.

        Args:
            crawler: object whose ``signals.connect`` registers the handler.

        Returns:
            The newly created middleware instance.
        """
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened,
            signal=signals.spider_opened,
        )
        return middleware

    def process_spider_input(self, response, spider):
        """Let every response through on its way into the spider.

        The hook has to return None or raise an exception; this one always
        returns None.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Yield each element of ``result`` exactly as the spider produced it.

        The hook has to hand back an iterable of Request, dict or Item
        objects; nothing is filtered or altered here.
        """
        for produced in result:
            yield produced

    def process_spider_exception(self, response, exception, spider):
        """Decline to handle ``exception`` by returning None.

        Called when a spider or another middleware's process_spider_input()
        raises. The hook may return None or an iterable of Response, dict or
        Item objects.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield the spider's start requests unchanged.

        Works like process_spider_output() but without an associated
        response; only requests (never items) may be yielded.
        """
        for request in start_requests:
            yield request

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
|
||||
|
||||
|
||||
class EventmonitorDownloaderMiddleware(object):
    """Downloader middleware that forwards requests and responses unchanged.

    Scrapy treats a hook that is not defined as "does not modify the passed
    objects", so every method below only spells out that pass-through
    behaviour explicitly.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``.

        Args:
            crawler: object whose ``signals.connect`` registers the handler.

        Returns:
            The newly created middleware instance.
        """
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened,
            signal=signals.spider_opened,
        )
        return middleware

    def process_request(self, request, spider):
        """Let every request continue through the downloader chain.

        The hook must either:
        - return None to continue processing this request,
        - return a Response object,
        - return a Request object, or
        - raise IgnoreRequest, in which case the process_exception() methods
          of the installed downloader middleware are called.

        This implementation always returns None.
        """
        return None

    def process_response(self, request, response, spider):
        """Return the downloaded ``response`` untouched.

        The hook must either return a Response object, return a Request
        object, or raise IgnoreRequest.
        """
        return response

    def process_exception(self, request, exception, spider):
        """Decline to handle ``exception`` by returning None.

        Called when a download handler or another middleware's
        process_request() raises. The hook must either:
        - return None to continue processing this exception,
        - return a Response object, which stops the process_exception() chain,
        - return a Request object, which stops the process_exception() chain.
        """
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
|
46
EventMonitor/EventMonitor/pipelines.py
Normal file
46
EventMonitor/EventMonitor/pipelines.py
Normal file
@ -0,0 +1,46 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Define your item pipelines here
|
||||
#
|
||||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
||||
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
|
||||
|
||||
import os
|
||||
import pymongo
|
||||
|
||||
class EventmonitorPipeline(object):
    """Item pipeline that stores every scraped item in a MongoDB collection.

    Items are written to the ``docs`` collection of the
    ``person_rel_dataset`` database. Storage is best-effort: a failed write
    is logged and the item still continues down the pipeline chain.
    """

    def __init__(self, mongo_host='192.168.1.37', mongo_port=27017):
        """Prepare the local ``news`` directory and open the MongoDB collection.

        Args:
            mongo_host: host name or address of the MongoDB server. Defaults
                to the address that used to be hard-coded here.
            mongo_port: TCP port of the MongoDB server.
        """
        # Two levels above this file, i.e. the directory that contains the
        # package directory. os.path.dirname works with any path separator,
        # unlike splitting the path on '/'.
        project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        # The directory is created here; within this class nothing else
        # reads news_path (only the commented-out file-based variant did).
        self.news_path = os.path.join(project_root, 'news')
        if not os.path.exists(self.news_path):
            os.makedirs(self.news_path)
        conn = pymongo.MongoClient(mongo_host, mongo_port)
        self.col = conn['person_rel_dataset']['docs']

    def process_item(self, item, spider):
        """Store the scraped item in MongoDB and pass it on.

        Args:
            item: the scraped item; it is copied into a plain dict before
                being written, so the item itself is not modified.
            spider: the spider that produced the item; its logger is used to
                report a failed write.

        Returns:
            The unchanged ``item``, so later pipelines still receive it.
        """
        try:
            # insert_one replaces Collection.insert, which is deprecated in
            # PyMongo 3 and removed in PyMongo 4.
            self.col.insert_one(dict(item))
        except (pymongo.errors.WriteError, KeyError) as err:
            # Best-effort storage: keep crawling, but leave a trace instead
            # of silently discarding the failure.
            spider.logger.warning('Failed to store item in MongoDB: %s', err)
        return item
|
||||
|
||||
# '''处理数据流'''
|
||||
# def process_item(self, item, spider):
|
||||
# print(item)
|
||||
# keyword = item['keyword']
|
||||
# event_path = os.path.join(self.news_path, keyword)
|
||||
# if not os.path.exists(event_path):
|
||||
# os.makedirs(event_path)
|
||||
# filename = os.path.join(event_path, item['news_date'] + '@' + item['news_title'])
|
||||
# self.save_localfile(filename, item['news_title'], item['news_time'], item['news_content'])
|
||||
# return item
|
||||
#
|
||||
# '''将内容保存至文件当中'''
|
||||
# def save_localfile(self, filename, title, pubtime, content):
|
||||
# with open(filename, 'w+') as f:
|
||||
# f.write('标题:{0}\n'.format(title))
|
||||
# f.write('发布时间:{0}\n'.format(pubtime))
|
||||
# f.write('正文:{0}\n'.format(content))
|
||||
# f.close()
|
90
EventMonitor/EventMonitor/settings.py
Normal file
90
EventMonitor/EventMonitor/settings.py
Normal file
@ -0,0 +1,90 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Scrapy settings for EventMonitor project
|
||||
#
|
||||
# For simplicity, this file contains only settings considered important or
|
||||
# commonly used. You can find more settings consulting the documentation:
|
||||
#
|
||||
# https://doc.scrapy.org/en/latest/topics/settings.html
|
||||
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
|
||||
# Name of the bot implemented by this Scrapy project.
BOT_NAME = 'EventMonitor'

# Both settings below name the same package, so it is spelled out once:
# NEWSPIDER_MODULE is a single module path, SPIDER_MODULES a list of them.
NEWSPIDER_MODULE = 'EventMonitor.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]
|
||||
|
||||
|
||||
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
||||
#USER_AGENT = 'EventMonitor (+http://www.yourdomain.com)'
|
||||
|
||||
# robots.txt rules are NOT obeyed: the setting is switched off, so sites'
# robots.txt policies are not respected by this crawler.
ROBOTSTXT_OBEY = False
|
||||
|
||||
# Configure maximum concurrent requests performed by Scrapy (default: 16)
|
||||
#CONCURRENT_REQUESTS = 32
|
||||
|
||||
# Configure a delay for requests for the same website (default: 0)
|
||||
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
|
||||
# See also autothrottle settings and docs
|
||||
#DOWNLOAD_DELAY = 3
|
||||
# The download delay setting will honor only one of:
|
||||
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
|
||||
#CONCURRENT_REQUESTS_PER_IP = 16
|
||||
|
||||
# Disable cookies (enabled by default)
|
||||
#COOKIES_ENABLED = False
|
||||
|
||||
# Disable Telnet Console (enabled by default)
|
||||
#TELNETCONSOLE_ENABLED = False
|
||||
|
||||
# Headers sent with every request unless a request overrides them.
# The User-Agent is a desktop Safari string rather than Scrapy's own.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/600.5.17 (KHTML, like Gecko) Version/8.0.5 Safari/600.5.17',
}
|
||||
|
||||
# Enable or disable spider middlewares
|
||||
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
|
||||
#SPIDER_MIDDLEWARES = {
|
||||
# 'EventMonitor.middlewares.EventmonitorSpiderMiddleware': 543,
|
||||
#}
|
||||
|
||||
# Enable or disable downloader middlewares
|
||||
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||
#DOWNLOADER_MIDDLEWARES = {
|
||||
# 'EventMonitor.middlewares.EventmonitorDownloaderMiddleware': 543,
|
||||
#}
|
||||
|
||||
# Enable or disable extensions
|
||||
# See https://doc.scrapy.org/en/latest/topics/extensions.html
|
||||
#EXTENSIONS = {
|
||||
# 'scrapy.extensions.telnet.TelnetConsole': None,
|
||||
#}
|
||||
|
||||
# Enabled item pipelines, mapped to their order value.
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {'EventMonitor.pipelines.EventmonitorPipeline': 300}
|
||||
|
||||
# Enable and configure the AutoThrottle extension (disabled by default)
|
||||
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
|
||||
#AUTOTHROTTLE_ENABLED = True
|
||||
# The initial download delay
|
||||
#AUTOTHROTTLE_START_DELAY = 5
|
||||
# The maximum download delay to be set in case of high latencies
|
||||
#AUTOTHROTTLE_MAX_DELAY = 60
|
||||
# The average number of requests Scrapy should be sending in parallel to
|
||||
# each remote server
|
||||
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
|
||||
# Enable showing throttling stats for every response received:
|
||||
#AUTOTHROTTLE_DEBUG = False
|
||||
|
||||
# Enable and configure HTTP caching (disabled by default)
|
||||
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
|
||||
#HTTPCACHE_ENABLED = True
|
||||
#HTTPCACHE_EXPIRATION_SECS = 0
|
||||
#HTTPCACHE_DIR = 'httpcache'
|
||||
#HTTPCACHE_IGNORE_HTTP_CODES = []
|
||||
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
|
BIN
EventMonitor/EventMonitor/settings.pyc
Normal file
BIN
EventMonitor/EventMonitor/settings.pyc
Normal file
Binary file not shown.
4
EventMonitor/EventMonitor/spiders/__init__.py
Normal file
4
EventMonitor/EventMonitor/spiders/__init__.py
Normal file
@ -0,0 +1,4 @@
|
||||
# This package will contain the spiders of your Scrapy project
|
||||
#
|
||||
# Please refer to the documentation for information on how to create and manage
|
||||
# your spiders.
|
BIN
EventMonitor/EventMonitor/spiders/__init__.pyc
Normal file
BIN
EventMonitor/EventMonitor/spiders/__init__.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
318
EventMonitor/EventMonitor/spiders/extract_news.py
Normal file
318
EventMonitor/EventMonitor/spiders/extract_news.py
Normal file
@ -0,0 +1,318 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
import re
|
||||
from collections import Counter
|
||||
from operator import itemgetter
|
||||
import copy
|
||||
from lxml import etree
|
||||
from .handle_html import *
|
||||
from .utils import *
|
||||
|
||||
class NewsParser:
    """Heuristic extractor of title, publish time and body text from news HTML.

    The page is flattened into a sequence of ``<div>`` segments, each segment
    is scored by its punctuation density, the highest scoring segment seeds a
    cluster of neighbouring segments that share the same ``<p ...>`` tag
    signature, and low scoring neighbours are then absorbed into that cluster.

    NOTE(review): this class was reconstructed from a listing whose
    indentation was lost; the nesting of a few ``else`` branches was inferred
    from the surrounding logic and should be confirmed against the project.
    """

    # Date followed by a time of day, e.g. "2018-07-15 12:30[:45]".
    _DATETIME_RE = re.compile(
        r'(\d{4}\s*[年\-:/]\s*)\d{1,2}\s*[月\-:/]\s*\d{1,2}\s*[\-_:日]?\s*\d{1,2}\s*:\s*\d{1,2}\s*(:\s*\d{1,2})?',
        flags=re.S | re.I)
    # Date only, e.g. "2018年7月15日".
    _DATE_RE = re.compile(
        r'(\d{4}\s*[年\-:/]\s*)\d{1,2}\s*[月\-:/]\s*\d{1,2}\s*[\-_:日/]?',
        flags=re.S)

    def __init__(self):
        # A segment must score strictly above this to become a cluster candidate.
        self.score = 6
        # Maximum (exclusive) index distance between segments that may be joined.
        self.length = 5

    def _cal_score(self, text):
        """Return the "interest" score of a text segment.

        Without a full stop the segment scores 0 (has a comma) or -1 (no
        comma); otherwise the score is built from punctuation counts.
        """
        if "。" not in text:
            return 0 if "," in text else -1
        # NOTE(review): the two comma terms below are identical as written;
        # one of them was possibly meant to count a different comma
        # character -- confirm before changing the weighting.
        score = text.count(',') + 1
        score += text.count(',') + 1
        score += text.count(';')
        score += text.count('。')
        return score

    def _line_div(self, html):
        """Flatten nested ``<div>``/``<table>`` markup into sibling ``<div>`` segments."""
        html = re.sub("</?div>|</?table>", "</div><div>", html, flags=re.I)
        # Drop the unbalanced closing tag at the front and opening tag at the end.
        html = html.replace('</div>', '', 1)
        index = html.rfind('<div>')
        html = html[:index] + html[index:].replace('<div>', '', 1)
        return html

    def _line_p(self, text):
        """Rewrite every ``<p ...>``/``</p>`` boundary so each paragraph is a ``news_body`` ``<p>``."""
        text = re.sub(r'</?p\s?.*?>', r'</p><p class="news_body">', text, flags=re.I | re.S)
        text = text.replace('</p>', '', 1)
        index = text.rfind('<p>')
        text = text[:index] + text[index:].replace('<p>', '', 1)
        return '<p class="news_head">{0}</p>'.format(text)

    def _extract_paragraph(self, html):
        """Split ``html`` into ``<div>`` segments and sort them into two dicts.

        :return: ``(cluster_para, absorb_para)``; both map the segment index
                 to ``[text, score]``. Segments scoring above ``self.score``
                 go to ``cluster_para``, the remaining non-empty ones to
                 ``absorb_para``.
        """
        cluster_para = {}
        absorb_para = {}
        segments = re.findall("<div>(.*?)</div>", html, flags=re.S | re.I)
        for index, div_str in enumerate(segments):
            para_str = div_str.strip()
            if not para_str:
                continue
            score = self._cal_score(para_str)
            target = cluster_para if score > self.score else absorb_para
            target[index] = [para_str, score]
        return cluster_para, absorb_para

    def _extract_feature(self, para_dict):
        """Pick the best scoring segment and its most frequent ``<p ...>`` tag.

        :return: ``(index, feature)`` where ``feature`` is the tag escaped for
                 use inside a regular expression, or ``''`` when the segment
                 contains no ``<p>`` tag.
        """
        index, text = max(para_dict.items(), key=lambda item: item[1][1])
        tag_counts = Counter(re.findall("(<p.*?>)", text[0], flags=re.I | re.S))
        if not tag_counts:
            return index, ''
        feature = tag_counts.most_common(1)[0][0]
        # The tag is matched literally later on, so every regex metacharacter
        # must be escaped (the previous code only escaped parentheses, which
        # broke on attribute values containing '?', '[', '*', ...).
        return index, re.escape(feature)

    def _gen_skeleton(self, para_dict, index, feature):
        """Cluster candidate segments around the seed ``index`` into the text skeleton.

        A segment joins the skeleton when it lies closer than ``self.length``
        to the previously accepted segment in the same direction and contains
        the ``feature`` tag.
        """
        skeleton_dict = {}
        if not feature:
            skeleton_dict[index] = para_dict[index]
            return skeleton_dict

        num_list = sorted(para_dict)
        seed_pos = num_list.index(index)
        pattern = r".*?{0}".format(feature)

        # Walk from the seed towards the end of the document.
        anchor = index
        for num in num_list[seed_pos:]:
            if abs(num - anchor) < self.length:
                if re.match(pattern, para_dict[num][0], flags=re.S | re.I):
                    skeleton_dict[num] = para_dict[num]
                    anchor = num

        # Walk from the seed towards the start of the document. The anchor
        # must restart at the seed: reusing the anchor advanced by the loop
        # above rejected segments directly in front of the seed.
        # NOTE(review): this direction historically matches without re.I.
        anchor = index
        for num in reversed(num_list[:seed_pos]):
            if abs(anchor - num) < self.length:
                if re.match(pattern, para_dict[num][0], flags=re.S):
                    skeleton_dict[num] = para_dict[num]
                    anchor = num
        return skeleton_dict

    def _absorb_text(self, skeleton_dict, para_dict):
        """Absorb low scoring segments that sit next to or inside the skeleton.

        :return: a new dict holding the skeleton plus every absorbed segment;
                 ``skeleton_dict`` itself is left untouched.
        """
        content_dict = dict(skeleton_dict)
        sk_list = sorted(skeleton_dict)
        if not sk_list:
            return content_dict
        first, last = sk_list[0], sk_list[-1]

        pa_list = sorted(para_dict)
        heads = [num for num in pa_list if num < first]
        tail = [num for num in pa_list if num > last]
        middle = [num for num in pa_list if first <= num <= last]

        def worth_keeping(num):
            return para_dict[num][1] * 2 > self.score

        # NOTE(review): stopping at the first out-of-range segment (rather
        # than at the first low scoring one) is the reconstructed nesting.
        for num in reversed(heads):  # nearest to the skeleton first
            if abs(num - first) >= self.length:
                break
            if worth_keeping(num):
                content_dict[num] = para_dict[num]

        for num in tail:  # nearest to the skeleton first
            if abs(num - last) >= self.length:
                break
            if worth_keeping(num):
                content_dict[num] = para_dict[num]

        for num in reversed(middle):
            if worth_keeping(num):
                content_dict[num] = para_dict[num]
        return content_dict

    def _substring(self, text):
        """Return the plain text of one content segment, one paragraph per line."""
        text = self._line_p(text)
        text = pretty_html(text)
        selector = etree.HTML(text)
        xpath_result = selector.xpath('//p')
        if len(xpath_result) == 1:
            sub_string = xpath_result[0].xpath('string(.)')
            return drop_mutil_br(sub_string)

        text_list = []
        for item in selector.xpath('//p[@class="news_body"]'):
            p_string = item.xpath('string(.)').strip()
            if not p_string:
                continue
            text_list.append(drop_null(p_string))
        return '\n'.join(text_list)

    def _pretty_text(self, index_content_list):
        """Join the plain text of all ``(index, [text, score])`` entries with newlines."""
        contents = []
        for each in index_content_list:
            sub_text = self._substring(each[1][0])
            if sub_text:
                contents.append(sub_text)
        return "\n".join(contents)

    def _search_top_divs(self, pattern, top_div_list):
        """Return the first ``pattern`` match in the top segments, nearest to the body first."""
        for _, item in reversed(top_div_list):
            if not item.strip():
                continue
            div_selector = etree.HTML(item)
            if div_selector is None:
                continue
            match = pattern.search(div_selector.xpath('string(.)'))
            if match:
                return match.group()
        return ''

    def _find_pubtime(self, top_div_list):
        """Return the publish time found above the body as ``Y-M-D[ time]``, or ``''``."""
        # Prefer a full date + time; fall back to a bare date.
        pubtime = (self._search_top_divs(self._DATETIME_RE, top_div_list)
                   or self._search_top_divs(self._DATE_RE, top_div_list))
        if not pubtime:
            return ''
        pubtime = pubtime.strip()
        pubtime = pubtime.replace('年', '-').replace('月', '-').replace('日', ' ').replace('/', '-')
        return drop_mutil_blank(pubtime)

    def _find_title(self, html):
        """Return the cleaned text of the first ``<title>`` element, or ``''``."""
        selector = etree.HTML(html)
        tmps = selector.xpath('//title/text()')
        if not tmps:
            return ''
        return clear_title(tmps[0].strip())

    def extract_news(self, html):
        """Extract the structured news record from a raw HTML page.

        :param html: raw page source
        :return: dict with keys ``news_content``, ``news_pubtime``,
                 ``news_date`` and ``news_title``; ``None`` when no candidate
                 body segment exists, or ``{}`` when any field is missing.
        """
        html = handle_html(html)
        html = self._line_div(html)
        cluster_para, absorb_para = self._extract_paragraph(html)
        if not cluster_para:
            return None

        index, feature = self._extract_feature(cluster_para)
        skeleton_dict = self._gen_skeleton(cluster_para, index, feature)
        if not skeleton_dict:
            # Previously this fell through to a NameError further down.
            return {}

        if absorb_para:
            content_dict = self._absorb_text(skeleton_dict, absorb_para)
        else:
            content_dict = skeleton_dict
        index_content_list = sorted(content_dict.items(), key=itemgetter(0))

        # Segments located above the body are searched for the publish time.
        first_index = index_content_list[0][0]
        top_div_list = []
        for ind, each_div in enumerate(re.findall("<div>(.*?)</div>", html, flags=re.S)):
            if ind >= first_index:
                break
            top_div_list.append((ind, each_div))

        news = {}
        news['news_content'] = self._pretty_text(index_content_list).strip()
        news['news_pubtime'] = self.pretty_time(self._find_pubtime(top_div_list))
        news_title = self._find_title(html)
        if news['news_pubtime']:
            news['news_date'] = news['news_pubtime'].split(' ')[0]
        else:
            news['news_date'] = ''
        news['news_title'] = news_title

        if not (news['news_content'] and news['news_pubtime'] and news['news_title'] and news['news_date']):
            return {}
        return news

    def pretty_time(self, time):
        """Normalise ``"Y-M-D"`` or ``"Y-M-D H:M[:S]"`` so month and day are zero padded.

        :return: the normalised string, or ``None`` for an empty input.
        """
        if not time:
            return None
        parts = time.split(' ')
        if len(parts) == 2:
            return ' '.join([self.pretty_date(parts[0]), parts[1]])
        return self.pretty_date(parts[0])

    def pretty_date(self, date):
        """Zero pad the month and day of a ``Y-M-D`` string.

        :return: the padded date, or ``''`` when ``date`` does not consist of
                 three ``-`` separated parts with a numeric month and day.
        """
        parts = date.split('-')
        if len(parts) != 3:
            return ''
        year, month, day = parts
        try:
            pad_month = int(month) < 10 and len(month) == 1
            pad_day = int(day) < 10 and len(day) == 1
        except ValueError:
            # e.g. "2018-07-15:12:30" leaves "15:12:30" as the day part.
            return ''
        if pad_month:
            month = '0' + month
        if pad_day:
            day = '0' + day
        return '-'.join([year, month, day])
|
||||
|
||||
|
||||
|
||||
|
||||
|
BIN
EventMonitor/EventMonitor/spiders/extract_news.pyc
Normal file
BIN
EventMonitor/EventMonitor/spiders/extract_news.pyc
Normal file
Binary file not shown.
74
EventMonitor/EventMonitor/spiders/handle_html.py
Normal file
74
EventMonitor/EventMonitor/spiders/handle_html.py
Normal file
@ -0,0 +1,74 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
# author: chenhe<hee0624@163.com>
|
||||
# time: 2017-11-30
|
||||
# version: 1.0
|
||||
|
||||
from html.parser import HTMLParser
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
class StripParser(HTMLParser):
    """Re-emit HTML while discarding the text content of selected tags.

    Tags listed in ``drop_tags`` are not emitted and the text inside them is
    suppressed. ``<p>`` start tags keep their attributes; every other start
    tag is emitted without attributes.
    """

    def __init__(self):
        # Let the base class initialise its parsing state.
        super().__init__(convert_charrefs=True)
        self.strict = False  # kept for compatibility with existing users of the attribute
        self.drop_tags = {'script', 'style', 'iframe', 'aside', 'nav', 'footer'}
        # Output fragments, joined by get_html().
        self.fed = []
        # Stack of currently open dropped tags.
        self.point_tags = []
        # False while inside at least one dropped tag.
        self.is_fed = True

    def handle_starttag(self, tag, attrs):
        """Record a start tag, or begin suppressing text for a dropped tag."""
        if tag in self.drop_tags:
            self.is_fed = False
            self.point_tags.append(tag)
        elif tag == 'p':
            tmp_attrs = ' '.join('{0}="{1}"'.format(i[0], i[1]) for i in attrs)
            self.fed.append('<p {}>'.format(tmp_attrs))
        else:
            self.fed.append('<{}>'.format(tag))

    def handle_data(self, data):
        """Keep text only when it is not inside a dropped tag."""
        if self.is_fed:
            self.fed.append(data)

    def handle_endtag(self, tag):
        """Record an end tag, or close the innermost dropped tag."""
        if tag in self.drop_tags:
            # A stray closing tag without a matching opener is ignored instead
            # of raising IndexError on the empty stack.
            if self.point_tags and tag == self.point_tags[-1]:
                self.point_tags.pop()
            if not self.point_tags:
                self.is_fed = True
        else:
            self.fed.append('</{}>'.format(tag))

    def get_html(self):
        """Return the collected fragments joined by newlines."""
        return '\n'.join(self.fed)
|
||||
|
||||
|
||||
def pretty_html(html):
    """Parse ``html`` with BeautifulSoup's ``html.parser`` backend and return it prettified."""
    return BeautifulSoup(html, 'html.parser').prettify()
|
||||
|
||||
|
||||
def strip_tag(html):
    """
    Remove the tags dropped by ``StripParser`` from an HTML string.

    :param html: string
    :return: string
    """
    stripper = StripParser()
    stripper.feed(html)
    cleaned = stripper.get_html()
    return cleaned
|
||||
|
||||
|
||||
def handle_html(html):
    """Prettify ``html`` and then strip the unwanted tags from it."""
    return strip_tag(pretty_html(html))
|
||||
|
BIN
EventMonitor/EventMonitor/spiders/handle_html.pyc
Normal file
BIN
EventMonitor/EventMonitor/spiders/handle_html.pyc
Normal file
Binary file not shown.
86
EventMonitor/EventMonitor/spiders/news_spider.py
Normal file
86
EventMonitor/EventMonitor/spiders/news_spider.py
Normal file
@ -0,0 +1,86 @@
|
||||
#!/usr/bin/env python3
|
||||
# coding: utf-8
|
||||
# File: news_spider.py
|
||||
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
|
||||
# Date: 18-7-15
|
||||
|
||||
import scrapy
|
||||
import os
|
||||
from lxml import etree
|
||||
import urllib.request
|
||||
from urllib.parse import quote, quote_plus
|
||||
from .extract_news import *
|
||||
from EventMonitor.items import EventmonitorItem
|
||||
import redis
|
||||
import os
|
||||
|
||||
class BuildData:
    """Load the seed relation records stored next to this module in ``rel_data.txt``."""

    def __init__(self):
        # os.path.dirname works with any path separator; splitting on '/'
        # did not work on Windows or for a module at the filesystem root.
        cur = os.path.dirname(os.path.abspath(__file__))
        self.rel_filepath = os.path.join(cur, 'rel_data.txt')
        self.seed_rels = self.collect_rels()

    def collect_rels(self):
        """Read the relation data file.

        Each line is split on ``###`` and its last two fields are discarded.

        :return: list with one list of remaining fields per line
        """
        rels_data = []
        # The context manager closes the handle, which was previously leaked.
        with open(self.rel_filepath) as rel_file:
            for line in rel_file:
                fields = line.strip().split('###')
                rels_data.append(fields[:-2])
        return rels_data
|
||||
|
||||
|
||||
class NewsSpider(scrapy.Spider):
    """Crawl Baidu news search results for keyword records popped from Redis.

    Each record is a ``###`` separated line; its last field is dropped and
    the rest become the keywords of the crawl.
    """

    name = 'eventspider'

    def __init__(self, *args, **kwargs):
        # Forward spider arguments (e.g. ``scrapy crawl eventspider -a key=value``)
        # to scrapy.Spider; without this any ``-a`` argument raised TypeError.
        super().__init__(*args, **kwargs)
        self.seed_rels = BuildData().seed_rels
        self.parser = NewsParser()
        # NOTE(review): the Redis host is hard-coded; confirm it matches the
        # instance the queue is filled from.
        self.pool = redis.ConnectionPool(host='192.168.1.29', port=6379, decode_responses=True)
        self.conn = redis.Redis(connection_pool=self.pool)
        self.redis_key = 'person_names'

    def start_requests(self):
        """Pop records from Redis until the set is empty and request their search pages."""
        while True:
            res = self.conn.spop(self.redis_key)
            print(res)
            if res is None:
                # The Redis set is exhausted.
                return
            fields = res.strip().split('###')
            keywords = fields[:-1]
            # The search query uses all keywords except the last one.
            search_body = '+'.join([quote_plus(wd) for wd in keywords[:-1]])
            for page in range(0, 101, 20):
                seed_url = ('https://www.baidu.com/s?ie=utf-8&cl=2&rtt=1&bsst=1&tn=news&word='
                            + search_body + '&tngroupname=organic_news&pn=' + str(page))
                param = {'url': seed_url,
                         'keyword': ' '.join(keywords)}
                yield scrapy.Request(url=seed_url, meta=param, callback=self.collect_newslist, dont_filter=True)

    def collect_newslist(self, response):
        """Follow every news link found on a search result page."""
        selector = etree.HTML(response.text)
        news_links = selector.xpath('//h3[@class="c-title"]/a/@href')
        print(response.meta['keyword'], len(set(news_links)))
        for news_link in news_links:
            param = {'url': news_link,
                     'keyword': response.meta['keyword']}
            yield scrapy.Request(url=news_link, meta=param, callback=self.page_parser, dont_filter=True)

    def page_parser(self, response):
        """Extract the structured news record of a page and emit it as an item."""
        data = self.parser.extract_news(response.text)
        if data:
            item = EventmonitorItem()
            item['keyword'] = response.meta['keyword']
            item['news_url'] = response.meta['url']
            item['news_time'] = data['news_pubtime']
            item['news_date'] = data['news_date']
            item['news_title'] = data['news_title']
            item['news_content'] = data['news_content']
            yield item
|
35996
EventMonitor/EventMonitor/spiders/rel_data.txt
Normal file
35996
EventMonitor/EventMonitor/spiders/rel_data.txt
Normal file
File diff suppressed because it is too large
Load Diff
123
EventMonitor/EventMonitor/spiders/utils.py
Normal file
123
EventMonitor/EventMonitor/spiders/utils.py
Normal file
@ -0,0 +1,123 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding:utf-8 -*-
|
||||
# author: chenhe<hee0624@163.com>
|
||||
# time: 2017-11-30
|
||||
# version: 1.0
|
||||
|
||||
from collections import Counter
|
||||
import jieba.posseg as pseg
|
||||
import re
|
||||
|
||||
|
||||
def is_chinese(uchar):
    """Return True when ``uchar`` lies in the range U+4E00..U+9FA5."""
    return '\u4e00' <= uchar <= '\u9fa5'
|
||||
|
||||
def is_number(uchar):
    """Return True when ``uchar`` is an ASCII digit (U+0030..U+0039)."""
    return u'\u0030' <= uchar <= u'\u0039'
|
||||
|
||||
def is_alphabet(uchar):
    """Return True when ``uchar`` is an ASCII letter (A-Z or a-z)."""
    is_upper = u'\u0041' <= uchar <= u'\u005a'
    is_lower = u'\u0061' <= uchar <= u'\u007a'
    return is_upper or is_lower
|
||||
|
||||
def is_legal(uchar):
    """Return True when ``uchar`` is a Chinese character, a digit or an English letter."""
    if is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar):
        return True
    return False
|
||||
|
||||
def count_pos(str):
    """Return the number of distinct part-of-speech flags produced by ``pseg.cut``."""
    distinct_flags = {flag for _, flag in pseg.cut(str)}
    return len(distinct_flags)
|
||||
|
||||
def is_longsent(str):
    """Return True when the string contains more than 8 Chinese characters."""
    chinese_chars = sum(1 for uchar in str if is_chinese(uchar))
    return chinese_chars > 8
|
||||
|
||||
def clear_title(title_str):
    """Strip site-name suffixes from a page title.

    The title is cut at the first occurrence of its most frequent separator
    (backslash, ``|``, ``/`` or ``_``). Trailing ``-`` separated parts are
    then removed while they contain at most two distinct part-of-speech
    flags, and tab characters are deleted.

    :param title_str: raw title text
    :return: cleaned title
    """
    # '|' must be a single character: the previous two-character '\|' entry
    # could never equal a character of the title, so '|' was never detected.
    seg_set = {'\\', '|', '/', '_'}
    c = Counter(item for item in title_str if item in seg_set)
    if c:
        seg = c.most_common(1)[0][0]
        title = title_str.split(seg)[0]
    else:
        title = title_str

    title = title.replace('——', '-')
    tmp = title.split('-')
    # Drop trailing parts that do not look like real title text.
    while len(tmp) > 1 and count_pos(tmp[-1]) <= 2:
        tmp.pop()
    title = '-'.join(tmp).replace("\t", '')
    return title
|
||||
|
||||
def clear_pan(str):
    """Return the text after the last ``>`` when there are at least two of them, else the input."""
    if str.count('>') < 2:
        return str
    return str.split('>')[-1]
|
||||
|
||||
def drop_null(arg):
    """Remove whitespace from a string, or blank entries from a list of strings.

    :param arg: a string, a list of strings, or any other value
    :return: the string without any whitespace; the list of stripped,
             non-empty items; or ``arg`` unchanged for other types
    """
    if isinstance(arg, str):
        # Raw string: '\s' in a normal literal is an invalid escape sequence.
        return re.sub(r'\s', '', arg, flags=re.S)
    if isinstance(arg, list):
        stripped = (i.strip() for i in arg)
        return [i for i in stripped if i]
    return arg
|
||||
|
||||
def drop_mutil_br(str):
    """Turn ``<br>``/``</br>`` into newlines and collapse whitespace that follows a newline."""
    with_newlines = re.sub(r'<br>|</br>', '\n', str)
    return re.sub(r'\n\s+', '\n', with_newlines)
|
||||
|
||||
|
||||
def drop_mutil_blank(str):
    """Collapse every run of two or more whitespace characters into a single space."""
    return re.sub(r'\s{2,}', ' ', str)
|
31
EventMonitor/README.md
Normal file
31
EventMonitor/README.md
Normal file
@ -0,0 +1,31 @@
|
||||
# EventMonitor
|
||||
Event monitor based on an online news corpus, built via the Baidu search engine from event keywords, for event storyline construction and analysis. 基于给定事件关键词,采集事件资讯,对事件进行挖掘和分析。
|
||||
# 项目路线图
|
||||
![image](https://github.com/liuhuanyong/EventMonitor/blob/master/image/project.png)
|
||||
# 项目细分
|
||||
# 1) 基于话题关键词的话题历时语料库采集
|
||||
执行方式:进入EventMonitor目录下,进入cmd窗口,执行"scrapy crawl eventspider -a keyword=话题关键词",或者直接python crawl.py, 等待数秒后,既可以在news文件夹中存储相应的新闻文件,可以得到相应事件的话题集,话题历史文本
|
||||
![image](https://github.com/liuhuanyong/EventMonitor/blob/master/image/topic.png)
|
||||
![image](https://github.com/liuhuanyong/EventMonitor/blob/master/image/news.png)
|
||||
![image](https://github.com/liuhuanyong/EventMonitor/blob/master/image/content.png)
|
||||
# 2)关于热点事件的情感分析
|
||||
对于1)得到的历史语料,可以使用基于依存语义和情感词库的篇章级情感分析算法进行情感分析
|
||||
这部分参考我的篇章级情感分析项目DocSentimentAnalysis:https://github.com/liuhuanyong/DocSentimentAnalysis
|
||||
# 3)关于热点事件的搜索趋势
|
||||
对于1)得到的历史语料,可以使用百度指数,新浪微博指数进行采集
|
||||
这部分参考我的百度指数采集项目BaiduIndexSpyder:https://github.com/liuhuanyong/BaiduIndexSpyder
|
||||
微博指数采集项目WeiboIndexSpyder:https://github.com/liuhuanyong/WeiboIndexSpyder
|
||||
# 4)关于热点事件的话题分析
|
||||
对于1)得到的历史语料,可以使用LDA,Kmeans模型进行话题分析
|
||||
这部分参考我的话题分析项目Topicluster:https://github.com/liuhuanyong/TopicCluster
|
||||
# 5)关于热点事件的代表性文本分析
|
||||
对于1)得到的历史语料,可以使用跨篇章的textrank算法,对文本集的重要性进行计算和排序
|
||||
这部分参考我的文本重要性分析项目ImportantEventExtractor:https://github.com/liuhuanyong/ImportantEventExtractor
|
||||
# 6)关于热点事件新闻文本的图谱化展示
|
||||
对于得到每个历史新闻事件文本,可以使用关键词,实体识别等关系抽取方法对文本进行可视化展示
|
||||
这部分内容,参考我的文本内容可视化项目项目TextGrapher:https://github.com/liuhuanyong/TextGrapher
|
||||
|
||||
# 结束语
|
||||
关于事件监测的方法有很多,也有很多问题需要去解决,以上提出的方法只是一个尝试,就算法本身还有许多需要改进的地方
|
||||
|
||||
If you have any questions about the project or me, see https://liuhuanyong.github.io/
|
5
EventMonitor/__init__.py
Normal file
5
EventMonitor/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
#!/usr/bin/env python3
|
||||
# coding: utf-8
|
||||
# File: __init__.py.py
|
||||
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
|
||||
# Date: 18-7-15
|
40
EventMonitor/process_redis.py
Normal file
40
EventMonitor/process_redis.py
Normal file
@ -0,0 +1,40 @@
|
||||
#coding=utf-8
|
||||
import redis
|
||||
import os
|
||||
|
||||
class RedisProcess:
    """Fill and drain the ``person_names`` Redis set from ``rel_data.txt``."""

    def __init__(self):
        # os.path.dirname works with any path separator; splitting on '/'
        # did not work on Windows or for a script at the filesystem root.
        cur = os.path.dirname(os.path.abspath(__file__))
        # NOTE(review): the Redis host is hard-coded; confirm it is the same
        # instance the spider reads from.
        self.pool = redis.ConnectionPool(host='192.168.1.37', port=6379, decode_responses=True)
        self.conn = redis.Redis(connection_pool=self.pool)
        self.rel_filepath = os.path.join(cur, 'rel_data.txt')

    def insert_data(self, start_line=833):
        """Add every well-formed line of the data file to the Redis set.

        :param start_line: 1-based number of the first line to load; earlier
                           lines are skipped. The default keeps the previously
                           hard-coded starting point.
        """
        name = 'person_names'
        # The context manager closes the handle, which was previously leaked.
        with open(self.rel_filepath) as rel_file:
            for i, line in enumerate(rel_file, 1):
                if i < start_line:
                    continue
                line = line.strip()
                # Only records with exactly four '###' separated fields are queued.
                if not line or len(line.split('###')) != 4:
                    continue
                self.conn.sadd(name, line)
                print(i)

    def read_data(self):
        """Pop and print members of the Redis set until a falsy value is returned."""
        name = 'person_names'
        res = 1
        while res:
            res = self.conn.spop(name)
            print(res)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Load the relation records into Redis; call read_data() instead to
    # pop and print the queued records.
    RedisProcess().insert_data()
|
||||
|
35996
EventMonitor/rel_data.txt
Normal file
35996
EventMonitor/rel_data.txt
Normal file
File diff suppressed because it is too large
Load Diff
11
EventMonitor/scrapy.cfg
Normal file
11
EventMonitor/scrapy.cfg
Normal file
@ -0,0 +1,11 @@
|
||||
# Automatically created by: scrapy startproject
|
||||
#
|
||||
# For more information about the [deploy] section see:
|
||||
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
||||
|
||||
[settings]
|
||||
default = EventMonitor.settings
|
||||
|
||||
[deploy]
|
||||
#url = http://localhost:6800/
|
||||
project = EventMonitor
|
Loading…
Reference in New Issue
Block a user