Add corpus construction project
parent 119094761b
commit 06a34764d2
12
EventMonitor/.idea/EventMonitor-master.iml
Normal file
@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="TestRunnerService">
    <option name="projectConfiguration" value="Nosetests" />
    <option name="PROJECT_TEST_RUNNER" value="Nosetests" />
  </component>
</module>
4
EventMonitor/.idea/misc.xml
Normal file
@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6.3 (~/anaconda3/bin/python)" project-jdk-type="Python SDK" />
</project>
8
EventMonitor/.idea/modules.xml
Normal file
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/EventMonitor-master.iml" filepath="$PROJECT_DIR$/.idea/EventMonitor-master.iml" />
    </modules>
  </component>
</project>
340
EventMonitor/.idea/workspace.xml
Normal file
@@ -0,0 +1,340 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ChangeListManager">
    <list default="true" id="fa8272ba-2814-489d-b9dc-db4d932a6359" name="Default" comment="" />
    <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
    <option name="TRACKING_ENABLED" value="true" />
    <option name="SHOW_DIALOG" value="false" />
    <option name="HIGHLIGHT_CONFLICTS" value="true" />
    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
    <option name="LAST_RESOLUTION" value="IGNORE" />
  </component>
  <component name="FileEditorManager">
    <leaf>
      <file leaf-file-name="process_redis.py" pinned="false" current-in-tab="true">
        <entry file="file://$PROJECT_DIR$/process_redis.py">
          <provider selected="true" editor-type-id="text-editor">
            <state relative-caret-position="684">
              <caret line="38" column="11" lean-forward="true" selection-start-line="38" selection-start-column="11" selection-end-line="38" selection-end-column="11" />
              <folding>
                <element signature="e#14#26#0" expanded="true" />
              </folding>
            </state>
          </provider>
        </entry>
      </file>
      <file leaf-file-name="news_spider.py" pinned="false" current-in-tab="false">
        <entry file="file://$PROJECT_DIR$/EventMonitor/spiders/news_spider.py">
          <provider selected="true" editor-type-id="text-editor">
            <state relative-caret-position="288">
              <caret line="46" column="21" lean-forward="false" selection-start-line="46" selection-start-column="21" selection-end-line="46" selection-end-column="21" />
              <folding>
                <element signature="e#142#155#0" expanded="true" />
              </folding>
            </state>
          </provider>
        </entry>
      </file>
      <file leaf-file-name="items.py" pinned="false" current-in-tab="false">
        <entry file="file://$PROJECT_DIR$/EventMonitor/items.py">
          <provider selected="true" editor-type-id="text-editor">
            <state relative-caret-position="90">
              <caret line="5" column="13" lean-forward="true" selection-start-line="5" selection-start-column="13" selection-end-line="5" selection-end-column="13" />
              <folding />
            </state>
          </provider>
        </entry>
      </file>
      <file leaf-file-name="pipelines.py" pinned="false" current-in-tab="false">
        <entry file="file://$PROJECT_DIR$/EventMonitor/pipelines.py">
          <provider selected="true" editor-type-id="text-editor">
            <state relative-caret-position="414">
              <caret line="23" column="20" lean-forward="false" selection-start-line="23" selection-start-column="20" selection-end-line="23" selection-end-column="20" />
              <folding>
                <element signature="e#194#203#0" expanded="true" />
              </folding>
            </state>
          </provider>
        </entry>
      </file>
    </leaf>
  </component>
  <component name="FileTemplateManagerImpl">
    <option name="RECENT_TEMPLATES">
      <list>
        <option value="Python Script" />
      </list>
    </option>
  </component>
  <component name="FindInProjectRecents">
    <findStrings>
      <find>prin</find>
    </findStrings>
  </component>
  <component name="IdeDocumentHistory">
    <option name="CHANGED_PATHS">
      <list>
        <option value="$PROJECT_DIR$/EventMonitor/settings.py" />
        <option value="$PROJECT_DIR$/EventMonitor/pipelines.py" />
        <option value="$PROJECT_DIR$/EventMonitor/spiders/news_spider.py" />
        <option value="$PROJECT_DIR$/process_redis.py" />
      </list>
    </option>
  </component>
  <component name="ProjectFrameBounds" extendedState="7">
    <option name="y" value="-7" />
    <option name="width" value="1366" />
    <option name="height" value="732" />
  </component>
  <component name="ProjectView">
    <navigator currentView="ProjectPane" proportions="" version="1">
      <flattenPackages />
      <showMembers />
      <showModules />
      <showLibraryContents />
      <hideEmptyPackages />
      <abbreviatePackageNames />
      <autoscrollToSource />
      <autoscrollFromSource />
      <sortByType />
      <manualOrder />
      <foldersAlwaysOnTop value="true" />
    </navigator>
    <panes>
      <pane id="Scope" />
      <pane id="ProjectPane">
        <subPane>
          <expand>
            <path>
              <item name="EventMonitor-master" type="b2602c69:ProjectViewProjectNode" />
              <item name="EventMonitor-master" type="462c0819:PsiDirectoryNode" />
            </path>
            <path>
              <item name="EventMonitor-master" type="b2602c69:ProjectViewProjectNode" />
              <item name="EventMonitor-master" type="462c0819:PsiDirectoryNode" />
              <item name="EventMonitor" type="462c0819:PsiDirectoryNode" />
            </path>
          </expand>
          <select />
        </subPane>
      </pane>
      <pane id="Scratches" />
    </panes>
  </component>
  <component name="PropertiesComponent">
    <property name="last_opened_file_path" value="$PROJECT_DIR$" />
  </component>
  <component name="RecentsManager">
    <key name="CopyFile.RECENT_KEYS">
      <recent name="$PROJECT_DIR$/EventMonitor/spiders" />
      <recent name="$PROJECT_DIR$/EventMonitor" />
    </key>
    <key name="MoveFile.RECENT_KEYS">
      <recent name="$PROJECT_DIR$" />
      <recent name="$PROJECT_DIR$/EventMonitor" />
      <recent name="$PROJECT_DIR$/EventMonitor/spiders" />
    </key>
  </component>
  <component name="RunDashboard">
    <option name="ruleStates">
      <list>
        <RuleState>
          <option name="name" value="ConfigurationTypeDashboardGroupingRule" />
        </RuleState>
        <RuleState>
          <option name="name" value="StatusDashboardGroupingRule" />
        </RuleState>
      </list>
    </option>
  </component>
  <component name="RunManager" selected="Python.process_redis">
    <configuration name="process_redis" type="PythonConfigurationType" factoryName="Python" temporary="true">
      <option name="INTERPRETER_OPTIONS" value="" />
      <option name="PARENT_ENVS" value="true" />
      <envs>
        <env name="PYTHONUNBUFFERED" value="1" />
      </envs>
      <option name="SDK_HOME" value="" />
      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
      <option name="IS_MODULE_SDK" value="true" />
      <option name="ADD_CONTENT_ROOTS" value="true" />
      <option name="ADD_SOURCE_ROOTS" value="true" />
      <module name="EventMonitor-master" />
      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/process_redis.py" />
      <option name="PARAMETERS" value="" />
      <option name="SHOW_COMMAND_LINE" value="false" />
      <option name="EMULATE_TERMINAL" value="false" />
    </configuration>
    <recent_temporary>
      <list size="1">
        <item index="0" class="java.lang.String" itemvalue="Python.process_redis" />
      </list>
    </recent_temporary>
  </component>
  <component name="ShelveChangesManager" show_recycled="false">
    <option name="remove_strategy" value="false" />
  </component>
  <component name="TaskManager">
    <task active="true" id="Default" summary="Default task">
      <changelist id="fa8272ba-2814-489d-b9dc-db4d932a6359" name="Default" comment="" />
      <created>1543216322930</created>
      <option name="number" value="Default" />
      <option name="presentableId" value="Default" />
      <updated>1543216322930</updated>
    </task>
    <servers />
  </component>
  <component name="ToolWindowManager">
    <frame x="0" y="-7" width="1366" height="732" extended-state="7" />
    <editor active="true" />
    <layout>
      <window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.18887262" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
      <window_info id="TODO" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="6" side_tool="false" content_ui="tabs" />
      <window_info id="Event Log" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="true" content_ui="tabs" />
      <window_info id="Run" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.3296" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
      <window_info id="Version Control" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="false" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
      <window_info id="Python Console" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
      <window_info id="Structure" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
      <window_info id="Terminal" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
      <window_info id="Debug" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
      <window_info id="Favorites" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="true" content_ui="tabs" />
      <window_info id="Data View" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="-1" side_tool="false" content_ui="tabs" />
      <window_info id="Cvs" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="4" side_tool="false" content_ui="tabs" />
      <window_info id="Hierarchy" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="2" side_tool="false" content_ui="combo" />
      <window_info id="Message" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
      <window_info id="Commander" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
      <window_info id="Find" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
      <window_info id="Inspection" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="5" side_tool="false" content_ui="tabs" />
      <window_info id="Ant Build" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
    </layout>
  </component>
  <component name="VcsContentAnnotationSettings">
    <option name="myLimit" value="2678400000" />
  </component>
  <component name="XDebuggerManager">
    <breakpoint-manager>
      <option name="time" value="1" />
    </breakpoint-manager>
    <watches-manager />
  </component>
  <component name="editorHistoryManager">
    <entry file="file://$PROJECT_DIR$/EventMonitor/crawl.py">
      <provider selected="true" editor-type-id="text-editor">
        <state relative-caret-position="90">
          <caret line="5" column="1" lean-forward="false" selection-start-line="5" selection-start-column="1" selection-end-line="5" selection-end-column="1" />
          <folding />
        </state>
      </provider>
    </entry>
    <entry file="file://$PROJECT_DIR$/rel_data.txt">
      <provider selected="true" editor-type-id="text-editor">
        <state relative-caret-position="198">
          <caret line="11" column="0" lean-forward="false" selection-start-line="11" selection-start-column="0" selection-end-line="11" selection-end-column="0" />
          <folding />
        </state>
      </provider>
    </entry>
    <entry file="file://$PROJECT_DIR$/EventMonitor/__init__.py">
      <provider selected="true" editor-type-id="text-editor">
        <state relative-caret-position="0">
          <caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
          <folding />
        </state>
      </provider>
    </entry>
    <entry file="file://$PROJECT_DIR$/__init__.py">
      <provider selected="true" editor-type-id="text-editor">
        <state relative-caret-position="0">
          <caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
          <folding />
        </state>
      </provider>
    </entry>
    <entry file="file://$PROJECT_DIR$/EventMonitor/spiders/utils.py">
      <provider selected="true" editor-type-id="text-editor">
        <state relative-caret-position="0">
          <caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
          <folding />
        </state>
      </provider>
    </entry>
    <entry file="file://$PROJECT_DIR$/EventMonitor/settings.py">
      <provider selected="true" editor-type-id="text-editor">
        <state relative-caret-position="-380">
          <caret line="44" column="137" lean-forward="false" selection-start-line="44" selection-start-column="137" selection-end-line="44" selection-end-column="137" />
          <folding />
        </state>
      </provider>
    </entry>
    <entry file="file://$PROJECT_DIR$/EventMonitor/middlewares.py">
      <provider selected="true" editor-type-id="text-editor">
        <state relative-caret-position="0">
          <caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
          <folding />
        </state>
      </provider>
    </entry>
    <entry file="file://$PROJECT_DIR$/EventMonitor/spiders/handle_html.py">
      <provider selected="true" editor-type-id="text-editor">
        <state relative-caret-position="-852">
          <caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
          <folding />
        </state>
      </provider>
    </entry>
    <entry file="file://$PROJECT_DIR$/rel_data.txt">
      <provider selected="true" editor-type-id="text-editor">
        <state relative-caret-position="0">
          <caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
          <folding />
        </state>
      </provider>
    </entry>
    <entry file="file://$PROJECT_DIR$/EventMonitor/pipelines.py">
      <provider selected="true" editor-type-id="text-editor">
        <state relative-caret-position="414">
          <caret line="23" column="20" lean-forward="false" selection-start-line="23" selection-start-column="20" selection-end-line="23" selection-end-column="20" />
          <folding>
            <element signature="e#194#203#0" expanded="true" />
          </folding>
        </state>
      </provider>
    </entry>
    <entry file="file://$PROJECT_DIR$/EventMonitor/items.py">
      <provider selected="true" editor-type-id="text-editor">
        <state relative-caret-position="90">
          <caret line="5" column="13" lean-forward="true" selection-start-line="5" selection-start-column="13" selection-end-line="5" selection-end-column="13" />
          <folding />
        </state>
      </provider>
    </entry>
    <entry file="file://$PROJECT_DIR$/EventMonitor/spiders/rel_data.txt">
      <provider selected="true" editor-type-id="text-editor">
        <state relative-caret-position="0">
          <caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
          <folding />
        </state>
      </provider>
    </entry>
    <entry file="file://$PROJECT_DIR$/EventMonitor/spiders/news_spider.py">
      <provider selected="true" editor-type-id="text-editor">
        <state relative-caret-position="288">
          <caret line="46" column="21" lean-forward="false" selection-start-line="46" selection-start-column="21" selection-end-line="46" selection-end-column="21" />
          <folding>
            <element signature="e#142#155#0" expanded="true" />
          </folding>
        </state>
      </provider>
    </entry>
    <entry file="file://$PROJECT_DIR$/process_redis.py">
      <provider selected="true" editor-type-id="text-editor">
        <state relative-caret-position="684">
          <caret line="38" column="11" lean-forward="true" selection-start-line="38" selection-start-column="11" selection-end-line="38" selection-end-column="11" />
          <folding>
            <element signature="e#14#26#0" expanded="true" />
          </folding>
        </state>
      </provider>
    </entry>
  </component>
</project>
0
EventMonitor/EventMonitor/__init__.py
Normal file
BIN
EventMonitor/EventMonitor/__init__.pyc
Normal file
Binary file not shown.
BIN
EventMonitor/EventMonitor/__pycache__/__init__.cpython-35.pyc
Normal file
Binary file not shown.
BIN
EventMonitor/EventMonitor/__pycache__/items.cpython-35.pyc
Normal file
Binary file not shown.
BIN
EventMonitor/EventMonitor/__pycache__/pipelines.cpython-35.pyc
Normal file
Binary file not shown.
BIN
EventMonitor/EventMonitor/__pycache__/settings.cpython-35.pyc
Normal file
Binary file not shown.
19
EventMonitor/EventMonitor/items.py
Normal file
@@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class EventmonitorItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    keyword = scrapy.Field()
    news_url = scrapy.Field()
    news_time = scrapy.Field()
    news_date = scrapy.Field()
    news_title = scrapy.Field()
    news_content = scrapy.Field()
103
EventMonitor/EventMonitor/middlewares.py
Normal file
@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class EventmonitorSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class EventmonitorDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
46
EventMonitor/EventMonitor/pipelines.py
Normal file
@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import os
import pymongo


class EventmonitorPipeline(object):
    def __init__(self):
        CUR = '/'.join(os.path.abspath(__file__).split('/')[:-2])
        self.news_path = os.path.join(CUR, 'news')
        if not os.path.exists(self.news_path):
            os.makedirs(self.news_path)
        conn = pymongo.MongoClient('192.168.1.37', 27017)
        self.col = conn['person_rel_dataset']['docs']

    '''Store each collected news item in MongoDB'''
    def process_item(self, item, spider):
        try:
            self.col.insert_one(dict(item))
        except (pymongo.errors.WriteError, KeyError):
            pass
            # raise DropItem("Duplicated Item: {}".format(item['name']))
        return item

    # '''Alternative pipeline: write each item to a local file'''
    # def process_item(self, item, spider):
    #     print(item)
    #     keyword = item['keyword']
    #     event_path = os.path.join(self.news_path, keyword)
    #     if not os.path.exists(event_path):
    #         os.makedirs(event_path)
    #     filename = os.path.join(event_path, item['news_date'] + '@' + item['news_title'])
    #     self.save_localfile(filename, item['news_title'], item['news_time'], item['news_content'])
    #     return item
    #
    # '''Write title, publish time and body text to a file'''
    # def save_localfile(self, filename, title, pubtime, content):
    #     with open(filename, 'w+') as f:
    #         f.write('Title: {0}\n'.format(title))
    #         f.write('Publish time: {0}\n'.format(pubtime))
    #         f.write('Body: {0}\n'.format(content))
    #         f.close()
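For reference, a minimal read-back sketch for the pipeline above (assumptions: MongoDB is reachable at the host and port hard-coded in pipelines.py, and pymongo >= 3.7 for count_documents):

    import pymongo

    conn = pymongo.MongoClient('192.168.1.37', 27017)  # host/port as in EventmonitorPipeline
    col = conn['person_rel_dataset']['docs']           # same database/collection the pipeline writes to
    print(col.count_documents({}))                     # number of stored news items
    for doc in col.find().limit(3):
        print(doc['keyword'], doc['news_title'])       # fields defined in items.py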
90
EventMonitor/EventMonitor/settings.py
Normal file
@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-

# Scrapy settings for EventMonitor project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'EventMonitor'

SPIDER_MODULES = ['EventMonitor.spiders']
NEWSPIDER_MODULE = 'EventMonitor.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'EventMonitor (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/600.5.17 (KHTML, like Gecko) Version/8.0.5 Safari/600.5.17',
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'EventMonitor.middlewares.EventmonitorSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'EventMonitor.middlewares.EventmonitorDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'EventMonitor.pipelines.EventmonitorPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
BIN
EventMonitor/EventMonitor/settings.pyc
Normal file
Binary file not shown.
4
EventMonitor/EventMonitor/spiders/__init__.py
Normal file
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
BIN
EventMonitor/EventMonitor/spiders/__init__.pyc
Normal file
Binary file not shown.
318
EventMonitor/EventMonitor/spiders/extract_news.py
Normal file
@@ -0,0 +1,318 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
from collections import Counter
from operator import itemgetter
import copy
from lxml import etree
from .handle_html import *
from .utils import *


class NewsParser:
    def __init__(self):
        self.score = 6   # threshold separating body-like paragraphs from noise
        self.length = 5  # max gap (in <div> index) allowed when clustering

    def _cal_score(self, text):
        """Score a paragraph by how much sentence punctuation it contains."""
        if "。" not in text:
            if "," in text:
                return 0
            else:
                return -1
        else:
            score = text.count(',') + 1
            score += text.count(',') + 1
            score += text.count(';')
            score += text.count('。')
            return score

    def _line_div(self, html):
        """Normalize <div>/<table> boundaries into a flat sequence of <div> blocks."""
        html = re.sub("</?div>|</?table>", "</div><div>", html, flags=re.I)
        html = html.replace('</div>', '', 1)
        index = html.rfind('<div>')
        html = html[:index] + html[index:].replace('<div>', '', 1)
        return html

    def _line_p(self, text):
        """Rewrite <p> boundaries so body paragraphs carry class="news_body"."""
        text = re.sub(r'</?p\s?.*?>', r'</p><p class="news_body">', text, flags=re.I | re.S)
        text = text.replace('</p>', '', 1)
        index = text.rfind('<p>')
        text = text[:index] + text[index:].replace('<p>', '', 1)
        text = '<p class="news_head">{0}</p>'.format(text)
        return text

    def _extract_paragraph(self, html):
        """Split the page into <div> paragraphs and bucket them by score."""
        cluster_para = {}
        absorb_para = {}
        for index, div_str in enumerate(re.findall("<div>(.*?)</div>", html, flags=re.S | re.I)):
            if len(div_str.strip()) == 0:
                continue
            para_str = div_str.strip()
            score = self._cal_score(para_str)
            if score > self.score:
                cluster_para[index] = [para_str, score]
            else:
                absorb_para[index] = [para_str, score]
        return cluster_para, absorb_para

    def _extract_feature(self, para_dict):
        """Use the most common <p ...> tag of the best-scoring paragraph as the feature."""
        c = Counter()
        index, text = max(para_dict.items(), key=lambda asd: asd[1][1])
        feature_list = re.findall("(<p.*?>)", text[0], flags=re.I | re.S)
        for feature in feature_list:
            c[feature] += 1
        if c.most_common(1):
            feature, amount = c.most_common(1)[0]
        else:
            feature = ''
        feature = feature.replace('(', '\\(').replace(')', '\\)')
        return index, feature

    def _gen_skeleton(self, para_dict, index, feature):
        """Cluster the high-score paragraphs around the seed into a body-text skeleton."""
        skeleton_dict = {}
        num_list = []
        if not feature:
            skeleton_dict[index] = para_dict[index]
            return skeleton_dict
        for num in para_dict.keys():
            num_list.append(num)
        num_list = sorted(num_list)
        od = num_list.index(index)
        f_list = num_list[0:od]
        l_list = num_list[od:len(num_list)]
        # cluster forwards
        while l_list:
            tmp = l_list.pop(0)
            length = abs(tmp - index)
            if length < self.length:
                if re.match(r".*?{0}".format(feature), para_dict[tmp][0], flags=re.S | re.I):
                    skeleton_dict[tmp] = para_dict[tmp]
                    index = tmp
        # cluster backwards
        while f_list:
            tmp = f_list.pop()
            length = abs(index - tmp)
            if length < self.length:
                if re.match(r".*?{0}".format(feature), para_dict[tmp][0], flags=re.S):
                    skeleton_dict[tmp] = para_dict[tmp]
                    index = tmp
        return skeleton_dict

    def _absorb_text(self, skeleton_dict, para_dict):
        """Absorb usable paragraphs from the noise set back into the skeleton."""
        content_dict = skeleton_dict
        sk_list = skeleton_dict.keys()
        pa_list = para_dict.keys()
        sk_list = sorted(sk_list)
        pa_list = sorted(pa_list)
        heads = []
        middle = []
        tail = []
        for each in pa_list:
            if each < sk_list[0]:
                heads.append(each)
            if each > sk_list[-1]:
                tail.append(each)
            if (each >= sk_list[0]) and (each <= sk_list[-1]):
                middle.append(each)
        while heads:
            tmp = heads.pop()
            index = sk_list[0]
            if abs(tmp - index) < self.length:
                if para_dict[tmp][1] * 2 > self.score:
                    content_dict[tmp] = para_dict[tmp]
            else:
                break
        while tail:
            tmp = tail.pop(0)
            index = sk_list[-1]
            if abs(tmp - index) < self.length:
                if para_dict[tmp][1] * 2 > self.score:
                    content_dict[tmp] = para_dict[tmp]
            else:
                break
        while middle:
            tmp = middle.pop()
            if para_dict[tmp][1] * 2 > self.score:
                content_dict[tmp] = para_dict[tmp]
        return content_dict

    def _substring(self, text):
        """Reduce one paragraph block to its plain text."""
        text = self._line_p(text)
        text = pretty_html(text)
        selector = etree.HTML(text)
        xpath_result = selector.xpath('//p')
        if len(xpath_result) == 1:
            sub_string = xpath_result[0].xpath('string(.)')
            sub_string = drop_mutil_br(sub_string)
        else:
            text_list = []
            xpath_result = selector.xpath('//p[@class="news_body"]')
            for item in xpath_result:
                p_string = item.xpath('string(.)').strip()
                if not p_string:
                    continue
                p_string = drop_null(p_string)
                text_list.append(p_string)
            if text_list:
                sub_string = '\n'.join(text_list)
            else:
                sub_string = ''
        return sub_string

    def _pretty_text(self, index_content_list):
        contents = list()
        for each in index_content_list:
            sub_text = self._substring(each[1][0])
            if not sub_text:
                continue
            else:
                contents.append(sub_text)
        text = "\n".join(contents)
        return text

    def extract_news(self, html):
        html = handle_html(html)
        html = self._line_div(html)
        index = 0
        index_content_list = []  # initialized up front so the inner helpers are safe even without a skeleton
        top_div_list = list()    # (index, div) pairs that precede the detected body
        top_text = ''
        cluster_para, absorb_para = self._extract_paragraph(html)
        if cluster_para:
            index, feature = self._extract_feature(cluster_para)
            skeleton_dict = self._gen_skeleton(cluster_para, index, feature)
            if skeleton_dict:
                if absorb_para:
                    content_dict = self._absorb_text(skeleton_dict, absorb_para)
                else:
                    content_dict = skeleton_dict
                index_content_list = sorted(content_dict.items(), key=itemgetter(0))

                index = index_content_list[0][0]
                for ind, each_div in enumerate(re.findall("<div>(.*?)</div>", html, flags=re.S)):
                    if ind >= index:
                        break
                    top_text += each_div
                    top_div_list.append((ind, each_div))
        else:
            return

        '''Extract the body text'''
        def extract_content():
            text = ''
            if index_content_list:
                text = self._pretty_text(index_content_list)
                text = text.strip()
            return text

        '''Extract the publish time'''
        def extract_pubtime():
            pubtime = ''
            tmp_top_div_list = copy.deepcopy(top_div_list)
            while tmp_top_div_list:
                ind, item = tmp_top_div_list.pop()
                if not item.strip():
                    continue
                div_selector = etree.HTML(item)
                if div_selector is None:
                    continue
                div_text = div_selector.xpath('string(.)').strip()
                if not div_text:
                    continue
                pubtime = re.search(r'(\d{4}\s*[年\-:/]\s*)\d{1,2}\s*[月\-:/]\s*\d{1,2}\s*[\-_:日]?\s*\d{1,2}\s*:\s*\d{1,2}\s*(:\s*\d{1,2})?', div_text, flags=re.S | re.I)
                if pubtime:
                    pubtime = pubtime.group()
                    index = ind
                    break
            if not pubtime:
                # fall back to a date without an hour:minute part
                tmp_top_div_list = copy.deepcopy(top_div_list)
                while tmp_top_div_list:
                    ind, item = tmp_top_div_list.pop()
                    if not item.strip():
                        continue
                    div_selector = etree.HTML(item)
                    if div_selector is None:
                        continue
                    div_text = div_selector.xpath('string(.)')
                    pubtime = re.search(r'(\d{4}\s*[年\-:/]\s*)\d{1,2}\s*[月\-:/]\s*\d{1,2}\s*[\-_:日/]?', div_text, flags=re.S)
                    if pubtime:
                        pubtime = pubtime.group()
                        index = ind
                        break
            if pubtime:
                pubtime = pubtime.strip()
                pubtime = pubtime.replace('年', '-').replace('月', '-').replace('日', ' ').replace('/', '-')
                pubtime = drop_mutil_blank(pubtime)
                return pubtime, index
            else:
                return pubtime, 0

        '''Extract the title'''
        def extract_title():
            title = ''
            selector = etree.HTML(html)
            tmps = selector.xpath('//title/text()')
            if tmps:
                title = tmps[0].strip()
                title = clear_title(title)
            return title

        news = {}
        news_content = extract_content()
        news_pubtime, index = extract_pubtime()
        news_title = extract_title()
        news['news_content'] = news_content
        news['news_pubtime'] = self.pretty_time(news_pubtime)
        if news['news_pubtime']:
            news['news_date'] = news['news_pubtime'].split(' ')[0]
        else:
            news['news_date'] = ''
        news['news_title'] = news_title

        if not (news['news_content'] and news['news_pubtime'] and news['news_title'] and news['news_date']):
            return {}

        return news

    '''Normalize a time string'''
    def pretty_time(self, time):
        if not time:
            return None
        modify_time = time
        if len(time.split(' ')) == 2:
            date = modify_time.split(' ')[0]
            hour = modify_time.split(' ')[1]
            date_new = self.pretty_date(date)
            modify_time = ' '.join([date_new, hour])
        else:
            date = modify_time.split(' ')[0]
            modify_time = self.pretty_date(date)

        return modify_time

    '''Normalize a year-month-day string'''
    def pretty_date(self, date):
        date = date.split('-')
        if len(date) != 3:
            return ''
        year = date[0]
        month = date[1]
        day = date[2]
        if int(month) < 10 and len(month) == 1:
            month = '0' + month

        if int(day) < 10 and len(day) == 1:
            day = '0' + day
        date_new = '-'.join([year, month, day])
        return date_new
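A minimal usage sketch for the parser above (the sample file name is hypothetical; the dict keys are those set by extract_news):

    from EventMonitor.spiders.extract_news import NewsParser

    html = open('sample_news.html', encoding='utf-8').read()  # hypothetical saved news page
    news = NewsParser().extract_news(html)
    if news:  # a falsy result means title, body and publish time could not all be recovered
        print(news['news_title'], news['news_date'])
        print(news['news_content'][:200])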
BIN
EventMonitor/EventMonitor/spiders/extract_news.pyc
Normal file
Binary file not shown.
74
EventMonitor/EventMonitor/spiders/handle_html.py
Normal file
@@ -0,0 +1,74 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author: chenhe<hee0624@163.com>
# time: 2017-11-30
# version: 1.0

from html.parser import HTMLParser
from bs4 import BeautifulSoup


class StripParser(HTMLParser):
    """
    Strip a fixed set of unwanted tags (and their contents) from HTML.
    """
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.drop_tags = {'script', 'style', 'iframe', 'aside', 'nav', 'footer'}
        self.fed = []
        self.point_tags = []
        self.is_fed = True

    def handle_starttag(self, tag, attrs):
        if tag in self.drop_tags:
            self.is_fed = False
            self.point_tags.append(tag)
        else:
            if tag == 'p':
                # re-emit <p> with its attributes preserved
                tmp_attrs = ['{0}="{1}"'.format(i[0], i[1]) for i in attrs]
                tmp_attrs = ' '.join(tmp_attrs)
                self.fed.append('<p {}>'.format(tmp_attrs))
            else:
                self.fed.append('<{}>'.format(tag))

    def handle_data(self, data):
        if self.is_fed:
            self.fed.append(data)

    def handle_endtag(self, tag):
        if tag in self.drop_tags:
            if self.point_tags and tag == self.point_tags[-1]:
                self.point_tags.pop()
            if not self.point_tags:
                self.is_fed = True
        else:
            self.fed.append('</{}>'.format(tag))

    def get_html(self):
        return '\n'.join(self.fed)


def pretty_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    fixed_html = soup.prettify()
    return fixed_html


def strip_tag(html):
    """
    Strip the unwanted tags from an HTML string.
    :param html: string
    :return: string
    """
    s = StripParser()
    s.feed(html)
    return s.get_html()


def handle_html(html):
    html = pretty_html(html)
    html = strip_tag(html)
    return html
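A quick sketch of the helper above on a made-up input:

    from EventMonitor.spiders.handle_html import handle_html

    raw = '<div><script>var x = 1;</script><p class="a">正文内容</p></div>'
    print(handle_html(raw))  # the script block is dropped; the <p> keeps its attributes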
BIN
EventMonitor/EventMonitor/spiders/handle_html.pyc
Normal file
Binary file not shown.
86
EventMonitor/EventMonitor/spiders/news_spider.py
Normal file
@@ -0,0 +1,86 @@
#!/usr/bin/env python3
# coding: utf-8
# File: news_spider.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-7-15

import os
import redis
import scrapy
from lxml import etree
from urllib.parse import quote_plus
from .extract_news import *
from EventMonitor.items import EventmonitorItem


class BuildData:
    def __init__(self):
        cur = '/'.join(os.path.abspath(__file__).split('/')[:-1])
        self.rel_filepath = os.path.join(cur, 'rel_data.txt')
        self.seed_rels = self.collect_rels()

    '''Load the seed relation dataset'''
    def collect_rels(self):
        rels_data = []
        for line in open(self.rel_filepath):
            line = line.strip().split('###')
            keywords = line[:-2]
            rels_data.append(keywords)
        return rels_data


class NewsSpider(scrapy.Spider):
    name = 'eventspider'

    def __init__(self):
        self.seed_rels = BuildData().seed_rels
        self.parser = NewsParser()
        self.pool = redis.ConnectionPool(host='192.168.1.29', port=6379, decode_responses=True)
        self.conn = redis.Redis(connection_pool=self.pool)
        self.redis_key = 'person_names'

    '''Main entry: pop keyword entries from Redis and build Baidu News search requests'''
    def start_requests(self):
        while 1:
            res = self.conn.spop(self.redis_key)
            print(res)
            if str(res) == 'None':  # the set is drained
                return
            line = res.strip().split('###')
            keywords = line[:-1]
            search_body = '+'.join([quote_plus(wd) for wd in keywords[:-1]])
            seed_urls = []
            for page in range(0, 101, 20):
                url = 'https://www.baidu.com/s?ie=utf-8&cl=2&rtt=1&bsst=1&tn=news&word=' + search_body + '&tngroupname=organic_news&pn=' + str(page)
                seed_urls.append(url)
            for seed_url in seed_urls:
                param = {'url': seed_url,
                         'keyword': ' '.join(keywords)}
                yield scrapy.Request(url=seed_url, meta=param, callback=self.collect_newslist, dont_filter=True)

    '''Collect the news links from a search result page'''
    def collect_newslist(self, response):
        selector = etree.HTML(response.text)
        news_links = selector.xpath('//h3[@class="c-title"]/a/@href')
        print(response.meta['keyword'], len(set(news_links)))
        for news_link in news_links:
            param = {'url': news_link,
                     'keyword': response.meta['keyword']}
            yield scrapy.Request(url=news_link, meta=param, callback=self.page_parser, dont_filter=True)

    '''Extract structured news from an article page'''
    def page_parser(self, response):
        data = self.parser.extract_news(response.text)
        if data:
            item = EventmonitorItem()
            item['keyword'] = response.meta['keyword']
            item['news_url'] = response.meta['url']
            item['news_time'] = data['news_pubtime']
            item['news_date'] = data['news_date']
            item['news_title'] = data['news_title']
            item['news_content'] = data['news_content']
            yield item
        return
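For reference, a sketch of the queue entry this spider consumes (the entry value is made up; the four-field '###' layout matches the validity check in process_redis.py below):

    import redis

    conn = redis.Redis(host='192.168.1.29', port=6379, decode_responses=True)  # host as in NewsSpider
    conn.sadd('person_names', '张三###李四###夫妻###1')  # hypothetical person-pair entry
    # start_requests() pops such entries, queries Baidu News for the first two
    # fields ("张三+李四") and tags resulting items with keyword "张三 李四 夫妻".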
35996
EventMonitor/EventMonitor/spiders/rel_data.txt
Normal file
File diff suppressed because it is too large
123
EventMonitor/EventMonitor/spiders/utils.py
Normal file
@@ -0,0 +1,123 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author: chenhe<hee0624@163.com>
# time: 2017-11-30
# version: 1.0

from collections import Counter
import jieba.posseg as pseg
import re


def is_chinese(uchar):
    """Check whether a unicode character is a Chinese character."""
    if uchar >= '\u4e00' and uchar <= '\u9fa5':
        return True
    else:
        return False


def is_number(uchar):
    """Check whether a unicode character is a digit."""
    if uchar >= u'\u0030' and uchar <= u'\u0039':
        return True
    else:
        return False


def is_alphabet(uchar):
    """Check whether a unicode character is an English letter."""
    if (uchar >= u'\u0041' and uchar <= u'\u005a') or (uchar >= u'\u0061' and uchar <= u'\u007a'):
        return True
    else:
        return False


def is_legal(uchar):
    """Return True if the character is a Chinese character, digit or English letter."""
    if not (is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar)):
        return False
    else:
        return True


def count_pos(str):
    """Return the number of distinct part-of-speech tags in a string."""
    pos_set = set()
    words = pseg.cut(str)
    for word, flag in words:
        pos_set.add(flag)
    return len(pos_set)


def is_longsent(str):
    """Judge by the number of Chinese characters whether a string is long enough to be a title."""
    length = 0
    for uchar in str:
        if is_chinese(uchar):
            length += 1
        else:
            pass
    if length > 8:
        return True
    else:
        return False


def clear_title(title_str):
    """Cut site/channel boilerplate off a page title."""
    seg_set = {'\\', '|', '/', '_'}
    c = Counter()
    for item in title_str:
        if item in seg_set:
            c[item] += 1
    if c.most_common(1):
        seg, count = c.most_common(1)[0]
    else:
        seg, count = '', 0
    if seg:
        title = title_str.split(seg)[0]
    else:
        title = title_str

    title = title.replace('——', '-')
    tmp = title.split('-')
    is_continue = True
    while is_continue:
        if len(tmp) > 1:
            top = tmp[-1]
            pos_num = count_pos(top)
            if pos_num > 2:
                is_continue = False
            else:
                tmp.pop()
        else:
            is_continue = False
    title = '-'.join(tmp).replace("\t", '')
    return title


def clear_pan(str):
    """Keep only the last segment of a breadcrumb-like 'a > b > c' string."""
    num = str.count('>')
    if num >= 2:
        return str.split('>')[-1]
    else:
        return str


def drop_null(arg):
    """Remove all whitespace from a string, or empty elements from a list."""
    if isinstance(arg, str):
        arg = re.sub(r'\s', '', arg, flags=re.S)
        return arg
    elif isinstance(arg, list):
        new_list = []
        for i in arg:
            i = i.strip()
            if i:
                new_list.append(i)
            else:
                continue
        return new_list
    else:
        return arg


def drop_mutil_br(str):
    """Turn <br> tags into newlines and strip the indentation that follows them."""
    str = re.sub(r'<br>|</br>', '\n', str)
    str = re.sub(r'\n\s+', '\n', str)
    return str


def drop_mutil_blank(str):
    """Collapse runs of whitespace into a single space."""
    str = re.sub(r'\s{2,}', ' ', str)
    return str
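A small example of the title cleaner above (hypothetical input):

    from EventMonitor.spiders.utils import clear_title

    # '_' is the most frequent separator here, so the site/channel suffix is cut off
    print(clear_title('某某事件最新进展_新闻频道_某某网'))  # -> 某某事件最新进展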
31
EventMonitor/README.md
Normal file
@@ -0,0 +1,31 @@
# EventMonitor
Event monitor based on an online news corpus built via the Baidu search engine from event keywords, for event storylines and analysis: given the keywords of an event, it collects related news and mines and analyzes the event.
# Project roadmap
![image](https://github.com/liuhuanyong/EventMonitor/blob/master/image/project.png)
# Project breakdown
# 1) Collecting a diachronic topic corpus from topic keywords
How to run: go into the EventMonitor directory, open a cmd window and execute "scrapy crawl eventspider -a keyword=<topic keyword>", or simply run python crawl.py; after a few seconds the corresponding news files are saved under the news folder, yielding the topic set and the historical texts of the event (a programmatic sketch follows below).
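A hedged sketch of what `python crawl.py` is assumed to do (crawl.py itself is not shown in this commit; the sketch wraps the spider in Scrapy's CrawlerProcess):

```python
# run from the EventMonitor project root
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('eventspider')  # spider name declared in news_spider.py
process.start()
```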
![image](https://github.com/liuhuanyong/EventMonitor/blob/master/image/topic.png)
![image](https://github.com/liuhuanyong/EventMonitor/blob/master/image/news.png)
![image](https://github.com/liuhuanyong/EventMonitor/blob/master/image/content.png)
# 2) Sentiment analysis of the trending event
The historical corpus from 1) can be analyzed with a document-level sentiment analysis algorithm based on dependency semantics and a sentiment lexicon.
For this part, see my document-level sentiment analysis project DocSentimentAnalysis: https://github.com/liuhuanyong/DocSentimentAnalysis
# 3) Search trends of the trending event
For the event from 1), Baidu Index and Sina Weibo Index data can be collected.
See my Baidu index crawler BaiduIndexSpyder: https://github.com/liuhuanyong/BaiduIndexSpyder
and my Weibo index crawler WeiboIndexSpyder: https://github.com/liuhuanyong/WeiboIndexSpyder
# 4) Topic analysis of the trending event
The corpus from 1) can be clustered into topics with LDA or KMeans models.
See my topic analysis project TopicCluster: https://github.com/liuhuanyong/TopicCluster
# 5) Representative-text analysis of the trending event
The corpus from 1) can be scored and ranked for importance with a cross-document TextRank algorithm.
See my text importance analysis project ImportantEventExtractor: https://github.com/liuhuanyong/ImportantEventExtractor
# 6) Graph-style visualization of the event news texts
Each collected news text can be visualized with keyword extraction, entity recognition and relation extraction methods.
For this part, see my text visualization project TextGrapher: https://github.com/liuhuanyong/TextGrapher

# Closing remarks
There are many methods for event monitoring and many open problems; the approach above is one attempt, and the algorithms themselves still leave much room for improvement.

If any question about the project or me, see https://liuhuanyong.github.io/
5
EventMonitor/__init__.py
Normal file
@@ -0,0 +1,5 @@
#!/usr/bin/env python3
# coding: utf-8
# File: __init__.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-7-15
40
EventMonitor/process_redis.py
Normal file
@@ -0,0 +1,40 @@
#coding=utf-8
import os
import redis


class RedisProcess:
    def __init__(self):
        cur = '/'.join(os.path.abspath(__file__).split('/')[:-1])
        self.pool = redis.ConnectionPool(host='192.168.1.37', port=6379, decode_responses=True)
        self.conn = redis.Redis(connection_pool=self.pool)
        self.rel_filepath = os.path.join(cur, 'rel_data.txt')

    def insert_data(self):
        """Push the '###'-separated relation lines into the 'person_names' set."""
        name = 'person_names'
        i = 0
        for line in open(self.rel_filepath):
            i += 1
            if i < 833:  # hard-coded resume offset: skip the first 832 lines
                continue
            line = line.strip()
            if not line or len(line.split('###')) != 4:  # keep only well-formed 4-field lines
                continue
            self.conn.sadd(name, line)
            print(i)

    def read_data(self):
        """Pop and print entries until the set is empty."""
        name = 'person_names'
        res = 1
        while res:
            res = self.conn.spop(name)
            print(res)


if __name__ == '__main__':
    handler = RedisProcess()
    handler.insert_data()
    # handler.read_data()
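Taken together with news_spider.py, the assumed workflow is: seed the Redis set, then run the spider until the set is drained. Note that the two files point at different Redis hosts (192.168.1.37 here vs 192.168.1.29 in the spider); presumably both must refer to the same instance:

    # 1) seed the 'person_names' set from rel_data.txt (run from the project root)
    from process_redis import RedisProcess
    RedisProcess().insert_data()

    # 2) then launch the crawler, which spop()s entries until none remain:
    #    scrapy crawl eventspider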
35996
EventMonitor/rel_data.txt
Normal file
File diff suppressed because it is too large
11
EventMonitor/scrapy.cfg
Normal file
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = EventMonitor.settings

[deploy]
#url = http://localhost:6800/
project = EventMonitor