IP代理池插件

This commit is contained in:
mxd 2019-08-24 17:37:23 +08:00
parent 80bcd6b8f3
commit 3e8abd250a
11 changed files with 439 additions and 1 deletions

View File

@ -46,7 +46,7 @@ spider-flow
- [ ] Redis插件 - [ ] Redis插件
- [ ] Mongodb插件 - [ ] Mongodb插件
- [ ] Hbase插件 - [ ] Hbase插件
- [ ] IP代理池插件 - [x] IP代理池插件
- [ ] OCR识别插件 - [ ] OCR识别插件
### 项目部分截图 ### 项目部分截图

View File

@ -34,5 +34,6 @@
<module>spider-flow-core</module> <module>spider-flow-core</module>
<module>spider-flow-web</module> <module>spider-flow-web</module>
<module>spider-flow-selenium</module> <module>spider-flow-selenium</module>
<module>spider-flow-proxypool</module>
</modules> </modules>
</project> </project>

View File

@ -0,0 +1,18 @@
-- Schema for the proxy pool plugin: one row per known proxy server.
-- WARNING: this script drops and recreates `sp_proxy`, discarding any existing rows.
SET FOREIGN_KEY_CHECKS=0;
-- ----------------------------
-- Table structure for sp_proxy
-- ----------------------------
DROP TABLE IF EXISTS `sp_proxy`;
CREATE TABLE `sp_proxy` (
  -- Surrogate primary key, generated by the database.
  `id` int(11) NOT NULL AUTO_INCREMENT,
  -- Proxy host address.
  `ip` varchar(32) NOT NULL,
  -- Proxy port.
  `port` int(6) NOT NULL,
  -- Proxy protocol; the plugin code filters on the values "http" and "https".
  `type` varchar(16) DEFAULT NULL,
  -- Anonymity flag; the plugin code writes 1 for anonymous and 0 otherwise.
  `anonymous` int(11) DEFAULT NULL,
  -- NOTE(review): not mapped by the Proxy entity added in the same commit - confirm intended use.
  `available` int(11) DEFAULT NULL,
  `valid_date` datetime DEFAULT CURRENT_TIMESTAMP,
  PRIMARY KEY (`id`),
  -- A proxy is identified by its (ip, port) pair. The former USING HASH clause was dropped:
  -- InnoDB only builds BTREE indexes, so the resulting index is unchanged.
  UNIQUE KEY `sp_proxy_unique` (`ip`,`port`)
  -- The dump artifact AUTO_INCREMENT=57 was removed so a fresh table starts numbering at 1.
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- Restore foreign key enforcement for the rest of the session.
SET FOREIGN_KEY_CHECKS=1;

View File

@ -0,0 +1,34 @@
<?xml version="1.0"?>
<project
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"
    xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <!-- Maven module descriptor for the IP proxy pool plugin. -->
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.spiderflow</groupId>
        <artifactId>spider-flow</artifactId>
        <version>0.0.1</version>
    </parent>
    <artifactId>spider-flow-proxypool</artifactId>
    <!-- Display name now matches the artifact; it was previously the copy-pasted "spider-flow-web". -->
    <name>spider-flow-proxypool</name>
    <url>https://gitee.com/jmxd/spider-flow/tree/master/spider-flow-proxypool</url>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>
    <dependencies>
        <!-- Plugin API; "provided" because the hosting application supplies it at runtime. -->
        <dependency>
            <groupId>org.spiderflow</groupId>
            <artifactId>spider-flow-api</artifactId>
            <version>0.0.1</version>
            <scope>provided</scope>
        </dependency>
        <!-- Persistence of the proxy list (JPA entity and repository). -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
        </dependency>
        <!-- HTTP client used for the proxy connectivity check. -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>
        <!-- NOTE(review): the module also uses org.apache.commons.lang3 without declaring it here;
             presumably it arrives transitively - confirm. -->
    </dependencies>
</project>

View File

@ -0,0 +1,49 @@
package org.spiderflow.proxypool;
import java.util.List;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.spiderflow.proxypool.model.Proxy;
import org.spiderflow.proxypool.service.ProxyService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
/**
 * Scheduled job that re-checks every stored proxy and evicts the ones that fail
 * the connectivity check.
 */
@Component
public class ProxyPoolCleanTask {

    private static final Logger logger = LoggerFactory.getLogger(ProxyPoolCleanTask.class);

    /** Number of worker threads used to check proxies in parallel. */
    private static final int CHECK_THREADS = 8;

    @Autowired
    private ProxyService proxyService;

    @Autowired
    private ProxyPoolManager proxyPoolManager;

    /**
     * Loads all proxies from the service, checks them on a temporary thread pool and
     * removes every proxy whose check returns {@code -1}. Blocks until all checks
     * have finished or the calling thread is interrupted.
     */
    @Scheduled(initialDelay = 10000, fixedDelay = 10000)
    public void clean() {
        logger.info("开始检测代理IP有效性");
        List<Proxy> proxys = proxyService.findAll();
        ThreadPoolExecutor pool = new ThreadPoolExecutor(CHECK_THREADS, CHECK_THREADS, 60, TimeUnit.SECONDS,
                new LinkedBlockingQueue<>());
        try {
            for (Proxy proxy : proxys) {
                pool.submit(() -> checkAndEvict(proxy));
            }
            pool.shutdown();
            while (!pool.awaitTermination(50, TimeUnit.MILLISECONDS)) {
                // Keep waiting until every submitted check has completed.
            }
            logger.info("检测代理IP有效性完毕");
        } catch (InterruptedException e) {
            // Restore the flag so the scheduler can observe the interruption instead of
            // silently losing it and spinning in the wait loop.
            Thread.currentThread().interrupt();
            logger.warn("检测代理IP有效性被中断");
        } finally {
            // No-op after a normal run; on interruption or an unexpected failure it cancels the
            // outstanding checks so the pool's worker threads are not leaked.
            pool.shutdownNow();
        }
    }

    /**
     * Checks a single proxy and removes it when the check fails. Failures are logged
     * here because an exception thrown inside a submitted task would otherwise vanish
     * in the discarded Future.
     */
    private void checkAndEvict(Proxy proxy) {
        try {
            if (proxyPoolManager.check(proxy) == -1) {
                proxyPoolManager.remove(proxy);
            }
        } catch (RuntimeException e) {
            logger.warn("检测代理{}:{}时发生异常", proxy.getIp(), proxy.getPort(), e);
        }
    }
}

View File

@ -0,0 +1,131 @@
package org.spiderflow.proxypool;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.CopyOnWriteArrayList;
import javax.annotation.PostConstruct;
import org.apache.commons.lang3.RandomUtils;
import org.jsoup.Jsoup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.spiderflow.proxypool.model.Proxy;
import org.spiderflow.proxypool.service.ProxyService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
/**
 * Holds the proxy pool in memory, keeps it in step with {@link ProxyService} and
 * hands out random proxies by protocol and anonymity.
 */
@Component
public class ProxyPoolManager {

    private static final Logger logger = LoggerFactory.getLogger(ProxyPoolManager.class);

    /** URL requested through a proxy to decide whether the proxy is usable. */
    private static final String CHECK_URL = "https://www.baidu.com";

    /** Timeout of the connectivity check, in milliseconds. */
    private static final int CHECK_TIMEOUT_MILLIS = 3000;

    @Autowired
    private ProxyService proxyService;

    /**
     * In-memory copy of the pool. Created eagerly as a mutable list so that
     * {@link #add(Proxy)} works even before {@link #init()} has run.
     */
    private final List<Proxy> proxys = new CopyOnWriteArrayList<>();

    /** Loads every stored proxy into memory once the bean has been wired. */
    @PostConstruct
    private void init() {
        this.proxys.addAll(proxyService.findAll());
    }

    /**
     * Removes a proxy from memory and from the backing service.
     *
     * @param proxy the proxy to remove; {@code null} is ignored
     */
    public void remove(Proxy proxy) {
        if (proxy == null) {
            return;
        }
        this.proxys.remove(proxy);
        this.proxyService.remove(proxy);
    }

    /**
     * Adds a proxy to the pool after verifying that it works.
     *
     * @param proxy the proxy to add
     * @return {@code true} if the proxy is already pooled or was checked and stored;
     *         {@code false} if it is {@code null}, fails the check or cannot be stored
     */
    public boolean add(Proxy proxy) {
        if (proxy == null) {
            return false;
        }
        if (this.proxys.contains(proxy)) {
            return true;
        }
        if (check(proxy) == -1) {
            return false;
        }
        boolean inserted = proxyService.insert(proxy);
        if (inserted) {
            this.proxys.add(proxy);
        }
        return inserted;
    }

    /**
     * Picks a random anonymous HTTP proxy.
     *
     * @return a matching proxy, or {@code null} if none is pooled
     */
    public Proxy getHttpProxy() {
        return getHttpProxy(true);
    }

    /**
     * Picks a random anonymous HTTPS proxy.
     *
     * @return a matching proxy, or {@code null} if none is pooled
     */
    public Proxy getHttpsProxy() {
        return getHttpsProxy(true);
    }

    /**
     * Picks a random HTTP proxy.
     *
     * @param anonymous whether an anonymous ({@code true}) or non-anonymous proxy is wanted
     * @return a matching proxy, or {@code null} if none is pooled
     */
    public Proxy getHttpProxy(boolean anonymous) {
        return random(get("http", anonymous));
    }

    /**
     * Picks a random HTTPS proxy.
     *
     * @param anonymous whether an anonymous ({@code true}) or non-anonymous proxy is wanted
     * @return a matching proxy, or {@code null} if none is pooled
     */
    public Proxy getHttpsProxy(boolean anonymous) {
        return random(get("https", anonymous));
    }

    /** Returns a uniformly chosen element, or {@code null} for a null or empty list. */
    private Proxy random(List<Proxy> candidates) {
        if (candidates == null || candidates.isEmpty()) {
            return null;
        }
        return candidates.get(RandomUtils.nextInt(0, candidates.size()));
    }

    /** Collects the pooled proxies with the given type and anonymity flag. */
    private List<Proxy> get(String type, boolean anonymous) {
        // Compare through equals(): the entity's flag is a nullable Integer, and unboxing a
        // null value with == would throw. Rows without a flag match neither variant.
        Integer wanted = anonymous ? 1 : 0;
        List<Proxy> matches = new ArrayList<>();
        for (Proxy proxy : proxys) {
            if (type.equals(proxy.getType()) && wanted.equals(proxy.getAnonymous())) {
                matches.add(proxy);
            }
        }
        return matches;
    }

    /**
     * Tests a proxy by issuing one request through it.
     *
     * @param proxy the proxy to test
     * @return the elapsed time in milliseconds, or {@code -1} if the request failed for any reason
     */
    public long check(Proxy proxy) {
        try {
            // nanoTime is monotonic, so a wall-clock adjustment can never turn a successful
            // check into a negative value such as the -1 failure sentinel.
            long start = System.nanoTime();
            Jsoup.connect(CHECK_URL)
                    .ignoreContentType(true)
                    .ignoreHttpErrors(true)
                    .timeout(CHECK_TIMEOUT_MILLIS)
                    .proxy(proxy.getIp(), proxy.getPort())
                    .execute();
            long elapsed = (System.nanoTime() - start) / 1_000_000L;
            logger.info("检测代理:{}:{},延迟:{}", proxy.getIp(), proxy.getPort(), elapsed);
            return elapsed;
        } catch (Exception e) {
            logger.info("检测代理:{}:{},超时", proxy.getIp(), proxy.getPort());
            return -1;
        }
    }
}

View File

@ -0,0 +1,53 @@
package org.spiderflow.proxypool.executor.function;
import org.spiderflow.executor.FunctionExecutor;
import org.spiderflow.proxypool.ProxyPoolManager;
import org.spiderflow.proxypool.model.Proxy;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
/**
 * Exposes the proxy pool as static functions under the {@code proxypool} prefix.
 */
@Component
public class ProxyPoolFunctionExecutor implements FunctionExecutor {

    /** Shared manager, assigned through the autowired setter so the static functions can reach it. */
    private static ProxyPoolManager proxyPoolManager;

    @Override
    public String getFunctionPrefix() {
        return "proxypool";
    }

    /**
     * Picks a random HTTP proxy.
     *
     * @param anonymous whether an anonymous proxy is wanted
     * @return the proxy as {@code ip:port}, or {@code null} if the pool has no match
     */
    public static String http(boolean anonymous) {
        return convertToString(proxyPoolManager.getHttpProxy(anonymous));
    }

    /**
     * Picks a random anonymous HTTP proxy.
     *
     * @return the proxy as {@code ip:port}, or {@code null} if the pool has no match
     */
    public static String http() {
        return http(true);
    }

    /**
     * Picks a random HTTPS proxy.
     *
     * @param anonymous whether an anonymous proxy is wanted
     * @return the proxy as {@code ip:port}, or {@code null} if the pool has no match
     */
    public static String https(boolean anonymous) {
        return convertToString(proxyPoolManager.getHttpsProxy(anonymous));
    }

    /**
     * Picks a random anonymous HTTPS proxy.
     *
     * @return the proxy as {@code ip:port}, or {@code null} if the pool has no match
     */
    public static String https() {
        return https(true);
    }

    /** Formats a proxy as {@code ip:port}; a missing proxy yields {@code null} instead of an NPE. */
    private static String convertToString(Proxy proxy) {
        if (proxy == null) {
            // The manager returns null when no pooled proxy matches the request.
            return null;
        }
        return String.format("%s:%s", proxy.getIp(), proxy.getPort());
    }

    /**
     * Builds a proxy from the given values and offers it to the pool.
     *
     * @param ip        proxy host
     * @param port      proxy port
     * @param type      proxy protocol, e.g. {@code http} or {@code https}
     * @param anonymous whether the proxy is anonymous; stored as 1 or 0
     */
    public static void add(String ip, Integer port, String type, boolean anonymous) {
        Proxy proxy = new Proxy();
        proxy.setIp(ip);
        // Pass the Integer straight through: Integer.valueOf(port) unboxed it first and
        // threw a NullPointerException for a null port.
        proxy.setPort(port);
        proxy.setType(type);
        proxy.setAnonymous(anonymous ? 1 : 0);
        proxyPoolManager.add(proxy);
    }

    /** Instance setter used for injection; stores the manager in the static field. */
    @Autowired
    public void setProxyPoolManager(ProxyPoolManager proxyPoolManager) {
        ProxyPoolFunctionExecutor.proxyPoolManager = proxyPoolManager;
    }
}

View File

@ -0,0 +1,107 @@
package org.spiderflow.proxypool.model;
import java.util.Date;
import javax.persistence.Entity;
import javax.persistence.GeneratedValue;
import javax.persistence.GenerationType;
import javax.persistence.Id;
import javax.persistence.Table;
/**
 * JPA entity for one row of the {@code sp_proxy} table.
 * Two instances are equal when both their ip and their port are equal; the id and
 * all other fields are ignored.
 */
@Entity
@Table(name = "sp_proxy")
public class Proxy {

    /** Database-generated primary key. */
    @Id
    @GeneratedValue(strategy = GenerationType.IDENTITY)
    private Integer id;

    /** Proxy host. */
    private String ip;

    /** Proxy port. */
    private Integer port;

    /** Proxy protocol name. */
    private String type;

    /** Anonymity flag stored as an integer; may be null. */
    private Integer anonymous;

    /** NOTE(review): presumably the time the proxy was last known to be valid - confirm. */
    private Date validDate;

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public Integer getPort() {
        return port;
    }

    public void setPort(Integer port) {
        this.port = port;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public Integer getAnonymous() {
        return anonymous;
    }

    public void setAnonymous(Integer anonymous) {
        this.anonymous = anonymous;
    }

    public Date getValidDate() {
        return validDate;
    }

    public void setValidDate(Date validDate) {
        this.validDate = validDate;
    }

    /** Hash over ip and port only, matching {@link #equals(Object)}. */
    @Override
    public int hashCode() {
        int ipHash = (ip != null) ? ip.hashCode() : 0;
        int portHash = (port != null) ? port.hashCode() : 0;
        return 31 * (31 + ipHash) + portHash;
    }

    /** Equal when the other object is of exactly this class and has the same ip and port. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        Proxy that = (Proxy) obj;
        boolean sameIp = (ip == null) ? that.ip == null : ip.equals(that.ip);
        boolean samePort = (port == null) ? that.port == null : port.equals(that.port);
        return sameIp && samePort;
    }
}

View File

@ -0,0 +1,8 @@
package org.spiderflow.proxypool.repository;
import org.spiderflow.proxypool.model.Proxy;
import org.springframework.data.jpa.repository.JpaRepository;
/**
 * Spring Data JPA repository for {@link Proxy} entities keyed by their {@code Integer} id.
 * Declares no methods of its own; all operations come from {@link JpaRepository}.
 */
public interface ProxyRepository extends JpaRepository<Proxy, Integer>{
}

View File

@ -0,0 +1,32 @@
package org.spiderflow.proxypool.service;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.spiderflow.proxypool.model.Proxy;
import org.spiderflow.proxypool.repository.ProxyRepository;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
/**
 * Thin persistence facade over {@link ProxyRepository} for the proxy pool.
 */
@Service
public class ProxyService {

    private static final Logger logger = LoggerFactory.getLogger(ProxyService.class);

    @Autowired
    private ProxyRepository proxyRepository;

    /**
     * Saves a proxy.
     *
     * @param proxy the proxy to store
     * @return {@code true} if the save succeeded; {@code false} if the proxy is {@code null}
     *         or the repository threw an exception
     */
    public boolean insert(Proxy proxy) {
        if (proxy == null) {
            return false;
        }
        try {
            proxyRepository.save(proxy);
            return true;
        } catch (Exception e) {
            // Still best-effort, but no longer silent: record why the proxy was rejected.
            logger.warn("保存代理{}:{}失败", proxy.getIp(), proxy.getPort(), e);
            return false;
        }
    }

    /**
     * Deletes a proxy by its id.
     *
     * @param proxy the proxy to delete; a {@code null} proxy or one without an id is ignored
     */
    public void remove(Proxy proxy) {
        if (proxy == null || proxy.getId() == null) {
            // Without an id there is no key to delete by.
            return;
        }
        proxyRepository.deleteById(proxy.getId());
    }

    /**
     * Loads every stored proxy.
     *
     * @return all proxies returned by the repository
     */
    public List<Proxy> findAll() {
        return proxyRepository.findAll();
    }
}

View File

@ -27,5 +27,10 @@
<artifactId>spider-flow-selenium</artifactId> <artifactId>spider-flow-selenium</artifactId>
<version>0.0.1</version> <version>0.0.1</version>
</dependency> </dependency>
<dependency>
<groupId>org.spiderflow</groupId>
<artifactId>spider-flow-proxypool</artifactId>
<version>0.0.1</version>
</dependency>
</dependencies> </dependencies>
</project> </project>