增强正则抽取函数

This commit is contained in:
mxd 2019-10-22 11:09:17 +08:00
parent 54602dc3a1
commit af830b9cc7
6 changed files with 158 additions and 10 deletions

View File

@ -31,12 +31,36 @@ public class ExtractFunctionExecutor implements FunctionExecutor{
return ExtractUtils.getFirstMatcher(content, pattern, true);
}
/**
 * Extracts one capture group from the first match of a regular expression.
 *
 * @param content text to search
 * @param pattern regular expression
 * @param groupIndex capture group to return
 * @return the value produced by {@code ExtractUtils.getFirstMatcher} for that group
 */
@Comment("根据正则表达式提取内容")
@Example("${extract.regx(resp.html,'<title>(.*?)</title>',1)}")
public static String regx(String content,String pattern,int groupIndex){
    final String firstMatch = ExtractUtils.getFirstMatcher(content, pattern, groupIndex);
    return firstMatch;
}
/**
 * Extracts several capture groups from the first match of a regular expression.
 *
 * @param content text to search
 * @param pattern regular expression
 * @param groups capture group indexes to read, in the order they should be returned
 * @return the list produced by {@code ExtractUtils.getFirstMatcher} for those groups
 */
@Comment("根据正则表达式提取内容")
@Example("${extract.regx(resp.html,'<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
public static List<String> regx(String content,String pattern,List<Integer> groups){
    final List<String> captured = ExtractUtils.getFirstMatcher(content, pattern, groups);
    return captured;
}
/**
 * Extracts a value from every match of a regular expression.
 *
 * @param content text to search
 * @param pattern regular expression
 * @return the list produced by {@code ExtractUtils.getMatchers} with its group flag set to true
 */
@Comment("根据正则表达式提取内容")
@Example("${extract.regxs(resp.html,'<h2>(.*?)</h2>')}")
public static List<String> regxs(String content,String pattern){
    // The flag asks the helper for a capture group rather than the whole match.
    final boolean useGroup = true;
    return ExtractUtils.getMatchers(content, pattern, useGroup);
}
/**
 * Extracts one capture group from every match of a regular expression.
 *
 * @param content text to search
 * @param pattern regular expression
 * @param groupIndex capture group to read from each match
 * @return the list produced by {@code ExtractUtils.getMatchers} for that group
 */
@Comment("根据正则表达式提取内容")
@Example("${extract.regxs(resp.html,'<h2>(.*?)</h2>',1)}")
public static List<String> regxs(String content,String pattern,int groupIndex){
    final List<String> allMatches = ExtractUtils.getMatchers(content, pattern, groupIndex);
    return allMatches;
}
/**
 * Extracts several capture groups from every match of a regular expression.
 *
 * @param content text to search
 * @param pattern regular expression
 * @param groups capture group indexes to read from each match
 * @return one inner list per match, as produced by {@code ExtractUtils.getMatchers}
 */
@Comment("根据正则表达式提取内容")
@Example("${extract.regxs(resp.html,'<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
public static List<List<String>> regxs(String content,String pattern,List<Integer> groups){
    final List<List<String>> rows = ExtractUtils.getMatchers(content, pattern, groups);
    return rows;
}
@Comment("根据xpath提取内容")
@Example("${extract.xpath(resp.element(),'//title/text()')}")
public static String xpath(Element element,String xpath){

View File

@ -41,11 +41,35 @@ public class ElementFunctionExtension implements FunctionExtension{
}
/**
 * Extracts one capture group from the first regular-expression match in the element's HTML.
 *
 * @param element element whose {@code html()} output is searched
 * @param regx regular expression
 * @param groupIndex capture group to return
 * @return the value produced by {@code ExtractUtils.getFirstMatcher} for that group
 */
@Comment("根据正则表达式提取内容")
@Example("${elementVar.regx('<title>(.*?)</title>',1)}")
public static String regx(Element element,String regx,int groupIndex){
    return ExtractUtils.getFirstMatcher(element.html(), regx, groupIndex);
}
/**
 * Extracts several capture groups from the first regular-expression match in the element's HTML.
 *
 * @param element element whose {@code html()} output is searched
 * @param regx regular expression
 * @param groups capture group indexes to read
 * @return the list produced by {@code ExtractUtils.getFirstMatcher} for those groups
 */
@Comment("根据正则表达式提取内容")
@Example("${elementVar.regx('<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
public static List<String> regx(Element element,String regx,List<Integer> groups){
    final String html = element.html();
    return ExtractUtils.getFirstMatcher(html, regx, groups);
}
/**
 * Extracts a value from every regular-expression match in the element's HTML.
 *
 * @param element element whose {@code html()} output is searched
 * @param regx regular expression
 * @return the list produced by {@code ExtractUtils.getMatchers} with its group flag set to true
 */
@Comment("根据正则表达式提取内容")
@Example("${elementVar.regxs('<h2>(.*?)</h2>')}")
public static List<String> regxs(Element element,String regx){
    final String html = element.html();
    final boolean useGroup = true;
    return ExtractUtils.getMatchers(html, regx, useGroup);
}
/**
 * Extracts one capture group from every regular-expression match in the element's HTML.
 *
 * @param element element whose {@code html()} output is searched
 * @param regx regular expression
 * @param groupIndex capture group to read from each match
 * @return the list produced by {@code ExtractUtils.getMatchers} for that group
 */
@Comment("根据正则表达式提取内容")
@Example("${elementVar.regxs('<h2>(.*?)</h2>',1)}")
public static List<String> regxs(Element element,String regx,int groupIndex){
    final String html = element.html();
    return ExtractUtils.getMatchers(html, regx, groupIndex);
}
/**
 * Extracts several capture groups from every regular-expression match in the element's HTML.
 *
 * @param element element whose {@code html()} output is searched
 * @param regx regular expression
 * @param groups capture group indexes to read from each match
 * @return one inner list per match, as produced by {@code ExtractUtils.getMatchers}
 */
@Comment("根据正则表达式提取内容")
@Example("${elementVar.regxs('<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
public static List<List<String>> regxs(Element element,String regx,List<Integer> groups){
    final String html = element.html();
    return ExtractUtils.getMatchers(html, regx, groups);
}
@Comment("根据css选择器提取内容")
@Example("${elementVar.selector('div > a')}")
public static Element selector(Element element,String cssQuery){

View File

@ -40,11 +40,35 @@ public class ElementsFunctionExtension implements FunctionExtension{
}
/**
 * Extracts one capture group from the first regular-expression match in the elements' HTML.
 *
 * @param elements collection whose {@code html()} output is searched
 * @param regx regular expression
 * @param groupIndex capture group to return
 * @return the value produced by {@code ExtractUtils.getFirstMatcher} for that group
 */
@Comment("根据正则表达式提取内容")
@Example("${elementsVar.regx('<title>(.*?)</title>',1)}")
public static String regx(Elements elements,String regx,int groupIndex){
    return ExtractUtils.getFirstMatcher(elements.html(), regx, groupIndex);
}
/**
 * Extracts several capture groups from the first regular-expression match in the elements' HTML.
 *
 * @param elements collection whose {@code html()} output is searched
 * @param regx regular expression
 * @param groups capture group indexes to read
 * @return the list produced by {@code ExtractUtils.getFirstMatcher} for those groups
 */
@Comment("根据正则表达式提取内容")
@Example("${elementsVar.regx('<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
public static List<String> regx(Elements elements,String regx,List<Integer> groups){
    final String html = elements.html();
    return ExtractUtils.getFirstMatcher(html, regx, groups);
}
/**
 * Extracts a value from every regular-expression match in the elements' HTML.
 *
 * @param elements collection whose {@code html()} output is searched
 * @param regx regular expression
 * @return the list produced by {@code ExtractUtils.getMatchers} with its group flag set to true
 */
@Comment("根据正则表达式提取内容")
@Example("${elementsVar.regxs('<h2>(.*?)</h2>')}")
public static List<String> regxs(Elements elements,String regx){
    final String html = elements.html();
    final boolean useGroup = true;
    return ExtractUtils.getMatchers(html, regx, useGroup);
}
/**
 * Extracts one capture group from every regular-expression match in the elements' HTML.
 *
 * @param elements collection whose {@code html()} output is searched
 * @param regx regular expression
 * @param groupIndex capture group to read from each match
 * @return the list produced by {@code ExtractUtils.getMatchers} for that group
 */
@Comment("根据正则表达式提取内容")
@Example("${elementsVar.regxs('<h2>(.*?)</h2>',1)}")
public static List<String> regxs(Elements elements,String regx,int groupIndex){
    final String html = elements.html();
    return ExtractUtils.getMatchers(html, regx, groupIndex);
}
/**
 * Extracts several capture groups from every regular-expression match in the elements' HTML.
 *
 * @param elements collection whose {@code html()} output is searched
 * @param regx regular expression
 * @param groups capture group indexes to read from each match
 * @return one inner list per match, as produced by {@code ExtractUtils.getMatchers}
 */
@Comment("根据正则表达式提取内容")
@Example("${elementsVar.regxs('<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
public static List<List<String>> regxs(Elements elements,String regx,List<Integer> groups){
    final String html = elements.html();
    return ExtractUtils.getMatchers(html, regx, groups);
}
@Comment("根据css选择器提取内容")
@Example("${elementsVar.selector('div > a')}")
public static Element selector(Elements elements,String selector){

View File

@ -46,12 +46,36 @@ public class ResponseFunctionExtension implements FunctionExtension{
return ExtractUtils.getFirstMatcher(response.getHtml(), pattern, true);
}
/**
 * Extracts one capture group from the first regular-expression match in the response HTML.
 *
 * @param response response whose {@code getHtml()} output is searched
 * @param pattern regular expression
 * @param groupIndex capture group to return
 * @return the value produced by {@code ExtractUtils.getFirstMatcher} for that group
 */
@Comment("根据正则表达式提取请求结果中的内容")
@Example("${resp.regx('<title>(.*?)</title>',1)}")
public static String regx(SpiderResponse response,String pattern,int groupIndex){
    final String html = response.getHtml();
    return ExtractUtils.getFirstMatcher(html, pattern, groupIndex);
}
/**
 * Extracts several capture groups from the first regular-expression match in the response HTML.
 *
 * @param response response whose {@code getHtml()} output is searched
 * @param pattern regular expression
 * @param groups capture group indexes to read
 * @return the list produced by {@code ExtractUtils.getFirstMatcher} for those groups
 */
@Comment("根据正则表达式提取请求结果中的内容")
@Example("${resp.regx('<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
public static List<String> regx(SpiderResponse response,String pattern,List<Integer> groups){
    final String html = response.getHtml();
    return ExtractUtils.getFirstMatcher(html, pattern, groups);
}
/**
 * Extracts a value from every regular-expression match in the response HTML.
 *
 * @param response response whose {@code getHtml()} output is searched
 * @param pattern regular expression
 * @return the list produced by {@code ExtractUtils.getMatchers} with its group flag set to true
 */
@Comment("根据正则表达式提取请求结果中的内容")
@Example("${resp.regxs('<h2>(.*?)</h2>')}")
public static List<String> regxs(SpiderResponse response,String pattern){
    final String html = response.getHtml();
    final boolean useGroup = true;
    return ExtractUtils.getMatchers(html, pattern, useGroup);
}
/**
 * Extracts one capture group from every regular-expression match in the response HTML.
 *
 * @param response response whose {@code getHtml()} output is searched
 * @param pattern regular expression
 * @param groupIndex capture group to read from each match
 * @return the list produced by {@code ExtractUtils.getMatchers} for that group
 */
@Comment("根据正则表达式提取请求结果中的内容")
@Example("${resp.regxs('<h2>(.*?)</h2>',1)}")
public static List<String> regxs(SpiderResponse response,String pattern,int groupIndex){
    final String html = response.getHtml();
    return ExtractUtils.getMatchers(html, pattern, groupIndex);
}
/**
 * Extracts several capture groups from every regular-expression match in the response HTML.
 *
 * @param response response whose {@code getHtml()} output is searched
 * @param pattern regular expression
 * @param groups capture group indexes to read from each match
 * @return one inner list per match, as produced by {@code ExtractUtils.getMatchers}
 */
@Comment("根据正则表达式提取请求结果中的内容")
@Example("${resp.regxs('<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
public static List<List<String>> regxs(SpiderResponse response,String pattern,List<Integer> groups){
    final String html = response.getHtml();
    return ExtractUtils.getMatchers(html, pattern, groups);
}
@Comment("根据css选择器提取请求结果")
@Example("${resp.selector('div > a')}")
public static Element selector(SpiderResponse response,String selector){

View File

@ -33,11 +33,35 @@ public class StringFunctionExtension implements FunctionExtension{
}
/**
 * Extracts one capture group from the first regular-expression match in a string.
 *
 * @param source text to search
 * @param pattern regular expression
 * @param groupIndex capture group to return
 * @return the value produced by {@code ExtractUtils.getFirstMatcher} for that group
 */
@Comment("根据正则表达式提取String中的内容")
@Example("${strVar.regx('<title>(.*?)</title>',1)}")
public static String regx(String source,String pattern,int groupIndex){
    return ExtractUtils.getFirstMatcher(source, pattern, groupIndex);
}
/**
 * Extracts several capture groups from the first regular-expression match in a string.
 *
 * @param source text to search
 * @param pattern regular expression
 * @param groups capture group indexes to read
 * @return the list produced by {@code ExtractUtils.getFirstMatcher} for those groups
 */
@Comment("根据正则表达式提取String中的内容")
@Example("${strVar.regx('<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
public static List<String> regx(String source,String pattern,List<Integer> groups){
    final List<String> captured = ExtractUtils.getFirstMatcher(source, pattern, groups);
    return captured;
}
/**
 * Extracts a value from every regular-expression match in a string.
 *
 * @param source text to search
 * @param pattern regular expression
 * @return the list produced by {@code ExtractUtils.getMatchers} with its group flag set to true
 */
@Comment("根据正则表达式提取String中的内容")
@Example("${strVar.regxs('<h2>(.*?)</h2>')}")
public static List<String> regxs(String source,String pattern){
    final boolean useGroup = true;
    return ExtractUtils.getMatchers(source, pattern, useGroup);
}
/**
 * Extracts one capture group from every regular-expression match in a string.
 *
 * @param source text to search
 * @param pattern regular expression
 * @param groupIndex capture group to read from each match
 * @return the list produced by {@code ExtractUtils.getMatchers} for that group
 */
@Comment("根据正则表达式提取String中的内容")
@Example("${strVar.regxs('<h2>(.*?)</h2>',1)}")
public static List<String> regxs(String source,String pattern,int groupIndex){
    final List<String> allMatches = ExtractUtils.getMatchers(source, pattern, groupIndex);
    return allMatches;
}
/**
 * Extracts several capture groups from every regular-expression match in a string.
 *
 * @param source text to search
 * @param pattern regular expression
 * @param groups capture group indexes to read from each match
 * @return one inner list per match, as produced by {@code ExtractUtils.getMatchers}
 */
@Comment("根据正则表达式提取String中的内容")
@Example("${strVar.regxs('<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
public static List<List<String>> regxs(String source,String pattern,List<Integer> groups){
    final List<List<String>> rows = ExtractUtils.getMatchers(source, pattern, groups);
    return rows;
}
@Comment("根据xpath在String变量中查找")
@Example("${strVar.xpath('//title/text()')}")
@Return({Element.class,String.class})

View File

@ -12,7 +12,6 @@ import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.seimicrawler.xpath.JXDocument;
import org.seimicrawler.xpath.JXNode;
import org.springframework.util.StringUtils;
import com.alibaba.fastjson.JSONPath;
/**
@ -34,26 +33,55 @@ public class ExtractUtils {
}
/**
 * Collects a value from every match, choosing the capture group from a flag.
 *
 * @param content text to search
 * @param regx regular expression
 * @param isGroup {@code true} to read capture group 1, {@code false} to read group 0
 * @return the result of the {@code int}-indexed {@code getMatchers} overload
 */
public static List<String> getMatchers(String content,String regx,boolean isGroup){
    final int groupIndex;
    if (isGroup) {
        groupIndex = 1;
    } else {
        groupIndex = 0;
    }
    return getMatchers(content, regx, groupIndex);
}
public static List<String> getMatchers(String content,String regx,int groupIndex){
Matcher matcher = compile(regx).matcher(content);
List<String> results = new ArrayList<>();
while(matcher.find()){
String group = isGroup ? matcher.group(1) : matcher.group();
if(!StringUtils.isEmpty(group)){
results.add(group);
results.add(matcher.group(groupIndex));
}
return results;
}
/**
 * Collects several capture groups from every match of a regular expression.
 *
 * @param content text to search
 * @param regx regular expression, compiled through {@code compile}
 * @param groups capture group indexes to read from each match
 * @return one inner list per match, holding the requested groups in the order given
 */
public static List<List<String>> getMatchers(String content,String regx,List<Integer> groups){
    final Matcher m = compile(regx).matcher(content);
    final List<List<String>> rows = new ArrayList<>();
    for (; m.find(); ) {
        final List<String> row = new ArrayList<>();
        for (Integer index : groups) {
            row.add(m.group(index));
        }
        rows.add(row);
    }
    return rows;
}
/**
 * Returns a value from the first match, choosing the capture group from a flag.
 *
 * @param content text to search
 * @param regx regular expression
 * @param isGroup {@code true} to read capture group 1, {@code false} to read group 0
 * @return the result of the {@code int}-indexed {@code getFirstMatcher} overload
 */
public static String getFirstMatcher(String content,String regx,boolean isGroup){
    final int groupIndex;
    if (isGroup) {
        groupIndex = 1;
    } else {
        groupIndex = 0;
    }
    return getFirstMatcher(content, regx, groupIndex);
}
public static String getFirstMatcher(String content,String regx,int groupIndex){
Matcher matcher = compile(regx).matcher(content);
while(matcher.find()){
String group = isGroup ? matcher.group(1) : matcher.group();
return group;
if(matcher.find()){
return matcher.group(groupIndex);
}
return null;
}
/**
 * Returns several capture groups from the first match of a regular expression.
 *
 * @param content text to search
 * @param regx regular expression, compiled through {@code compile}
 * @param groups capture group indexes to read
 * @return the requested groups in the order given; empty when nothing matches
 */
public static List<String> getFirstMatcher(String content,String regx,List<Integer> groups){
    final Matcher m = compile(regx).matcher(content);
    final List<String> captured = new ArrayList<>();
    if (!m.find()) {
        return captured;
    }
    for (Integer index : groups) {
        captured.add(m.group(index));
    }
    return captured;
}
/**
 * Pulls the first dotted-name sequence (e.g. a host name) out of a URL string.
 *
 * @param url text to search
 * @return the whole first match of the host pattern, as returned by {@code getFirstMatcher}
 */
public static String getHostFromURL(String url){
    final String hostPattern = "(?<=//|)((\\w)+\\.)+\\w+";
    // false selects the whole match rather than a capture group.
    return getFirstMatcher(url, hostPattern, false);
}