增强正则抽取函数
This commit is contained in:
parent
54602dc3a1
commit
af830b9cc7
@ -31,12 +31,36 @@ public class ExtractFunctionExecutor implements FunctionExecutor{
|
||||
return ExtractUtils.getFirstMatcher(content, pattern, true);
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取内容")
|
||||
@Example("${extract.regx(resp.html,'<title>(.*?)</title>',1)}")
|
||||
public static String regx(String content,String pattern,int groupIndex){
|
||||
return ExtractUtils.getFirstMatcher(content, pattern, groupIndex);
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取内容")
|
||||
@Example("${extract.regx(resp.html,'<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
|
||||
public static List<String> regx(String content,String pattern,List<Integer> groups){
|
||||
return ExtractUtils.getFirstMatcher(content, pattern, groups);
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取内容")
|
||||
@Example("${extract.regxs(resp.html,'<h2>(.*?)</h2>')}")
|
||||
public static List<String> regxs(String content,String pattern){
|
||||
return ExtractUtils.getMatchers(content, pattern, true);
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取内容")
|
||||
@Example("${extract.regxs(resp.html,'<h2>(.*?)</h2>',1)}")
|
||||
public static List<String> regxs(String content,String pattern,int groupIndex){
|
||||
return ExtractUtils.getMatchers(content, pattern, groupIndex);
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取内容")
|
||||
@Example("${extract.regxs(resp.html,'<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
|
||||
public static List<List<String>> regxs(String content,String pattern,List<Integer> groups){
|
||||
return ExtractUtils.getMatchers(content, pattern, groups);
|
||||
}
|
||||
|
||||
@Comment("根据xpath提取内容")
|
||||
@Example("${extract.xpath(resp.element(),'//title/text()')}")
|
||||
public static String xpath(Element element,String xpath){
|
||||
|
@ -41,11 +41,35 @@ public class ElementFunctionExtension implements FunctionExtension{
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取内容")
|
||||
@Example("${elementVar.regx('<h2>(.*?)</h2>')}")
|
||||
@Example("${elementVar.regx('<title>(.*?)</title>',1)}")
|
||||
public static String regx(Element element,String regx,int groupIndex){
|
||||
return ExtractUtils.getFirstMatcher(element.html(), regx, groupIndex);
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取内容")
|
||||
@Example("${elementVar.regx('<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
|
||||
public static List<String> regx(Element element,String regx,List<Integer> groups){
|
||||
return ExtractUtils.getFirstMatcher(element.html(), regx, groups);
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取内容")
|
||||
@Example("${elementVar.regxs('<h2>(.*?)</h2>')}")
|
||||
public static List<String> regxs(Element element,String regx){
|
||||
return ExtractUtils.getMatchers(element.html(), regx, true);
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取内容")
|
||||
@Example("${elementVar.regxs('<h2>(.*?)</h2>',1)}")
|
||||
public static List<String> regxs(Element element,String regx,int groupIndex){
|
||||
return ExtractUtils.getMatchers(element.html(), regx, groupIndex);
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取内容")
|
||||
@Example("${elementVar.regxs('<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
|
||||
public static List<List<String>> regxs(Element element,String regx,List<Integer> groups){
|
||||
return ExtractUtils.getMatchers(element.html(), regx, groups);
|
||||
}
|
||||
|
||||
@Comment("根据css选择器提取内容")
|
||||
@Example("${elementVar.selector('div > a')}")
|
||||
public static Element selector(Element element,String cssQuery){
|
||||
|
@ -40,11 +40,35 @@ public class ElementsFunctionExtension implements FunctionExtension{
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取内容")
|
||||
@Example("${elementsVar.regx('<h2>(.*?)</h2>')}")
|
||||
@Example("${elementsVar.regx('<title>(.*?)</title>',1)}")
|
||||
public static String regx(Elements elements,String regx,int groupIndex){
|
||||
return ExtractUtils.getFirstMatcher(elements.html(), regx, groupIndex);
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取内容")
|
||||
@Example("${elementsVar.regx('<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
|
||||
public static List<String> regx(Elements elements,String regx,List<Integer> groups){
|
||||
return ExtractUtils.getFirstMatcher(elements.html(), regx, groups);
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取内容")
|
||||
@Example("${elementsVar.regxs('<h2>(.*?)</h2>')}")
|
||||
public static List<String> regxs(Elements elements,String regx){
|
||||
return ExtractUtils.getMatchers(elements.html(), regx, true);
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取内容")
|
||||
@Example("${elementsVar.regxs('<h2>(.*?)</h2>',1)}")
|
||||
public static List<String> regxs(Elements elements,String regx,int groupIndex){
|
||||
return ExtractUtils.getMatchers(elements.html(), regx, groupIndex);
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取内容")
|
||||
@Example("${elementsVar.regxs('<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
|
||||
public static List<List<String>> regxs(Elements elements,String regx,List<Integer> groups){
|
||||
return ExtractUtils.getMatchers(elements.html(), regx, groups);
|
||||
}
|
||||
|
||||
@Comment("根据css选择器提取内容")
|
||||
@Example("${elementsVar.selector('div > a')}")
|
||||
public static Element selector(Elements elements,String selector){
|
||||
|
@ -46,12 +46,36 @@ public class ResponseFunctionExtension implements FunctionExtension{
|
||||
return ExtractUtils.getFirstMatcher(response.getHtml(), pattern, true);
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取请求结果中的内容")
|
||||
@Example("${resp.regx('<title>(.*?)</title>',1)}")
|
||||
public static String regx(SpiderResponse response,String pattern,int groupIndex){
|
||||
return ExtractUtils.getFirstMatcher(response.getHtml(), pattern, groupIndex);
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取请求结果中的内容")
|
||||
@Example("${resp.regx('<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
|
||||
public static List<String> regx(SpiderResponse response,String pattern,List<Integer> groups){
|
||||
return ExtractUtils.getFirstMatcher(response.getHtml(), pattern, groups);
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取请求结果中的内容")
|
||||
@Example("${resp.regxs('<h2>(.*?)</h2>')}")
|
||||
public static List<String> regxs(SpiderResponse response,String pattern){
|
||||
return ExtractUtils.getMatchers(response.getHtml(), pattern, true);
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取请求结果中的内容")
|
||||
@Example("${resp.regxs('<h2>(.*?)</h2>',1)}")
|
||||
public static List<String> regxs(SpiderResponse response,String pattern,int groupIndex){
|
||||
return ExtractUtils.getMatchers(response.getHtml(), pattern, groupIndex);
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取请求结果中的内容")
|
||||
@Example("${resp.regxs('<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
|
||||
public static List<List<String>> regxs(SpiderResponse response,String pattern,List<Integer> groups){
|
||||
return ExtractUtils.getMatchers(response.getHtml(), pattern, groups);
|
||||
}
|
||||
|
||||
@Comment("根据css选择器提取请求结果")
|
||||
@Example("${resp.selector('div > a')}")
|
||||
public static Element selector(SpiderResponse response,String selector){
|
||||
|
@ -33,11 +33,35 @@ public class StringFunctionExtension implements FunctionExtension{
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取String中的内容")
|
||||
@Example("${strVar.regx('<h2>(.*?)</h2>')}")
|
||||
@Example("${strVar.regx('<title>(.*?)</title>',1)}")
|
||||
public static String regx(String source,String pattern,int groupIndex){
|
||||
return ExtractUtils.getFirstMatcher(source, pattern, groupIndex);
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取String中的内容")
|
||||
@Example("${strVar.regx('<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
|
||||
public static List<String> regx(String source,String pattern,List<Integer> groups){
|
||||
return ExtractUtils.getFirstMatcher(source, pattern, groups);
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取String中的内容")
|
||||
@Example("${strVar.regxs('<h2>(.*?)</h2>')}")
|
||||
public static List<String> regxs(String source,String pattern){
|
||||
return ExtractUtils.getMatchers(source, pattern, true);
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取String中的内容")
|
||||
@Example("${strVar.regxs('<h2>(.*?)</h2>',1)}")
|
||||
public static List<String> regxs(String source,String pattern,int groupIndex){
|
||||
return ExtractUtils.getMatchers(source, pattern, groupIndex);
|
||||
}
|
||||
|
||||
@Comment("根据正则表达式提取String中的内容")
|
||||
@Example("${strVar.regxs('<a href=\"(.*?)\">(.*?)</a>',[1,2])}")
|
||||
public static List<List<String>> regxs(String source,String pattern,List<Integer> groups){
|
||||
return ExtractUtils.getMatchers(source, pattern, groups);
|
||||
}
|
||||
|
||||
@Comment("根据xpath在String变量中查找")
|
||||
@Example("${strVar.xpath('//title/text()')}")
|
||||
@Return({Element.class,String.class})
|
||||
|
@ -12,7 +12,6 @@ import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
import org.seimicrawler.xpath.JXDocument;
|
||||
import org.seimicrawler.xpath.JXNode;
|
||||
import org.springframework.util.StringUtils;
|
||||
|
||||
import com.alibaba.fastjson.JSONPath;
|
||||
/**
|
||||
@ -34,26 +33,55 @@ public class ExtractUtils {
|
||||
}
|
||||
|
||||
public static List<String> getMatchers(String content,String regx,boolean isGroup){
|
||||
return getMatchers(content,regx,isGroup ? 1: 0);
|
||||
}
|
||||
|
||||
public static List<String> getMatchers(String content,String regx,int groupIndex){
|
||||
Matcher matcher = compile(regx).matcher(content);
|
||||
List<String> results = new ArrayList<>();
|
||||
while(matcher.find()){
|
||||
String group = isGroup ? matcher.group(1) : matcher.group();
|
||||
if(!StringUtils.isEmpty(group)){
|
||||
results.add(group);
|
||||
results.add(matcher.group(groupIndex));
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
public static List<List<String>> getMatchers(String content,String regx,List<Integer> groups){
|
||||
Matcher matcher = compile(regx).matcher(content);
|
||||
List<List<String>> results = new ArrayList<>();
|
||||
while(matcher.find()){
|
||||
List<String> matches = new ArrayList<>();
|
||||
for (Integer groupIndex : groups) {
|
||||
matches.add(matcher.group(groupIndex));
|
||||
}
|
||||
results.add(matches);
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
public static String getFirstMatcher(String content,String regx,boolean isGroup){
|
||||
|
||||
return getFirstMatcher(content,regx,isGroup ? 1 : 0);
|
||||
}
|
||||
|
||||
public static String getFirstMatcher(String content,String regx,int groupIndex){
|
||||
Matcher matcher = compile(regx).matcher(content);
|
||||
while(matcher.find()){
|
||||
String group = isGroup ? matcher.group(1) : matcher.group();
|
||||
return group;
|
||||
if(matcher.find()){
|
||||
return matcher.group(groupIndex);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public static List<String> getFirstMatcher(String content,String regx,List<Integer> groups){
|
||||
Matcher matcher = compile(regx).matcher(content);
|
||||
List<String> matches = new ArrayList<>();
|
||||
if(matcher.find()){
|
||||
for (Integer groupIndex : groups) {
|
||||
matches.add(matcher.group(groupIndex));
|
||||
}
|
||||
}
|
||||
return matches;
|
||||
}
|
||||
|
||||
public static String getHostFromURL(String url){
|
||||
return getFirstMatcher(url, "(?<=//|)((\\w)+\\.)+\\w+", false);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user