许家凯 3 năm trước cách đây
commit
7fbac92248
21 tập tin đã thay đổi với 1372 bổ sung và 0 xóa
  1. 35 0
      .gitignore
  2. 89 0
      pom.xml
  3. 30 0
      src/main/assembly/assembly.xml
  4. 59 0
      src/main/java/cn/winhc/elasticsearch/plugin/WinhcAnalysisPlugin.java
  5. 17 0
      src/main/java/cn/winhc/elasticsearch/plugin/WinhcTokenizerClazzNameConfiguration.java
  6. 23 0
      src/main/java/cn/winhc/elasticsearch/plugin/analyzer/WinhcAnalyzer.java
  7. 41 0
      src/main/java/cn/winhc/elasticsearch/plugin/analyzer/WinhcAnalyzerProvider.java
  8. 163 0
      src/main/java/cn/winhc/elasticsearch/plugin/configuration/AnalyzeContext.java
  9. 73 0
      src/main/java/cn/winhc/elasticsearch/plugin/configuration/Configuration.java
  10. 89 0
      src/main/java/cn/winhc/elasticsearch/plugin/entity/Term.java
  11. 97 0
      src/main/java/cn/winhc/elasticsearch/plugin/handle/EmailText2Tokenizer.java
  12. 15 0
      src/main/java/cn/winhc/elasticsearch/plugin/handle/Text2Tokenizer.java
  13. 88 0
      src/main/java/cn/winhc/elasticsearch/plugin/handle/UrlTextToTokenizer.java
  14. 37 0
      src/main/java/cn/winhc/elasticsearch/plugin/test/XjkTest.java
  15. 99 0
      src/main/java/cn/winhc/elasticsearch/plugin/tokenizer/WinhcTokenizer.java
  16. 32 0
      src/main/java/cn/winhc/elasticsearch/plugin/tokenizer/WinhcTokenizerFactory.java
  17. 317 0
      src/main/java/cn/winhc/elasticsearch/plugin/util/DomainPostfixUtils.java
  18. 46 0
      src/main/java/cn/winhc/elasticsearch/plugin/util/DomainUtils.java
  19. 11 0
      src/main/java/cn/winhc/elasticsearch/plugin/util/StringUtils.java
  20. 6 0
      src/main/resources/plugin-descriptor.properties
  21. 5 0
      src/main/resources/plugin-security.policy

+ 35 - 0
.gitignore

@@ -0,0 +1,35 @@
+### Java template
+# Compiled class file
+*.class
+
+# Log file
+*.log
+
+# BlueJ files
+*.ctxt
+
+# Mobile Tools for Java (J2ME)
+.mtj.tmp/
+
+# Package Files #
+*.jar
+*.war
+*.nar
+*.ear
+*.zip
+*.tar.gz
+*.rar
+
+# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
+hs_err_pid*
+
+### Example user template template
+### Example user template
+
+# IntelliJ project files
+.idea
+*.iml
+out
+gen
+*.http
+target

+ 89 - 0
pom.xml

@@ -0,0 +1,89 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>cn.winhc</groupId>
+    <artifactId>winhc-elasticsearch-plugin</artifactId>
+    <version>1.0</version>
+
+    <properties>
+        <maven.compiler.source>8</maven.compiler.source>
+        <maven.compiler.target>8</maven.compiler.target>
+        <elasticsearch.version>6.7.0</elasticsearch.version>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <elasticsearch.description>winhc-elasticsearch-plugin</elasticsearch.description>
+        <elasticsearch.plugin.name>winhc-elasticsearch-plugin</elasticsearch.plugin.name>
+        <elasticsearch.plugin.classname>cn.winhc.elasticsearch.plugin.WinhcAnalysisPlugin</elasticsearch.plugin.classname>
+
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.elasticsearch</groupId>
+            <artifactId>elasticsearch</artifactId>
+            <version>${elasticsearch.version}</version>
+            <scope>provided</scope>
+        </dependency>
+    </dependencies>
+
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-assembly-plugin</artifactId>
+                <version>2.2</version>
+                <configuration>
+                    <descriptorRefs>
+                        <descriptorRef>jar-with-dependencies</descriptorRef>
+                    </descriptorRefs>
+                    <appendAssemblyId>false</appendAssemblyId>
+                    <outputDirectory>${project.build.directory}</outputDirectory>
+                    <descriptors>
+                        <descriptor>src/main/assembly/assembly.xml</descriptor>
+                    </descriptors>
+                </configuration>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>single</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+
+            <!--maven编译插件-->
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>3.7.0</version>
+                <configuration>
+                    <source>1.8</source>
+                    <target>1.8</target>
+                    <encoding>UTF-8</encoding>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-resources-plugin</artifactId>
+                <version>3.0.2</version>
+                <configuration>
+                    <encoding>UTF-8</encoding>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-jar-plugin</artifactId>
+                <configuration>
+                    <archive>
+<!--                        <manifestFile>src/main/resources/META-INF/MANIFEST.MF</manifestFile>-->
+                    </archive>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+
+</project>

+ 30 - 0
src/main/assembly/assembly.xml

@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<assembly>
+    <id>plugin</id>
+    <formats>
+        <format>zip</format>
+    </formats>
+    <includeBaseDirectory>false</includeBaseDirectory>
+    <fileSets>
+        <fileSet>
+            <directory>target</directory>
+            <outputDirectory>.</outputDirectory>
+            <includes>
+                <include>*.jar</include>
+            </includes>
+            <fileMode>0600</fileMode>
+            <directoryMode>0700</directoryMode>
+        </fileSet>
+    </fileSets>
+
+    <files>
+        <file>
+            <source>${project.basedir}/src/main/resources/plugin-descriptor.properties</source>
+            <filtered>true</filtered>
+        </file>
+        <file>
+            <source>${project.basedir}/src/main/resources/plugin-security.policy</source>
+            <filtered>true</filtered>
+        </file>
+    </files>
+</assembly>

+ 59 - 0
src/main/java/cn/winhc/elasticsearch/plugin/WinhcAnalysisPlugin.java

@@ -0,0 +1,59 @@
+package cn.winhc.elasticsearch.plugin;
+
+import cn.winhc.elasticsearch.plugin.analyzer.WinhcAnalyzerProvider;
+import cn.winhc.elasticsearch.plugin.handle.Text2Tokenizer;
+import cn.winhc.elasticsearch.plugin.tokenizer.WinhcTokenizerFactory;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.apache.lucene.analysis.Analyzer;
+import org.elasticsearch.index.analysis.AnalyzerProvider;
+import org.elasticsearch.index.analysis.TokenizerFactory;
+import org.elasticsearch.indices.analysis.AnalysisModule;
+import org.elasticsearch.plugins.AnalysisPlugin;
+import org.elasticsearch.plugins.Plugin;
+
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+/**
+ * Elasticsearch analysis plugin that registers one tokenizer and one analyzer for every
+ * {@link Text2Tokenizer} implementation named in {@link WinhcTokenizerClazzNameConfiguration}.
+ * Each component is registered under the name {@code "winhc_" + getTokenizerName()}.
+ *
+ * @author: XuJiakai
+ * 2021/11/8 09:56
+ */
+public class WinhcAnalysisPlugin extends Plugin implements AnalysisPlugin {
+    public static final String PLUGIN_NAME = "winhc-plugin";
+    private static final Logger logger = LogManager.getLogger(WinhcAnalysisPlugin.class);
+
+    public WinhcAnalysisPlugin() {
+        super();
+        logger.info(PLUGIN_NAME + " installed into elasticsearch");
+    }
+
+    /**
+     * Instantiates every configured {@link Text2Tokenizer} through its no-arg constructor.
+     * Classes that cannot be loaded or instantiated are logged and left out of the result.
+     *
+     * @return the successfully created tokenizer implementations (never {@code null})
+     */
+    public static Set<Text2Tokenizer> getText2Token() {
+        return WinhcTokenizerClazzNameConfiguration.clazzName.stream().map(c -> {
+            try {
+                return ((Text2Tokenizer) Class.forName(c).newInstance());
+            } catch (Exception e) {
+                // The exception must be the trailing argument WITHOUT its own placeholder;
+                // otherwise Log4j2 formats it with toString() and the stack trace is lost.
+                logger.error("new instance error: {}", c, e);
+                return null;
+            }
+        }).filter(Objects::nonNull).collect(Collectors.toSet());
+    }
+
+    /**
+     * @return tokenizer factories keyed by {@code "winhc_" + tokenizer name}
+     */
+    @Override
+    public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
+        Set<Text2Tokenizer> text2Token = getText2Token();
+        Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> collect = text2Token.stream().collect(Collectors.toMap(o -> "winhc_" + o.getTokenizerName(), o -> (indexSettings, environment, name, settings) -> WinhcTokenizerFactory.getTokenizerFactory(indexSettings, environment, name, settings, o)));
+        logger.info("register tokenizers : {}", collect.keySet());
+        return collect;
+    }
+
+    /**
+     * @return analyzer providers keyed by {@code "winhc_" + tokenizer name}
+     */
+    @Override
+    public Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
+        Set<Text2Tokenizer> text2Token = getText2Token();
+        Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> collect = text2Token.stream().collect(Collectors.toMap(o -> "winhc_" + o.getTokenizerName(), o -> (AnalysisModule.AnalysisProvider) (indexSettings, environment, name, settings) -> WinhcAnalyzerProvider.getInstance(indexSettings, environment, name, settings, o)));
+        logger.info("register analyzer : {}", collect.keySet());
+        return collect;
+    }
+}

+ 17 - 0
src/main/java/cn/winhc/elasticsearch/plugin/WinhcTokenizerClazzNameConfiguration.java

@@ -0,0 +1,17 @@
+package cn.winhc.elasticsearch.plugin;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Holds the fully-qualified class names of the tokenizer implementations that
+ * the plugin loads reflectively.
+ *
+ * @author: XuJiakai
+ * 2021/11/12 15:17
+ */
+public class WinhcTokenizerClazzNameConfiguration {
+
+    /** Class names of the tokenizer implementations to instantiate. */
+    public static final List<String> clazzName = Arrays.asList(
+            "cn.winhc.elasticsearch.plugin.handle.UrlTextToTokenizer",
+            "cn.winhc.elasticsearch.plugin.handle.EmailText2Tokenizer"
+    );
+
+}

+ 23 - 0
src/main/java/cn/winhc/elasticsearch/plugin/analyzer/WinhcAnalyzer.java

@@ -0,0 +1,23 @@
+package cn.winhc.elasticsearch.plugin.analyzer;
+
+import cn.winhc.elasticsearch.plugin.handle.Text2Tokenizer;
+import cn.winhc.elasticsearch.plugin.tokenizer.WinhcTokenizer;
+import org.apache.lucene.analysis.Analyzer;
+
+/**
+ * @author: XuJiakai
+ * 2021/11/8 10:00
+ */
+public class WinhcAnalyzer extends Analyzer {
+    private Text2Tokenizer text2Tokenizer;
+
+    public WinhcAnalyzer(Text2Tokenizer text2Tokenizer) {
+        super();
+        this.text2Tokenizer = text2Tokenizer;
+    }
+
+    @Override
+    protected TokenStreamComponents createComponents(String s) {
+        return new TokenStreamComponents(new WinhcTokenizer(text2Tokenizer));
+    }
+}

+ 41 - 0
src/main/java/cn/winhc/elasticsearch/plugin/analyzer/WinhcAnalyzerProvider.java

@@ -0,0 +1,41 @@
+package cn.winhc.elasticsearch.plugin.analyzer;
+
+import cn.winhc.elasticsearch.plugin.handle.Text2Tokenizer;
+import org.apache.lucene.analysis.Analyzer;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
+
+/**
+ * Index-level provider that hands out {@link WinhcAnalyzer} instances bound to one
+ * {@link Text2Tokenizer}.
+ *
+ * @author: XuJiakai
+ * 2021/11/8 10:00
+ */
+public class WinhcAnalyzerProvider extends AbstractIndexAnalyzerProvider<Analyzer> {
+
+    /** Tokenizing strategy passed to every analyzer created by {@link #get()}. */
+    private Text2Tokenizer text2Tokenizer;
+
+    /**
+     * Constructs a new analyzer component, with the index name and its settings and the analyzer name.
+     *
+     * @param indexSettings the settings and the name of the index
+     * @param name          The analyzer name
+     * @param settings      the analyzer settings, forwarded to the superclass
+     */
+    public WinhcAnalyzerProvider(IndexSettings indexSettings, String name, Settings settings) {
+        super(indexSettings, name, settings);
+    }
+
+    /**
+     * Factory used by the plugin registration: creates the provider and attaches the tokenizer
+     * strategy. The {@code environment} argument is accepted but not used.
+     */
+    public static WinhcAnalyzerProvider getInstance(IndexSettings indexSettings, Environment environment, String name, Settings settings, Text2Tokenizer text2Tokenizer) {
+        WinhcAnalyzerProvider provider = new WinhcAnalyzerProvider(indexSettings, name, settings);
+        provider.text2Tokenizer = text2Tokenizer;
+        return provider;
+    }
+
+    /**
+     * @return a new {@link WinhcAnalyzer} on every call
+     */
+    @Override
+    public Analyzer get() {
+        return new WinhcAnalyzer(text2Tokenizer);
+    }
+}

+ 163 - 0
src/main/java/cn/winhc/elasticsearch/plugin/configuration/AnalyzeContext.java

@@ -0,0 +1,163 @@
+package cn.winhc.elasticsearch.plugin.configuration;
+
+import cn.winhc.elasticsearch.plugin.entity.Term;
+import cn.winhc.elasticsearch.plugin.handle.Text2Tokenizer;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Reads a character stream in bounded segments, passes each segment to a
+ * {@link Text2Tokenizer} and hands the resulting {@link Term}s out one at a time.
+ * Term offsets are shifted so that they are relative to the whole input, not to
+ * the segment they were found in.
+ *
+ * @author: XuJiakai
+ * 2021/11/8 09:03
+ */
+public class AnalyzeContext {
+
+    /**
+     * Input being tokenized.
+     */
+    private Reader input;
+    /**
+     * Terms of the segment that is currently being drained.
+     */
+    private Iterator<Term> iterator;
+    /**
+     * Number of characters already handed to the tokenizer. Because the input is
+     * processed segment by segment, this value is added to every term offset.
+     */
+    int offset;
+    /**
+     * Buffer size in characters.
+     */
+    private static final int BUFFER_SIZE = 4096;
+    /**
+     * Read buffer.
+     */
+    private char[] buffer = new char[BUFFER_SIZE];
+    /**
+     * Number of buffered characters carried over from the previous segment.
+     */
+    private int remainSize = 0;
+    private Text2Tokenizer text2Tokenizer;
+
+    /**
+     * Sentence delimiters used to pick a split point when the buffer is full.
+     * Each punctuation mark is listed in its ASCII and its full-width form.
+     */
+    private static final Set<Character> delimiterCharSet = new HashSet<>();
+
+    static {
+        delimiterCharSet.add('\r');
+        delimiterCharSet.add('\n');
+        delimiterCharSet.add('\u3002'); // ideographic full stop
+        delimiterCharSet.add('!');
+        delimiterCharSet.add('\uFF01'); // full-width exclamation mark
+        delimiterCharSet.add(',');
+        delimiterCharSet.add('\uFF0C'); // full-width comma
+        delimiterCharSet.add('?');
+        delimiterCharSet.add('\uFF1F'); // full-width question mark
+        delimiterCharSet.add(';');
+        delimiterCharSet.add('\uFF1B'); // full-width semicolon
+    }
+
+    public AnalyzeContext(Reader reader, Text2Tokenizer text2Tokenizer) {
+        this.input = reader;
+        this.text2Tokenizer = text2Tokenizer;
+    }
+
+    /**
+     * Re-targets this context at a new input and discards all per-stream state.
+     *
+     * @param reader the new input
+     */
+    public void reset(Reader reader) {
+        input = reader;
+        offset = 0;
+        iterator = null;
+        // Drop characters left over from a stream that was not read to the end,
+        // otherwise they would be prepended to the new input.
+        remainSize = 0;
+    }
+
+    /**
+     * Returns the next term, reading and tokenizing further segments as needed.
+     *
+     * @return the next term, or {@code null} once the input is exhausted
+     * @throws IOException if reading the input fails
+     */
+    public Term next() throws IOException {
+        // Keep reading segments until one yields a term; a segment without terms
+        // must not end the stream while more input is available.
+        while (iterator == null || !iterator.hasNext()) {
+            String line = readLine();
+            if (line == null) {
+                return null;
+            }
+
+            List<Term> termList = text2Tokenizer.getTextTokenizer(line);
+            if (termList != null) {
+                for (Term term : termList) {
+                    // Shift start and end together so the term stays consistent.
+                    term.setOffset(term.getOffset() + offset);
+                    term.setEnd(term.getEnd() + offset);
+                }
+                iterator = termList.iterator();
+            }
+            offset += line.length();
+        }
+        return iterator.next();
+    }
+
+    /**
+     * Fills the buffer and returns the next segment. A segment is the whole remaining
+     * input when it fits into the buffer; otherwise it ends after the last sentence
+     * delimiter in the buffer and the rest is carried over to the next call.
+     *
+     * @return the next segment, or {@code null} when nothing is left
+     */
+    private String readLine() throws IOException {
+        int filled = remainSize;
+        // Reader.read may return fewer characters than requested without being at the
+        // end of the stream, so keep reading until the buffer is full or EOF is hit.
+        while (filled < BUFFER_SIZE) {
+            int n = input.read(buffer, filled, BUFFER_SIZE - filled);
+            if (n < 0) {
+                break;
+            }
+            filled += n;
+        }
+        if (filled == 0) {
+            remainSize = 0;
+            return null;
+        }
+
+        // End of the last complete sentence inside the buffer
+        int eos = lastIndexOfEos(buffer, filled);
+        String line = new String(buffer, 0, eos);
+        remainSize = filled - eos;
+        if (remainSize > 0) {
+            // Move the unprocessed tail to the start of the buffer
+            System.arraycopy(buffer, eos, buffer, 0, remainSize);
+        }
+        return line;
+    }
+
+    /**
+     * Finds where the segment should end.
+     *
+     * @param buffer the characters read so far
+     * @param length number of valid characters in {@code buffer}
+     * @return {@code length} when the buffer is not full or contains no delimiter after
+     * index 0, otherwise the index just past the last delimiter
+     */
+    private int lastIndexOfEos(char[] buffer, int length) {
+        if (length < BUFFER_SIZE) {
+            return length;
+        }
+        for (int i = length - 1; i > 0; i--) {
+            if (delimiterCharSet.contains(buffer[i])) {
+                return i + 1;
+            }
+        }
+        return length;
+    }
+
+    /**
+     * @return number of characters handed to the tokenizer so far
+     */
+    public int getOffset() {
+        return offset;
+    }
+}

+ 73 - 0
src/main/java/cn/winhc/elasticsearch/plugin/configuration/Configuration.java

@@ -0,0 +1,73 @@
+package cn.winhc.elasticsearch.plugin.configuration;
+
+/**
+ * @author: XuJiakai
+ * 2021/11/8 16:17
+ */
+
+import cn.winhc.elasticsearch.plugin.WinhcAnalysisPlugin;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.io.PathUtils;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+
+import java.io.File;
+import java.nio.file.Path;
+
+public class Configuration {
+
+    private Environment environment;
+    private Settings settings;
+
+    // whether smart segmentation is enabled
+    private boolean useSmart;
+
+    // whether remote dictionary loading is enabled
+    private boolean enableRemoteDict;
+
+    // whether lowercase processing is enabled
+    private boolean enableLowercase;
+
+
+    /**
+     * Reads the three boolean switches from {@code settings}; a switch is on only when its
+     * value is exactly the string {@code "true"}. Defaults: {@code use_smart=false},
+     * {@code enable_lowercase=true}, {@code enable_remote_dict=true}.
+     */
+    @Inject
+    public Configuration(Environment env, Settings settings) {
+        this.environment = env;
+        this.settings = settings;
+
+        this.useSmart = "true".equals(settings.get("use_smart", "false"));
+        this.enableLowercase = "true".equals(settings.get("enable_lowercase", "true"));
+        this.enableRemoteDict = "true".equals(settings.get("enable_remote_dict", "true"));
+    }
+
+    /**
+     * @return absolute path of the {@code config} directory next to the location the plugin
+     * class was loaded from
+     */
+    public Path getConfigInPluginDir() {
+        String codeSourcePath = WinhcAnalysisPlugin.class.getProtectionDomain().getCodeSource().getLocation().getPath();
+        String pluginDir = new File(codeSourcePath).getParent();
+        return PathUtils.get(pluginDir, "config").toAbsolutePath();
+    }
+
+    public boolean isUseSmart() {
+        return useSmart;
+    }
+
+    /**
+     * @return this instance, to allow chaining
+     */
+    public Configuration setUseSmart(boolean useSmart) {
+        this.useSmart = useSmart;
+        return this;
+    }
+
+    public Environment getEnvironment() {
+        return environment;
+    }
+
+    public Settings getSettings() {
+        return settings;
+    }
+
+    public boolean isEnableRemoteDict() {
+        return enableRemoteDict;
+    }
+
+    public boolean isEnableLowercase() {
+        return enableLowercase;
+    }
+}

+ 89 - 0
src/main/java/cn/winhc/elasticsearch/plugin/entity/Term.java

@@ -0,0 +1,89 @@
+package cn.winhc.elasticsearch.plugin.entity;
+
+/**
+ * A single token: its text, its start and end position, and a type label.
+ * Equality and hash code consider offset, end and text only; the type label is ignored.
+ *
+ * @author: XuJiakai
+ * 2021/11/8 09:03
+ */
+
+public class Term {
+    // start offset of the token
+    private int offset;
+    // end position of the token
+    private int end;
+    // token text
+    private String text;
+    // token type
+    private String lexemeType;
+
+    /**
+     * Creates a term from its four components.
+     */
+    public static Term of(String text, int offset, int end, String lexemeType) {
+        Term created = new Term();
+        created.text = text;
+        created.offset = offset;
+        created.end = end;
+        created.lexemeType = lexemeType;
+        return created;
+    }
+
+
+    public int getOffset() {
+        return offset;
+    }
+
+    public void setOffset(int offset) {
+        this.offset = offset;
+    }
+
+    public int getEnd() {
+        return end;
+    }
+
+    public void setEnd(int end) {
+        this.end = end;
+    }
+
+    public String getText() {
+        return text;
+    }
+
+    public void setText(String text) {
+        this.text = text;
+    }
+
+    public String getLexemeType() {
+        return lexemeType;
+    }
+
+    public void setLexemeType(String lexemeType) {
+        this.lexemeType = lexemeType;
+    }
+
+    /**
+     * Renders the term as a JSON-like string (text values are not escaped).
+     */
+    @Override
+    public String toString() {
+        StringBuilder sb = new StringBuilder("{");
+        sb.append("\"offset\":").append(offset);
+        sb.append(",\"end\":").append(end);
+        sb.append(", \"text\":\"").append(text).append('\"');
+        sb.append(", \"lexemeType\":\"").append(lexemeType).append('\"');
+        sb.append('}');
+        return sb.toString();
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) {
+            return true;
+        }
+        if (o == null || getClass() != o.getClass()) {
+            return false;
+        }
+        Term other = (Term) o;
+        boolean sameText = (text == null) ? other.text == null : text.equals(other.text);
+        return offset == other.offset && end == other.end && sameText;
+    }
+
+    @Override
+    public int hashCode() {
+        int textHash = (text == null) ? 0 : text.hashCode();
+        return 31 * (31 * offset + end) + textHash;
+    }
+}

+ 97 - 0
src/main/java/cn/winhc/elasticsearch/plugin/handle/EmailText2Tokenizer.java

@@ -0,0 +1,97 @@
+package cn.winhc.elasticsearch.plugin.handle;
+
+import cn.winhc.elasticsearch.plugin.entity.Term;
+import cn.winhc.elasticsearch.plugin.util.DomainPostfixUtils;
+import cn.winhc.elasticsearch.plugin.util.DomainUtils;
+import cn.winhc.elasticsearch.plugin.util.StringUtils;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.stream.Collectors;
+
+/**
+ * Splits the e-mail addresses found in a line into terms: local part, domain, domain
+ * without its postfix, and the dot-separated pieces of that trimmed domain. A line
+ * without any e-mail address is returned as a single {@code "keyword"} term.
+ *
+ * @author: XuJiakai
+ * 2021/11/12 14:21
+ */
+public class EmailText2Tokenizer implements Text2Tokenizer {
+    public static final String name = "email";
+
+    @Override
+    public String getTokenizerName() {
+        return name;
+    }
+
+    /**
+     * Finds {@code part} in {@code text} starting at {@code from}. Searching from the
+     * position of the enclosing address keeps the offset inside that address even when
+     * the same substring also occurs earlier in the line. Falls back to a search from the
+     * start of the text when nothing is found at or after {@code from}.
+     *
+     * @return the index of {@code part}, or -1 if it does not occur in {@code text}
+     */
+    private static int indexFrom(String text, String part, int from) {
+        int idx = text.indexOf(part, Math.max(from, 0));
+        return idx >= 0 ? idx : text.indexOf(part);
+    }
+
+    /**
+     * Builds the terms for one e-mail address.
+     *
+     * @param orgEmail the complete line the address was found in
+     * @param email    the address itself
+     * @return the de-duplicated terms; empty when {@code orgEmail} is blank
+     */
+    private static List<Term> email2Tokenizer(String orgEmail, String email) {
+        if (StringUtils.isBlank(orgEmail)) {
+            return Arrays.asList();
+        }
+
+        ArrayList<Term> list = new ArrayList<>();
+
+        // Position of the address inside the line; every part is located relative to it.
+        int emailIndex = orgEmail.indexOf(email);
+
+        String[] split = email.split("@");
+
+        if (split.length == 2) {
+            String emailPre = split[0];
+            String emailPost = split[1];
+            String trimEmailPost = DomainPostfixUtils.trimDomainPostfix(emailPost);
+
+            int emailPreIndex = indexFrom(orgEmail, emailPre, emailIndex);
+            int emailPostIndex = indexFrom(orgEmail, emailPost, emailPreIndex + emailPre.length());
+            int trimEmailPostIndex = indexFrom(orgEmail, trimEmailPost, emailPostIndex);
+
+            list.add(
+                    Term.of(emailPre, emailPreIndex, emailPreIndex + emailPre.length(), "email_pre")
+            );
+            list.add(
+                    Term.of(emailPost, emailPostIndex, emailPostIndex + emailPost.length(), "email_post")
+            );
+            list.add(
+                    Term.of(trimEmailPost, trimEmailPostIndex, trimEmailPostIndex + trimEmailPost.length(), "email_post_trim")
+            );
+
+            if (trimEmailPost.contains(".")) {
+                // Walk through the pieces left to right so repeated labels get distinct offsets.
+                int cursor = trimEmailPostIndex;
+                for (String s : trimEmailPost.split("\\.")) {
+                    int subEmailPostIndex = indexFrom(orgEmail, s, cursor);
+                    list.add(
+                            Term.of(s, subEmailPostIndex, subEmailPostIndex + s.length(), "sub_email_post")
+                    );
+                    if (subEmailPostIndex >= 0) {
+                        cursor = subEmailPostIndex + s.length();
+                    }
+                }
+            }
+
+        }
+
+
+        if (!orgEmail.equals(email)) {
+            list.add(
+                    Term.of(email, emailIndex, emailIndex + email.length(), "org_email")
+            );
+        }
+        return list.stream().distinct().collect(Collectors.toList());
+    }
+
+
+    /**
+     * @param line the text segment to tokenize
+     * @return the terms of every e-mail address in {@code line}, or one {@code "keyword"}
+     * term covering the whole line when no terms were produced
+     */
+    @Override
+    public List<Term> getTextTokenizer(String line) {
+        List<Term> collect = DomainUtils.getEmails(line).stream()
+                .flatMap(r -> {
+                    List<Term> terms = email2Tokenizer(line, r);
+                    return terms.stream();
+                }).collect(Collectors.toList());
+        if (collect.isEmpty()) {
+            return Collections.singletonList(Term.of(line, 0, line.length(), "keyword"));
+        } else {
+            return collect;
+        }
+    }
+
+    public static void main(String[] args) {
+        EmailText2Tokenizer emailText2Tokenizer = new EmailText2Tokenizer();
+        List<Term> textTokenizer = emailText2Tokenizer.getTextTokenizer("xjk@abc.topme.pro你好吗xu-jk@qq.com");
+        System.out.println(textTokenizer);
+    }
+}

+ 15 - 0
src/main/java/cn/winhc/elasticsearch/plugin/handle/Text2Tokenizer.java

@@ -0,0 +1,15 @@
+package cn.winhc.elasticsearch.plugin.handle;
+
+import cn.winhc.elasticsearch.plugin.entity.Term;
+
+import java.util.List;
+
+/**
+ * Strategy that turns a piece of text into {@link Term}s.
+ *
+ * @author: XuJiakai
+ * 2021/11/8 09:08
+ */
+public interface Text2Tokenizer {
+    /**
+     * @return the short name of this tokenizer
+     */
+    String getTokenizerName();
+
+    /**
+     * Tokenizes one text segment.
+     *
+     * @param line the text segment to tokenize
+     * @return the terms found in {@code line}
+     */
+    List<Term> getTextTokenizer(String line);
+}

+ 88 - 0
src/main/java/cn/winhc/elasticsearch/plugin/handle/UrlTextToTokenizer.java

@@ -0,0 +1,88 @@
+package cn.winhc.elasticsearch.plugin.handle;
+
+import cn.winhc.elasticsearch.plugin.entity.Term;
+import cn.winhc.elasticsearch.plugin.util.DomainPostfixUtils;
+import cn.winhc.elasticsearch.plugin.util.DomainUtils;
+import cn.winhc.elasticsearch.plugin.util.StringUtils;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+/**
+ * Splits the domains found in a line into terms: the host matched by the URL pattern, the
+ * host without its postfix, and the dot-separated pieces of that trimmed host. A line
+ * without any domain is returned as a single {@code "keyword"} term.
+ *
+ * @author: XuJiakai
+ * 2021/11/8 09:07
+ */
+public class UrlTextToTokenizer implements Text2Tokenizer {
+    private String tokenizerName = "url";
+
+    private static final Pattern urlPattern = Pattern.compile("((http://)|(https://))?(www\\.)?([0-9a-zA-Z\\-_\\u4e00-\\u9fa5]+(\\.[a-zA-Z\\u4e00-\\u9fa5]+){1,2}).*");
+
+    @Override
+    public String getTokenizerName() {
+        return tokenizerName;
+    }
+
+    /**
+     * Finds {@code part} in {@code text} starting at {@code from}. Searching from the
+     * position of the enclosing domain keeps the offset inside that domain even when the
+     * same substring also occurs earlier in the line. Falls back to a search from the
+     * start of the text when nothing is found at or after {@code from}.
+     *
+     * @return the index of {@code part}, or -1 if it does not occur in {@code text}
+     */
+    private static int indexFrom(String text, String part, int from) {
+        int idx = text.indexOf(part, Math.max(from, 0));
+        return idx >= 0 ? idx : text.indexOf(part);
+    }
+
+    /**
+     * Builds the terms for one domain.
+     *
+     * @param orgUrl the complete line the domain was found in
+     * @param domain the domain (or URL) itself
+     * @return the de-duplicated terms; empty when {@code domain} is blank
+     */
+    private static List<Term> domain2Tokenizer(String orgUrl, String domain) {
+        if (StringUtils.isBlank(domain)) {
+            return Arrays.asList();
+        }
+
+        ArrayList<Term> list = new ArrayList<>();
+
+        // Position of the domain inside the line; every part is located relative to it.
+        int domainIndex = orgUrl.indexOf(domain);
+
+        Matcher matcher = urlPattern.matcher(domain);
+        if (matcher.matches()) {
+            String url = matcher.group(5);
+            String trimUrl = DomainPostfixUtils.trimDomainPostfix(url);
+
+            int urlIndex = indexFrom(orgUrl, url, domainIndex);
+            int trimUrlIndex = indexFrom(orgUrl, trimUrl, urlIndex);
+            list.add(
+                    Term.of(url, urlIndex, urlIndex + url.length(), "url")
+            );
+            list.add(
+                    Term.of(trimUrl, trimUrlIndex, trimUrlIndex + trimUrl.length(), "trim_url")
+            );
+            if (trimUrl.contains(".")) {
+                // Walk through the pieces left to right so repeated labels get distinct offsets.
+                int cursor = trimUrlIndex;
+                for (String s : trimUrl.split("\\.")) {
+                    int subUrlIndex = indexFrom(orgUrl, s, cursor);
+                    list.add(
+                            Term.of(s, subUrlIndex, subUrlIndex + s.length(), "sub_url")
+                    );
+                    if (subUrlIndex >= 0) {
+                        cursor = subUrlIndex + s.length();
+                    }
+                }
+            }
+        }
+        if (!orgUrl.equals(domain)) {
+            list.add(
+                    Term.of(domain, domainIndex, domainIndex + domain.length(), "org_url")
+            );
+        }
+        return list.stream().distinct().collect(Collectors.toList());
+    }
+
+    /**
+     * @param line the text segment to tokenize
+     * @return the terms of every domain in {@code line}, or one {@code "keyword"} term
+     * covering the whole line when no terms were produced
+     */
+    @Override
+    public List<Term> getTextTokenizer(String line) {
+        List<Term> collect = DomainUtils.getDomain(line).stream()
+                .flatMap(r -> {
+                    List<Term> terms = domain2Tokenizer(line, r);
+                    return terms.stream();
+                }).collect(Collectors.toList());
+        if (collect.isEmpty()) {
+            return Collections.singletonList(Term.of(line, 0, line.length(), "keyword"));
+        } else {
+            return collect;
+        }
+    }
+
+    public static void main(String[] args) {
+        UrlTextToTokenizer urlTextToTokenizer = new UrlTextToTokenizer();
+        List<Term> textTokenizer = urlTextToTokenizer.getTextTokenizer("www.xiaomi.com你好https://abc.baidu.com");
+        System.out.println(textTokenizer);
+    }
+}

+ 37 - 0
src/main/java/cn/winhc/elasticsearch/plugin/test/XjkTest.java

@@ -0,0 +1,37 @@
+package cn.winhc.elasticsearch.plugin.test;
+
+import cn.winhc.elasticsearch.plugin.entity.Term;
+import cn.winhc.elasticsearch.plugin.util.DomainPostfixUtils;
+
+import java.util.Arrays;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Manual scratch check for the URL pattern: prints the terms extracted from a sample URL.
+ *
+ * @author: XuJiakai
+ * 2021/11/9 09:26
+ */
+public class XjkTest {
+    private static final Pattern urlPattern = Pattern.compile("((http://)|(https://))?(www\\.)?([0-9a-zA-Z\\-_\\u4e00-\\u9fa5]+(\\.[a-zA-Z\\u4e00-\\u9fa5]+){1,2}).*");
+
+    /**
+     * @return a two-element list with the {@code "url"} and {@code "trim_url"} terms, or
+     * {@code null} when {@code line} does not match the pattern
+     */
+    private static Object test(String line) {
+        Matcher m = urlPattern.matcher(line);
+        if (!m.matches()) {
+            return null;
+        }
+        String host = m.group(5);
+        String trimmedHost = DomainPostfixUtils.trimDomainPostfix(host);
+        int hostStart = line.indexOf(host);
+        int trimmedStart = line.indexOf(trimmedHost);
+        Term hostTerm = Term.of(host, hostStart, hostStart + host.length(), "url");
+        Term trimmedTerm = Term.of(trimmedHost, trimmedStart, trimmedStart + trimmedHost.length(), "trim_url");
+        return Arrays.asList(hostTerm, trimmedTerm);
+    }
+
+    public static void main(String[] args) {
+        System.out.println(test("https://www.baidu.com"));
+    }
+}

+ 99 - 0
src/main/java/cn/winhc/elasticsearch/plugin/tokenizer/WinhcTokenizer.java

@@ -0,0 +1,99 @@
+package cn.winhc.elasticsearch.plugin.tokenizer;
+
+import cn.winhc.elasticsearch.plugin.configuration.AnalyzeContext;
+import cn.winhc.elasticsearch.plugin.entity.Term;
+import cn.winhc.elasticsearch.plugin.handle.Text2Tokenizer;
+import cn.winhc.elasticsearch.plugin.util.StringUtils;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+
+/**
+ * Lucene tokenizer that emits the {@link Term}s produced by an {@link AnalyzeContext}
+ * as tokens with text, type, offset and position-increment attributes.
+ *
+ * @author: XuJiakai
+ * 2021/11/8 09:00
+ */
+public class WinhcTokenizer extends Tokenizer {
+    // token text attribute
+    private final CharTermAttribute termAtt;
+    // token offset attribute
+    private final OffsetAttribute offsetAtt;
+    // position increment attribute
+    private final PositionIncrementAttribute positionAttr;
+    // token type attribute
+    private final TypeAttribute typeAtt;
+
+    /**
+     * Running base offset added to every term offset. It grows by the number of consumed
+     * characters once the context is exhausted and is cleared in {@link #end()}.
+     */
+    private int totalOffset = 0;
+
+    private AnalyzeContext analyzeContext;
+
+
+    public WinhcTokenizer(Text2Tokenizer text2Tokenizer) {
+        super();
+        offsetAtt = addAttribute(OffsetAttribute.class);
+        termAtt = addAttribute(CharTermAttribute.class);
+        positionAttr = addAttribute(PositionIncrementAttribute.class);
+        typeAtt = addAttribute(TypeAttribute.class);
+        // The real reader is supplied in reset(); this only binds the tokenizing strategy.
+        analyzeContext = new AnalyzeContext(input, text2Tokenizer);
+    }
+
+    /**
+     * Advances to the next non-blank term.
+     *
+     * @return {@code true} if a token was produced, {@code false} when the input is exhausted
+     * @throws IOException if reading the input fails
+     */
+    @Override
+    public boolean incrementToken() throws IOException {
+        this.clearAttributes();
+
+        // Skip blank terms; they are not indexed.
+        Term term = analyzeContext.next();
+        while (term != null && StringUtils.isBlank(term.getText())) {
+            term = analyzeContext.next();
+        }
+
+        if (term == null) {
+            totalOffset += analyzeContext.getOffset();
+            return false;
+        }
+
+        positionAttr.setPositionIncrement(1);
+        termAtt.setEmpty().append(term.getText());
+        typeAtt.setType(term.getLexemeType());
+        offsetAtt.setOffset(correctOffset(totalOffset + term.getOffset()),
+                correctOffset(totalOffset + term.getOffset() + term.getText().length()));
+        return true;
+    }
+
+    /**
+     * Publishes the final offset and clears the running base offset.
+     */
+    @Override
+    public void end() throws IOException {
+        super.end();
+        // Map through correctOffset like the token offsets, so the final offset stays
+        // consistent with them when a CharFilter sits in front of this tokenizer.
+        int finalOffset = correctOffset(totalOffset);
+        offsetAtt.setOffset(finalOffset, finalOffset);
+        totalOffset = 0;
+    }
+
+    /**
+     * Points the context at the current input. The original author notes that omitting
+     * this override makes bulk indexing fail.
+     */
+    @Override
+    public void reset() throws IOException {
+        super.reset();
+        analyzeContext.reset(new BufferedReader(this.input));
+    }
+}

+ 32 - 0
src/main/java/cn/winhc/elasticsearch/plugin/tokenizer/WinhcTokenizerFactory.java

@@ -0,0 +1,32 @@
+package cn.winhc.elasticsearch.plugin.tokenizer;
+
+import cn.winhc.elasticsearch.plugin.handle.Text2Tokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
+import org.elasticsearch.index.analysis.TokenizerFactory;
+
+/**
+ * Tokenizer factory that creates {@link WinhcTokenizer} instances, all sharing the
+ * {@link Text2Tokenizer} supplied at construction time.
+ *
+ * @author: XuJiakai
+ * 2021/11/8 10:03
+ */
+public class WinhcTokenizerFactory extends AbstractTokenizerFactory {
+    /** Handler passed to every tokenizer created by this factory; assigned once. */
+    private final Text2Tokenizer text2Tokenizer;
+
+    /**
+     * @param indexSettings  index settings forwarded to the parent factory
+     * @param environment    not used by this factory
+     * @param ignored        tokenizer name, forwarded to the parent factory
+     * @param settings       tokenizer settings forwarded to the parent factory
+     * @param text2Tokenizer handler given to each created {@link WinhcTokenizer}
+     */
+    public WinhcTokenizerFactory(IndexSettings indexSettings, Environment environment, String ignored, Settings settings, Text2Tokenizer text2Tokenizer) {
+        super(indexSettings, ignored, settings);
+        this.text2Tokenizer = text2Tokenizer;
+    }
+
+
+    /**
+     * Static factory method with the same arguments as the constructor.
+     *
+     * @return a new {@link WinhcTokenizerFactory}
+     */
+    public static TokenizerFactory getTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings, Text2Tokenizer text2Tokenizer) {
+        return new WinhcTokenizerFactory(indexSettings, environment, name, settings, text2Tokenizer);
+    }
+
+    /**
+     * @return a new {@link WinhcTokenizer} using this factory's {@link Text2Tokenizer}
+     */
+    @Override
+    public Tokenizer create() {
+        return new WinhcTokenizer(text2Tokenizer);
+    }
+}

+ 317 - 0
src/main/java/cn/winhc/elasticsearch/plugin/util/DomainPostfixUtils.java

@@ -0,0 +1,317 @@
+package cn.winhc.elasticsearch.plugin.util;
+
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Strips a known domain postfix (for example {@code .com} or {@code .com.cn}) from a
+ * domain string.
+ *
+ * @author: XuJiakai
+ * 2021/11/8 09:37
+ */
+public class DomainPostfixUtils {
+
+    /** Known domain postfixes; every entry starts with a dot. */
+    private static final Set<String> internationalDomain = new HashSet<String>() {
+        {
+            add(".com");
+            add(".edu.cn");
+            add(".jp");
+            add(".sb");
+            add(".mx");
+            add(".sj");
+            add(".ph");
+            add(".er");
+            add(".fk");
+            add(".id");
+            add(".nm.cn");
+            add(".mu");
+            add(".sz");
+            add(".mn");
+            add(".gm");
+            add(".cd");
+            add(".fm");
+            add(".eu");
+            add(".qa");
+            add(".hn");
+            add(".mw");
+            add(".bd");
+            add(".ly");
+            add(".us");
+            add(".xxx");
+            add(".so");
+            add(".do");
+            add(".la");
+            add(".by");
+            add(".vu");
+            add(".ki");
+            add(".ne");
+            add(".es");
+            add(".mo");
+            add(".bm");
+            add(".mm");
+            add(".fi");
+            add(".td");
+            add(".hu");
+            add(".zw");
+            add(".kh");
+            add(".sn");
+            add(".in");
+            add(".cc");
+            add(".int");
+            add(".mr");
+            add(".ah.cn");
+            add(".um");
+            add(".pt");
+            add(".be");
+            add(".ru");
+            add(".gl");
+            add(".ms");
+            add(".cy");
+            add(".ch");
+            add(".gov");
+            add(".cv");
+            add(".tr");
+            add(".pro");
+            add(".com.cn");
+            add(".fj.cn");
+            add(".af");
+            add(".ro");
+            add(".gt");
+            add(".az");
+            add(".lu");
+            add(".gs.cn");
+            add(".mp");
+            add(".jx.cn");
+            add(".ua");
+            add(".bo");
+            add(".tm");
+            add(".nr");
+            add(".pk");
+            add(".as");
+            add(".ps");
+            add(".mt");
+            add(".ie");
+            add(".bj");
+            add(".gg");
+            add(".bt");
+            add(".vg");
+            add(".ag");
+            add(".cn");
+            add(".dj");
+            add(".ls");
+            add(".gy");
+            add(".kw");
+            add(".pf");
+            add(".bh");
+            add(".bf");
+            add(".al");
+            add(".dk");
+            add(".uz");
+            add(".lc");
+            add(".nz");
+            add(".ws");
+            add(".de");
+            add(".edu");
+            add(".fr");
+            add(".tz");
+            add(".nc");
+            add(".org");
+            add(".bg");
+            add(".pg");
+            add(".name");
+            add(".ck");
+            add(".mh");
+            add(".tl");
+            add(".ac.cn");
+            add(".yr");
+            add(".ln.cn");
+            add(".tf");
+            add(".nl");
+            add(".ai");
+            add(".gp");
+            add(".fo");
+            add(".gd");
+            add(".ao");
+            add(".yt");
+            add(".np");
+            add(".ba");
+            add(".ci");
+            add(".an");
+            add(".ar");
+            add(".hi.cn");
+            add(".jo");
+            add(".sv");
+            add(".aw");
+            add(".gs");
+            add(".aq");
+            add(".rw");
+            add(".mk");
+            add(".nf");
+            add(".vi");
+            add(".tp");
+            add(".va");
+            add(".sk");
+            add(".tn");
+            add(".bs");
+            add(".tk");
+            add(".wf");
+            add(".kp");
+            add(".st");
+            add(".vn");
+            add(".mo.cn");
+            add(".coop");
+            add(".hk");
+            add(".na");
+            add(".to");
+            add(".nu");
+            add(".li");
+            add(".gr");
+            add(".jm");
+            add(".hb.cn");
+            add(".gw");
+            add(".lr");
+            add(".eg");
+            add(".js.cn");
+            add(".ec");
+            add(".pn");
+            add(".sm");
+            add(".je");
+            add(".jl.cn");
+            add(".biz");
+            add(".kr");
+            add(".lt");
+            add(".im");
+            add(".si");
+            add(".pl");
+            add(".ml");
+            add(".au");
+            add(".md");
+            add(".at");
+            add(".za");
+            add(".mc");
+            add(".ac");
+            add(".my");
+            add(".ht");
+            add(".bn");
+            add(".is");
+            add(".ke");
+            add(".se");
+            add(".ve");
+            add(".ma");
+            add(".ug");
+            add(".hl.cn");
+            add(".cu");
+            add(".ge");
+            add(".ee");
+            add(".aero");
+            add(".bz");
+            add(".lb");
+            add(".py");
+            add(".gx.cn");
+            add(".museum");
+            add(".ha.cn");
+            add(".net.cn");
+            add(".org.cn");
+            add(".bw");
+            add(".gq");
+            add(".tv");
+            add(".pr");
+            add(".bj.cn");
+            add(".gov.cn");
+            add(".mq");
+            add(".cr");
+            add(".info");
+            add(".tg");
+            add(".ye");
+            add(".ca");
+            add(".gz.cn");
+            add(".uy");
+            add(".bb");
+            add(".ae");
+            add(".mv");
+            add(".re");
+            add(".pw");
+            add(".ga");
+            add(".yu");
+            add(".ad");
+            add(".nx.cn");
+            add(".kn");
+            add(".zm");
+            add(".ir");
+            add(".hm");
+            add(".gu");
+            add(".pa");
+            add(".pe");
+            add(".hr");
+            add(".no");
+            add(".cx");
+            add(".net");
+            add(".tt");
+            add(".cl");
+            add(".he.cn");
+            add(".lv");
+            add(".cm");
+            add(".pm");
+            add(".br");
+            add(".gf");
+            add(".gn");
+            add(".gh");
+            add(".sc");
+            add(".sy");
+            add(".cz");
+            add(".mz");
+            add(".iq");
+            add(".cg");
+            add(".it");
+            add(".cf");
+            add(".bv");
+            add(".kz");
+            add(".vc");
+            add(".sl");
+            add(".mil");
+            add(".mg");
+            add(".io");
+            add(".sh");
+            add(".gi");
+            add(".sd");
+            add(".dz");
+            add(".tc");
+            add(".ky");
+            add(".km");
+            add(".bi");
+            add(".idv");
+            add(".sr");
+            add(".om");
+            add(".co");
+            add(".eh");
+            add(".uk");
+            add(".kg");
+            add(".ng");
+            add(".gd.cn");
+            add(".fj");
+            add(".il");
+            add(".sa");
+            add(".lk");
+            add(".tw");
+            add(".dm");
+            add(".sg");
+            add(".am");
+            add(".hn.cn");
+            add(".th");
+            add(".tj");
+            add(".hk.cn");
+            add(".et");
+            add(".ni");
+            add(".cq.cn");
+        }
+    };
+
+
+    /** Every postfix considered by {@link #trimDomainPostfix(String)}. */
+    private static final Set<String> allDomain = new HashSet<>(internationalDomain);
+
+
+    /**
+     * Removes the longest known postfix from the end of {@code domain}.
+     *
+     * @param domain the domain to trim; may be {@code null}
+     * @return {@code null} for a {@code null} input, the domain without its longest
+     *         matching postfix, or the unchanged domain when no postfix matches
+     */
+    public static String trimDomainPostfix(String domain) {
+        if (domain == null) {
+            return null;
+        }
+        // Several postfixes can match at once (".cn" and ".com.cn"). Always strip the
+        // longest one so the result does not depend on the set's iteration order.
+        String longest = null;
+        for (String postfix : allDomain) {
+            if (domain.endsWith(postfix) && (longest == null || postfix.length() > longest.length())) {
+                longest = postfix;
+            }
+        }
+        return longest == null ? domain : domain.substring(0, domain.length() - longest.length());
+    }
+}

+ 46 - 0
src/main/java/cn/winhc/elasticsearch/plugin/util/DomainUtils.java

@@ -0,0 +1,46 @@
+package cn.winhc.elasticsearch.plugin.util;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Regex-based helpers that extract domain/URL strings and e-mail addresses from text.
+ *
+ * @author: XuJiakai
+ * 2021/11/9 09:48
+ */
+public class DomainUtils {
+
+    // local-part '@' first label, followed by one or more ".label" groups.
+    // NOTE(review): domain labels containing digits (e.g. 163.com) and local parts
+    // containing '.' are not matched - confirm whether that is intended.
+    private static final Pattern email = Pattern.compile("[a-z_A-Z0-9\\-]+@[a-zA-Z\\-]+(\\.[a-zA-Z\\-]+)+");
+
+    // optional scheme, optional "www.", a host with one or two alphabetic suffix labels,
+    // and an optional path that runs to the end of the line.
+    private static final Pattern domainPattern = Pattern.compile("((http://)|(https://))?(www\\.)?([0-9a-zA-Z\\-_]+(\\.[a-zA-Z]+){1,2})(/.+)?");
+
+
+    /**
+     * Finds every domain/URL occurrence in {@code text}.
+     *
+     * @param text the text to scan; {@code null} yields an empty list
+     * @return the matches in order of appearance
+     */
+    public static List<String> getDomain(String text) {
+        return findAll(domainPattern, text);
+    }
+
+
+    /**
+     * Finds every e-mail address in {@code text}.
+     *
+     * @param text the text to scan; {@code null} yields an empty list
+     * @return the matches in order of appearance
+     */
+    public static List<String> getEmails(String text) {
+        return findAll(email, text);
+    }
+
+    /** Collects all non-overlapping matches of {@code pattern} in {@code text}. */
+    private static List<String> findAll(Pattern pattern, String text) {
+        List<String> matches = new ArrayList<>();
+        if (text == null) {
+            return matches;
+        }
+        Matcher matcher = pattern.matcher(text);
+        while (matcher.find()) {
+            matches.add(matcher.group());
+        }
+        return matches;
+    }
+
+    /** Manual smoke check that prints the domains found in a sample string. */
+    public static void main(String[] args) {
+        List<String> domain = getDomain("https://www.baidu.com.pro你好http://baidu.com.cn完全vbindex.html");
+
+        System.out.println(domain);
+
+    }
+}

+ 11 - 0
src/main/java/cn/winhc/elasticsearch/plugin/util/StringUtils.java

@@ -0,0 +1,11 @@
+package cn.winhc.elasticsearch.plugin.util;
+
+/**
+ * Minimal string helpers used by the plugin.
+ *
+ * @author: XuJiakai
+ * 2021/11/8 09:49
+ */
+public class StringUtils {
+    /**
+     * Tells whether a string carries no visible content.
+     *
+     * @param str the string to inspect; may be {@code null}
+     * @return {@code true} if {@code str} is {@code null} or is empty after trimming
+     */
+    public static Boolean isBlank(String str) {
+        if (str == null) {
+            return true;
+        }
+        return str.trim().isEmpty();
+    }
+}

+ 6 - 0
src/main/resources/plugin-descriptor.properties

@@ -0,0 +1,6 @@
+description=${elasticsearch.description}
+version=${project.version}
+name=${elasticsearch.plugin.name}
+classname=${elasticsearch.plugin.classname}
+java.version=${maven.compiler.target}
+elasticsearch.version=${elasticsearch.version}

+ 5 - 0
src/main/resources/plugin-security.policy

@@ -0,0 +1,5 @@
+grant {
+  // NOTE(review): AllPermission implies every other permission, so the two
+  // RuntimePermission entries below add nothing; confirm whether such a broad
+  // grant is really needed or whether it can be narrowed.
+  permission java.security.AllPermission;
+  permission java.lang.RuntimePermission "createClassLoader";
+  permission java.lang.RuntimePermission "getClassLoader";
+};