|
@@ -0,0 +1,97 @@
|
|
|
+package cn.winhc.elasticsearch.plugin.handle;
|
|
|
+
|
|
|
+import cn.winhc.elasticsearch.plugin.entity.Term;
|
|
|
+import cn.winhc.elasticsearch.plugin.util.DomainPostfixUtils;
|
|
|
+import cn.winhc.elasticsearch.plugin.util.DomainUtils;
|
|
|
+import cn.winhc.elasticsearch.plugin.util.StringUtils;
|
|
|
+
|
|
|
+import java.util.ArrayList;
|
|
|
+import java.util.Arrays;
|
|
|
+import java.util.Collections;
|
|
|
+import java.util.List;
|
|
|
+import java.util.stream.Collectors;
|
|
|
+
|
|
|
+/**
|
|
|
+ * @author: XuJiakai
|
|
|
+ * 2021/11/12 14:21
|
|
|
+ */
|
|
|
+public class EmailText2Tokenizer implements Text2Tokenizer {
|
|
|
+ public static final String name = "email";
|
|
|
+
|
|
|
+ @Override
|
|
|
+ public String getTokenizerName() {
|
|
|
+ return name;
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ private static List<Term> email2Tokenizer(String orgEmail, String email) {
|
|
|
+ if (StringUtils.isBlank(orgEmail)) {
|
|
|
+ return Arrays.asList();
|
|
|
+ }
|
|
|
+
|
|
|
+ ArrayList<Term> list = new ArrayList<>();
|
|
|
+
|
|
|
+
|
|
|
+ String[] split = email.split("@");
|
|
|
+
|
|
|
+ if (split.length == 2) {
|
|
|
+ String emailPre = split[0];
|
|
|
+ String emailPost = split[1];
|
|
|
+ String trimEmailPost = DomainPostfixUtils.trimDomainPostfix(emailPost);
|
|
|
+
|
|
|
+ int emailPreIndex = orgEmail.indexOf(emailPre);
|
|
|
+ int emailPostIndex = orgEmail.indexOf(emailPost);
|
|
|
+ int trimEmailPostIndex = orgEmail.indexOf(trimEmailPost);
|
|
|
+
|
|
|
+ list.add(
|
|
|
+ Term.of(emailPre, emailPreIndex, emailPreIndex + emailPre.length(), "email_pre")
|
|
|
+ );
|
|
|
+ list.add(
|
|
|
+ Term.of(emailPost, emailPostIndex, emailPostIndex + emailPost.length(), "email_post")
|
|
|
+ );
|
|
|
+ list.add(
|
|
|
+ Term.of(trimEmailPost, trimEmailPostIndex, trimEmailPostIndex + trimEmailPost.length(), "email_post_trim")
|
|
|
+ );
|
|
|
+
|
|
|
+ if (trimEmailPost.contains(".")) {
|
|
|
+ for (String s : trimEmailPost.split("\\.")) {
|
|
|
+ int subEmailPostIndex = orgEmail.indexOf(s);
|
|
|
+ list.add(
|
|
|
+ Term.of(s, subEmailPostIndex, subEmailPostIndex + s.length(), "sub_email_post")
|
|
|
+ );
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ if (!orgEmail.equals(email)) {
|
|
|
+ int i = orgEmail.indexOf(email);
|
|
|
+ list.add(
|
|
|
+ Term.of(email, i, i + email.length(), "org_email")
|
|
|
+ );
|
|
|
+ }
|
|
|
+ return list.stream().distinct().collect(Collectors.toList());
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ @Override
|
|
|
+ public List<Term> getTextTokenizer(String line) {
|
|
|
+ List<Term> collect = DomainUtils.getEmails(line).stream()
|
|
|
+ .flatMap(r -> {
|
|
|
+ List<Term> terms = email2Tokenizer(line, r);
|
|
|
+ return terms.stream();
|
|
|
+ }).collect(Collectors.toList());
|
|
|
+ if (collect.isEmpty()) {
|
|
|
+ return Collections.singletonList(Term.of(line, 0, line.length(), "keyword"));
|
|
|
+ } else {
|
|
|
+ return collect;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ public static void main(String[] args) {
|
|
|
+ EmailText2Tokenizer emailText2Tokenizer = new EmailText2Tokenizer();
|
|
|
+ List<Term> textTokenizer = emailText2Tokenizer.getTextTokenizer("xjk@abc.topme.pro你好吗xu-jk@qq.com");
|
|
|
+ System.out.println(textTokenizer);
|
|
|
+ }
|
|
|
+}
|