|
@@ -1,12 +1,17 @@
|
|
package com.winhc.bigdata.utils;
|
|
package com.winhc.bigdata.utils;
|
|
|
|
|
|
|
|
+import com.alibaba.fastjson.JSONObject;
|
|
import com.aliyun.odps.utils.StringUtils;
|
|
import com.aliyun.odps.utils.StringUtils;
|
|
|
|
+import com.github.houbb.opencc4j.util.ZhConverterUtil;
|
|
|
|
|
|
import java.util.Arrays;
|
|
import java.util.Arrays;
|
|
import java.util.Collections;
|
|
import java.util.Collections;
|
|
import java.util.List;
|
|
import java.util.List;
|
|
|
|
+import java.util.regex.Pattern;
|
|
import java.util.stream.Collectors;
|
|
import java.util.stream.Collectors;
|
|
|
|
|
|
|
|
+import static com.github.stuxuhai.jpinyin.ChineseHelper.convertToSimplifiedChinese;
|
|
|
|
+
|
|
/**
|
|
/**
|
|
* @author: XuJiakai
|
|
* @author: XuJiakai
|
|
* 2021/12/6 11:27
|
|
* 2021/12/6 11:27
|
|
@@ -30,4 +35,30 @@ public class CompanyUtils {
|
|
return val;
|
|
return val;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
+ private static final Pattern pattern = Pattern.compile("[^\\u4e00-\\u9fa50-9a-zA-Z]");
|
|
|
|
+
|
|
|
|
+ public static String cleanup(String s) {
|
|
|
|
+ if (StringUtils.isBlank(s)) return "";
|
|
|
|
+ return pattern.matcher(s).replaceAll("");
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ public static JSONObject getCompanyName(String name) {
|
|
|
|
+ if (StringUtils.isEmpty(name)) return null;
|
|
|
|
+ else {
|
|
|
|
+ String show = name.replaceAll("\t;", "").trim();
|
|
|
|
+ String value = cleanup(name);
|
|
|
|
+ String simplifiedChineseValue = ZhConverterUtil.toSimple(value);
|
|
|
|
+ String simplifiedChinese = ZhConverterUtil.toSimple(show);
|
|
|
|
+ JSONObject j = new JSONObject();
|
|
|
|
+ if (show.equals(simplifiedChinese)) {
|
|
|
|
+ j.put("show", show);
|
|
|
|
+ j.put("value", value);
|
|
|
|
+ } else {
|
|
|
|
+ j.put("show", show);
|
|
|
|
+ j.put("value", simplifiedChineseValue);
|
|
|
|
+ j.put("simplified_chinese", simplifiedChinese);
|
|
|
|
+ }
|
|
|
|
+ return j;
|
|
|
|
+ }
|
|
|
|
+ }
|
|
}
|
|
}
|