|
@@ -0,0 +1,47 @@
|
|
|
|
+package com.winhc.bigdata.udf;
|
|
|
|
+
|
|
|
|
+import com.aliyun.odps.udf.UDF;
|
|
|
|
+import com.aliyun.odps.utils.StringUtils;
|
|
|
|
+
|
|
|
|
+import java.util.regex.Matcher;
|
|
|
|
+import java.util.regex.Pattern;
|
|
|
|
+
|
|
|
|
+/**
|
|
|
|
+ * @Author: XuJiakai
|
|
|
|
+ * @Date: 2020/9/30 08:55
|
|
|
|
+ * @Description:
|
|
|
|
+ */
|
|
|
|
+public class CaseNoTrim extends UDF {
|
|
|
|
+
|
|
|
|
+ private static final Pattern pattern = Pattern.compile(".*([(\\(]\\d{4}[)\\)][^号]*号?).*");
|
|
|
|
+ private static final Pattern year_pat = Pattern.compile("(\\d{4}?)年");
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ public String evaluate(String val) {
|
|
|
|
+ if (StringUtils.isBlank(val)) {
|
|
|
|
+ return null;
|
|
|
|
+ }
|
|
|
|
+ val = toDBC(val);
|
|
|
|
+ val = val.replace(" ", "");
|
|
|
|
+ if (val.length() < 8) {
|
|
|
|
+ return null;
|
|
|
|
+ }
|
|
|
|
+ val = year_pat.matcher(val).replaceAll("\\($1\\)");
|
|
|
|
+
|
|
|
|
+ Matcher matcher = pattern.matcher(val);
|
|
|
|
+ return matcher.matches() ? matcher.replaceAll("$1") : null;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ private static String toDBC(String input) {
|
|
|
|
+ char[] c = input.toCharArray();
|
|
|
|
+ for (int i = 0; i < c.length; i++) {
|
|
|
|
+ if (c[i] == 12288) {
|
|
|
|
+ c[i] = (char) 32;
|
|
|
|
+ continue;
|
|
|
|
+ }
|
|
|
|
+ if (c[i] > 65280 && c[i] < 65375)
|
|
|
|
+ c[i] = (char) (c[i] - 65248);
|
|
|
|
+ }
|
|
|
|
+ return new String(c);
|
|
|
|
+ }
|
|
|
|
+}
|