|
@@ -14,8 +14,7 @@ import com.winhc.dataworks.flow.touch.service.OdpsService;
|
|
|
import com.winhc.dataworks.flow.touch.service.TouchService;
|
|
|
import com.winhc.dataworks.flow.touch.utils.DateUtils;
|
|
|
import com.winhc.dataworks.flow.touch.utils.DingUtils;
|
|
|
-import com.winhc.dataworks.flow.touch.utils.SparkDaemonKill;
|
|
|
-import com.winhc.dataworks.flow.touch.utils.SparkDaemonUtils;
|
|
|
+import com.winhc.dataworks.flow.touch.utils.SparkDaemonThread;
|
|
|
import lombok.SneakyThrows;
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
import org.apache.commons.cli.*;
|
|
@@ -165,9 +164,7 @@ public class Main {
|
|
|
String accessKeySecret = dataWorksAccessProperties.getAccessKeySecret();
|
|
|
Set<String> ps = jobs.stream().map(DataWorksFlowJob::getProject).collect(Collectors.toSet());
|
|
|
for (String p : ps) {
|
|
|
- /*SparkDaemonThread th = new SparkDaemonThread(p, accessKeyId, accessKeySecret, odpsCmdHome, 90L);
|
|
|
- th.start();*/
|
|
|
- new SparkDaemonKill(p, accessKeyId, accessKeySecret, odpsCmdHome, SparkDaemonUtils.getQueue(p)).start();
|
|
|
+ new SparkDaemonThread(p, accessKeyId, accessKeySecret, odpsCmdHome, 90L).start();
|
|
|
}
|
|
|
|
|
|
//运行job,并接收失败参数,最大重试三次
|
|
@@ -175,7 +172,29 @@ public class Main {
|
|
|
int i = 3;
|
|
|
while (!failureTask.isEmpty() && i-- > 0) {
|
|
|
Set<String> fSet = failureTask.stream().map(TaskInfo::getKey).collect(Collectors.toSet());
|
|
|
- List<DataWorksFlowJob> js = jobs.stream().filter(job -> fSet.contains(job.getProject() + ":" + job.getFlow() + ":" + job.getTask())).collect(Collectors.toList());
|
|
|
+
|
|
|
+ List<DataWorksFlowJob> js = jobs.stream().map(job -> {
|
|
|
+ String project = job.getProject();
|
|
|
+ String flow = job.getFlow();
|
|
|
+ List<DataWorksFlowTask> task = job.getTask();
|
|
|
+
|
|
|
+ List<DataWorksFlowTask> collect = task.stream().filter(t -> fSet.contains(project + ":" + flow + ":" + t.getTaskName())
|
|
|
+ ).collect(Collectors.toList());
|
|
|
+
|
|
|
+ if (collect.isEmpty()) {
|
|
|
+ return null;
|
|
|
+ } else {
|
|
|
+ return new DataWorksFlowJob(project, flow, collect);
|
|
|
+ }
|
|
|
+ }).filter(Objects::nonNull).collect(Collectors.toList());
|
|
|
+
|
|
|
+ String collect = js.stream().flatMap(job -> {
|
|
|
+ String project = job.getProject();
|
|
|
+ String flow = job.getFlow();
|
|
|
+ List<DataWorksFlowTask> task = job.getTask();
|
|
|
+ return task.stream().map(t -> project + ":" + flow + ":" + t.getTaskName());
|
|
|
+ }).collect(Collectors.joining(","));
|
|
|
+ dingUtils.send("【" + (3 - i) + "】重新启动以下job:" + collect);
|
|
|
failureTask = run(bizDate, js);
|
|
|
}
|
|
|
if (!failureTask.isEmpty()) {
|
|
@@ -246,7 +265,7 @@ public class Main {
|
|
|
if (failure.size() != 0) {
|
|
|
failedTask++;
|
|
|
log.error("failure node:{} ", failure);
|
|
|
- DingMsg error = new DingMsg("任务失败", taskInfo.getProject(), taskInfo.getFlow(), String.join(",", failure), TaskFlowEnum.FAILURE.getMsg());
|
|
|
+ DingMsg error = new DingMsg("任务失败", taskInfo.getProject(), taskInfo.getFlow(), taskInfo.getTaskName(), String.join(",", failure), TaskFlowEnum.FAILURE.getMsg());
|
|
|
dingUtils.send(error);
|
|
|
failureTask.add(taskInfo);
|
|
|
} else {
|
|
@@ -270,7 +289,7 @@ public class Main {
|
|
|
if (!timedCache.containsKey(taskInfo) && i <= 6) {
|
|
|
//超两小时
|
|
|
i++;
|
|
|
- DingMsg error = new DingMsg("【" + i + "】任务长时间未结束", taskInfo.getProject(), taskInfo.getFlow(), String.join(",", failure), TaskFlowEnum.RUNNING.getMsg());
|
|
|
+ DingMsg error = new DingMsg("【" + i + "】任务长时间未结束", taskInfo.getProject(), taskInfo.getFlow(), taskInfo.getTaskName(), String.join(",", failure), TaskFlowEnum.RUNNING.getMsg());
|
|
|
dingUtils.send(error);
|
|
|
timedCache.put(taskInfo, "1");
|
|
|
}
|