Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit c45f0c3

Browse files
authored
show job support errorcode (vesoft-inc#4067)
1 parent 53ada44 commit c45f0c3

File tree

15 files changed

+196
-70
lines changed

15 files changed

+196
-70
lines changed

src/common/utils/MetaKeyUtils.cpp

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1399,7 +1399,8 @@ std::string MetaKeyUtils::jobVal(const meta::cpp2::JobType& type,
13991399
std::vector<std::string> paras,
14001400
meta::cpp2::JobStatus jobStatus,
14011401
int64_t startTime,
1402-
int64_t stopTime) {
1402+
int64_t stopTime,
1403+
nebula::cpp2::ErrorCode errCode) {
14031404
std::string val;
14041405
val.reserve(256);
14051406
val.append(reinterpret_cast<const char*>(&type), sizeof(meta::cpp2::JobType));
@@ -1412,11 +1413,17 @@ std::string MetaKeyUtils::jobVal(const meta::cpp2::JobType& type,
14121413
}
14131414
val.append(reinterpret_cast<const char*>(&jobStatus), sizeof(meta::cpp2::JobStatus))
14141415
.append(reinterpret_cast<const char*>(&startTime), sizeof(int64_t))
1415-
.append(reinterpret_cast<const char*>(&stopTime), sizeof(int64_t));
1416+
.append(reinterpret_cast<const char*>(&stopTime), sizeof(int64_t))
1417+
.append(reinterpret_cast<const char*>(&errCode), sizeof(nebula::cpp2::ErrorCode));
14161418
return val;
14171419
}
14181420

1419-
std::tuple<meta::cpp2::JobType, std::vector<std::string>, meta::cpp2::JobStatus, int64_t, int64_t>
1421+
std::tuple<meta::cpp2::JobType,
1422+
std::vector<std::string>,
1423+
meta::cpp2::JobStatus,
1424+
int64_t,
1425+
int64_t,
1426+
nebula::cpp2::ErrorCode>
14201427
MetaKeyUtils::parseJobVal(folly::StringPiece rawVal) {
14211428
CHECK_GE(rawVal.size(),
14221429
sizeof(meta::cpp2::JobType) + sizeof(size_t) + sizeof(meta::cpp2::JobStatus) +
@@ -1439,7 +1446,9 @@ MetaKeyUtils::parseJobVal(folly::StringPiece rawVal) {
14391446
auto tStart = *reinterpret_cast<const int64_t*>(rawVal.data() + offset);
14401447
offset += sizeof(int64_t);
14411448
auto tStop = *reinterpret_cast<const int64_t*>(rawVal.data() + offset);
1442-
return std::make_tuple(type, paras, status, tStart, tStop);
1449+
offset += sizeof(int64_t);
1450+
auto errCode = *reinterpret_cast<const nebula::cpp2::ErrorCode*>(rawVal.data() + offset);
1451+
return std::make_tuple(type, paras, status, tStart, tStop, errCode);
14431452
}
14441453

14451454
std::pair<GraphSpaceID, JobID> MetaKeyUtils::parseJobKey(folly::StringPiece key) {
@@ -1473,20 +1482,23 @@ std::tuple<GraphSpaceID, JobID, TaskID> MetaKeyUtils::parseTaskKey(folly::String
14731482
std::string MetaKeyUtils::taskVal(HostAddr host,
14741483
meta::cpp2::JobStatus jobStatus,
14751484
int64_t startTime,
1476-
int64_t stopTime) {
1485+
int64_t stopTime,
1486+
nebula::cpp2::ErrorCode errCode) {
14771487
std::string val;
14781488
val.reserve(128);
14791489
val.append(MetaKeyUtils::serializeHostAddr(host))
14801490
.append(reinterpret_cast<const char*>(&jobStatus), sizeof(meta::cpp2::JobStatus))
14811491
.append(reinterpret_cast<const char*>(&startTime), sizeof(int64_t))
1482-
.append(reinterpret_cast<const char*>(&stopTime), sizeof(int64_t));
1492+
.append(reinterpret_cast<const char*>(&stopTime), sizeof(int64_t))
1493+
.append(reinterpret_cast<const char*>(&errCode), sizeof(nebula::cpp2::ErrorCode));
14831494
return val;
14841495
}
14851496

1486-
std::tuple<HostAddr, meta::cpp2::JobStatus, int64_t, int64_t> MetaKeyUtils::parseTaskVal(
1487-
folly::StringPiece rawVal) {
1497+
std::tuple<HostAddr, meta::cpp2::JobStatus, int64_t, int64_t, nebula::cpp2::ErrorCode>
1498+
MetaKeyUtils::parseTaskVal(folly::StringPiece rawVal) {
14881499
CHECK_GE(rawVal.size(),
1489-
sizeof(size_t) + sizeof(Port) + sizeof(meta::cpp2::JobStatus) + sizeof(int64_t) * 2);
1500+
sizeof(size_t) + sizeof(Port) + sizeof(meta::cpp2::JobStatus) + sizeof(int64_t) * 2 +
1501+
sizeof(nebula::cpp2::ErrorCode));
14901502
size_t offset = 0;
14911503
HostAddr host = MetaKeyUtils::deserializeHostAddr(rawVal);
14921504
offset += sizeof(size_t);
@@ -1498,7 +1510,9 @@ std::tuple<HostAddr, meta::cpp2::JobStatus, int64_t, int64_t> MetaKeyUtils::pars
14981510
auto tStart = *reinterpret_cast<const int64_t*>(rawVal.data() + offset);
14991511
offset += sizeof(int64_t);
15001512
auto tStop = *reinterpret_cast<const int64_t*>(rawVal.data() + offset);
1501-
return std::make_tuple(host, status, tStart, tStop);
1513+
offset += sizeof(int64_t);
1514+
auto errCode = *reinterpret_cast<const nebula::cpp2::ErrorCode*>(rawVal.data() + offset);
1515+
return std::make_tuple(host, status, tStart, tStop, errCode);
15021516
}
15031517

15041518
} // namespace nebula

src/common/utils/MetaKeyUtils.h

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -442,14 +442,19 @@ class MetaKeyUtils final {
442442
std::vector<std::string> paras,
443443
meta::cpp2::JobStatus jobStatus,
444444
int64_t startTime,
445-
int64_t stopTime);
445+
int64_t stopTime,
446+
nebula::cpp2::ErrorCode errCode);
446447
/**
447448
* @brief Decode val from kvstore, return
448-
* {jobType, paras, status, start time, stop time}
449+
* {jobType, paras, status, start time, stop time, error code}
449450
*/
450-
static std::
451-
tuple<meta::cpp2::JobType, std::vector<std::string>, meta::cpp2::JobStatus, int64_t, int64_t>
452-
parseJobVal(folly::StringPiece rawVal);
451+
static std::tuple<meta::cpp2::JobType,
452+
std::vector<std::string>,
453+
meta::cpp2::JobStatus,
454+
int64_t,
455+
int64_t,
456+
nebula::cpp2::ErrorCode>
457+
parseJobVal(folly::StringPiece rawVal);
453458

454459
static std::pair<GraphSpaceID, JobID> parseJobKey(folly::StringPiece key);
455460

@@ -464,14 +469,15 @@ class MetaKeyUtils final {
464469
static std::string taskVal(HostAddr host,
465470
meta::cpp2::JobStatus jobStatus,
466471
int64_t startTime,
467-
int64_t stopTime);
472+
int64_t stopTime,
473+
nebula::cpp2::ErrorCode errCode);
468474

469475
/**
470476
* @brief Decode the task val; the result is
471-
* {host, status, start time, stop time}
477+
* {host, status, start time, stop time, error code}
472478
*/
473-
static std::tuple<HostAddr, meta::cpp2::JobStatus, int64_t, int64_t> parseTaskVal(
474-
folly::StringPiece rawVal);
479+
static std::tuple<HostAddr, meta::cpp2::JobStatus, int64_t, int64_t, nebula::cpp2::ErrorCode>
480+
parseTaskVal(folly::StringPiece rawVal);
475481
};
476482

477483
} // namespace nebula

src/graph/executor/admin/SubmitJobExecutor.cpp

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -112,8 +112,12 @@ nebula::DataSet SubmitJobExecutor::buildShowResultData(
112112
const nebula::meta::cpp2::JobDesc &jd, const std::vector<nebula::meta::cpp2::TaskDesc> &td) {
113113
if (jd.get_type() == meta::cpp2::JobType::DATA_BALANCE ||
114114
jd.get_type() == meta::cpp2::JobType::ZONE_BALANCE) {
115-
nebula::DataSet v(
116-
{"Job Id(spaceId:partId)", "Command(src->dst)", "Status", "Start Time", "Stop Time"});
115+
nebula::DataSet v({"Job Id(spaceId:partId)",
116+
"Command(src->dst)",
117+
"Status",
118+
"Start Time",
119+
"Stop Time",
120+
"Error Code"});
117121
const auto &paras = jd.get_paras();
118122
size_t index = std::stoul(paras.back());
119123
uint32_t total = paras.size() - index - 1, succeeded = 0, failed = 0, inProgress = 0,
@@ -122,7 +126,8 @@ nebula::DataSet SubmitJobExecutor::buildShowResultData(
122126
apache::thrift::util::enumNameSafe(jd.get_type()),
123127
apache::thrift::util::enumNameSafe(jd.get_status()),
124128
convertJobTimestampToDateTime(jd.get_start_time()).toString(),
125-
convertJobTimestampToDateTime(jd.get_stop_time()).toString()}));
129+
convertJobTimestampToDateTime(jd.get_stop_time()).toString(),
130+
apache::thrift::util::enumNameSafe(jd.get_code())}));
126131
for (size_t i = index; i < paras.size() - 1; i++) {
127132
meta::cpp2::BalanceTask tsk;
128133
apache::thrift::CompactSerializer::deserialize(paras[i], tsk);
@@ -144,7 +149,8 @@ nebula::DataSet SubmitJobExecutor::buildShowResultData(
144149
std::move(tsk).get_command(),
145150
apache::thrift::util::enumNameSafe(tsk.get_result()),
146151
convertJobTimestampToDateTime(std::move(tsk).get_start_time()),
147-
convertJobTimestampToDateTime(std::move(tsk).get_stop_time())}));
152+
convertJobTimestampToDateTime(std::move(tsk).get_stop_time()),
153+
apache::thrift::util::enumNameSafe(jd.get_code())}));
148154
}
149155
v.emplace_back(Row({folly::sformat("Total:{}", total),
150156
folly::sformat("Succeeded:{}", succeeded),
@@ -153,13 +159,15 @@ nebula::DataSet SubmitJobExecutor::buildShowResultData(
153159
folly::sformat("Invalid:{}", invalid)}));
154160
return v;
155161
} else {
156-
nebula::DataSet v({"Job Id(TaskId)", "Command(Dest)", "Status", "Start Time", "Stop Time"});
162+
nebula::DataSet v(
163+
{"Job Id(TaskId)", "Command(Dest)", "Status", "Start Time", "Stop Time", "Error Code"});
157164
v.emplace_back(nebula::Row({
158165
jd.get_job_id(),
159166
apache::thrift::util::enumNameSafe(jd.get_type()),
160167
apache::thrift::util::enumNameSafe(jd.get_status()),
161168
convertJobTimestampToDateTime(jd.get_start_time()),
162169
convertJobTimestampToDateTime(jd.get_stop_time()),
170+
apache::thrift::util::enumNameSafe(jd.get_code()),
163171
}));
164172
// tasks desc
165173
for (const auto &taskDesc : td) {
@@ -169,6 +177,7 @@ nebula::DataSet SubmitJobExecutor::buildShowResultData(
169177
apache::thrift::util::enumNameSafe(taskDesc.get_status()),
170178
convertJobTimestampToDateTime(taskDesc.get_start_time()),
171179
convertJobTimestampToDateTime(taskDesc.get_stop_time()),
180+
apache::thrift::util::enumNameSafe(taskDesc.get_code()),
172181
}));
173182
}
174183
return v;

src/interface/meta.thrift

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,7 @@ struct JobDesc {
267267
5: JobStatus status,
268268
6: i64 start_time,
269269
7: i64 stop_time,
270+
8: common.ErrorCode code,
270271
}
271272

272273
struct TaskDesc {
@@ -277,6 +278,7 @@ struct TaskDesc {
277278
5: JobStatus status,
278279
6: i64 start_time,
279280
7: i64 stop_time,
281+
8: common.ErrorCode code,
280282
}
281283

282284
struct AdminJobResult {
@@ -517,7 +519,7 @@ struct GetPartsAllocResp {
517519

518520
// get workerid for snowflake
519521
struct GetWorkerIdReq {
520-
1: binary host,
522+
1: binary host,
521523
}
522524

523525
struct GetWorkerIdResp {

src/meta/processors/job/JobDescription.cpp

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,16 @@ JobDescription::JobDescription(GraphSpaceID space,
2626
std::vector<std::string> paras,
2727
Status status,
2828
int64_t startTime,
29-
int64_t stopTime)
29+
int64_t stopTime,
30+
nebula::cpp2::ErrorCode errCode)
3031
: space_(space),
3132
jobId_(jobId),
3233
type_(type),
3334
paras_(std::move(paras)),
3435
status_(status),
3536
startTime_(startTime),
36-
stopTime_(stopTime) {}
37+
stopTime_(stopTime),
38+
errCode_(errCode) {}
3739

3840
ErrorOr<nebula::cpp2::ErrorCode, JobDescription> JobDescription::makeJobDescription(
3941
folly::StringPiece rawkey, folly::StringPiece rawval) {
@@ -52,8 +54,15 @@ ErrorOr<nebula::cpp2::ErrorCode, JobDescription> JobDescription::makeJobDescript
5254
auto status = std::get<2>(tup);
5355
auto startTime = std::get<3>(tup);
5456
auto stopTime = std::get<4>(tup);
55-
return JobDescription(
56-
spaceIdAndJob.first, spaceIdAndJob.second, type, paras, status, startTime, stopTime);
57+
auto errCode = std::get<5>(tup);
58+
return JobDescription(spaceIdAndJob.first,
59+
spaceIdAndJob.second,
60+
type,
61+
paras,
62+
status,
63+
startTime,
64+
stopTime,
65+
errCode);
5766
} catch (std::exception& ex) {
5867
LOG(INFO) << ex.what();
5968
}
@@ -69,6 +78,7 @@ cpp2::JobDesc JobDescription::toJobDesc() {
6978
ret.status_ref() = status_;
7079
ret.start_time_ref() = startTime_;
7180
ret.stop_time_ref() = stopTime_;
81+
ret.code_ref() = errCode_;
7282
return ret;
7383
}
7484

src/meta/processors/job/JobDescription.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@ class JobDescription {
2929
std::vector<std::string> paras = {},
3030
Status status = Status::QUEUE,
3131
int64_t startTime = 0,
32-
int64_t stopTime = 0);
32+
int64_t stopTime = 0,
33+
nebula::cpp2::ErrorCode errCode = nebula::cpp2::ErrorCode::E_UNKNOWN);
3334

3435
/**
3536
* @brief Return the JobDescription if both key & val is valid
@@ -119,6 +120,14 @@ class JobDescription {
119120
return stopTime_;
120121
}
121122

123+
/**
 * @brief Overwrite the error code held by this job description
 *
 * @param errCode Value assigned to errCode_
 */
void setErrorCode(nebula::cpp2::ErrorCode errCode) {
124+
errCode_ = errCode;
125+
}
126+
127+
/**
 * @brief Return the error code currently held in errCode_
 *
 * NOTE(review): the body only reads errCode_, so this accessor could be
 * const-qualified — confirm against the sibling getters before changing.
 */
nebula::cpp2::ErrorCode getErrorCode() {
128+
return errCode_;
129+
}
130+
122131
/**
123132
* @brief
124133
* Get an existing job from kvstore; return folly::none if there isn't one
@@ -167,6 +176,7 @@ class JobDescription {
167176
Status status_;
168177
int64_t startTime_;
169178
int64_t stopTime_;
179+
nebula::cpp2::ErrorCode errCode_{nebula::cpp2::ErrorCode::E_UNKNOWN};
170180
};
171181

172182
} // namespace meta

src/meta/processors/job/JobManager.cpp

Lines changed: 38 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,12 @@ nebula::cpp2::ErrorCode JobManager::handleRemainingJobs() {
9292
for (auto& jd : jds) {
9393
jd.setStatus(cpp2::JobStatus::QUEUE, true);
9494
auto jobKey = MetaKeyUtils::jobKey(jd.getSpace(), jd.getJobId());
95-
auto jobVal = MetaKeyUtils::jobVal(
96-
jd.getJobType(), jd.getParas(), jd.getStatus(), jd.getStartTime(), jd.getStopTime());
95+
auto jobVal = MetaKeyUtils::jobVal(jd.getJobType(),
96+
jd.getParas(),
97+
jd.getStatus(),
98+
jd.getStartTime(),
99+
jd.getStopTime(),
100+
jd.getErrorCode());
97101
save(jobKey, jobVal);
98102
}
99103
return nebula::cpp2::ErrorCode::SUCCEEDED;
@@ -143,7 +147,8 @@ void JobManager::scheduleThread() {
143147
jobDesc.getParas(),
144148
jobDesc.getStatus(),
145149
jobDesc.getStartTime(),
146-
jobDesc.getStopTime());
150+
jobDesc.getStopTime(),
151+
jobDesc.getErrorCode());
147152
save(jobKey, jobVal);
148153
spaceRunningJobs_.insert_or_assign(spaceId, true);
149154
if (!runJobInternal(jobDesc, jobOp)) {
@@ -242,13 +247,37 @@ nebula::cpp2::ErrorCode JobManager::jobFinished(GraphSpaceID spaceId,
242247
return nebula::cpp2::ErrorCode::E_SAVE_JOB_FAILURE;
243248
}
244249

250+
// Set the errorcode of the job
251+
nebula::cpp2::ErrorCode jobErrCode = nebula::cpp2::ErrorCode::SUCCEEDED;
252+
if (jobStatus != cpp2::JobStatus::FINISHED) {
253+
// Traverse the tasks and take the error code of the first unsuccessful one
254+
auto jobKey = MetaKeyUtils::jobKey(spaceId, jobId);
255+
std::unique_ptr<kvstore::KVIterator> iter;
256+
auto rc = kvStore_->prefix(kDefaultSpaceId, kDefaultPartId, jobKey, &iter);
257+
if (rc != nebula::cpp2::ErrorCode::SUCCEEDED) {
258+
return rc;
259+
}
260+
for (; iter->valid(); iter->next()) {
261+
if (MetaKeyUtils::isJobKey(iter->key())) {
262+
continue;
263+
}
264+
auto tupTaskVal = MetaKeyUtils::parseTaskVal(iter->val());
265+
jobErrCode = std::get<4>(tupTaskVal);
266+
if (jobErrCode != nebula::cpp2::ErrorCode::SUCCEEDED) {
267+
break;
268+
}
269+
}
270+
}
271+
optJobDesc.setErrorCode(jobErrCode);
272+
245273
spaceRunningJobs_.insert_or_assign(spaceId, false);
246274
auto jobKey = MetaKeyUtils::jobKey(optJobDesc.getSpace(), optJobDesc.getJobId());
247275
auto jobVal = MetaKeyUtils::jobVal(optJobDesc.getJobType(),
248276
optJobDesc.getParas(),
249277
optJobDesc.getStatus(),
250278
optJobDesc.getStartTime(),
251-
optJobDesc.getStopTime());
279+
optJobDesc.getStopTime(),
280+
optJobDesc.getErrorCode());
252281
auto rc = save(jobKey, jobVal);
253282
if (rc != nebula::cpp2::ErrorCode::SUCCEEDED) {
254283
return rc;
@@ -281,6 +310,7 @@ nebula::cpp2::ErrorCode JobManager::saveTaskStatus(TaskDescription& td,
281310
auto status = code == nebula::cpp2::ErrorCode::SUCCEEDED ? cpp2::JobStatus::FINISHED
282311
: cpp2::JobStatus::FAILED;
283312
td.setStatus(status);
313+
td.setErrorCode(code);
284314

285315
auto spaceId = req.get_space_id();
286316
auto jobId = req.get_job_id();
@@ -301,8 +331,8 @@ nebula::cpp2::ErrorCode JobManager::saveTaskStatus(TaskDescription& td,
301331
}
302332

303333
auto taskKey = MetaKeyUtils::taskKey(td.getSpace(), td.getJobId(), td.getTaskId());
304-
auto taskVal =
305-
MetaKeyUtils::taskVal(td.getHost(), td.getStatus(), td.getStartTime(), td.getStopTime());
334+
auto taskVal = MetaKeyUtils::taskVal(
335+
td.getHost(), td.getStatus(), td.getStartTime(), td.getStopTime(), td.getErrorCode());
306336
auto rcSave = save(taskKey, taskVal);
307337
if (rcSave != nebula::cpp2::ErrorCode::SUCCEEDED) {
308338
return rcSave;
@@ -393,7 +423,8 @@ nebula::cpp2::ErrorCode JobManager::addJob(JobDescription& jobDesc, AdminClient*
393423
jobDesc.getParas(),
394424
jobDesc.getStatus(),
395425
jobDesc.getStartTime(),
396-
jobDesc.getStopTime());
426+
jobDesc.getStopTime(),
427+
jobDesc.getErrorCode());
397428
auto rc = save(jobKey, jobVal);
398429
if (rc == nebula::cpp2::ErrorCode::SUCCEEDED) {
399430
enqueue(spaceId, jobId, JbOp::ADD, jobDesc.getJobType());

src/meta/processors/job/RebuildJobExecutor.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ namespace nebula {
1717
namespace meta {
1818

1919
nebula::cpp2::ErrorCode RebuildJobExecutor::prepare() {
20-
// The last value of paras_ are index name
20+
// The values of paras_ are index names
2121
auto spaceRet = spaceExist();
2222
if (spaceRet != nebula::cpp2::ErrorCode::SUCCEEDED) {
2323
LOG(INFO) << "Can't find the space, spaceId " << space_;

0 commit comments

Comments
 (0)