Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit ebb3bd5

Browse files
committed
Diskless replication: child -> parent communication improved.
Child now reports full info to the parent including IDs of slaves in failure state and exit code.
1 parent b50e321 commit ebb3bd5

File tree

1 file changed

+34
-20
lines changed

1 file changed

+34
-20
lines changed

src/rdb.c

Lines changed: 34 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1292,11 +1292,11 @@ void backgroundSaveDoneHandlerSocket(int exitcode, int bysignal) {
12921292
server.rdb_save_time_start = -1;
12931293

12941294
/* If the child returns an OK exit code, read the set of slave client
1295-
* IDs that received the full RDB payload, closing all the slaves
1296-
* which are not among the ones listed.
1295+
* IDs and the associated status code. We'll terminate all the slaves
1296+
* in error state.
12971297
*
12981298
* If the process returned an error, consider the list of slaves that
1299-
* can continue to be empty, so that it's just a speical case of the
1299+
* can continue to be empty, so that it's just a special case of the
13001300
* normal code path. */
13011301
ok_slaves = zmalloc(sizeof(uint64_t)); /* Make space for the count. */
13021302
ok_slaves[0] = 0;
@@ -1306,7 +1306,7 @@ void backgroundSaveDoneHandlerSocket(int exitcode, int bysignal) {
13061306
if (read(server.rdb_pipe_read_result_from_child, ok_slaves, readlen) ==
13071307
readlen)
13081308
{
1309-
readlen = ok_slaves[0]*sizeof(uint64_t);
1309+
readlen = ok_slaves[0]*sizeof(uint64_t)*2;
13101310

13111311
/* Make space for enough elements as specified by the first
13121312
* uint64_t element in the array. */
@@ -1334,14 +1334,23 @@ void backgroundSaveDoneHandlerSocket(int exitcode, int bysignal) {
13341334

13351335
if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
13361336
uint64_t j;
1337+
int errorcode = 0;
13371338

1339+
/* Search for the slave ID in the reply. In order for a slave to
1340+
* continue the replication process, we need to find it in the list,
1341+
* and it must have an error code set to 0 (which means success). */
13381342
for (j = 0; j < ok_slaves[0]; j++) {
1339-
if (slave->id == ok_slaves[j+1]) break; /* Found in OK list. */
1343+
if (slave->id == ok_slaves[2*j+1]) {
1344+
errorcode = ok_slaves[2*j+2];
1345+
break; /* Found in slaves list. */
1346+
}
13401347
}
1341-
if (j == ok_slaves[0]) {
1348+
if (j == ok_slaves[0] || errorcode != 0) {
13421349
redisLog(REDIS_WARNING,
1343-
"Closing slave %llu: child->slave RDB transfer failed.",
1344-
slave->id);
1350+
"Closing slave %llu: child->slave RDB transfer failed: %s",
1351+
slave->id,
1352+
(errorcode == 0) ? "RDB transfer child aborted"
1353+
: strerror(errorcode));
13451354
freeClient(slave);
13461355
} else {
13471356
redisLog(REDIS_WARNING,
@@ -1448,29 +1457,34 @@ int rdbSaveToSlavesSockets(void) {
14481457
/* If we are returning OK, at least one slave was served
14491458
* with the RDB file as expected, so we need to send a report
14501459
* to the parent via the pipe. The format of the message is:
1451-
* just an array of uint64_t integers (to avoid alignment concerns),
1452-
* where the first element is the number of uint64_t elements
1453-
* that follows, representing slave client IDs that were
1454-
* successfully served. */
1455-
void *msg = zmalloc(sizeof(uint64_t)*(1+numfds));
1460+
*
1461+
* <len> <slave[0].id> <slave[0].error> ...
1462+
*
1463+
* len, slave IDs, and slave errors, are all uint64_t integers,
1464+
* so basically the reply is composed of 64 bits for the len field
1465+
* plus 2 additional 64 bit integers for each entry, for a total
1466+
* of 'len' entries.
1467+
*
1468+
* The 'id' represents the slave's client ID, so that the master
1469+
* can match the report with a specific slave, and 'error' is
1470+
* set to 0 if the replication process terminated with a success
1471+
* or the error code if an error occurred. */
1472+
void *msg = zmalloc(sizeof(uint64_t)*(1+2*numfds));
14561473
uint64_t *len = msg;
14571474
uint64_t *ids = len+1;
14581475
int j, msglen;
14591476

1460-
*len = 0;
1477+
*len = numfds;
14611478
for (j = 0; j < numfds; j++) {
1462-
/* No error? Add it. */
1463-
if (slave_sockets.io.fdset.state[j] == 0) {
1464-
ids[*len] = clientids[j];
1465-
(*len)++;
1466-
}
1479+
*ids++ = clientids[j];
1480+
*ids++ = slave_sockets.io.fdset.state[j];
14671481
}
14681482

14691483
/* Write the message to the parent. If we have no good slaves or
14701484
* we are unable to transfer the message to the parent, we exit
14711485
* with an error so that the parent will abort the replication
14721486
* process with all the children that were waiting. */
1473-
msglen = sizeof(uint64_t)*(1+(*len));
1487+
msglen = sizeof(uint64_t)*(1+2*numfds);
14741488
if (*len == 0 ||
14751489
write(server.rdb_pipe_write_result_to_parent,msg,msglen)
14761490
!= msglen)

0 commit comments

Comments
 (0)