@@ -1292,11 +1292,11 @@ void backgroundSaveDoneHandlerSocket(int exitcode, int bysignal) {
1292
1292
server .rdb_save_time_start = -1 ;
1293
1293
1294
1294
/* If the child returns an OK exit code, read the set of slave client
1295
- * IDs that received the full RDB payload, closing all the slaves
1296
- * which are not among the ones listed .
1295
+ * IDs and the associated status code. We'll terminate all the slaves
1296
+ * in error state .
1297
1297
*
1298
1298
* If the process returned an error, consider the list of slaves that
1299
- * can continue to be emtpy, so that it's just a speical case of the
1299
+ * can continue to be empty, so that it's just a special case of the
1300
1300
* normal code path. */
1301
1301
ok_slaves = zmalloc (sizeof (uint64_t )); /* Make space for the count. */
1302
1302
ok_slaves [0 ] = 0 ;
@@ -1306,7 +1306,7 @@ void backgroundSaveDoneHandlerSocket(int exitcode, int bysignal) {
1306
1306
if (read (server .rdb_pipe_read_result_from_child , ok_slaves , readlen ) ==
1307
1307
readlen )
1308
1308
{
1309
- readlen = ok_slaves [0 ]* sizeof (uint64_t );
1309
+ readlen = ok_slaves [0 ]* sizeof (uint64_t )* 2 ;
1310
1310
1311
1311
/* Make space for enough elements as specified by the first
1312
1312
* uint64_t element in the array. */
@@ -1334,14 +1334,23 @@ void backgroundSaveDoneHandlerSocket(int exitcode, int bysignal) {
1334
1334
1335
1335
if (slave -> replstate == REDIS_REPL_WAIT_BGSAVE_END ) {
1336
1336
uint64_t j ;
1337
+ int errorcode = 0 ;
1337
1338
1339
+ /* Search for the slave ID in the reply. In order for a slave to
1340
+ * continue the replication process, we need to find it in the list,
1341
+ * and it must have an error code set to 0 (which means success). */
1338
1342
for (j = 0 ; j < ok_slaves [0 ]; j ++ ) {
1339
- if (slave -> id == ok_slaves [j + 1 ]) break ; /* Found in OK list. */
1343
+ if (slave -> id == ok_slaves [2 * j + 1 ]) {
1344
+ errorcode = ok_slaves [2 * j + 2 ];
1345
+ break ; /* Found in slaves list. */
1346
+ }
1340
1347
}
1341
- if (j == ok_slaves [0 ]) {
1348
+ if (j == ok_slaves [0 ] || errorcode != 0 ) {
1342
1349
redisLog (REDIS_WARNING ,
1343
- "Closing slave %llu: child->slave RDB transfer failed." ,
1344
- slave -> id );
1350
+ "Closing slave %llu: child->slave RDB transfer failed: %s" ,
1351
+ slave -> id ,
1352
+ (errorcode == 0 ) ? "RDB transfer child aborted"
1353
+ : strerror (errorcode ));
1345
1354
freeClient (slave );
1346
1355
} else {
1347
1356
redisLog (REDIS_WARNING ,
@@ -1448,29 +1457,34 @@ int rdbSaveToSlavesSockets(void) {
1448
1457
/* If we are returning OK, at least one slave was served
1449
1458
* with the RDB file as expected, so we need to send a report
1450
1459
* to the parent via the pipe. The format of the message is:
1451
- * just an array of uint64_t integers (to avoid alignment concerns),
1452
- * where the first element is the number of uint64_t elements
1453
- * that follows, representing slave client IDs that were
1454
- * successfully served. */
1455
- void * msg = zmalloc (sizeof (uint64_t )* (1 + numfds ));
1460
+ *
1461
+ * <len> <slave[0].id> <slave[0].error> ...
1462
+ *
1463
+ * len, slave IDs, and slave errors, are all uint64_t integers,
1464
+ * so basically the reply is composed of 64 bits for the len field
1465
+ * plus 2 additional 64 bit integers for each entry, for a total
1466
+ * of 'len' entries.
1467
+ *
1468
+ * The 'id' represents the slave's client ID, so that the master
1469
+ * can match the report with a specific slave, and 'error' is
1470
+ * set to 0 if the replication process terminated with a success
1471
+ * or the error code if an error occurred. */
1472
+ void * msg = zmalloc (sizeof (uint64_t )* (1 + 2 * numfds ));
1456
1473
uint64_t * len = msg ;
1457
1474
uint64_t * ids = len + 1 ;
1458
1475
int j , msglen ;
1459
1476
1460
- * len = 0 ;
1477
+ * len = numfds ;
1461
1478
for (j = 0 ; j < numfds ; j ++ ) {
1462
- /* No error? Add it. */
1463
- if (slave_sockets .io .fdset .state [j ] == 0 ) {
1464
- ids [* len ] = clientids [j ];
1465
- (* len )++ ;
1466
- }
1479
+ * ids ++ = clientids [j ];
1480
+ * ids ++ = slave_sockets .io .fdset .state [j ];
1467
1481
}
1468
1482
1469
1483
/* Write the message to the parent. If we have no good slaves or
1470
1484
* we are unable to transfer the message to the parent, we exit
1471
1485
* with an error so that the parent will abort the replication
1472
1486
* process with all the children that were waiting. */
1473
- msglen = sizeof (uint64_t )* (1 + ( * len ) );
1487
+ msglen = sizeof (uint64_t )* (1 + 2 * numfds );
1474
1488
if (* len == 0 ||
1475
1489
write (server .rdb_pipe_write_result_to_parent ,msg ,msglen )
1476
1490
!= msglen )
0 commit comments