forked from EvoMap/evolver
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathindex.js
More file actions
executable file
·3127 lines (2977 loc) · 152 KB
/
Copy pathindex.js
File metadata and controls
executable file
·3127 lines (2977 loc) · 152 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env node
/**
 * Emit the one-line usage string for the `proxy-token` subcommand.
 * @param {{write: Function}} [out=process.stderr] - Stream-like sink that receives the text.
 */
function _printProxyTokenUsage(out = process.stderr) {
  const usageLine = 'Usage: node index.js proxy-token [--settings FILE]\n';
  out.write(usageLine);
}
/**
 * Read `proxy.token` out of a JSON settings file.
 * @param {{readFileSync: Function}} fs - Filesystem module (injected by the caller).
 * @param {string} settingsFile - Path of the settings file to read.
 * @returns {string} The token when it is a string; '' when the file cannot be
 *   read or parsed, or when `proxy.token` is absent or not a string.
 */
function _readProxyTokenFromSettingsFile(fs, settingsFile) {
  try {
    const settings = JSON.parse(fs.readFileSync(settingsFile, 'utf8'));
    const candidate = settings?.proxy?.token;
    if (typeof candidate === 'string') {
      return candidate;
    }
    return '';
  } catch {
    // Unreadable or malformed settings are treated the same as "no token".
    return '';
  }
}
// `proxy-token` is a credential helper for Codex. Handle it before loading any
// project .env so a workspace cannot change EVOLVER_SETTINGS_DIR or other local
// state used to find the proxy token.
// Entry point for `node index.js proxy-token [--settings FILE]`.
// Prints the proxy token from the settings file to stdout and exits.
// Exit codes used below: 0 = token printed or help shown, 2 = bad usage,
// 1 = no token found or an unexpected failure.
if (process.argv[2] === 'proxy-token') {
try {
// Local requires: the file-level requires have not run yet at this point.
const _fs = require('fs');
const _os = require('os');
const _path = require('path');
let settingsFile = '';
// Arguments start at index 3 (argv[2] is the subcommand itself).
for (let i = 3; i < process.argv.length; i++) {
const arg = process.argv[i];
if (arg === '-h' || arg === '--help') {
_printProxyTokenUsage(process.stdout);
process.exit(0);
}
if (arg === '--settings') {
// `--settings` must be followed by a non-empty value.
if (!process.argv[i + 1]) {
_printProxyTokenUsage();
console.error('[proxy-token] missing value for --settings');
process.exit(2);
}
settingsFile = process.argv[i + 1];
// Skip the value we just consumed.
i++;
continue;
}
// Anything else is rejected rather than silently ignored.
_printProxyTokenUsage();
console.error('[proxy-token] unknown argument');
process.exit(2);
}
// Default location: $EVOLVER_SETTINGS_DIR/settings.json, falling back to
// ~/.evolver/settings.json when the variable is unset or empty.
const defaultSettingsFile = _path.join(
process.env.EVOLVER_SETTINGS_DIR || _path.join(_os.homedir(), '.evolver'),
'settings.json',
);
const token = _readProxyTokenFromSettingsFile(_fs, settingsFile || defaultSettingsFile);
if (!token) {
console.error('[proxy-token] no active proxy token found; start evolver with EVOMAP_PROXY=1 first');
process.exit(1);
}
// Only the token (plus a newline) goes to stdout; diagnostics go to stderr.
process.stdout.write(token + '\n');
process.exit(0);
} catch (e) {
console.error('[proxy-token] Failed:', e && e.message || e);
process.exit(1);
}
}
// Load .env BEFORE any internal require so that a2aProtocol and ATP
// modules see A2A_NODE_SECRET / A2A_NODE_ID / A2A_HUB_URL at first
// access and never fall back to a stale persisted/cached secret.
// Reported in #460.
//
// Load order matters (see #526): we must not call getRepoRoot() before
// .env is loaded, otherwise EVOLVER_REPO_ROOT set in .env is silently
// ignored because getRepoRoot() caches the .git-walk result on first
// call. Strategy:
// 1. Try .env at process.cwd() first. This is where a user running
// `evolver` from their project root expects the file, and it is
// independent of getRepoRoot() caching.
// 2. Read EVOLVER_REPO_ROOT from process.env (dotenv just populated it
// if set in cwd/.env).
// 3. Only now call getRepoRoot(), which will honor EVOLVER_REPO_ROOT
// if present; then try .env at that root as well (dotenv never
// overwrites already-set keys, so step 1 wins when both exist).
try {
const _path = require('path');
// Step 1: load .env from process.cwd() before any internal require.
// Matches the regression test for #460 which asserts
// `require('dotenv').config` appears before any ./src/* require other
// than ./src/gep/paths.
require('dotenv').config({ path: _path.join(process.cwd(), '.env') });
// Suppress the "Using host git repository at" banner during bootstrap.
// If .env at the discovered root overrides EVOLVER_REPO_ROOT, the
// initial banner would point at the wrong path and mislead users
// debugging the very chicken-and-egg problem #526 reported. The banner
// prints for real when getRepoRoot() is called later by application code.
const _prevQuiet = process.env.EVOLVER_QUIET_PARENT_GIT;
process.env.EVOLVER_QUIET_PARENT_GIT = '1';
const { getRepoRoot: _getRepoRoot } = require('./src/gep/paths');
const _root = _getRepoRoot();
if (_root && _root !== process.cwd()) {
require('dotenv').config({ path: _path.join(_root, '.env') });
}
if (_prevQuiet === undefined) delete process.env.EVOLVER_QUIET_PARENT_GIT;
else process.env.EVOLVER_QUIET_PARENT_GIT = _prevQuiet;
} catch (e) { /* dotenv is optional */ }
const evolve = require('./src/evolve');
const { solidify } = require('./src/gep/solidify');
const path = require('path');
const os = require('os');
const { getRepoRoot } = require('./src/gep/paths');
const fs = require('fs');
const { spawn } = require('child_process');
// Interruptible sleep: SIGCONT (and any future wake hook) can short-circuit
// pending sleeps so a daemon that just woke from macOS sleep doesn't sit
// out the rest of its pre-sleep adaptive-sleep window on the resumed
// monotonic clock. Without this, the heartbeat side recovers via the
// drift detector but the outer evolve cycle stays paused up to maxSleepMs
// (default 5 min) after wake. Each call tracks its own resolver in
// _activeSleeps so the wake hook can resolve all of them.
// Wake callbacks of every sleep that is currently pending.
const _activeSleeps = new Set();

/**
 * Sleep for `ms` milliseconds; the wait can be cut short by a wake hook.
 * @param {number|string} ms - Delay; parsed as a base-10 integer. Values that
 *   do not parse, or that are negative, are treated as 0.
 * @returns {Promise<void>} Resolves when the timer fires or the sleep is interrupted.
 */
function sleepMs(ms) {
  const parsed = parseInt(String(ms), 10);
  const delay = Number.isFinite(parsed) ? Math.max(0, parsed) : 0;
  return new Promise((resolve) => {
    let settled = false;
    let handle = null;
    function wake() {
      // Timer expiry and an external interrupt can both land here; only the
      // first one counts.
      if (settled) return;
      settled = true;
      clearTimeout(handle);
      _activeSleeps.delete(wake);
      resolve();
    }
    // NOTE: the timer is deliberately left ref'd (no handle.unref()). During a
    // long adaptive sleep it is often the only ref'd handle in the process,
    // because the other daemon timers are unref'd. Unref'ing this one as well
    // let Node see zero ref'd handles and exit silently mid-sleep -- the
    // "started fine, idle for a while, then dead with no log trace" failure
    // seen on macOS. Keeping it ref'd has no observable cost.
    handle = setTimeout(wake, delay);
    _activeSleeps.add(wake);
  });
}
/**
 * Wake every sleep currently registered in `_activeSleeps`.
 * Errors thrown by an individual wake callback are ignored so the remaining
 * sleeps are still woken.
 */
function _interruptAllSleeps() {
  if (_activeSleeps.size === 0) return;
  // Iterate over a copy: each wake callback removes itself from the set.
  for (const wake of [..._activeSleeps]) {
    try {
      wake();
    } catch (_) {}
  }
}
// Round-6 (§19.5): heartbeat-internal wake recovery (drainPool +
// pokeHeartbeat + SSE restart + self-driving-poll re-arm) lives in
// a2aProtocol so the drift detector can drive it directly. Process-
// level wake hooks (sleepMs interrupter, validator daemon poke) are
// registered with a2aProtocol so both the SIGCONT handler and the
// drift detector long-sleep branch run them. Lazy-register so requires
// resolve cleanly under test (single Set of registered hooks; cheap to
// re-register idempotently).
let _wakeHooksRegistered = false;
function _registerProcessWakeHooks() {
if (_wakeHooksRegistered) return;
try {
const a2a = require('./src/gep/a2aProtocol.js');
if (typeof a2a.registerWakeHook !== 'function') return;
a2a.registerWakeHook(function () {
try { _interruptAllSleeps(); } catch (_) {}
});
// R13: guards.sleepMs is a separate private helper used for 60-120s
// backoffs inside evolve.run() arms (active-sessions, system-load,
// pending-solidify). Without this hook, a guard sleep that spans
// macOS suspend would block the cycle for the full window on the
// resumed monotonic clock even though the outer sleep was interrupted.
a2a.registerWakeHook(function () {
try {
const guards = require('./src/evolve/guards');
if (guards && typeof guards._interruptGuardSleeps === 'function') {
guards._interruptGuardSleeps();
}
} catch (_) {}
});
a2a.registerWakeHook(function () {
try {
const v = require('./src/gep/validator');
if (v && typeof v.pokeValidatorDaemon === 'function') {
v.pokeValidatorDaemon();
}
} catch (_) {}
});
_wakeHooksRegistered = true;
} catch (_) {}
}
/**
 * Read and parse a JSON file without ever throwing.
 * @param {string} p - File path.
 * @returns {*} The parsed value, or null when the file is missing, blank,
 *   unreadable, or not valid JSON.
 */
function readJsonSafe(p) {
  try {
    if (!fs.existsSync(p)) {
      return null;
    }
    const text = fs.readFileSync(p, 'utf8');
    return text.trim() ? JSON.parse(text) : null;
  } catch (_) {
    return null;
  }
}
/**
 * Record the pending evolution run as rejected. Only the state file is
 * touched; no git rollback is performed.
 * @param {string} statePath - Path to evolution_solidify_state.json
 * @returns {boolean} true when a run with a run_id was found and the state
 *   file was rewritten; false otherwise (including on I/O failure, which is
 *   logged as a warning).
 */
function rejectPendingRun(statePath) {
  try {
    const state = readJsonSafe(statePath);
    const pendingRunId = state && state.last_run && state.last_run.run_id;
    if (!pendingRunId) {
      return false;
    }
    state.last_solidify = {
      run_id: pendingRunId,
      rejected: true,
      reason: 'loop_bridge_disabled_autoreject_no_rollback',
      timestamp: new Date().toISOString(),
    };
    // Write to a sibling temp file, then rename it over the real state file.
    const scratch = statePath + '.tmp';
    const serialized = JSON.stringify(state, null, 2) + '\n';
    fs.writeFileSync(scratch, serialized, 'utf8');
    fs.renameSync(scratch, statePath);
    return true;
  } catch (e) {
    console.warn('[Loop] Failed to clear pending run state: ' + (e.message || e));
    return false;
  }
}
/**
 * Tell whether the most recent run has not been solidified yet.
 * @param {object|null|undefined} state - Parsed solidify state.
 * @returns {boolean} false when there is no last run with a run_id; true when
 *   there is no solidify record with a run_id, or when its run_id differs
 *   (compared as strings) from the last run's.
 */
function isPendingSolidify(state) {
  const runId = state?.last_run?.run_id;
  if (!runId) {
    return false;
  }
  const solidifiedId = state?.last_solidify?.run_id;
  if (!solidifiedId) {
    return true;
  }
  return String(solidifiedId) !== String(runId);
}
/**
 * Parse a millisecond value leniently.
 * @param {*} v - Raw value (typically an env string); null/undefined count as empty.
 * @param {*} fallback - Returned unchanged when `v` has no leading integer.
 * @returns {*} The parsed integer clamped to >= 0, or `fallback`.
 */
function parseMs(v, fallback) {
  const text = v == null ? '' : String(v);
  const parsed = parseInt(text, 10);
  return Number.isFinite(parsed) ? Math.max(0, parsed) : fallback;
}
/**
 * Interpret an environment-style boolean.
 * Matching is case-insensitive and ignores surrounding whitespace.
 * @param {*} v - Raw value.
 * @param {*} fallback - Returned when `v` is null/undefined, blank, or unrecognised.
 * @returns {*} false for false/0/off/no, true for true/1/on/yes, else `fallback`.
 */
function parseBoolEnv(v, fallback) {
  if (v == null) return fallback;
  switch (String(v).toLowerCase().trim()) {
    case 'false':
    case '0':
    case 'off':
    case 'no':
      return false;
    case 'true':
    case '1':
    case 'on':
    case 'yes':
      return true;
    default:
      // Covers the empty string as well as unknown words.
      return fallback;
  }
}
/**
 * Error describing a cycle that exceeded its hard timeout.
 * Carries `code = 'CYCLE_TIMEOUT'` plus the timeout, phase and cycle number
 * it was constructed with.
 */
class CycleTimeoutError extends Error {
  /**
   * @param {number} timeoutMs - The timeout that was exceeded, in milliseconds.
   * @param {string} phase - Phase label included in the message.
   * @param {number} cycleNum - Cycle number included in the message.
   */
  constructor(timeoutMs, phase, cycleNum) {
    super(`Cycle hard-timeout exceeded after ${timeoutMs}ms (cycle=${cycleNum}, phase=${phase})`);
    this.name = 'CycleTimeoutError';
    this.code = 'CYCLE_TIMEOUT';
    this.timeoutMs = timeoutMs;
    this.phase = phase;
    this.cycleNum = cycleNum;
  }
}
// Issue #528: on Windows, child_process.spawn(detached: true, windowsHide: true)
// allocates a new conhost window every time -- windowsHide is silently ignored
// in detached mode. So suicide-respawn (cycles >= max, RSS over budget, or the
// new cycle hard-timeout) opens a new cmd popup on every restart. We now skip
// the in-process detached spawn on Windows by default and rely on an external
// supervisor (NSSM, pm2-windows, etc.) to respawn the daemon on non-zero exit.
// Users who insist can opt back in with EVOLVER_SUICIDE_WINDOWS=true (and accept
// the popups).
/**
 * Start a detached copy of this script that is meant to replace the current daemon.
 *
 * On Windows the spawn is skipped unless EVOLVER_SUICIDE_WINDOWS is truthy
 * (Issue #528: a detached spawn opens a console window on every restart).
 *
 * @param {object} opts
 * @param {string} opts.reason - Why the respawn happens; used in log lines only.
 * @param {string[]} opts.args - CLI arguments for the replacement process.
 * @param {string} opts.logPath - File that receives the child's stdout/stderr (append mode).
 * @returns {{spawned: boolean, reason?: string, error?: *}} `spawned: true` when a
 *   child with a pid was started; otherwise `spawned: false` with
 *   `reason: 'windows_default_skip'` or `reason: 'spawn_error'` (plus `error`).
 */
function spawnReplacementProcess({ reason, args, logPath }) {
  const isWindows = process.platform === 'win32';
  const allowOnWindows = parseBoolEnv(process.env.EVOLVER_SUICIDE_WINDOWS, false);
  if (isWindows && !allowOnWindows) {
    console.log(
      '[Daemon] Skipping in-process respawn on Windows (' + reason + '). ' +
      'Native Node spawn(detached, windowsHide) opens a cmd popup on every restart (Issue #528). ' +
      'Set EVOLVER_SUICIDE_WINDOWS=true to opt back in. ' +
      'Recommended: run evolver under an external supervisor (NSSM, pm2-windows, etc.) so it restarts on exit.'
    );
    return { spawned: false, reason: 'windows_default_skip' };
  }
  let logFd = null;
  try {
    logFd = fs.openSync(logPath, 'a');
    const spawnOpts = {
      detached: true,
      stdio: ['ignore', logFd, logFd],
      env: process.env,
      windowsHide: true,
    };
    const child = spawn(process.execPath, [__filename, ...args], spawnOpts);
    // Spawn failures are reported asynchronously through 'error'. Without a
    // listener the event would surface as an uncaught exception in this process.
    child.on('error', (err) => {
      console.error('[Daemon] Replacement process error (' + reason + '): ' + (err && err.message || err));
    });
    child.unref();
    if (child.pid === undefined) {
      // No pid means the child never started; do not report success, since
      // the caller would exit believing a replacement is running.
      const noPid = new Error('replacement process did not start (no pid)');
      console.error('[Daemon] Spawn-replacement failed (' + reason + '): ' + noPid.message);
      return { spawned: false, reason: 'spawn_error', error: noPid };
    }
    return { spawned: true };
  } catch (e) {
    console.error('[Daemon] Spawn-replacement failed (' + reason + '): ' + (e && e.message || e));
    return { spawned: false, reason: 'spawn_error', error: e };
  } finally {
    // The child holds its own copy of the descriptor once spawn() has
    // returned, so the parent's copy is closed on every path instead of leaking.
    if (logFd !== null) {
      try { fs.closeSync(logFd); } catch (_) {}
    }
  }
}
// Atomic write of the cycle_progress.json file. Wrapper polls this file every
// 60s; if updated_at goes stale beyond EVOLVE_INNER_STUCK_TIMEOUT_SEC the
// wrapper treats the inner core as zombie and SIGKILLs it. See Issue #19 (the
// 22-day stuck-cycle incident) and the cross-repo timeout plan for context.
/**
 * Write the cycle progress file via a per-pid temp file plus rename.
 * @param {string} progressPath - Destination path of the progress JSON file.
 * @param {object} fields - Properties to store; `updated_at` is always set
 *   to the current `Date.now()` and overrides any value in `fields`.
 * @returns {boolean} true when the file was replaced; false on any failure
 *   (the function never throws).
 */
function writeCycleProgressAtomic(progressPath, fields) {
  let tmp = null;
  try {
    const data = Object.assign({}, fields, { updated_at: Date.now() });
    tmp = progressPath + '.tmp.' + process.pid;
    fs.writeFileSync(tmp, JSON.stringify(data, null, 2) + '\n', 'utf8');
    fs.renameSync(tmp, progressPath);
    return true;
  } catch (e) {
    // Do not leave the temp file behind when the write or rename failed.
    if (tmp !== null) {
      try { fs.unlinkSync(tmp); } catch (_) {}
    }
    return false;
  }
}
/**
 * Fetch the signals recorded for the last run in the state file.
 * @param {string} statePath - Path of the state JSON file.
 * @returns {Array} `last_run.signals` when it is an array; otherwise an empty array.
 */
function getLastSignals(statePath) {
  try {
    const signals = readJsonSafe(statePath)?.last_run?.signals;
    if (Array.isArray(signals)) {
      return signals;
    }
    return [];
  } catch (_) {
    return [];
  }
}
// Singleton Guard - prevent multiple evolver daemon instances.
//
// Lock location + lease tunables live in src/adapters/scripts/_lockPaths.js
// (issue #176): the session-start hook's auto-restart guard needs the exact
// same resolution, and inlining it in both places drifted. The Round-4
// (per-install-mode pidfile convergence) and Round-9 (lease staleness)
// history notes moved there with the code.
const {
getLockFilePath,
lockIsStaleByLease: _lockIsStaleByLease,
STALE_LOCK_TTL_MS,
LOCK_REFRESH_MS,
} = require('./src/adapters/scripts/_lockPaths');
/**
 * Install `payload` at `lockFile` only if no lock file exists yet.
 *
 * The payload is first written to a per-pid temp file, which is then
 * hard-linked to `lockFile`. Round-6 (§19.8): a plain tmp + rename makes the
 * WRITE atomic but not the OWNERSHIP claim -- successive renames overwrite
 * each other, so two racing processes could both believe they won (or both
 * believe they lost). link(2) fails atomically with EEXIST when the target
 * exists, so only one of N concurrent linkers succeeds. Callers on the
 * takeover path must therefore have moved the stale lock out of the way first.
 *
 * NOTE(windows): mode 0o700 / 0o600 are silently ignored on Windows, so the
 * lock directory and temp file are NOT owner-only there; isolation relies
 * solely on the user-profile directory ACLs.
 *
 * @param {string} lockFile - Final lock path.
 * @param {string} payload - Text to store in the lock file.
 * @throws Whatever the temp-file write, link, or fallback rename threw
 *   (notably an error with code 'EEXIST' when the lock already exists).
 */
function _writeLockAtomic(lockFile, payload) {
  // Best-effort removal; a missing file is not an error here.
  const removeQuietly = (target) => {
    try { fs.unlinkSync(target); } catch (_) {}
  };

  try {
    fs.mkdirSync(path.dirname(lockFile), { recursive: true, mode: 0o700 });
  } catch (_) {}

  const staging = lockFile + '.' + process.pid + '.tmp';
  fs.writeFileSync(staging, payload, { encoding: 'utf8', mode: 0o600 });

  let linked = true;
  let failure;
  try {
    fs.linkSync(staging, lockFile);
  } catch (err) {
    linked = false;
    failure = err;
  }

  if (linked) {
    removeQuietly(staging);
    return;
  }

  if (failure && failure.code === 'EXDEV') {
    // EXDEV: the temp file and lockFile are on different volumes (can happen
    // on Windows when EVOLVER_LOCK_DIR points to another drive). Fall back to
    // renameSync. This loses the atomic-EEXIST guard, but the configuration is
    // unusual and hard-failing would be worse: at worst two daemons both think
    // they won, the second write wins, and the first exits once it reads back
    // a foreign PID.
    try {
      fs.renameSync(staging, lockFile);
    } catch (renameErr) {
      removeQuietly(staging);
      throw renameErr;
    }
    // The rename consumed the temp file; nothing left to clean up.
    return;
  }

  // Any other link failure (including EEXIST from a concurrent winner) is
  // surfaced to the caller after the temp file is cleaned up.
  removeQuietly(staging);
  throw failure;
}
/**
 * Load the contents of a lock file.
 * Understands both formats: the newer JSON payload ({pid, uid, startedAt})
 * and the older one that held only the pid as text.
 * @param {string} lockFile - Path of the lock file.
 * @returns {object|null} The parsed JSON payload, `{ pid }` for the legacy
 *   format, or null when the file is unreadable, empty, or malformed.
 */
function _readLockPayload(lockFile) {
  let text;
  try {
    text = fs.readFileSync(lockFile, 'utf8').trim();
  } catch (_) {
    return null;
  }
  if (!text) {
    return null;
  }
  if (text.startsWith('{')) {
    try {
      return JSON.parse(text);
    } catch (_) {
      return null;
    }
  }
  // Backward-compat: legacy lock files contain just the pid.
  const legacyPid = parseInt(text, 10);
  if (Number.isFinite(legacyPid) && legacyPid > 0) {
    return { pid: legacyPid };
  }
  return null;
}
/**
 * Build the JSON text stored in the lock file for this process.
 * @returns {string} JSON with `pid`, `uid` (null where `process.getuid` is
 *   unavailable), `startedAt` (ISO timestamp) and `lease: true`.
 */
function _lockPayload() {
  const hasGetuid = typeof process.getuid === 'function';
  const record = {
    pid: process.pid,
    uid: hasGetuid ? process.getuid() : null,
    startedAt: new Date().toISOString(),
    // Round-9: marks a daemon that refreshes this lock file's mtime on a
    // lease (see startLockRefresh). Only when this flag is present do
    // acquireLock / refuseHelloIfDaemonRunning trust mtime-staleness to
    // reclaim a lock whose PID is alive -- the PID-reuse / SIGKILL-stale
    // guard. A lock written by an OLDER daemon (no flag) keeps the legacy
    // kill(0)-only behavior so a new binary can never falsely steal a
    // still-running old daemon's lock (which would run two daemons).
    lease: true,
  };
  return JSON.stringify(record);
}
// STALE_LOCK_TTL_MS / LOCK_REFRESH_MS / _lockIsStaleByLease come from
// src/adapters/scripts/_lockPaths.js (required next to getLockFilePath
// above) — see issue #176 and the Round-9 history note in that module.
// Interval handle of the running lease refresher, or null when it is stopped.
let _lockRefreshTimer = null;

/**
 * Begin bumping the lock file's mtime every LOCK_REFRESH_MS so other
 * processes can tell this daemon is alive without trusting a (recyclable) PID.
 * Calling it again while a refresher is already running does nothing.
 */
function startLockRefresh() {
  if (_lockRefreshTimer) return;
  const target = getLockFilePath();
  const touchLock = () => {
    try {
      const stamp = new Date();
      fs.utimesSync(target, stamp, stamp);
    } catch (_) {
      /* lock gone / FS error: nothing we can do here */
    }
  };
  _lockRefreshTimer = setInterval(touchLock, LOCK_REFRESH_MS);
  // unref'd: the refresher alone never keeps the event loop open, but it
  // keeps firing for as long as the daemon is otherwise alive.
  const canUnref = _lockRefreshTimer && typeof _lockRefreshTimer.unref === 'function';
  if (canUnref) {
    _lockRefreshTimer.unref();
  }
}
/**
 * Stop the lock lease refresher, if one is running, and forget its handle.
 */
function stopLockRefresh() {
  if (!_lockRefreshTimer) return;
  clearInterval(_lockRefreshTimer);
  _lockRefreshTimer = null;
}
/**
 * Try to make this process the single owner of the daemon lock file.
 *
 * Steps, in order:
 *   1. Exclusive create (`wx`) of the lock file; success means we own it.
 *   2. If the file already exists, inspect its payload: a corrupt payload or
 *      an expired lease leads to takeover; otherwise the recorded PID is
 *      probed with kill(pid, 0). A live PID (including EPERM, i.e. owned by
 *      another user) makes us refuse; any other probe error is treated as a
 *      stale lock.
 *   3. Takeover: rename the existing lock to a unique claim file, install a
 *      fresh lock via _writeLockAtomic, then remove the claim file.
 *
 * @returns {boolean} true when this process now owns the lock; false when
 *   another process owns it, the takeover race was lost, the stale lock could
 *   not be claimed, or an unexpected error occurred (which is logged).
 */
function acquireLock() {
const lockFile = getLockFilePath();
// NOTE(windows): mode 0o700 / 0o600 are silently ignored on Windows.
// Lock directory and file permissions provide no OS-level isolation on
// Windows; rely on user-profile directory ACLs (%USERPROFILE%\.evomap).
try {
try { fs.mkdirSync(path.dirname(lockFile), { recursive: true, mode: 0o700 }); } catch (_) {}
// Fast path: 'wx' fails with EEXIST when the lock file already exists.
try {
fs.writeFileSync(lockFile, _lockPayload(), { flag: 'wx', mode: 0o600 });
return true;
} catch (exclErr) {
if (exclErr.code !== 'EEXIST') throw exclErr;
}
// A lock file exists: decide whether its owner is still entitled to it.
const payload = _readLockPayload(lockFile);
if (!payload || !Number.isFinite(payload.pid) || payload.pid <= 0) {
console.log('[Singleton] Corrupt lock file. Taking over.');
} else if (_lockIsStaleByLease(lockFile, payload)) {
// Round-9: a lease-aware daemon has not refreshed this lock's mtime
// within the stale TTL. Either it was SIGKILLed/crashed, or its PID
// has since been reused by an unrelated process (kill(0) below would
// then falsely report it alive and we would refuse to start forever).
// The expired lease is authoritative: take over.
console.log('[Singleton] Lock lease expired (PID ' + payload.pid + ', no mtime refresh for > ' +
Math.round(STALE_LOCK_TTL_MS / 60_000) + 'min). Taking over.');
} else {
try {
// Signal 0 sends nothing; it only probes whether the PID exists.
process.kill(payload.pid, 0);
// Process exists. Distinguish "alive, our user" (refuse) from
// "alive, different uid" (also refuse -- never barge into a root
// daemon under a user-launched evolver, etc.).
console.log(`[Singleton] Evolver loop already running (PID ${payload.pid}). Exiting.`);
return false;
} catch (e) {
if (e && e.code === 'EPERM') {
// PID exists but belongs to another user. Conservatively
// refuse: barging in would race the existing daemon for
// secret/heartbeat ownership.
console.warn(`[Singleton] Lock owned by PID ${payload.pid} (different user). Refusing to take over. ` +
`Remove ${lockFile} manually if the PID is actually dead.`);
return false;
}
console.log(`[Singleton] Stale lock found (PID ${payload.pid}). Taking over.`);
}
}
// Atomic takeover so two daemons that both observe the same stale PID
// and pass the kill(0) check cannot both end up "owning" the lock.
//
// Bug it fixes: the previous "unconditional unlinkSync then linkSync"
// pattern was NOT atomic across acquirers. Interleaving where P1 wins
// the linkSync but P2's unlinkSync then deletes P1's freshly-linked
// file (P2 never re-verifies it's deleting the same stale lock it
// observed) lets P2's subsequent linkSync also succeed. Both processes
// then return true and start a daemon, racing each other on the
// shared singleton secret store.
//
// renameSync is atomic at the filesystem level: only one of N racing
// acquirers can move the stale lockFile to a unique claim name, the
// rest see ENOENT and abort. After the claim succeeds, _writeLockAtomic
// installs the fresh lock; the claim file is unlinked in every exit
// path so it doesn't accumulate.
const claimFile = lockFile + '.' + process.pid + '.' + Date.now() + '.takeover';
try {
fs.renameSync(lockFile, claimFile);
} catch (e) {
if (e && e.code === 'ENOENT') {
// Another concurrent acquirer already claimed the stale lock.
// They'll race us on _writeLockAtomic below; the EEXIST branch
// handles the loser case correctly.
} else {
console.warn('[Singleton] Cannot claim stale lock at ' + lockFile + ': ' + e.message);
return false;
}
}
try {
_writeLockAtomic(lockFile, _lockPayload());
} catch (linkErr) {
try { fs.unlinkSync(claimFile); } catch (_) {}
if (linkErr && linkErr.code === 'EEXIST') {
// Lost the link race to another concurrent acquirer. Read who
// won (best-effort) for the log line.
const winner = _readLockPayload(lockFile);
console.log('[Singleton] Lost takeover race to PID ' + (winner && winner.pid) + '. Exiting.');
return false;
}
// Non-EEXIST failures go to the outer catch, which logs and returns false.
throw linkErr;
}
try { fs.unlinkSync(claimFile); } catch (_) {}
return true;
} catch (err) {
console.error('[Singleton] Lock acquisition failed:', err);
return false;
}
}
/**
 * Delete the lock file, but only when its payload names this process.
 * A lock recorded for a different PID is left alone; filesystem errors are
 * ignored.
 */
function releaseLock() {
  const lockFile = getLockFilePath();
  try {
    if (!fs.existsSync(lockFile)) return;
    const owner = _readLockPayload(lockFile);
    const ownedByUs = owner && owner.pid === process.pid;
    if (ownedByUs) {
      fs.unlinkSync(lockFile);
    }
  } catch (e) {
    /* ignore */
  }
}
// Round-7 (§20.7): the daemon-lock acquireLock() only fires for `--loop`
// mode; CLI subcommands like `evolver fetch` and `evolver sync` run
// without acquiring the lock and freely call sendHelloToHub when
// node_secret is missing. The hub-side hello-with-rotate rewrites the
// node_secret on disk, so two writers (the daemon's heartbeat path
// rotating one secret + this CLI's sendHelloToHub writing a different
// one) race to be "last writer." Whichever wrote second silences the
// other -- the daemon then 401-loops -> enters reauth backoff -> goes
// silent for 30 min..4 h. The original §6 "instance lock" scenario.
//
// This helper does NOT take the lock (the daemon legitimately owns it);
// it only refuses to proceed if a LIVE daemon owns the lock AND we are
// about to send a fresh hello. If the daemon is alive it already has a
// valid secret in ~/.evomap/node_secret, so the right thing for the CLI
// is to wait briefly for the secret to appear (newly registered daemon)
// or exit with an actionable error.
//
// Callers: every CLI subcommand whose runner could call sendHelloToHub()
// when getHubNodeSecret() returns empty. Currently: fetch, sync
// (round-7 §20.7), plus atp-complete, buy, orders, verify (round-8
// §21.8 -- the ATP runners hit the same vector via consumerAgent /
// merchantAgent / atpExecute paths).
/**
 * Abort the current CLI command (exit code 1) when a live daemon owns the
 * instance lock, so the CLI does not send a competing hello.
 *
 * Returns normally -- letting the caller proceed -- when there is no lock
 * file, the payload has no usable PID, the lock is our own, the lease is
 * stale, the recorded PID no longer exists, or the check itself fails.
 *
 * @param {string} toolLabel - Name of the CLI tool, used as the log prefix.
 */
function refuseHelloIfDaemonRunning(toolLabel) {
  try {
    const lockFile = getLockFilePath();
    if (!fs.existsSync(lockFile)) return; // no daemon

    const owner = _readLockPayload(lockFile);
    const hasUsablePid = Boolean(owner) && Number.isFinite(owner.pid) && owner.pid > 0;
    if (!hasUsablePid) return;
    if (owner.pid === process.pid) return; // shouldn't happen for CLI

    // Round-9: a lease-aware lock whose mtime has gone stale means the
    // daemon is dead (or its PID was reused). Do NOT refuse on it -- that
    // was the "CLI hard-exits because it trusts a recyclable PID" hole.
    if (_lockIsStaleByLease(lockFile, owner)) return;

    let ownerGone = false;
    try {
      process.kill(owner.pid, 0);
    } catch (probeErr) {
      // ESRCH = stale lock, daemon is gone. Anything else (e.g. EPERM =
      // alive under a different user) still counts as a real daemon.
      ownerGone = Boolean(probeErr && probeErr.code === 'ESRCH');
    }
    if (ownerGone) return;

    const lines = [
      '[' + toolLabel + '] Refusing to send hello: an evolver daemon ' +
        '(PID ' + owner.pid + ') is running and owns ~/.evomap/instance.lock.',
      ' Two concurrent hello calls would rotate node_secret against ' +
        'each other and silence the daemon for hours.',
      ' Either wait for the daemon to register (the secret will ' +
        'appear at ~/.evomap/node_secret), or stop the daemon and retry.',
    ];
    for (const line of lines) {
      console.error(line);
    }
    process.exit(1);
  } catch (_) {
    // Never let the lock-check helper itself escape; if it throws (FS
    // permission, etc.) the caller continues on its original code path.
    // The race being guarded against is rare; failing closed here would
    // block legitimate CLI use.
  }
}
async function main() {
const args = process.argv.slice(2);
const command = args[0];
const isLoop = args.includes('--loop') || args.includes('--mad-dog');
const isVerbose = args.includes('--verbose') || args.includes('-v') ||
String(process.env.EVOLVER_VERBOSE || '').toLowerCase() === 'true';
if (isVerbose) process.env.EVOLVER_VERBOSE = 'true';
if (!command || command === 'run' || command === '/evolve' || isLoop) {
if (isLoop) {
// EPIPE protection. The daemon may outlive the controlling
// terminal (user closes the iTerm tab, ssh session drops, parent
// shell exits). The SIGHUP handler below covers the signal side,
// but the underlying pty fd is gone and the FIRST subsequent
// console.log writes to a closed pipe -> stdout emits 'error'
// with EPIPE. Without a listener attached, Node escalates EPIPE
// to uncaughtException, which our handler then turns into
// process.exit(1). Net result: daemon silently dies the next
// time it tries to log, with no useful trace. Swallow EPIPE
// explicitly so the daemon stays alive when its terminal goes
// away (matching standard daemonization practice).
try {
// EPIPE: swallow (daemon must outlive its controlling terminal).
// Non-EPIPE (EIO, ENOSPC on redirected log, etc.): the listener
// already prevents 'error' from escalating to uncaughtException,
// so write a one-line trace to the *other* stream so operators
// can see the failure mode instead of finding a silent daemon.
process.stdout.on('error', function (err) {
if (err && err.code === 'EPIPE') return;
try { process.stderr.write('[evolver] stdout error: ' + (err && (err.code || err.message) || err) + '\n'); } catch (_) {}
});
process.stderr.on('error', function (err) {
if (err && err.code === 'EPIPE') return;
try { process.stdout.write('[evolver] stderr error: ' + (err && (err.code || err.message) || err) + '\n'); } catch (_) {}
});
} catch (_) {}
const originalLog = console.log;
const originalWarn = console.warn;
const originalError = console.error;
function ts() { return '[' + new Date().toISOString() + ']'; }
// Wrap originals in try/catch so a broken transport (closed pty,
// disk full on a redirected log file) cannot escape and trip
// unhandledException -> exit(1) the next time we log.
console.log = (...args) => {
try { originalLog.call(console, ts(), ...args); } catch (_) {}
};
console.warn = (...args) => {
try { originalWarn.call(console, ts(), ...args); } catch (_) {}
};
console.error = (...args) => {
try { originalError.call(console, ts(), ...args); } catch (_) {}
};
}
console.log('Starting evolver...');
// Preflight: refuse to start when `git --version` cannot be run. Several
// cycle-critical steps shell out to git early (repo resolution, diff,
// blast-radius), and on Windows in particular a missing git binary can
// make evolver hang silently (see #394). Checking up front turns that
// into an obvious failure with install hints, followed by exit code 1.
try {
  const childProcess = require('child_process');
  childProcess.execSync('git --version', { stdio: 'ignore', timeout: 5000, windowsHide: true });
} catch (_gitErr) {
  // Emitted line by line so each one gets its own console.error call.
  const preflightHelp = [
    '',
    '[Preflight] Could not run "git --version". Evolver requires git to be installed and available on PATH.',
    '[Preflight] On Windows: install Git from https://git-scm.com/download/win and make sure `git --version` works in a fresh terminal.',
    '[Preflight] On macOS: xcode-select --install (or `brew install git`)',
    '[Preflight] On Linux: sudo apt-get install -y git (or your distro equivalent)',
    '',
  ];
  for (const helpLine of preflightHelp) {
    console.error(helpLine);
  }
  process.exit(1);
}
if (isLoop) {
// Internal daemon loop (no wrapper required).
// Single-instance guard: when acquireLock() returns a falsy value the
// process exits with status 0 rather than an error code (presumably
// because another daemon already holds the lock -- confirm against
// acquireLock(), which is defined elsewhere in this file).
if (!acquireLock()) process.exit(0);
// Round-9: refresh the lock lease so other processes can detect a
// crash / PID reuse via stale mtime instead of trusting kill(0).
// The matching stopLockRefresh() call lives in shutdown().
startLockRefresh();
// Linux OOM score adjustment: lower oom_score_adj so the kernel
// deprioritises evolver when choosing an OOM victim. This is a
// best-effort hint -- the kernel can still kill us under extreme
// memory pressure, but we will not be the first target.
//
// Value -500 (range -1000..1000; -1000 = never kill, 0 = default,
// +1000 = kill first). -500 gives meaningful protection without
// reserving the slot for truly critical system services.
//
// Requires the process to be either root or to have CAP_SYS_RESOURCE.
// On most Docker/k8s images running as non-root this write will fail
// with EACCES -- that is expected and harmless; we log a one-liner so
// operators know to pass --oom-score-adj=-500 via their container spec,
// or to set /proc/<pid>/oom_score_adj from the supervising process.
//
// Users who want to set this from outside the process (safer, no CAP):
// echo -500 > /proc/$(pgrep -f "node.*evolver.*--loop")/oom_score_adj
//
// Opt-out: EVOLVER_DISABLE_OOM_ADJUST=1
if (process.platform === 'linux' &&
    String(process.env.EVOLVER_DISABLE_OOM_ADJUST || '') !== '1') {
  // Best-effort: ask the kernel to deprioritise this process as an OOM
  // victim. Any failure is reported with a single log line and ignored.
  const requestedAdj = '-500';
  try {
    require('fs').writeFileSync('/proc/self/oom_score_adj', requestedAdj + '\n', 'utf8');
    console.log(`[evolver] Set Linux oom_score_adj=${requestedAdj} to reduce OOM-kill priority.`);
  } catch (oomErr) {
    // EACCES under non-root / no CAP_SYS_RESOURCE is expected; EPERM
    // inside stricter seccomp/apparmor profiles. Both are non-fatal.
    const reason = (oomErr && oomErr.code) || 'unknown';
    console.log(`[evolver] Could not set oom_score_adj (${reason}). To protect evolver from OOM kill, run as root, add CAP_SYS_RESOURCE, or set oom_score_adj externally via your container spec (e.g. resources.requests + oom_score_adj in k8s).`);
  }
}
// Round-4: macOS App Nap / QoS demotion mitigation. Without this,
// a backgrounded `evolver --loop` running in an iTerm tab gets its
// process QoS demoted to UTILITY/BACKGROUND once the parent app
// is no longer focused. CPU runtime caps to ~5% of one core,
// setTimeout resolution drops toward 1 Hz, disk I/O is throttled.
// The drift detector cannot rescue this because the demotion does
// NOT cause Date.now() to jump -- only the inter-tick interval
// dilates, which the detector samples through its own (also
// demoted) setInterval. Net result: heartbeat appears alive but
// ticks fire so slowly that the hub marks the node offline,
// matching the user-reported "first launch ok -> idle -> dead
// forever" pattern.
//
// os.setPriority() raises BSD process priority; macOS bridges that
// to Mach thread QoS via the priority bridge so the demotion does
// not engage. -10 is the most negative value raisable without
// root. Failures are logged but non-fatal (e.g. EPERM under a
// restrictive sandbox -- the daemon continues, just unprotected).
// Opt-out via EVOLVER_DISABLE_PRIORITY_BOOST=1 for users on
// power-constrained battery profiles who would rather accept
// the throttle than the extra wake-time.
if (process.platform === 'darwin' &&
    String(process.env.EVOLVER_DISABLE_PRIORITY_BOOST || '') !== '1') {
  // Set to true only after the boost has been read back and confirmed;
  // drives the caffeinate fallback decision below.
  let priorityBoostOk = false;
  try {
    const os = require('os');
    os.setPriority(0, -10);
    // Round-5: actually verify the boost landed. macOS silently
    // returns success from setPriority(2) under some sandboxes
    // even when the underlying syscall was rejected by the
    // Mach thread-policy bridge. Read it back; if the value is
    // still 0 (or worse), App Nap will engage and the user
    // sees the "first launch -> idle -> dead" symptom from
    // round-3 with NO log evidence to RCA from.
    const observed = os.getPriority();
    if (observed <= -10) {
      priorityBoostOk = true;
      console.log('[evolver] Raised process priority on macOS to ' + observed +
        ' to prevent App Nap / QoS demotion.');
    } else {
      console.warn('[evolver] setPriority(-10) reported success but observed priority is ' +
        observed + '; App Nap protection NOT in effect. ' +
        'Run with EVOLVER_CAFFEINATE=1 or via `caffeinate -is node index.js --loop`.');
    }
  } catch (e) {
    console.warn('[evolver] setPriority(-10) refused (' + (e && e.code || 'unknown') +
      '): ' + (e && e.message || e) + '. App Nap protection NOT in effect. ' +
      'Run with EVOLVER_CAFFEINATE=1 or via `caffeinate -is node index.js --loop`.');
  }
  // Round-5: caffeinate side-child. Round-4 made this opt-in via
  // EVOLVER_CAFFEINATE=1 to avoid the extra Activity-Monitor row;
  // the round-5 audit found that 99% of users never set the env
  // var, so the App Nap fallback was effectively unused. Promote
  // to default-on when the priority boost did NOT land (so we
  // either have priority or have caffeinate, never neither),
  // unless the user has explicitly opted out via
  // EVOLVER_CAFFEINATE=0. The combined effect: a fresh laptop
  // user gets at least one layer of throttle protection without
  // having to learn about either env var.
  const caffeinateRaw = String(process.env.EVOLVER_CAFFEINATE || '').toLowerCase().trim();
  const caffeinateOptedIn = caffeinateRaw === '1' || caffeinateRaw === 'true';
  const caffeinateOptedOut = caffeinateRaw === '0' || caffeinateRaw === 'false';
  const caffeinateFallback = !priorityBoostOk && !caffeinateOptedOut;
  if (caffeinateOptedIn || caffeinateFallback) {
    // Single warning shared by both failure paths: a synchronous throw
    // from spawn() (bad arguments) and the asynchronous 'error' event
    // (binary missing from PATH, not executable, ...).
    const warnCaffeinateFailed = (e) => {
      console.warn('[evolver] caffeinate spawn failed: ' +
        (e && e.message || e) + '. App Nap may throttle the heartbeat. ' +
        'Install caffeinate (Xcode CLT) or run under a launchd plist with NSAppSleepDisabled=1.');
    };
    try {
      const child = spawn('caffeinate', ['-i', '-w', String(process.pid)], {
        detached: true,
        stdio: 'ignore',
      });
      // spawn() does NOT throw when the binary cannot be started; it
      // emits 'error' on the child asynchronously. Without a listener
      // that event becomes an uncaughtException, which this daemon's
      // handler turns into exit(1) -- the opposite of a best-effort
      // fallback. Listen for it and downgrade to a warning.
      child.on('error', warnCaffeinateFailed);
      child.unref();
      // child.pid is undefined when the spawn failed; in that case the
      // 'error' listener above reports it, so do not claim success.
      if (child.pid !== undefined) {
        console.log('[evolver] Spawned caffeinate -i -w ' + process.pid +
          ' to block App Nap (pid ' + child.pid + ').' +
          (caffeinateFallback ? ' (fallback because priority boost was refused)' : ''));
      }
    } catch (e) {
      warnCaffeinateFailed(e);
    }
  }
}
// Event-loop keep-alive anchor (defense-in-depth for the sleepMs fix).
//
// All timers in a2aProtocol.js (heartbeat, drift detector, self-driving
// poll, SSE reconnect) are unref'd so they never prevent a clean exit.
// The sleepMs() timer above is now ref'd (the primary fix), but as an
// additional safety net we install one ref'd setInterval here that fires
// every 10 minutes. Its only job is to emit a lightweight log line so
// the evolver_loop.log gets touched even when the daemon is completely
// idle (no session signals, evolve cycle sleeping at maxSleepMs). This
// guarantees the event loop has at least one ref'd handle at all times
// while the daemon is running, and provides a heartbeat-on-disk so
// lifecycle.checkHealth() (MAX_SILENCE_MS = 30 min default) does not
// wrongly declare the process stagnant during legitimate long idle windows.
// Cleared in shutdown() so it does not outlive the daemon.
const _KEEPALIVE_INTERVAL_MS = 10 * 60 * 1000;
// Ref'd interval that appends one `keepalive_tick` JSON line to the evolver
// log on every firing. Each tick is strictly best-effort: any failure
// (module load, stats lookup, directory creation, append) is swallowed so
// the timer itself can never be killed by a broken log destination.
let _keepAliveTimer = setInterval(() => {
  try {
    const protocol = require('./src/gep/a2aProtocol');
    // Nothing is written when the heartbeat stats accessor is unavailable.
    if (typeof protocol.getHeartbeatStats !== 'function') return;
    const stats = protocol.getHeartbeatStats();
    const { getEvolverLogPath } = require('./src/gep/paths');
    const fsMod = require('fs');
    const pathMod = require('path');

    const logPath = getEvolverLogPath();
    const ensureLogDir = () => fsMod.mkdirSync(pathMod.dirname(logPath), { recursive: true });
    const appendLine = (text) => fsMod.appendFileSync(logPath, text, { encoding: 'utf8' });

    ensureLogDir();
    const line = JSON.stringify({
      ts: new Date().toISOString(),
      type: 'keepalive_tick',
      hb_running: stats.running,
      hb_last_tick_ago_s: stats.lastTickAt ? Math.round((Date.now() - stats.lastTickAt) / 1000) : null,
    }) + '\n';

    try {
      appendLine(line);
    } catch (appendErr) {
      // Inline equivalent of a2aProtocol._appendHeartbeatLog's ENOENT-retry
      // (that helper is not exported): recreate the directory and try once
      // more. Any other append error is dropped without a retry.
      if (!appendErr || appendErr.code !== 'ENOENT') return;
      ensureLogDir();
      appendLine(line);
    }
  } catch (_) { /* never let any error kill the keep-alive timer */ }
}, _KEEPALIVE_INTERVAL_MS);
// Intentionally ref'd: this is the explicit event-loop anchor.
// Do NOT add .unref() here -- that would defeat the purpose.
/**
 * Tear down daemon-owned resources: the keep-alive interval, the lock
 * lease refresher, the lock itself, and the a2aProtocol heartbeat and
 * event stream. Runs on process 'exit' and before exiting on
 * SIGINT / SIGTERM, so it can be invoked more than once per shutdown.
 */
function shutdown() {
  if (_keepAliveTimer) {
    clearInterval(_keepAliveTimer);
    _keepAliveTimer = null;
  }
  stopLockRefresh();
  releaseLock();
  // stopHeartbeat() clears the drift detector interval and the heartbeat
  // timer, preventing "ghost tick" log noise after exit and ensuring a
  // clean state if the process is somehow continued (test harness, etc.).
  // Each stop call is attempted independently; failures are ignored.
  for (const stopFn of ['stopHeartbeat', 'stopEventStream']) {
    try { require('./src/gep/a2aProtocol')[stopFn](); } catch (e) {}
  }
}
process.on('exit', shutdown);
// Termination signals: clean up, then exit with the default exit code.
['SIGINT', 'SIGTERM'].forEach((signalName) => {
  process.on(signalName, () => {
    shutdown();
    process.exit();
  });
});
// SIGHUP: two meanings depending on platform and how the daemon was started.
//
// macOS / interactive terminal: closing the iTerm/Terminal tab sends
// SIGHUP to the controlling process, and Node's default action is to
// terminate. That is the most common "first-launch, then idle, then
// evolver dead" path on macOS. As a daemon we intentionally ignore it.
//
// Linux systemd: `systemctl reload evolver` delivers SIGHUP to signal
// configuration reload. The socket / connection state may be stale (e.g.
// the hub URL changed in .env, or the admin wants a fresh hello after a
// manual secret rotation). We treat reload as a soft wake-recovery: drain
// the undici pool, poke the heartbeat, and restart the SSE stream, which
// is identical to what SIGCONT / the drift detector do on system resume.
// We also emit sd_notify RELOADING=1 / READY=1 so systemd can track the
// reload state (required for Type=notify units that call systemctl reload).
//
// A one-shot (non --loop) invocation keeps the default behavior because
// this branch is gated on `isLoop`.
process.on('SIGHUP', () => {
  try {
    if (process.platform !== 'linux') {
      // macOS / non-systemd: terminal-close semantics, ignore the signal.
      console.warn('[evolver] Received SIGHUP (controlling terminal closed?). ' +
        'Daemon ignoring -- heartbeat loop continues. To stop the daemon use SIGINT/SIGTERM.');
      return;
    }

    // Best-effort sd_notify: the state string is built only when
    // a2aProtocol actually exposes _sdNotify, and every failure (module
    // load, notify call) is swallowed.
    const notifyServiceManager = (buildState) => {
      try {
        const protocol = require('./src/gep/a2aProtocol.js');
        if (typeof protocol._sdNotify === 'function') {
          protocol._sdNotify(buildState());
        }
      } catch (_) {}
    };

    // On Linux, SIGHUP from systemd means reload, not terminal close.
    // Announce the reload first so systemd does not time out waiting,
    // then run the recovery, then report READY=1 to confirm steady state.
    notifyServiceManager(() => {
      // MONOTONIC_USEC wants microseconds from the monotonic clock.
      // process.hrtime() yields a [seconds, nanoseconds] pair; plain
      // number arithmetic avoids BigInt literals (Node <10.3 compat).
      const [seconds, nanos] = process.hrtime();
      const monotonicUsec = seconds * 1000000 + Math.floor(nanos / 1000);
      return 'RELOADING=1\nMONOTONIC_USEC=' + monotonicUsec;
    });

    console.warn('[evolver] Received SIGHUP on Linux (systemctl reload?). ' +
      'Running wake recovery (drain pool + poke heartbeat + restart SSE). ' +
      'To stop the daemon use SIGINT/SIGTERM.');

    try {
      const protocol = require('./src/gep/a2aProtocol.js');
      if (typeof protocol._runWakeRecovery === 'function') protocol._runWakeRecovery();
    } catch (_) {}

    // Interrupt any pending sleepMs so the evolve loop picks up
    // immediately after the reload rather than sitting out its window.
    try { _interruptAllSleeps(); } catch (_) {}

    // READY=1 closes the RELOADING window; systemd marks the reload
    // complete once it sees this notification.
    notifyServiceManager(() => 'READY=1');
  } catch (_) {}
});
// SIGCONT fires on `kill -CONT`, debugger detach, and some VM/sleep
// resume paths. Nudge the heartbeat loop so it doesn't sit waiting for
// its next scheduled tick (which could be up to 30 min away under
// backoff) before reconnecting after a wake event. Also restart the
// SSE stream: the underlying TCP socket almost certainly died during
// the SIGSTOP window without a FIN reaching us, and the existing
// exponential reconnect could be up to 120s away on the resumed
// monotonic clock.
// Round-6 (§19.5): register process-level wake hooks so both the
// SIGCONT handler and the drift detector's long-sleep branch
// (a2aProtocol) interrupt the outer evolve sleepMs and poke the
// validator daemon, not just the heartbeat-internal recovery.
_registerProcessWakeHooks();
// SIGCONT is not supported on Windows (process.on() throws ERR_UNKNOWN_SIGNAL).
// Wake recovery on Windows is handled exclusively by the drift detector.
if (process.platform !== 'win32') {
  // Real recovery delegates to a2aProtocol._runWakeRecovery so SIGCONT
  // and the drift detector share one code path. NOTE: per followups
  // §18.2, SIGCONT is never sent by the macOS kernel on system wake;
  // this handler primarily covers:
  //   - hypervisor/docker resume (container unpause)
  //   - `kill -CONT <pid>` from operators or supervisors
  //   - Linux debugger attach/detach (ptrace SIGSTOP+SIGCONT;
  //     on Linux this is a true job-control signal unlike macOS)
  //   - `docker unpause` (sends SIGCONT to all cgroup processes)
  // Bare-metal macOS wake recovery is driven by the drift detector
  // only. _runWakeRecovery() has a 1s debounce gate so a rapid burst
  // (e.g. gdb repeatedly attaching) collapses into one recovery
  // without leaking undici agents or SSE connections.
  const recoverAfterResume = () => {
    try {
      const protocol = require('./src/gep/a2aProtocol.js');
      if (typeof protocol._runWakeRecovery !== 'function') return;
      protocol._runWakeRecovery();
    } catch (_) {}
  };
  process.on('SIGCONT', recoverAfterResume);
}
// Last-resort handler: log the failure (stack when available, otherwise
// the stringified value), release the lock, and exit with status 1.
process.on('uncaughtException', function onUncaughtException(err) {
  const detail = (err && err.stack) || String(err);
  console.error('[FATAL] Uncaught exception:', detail);
  releaseLock();
  process.exit(1);
});
// Sliding window: only exit if many rejections cluster in a short
// period AND the daemon shows no other signs of life. A daemon
// running for weeks can accumulate harmless, unrelated rejections
// (transient network blips, hub timeouts); the original cumulative
// counter would eventually kill the process for noise. Cluster =
// real failure cascade. But macOS wake bursts also synthesize
// clusters: heartbeat / SSE / validator / merchantAgent / ATP all
// fire near-simultaneously on resume and any subsystem with an
// unhandled async-callback throw can blow past 5 rejections in