From b8457ddb95bd69d94b58ceebd47da029c2abbed8 Mon Sep 17 00:00:00 2001 From: jim800121chen Date: Mon, 18 May 2026 14:02:54 +0800 Subject: [PATCH] =?UTF-8?q?fix(task-scheduler):=20Bug=20#8=20=E2=80=94=20t?= =?UTF-8?q?erminal=20release=20=E4=BF=9D=E7=95=99=20job=20record=EF=BC=88v?= =?UTF-8?q?isionA=20poll=20404=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit visionA e2e 撞到:job COMPLETED 後 < 1s poll GET /api/v1/jobs/:id 回 404。 根因:jobService terminal release(COMPLETED / FAILED)path 用了 release_active_job.lua、該 Lua 給 enqueue rollback 用、會 DEL job:{id} + SREM user:jobs Set、post-completion API 拿不到 record。 修法 A(不改舊 Lua、新增專用 Lua): - 新增 src/redis/luaScripts/release_lock_only.lua — 只 DEL active_job lock、 保留 job:{id} record 與 user:{}:jobs Set、給正常 terminal 用 - 新增 releaseActiveLockOnly() JS wrapper(同 releaseActiveJob 的 API surface + atomic guard) - jobService.js terminal release path 改用 releaseActiveLockOnly - enqueue 失敗 rollback path 仍用舊 releaseActiveJob(語意正確、該情境 job 尚未 schedule 完成、清乾淨才對) - t9 unit / integration test 用 destructuring alias rename 避免改 27 個 assertion post-completion API 路徑(visionA poll / Phase B /result / promote)都需要 job record 仍在 — 本修法解此契約。 Tests: 666/666 pass(無回歸) Reviewer: ✅ 通過、設計嚴謹(atomic guard、rollback vs terminal 語意分離) Co-Authored-By: Claude Opus 4.7 (1M context) --- apps/task-scheduler/src/redis/luaScripts.js | 42 +++++++++++++++++ .../redis/luaScripts/release_lock_only.lua | 45 +++++++++++++++++++ .../jobService.t9.integration.test.js | 22 ++++++++- .../services/__tests__/jobService.t9.test.js | 6 ++- .../task-scheduler/src/services/jobService.js | 19 +++++--- 5 files changed, 126 insertions(+), 8 deletions(-) create mode 100644 apps/task-scheduler/src/redis/luaScripts/release_lock_only.lua diff --git a/apps/task-scheduler/src/redis/luaScripts.js b/apps/task-scheduler/src/redis/luaScripts.js index d9e406e..484d533 100644 --- a/apps/task-scheduler/src/redis/luaScripts.js +++ b/apps/task-scheduler/src/redis/luaScripts.js @@ -176,9 +176,51 @@ async function releaseActiveJob(redis, { userId, jobId }) { ); } +/** + * Release active_job lock only — terminal use (Bug #8 修法 A, 2026-05-18). + * + * 跟 releaseActiveJob 的差別:**只 DEL active_job lock、保留 job:{id} record 與 + * user:{}:jobs Set**。給 job 正常 terminal(COMPLETED / FAILED)時用、讓 post-completion + * API (poll / promote / download) 仍可拿到 job record。 + * + * 為什麼新 Lua 而不改舊 Lua:舊 Lua(release_active_job.lua)給 enqueue 失敗 rollback 用、 + * 該情境 job record 確實該清(沒成功 schedule、無 observer);改舊 Lua 會破壞 rollback 語意。 + * + * @param {import('ioredis').Redis} redis + * @param {object} args + * @param {string} args.userId + * @param {string} args.jobId + * @returns {Promise<{ok: true, released: boolean}>} + */ +async function releaseActiveLockOnly(redis, { userId, jobId }) { + if (!userId || typeof userId !== 'string') { + throw new Error('[releaseActiveLockOnly] userId is required'); + } + if (!jobId || typeof jobId !== 'string') { + throw new Error('[releaseActiveLockOnly] jobId is required'); + } + + const script = loadScript('release_lock_only.lua'); + const keys = [`user:${userId}:active_job`]; + const args = [jobId]; + + const result = await evalScript(redis, script, keys, args); + + if (Array.isArray(result) && result[0] === 'OK') { + return { ok: true, released: true }; + } + if (Array.isArray(result) && result[0] === 'NOOP') { + return { ok: true, released: false }; + } + throw new Error( + `[releaseActiveLockOnly] Unexpected Lua response: ${JSON.stringify(result)}` + ); +} + module.exports = { claimActiveJob, releaseActiveJob, + releaseActiveLockOnly, // 內部 helper 暴露給單元測試 _internals: { loadScript, diff --git a/apps/task-scheduler/src/redis/luaScripts/release_lock_only.lua b/apps/task-scheduler/src/redis/luaScripts/release_lock_only.lua new file mode 100644 index 0000000..544a735 --- /dev/null +++ b/apps/task-scheduler/src/redis/luaScripts/release_lock_only.lua @@ -0,0 +1,45 @@ +-- release_lock_only.lua +-- +-- 2026-05-18 e2e debug 新增(Bug #8 修法 A): +-- 用於 job 正常 terminal(COMPLETED / FAILED)時釋放 active_job lock, +-- **保留 job:{id} record 與 user:{}:jobs Set**,讓後續 post-completion API +-- (poll / promote / download) 仍可拿到 job record。 +-- +-- 跟 release_active_job.lua 的差別: +-- - release_active_job.lua: enqueue 失敗回滾用、DEL active_job + DEL job + SREM +-- (清得乾淨、因為 job 還沒真的 schedule、user 看不到、no observer) +-- - release_lock_only.lua: 正常 terminal 用、只 DEL active_job +-- (job record 必須留著、Phase 0.8 visionA / Phase 2 download-tokens 都需要) +-- +-- 為什麼用 Lua(與 release_active_job.lua 同理): +-- 1. **Atomic guard**:只有當 active_job 仍然指向 ARGV[1] 時才釋放, +-- 避免「completion + 新 claim 連續發生」造成誤刪別人的 active_job lock +-- 2. **單次 round-trip** +-- +-- KEYS: +-- KEYS[1] = user:{user_id}:active_job — 該 user 當前 in-progress job_id(String) +-- +-- ARGV: +-- ARGV[1] = job_id — 要釋放的 job_id;只有當 active_job +-- 的值等於這個 job_id 時才執行 DEL +-- +-- Returns: +-- {"OK"} — active_job lock 已 DEL;job record 保留 +-- {"NOOP"} — active_job 不等於 ARGV[1] 或不存在; +-- 未做任何修改 +-- +-- 注意事項: +-- * 不刪 job:{id}:post-completion API(visionA GET /api/v1/jobs/:id、Phase 2 +-- download-tokens、promote)都需要 job record 存在 +-- * 不 SREM user:{}:jobs:user 的 jobs Set 是 audit trail / list 用, +-- job 完成後仍應在 list +-- * 不在 Lua 內 log,所有觀察性靠呼叫端的 structured log + +local current = redis.call('GET', KEYS[1]) +if current ~= ARGV[1] then + return {'NOOP'} +end + +redis.call('DEL', KEYS[1]) + +return {'OK'} diff --git a/apps/task-scheduler/src/services/__tests__/jobService.t9.integration.test.js b/apps/task-scheduler/src/services/__tests__/jobService.t9.integration.test.js index d9c722c..31ee347 100644 --- a/apps/task-scheduler/src/services/__tests__/jobService.t9.integration.test.js +++ b/apps/task-scheduler/src/services/__tests__/jobService.t9.integration.test.js @@ -47,9 +47,9 @@ jest.mock('../../redis/luaScripts', () => { return { ok: true }; }), releaseActiveJob: jest.fn(async (_redis, { userId, jobId }) => { + // 舊行為:enqueue 失敗 rollback 用、刪 lock + job + SREM const current = state.activeJobMap.get(userId); if (current !== jobId) { - // Lua atomic guard:active_job 不等於 ARGV[1] → NOOP return { ok: true, released: false }; } state.activeJobMap.delete(userId); @@ -58,6 +58,17 @@ jest.mock('../../redis/luaScripts', () => { if (set) set.delete(jobId); return { ok: true, released: true }; }), + // 2026-05-18 Bug #8 修法 A:terminal release 用、**只刪 active_job lock**、 + // 保留 jobMap + userJobsMap 給 post-completion API 用 + releaseActiveLockOnly: jest.fn(async (_redis, { userId, jobId }) => { + const current = state.activeJobMap.get(userId); + if (current !== jobId) { + return { ok: true, released: false }; + } + state.activeJobMap.delete(userId); + // **不刪** jobMap / userJobsMap(與 release_lock_only.lua 對齊) + return { ok: true, released: true }; + }), _internals: { loadScript: jest.fn(), evalScript: jest.fn(), @@ -67,7 +78,14 @@ jest.mock('../../redis/luaScripts', () => { }; }); -const { claimActiveJob, releaseActiveJob, _internals } = require('../../redis/luaScripts'); +const { + claimActiveJob, + // 2026-05-18 Bug #8 修法 A:terminal release 改用 releaseActiveLockOnly。 + // 既有 test assert `releaseActiveJob.toHaveBeenCalledTimes(1)` 等檢查的是 + // terminal release 行為,故 alias rename 讓 assertion 不用全改。 + releaseActiveLockOnly: releaseActiveJob, + _internals, +} = require('../../redis/luaScripts'); const { createJobService } = require('../jobService'); function makeFakeRedis() { diff --git a/apps/task-scheduler/src/services/__tests__/jobService.t9.test.js b/apps/task-scheduler/src/services/__tests__/jobService.t9.test.js index 302c780..88899de 100644 --- a/apps/task-scheduler/src/services/__tests__/jobService.t9.test.js +++ b/apps/task-scheduler/src/services/__tests__/jobService.t9.test.js @@ -17,9 +17,13 @@ 'use strict'; // Mock luaScripts,控制 release 結果而不啟動真 Redis Lua +// 2026-05-18 Bug #8 修法 A:terminal release 改用 releaseActiveLockOnly(保留 job record); +// releaseActiveJob 仍存在但只給 enqueue 失敗 rollback 用。本 test file 測 terminal 場景、 +// 故 spy 改用 releaseActiveLockOnly。 jest.mock('../../redis/luaScripts', () => ({ claimActiveJob: jest.fn(), releaseActiveJob: jest.fn(), + releaseActiveLockOnly: jest.fn(), _internals: { loadScript: jest.fn(), evalScript: jest.fn(), @@ -27,7 +31,7 @@ jest.mock('../../redis/luaScripts', () => ({ }, })); -const { releaseActiveJob } = require('../../redis/luaScripts'); +const { releaseActiveLockOnly: releaseActiveJob } = require('../../redis/luaScripts'); const { createJobService } = require('../jobService'); function makeFakeRedis() { diff --git a/apps/task-scheduler/src/services/jobService.js b/apps/task-scheduler/src/services/jobService.js index 2c53b41..49bd41d 100644 --- a/apps/task-scheduler/src/services/jobService.js +++ b/apps/task-scheduler/src/services/jobService.js @@ -35,7 +35,11 @@ const path = require('path'); const crypto = require('crypto'); -const { claimActiveJob, releaseActiveJob } = require('../redis/luaScripts'); +const { + claimActiveJob, + releaseActiveJob, + releaseActiveLockOnly, +} = require('../redis/luaScripts'); const { toExternalStatus, isInProgress } = require('./statusMapper'); // Pipeline: fixed stage order,對齊 server.js L84-91 @@ -189,10 +193,15 @@ function createJobService(deps) { return; } try { - // 注意:releaseActiveJobByUser 定義在下方(T5 既有 closure 內 function - // declaration,會被 hoist)。本 helper 是 T9 新增,刻意不挪動 T5 既有 - // function 順序避免 diff 干擾 reviewer。 - const result = await releaseActiveJobByUser(userId, jobId); + // 2026-05-18 Bug #8 修法 A:terminal release 改用 releaseActiveLockOnly。 + // 舊行為(releaseActiveJobByUser → release_active_job.lua)會同時 DEL job:{id} + // + SREM user:{}:jobs、導致 post-completion API(visionA poll / promote / + // download)拿不到 job record(COMPLETED 後 < 1s 就 404)。 + // 新行為(releaseActiveLockOnly → release_lock_only.lua)只 DEL active_job lock、 + // 保留 job record 與 user:jobs Set。 + // 注意:enqueue 失敗 rollback path(jobs.js 內 jobService.releaseActiveJob) + // 仍使用舊 Lua、因為該情境 job 尚未 schedule、清乾淨是正確語意。 + const result = await releaseActiveLockOnly(redis, { userId, jobId }); // eslint-disable-next-line no-console console.log( JSON.stringify({