fix(task-scheduler): Bug #8 — terminal release 保留 job record(visionA poll 404)
visionA e2e 撞到:job COMPLETED 後 < 1s poll GET /api/v1/jobs/:id 回 404。
根因:jobService terminal release(COMPLETED / FAILED)path 用了
release_active_job.lua、該 Lua 給 enqueue rollback 用、會 DEL job:{id}
+ SREM user:jobs Set、post-completion API 拿不到 record。
修法 A(不改舊 Lua、新增專用 Lua):
- 新增 src/redis/luaScripts/release_lock_only.lua — 只 DEL active_job lock、
保留 job:{id} record 與 user:{}:jobs Set、給正常 terminal 用
- 新增 releaseActiveLockOnly() JS wrapper(同 releaseActiveJob 的 API surface +
atomic guard)
- jobService.js terminal release path 改用 releaseActiveLockOnly
- enqueue 失敗 rollback path 仍用舊 releaseActiveJob(語意正確、該情境 job
尚未 schedule 完成、清乾淨才對)
- t9 unit / integration test 用 destructuring alias rename 避免改 27 個
assertion
post-completion API 路徑(visionA poll / Phase B /result / promote)都需要
job record 仍在 — 本修法解此契約。
Tests: 666/666 pass(無回歸)
Reviewer: ✅ 通過、設計嚴謹(atomic guard、rollback vs terminal 語意分離)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
3b7aa4c79a
commit
b8457ddb95
@ -176,9 +176,51 @@ async function releaseActiveJob(redis, { userId, jobId }) {
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Release active_job lock only — terminal use (Bug #8 修法 A, 2026-05-18).
|
||||
*
|
||||
* 跟 releaseActiveJob 的差別:**只 DEL active_job lock、保留 job:{id} record 與
|
||||
* user:{}:jobs Set**。給 job 正常 terminal(COMPLETED / FAILED)時用、讓 post-completion
|
||||
* API (poll / promote / download) 仍可拿到 job record。
|
||||
*
|
||||
* 為什麼新 Lua 而不改舊 Lua:舊 Lua(release_active_job.lua)給 enqueue 失敗 rollback 用、
|
||||
* 該情境 job record 確實該清(沒成功 schedule、無 observer);改舊 Lua 會破壞 rollback 語意。
|
||||
*
|
||||
* @param {import('ioredis').Redis} redis
|
||||
* @param {object} args
|
||||
* @param {string} args.userId
|
||||
* @param {string} args.jobId
|
||||
* @returns {Promise<{ok: true, released: boolean}>}
|
||||
*/
|
||||
async function releaseActiveLockOnly(redis, { userId, jobId }) {
|
||||
if (!userId || typeof userId !== 'string') {
|
||||
throw new Error('[releaseActiveLockOnly] userId is required');
|
||||
}
|
||||
if (!jobId || typeof jobId !== 'string') {
|
||||
throw new Error('[releaseActiveLockOnly] jobId is required');
|
||||
}
|
||||
|
||||
const script = loadScript('release_lock_only.lua');
|
||||
const keys = [`user:${userId}:active_job`];
|
||||
const args = [jobId];
|
||||
|
||||
const result = await evalScript(redis, script, keys, args);
|
||||
|
||||
if (Array.isArray(result) && result[0] === 'OK') {
|
||||
return { ok: true, released: true };
|
||||
}
|
||||
if (Array.isArray(result) && result[0] === 'NOOP') {
|
||||
return { ok: true, released: false };
|
||||
}
|
||||
throw new Error(
|
||||
`[releaseActiveLockOnly] Unexpected Lua response: ${JSON.stringify(result)}`
|
||||
);
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
claimActiveJob,
|
||||
releaseActiveJob,
|
||||
releaseActiveLockOnly,
|
||||
// 內部 helper 暴露給單元測試
|
||||
_internals: {
|
||||
loadScript,
|
||||
|
||||
@ -0,0 +1,45 @@
|
||||
-- release_lock_only.lua
|
||||
--
|
||||
-- 2026-05-18 e2e debug 新增(Bug #8 修法 A):
|
||||
-- 用於 job 正常 terminal(COMPLETED / FAILED)時釋放 active_job lock,
|
||||
-- **保留 job:{id} record 與 user:{}:jobs Set**,讓後續 post-completion API
|
||||
-- (poll / promote / download) 仍可拿到 job record。
|
||||
--
|
||||
-- 跟 release_active_job.lua 的差別:
|
||||
-- - release_active_job.lua: enqueue 失敗回滾用、DEL active_job + DEL job + SREM
|
||||
-- (清得乾淨、因為 job 還沒真的 schedule、user 看不到、no observer)
|
||||
-- - release_lock_only.lua: 正常 terminal 用、只 DEL active_job
|
||||
-- (job record 必須留著、Phase 0.8 visionA / Phase 2 download-tokens 都需要)
|
||||
--
|
||||
-- 為什麼用 Lua(與 release_active_job.lua 同理):
|
||||
-- 1. **Atomic guard**:只有當 active_job 仍然指向 ARGV[1] 時才釋放,
|
||||
-- 避免「completion + 新 claim 連續發生」造成誤刪別人的 active_job lock
|
||||
-- 2. **單次 round-trip**
|
||||
--
|
||||
-- KEYS:
|
||||
-- KEYS[1] = user:{user_id}:active_job — 該 user 當前 in-progress job_id(String)
|
||||
--
|
||||
-- ARGV:
|
||||
-- ARGV[1] = job_id — 要釋放的 job_id;只有當 active_job
|
||||
-- 的值等於這個 job_id 時才執行 DEL
|
||||
--
|
||||
-- Returns:
|
||||
-- {"OK"} — active_job lock 已 DEL;job record 保留
|
||||
-- {"NOOP"} — active_job 不等於 ARGV[1] 或不存在;
|
||||
-- 未做任何修改
|
||||
--
|
||||
-- 注意事項:
|
||||
-- * 不刪 job:{id}:post-completion API(visionA GET /api/v1/jobs/:id、Phase 2
|
||||
-- download-tokens、promote)都需要 job record 存在
|
||||
-- * 不 SREM user:{}:jobs:user 的 jobs Set 是 audit trail / list 用,
|
||||
-- job 完成後仍應在 list
|
||||
-- * 不在 Lua 內 log,所有觀察性靠呼叫端的 structured log
|
||||
|
||||
local current = redis.call('GET', KEYS[1])
|
||||
if current ~= ARGV[1] then
|
||||
return {'NOOP'}
|
||||
end
|
||||
|
||||
redis.call('DEL', KEYS[1])
|
||||
|
||||
return {'OK'}
|
||||
@ -47,9 +47,9 @@ jest.mock('../../redis/luaScripts', () => {
|
||||
return { ok: true };
|
||||
}),
|
||||
releaseActiveJob: jest.fn(async (_redis, { userId, jobId }) => {
|
||||
// 舊行為:enqueue 失敗 rollback 用、刪 lock + job + SREM
|
||||
const current = state.activeJobMap.get(userId);
|
||||
if (current !== jobId) {
|
||||
// Lua atomic guard:active_job 不等於 ARGV[1] → NOOP
|
||||
return { ok: true, released: false };
|
||||
}
|
||||
state.activeJobMap.delete(userId);
|
||||
@ -58,6 +58,17 @@ jest.mock('../../redis/luaScripts', () => {
|
||||
if (set) set.delete(jobId);
|
||||
return { ok: true, released: true };
|
||||
}),
|
||||
// 2026-05-18 Bug #8 修法 A:terminal release 用、**只刪 active_job lock**、
|
||||
// 保留 jobMap + userJobsMap 給 post-completion API 用
|
||||
releaseActiveLockOnly: jest.fn(async (_redis, { userId, jobId }) => {
|
||||
const current = state.activeJobMap.get(userId);
|
||||
if (current !== jobId) {
|
||||
return { ok: true, released: false };
|
||||
}
|
||||
state.activeJobMap.delete(userId);
|
||||
// **不刪** jobMap / userJobsMap(與 release_lock_only.lua 對齊)
|
||||
return { ok: true, released: true };
|
||||
}),
|
||||
_internals: {
|
||||
loadScript: jest.fn(),
|
||||
evalScript: jest.fn(),
|
||||
@ -67,7 +78,14 @@ jest.mock('../../redis/luaScripts', () => {
|
||||
};
|
||||
});
|
||||
|
||||
const { claimActiveJob, releaseActiveJob, _internals } = require('../../redis/luaScripts');
|
||||
const {
|
||||
claimActiveJob,
|
||||
// 2026-05-18 Bug #8 修法 A:terminal release 改用 releaseActiveLockOnly。
|
||||
// 既有 test assert `releaseActiveJob.toHaveBeenCalledTimes(1)` 等檢查的是
|
||||
// terminal release 行為,故 alias rename 讓 assertion 不用全改。
|
||||
releaseActiveLockOnly: releaseActiveJob,
|
||||
_internals,
|
||||
} = require('../../redis/luaScripts');
|
||||
const { createJobService } = require('../jobService');
|
||||
|
||||
function makeFakeRedis() {
|
||||
|
||||
@ -17,9 +17,13 @@
|
||||
'use strict';
|
||||
|
||||
// Mock luaScripts,控制 release 結果而不啟動真 Redis Lua
|
||||
// 2026-05-18 Bug #8 修法 A:terminal release 改用 releaseActiveLockOnly(保留 job record);
|
||||
// releaseActiveJob 仍存在但只給 enqueue 失敗 rollback 用。本 test file 測 terminal 場景、
|
||||
// 故 spy 改用 releaseActiveLockOnly。
|
||||
jest.mock('../../redis/luaScripts', () => ({
|
||||
claimActiveJob: jest.fn(),
|
||||
releaseActiveJob: jest.fn(),
|
||||
releaseActiveLockOnly: jest.fn(),
|
||||
_internals: {
|
||||
loadScript: jest.fn(),
|
||||
evalScript: jest.fn(),
|
||||
@ -27,7 +31,7 @@ jest.mock('../../redis/luaScripts', () => ({
|
||||
},
|
||||
}));
|
||||
|
||||
const { releaseActiveJob } = require('../../redis/luaScripts');
|
||||
const { releaseActiveLockOnly: releaseActiveJob } = require('../../redis/luaScripts');
|
||||
const { createJobService } = require('../jobService');
|
||||
|
||||
function makeFakeRedis() {
|
||||
|
||||
@ -35,7 +35,11 @@
|
||||
const path = require('path');
|
||||
const crypto = require('crypto');
|
||||
|
||||
const { claimActiveJob, releaseActiveJob } = require('../redis/luaScripts');
|
||||
const {
|
||||
claimActiveJob,
|
||||
releaseActiveJob,
|
||||
releaseActiveLockOnly,
|
||||
} = require('../redis/luaScripts');
|
||||
const { toExternalStatus, isInProgress } = require('./statusMapper');
|
||||
|
||||
// Pipeline: fixed stage order,對齊 server.js L84-91
|
||||
@ -189,10 +193,15 @@ function createJobService(deps) {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
// 注意:releaseActiveJobByUser 定義在下方(T5 既有 closure 內 function
|
||||
// declaration,會被 hoist)。本 helper 是 T9 新增,刻意不挪動 T5 既有
|
||||
// function 順序避免 diff 干擾 reviewer。
|
||||
const result = await releaseActiveJobByUser(userId, jobId);
|
||||
// 2026-05-18 Bug #8 修法 A:terminal release 改用 releaseActiveLockOnly。
|
||||
// 舊行為(releaseActiveJobByUser → release_active_job.lua)會同時 DEL job:{id}
|
||||
// + SREM user:{}:jobs、導致 post-completion API(visionA poll / promote /
|
||||
// download)拿不到 job record(COMPLETED 後 < 1s 就 404)。
|
||||
// 新行為(releaseActiveLockOnly → release_lock_only.lua)只 DEL active_job lock、
|
||||
// 保留 job record 與 user:jobs Set。
|
||||
// 注意:enqueue 失敗 rollback path(jobs.js 內 jobService.releaseActiveJob)
|
||||
// 仍使用舊 Lua、因為該情境 job 尚未 schedule、清乾淨是正確語意。
|
||||
const result = await releaseActiveLockOnly(redis, { userId, jobId });
|
||||
// eslint-disable-next-line no-console
|
||||
console.log(
|
||||
JSON.stringify({
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user