Add web frontend, MinIO storage, monitoring, and docker-compose deployment
- Frontend: rewrite Home.vue to match backend POST /jobs API (remove single-stage options) - Frontend: add Monitor page (/monitor) for queue and job monitoring - Frontend: add job history with localStorage tracking (per-browser) - Frontend: fix Nginx proxy rewrite (/api -> /) and add 500MB upload limit - Backend: add MinIO storage support (STORAGE_BACKEND=minio) alongside local mode - Backend: add GET /queues/stats API for queue monitoring - Backend: fix download handler for MinIO (buffer mode for Node 18 compat) - Workers: add S3/MinIO download/upload in consumer.py with isolated temp dirs - Workers: add s3_storage.py helper with lifecycle rule (7-day TTL) - Docker: add docker-compose.yml with all services (web, scheduler, redis, workers) - Docker: ports mapped to 9500 (web) and 9501 (scheduler) - Config: add .env to .gitignore to protect secrets Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
fdebf4db5d
commit
efa67d59a4
9
.gitignore
vendored
9
.gitignore
vendored
@ -15,6 +15,9 @@ coverage.xml
|
||||
venv/
|
||||
env/
|
||||
|
||||
# Environment (contains secrets)
|
||||
.env
|
||||
|
||||
# OS / Editor
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
@ -72,3 +75,9 @@ toolchain/prebuild/**/logs/
|
||||
|
||||
# Test outputs
|
||||
tests/fixtures/outputs/
|
||||
|
||||
CLAUDE.md.backup
|
||||
|
||||
# Autoflow Agent(由 autoflow-agent init 自動產生)
|
||||
.claude/
|
||||
.autoflow/CLAUDE.md.backup.*
|
||||
|
||||
@ -1,30 +1,23 @@
|
||||
FROM node:18-alpine
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# 複製package文件
|
||||
COPY package*.json ./
|
||||
|
||||
# 安裝依賴
|
||||
RUN npm ci --only=production
|
||||
|
||||
# 複製應用代碼
|
||||
COPY . .
|
||||
|
||||
# 創建非root用戶
|
||||
RUN addgroup -g 1001 -S nodejs
|
||||
RUN adduser -S nextjs -u 1001
|
||||
|
||||
# 更改文件所有權
|
||||
RUN chown -R nextjs:nodejs /app
|
||||
USER nextjs
|
||||
|
||||
# 暴露端口
|
||||
EXPOSE 4000
|
||||
|
||||
# 健康檢查
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||
CMD curl -f http://localhost:4000/health || exit 1
|
||||
|
||||
# 啟動命令
|
||||
CMD ["npm", "start"]
|
||||
FROM node:18-alpine
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apk add --no-cache curl
|
||||
|
||||
COPY package*.json ./
|
||||
RUN npm ci --only=production
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN addgroup -g 1001 -S appgroup && \
|
||||
adduser -S appuser -u 1001 -G appgroup
|
||||
|
||||
RUN mkdir -p /data/jobs && chown -R appuser:appgroup /app /data/jobs
|
||||
USER appuser
|
||||
|
||||
EXPOSE 4000
|
||||
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||
CMD curl -f http://localhost:4000/health || exit 1
|
||||
|
||||
CMD ["npm", "start"]
|
||||
|
||||
@ -1,16 +1,23 @@
|
||||
# Task Scheduler Configuration
|
||||
PORT=4000
|
||||
NODE_ENV=development
|
||||
|
||||
# Frontend URL
|
||||
FRONTEND_URL=http://localhost:3000
|
||||
|
||||
# API Services Configuration
|
||||
ONNX_SERVICE_URL=http://localhost:5001
|
||||
BIE_SERVICE_URL=http://localhost:5002
|
||||
NEF_SERVICE_URL=http://localhost:5003
|
||||
|
||||
# API Keys
|
||||
ONNX_API_KEY=onnx-secret-key
|
||||
BIE_API_KEY=bie-secret-key
|
||||
NEF_API_KEY=nef-secret-key
|
||||
# Task Scheduler Configuration
|
||||
PORT=4000
|
||||
NODE_ENV=development
|
||||
|
||||
# Redis
|
||||
REDIS_URL=redis://localhost:6379
|
||||
|
||||
# Job data directory (shared volume with workers)
|
||||
JOB_DATA_DIR=/data/jobs
|
||||
|
||||
# Frontend URL (for CORS)
|
||||
FRONTEND_URL=http://localhost:3000
|
||||
|
||||
# Storage backend: "local" (shared volume) or "minio"
|
||||
STORAGE_BACKEND=local
|
||||
|
||||
# MinIO settings (only used when STORAGE_BACKEND=minio)
|
||||
MINIO_ENDPOINT_URL=http://192.168.0.130:9000
|
||||
MINIO_BUCKET=convertet-working-space
|
||||
MINIO_ACCESS_KEY=convuser
|
||||
MINIO_SECRET_KEY=your-secret-here
|
||||
MINIO_REGION=us-east-1
|
||||
MINIO_LIFECYCLE_DAYS=7
|
||||
|
||||
6866
apps/task-scheduler/package-lock.json
generated
Normal file
6866
apps/task-scheduler/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
@ -1,34 +1,37 @@
|
||||
{
|
||||
"name": "kneron-webgui-task-scheduler",
|
||||
"version": "1.0.0",
|
||||
"description": "Kneron Toolchain Web GUI Task Scheduler",
|
||||
"main": "server.js",
|
||||
"scripts": {
|
||||
"start": "node server.js",
|
||||
"dev": "nodemon server.js",
|
||||
"test": "jest"
|
||||
},
|
||||
"dependencies": {
|
||||
"express": "^4.18.2",
|
||||
"cors": "^2.8.5",
|
||||
"multer": "^1.4.5-lts.1",
|
||||
"axios": "^1.5.0",
|
||||
"dotenv": "^16.3.1",
|
||||
"helmet": "^7.0.0",
|
||||
"express-rate-limit": "^6.10.0",
|
||||
"morgan": "^1.10.0",
|
||||
"compression": "^1.7.4"
|
||||
},
|
||||
"devDependencies": {
|
||||
"nodemon": "^3.0.1",
|
||||
"jest": "^29.6.2"
|
||||
},
|
||||
"keywords": [
|
||||
"kneron",
|
||||
"toolchain",
|
||||
"api",
|
||||
"proxy"
|
||||
],
|
||||
"author": "Kneron Team",
|
||||
"license": "MIT"
|
||||
}
|
||||
{
|
||||
"name": "kneron-task-scheduler",
|
||||
"version": "2.0.0",
|
||||
"description": "Kneron Toolchain Task Scheduler - Job management and queue orchestration",
|
||||
"main": "server.js",
|
||||
"scripts": {
|
||||
"start": "node server.js",
|
||||
"dev": "nodemon server.js",
|
||||
"test": "jest"
|
||||
},
|
||||
"dependencies": {
|
||||
"express": "^4.18.2",
|
||||
"cors": "^2.8.5",
|
||||
"multer": "^1.4.5-lts.1",
|
||||
"ioredis": "^5.3.2",
|
||||
"uuid": "^9.0.0",
|
||||
"dotenv": "^16.3.1",
|
||||
"helmet": "^7.0.0",
|
||||
"express-rate-limit": "^6.10.0",
|
||||
"morgan": "^1.10.0",
|
||||
"compression": "^1.7.4",
|
||||
"@aws-sdk/client-s3": "^3.400.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"nodemon": "^3.0.1",
|
||||
"jest": "^29.6.2"
|
||||
},
|
||||
"keywords": [
|
||||
"kneron",
|
||||
"toolchain",
|
||||
"scheduler",
|
||||
"queue",
|
||||
"redis-stream"
|
||||
],
|
||||
"author": "Kneron Team",
|
||||
"license": "MIT"
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -18,6 +18,8 @@ RUN npm run build
|
||||
# 生產階段
|
||||
FROM nginx:alpine
|
||||
|
||||
RUN apk add --no-cache curl
|
||||
|
||||
# 複製構建結果
|
||||
COPY --from=build /app/dist /usr/share/nginx/html
|
||||
|
||||
|
||||
@ -4,6 +4,9 @@ server {
|
||||
root /usr/share/nginx/html;
|
||||
index index.html;
|
||||
|
||||
# 上傳大小限制(模型檔最大 500MB)
|
||||
client_max_body_size 500m;
|
||||
|
||||
# 啟用gzip壓縮
|
||||
gzip on;
|
||||
gzip_vary on;
|
||||
@ -16,9 +19,24 @@ server {
|
||||
add_header Cache-Control "public, immutable";
|
||||
}
|
||||
|
||||
# API代理
|
||||
location /api {
|
||||
proxy_pass http://task-scheduler:4000;
|
||||
# SSE 端點 — 必須關閉 buffering(rewrite /api → /)
|
||||
location ~ ^/api/jobs/.*/events$ {
    # Strip the /api prefix with a rewrite instead of putting a capture
    # variable in proxy_pass. Any variable in proxy_pass makes nginx
    # resolve the upstream host per request, which needs a `resolver`
    # directive; this config defines none, so "scheduler" could not be
    # resolved. A variable-free proxy_pass is resolved once at startup and
    # also forwards the query string unchanged.
    rewrite ^/api/(.*)$ /$1 break;
    proxy_pass http://scheduler:4000;

    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;

    # Server-Sent Events must reach the browser immediately: no response
    # buffering, caching or compression on this endpoint.
    proxy_buffering off;
    proxy_cache off;
    gzip off;
    chunked_transfer_encoding on;

    # Event streams are long-lived; keep idle connections open for a day.
    proxy_read_timeout 86400s;
    proxy_send_timeout 86400s;
}
|
||||
|
||||
# API 代理(rewrite /api → /)
|
||||
location /api/ {
|
||||
proxy_pass http://scheduler:4000/;
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
|
||||
1773
apps/web/package-lock.json
generated
Normal file
1773
apps/web/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
@ -7,6 +7,9 @@
|
||||
<el-icon><Cpu /></el-icon>
|
||||
<span>Model Converter Web</span>
|
||||
</div>
|
||||
<nav class="nav-links">
|
||||
<router-link to="/" class="nav-link">轉換</router-link>
|
||||
</nav>
|
||||
<div class="status">
|
||||
<el-icon v-if="isConnected" color="#67c23a"><CircleCheck /></el-icon>
|
||||
<el-icon v-else color="#f56c6c"><CircleClose /></el-icon>
|
||||
@ -75,6 +78,29 @@ const checkConnection = async () => {
|
||||
font-size: 24px;
|
||||
}
|
||||
|
||||
.nav-links {
|
||||
display: flex;
|
||||
gap: 20px;
|
||||
}
|
||||
|
||||
.nav-link {
|
||||
color: rgba(255, 255, 255, 0.8);
|
||||
text-decoration: none;
|
||||
font-size: 15px;
|
||||
padding: 4px 0;
|
||||
border-bottom: 2px solid transparent;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.nav-link:hover {
|
||||
color: white;
|
||||
}
|
||||
|
||||
.nav-link.router-link-exact-active {
|
||||
color: white;
|
||||
border-bottom-color: white;
|
||||
}
|
||||
|
||||
.status {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
|
||||
172
apps/web/src/composables/useJobStatus.js
Normal file
172
apps/web/src/composables/useJobStatus.js
Normal file
@ -0,0 +1,172 @@
|
||||
import { onUnmounted } from 'vue'
|
||||
|
||||
const TERMINAL_STATES = ['COMPLETED', 'FAILED']
|
||||
const POLL_INTERVAL = 3000
|
||||
const SSE_RETRY_BASE = 5000
|
||||
const SSE_RETRY_MAX = 30000
|
||||
const SSE_MAX_RETRIES = 5
|
||||
|
||||
/**
 * Composable that tracks job status through Server-Sent Events, falling back
 * to HTTP polling when the event stream fails and periodically probing
 * whether SSE has become available again.
 *
 * Must be called during component setup: it registers an onUnmounted hook
 * that releases every timer and connection it created.
 *
 * @returns {{ watchJob: (jobId: string, onUpdate: (job: object) => void) => (Function|undefined) }}
 */
export function useJobStatus() {
  // How long a recovery probe may stay silent before it is abandoned (ms).
  const PROBE_TIMEOUT = 5000

  // jobId -> watcher state; one entry per actively watched job.
  const watchers = new Map()

  // Release everything a watcher owns. Safe to call more than once.
  function teardown(state) {
    state.stopped = true
    if (state.sse) {
      state.sse.close()
      state.sse = null
    }
    // The recovery probe is tracked separately from the live stream so that
    // stopping while a probe is in flight cannot leak its connection.
    if (state.probe) {
      state.probe.close()
      state.probe = null
    }
    if (state.probeTimer) {
      clearTimeout(state.probeTimer)
      state.probeTimer = null
    }
    if (state.pollTimer) {
      clearInterval(state.pollTimer)
      state.pollTimer = null
    }
    if (state.sseRetryTimer) {
      clearTimeout(state.sseRetryTimer)
      state.sseRetryTimer = null
    }
  }

  /**
   * Start watching a job. `onUpdate` is called with each job snapshot until
   * the job reaches a terminal state or the returned function is called.
   *
   * @param {string} jobId
   * @param {(job: object) => void} onUpdate
   * @returns {Function|undefined} stop function; undefined when the job is
   *   already being watched by this composable instance.
   */
  function watchJob(jobId, onUpdate) {
    if (watchers.has(jobId)) return

    const state = {
      sse: null,
      probe: null,
      probeTimer: null,
      pollTimer: null,
      sseRetryTimer: null,
      sseRetryCount: 0,
      mode: 'idle',
      stopped: false,
    }
    watchers.set(jobId, state)

    function handleUpdate(jobData) {
      // A response or event that was already in flight when the watcher was
      // stopped must not reach the caller.
      if (state.stopped) return
      onUpdate(jobData)
      if (TERMINAL_STATES.includes(jobData.status)) {
        stopWatch()
      }
    }

    // --- SSE ---
    function onSSEMessage(event) {
      state.sseRetryCount = 0
      try {
        handleUpdate(JSON.parse(event.data))
      } catch (e) {
        console.warn('[useJobStatus] SSE parse error:', e)
      }
    }

    function onSSEError() {
      closeSSE()
      startPolling()
    }

    function startSSE() {
      if (state.stopped) return
      if (typeof EventSource === 'undefined') {
        startPolling()
        return
      }

      const es = new EventSource(`/api/jobs/${jobId}/events`)
      state.sse = es
      state.mode = 'sse'
      es.onmessage = onSSEMessage
      es.onerror = onSSEError
    }

    function closeSSE() {
      if (state.sse) {
        state.sse.close()
        state.sse = null
      }
    }

    // --- Polling ---
    function startPolling() {
      if (state.stopped || state.pollTimer) return
      state.mode = 'polling'

      async function poll() {
        if (state.stopped) return
        try {
          const res = await fetch(`/api/jobs/${jobId}`)
          if (!res.ok) throw new Error(`HTTP ${res.status}`)
          const data = await res.json()
          // The watcher may have been stopped while the request was pending.
          if (state.stopped) return
          handleUpdate(data)

          if (!state.stopped && state.sseRetryCount < SSE_MAX_RETRIES) {
            attemptSSERecovery()
          }
        } catch (e) {
          console.warn('[useJobStatus] Poll error:', e)
        }
      }

      poll()
      state.pollTimer = setInterval(poll, POLL_INTERVAL)
    }

    function stopPolling() {
      if (state.pollTimer) {
        clearInterval(state.pollTimer)
        state.pollTimer = null
      }
    }

    // --- SSE Recovery ---
    function discardProbe() {
      if (state.probeTimer) {
        clearTimeout(state.probeTimer)
        state.probeTimer = null
      }
      if (state.probe) {
        state.probe.close()
        state.probe = null
      }
    }

    function attemptSSERecovery() {
      // Only one recovery attempt (scheduled or in flight) at a time.
      if (state.sseRetryTimer || state.sse || state.probe) return
      if (typeof EventSource === 'undefined') return

      // Exponential backoff, capped at SSE_RETRY_MAX.
      const delay = Math.min(
        SSE_RETRY_BASE * Math.pow(2, state.sseRetryCount),
        SSE_RETRY_MAX
      )
      state.sseRetryCount++

      state.sseRetryTimer = setTimeout(() => {
        state.sseRetryTimer = null
        if (state.stopped) return

        const probe = new EventSource(`/api/jobs/${jobId}/events`)
        state.probe = probe
        state.probeTimer = setTimeout(discardProbe, PROBE_TIMEOUT)

        probe.onmessage = (event) => {
          // The first message proves the stream works: promote the probe to
          // the live connection and retire polling.
          if (state.probeTimer) {
            clearTimeout(state.probeTimer)
            state.probeTimer = null
          }
          state.probe = null
          if (state.stopped) {
            probe.close()
            return
          }
          stopPolling()
          state.sse = probe
          state.mode = 'sse'
          probe.onmessage = onSSEMessage
          probe.onerror = onSSEError
          onSSEMessage(event)
        }

        // A failed probe is simply dropped; polling keeps running and a later
        // poll schedules the next attempt.
        probe.onerror = discardProbe
      }, delay)
    }

    // --- Cleanup ---
    function stopWatch() {
      // Only unregister our own entry: if the job was stopped and then
      // watched again, a stale stop handle must not remove the newer watcher.
      if (watchers.get(jobId) === state) {
        watchers.delete(jobId)
      }
      teardown(state)
    }

    startSSE()

    return () => stopWatch()
  }

  onUnmounted(() => {
    for (const state of watchers.values()) {
      teardown(state)
    }
    watchers.clear()
  })

  return { watchJob }
}
|
||||
@ -1,11 +1,17 @@
|
||||
import { createRouter, createWebHistory } from 'vue-router'
|
||||
import Home from '@/views/Home.vue'
|
||||
import Monitor from '@/views/Monitor.vue'
|
||||
|
||||
const routes = [
|
||||
{
|
||||
path: '/',
|
||||
name: 'Home',
|
||||
component: Home
|
||||
},
|
||||
{
|
||||
path: '/monitor',
|
||||
name: 'Monitor',
|
||||
component: Monitor
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
@ -1,43 +1,21 @@
|
||||
import { defineStore } from 'pinia'
|
||||
import axios from 'axios'
|
||||
|
||||
export const useSystemStore = defineStore('system', {
|
||||
state: () => ({
|
||||
isConnected: false,
|
||||
services: {
|
||||
onnx: { status: 'unknown', activeTasks: 0 },
|
||||
bie: { status: 'unknown', activeTasks: 0 },
|
||||
nef: { status: 'unknown', activeTasks: 0 }
|
||||
}
|
||||
}),
|
||||
|
||||
actions: {
|
||||
async checkHealth() {
|
||||
try {
|
||||
const response = await axios.get('/api/health')
|
||||
this.isConnected = true
|
||||
return response.data
|
||||
} catch (error) {
|
||||
this.isConnected = false
|
||||
throw error
|
||||
}
|
||||
},
|
||||
|
||||
async checkServiceHealth(service) {
|
||||
try {
|
||||
const response = await axios.get(`http://localhost:500${service === 'onnx' ? '1' : service === 'bie' ? '2' : '3'}/health`)
|
||||
this.services[service] = {
|
||||
status: response.data.status,
|
||||
activeTasks: response.data.active_tasks
|
||||
}
|
||||
return response.data
|
||||
} catch (error) {
|
||||
this.services[service] = {
|
||||
status: 'unreachable',
|
||||
activeTasks: 0
|
||||
}
|
||||
throw error
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
import { defineStore } from 'pinia'
|
||||
import axios from 'axios'
|
||||
|
||||
export const useSystemStore = defineStore('system', {
|
||||
state: () => ({
|
||||
isConnected: false,
|
||||
}),
|
||||
|
||||
actions: {
|
||||
async checkHealth() {
|
||||
try {
|
||||
const response = await axios.get('/api/health')
|
||||
this.isConnected = true
|
||||
return response.data
|
||||
} catch (error) {
|
||||
this.isConnected = false
|
||||
throw error
|
||||
}
|
||||
},
|
||||
},
|
||||
})
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
339
apps/web/src/views/Monitor.vue
Normal file
339
apps/web/src/views/Monitor.vue
Normal file
@ -0,0 +1,339 @@
|
||||
<template>
|
||||
<div class="monitor">
|
||||
<!-- 自動刷新控制 -->
|
||||
<div class="monitor-toolbar">
|
||||
<el-switch v-model="autoRefresh" active-text="自動刷新" />
|
||||
<el-select v-model="refreshInterval" size="small" style="width: 100px; margin-left: 12px" :disabled="!autoRefresh">
|
||||
<el-option label="3 秒" :value="3000" />
|
||||
<el-option label="5 秒" :value="5000" />
|
||||
<el-option label="10 秒" :value="10000" />
|
||||
</el-select>
|
||||
<el-button size="small" @click="fetchStats" :loading="loading" style="margin-left: 12px">
|
||||
<el-icon><Refresh /></el-icon> 立即刷新
|
||||
</el-button>
|
||||
<span class="last-updated" v-if="lastUpdated">最後更新: {{ lastUpdated }}</span>
|
||||
</div>
|
||||
|
||||
<!-- Job 統計 -->
|
||||
<el-card shadow="hover" class="stats-card">
|
||||
<template #header>
|
||||
<div class="card-header">
|
||||
<el-icon><DataAnalysis /></el-icon>
|
||||
<span>任務統計</span>
|
||||
</div>
|
||||
</template>
|
||||
<div class="job-stats">
|
||||
<div class="stat-item">
|
||||
<div class="stat-value">{{ jobStats.total }}</div>
|
||||
<div class="stat-label">總任務數</div>
|
||||
</div>
|
||||
<div class="stat-item stat-onnx">
|
||||
<div class="stat-value">{{ jobStats.ONNX }}</div>
|
||||
<div class="stat-label">ONNX 處理中</div>
|
||||
</div>
|
||||
<div class="stat-item stat-bie">
|
||||
<div class="stat-value">{{ jobStats.BIE }}</div>
|
||||
<div class="stat-label">BIE 處理中</div>
|
||||
</div>
|
||||
<div class="stat-item stat-nef">
|
||||
<div class="stat-value">{{ jobStats.NEF }}</div>
|
||||
<div class="stat-label">NEF 處理中</div>
|
||||
</div>
|
||||
<div class="stat-item stat-completed">
|
||||
<div class="stat-value">{{ jobStats.COMPLETED }}</div>
|
||||
<div class="stat-label">已完成</div>
|
||||
</div>
|
||||
<div class="stat-item stat-failed">
|
||||
<div class="stat-value">{{ jobStats.FAILED }}</div>
|
||||
<div class="stat-label">失敗</div>
|
||||
</div>
|
||||
</div>
|
||||
</el-card>
|
||||
|
||||
<!-- Queue 狀態 -->
|
||||
<el-card shadow="hover" class="stats-card">
|
||||
<template #header>
|
||||
<div class="card-header">
|
||||
<el-icon><List /></el-icon>
|
||||
<span>佇列狀態</span>
|
||||
</div>
|
||||
</template>
|
||||
<el-table :data="queueRows" stripe>
|
||||
<el-table-column prop="name" label="佇列" width="160">
|
||||
<template #default="{ row }">
|
||||
<el-tag :type="queueTagType(row.name)" size="small">{{ row.label }}</el-tag>
|
||||
</template>
|
||||
</el-table-column>
|
||||
<el-table-column prop="length" label="佇列長度" width="120" align="center" />
|
||||
<el-table-column prop="pending" label="處理中" width="120" align="center">
|
||||
<template #default="{ row }">
|
||||
<span :class="{ 'text-warning': row.pending > 0 }">{{ row.pending }}</span>
|
||||
</template>
|
||||
</el-table-column>
|
||||
<el-table-column prop="consumerCount" label="Worker 數" width="120" align="center" />
|
||||
<el-table-column label="Workers" min-width="250">
|
||||
<template #default="{ row }">
|
||||
<div v-if="row.consumers.length > 0" class="consumer-list">
|
||||
<el-tag
|
||||
v-for="c in row.consumers"
|
||||
:key="c.name"
|
||||
size="small"
|
||||
:type="c.idle < 30000 ? 'success' : 'warning'"
|
||||
class="consumer-tag"
|
||||
>
|
||||
{{ c.name }} (pending: {{ c.pending }}, idle: {{ formatIdle(c.idle) }})
|
||||
</el-tag>
|
||||
</div>
|
||||
<span v-else class="text-muted">-</span>
|
||||
</template>
|
||||
</el-table-column>
|
||||
</el-table>
|
||||
</el-card>
|
||||
|
||||
<!-- Job 列表 -->
|
||||
<el-card shadow="hover" class="stats-card">
|
||||
<template #header>
|
||||
<div class="card-header">
|
||||
<el-icon><Document /></el-icon>
|
||||
<span>任務列表</span>
|
||||
</div>
|
||||
</template>
|
||||
<el-table :data="jobs" stripe :default-sort="{ prop: 'created_at', order: 'descending' }">
|
||||
<el-table-column prop="job_id" label="Job ID" width="120">
|
||||
<template #default="{ row }">
|
||||
<span class="job-id">{{ row.job_id.substring(0, 8) }}...</span>
|
||||
</template>
|
||||
</el-table-column>
|
||||
<el-table-column prop="status" label="狀態" width="130" align="center">
|
||||
<template #default="{ row }">
|
||||
<el-tag :type="statusTagType(row.status)" size="small">{{ row.status }}</el-tag>
|
||||
</template>
|
||||
</el-table-column>
|
||||
<el-table-column prop="progress" label="進度" width="160">
|
||||
<template #default="{ row }">
|
||||
<el-progress
|
||||
:percentage="row.progress || 0"
|
||||
:status="row.status === 'FAILED' ? 'exception' : row.status === 'COMPLETED' ? 'success' : undefined"
|
||||
:stroke-width="10"
|
||||
/>
|
||||
</template>
|
||||
</el-table-column>
|
||||
<el-table-column prop="parameters.platform" label="平台" width="100" align="center">
|
||||
<template #default="{ row }">
|
||||
KDP{{ row.parameters?.platform || '-' }}
|
||||
</template>
|
||||
</el-table-column>
|
||||
<el-table-column prop="created_at" label="建立時間" width="180">
|
||||
<template #default="{ row }">
|
||||
{{ formatTime(row.created_at) }}
|
||||
</template>
|
||||
</el-table-column>
|
||||
<el-table-column prop="error" label="錯誤" min-width="200">
|
||||
<template #default="{ row }">
|
||||
<span v-if="row.error" class="text-danger">
|
||||
[{{ row.error.step }}] {{ row.error.reason }}
|
||||
</span>
|
||||
<span v-else class="text-muted">-</span>
|
||||
</template>
|
||||
</el-table-column>
|
||||
</el-table>
|
||||
</el-card>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script setup>
|
||||
import { ref, computed, watch, onMounted, onUnmounted } from 'vue'
|
||||
import axios from 'axios'
|
||||
|
||||
const loading = ref(false)
|
||||
const autoRefresh = ref(true)
|
||||
const refreshInterval = ref(5000)
|
||||
const lastUpdated = ref('')
|
||||
let timer = null
|
||||
|
||||
const queueData = ref({})
|
||||
const jobStats = ref({ total: 0, ONNX: 0, BIE: 0, NEF: 0, COMPLETED: 0, FAILED: 0 })
|
||||
const jobs = ref([])
|
||||
|
||||
const queueRows = computed(() => {
|
||||
const labels = {
|
||||
'queue:onnx': 'ONNX',
|
||||
'queue:bie': 'BIE',
|
||||
'queue:nef': 'NEF',
|
||||
'queue:done': 'Done',
|
||||
}
|
||||
return Object.entries(queueData.value).map(([name, data]) => ({
|
||||
name,
|
||||
label: labels[name] || name,
|
||||
length: data.length || 0,
|
||||
pending: data.pending || 0,
|
||||
consumers: data.consumers || [],
|
||||
consumerCount: (data.consumers || []).length,
|
||||
}))
|
||||
})
|
||||
|
||||
async function fetchStats() {
|
||||
loading.value = true
|
||||
try {
|
||||
const [statsRes, jobsRes] = await Promise.all([
|
||||
axios.get('/api/queues/stats'),
|
||||
axios.get('/api/jobs'),
|
||||
])
|
||||
queueData.value = statsRes.data.queues || {}
|
||||
jobStats.value = statsRes.data.jobs || jobStats.value
|
||||
jobs.value = jobsRes.data || []
|
||||
lastUpdated.value = new Date().toLocaleTimeString()
|
||||
} catch (e) {
|
||||
console.warn('Failed to fetch stats:', e)
|
||||
} finally {
|
||||
loading.value = false
|
||||
}
|
||||
}
|
||||
|
||||
function startTimer() {
|
||||
stopTimer()
|
||||
if (autoRefresh.value) {
|
||||
timer = setInterval(fetchStats, refreshInterval.value)
|
||||
}
|
||||
}
|
||||
|
||||
function stopTimer() {
|
||||
if (timer) {
|
||||
clearInterval(timer)
|
||||
timer = null
|
||||
}
|
||||
}
|
||||
|
||||
watch([autoRefresh, refreshInterval], () => {
|
||||
startTimer()
|
||||
})
|
||||
|
||||
onMounted(() => {
|
||||
fetchStats()
|
||||
startTimer()
|
||||
})
|
||||
|
||||
onUnmounted(() => {
|
||||
stopTimer()
|
||||
})
|
||||
|
||||
// Render an idle duration given in milliseconds as a short label:
// raw milliseconds below one second, otherwise rounded seconds or minutes.
function formatIdle(ms) {
  const SECOND = 1000
  const MINUTE = 60 * SECOND

  if (ms < SECOND) {
    return `${ms}ms`
  }
  const [unit, suffix] = ms < MINUTE ? [SECOND, 's'] : [MINUTE, 'm']
  return `${Math.round(ms / unit)}${suffix}`
}
|
||||
|
||||
// Format a timestamp string for display in the viewer's locale;
// a missing or empty value is shown as a dash.
function formatTime(iso) {
  return iso ? new Date(iso).toLocaleString() : '-'
}
|
||||
|
||||
// Map a queue name to the el-tag `type` used for its label.
// Unknown queues fall back to the neutral 'info' style.
function queueTagType(name) {
  const types = {
    // Was '' (the default tag style). An empty string is falsy, so the
    // `|| 'info'` fallback replaced it and ONNX rendered exactly like
    // queue:done. 'primary' is the explicit name for the default style.
    'queue:onnx': 'primary',
    'queue:bie': 'warning',
    'queue:nef': 'success',
    'queue:done': 'info',
  }
  return types[name] || 'info'
}
|
||||
|
||||
// Map a job status to the el-tag `type` used in the job table.
// Unknown statuses fall back to the neutral 'info' style.
function statusTagType(status) {
  const types = {
    ONNX: 'primary',
    // Was '' (the default tag style). An empty string is falsy, so the
    // `|| 'info'` fallback silently turned BIE into 'info'. 'primary' is
    // the explicit name for the default style.
    BIE: 'primary',
    NEF: 'warning',
    COMPLETED: 'success',
    FAILED: 'danger',
  }
  return types[status] || 'info'
}
|
||||
</script>
|
||||
|
||||
<style scoped>
|
||||
.monitor {
|
||||
max-width: 1200px;
|
||||
margin: 0 auto;
|
||||
}
|
||||
|
||||
.monitor-toolbar {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
margin-bottom: 20px;
|
||||
padding: 12px 16px;
|
||||
background: white;
|
||||
border-radius: 8px;
|
||||
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.08);
|
||||
}
|
||||
|
||||
.last-updated {
|
||||
margin-left: auto;
|
||||
color: #909399;
|
||||
font-size: 13px;
|
||||
}
|
||||
|
||||
.stats-card {
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
.card-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
font-size: 18px;
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.card-header .el-icon {
|
||||
margin-right: 8px;
|
||||
}
|
||||
|
||||
.job-stats {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(6, 1fr);
|
||||
gap: 16px;
|
||||
}
|
||||
|
||||
.stat-item {
|
||||
text-align: center;
|
||||
padding: 16px;
|
||||
border-radius: 8px;
|
||||
background: #f5f7fa;
|
||||
}
|
||||
|
||||
.stat-value {
|
||||
font-size: 32px;
|
||||
font-weight: 700;
|
||||
color: #303133;
|
||||
}
|
||||
|
||||
.stat-label {
|
||||
font-size: 13px;
|
||||
color: #909399;
|
||||
margin-top: 4px;
|
||||
}
|
||||
|
||||
.stat-onnx .stat-value { color: #409eff; }
|
||||
.stat-bie .stat-value { color: #e6a23c; }
|
||||
.stat-nef .stat-value { color: #67c23a; }
|
||||
.stat-completed .stat-value { color: #67c23a; }
|
||||
.stat-failed .stat-value { color: #f56c6c; }
|
||||
|
||||
.consumer-list {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
gap: 4px;
|
||||
}
|
||||
|
||||
.consumer-tag {
|
||||
font-size: 11px;
|
||||
}
|
||||
|
||||
.job-id {
|
||||
font-family: monospace;
|
||||
font-size: 13px;
|
||||
}
|
||||
|
||||
.text-warning { color: #e6a23c; font-weight: 600; }
|
||||
.text-danger { color: #f56c6c; }
|
||||
.text-muted { color: #c0c4cc; }
|
||||
</style>
|
||||
@ -15,7 +15,8 @@ export default defineConfig({
|
||||
'/api': {
|
||||
target: 'http://localhost:4000',
|
||||
changeOrigin: true,
|
||||
secure: false
|
||||
secure: false,
|
||||
rewrite: (path) => path.replace(/^\/api/, '')
|
||||
}
|
||||
}
|
||||
},
|
||||
@ -23,7 +24,7 @@ export default defineConfig({
|
||||
outDir: 'dist',
|
||||
assetsDir: 'assets',
|
||||
sourcemap: false,
|
||||
minify: 'terser',
|
||||
minify: 'esbuild',
|
||||
rollupOptions: {
|
||||
output: {
|
||||
chunkFileNames: 'js/[name]-[hash].js',
|
||||
|
||||
132
docker-compose.yml
Normal file
132
docker-compose.yml
Normal file
@ -0,0 +1,132 @@
|
||||
##
|
||||
# Kneron Model Converter — Development docker-compose
|
||||
#
|
||||
# Usage:
|
||||
# docker-compose up # local mode (shared volume)
|
||||
# STORAGE_BACKEND=minio docker-compose up # MinIO (S3-compatible) mode
|
||||
# docker-compose up --scale bie-worker=3 # scale BIE workers
|
||||
##
|
||||
|
||||
volumes:
|
||||
job-data:
|
||||
|
||||
services:
|
||||
# ---------- Infrastructure ----------
|
||||
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
expose:
|
||||
- "6379"
|
||||
command: redis-server --save ""
|
||||
healthcheck:
|
||||
test: ["CMD", "redis-cli", "ping"]
|
||||
interval: 5s
|
||||
timeout: 3s
|
||||
retries: 3
|
||||
|
||||
# ---------- Web UI ----------
|
||||
|
||||
web:
|
||||
build: ./apps/web
|
||||
ports:
|
||||
- "9500:3000"
|
||||
depends_on:
|
||||
scheduler:
|
||||
condition: service_healthy
|
||||
restart: unless-stopped
|
||||
|
||||
# ---------- Scheduler ----------
|
||||
|
||||
scheduler:
|
||||
build: ./apps/task-scheduler
|
||||
ports:
|
||||
- "9501:4000"
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
volumes:
|
||||
- job-data:/data/jobs
|
||||
environment:
|
||||
- PORT=4000
|
||||
- REDIS_URL=redis://redis:6379
|
||||
- JOB_DATA_DIR=/data/jobs
|
||||
- FRONTEND_URL=http://localhost:9500
|
||||
- STORAGE_BACKEND=${STORAGE_BACKEND:-local}
|
||||
- MINIO_ENDPOINT_URL=${MINIO_ENDPOINT_URL:-http://192.168.0.130:9000}
|
||||
- MINIO_BUCKET=${MINIO_BUCKET:-convertet-working-space}
|
||||
- MINIO_ACCESS_KEY=${MINIO_ACCESS_KEY:-convuser}
|
||||
- MINIO_SECRET_KEY=${MINIO_SECRET_KEY}
|
||||
- MINIO_REGION=${MINIO_REGION:-us-east-1}
|
||||
- MINIO_LIFECYCLE_DAYS=${MINIO_LIFECYCLE_DAYS:-7}
|
||||
restart: unless-stopped
|
||||
|
||||
# ---------- Workers (stub mode) ----------
|
||||
|
||||
onnx-worker:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: services/workers/Dockerfile.stub
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
volumes:
|
||||
- job-data:/data/jobs
|
||||
environment:
|
||||
- STAGE=onnx
|
||||
- REDIS_URL=redis://redis:6379
|
||||
- JOB_DATA_DIR=/data/jobs
|
||||
- WORKER_MODE=${WORKER_MODE:-stub}
|
||||
- STORAGE_BACKEND=${STORAGE_BACKEND:-local}
|
||||
- MINIO_ENDPOINT_URL=${MINIO_ENDPOINT_URL:-http://192.168.0.130:9000}
|
||||
- MINIO_BUCKET=${MINIO_BUCKET:-convertet-working-space}
|
||||
- MINIO_ACCESS_KEY=${MINIO_ACCESS_KEY:-convuser}
|
||||
- MINIO_SECRET_KEY=${MINIO_SECRET_KEY}
|
||||
- MINIO_REGION=${MINIO_REGION:-us-east-1}
|
||||
- MINIO_LIFECYCLE_DAYS=${MINIO_LIFECYCLE_DAYS:-7}
|
||||
restart: unless-stopped
|
||||
|
||||
bie-worker:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: services/workers/Dockerfile.stub
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
volumes:
|
||||
- job-data:/data/jobs
|
||||
environment:
|
||||
- STAGE=bie
|
||||
- REDIS_URL=redis://redis:6379
|
||||
- JOB_DATA_DIR=/data/jobs
|
||||
- WORKER_MODE=${WORKER_MODE:-stub}
|
||||
- STORAGE_BACKEND=${STORAGE_BACKEND:-local}
|
||||
- MINIO_ENDPOINT_URL=${MINIO_ENDPOINT_URL:-http://192.168.0.130:9000}
|
||||
- MINIO_BUCKET=${MINIO_BUCKET:-convertet-working-space}
|
||||
- MINIO_ACCESS_KEY=${MINIO_ACCESS_KEY:-convuser}
|
||||
- MINIO_SECRET_KEY=${MINIO_SECRET_KEY}
|
||||
- MINIO_REGION=${MINIO_REGION:-us-east-1}
|
||||
- MINIO_LIFECYCLE_DAYS=${MINIO_LIFECYCLE_DAYS:-7}
|
||||
restart: unless-stopped
|
||||
|
||||
nef-worker:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: services/workers/Dockerfile.stub
|
||||
depends_on:
|
||||
redis:
|
||||
condition: service_healthy
|
||||
volumes:
|
||||
- job-data:/data/jobs
|
||||
environment:
|
||||
- STAGE=nef
|
||||
- REDIS_URL=redis://redis:6379
|
||||
- JOB_DATA_DIR=/data/jobs
|
||||
- WORKER_MODE=${WORKER_MODE:-stub}
|
||||
- STORAGE_BACKEND=${STORAGE_BACKEND:-local}
|
||||
- MINIO_ENDPOINT_URL=${MINIO_ENDPOINT_URL:-http://192.168.0.130:9000}
|
||||
- MINIO_BUCKET=${MINIO_BUCKET:-convertet-working-space}
|
||||
- MINIO_ACCESS_KEY=${MINIO_ACCESS_KEY:-convuser}
|
||||
- MINIO_SECRET_KEY=${MINIO_SECRET_KEY}
|
||||
- MINIO_REGION=${MINIO_REGION:-us-east-1}
|
||||
- MINIO_LIFECYCLE_DAYS=${MINIO_LIFECYCLE_DAYS:-7}
|
||||
restart: unless-stopped
|
||||
24
services/workers/Dockerfile.stub
Normal file
24
services/workers/Dockerfile.stub
Normal file
@ -0,0 +1,24 @@
|
||||
# Stub Worker Dockerfile
|
||||
# Lightweight image for development — no toolchain dependencies
|
||||
#
|
||||
# Build from project root:
|
||||
# docker build -f services/workers/Dockerfile.stub .
|
||||
|
||||
FROM python:3.9-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Requirement specifiers must be quoted: unquoted, the shell parses ">=" as an
# output redirection, so pip would install unconstrained "redis" and "boto3"
# and write its output to files named "=5.0" and "=1.28".
RUN pip install --no-cache-dir "redis>=5.0" "boto3>=1.28"
|
||||
|
||||
COPY services/workers/ /app/services/workers/
|
||||
|
||||
RUN mkdir -p /data/jobs
|
||||
|
||||
ENV WORKER_MODE=stub
|
||||
ENV REDIS_URL=redis://redis:6379
|
||||
ENV JOB_DATA_DIR=/data/jobs
|
||||
ENV STORAGE_BACKEND=local
|
||||
# STAGE should be set to: onnx, bie, or nef
|
||||
ENV STAGE=onnx
|
||||
|
||||
# Shell form is needed so ${STAGE} is expanded at container start. `exec`
# replaces the /bin/sh wrapper with the Python process, so the worker itself
# receives SIGTERM from `docker stop` rather than a shell that may not
# forward it.
CMD exec python -m services.workers.${STAGE}.worker
|
||||
53
services/workers/bie/worker.py
Normal file
53
services/workers/bie/worker.py
Normal file
@ -0,0 +1,53 @@
|
||||
"""
|
||||
BIE Worker — Redis Stream queue consumer entry point.
|
||||
|
||||
Usage:
|
||||
python -m services.workers.bie.worker
|
||||
# or
|
||||
WORKER_MODE=stub python -m services.workers.bie.worker
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
|
||||
if PROJECT_ROOT not in sys.path:
|
||||
sys.path.insert(0, PROJECT_ROOT)
|
||||
|
||||
from services.workers.consumer import WorkerConsumer
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(name)s] %(levelname)s %(message)s",
|
||||
)
|
||||
logger = logging.getLogger("bie-worker")
|
||||
|
||||
|
||||
def get_process_fn():
    """Pick the BIE core callable according to the WORKER_MODE environment variable.

    WORKER_MODE is compared case-insensitively. The value "stub" selects the
    stub implementation; any other value (or an unset variable, which defaults
    to "real") selects the real implementation. The chosen module is imported
    inside this function, so only the selected implementation is loaded.
    """
    stub_requested = os.environ.get("WORKER_MODE", "real").lower() == "stub"

    if not stub_requested:
        from services.workers.bie.core import process_bie_core

        logger.info("Running in REAL mode")
        return process_bie_core

    from services.workers.stubs import process_bie_core_stub

    logger.info("Running in STUB mode")
    return process_bie_core_stub
|
||||
|
||||
|
||||
def main():
    """Start the BIE worker: resolve the core function, then consume queue:bie."""
    # get_process_fn() is evaluated first, so an import failure in the selected
    # core module surfaces before the consumer is constructed.
    WorkerConsumer(
        stage="bie",
        process_fn=get_process_fn(),
        queue_name="queue:bie",
        group_name="bie-workers",
    ).run()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
268
services/workers/consumer.py
Normal file
268
services/workers/consumer.py
Normal file
@ -0,0 +1,268 @@
|
||||
"""
|
||||
Generic Redis Stream queue consumer for workers.
|
||||
|
||||
每個 Worker(ONNX/BIE/NEF)使用此模組作為進入點:
|
||||
1. 從指定的 Redis Stream queue 拉取任務(XREADGROUP)
|
||||
2. 從 S3/MinIO 下載輸入檔案到本地暫存目錄
|
||||
3. 呼叫對應的 core function 處理
|
||||
4. 將結果上傳到 S3/MinIO
|
||||
5. 將結果推送到 queue:done
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import signal
|
||||
import socket
|
||||
import tempfile
|
||||
import time
|
||||
from typing import Any, Callable, Dict
|
||||
|
||||
import redis
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class WorkerConsumer:
    """Redis Stream based queue consumer with S3/MinIO storage.

    For every task read from the stage queue the consumer:

    1. prepares a working directory (downloading inputs from MinIO when the
       MinIO backend is enabled, otherwise using the job dir under
       ``job_data_dir``),
    2. calls ``process_fn(input_paths, output_path, parameters)``,
    3. uploads the stage output to MinIO (MinIO backend only),
    4. publishes an ``ok`` / ``fail`` event on ``queue:done``,
    5. ACKs the message.
    """

    # Output artifact produced by each stage, relative to the job directory.
    _OUTPUT_FILES = {
        "onnx": "out.onnx",
        "bie": "out.bie",
        "nef": "out.nef",
    }

    def __init__(
        self,
        stage: str,
        process_fn: Callable[[Dict[str, str], str, Dict[str, Any]], Dict[str, Any]],
        queue_name: str,
        group_name: str,
        redis_url: str = None,
        job_data_dir: str = None,
    ):
        """Create a consumer for one pipeline stage.

        Args:
            stage: Stage name; "onnx", "bie" and "nef" are the supported values.
            process_fn: Core function called as
                ``process_fn(input_paths, output_path, parameters)``.
            queue_name: Redis stream to read tasks from.
            group_name: Redis consumer group name.
            redis_url: Redis URL; falls back to the REDIS_URL environment
                variable, then to ``redis://localhost:6379``.
            job_data_dir: Root of the job directories used in local mode;
                falls back to JOB_DATA_DIR, then to ``/data/jobs``.
        """
        self.stage = stage
        self.process_fn = process_fn
        self.queue_name = queue_name
        self.group_name = group_name
        self.redis_url = redis_url or os.environ.get("REDIS_URL", "redis://localhost:6379")
        self.job_data_dir = job_data_dir or os.environ.get("JOB_DATA_DIR", "/data/jobs")
        # Host name + PID identifies this consumer within the group.
        self.consumer_name = f"{stage}-worker-{socket.gethostname()}-{os.getpid()}"
        self.running = True

        self.client = redis.Redis.from_url(self.redis_url, decode_responses=True)

        # MinIO storage is only set up when STORAGE_BACKEND=minio; the import
        # is deferred so that local mode does not need the MinIO helper.
        self.minio = None
        if os.environ.get("STORAGE_BACKEND", "local") == "minio":
            from services.workers.s3_storage import MinIOStorage

            self.minio = MinIOStorage()
            logger.info("Using MinIO storage backend")
        else:
            logger.info("Using local filesystem storage backend")

    def _ensure_group(self):
        """Create the consumer group (and the stream) if it does not exist yet."""
        try:
            self.client.xgroup_create(self.queue_name, self.group_name, id="0", mkstream=True)
            logger.info(f"Created consumer group '{self.group_name}' on '{self.queue_name}'")
        except redis.ResponseError as e:
            # BUSYGROUP means the group already exists, which is fine.
            if "BUSYGROUP" not in str(e):
                raise

    def _prepare_local_dir(self, job_id: str) -> str:
        """Return the working directory for a job, fetching inputs if needed.

        Local mode: returns ``<job_data_dir>/<job_id>`` without touching it.
        MinIO mode: creates ``<tmp>/kneron-jobs/<job_id>-<stage>`` and
        downloads the files this stage needs from ``jobs/<job_id>/``.
        """
        if not self.minio:
            return os.path.join(self.job_data_dir, job_id)

        # MinIO mode: a temp dir per job and stage, so workers do not share state.
        local_dir = os.path.join(tempfile.gettempdir(), "kneron-jobs", f"{job_id}-{self.stage}")
        os.makedirs(local_dir, exist_ok=True)

        s3_prefix = f"jobs/{job_id}"

        if self.stage == "onnx":
            # Whole input/ directory (model file + ref_images).
            self.minio.download_prefix(f"{s3_prefix}/input", os.path.join(local_dir, "input"))
            logger.info(f"Downloaded input files from S3 for job {job_id}")

        elif self.stage == "bie":
            # out.onnx from the previous stage plus ref_images for quantization.
            self.minio.download_file(f"{s3_prefix}/out.onnx", os.path.join(local_dir, "out.onnx"))
            self.minio.download_prefix(
                f"{s3_prefix}/input/ref_images",
                os.path.join(local_dir, "input", "ref_images"),
            )
            logger.info(f"Downloaded ONNX + ref_images from S3 for job {job_id}")

        elif self.stage == "nef":
            # out.bie from the previous stage.
            self.minio.download_file(f"{s3_prefix}/out.bie", os.path.join(local_dir, "out.bie"))
            logger.info(f"Downloaded BIE from S3 for job {job_id}")

        return local_dir

    def _upload_output(self, job_id: str, job_dir: str):
        """Upload this stage's output file to MinIO (no-op in local mode).

        Raises:
            FileNotFoundError: In MinIO mode, when the expected output file was
                not produced. Without this check the job would be reported as
                "ok" although nothing was uploaded for the next stage.
        """
        if not self.minio:
            return

        local_path = self._get_output_path(job_dir)
        output_name = os.path.basename(local_path)

        if not os.path.exists(local_path):
            raise FileNotFoundError(f"Expected output file not found: {local_path}")

        self.minio.upload_file(local_path, f"jobs/{job_id}/{output_name}")
        logger.info(f"Uploaded {output_name} to S3 for job {job_id}")

    def _cleanup_local(self, job_dir: str):
        """Remove the temporary working directory (MinIO mode only).

        In local mode the job directory is left in place.
        """
        if not self.minio:
            return
        try:
            shutil.rmtree(job_dir, ignore_errors=True)
            logger.debug(f"Cleaned up local dir: {job_dir}")
        except Exception as e:
            # Cleanup is best-effort; never fail a job because of it.
            logger.warning(f"Failed to clean up {job_dir}: {e}")

    def _build_input_paths(self, job_dir: str, parameters: dict) -> dict:
        """Build the ``input_paths`` dict passed to ``process_fn``.

        Args:
            job_dir: Working directory of the job.
            parameters: Job parameters (currently unused here).

        Returns:
            onnx: ``{"file_path": <first regular file in input/>}``
            bie:  ``{"onnx_file_path": ..., "data_dir": ...}``
            nef:  ``{"bie_file_path": ...}``

        Raises:
            FileNotFoundError: onnx stage with no regular file in ``input/``.
            ValueError: Unknown stage.
        """
        input_dir = os.path.join(job_dir, "input")

        if self.stage == "onnx":
            input_file = None
            if os.path.isdir(input_dir):
                # Sorted so the choice is deterministic if several files exist
                # (os.listdir order is arbitrary).
                for name in sorted(os.listdir(input_dir)):
                    candidate = os.path.join(input_dir, name)
                    if os.path.isfile(candidate):
                        input_file = candidate
                        break
            if not input_file:
                raise FileNotFoundError(f"No input file found in {input_dir}")
            return {"file_path": input_file}

        elif self.stage == "bie":
            return {
                "onnx_file_path": os.path.join(job_dir, "out.onnx"),
                "data_dir": os.path.join(input_dir, "ref_images"),
            }

        elif self.stage == "nef":
            return {"bie_file_path": os.path.join(job_dir, "out.bie")}

        else:
            raise ValueError(f"Unknown stage: {self.stage}")

    def _get_output_path(self, job_dir: str) -> str:
        """Return the expected output file path of this stage inside ``job_dir``."""
        return os.path.join(job_dir, self._OUTPUT_FILES[self.stage])

    def _push_done(self, job_id: str, result: str, reason: str = None):
        """Publish a completion event for this stage on ``queue:done``.

        Args:
            job_id: Job identifier.
            result: Outcome string ("ok" or "fail" in this module).
            reason: Optional failure description; only included when truthy.
        """
        message = {
            "job_id": job_id,
            "step": self.stage,
            "result": result,
            "completed_at": time.strftime("%Y-%m-%dT%H:%M:%S%z"),
        }
        if reason:
            message["reason"] = reason
        self.client.xadd("queue:done", {"data": json.dumps(message)})
        logger.info(f"Pushed done: job={job_id} step={self.stage} result={result}")

    def _process_message(self, message_id: str, data: dict):
        """Process one task message and ACK it.

        A message without a usable ``job_id`` cannot be reported on
        ``queue:done``; it is logged and ACKed so that it does not stay in the
        group's pending list forever.
        """
        job_id = data.get("job_id") if isinstance(data, dict) else None
        if not job_id:
            logger.error(f"Dropping message {message_id}: missing job_id")
            self.client.xack(self.queue_name, self.group_name, message_id)
            return

        logger.info(f"Processing job {job_id} (stage={self.stage})")

        job_dir = None
        try:
            # Copy so the decoded message is not mutated; tolerate a null value.
            parameters = dict(data.get("parameters") or {})

            # Prepare local working directory (download from S3 if needed).
            job_dir = self._prepare_local_dir(job_id)

            input_paths = self._build_input_paths(job_dir, parameters)
            output_path = self._get_output_path(job_dir)

            # Expose the working directory so the core can set up toolchain paths.
            parameters["work_dir"] = job_dir

            result = self.process_fn(input_paths, output_path, parameters)

            # Upload output to S3 (raises if the output is missing in MinIO mode).
            self._upload_output(job_id, job_dir)

            # A core function that returns no dict must not fail a finished job.
            result_path = result.get("file_path", "N/A") if isinstance(result, dict) else "N/A"
            logger.info(f"Job {job_id} completed: {result_path}")
            self._push_done(job_id, "ok")

        except Exception as e:
            logger.error(f"Job {job_id} failed: {e}", exc_info=True)
            self._push_done(job_id, "fail", reason=str(e))

        finally:
            # Clean up local temp files in S3 mode.
            if job_dir:
                self._cleanup_local(job_dir)

        # ACK the message regardless of success/failure.
        self.client.xack(self.queue_name, self.group_name, message_id)

    def run(self):
        """Main loop: pull tasks from the queue and process them until stopped.

        SIGTERM and SIGINT clear ``self.running``; the loop exits after the
        current read or job finishes.
        """
        self._ensure_group()

        logger.info(
            f"[{self.consumer_name}] Listening on {self.queue_name} "
            f"(group={self.group_name})"
        )

        # Handle graceful shutdown.
        def handle_signal(signum, frame):
            logger.info(f"[{self.consumer_name}] Received signal {signum}, shutting down...")
            self.running = False

        signal.signal(signal.SIGTERM, handle_signal)
        signal.signal(signal.SIGINT, handle_signal)

        while self.running:
            try:
                results = self.client.xreadgroup(
                    self.group_name,
                    self.consumer_name,
                    {self.queue_name: ">"},
                    count=1,
                    block=5000,  # 5 second timeout
                )

                if not results:
                    continue

                for _stream_name, messages in results:
                    for message_id, fields in messages:
                        try:
                            data = json.loads(fields["data"])
                        except (KeyError, TypeError, ValueError) as e:
                            # Undecodable payload: only ">" is ever read, so an
                            # un-ACKed message would stay pending forever.
                            logger.error(f"Dropping malformed message {message_id}: {e}")
                            self.client.xack(self.queue_name, self.group_name, message_id)
                            continue
                        self._process_message(message_id, data)

            except redis.ConnectionError:
                logger.error("Redis connection lost, retrying in 3s...")
                time.sleep(3)
            except Exception as e:
                logger.error(f"Unexpected error: {e}", exc_info=True)
                time.sleep(1)

        logger.info(f"[{self.consumer_name}] Stopped")
|
||||
53
services/workers/nef/worker.py
Normal file
53
services/workers/nef/worker.py
Normal file
@ -0,0 +1,53 @@
|
||||
"""
|
||||
NEF Worker — Redis Stream queue consumer entry point.
|
||||
|
||||
Usage:
|
||||
python -m services.workers.nef.worker
|
||||
# or
|
||||
WORKER_MODE=stub python -m services.workers.nef.worker
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
|
||||
if PROJECT_ROOT not in sys.path:
|
||||
sys.path.insert(0, PROJECT_ROOT)
|
||||
|
||||
from services.workers.consumer import WorkerConsumer
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(name)s] %(levelname)s %(message)s",
|
||||
)
|
||||
logger = logging.getLogger("nef-worker")
|
||||
|
||||
|
||||
def get_process_fn():
    """Pick the NEF core callable according to the WORKER_MODE environment variable.

    WORKER_MODE is compared case-insensitively. The value "stub" selects the
    stub implementation; any other value (or an unset variable, which defaults
    to "real") selects the real implementation. The chosen module is imported
    inside this function, so only the selected implementation is loaded.
    """
    stub_requested = os.environ.get("WORKER_MODE", "real").lower() == "stub"

    if not stub_requested:
        from services.workers.nef.core import process_nef_core

        logger.info("Running in REAL mode")
        return process_nef_core

    from services.workers.stubs import process_nef_core_stub

    logger.info("Running in STUB mode")
    return process_nef_core_stub
|
||||
|
||||
|
||||
def main():
    """Start the NEF worker: resolve the core function, then consume queue:nef."""
    # get_process_fn() is evaluated first, so an import failure in the selected
    # core module surfaces before the consumer is constructed.
    WorkerConsumer(
        stage="nef",
        process_fn=get_process_fn(),
        queue_name="queue:nef",
        group_name="nef-workers",
    ).run()
|
||||
consumer.run()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
55
services/workers/onnx/worker.py
Normal file
55
services/workers/onnx/worker.py
Normal file
@ -0,0 +1,55 @@
|
||||
"""
|
||||
ONNX Worker — Redis Stream queue consumer entry point.
|
||||
|
||||
Usage:
|
||||
python -m services.workers.onnx.worker
|
||||
# or
|
||||
WORKER_MODE=stub python -m services.workers.onnx.worker
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Ensure project root is in sys.path
|
||||
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
|
||||
if PROJECT_ROOT not in sys.path:
|
||||
sys.path.insert(0, PROJECT_ROOT)
|
||||
|
||||
from services.workers.consumer import WorkerConsumer
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(name)s] %(levelname)s %(message)s",
|
||||
)
|
||||
logger = logging.getLogger("onnx-worker")
|
||||
|
||||
|
||||
def get_process_fn():
    """Pick the ONNX core callable according to the WORKER_MODE environment variable.

    WORKER_MODE is compared case-insensitively. The value "stub" selects the
    stub implementation; any other value (or an unset variable, which defaults
    to "real") selects the real implementation. The chosen module is imported
    inside this function, so only the selected implementation is loaded.
    """
    stub_requested = os.environ.get("WORKER_MODE", "real").lower() == "stub"

    if not stub_requested:
        from services.workers.onnx.core import process_onnx_core

        logger.info("Running in REAL mode")
        return process_onnx_core

    from services.workers.stubs import process_onnx_core_stub

    logger.info("Running in STUB mode")
    return process_onnx_core_stub
|
||||
|
||||
|
||||
def main():
    """Start the ONNX worker: resolve the core function, then consume queue:onnx."""
    # get_process_fn() is evaluated first, so an import failure in the selected
    # core module surfaces before the consumer is constructed.
    WorkerConsumer(
        stage="onnx",
        process_fn=get_process_fn(),
        queue_name="queue:onnx",
        group_name="onnx-workers",
    ).run()
|
||||
consumer.run()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
145
services/workers/s3_storage.py
Normal file
145
services/workers/s3_storage.py
Normal file
@ -0,0 +1,145 @@
|
||||
"""
|
||||
MinIO storage helper for workers.
|
||||
|
||||
Provides upload/download functionality to replace Docker Shared Volume.
|
||||
Workers download inputs from MinIO to local temp dir, process, then upload results.
|
||||
|
||||
Uses boto3 (S3-compatible API) to communicate with MinIO.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
from typing import Optional
|
||||
|
||||
import boto3
|
||||
from botocore.config import Config as BotoConfig
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MinIOStorage:
    """MinIO storage client for job file exchange.

    Thin wrapper around a boto3 S3 client bound to one bucket. Constructing an
    instance verifies that the bucket is reachable and, when
    ``lifecycle_days > 0``, installs an expiration rule for the ``jobs/``
    prefix.
    """

    def __init__(
        self,
        endpoint_url: str = None,
        bucket: str = None,
        access_key: str = None,
        secret_key: str = None,
        region: str = None,
        lifecycle_days: int = None,
    ):
        """Create the client; each argument falls back to a MINIO_* env variable.

        Args:
            endpoint_url: MinIO endpoint (env MINIO_ENDPOINT_URL).
            bucket: Bucket name (env MINIO_BUCKET).
            access_key: Access key (env MINIO_ACCESS_KEY).
            secret_key: Secret key (env MINIO_SECRET_KEY, default empty).
            region: Region name (env MINIO_REGION, default "us-east-1").
            lifecycle_days: Expiration in days for ``jobs/`` objects
                (env MINIO_LIFECYCLE_DAYS, default 7); 0 or less disables the
                lifecycle rule.

        Raises:
            Exception: Whatever ``head_bucket`` raises when the bucket is not
                accessible.
        """
        self.endpoint_url = endpoint_url or os.environ.get("MINIO_ENDPOINT_URL", "http://192.168.0.130:9000")
        self.bucket = bucket or os.environ.get("MINIO_BUCKET", "convertet-working-space")
        self.access_key = access_key or os.environ.get("MINIO_ACCESS_KEY", "convuser")
        self.secret_key = secret_key or os.environ.get("MINIO_SECRET_KEY", "")
        self.region = region or os.environ.get("MINIO_REGION", "us-east-1")
        # "is not None" so that an explicit 0 (disable) is honoured.
        self.lifecycle_days = lifecycle_days if lifecycle_days is not None else int(os.environ.get("MINIO_LIFECYCLE_DAYS", "7"))

        self.client = boto3.client(
            "s3",
            endpoint_url=self.endpoint_url,
            aws_access_key_id=self.access_key,
            aws_secret_access_key=self.secret_key,
            region_name=self.region,
            config=BotoConfig(signature_version="s3v4"),
        )
        self._ensure_bucket()
        if self.lifecycle_days > 0:
            self._ensure_lifecycle_rule()

    def _ensure_bucket(self):
        """Verify bucket exists (do not auto-create — bucket is managed externally)."""
        try:
            self.client.head_bucket(Bucket=self.bucket)
            logger.info(f"MinIO bucket verified: {self.bucket}")
        except Exception as e:
            logger.error(f"MinIO bucket '{self.bucket}' is not accessible: {e}")
            raise

    def _ensure_lifecycle_rule(self):
        """Set a lifecycle rule that expires objects under the ``jobs/`` prefix.

        Failures are logged as warnings and otherwise ignored.
        """
        rule_id = "auto-cleanup-jobs"
        try:
            # NOTE(review): per the S3 API this call replaces the bucket's whole
            # lifecycle configuration, so rules added to the bucket by other
            # means would be dropped — confirm this is acceptable.
            self.client.put_bucket_lifecycle_configuration(
                Bucket=self.bucket,
                LifecycleConfiguration={
                    "Rules": [
                        {
                            "ID": rule_id,
                            "Status": "Enabled",
                            "Filter": {"Prefix": "jobs/"},
                            "Expiration": {"Days": self.lifecycle_days},
                        }
                    ]
                },
            )
            logger.info(f"MinIO lifecycle rule set: jobs/* expire after {self.lifecycle_days} days")
        except Exception as e:
            logger.warning(f"Could not set lifecycle rule: {e}")

    def upload_file(self, local_path: str, key: str):
        """Upload a local file to MinIO under ``key``."""
        self.client.upload_file(local_path, self.bucket, key)
        logger.debug(f"Uploaded {local_path} -> minio://{self.bucket}/{key}")

    def upload_data(self, data: bytes, key: str):
        """Upload raw bytes to MinIO under ``key``."""
        self.client.put_object(Bucket=self.bucket, Key=key, Body=data)
        logger.debug(f"Uploaded {len(data)} bytes -> minio://{self.bucket}/{key}")

    def download_file(self, key: str, local_path: str):
        """Download ``key`` to ``local_path``, creating parent directories."""
        parent = os.path.dirname(local_path)
        # A bare filename has no parent component; os.makedirs("") would raise.
        if parent:
            os.makedirs(parent, exist_ok=True)
        self.client.download_file(self.bucket, key, local_path)
        logger.debug(f"Downloaded minio://{self.bucket}/{key} -> {local_path}")

    def download_to_stream(self, key: str):
        """Return the streaming body of a MinIO object."""
        response = self.client.get_object(Bucket=self.bucket, Key=key)
        return response["Body"]

    def list_keys(self, prefix: str) -> list:
        """List all keys that start with ``prefix`` (plain string prefix match)."""
        keys = []
        paginator = self.client.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=self.bucket, Prefix=prefix):
            # Pages without matches carry no "Contents" entry.
            for obj in page.get("Contents", []):
                keys.append(obj["Key"])
        return keys

    def upload_directory(self, local_dir: str, prefix: str):
        """Upload every file under ``local_dir`` to ``<prefix>/<relative path>``.

        Keys always use "/" as separator, independent of the local OS, and a
        trailing "/" on ``prefix`` does not produce a doubled slash.
        """
        base = prefix.rstrip("/")
        for root, _, files in os.walk(local_dir):
            for fname in files:
                local_path = os.path.join(root, fname)
                rel_key = os.path.relpath(local_path, local_dir).replace(os.sep, "/")
                key = f"{base}/{rel_key}" if base else rel_key
                self.upload_file(local_path, key)

    def download_prefix(self, prefix: str, local_dir: str):
        """Download every object below the "directory" ``prefix`` into ``local_dir``.

        The prefix is matched on a "/" boundary, so ``jobs/1/input`` does not
        also match ``jobs/1/input_extra/...`` or ``jobs/10/...``. Keys that
        end in "/" (directory markers) are skipped, as are keys whose relative
        path would resolve outside ``local_dir``.
        """
        base = prefix.rstrip("/")
        search_prefix = f"{base}/" if base else ""
        root = os.path.abspath(local_dir)

        for key in self.list_keys(search_prefix):
            rel_path = key[len(search_prefix):].lstrip("/")
            if not rel_path or rel_path.endswith("/"):
                continue
            local_path = os.path.abspath(os.path.join(root, *rel_path.split("/")))
            # Guard against keys containing ".." segments.
            if os.path.commonpath([root, local_path]) != root:
                logger.warning(f"Skipping key outside target directory: {key}")
                continue
            self.download_file(key, local_path)

    def exists(self, key: str) -> bool:
        """Return True if ``head_object`` succeeds for ``key``.

        Any exception (including connection or permission errors) is reported
        as False.
        """
        try:
            self.client.head_object(Bucket=self.bucket, Key=key)
            return True
        except Exception:
            return False

    def get_size(self, key: str) -> Optional[int]:
        """Return the object's size in bytes, or None if ``head_object`` fails."""
        try:
            response = self.client.head_object(Bucket=self.bucket, Key=key)
            return response["ContentLength"]
        except Exception:
            return None
|
||||
122
services/workers/stubs.py
Normal file
122
services/workers/stubs.py
Normal file
@ -0,0 +1,122 @@
|
||||
"""
|
||||
Stub implementations of worker core functions.
|
||||
|
||||
Used when WORKER_MODE=stub, allowing development and testing of
|
||||
Scheduler / Queue / UI without requiring the Kneron Toolchain environment.
|
||||
|
||||
Each stub:
|
||||
- Sleeps to simulate processing time
|
||||
- Creates a minimal output file
|
||||
- Returns a result dict matching the real core function signature
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
from typing import Any, Dict
|
||||
|
||||
|
||||
def process_onnx_core_stub(
    input_paths: Dict[str, str],
    output_path: str,
    parameters: Dict[str, Any],
) -> Dict[str, Any]:
    """Stub ONNX processing: sleep, then create a fake out.onnx.

    Args:
        input_paths: Must contain "file_path", the existing input model file.
        output_path: File to create; its parent directory is created if needed.
        parameters: Job parameters. "model_id", "version" and "platform" are
            echoed in the result. The optional "stub_delay_seconds" overrides
            the simulated processing time (default 2 seconds).

    Returns:
        Dict with "file_path", "file_size", "eval_report" and "model_info".

    Raises:
        FileNotFoundError: If the input file does not exist.
    """
    file_path = input_paths["file_path"]
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Input file not found: {file_path}")

    # A bare filename has no directory part; os.makedirs("") would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    delay = float(parameters.get("stub_delay_seconds", 2))
    if delay > 0:
        time.sleep(delay)

    # Minimal placeholder content that records the input file name.
    with open(output_path, "wb") as f:
        f.write(b"STUB_ONNX_OUTPUT_" + os.path.basename(file_path).encode())

    return {
        "file_path": output_path,
        "file_size": os.path.getsize(output_path),
        "eval_report": "",
        "model_info": {
            "model_id": parameters.get("model_id"),
            "version": parameters.get("version"),
            "platform": parameters.get("platform"),
        },
    }
|
||||
|
||||
|
||||
def process_bie_core_stub(
    input_paths: Dict[str, str],
    output_path: str,
    parameters: Dict[str, Any],
) -> Dict[str, Any]:
    """Stub BIE processing: sleep, then create a fake out.bie.

    Args:
        input_paths: Must contain "onnx_file_path" (existing file) and
            "data_dir" (reference image directory; may be missing on disk).
        output_path: File to create; its parent directory is created if needed.
        parameters: Job parameters. "model_id", "version" and "platform" are
            echoed in the result. The optional "stub_delay_seconds" overrides
            the simulated processing time (default 3 seconds).

    Returns:
        Dict with "file_path", "file_size", "model_info", fixed placeholder
        "analysis_info", and "processed_images" (number of regular files
        directly inside data_dir, 0 if it is not a directory).

    Raises:
        FileNotFoundError: If the ONNX file does not exist.
    """
    onnx_file_path = input_paths["onnx_file_path"]
    data_dir = input_paths["data_dir"]

    if not os.path.exists(onnx_file_path):
        raise FileNotFoundError(f"ONNX file not found: {onnx_file_path}")

    # A bare filename has no directory part; os.makedirs("") would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Count reference images (regular files only, non-recursive).
    img_count = 0
    if os.path.isdir(data_dir):
        img_count = sum(
            1 for name in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, name))
        )

    delay = float(parameters.get("stub_delay_seconds", 3))
    if delay > 0:
        time.sleep(delay)

    with open(output_path, "wb") as f:
        f.write(b"STUB_BIE_OUTPUT")

    return {
        "file_path": output_path,
        "file_size": os.path.getsize(output_path),
        "model_info": {
            "model_id": parameters.get("model_id"),
            "version": parameters.get("version"),
            "platform": parameters.get("platform"),
        },
        "analysis_info": {
            "input_name": "stub_input",
            "batch_size": 1,
            "channels": 3,
            "height": 224,
            "width": 224,
        },
        "processed_images": img_count,
    }
|
||||
|
||||
|
||||
def process_nef_core_stub(
    input_paths: Dict[str, str],
    output_path: str,
    parameters: Dict[str, Any],
) -> Dict[str, Any]:
    """Stub NEF processing: sleep, then create a fake out.nef.

    Args:
        input_paths: Must contain "bie_file_path", the existing BIE file.
        output_path: File to create; its parent directory is created if needed.
        parameters: Job parameters. "model_id", "version" and "platform" are
            echoed in the result. The optional "stub_delay_seconds" overrides
            the simulated processing time (default 2 seconds).

    Returns:
        Dict with "file_path", "file_size", "model_info" and placeholder
        "compilation_info".

    Raises:
        FileNotFoundError: If the BIE file does not exist.
    """
    bie_file_path = input_paths["bie_file_path"]

    if not os.path.exists(bie_file_path):
        raise FileNotFoundError(f"BIE file not found: {bie_file_path}")

    # A bare filename has no directory part; os.makedirs("") would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    delay = float(parameters.get("stub_delay_seconds", 2))
    if delay > 0:
        time.sleep(delay)

    with open(output_path, "wb") as f:
        f.write(b"STUB_NEF_OUTPUT")

    return {
        "file_path": output_path,
        "file_size": os.path.getsize(output_path),
        "model_info": {
            "model_id": parameters.get("model_id"),
            "version": parameters.get("version"),
            "platform": parameters.get("platform"),
        },
        "compilation_info": {
            "optimization_level": "stub",
            "memory_usage": "stub",
            "inference_speed": "stub",
        },
    }
|
||||
Loading…
x
Reference in New Issue
Block a user