diff --git a/.github/workflows/build-appimage-manual.yml b/.github/workflows/build-appimage-manual.yml new file mode 100644 index 00000000..9fdffcb7 --- /dev/null +++ b/.github/workflows/build-appimage-manual.yml @@ -0,0 +1,81 @@ +name: Build ProxMenux Monitor AppImage + +on: + + workflow_dispatch: + +permissions: + contents: write + +jobs: + build: + runs-on: ubuntu-22.04 + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + + - name: Install dependencies + working-directory: AppImage + run: npm install --legacy-peer-deps + + - name: Build Next.js app + working-directory: AppImage + run: npm run build + + - name: Install Python dependencies + run: | + sudo apt-get update + sudo apt-get install -y python3 python3-pip python3-venv + + - name: Make build script executable + working-directory: AppImage + run: chmod +x scripts/build_appimage.sh + + - name: Build AppImage + working-directory: AppImage + run: ./scripts/build_appimage.sh + + - name: Get version from package.json + id: version + working-directory: AppImage + run: echo "VERSION=$(node -p "require('./package.json').version")" >> $GITHUB_OUTPUT + + - name: Upload AppImage artifact + uses: actions/upload-artifact@v4 + with: + name: ProxMenux-${{ steps.version.outputs.VERSION }}-AppImage + path: AppImage/dist/*.AppImage + retention-days: 30 + + - name: Generate SHA256 checksum + run: | + cd AppImage/dist + sha256sum *.AppImage > ProxMenux-Monitor.AppImage.sha256 + echo "Generated SHA256:" + cat ProxMenux-Monitor.AppImage.sha256 + + - name: Upload AppImage and checksum to /AppImage folder in main + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + git config --global user.name "github-actions[bot]" + git config --global user.email "github-actions[bot]@users.noreply.github.com" + + git fetch origin main + git checkout main + + rm -f AppImage/*.AppImage AppImage/*.sha256 || true + + # Copy new files + cp 
AppImage/dist/*.AppImage AppImage/ + cp AppImage/dist/ProxMenux-Monitor.AppImage.sha256 AppImage/ + + git add AppImage/*.AppImage AppImage/*.sha256 + git commit -m "Update AppImage build ($(date +'%Y-%m-%d %H:%M:%S'))" || echo "No changes to commit" + git push origin main diff --git a/AppImage/components/gpu-switch-mode-indicator.tsx b/AppImage/components/gpu-switch-mode-indicator.tsx index d268a9c4..792a7734 100644 --- a/AppImage/components/gpu-switch-mode-indicator.tsx +++ b/AppImage/components/gpu-switch-mode-indicator.tsx @@ -2,12 +2,20 @@ import { cn } from "@/lib/utils" +interface SriovInfo { + role: "vf" | "pf-active" | "pf-idle" + physfn?: string // VF only: parent PF BDF + vfCount?: number // PF only: active VF count + totalvfs?: number // PF only: maximum VFs +} + interface GpuSwitchModeIndicatorProps { - mode: "lxc" | "vm" | "unknown" + mode: "lxc" | "vm" | "sriov" | "unknown" isEditing?: boolean pendingMode?: "lxc" | "vm" | null onToggle?: (e: React.MouseEvent) => void className?: string + sriovInfo?: SriovInfo } export function GpuSwitchModeIndicator({ @@ -16,20 +24,38 @@ export function GpuSwitchModeIndicator({ pendingMode = null, onToggle, className, + sriovInfo, }: GpuSwitchModeIndicatorProps) { - const displayMode = pendingMode ?? mode + // SR-IOV is a non-editable hardware state. Pending toggles don't apply here. + const displayMode = mode === "sriov" ? "sriov" : (pendingMode ?? mode) const isLxcActive = displayMode === "lxc" const isVmActive = displayMode === "vm" - const hasChanged = pendingMode !== null && pendingMode !== mode + const isSriovActive = displayMode === "sriov" + const hasChanged = + mode !== "sriov" && pendingMode !== null && pendingMode !== mode // Colors - const activeColor = isLxcActive ? "#3b82f6" : isVmActive ? "#a855f7" : "#6b7280" + const sriovColor = "#14b8a6" // teal-500 + const activeColor = isSriovActive + ? sriovColor + : isLxcActive + ? "#3b82f6" + : isVmActive + ? 
"#a855f7" + : "#6b7280" const inactiveColor = "#374151" // gray-700 for dark theme + const dimmedColor = "#4b5563" // gray-600 for dashed SR-IOV branches const lxcColor = isLxcActive ? "#3b82f6" : inactiveColor const vmColor = isVmActive ? "#a855f7" : inactiveColor const handleClick = (e: React.MouseEvent) => { - // Only stop propagation and handle toggle when in editing mode + // SR-IOV state can't be toggled — swallow the click so it doesn't reach + // the card (which would open the detail modal unexpectedly from this + // area). For lxc/vm, preserve the original behavior. + if (isSriovActive) { + e.stopPropagation() + return + } if (isEditing) { e.stopPropagation() if (onToggle) { @@ -39,11 +65,20 @@ export function GpuSwitchModeIndicator({ // When not editing, let the click propagate to the card to open the modal } + // Build the VF count label shown in the SR-IOV badge. For PFs we know + // exactly how many VFs are active; for a VF we show its parent PF. + const sriovBadgeText = (() => { + if (!isSriovActive) return "" + if (sriovInfo?.role === "vf") return "SR-IOV VF" + if (sriovInfo?.vfCount && sriovInfo.vfCount > 0) return `SR-IOV ×${sriovInfo.vfCount}` + return "SR-IOV" + })() + return ( -
{/* GPU text */} - @@ -115,112 +150,198 @@ export function GpuSwitchModeIndicator({ cx="95" cy="50" r="6" - fill={isEditing ? "#f59e0b" : activeColor} + fill={isEditing && !isSriovActive ? "#f59e0b" : activeColor} className="transition-all duration-300" /> - {/* LXC Branch Line - going up-right */} + {/* LXC Branch Line - going up-right. + In SR-IOV mode the branch is dashed + dimmed to show that the + target is theoretically reachable via a VF but not controlled + by ProxMenux. */} - {/* VM Branch Line - going down-right */} + {/* VM Branch Line - going down-right (dashed/dimmed in SR-IOV). */} - {/* LXC Container Icon - Server/Stack icon */} - - {/* Container box */} - - {/* Container layers/lines */} - - - {/* Status dots */} - - - - + {/* SR-IOV in-line connector + badge (only when mode === 'sriov'). + A horizontal line from the switch node leads to a pill-shaped + badge carrying the "SR-IOV ×N" label. Placed on the GPU's + baseline to visually read as an in-line extension, not as a + third branch. */} + {isSriovActive && ( + <> + + + + {sriovBadgeText} + + + )} + + {/* LXC Container Icon - dimmed/smaller in SR-IOV mode. */} + {!isSriovActive && ( + + + + + + + + + )} + {/* SR-IOV: compact dimmed LXC glyph so the geometry stays recognizable + but it's clearly not the active target. */} + {isSriovActive && ( + + + + + + )} {/* LXC Label */} - - LXC - + {!isSriovActive && ( + + LXC + + )} + {isSriovActive && ( + + LXC + + )} - {/* VM Monitor Icon */} - - {/* Monitor screen */} - - {/* Screen inner/shine */} - - {/* Monitor stand */} - - {/* Monitor base */} - - + {/* VM Monitor Icon - active view */} + {!isSriovActive && ( + + + + + + + )} + {/* SR-IOV: compact dimmed VM monitor glyph, mirror of the LXC glyph. 
*/} + {isSriovActive && ( + + + + + + )} {/* VM Label */} - - VM - + {!isSriovActive && ( + + VM + + )} + {isSriovActive && ( + + VM + + )} {/* Status Text - Large like GPU name */} @@ -228,22 +349,41 @@ export function GpuSwitchModeIndicator({ - {isLxcActive - ? "Ready for LXC containers" - : isVmActive - ? "Ready for VM passthrough" - : "Mode unknown"} + {isSriovActive + ? "SR-IOV active" + : isLxcActive + ? "Ready for LXC containers" + : isVmActive + ? "Ready for VM passthrough" + : "Mode unknown"} - {isLxcActive - ? "Native driver active" - : isVmActive - ? "VFIO-PCI driver active" - : "No driver detected"} + {isSriovActive + ? "Virtual Functions managed externally" + : isLxcActive + ? "Native driver active" + : isVmActive + ? "VFIO-PCI driver active" + : "No driver detected"} + {isSriovActive && sriovInfo && ( + + {sriovInfo.role === "vf" + ? `Virtual Function${sriovInfo.physfn ? ` · parent PF ${sriovInfo.physfn}` : ""}` + : sriovInfo.vfCount !== undefined + ? `1 PF + ${sriovInfo.vfCount} VF${sriovInfo.vfCount === 1 ? "" : "s"}${sriovInfo.totalvfs ? ` / ${sriovInfo.totalvfs} max` : ""}` + : null} + + )} {hasChanged && ( Change pending... diff --git a/AppImage/components/hardware.tsx b/AppImage/components/hardware.tsx index 88c2e26a..21959c37 100644 --- a/AppImage/components/hardware.tsx +++ b/AppImage/components/hardware.tsx @@ -293,11 +293,16 @@ export default function Hardware() { const [showSwitchModeModal, setShowSwitchModeModal] = useState(false) const [switchModeParams, setSwitchModeParams] = useState<{ gpuSlot: string; targetMode: "lxc" | "vm" } | null>(null) - // Determine GPU mode based on driver (vfio-pci = VM, native driver = LXC) - const getGpuSwitchMode = (gpu: GPU): "lxc" | "vm" | "unknown" => { + // Determine GPU mode based on driver (vfio-pci = VM, native driver = LXC). 
+ // SR-IOV short-circuits the driver check: if the GPU is either a VF or a + // PF with active VFs, the slot is in a hardware-partitioned state that + // ProxMenux does not manage from the UI, so it's surfaced as its own mode. + const getGpuSwitchMode = (gpu: GPU): "lxc" | "vm" | "sriov" | "unknown" => { + if (gpu.sriov_role === "vf" || gpu.sriov_role === "pf-active") return "sriov" + const driver = gpu.pci_driver?.toLowerCase() || "" const kernelModule = gpu.pci_kernel_module?.toLowerCase() || "" - + // Check driver first if (driver === "vfio-pci") return "vm" if (driver === "nvidia" || driver === "amdgpu" || driver === "radeon" || driver === "i915" || driver === "xe" || driver === "nouveau" || driver === "mgag200") return "lxc" @@ -940,7 +945,11 @@ return ( Switch Mode
- {editingSwitchModeGpu === fullSlot ? ( + {getGpuSwitchMode(gpu) === "sriov" ? ( + // SR-IOV: edit controls hidden — the state is + // hardware-managed and not togglable from here. + null + ) : editingSwitchModeGpu === fullSlot ? ( <>
)} @@ -1053,8 +1072,104 @@ return (

Loading real-time data...

+ ) : selectedGPU.sriov_role === "vf" ? ( + // SR-IOV Virtual Function: per-VF telemetry is not exposed + // by the kernel, so we skip the metrics panel and show + // identity + consumer + a link back to the parent PF. +
+
+
+
+ + + +
+
+

SR-IOV Virtual Function

+

+ This device is a Virtual Function spawned by a Physical Function. Per-VF + telemetry (temperature, utilization, memory) is not exposed by the kernel — + open the parent PF to see aggregate GPU metrics. +

+
+
+
+ +
+

+ Virtual Function Detail +

+
+ Parent Physical Function + {selectedGPU.sriov_physfn ? ( + + ) : ( + unknown + )} +
+
+ Current Driver + + {selectedGPU.pci_driver || "none"} + +
+
+ Consumer +
+ {realtimeGPUData?.sriov_consumer ? ( + + + {realtimeGPUData.sriov_consumer.type.toUpperCase()} {realtimeGPUData.sriov_consumer.id} + {realtimeGPUData.sriov_consumer.name && ` · ${realtimeGPUData.sriov_consumer.name}`} + {` · ${realtimeGPUData.sriov_consumer.running ? "running" : "stopped"}`} + + ) : ( + unused + )} +
+
+
+
) : realtimeGPUData?.has_monitoring_tool === true ? ( <> + {selectedGPU.sriov_role === "pf-active" && ( + // SR-IOV Physical Function: metrics below are the + // aggregate of the whole GPU (PF + all active VFs). + // Flag it explicitly so the reader interprets numbers + // correctly. +
+
+ + + SR-IOV active + + + Metrics below reflect the Physical Function (aggregate across + {" "} + + {realtimeGPUData?.sriov_vf_count ?? selectedGPU.sriov_vf_count ?? "N"} + + {" "}VFs). + +
+
+ )}
Updating every 3 seconds @@ -1285,6 +1400,67 @@ return (
)} + {selectedGPU.sriov_role === "pf-active" && + Array.isArray(realtimeGPUData?.sriov_vfs) && + realtimeGPUData.sriov_vfs.length > 0 && ( + // Per-VF table: one row per virtfn* under the PF. + // Driver is color-coded (teal native / purple vfio-pci + // / muted fallback) and consumer pills go green when + // the guest is currently running, muted otherwise. +
+

+ Virtual Functions +

+
+ {realtimeGPUData.sriov_vfs.map((vf: any) => ( +
+ {vf.bdf} +
+ + {vf.driver || "unbound"} + + {vf.consumer ? ( + + + {vf.consumer.type.toUpperCase()} {vf.consumer.id} + {vf.consumer.name && ( + · {vf.consumer.name} + )} + + ) : ( + + unused + + )} +
+
+ ))} +
+
+ )} ) : (findPCIDeviceForGPU(selectedGPU)?.driver === 'vfio-pci' || selectedGPU.pci_driver === 'vfio-pci') ? (
diff --git a/AppImage/config/verified_ai_models.json b/AppImage/config/verified_ai_models.json index 2e0c716d..da915473 100644 --- a/AppImage/config/verified_ai_models.json +++ b/AppImage/config/verified_ai_models.json @@ -1,7 +1,8 @@ { "_description": "Verified AI models for ProxMenux notifications. Only models listed here will be shown to users. Models are tested to work with the chat/completions API format.", - "_updated": "2026-03-20", - + "_updated": "2026-04-19", + "_verifier": "Refreshed with tools/ai-models-verifier (private). Re-run before each ProxMenux release to keep the list current. The verifier and ProxMenux share the same reasoning/thinking-model handlers so their verdicts stay aligned with runtime behaviour.", + "groq": { "models": [ "llama-3.3-70b-versatile", @@ -12,37 +13,46 @@ "mixtral-8x7b-32768", "gemma2-9b-it" ], - "recommended": "llama-3.3-70b-versatile" + "recommended": "llama-3.3-70b-versatile", + "_note": "Not yet re-verified in 2026-04 refresh — kept from previous curation. Run the verifier with a Groq key to prune deprecated entries." }, - + "gemini": { "models": [ - "gemini-2.5-flash", "gemini-2.5-flash-lite", - "gemini-2.5-pro" + "gemini-2.5-flash", + "gemini-3-flash-preview" ], - "recommended": "gemini-2.5-flash", - "_note": "gemini-2.5-flash-lite is cheaper but may struggle with complex prompts. Use with simple/custom prompts.", + "recommended": "gemini-2.5-flash-lite", + "_note": "flash-lite / flash pass the verifier consistently; pro variants reject thinkingBudget=0 and are overkill for notification translation anyway. 
'latest' aliases (gemini-flash-latest, gemini-flash-lite-latest) are intentionally omitted because they resolved to different models across runs and produced timeouts in some regions.", "_deprecated": ["gemini-2.0-flash", "gemini-2.0-flash-lite", "gemini-1.5-flash", "gemini-1.0-pro", "gemini-pro"] }, - + "openai": { "models": [ + "gpt-4.1-nano", "gpt-4.1-mini", - "gpt-4o-mini" + "gpt-4o-mini", + "gpt-4.1", + "gpt-4o", + "gpt-5-chat-latest", + "gpt-5.4-nano", + "gpt-5.4-mini" ], - "recommended": "gpt-4o-mini" + "recommended": "gpt-4.1-nano", + "_note": "Reasoning models (o-series, gpt-5/5.1/5.2 non-chat variants) are supported by openai_provider.py via max_completion_tokens + reasoning_effort=minimal, but not listed here by default: their latency is higher than the chat models and they do not improve translation quality for notifications. Add specific reasoning IDs to this list only if a user explicitly wants them." }, - + "anthropic": { "models": [ "claude-3-5-haiku-latest", "claude-3-5-sonnet-latest", "claude-3-opus-latest" ], - "recommended": "claude-3-5-haiku-latest" + "recommended": "claude-3-5-haiku-latest", + "_note": "Not re-verified in 2026-04 refresh — kept from previous curation. Add claude-4.x / claude-4.5 / claude-4.6 / claude-4.7 variants after running the verifier with an Anthropic key." }, - + "openrouter": { "models": [ "meta-llama/llama-3.3-70b-instruct", @@ -50,14 +60,15 @@ "meta-llama/llama-3.1-8b-instruct", "anthropic/claude-3.5-haiku", "anthropic/claude-3.5-sonnet", - "google/gemini-flash-2.5-flash-lite", + "google/gemini-flash-1.5", "openai/gpt-4o-mini", "mistralai/mistral-7b-instruct", "mistralai/mixtral-8x7b-instruct" ], - "recommended": "meta-llama/llama-3.3-70b-instruct" + "recommended": "meta-llama/llama-3.3-70b-instruct", + "_note": "Not re-verified in 2026-04 refresh. google/gemini-flash-2.5-flash-lite was malformed in the previous entry and has been replaced with google/gemini-flash-1.5." 
}, - + "ollama": { "_note": "Ollama models are local, we don't filter them. User manages their own models.", "models": [], diff --git a/AppImage/scripts/ai_providers/gemini_provider.py b/AppImage/scripts/ai_providers/gemini_provider.py index 49224fb6..85d251b8 100644 --- a/AppImage/scripts/ai_providers/gemini_provider.py +++ b/AppImage/scripts/ai_providers/gemini_provider.py @@ -30,6 +30,23 @@ class GeminiProvider(AIProvider): 'gemini-1.0-pro', 'gemini-pro', ] + + @staticmethod + def _has_thinking_mode(model: str) -> bool: + """True for Gemini variants that enable "thinking" by default. + + Gemini 2.5+ and 3.x Pro/Flash models spend output tokens on + internal reasoning before emitting the final answer. With a small + max_tokens budget (≤250) that consumes the whole allowance and + leaves an empty reply. For the short translate/explain use case + in ProxMenux we want direct output, so we disable thinking for + these. Lite variants (flash-lite) do NOT have thinking enabled + and are safe to leave alone. + """ + m = model.lower() + if 'lite' in m: + return False + return m.startswith('gemini-2.5') or m.startswith('gemini-3') def list_models(self) -> List[str]: """List available Gemini models that support generateContent. @@ -118,6 +135,18 @@ class GeminiProvider(AIProvider): url = f"{self.API_BASE}/{self.model}:generateContent?key={self.api_key}" # Gemini uses a specific format with contents array + gen_config = { + 'maxOutputTokens': max_tokens, + 'temperature': 0.3, + } + + # Disable thinking on 2.5+ / 3.x pro & flash models so the limited + # output budget actually produces visible text. thinkingBudget=0 + # is the official switch for this; lite variants and legacy + # models don't need (and ignore) the field. 
+ if self._has_thinking_mode(self.model): + gen_config['thinkingConfig'] = {'thinkingBudget': 0} + payload = { 'systemInstruction': { 'parts': [{'text': system_prompt}] @@ -128,10 +157,7 @@ class GeminiProvider(AIProvider): 'parts': [{'text': user_message}] } ], - 'generationConfig': { - 'maxOutputTokens': max_tokens, - 'temperature': 0.3, - } + 'generationConfig': gen_config, } headers = { diff --git a/AppImage/scripts/ai_providers/openai_provider.py b/AppImage/scripts/ai_providers/openai_provider.py index d5877da5..86484767 100644 --- a/AppImage/scripts/ai_providers/openai_provider.py +++ b/AppImage/scripts/ai_providers/openai_provider.py @@ -37,23 +37,49 @@ class OpenAIProvider(AIProvider): # Recommended models for chat (in priority order) RECOMMENDED_PREFIXES = ['gpt-4o-mini', 'gpt-4o', 'gpt-4-turbo', 'gpt-4', 'gpt-3.5-turbo'] + + @staticmethod + def _is_reasoning_model(model: str) -> bool: + """True for OpenAI reasoning models (o-series + non-chat gpt-5+). + + These use a stricter API contract than chat models: + - Must use ``max_completion_tokens`` instead of ``max_tokens`` + - ``temperature`` is not accepted (only the default is supported) + + Chat-optimized variants (``gpt-5-chat-latest``, + ``gpt-5.1-chat-latest``, etc.) keep the classic contract and are + NOT flagged here. + """ + m = model.lower() + # o1, o3, o4, o5 ... (o...) + if len(m) >= 2 and m[0] == 'o' and m[1].isdigit(): + return True + # gpt-5, gpt-5-mini, gpt-5.1, gpt-5.2-pro ... EXCEPT *-chat-latest + if m.startswith('gpt-5') and '-chat' not in m: + return True + return False def list_models(self) -> List[str]: - """List available OpenAI models for chat completions. - - Filters to only chat-capable models, excluding: - - Embedding models - - Audio/speech models (whisper, tts) - - Image models (dall-e) - - Instruct models (different API) - - Legacy models (babbage, davinci, etc.) - + """List available models for chat completions. 
+ + Two modes: + - Official OpenAI (no custom base_url): restrict to GPT chat models, + excluding embedding/whisper/tts/dall-e/instruct/legacy variants. + - OpenAI-compatible endpoint (LiteLLM, MLX, LM Studio, vLLM, + LocalAI, Ollama-proxy, etc.): the "gpt" substring check is + dropped so user-served models (e.g. ``mlx-community/Llama-3.1-8B``, + ``Qwen3-32B``, ``mistralai/...``) show up. EXCLUDED_PATTERNS + still applies — embeddings/whisper/tts aren't chat-capable on + any backend. + Returns: List of model IDs suitable for chat completions. """ if not self.api_key: return [] - + + is_custom_endpoint = bool(self.base_url) + try: # Determine models URL from base_url if set if self.base_url: @@ -63,42 +89,46 @@ class OpenAIProvider(AIProvider): models_url = f"{base}/models" else: models_url = self.DEFAULT_MODELS_URL - + req = urllib.request.Request( models_url, headers={'Authorization': f'Bearer {self.api_key}'}, method='GET' ) - + with urllib.request.urlopen(req, timeout=10) as resp: data = json.loads(resp.read().decode('utf-8')) - + models = [] for model in data.get('data', []): model_id = model.get('id', '') if not model_id: continue - + model_lower = model_id.lower() - - # Must be a GPT model - if 'gpt' not in model_lower: + + # Official OpenAI: restrict to GPT chat models. Custom + # endpoints serve arbitrarily named models, so this + # substring check would drop every valid result there. + if not is_custom_endpoint and 'gpt' not in model_lower: continue - - # Exclude non-chat models + + # Exclude non-chat models on every backend. if any(pattern in model_lower for pattern in self.EXCLUDED_PATTERNS): continue - + models.append(model_id) - - # Sort with recommended models first + + # Sort with recommended models first (only meaningful for OpenAI + # official; on custom endpoints the prefixes rarely match, so + # entries fall through to alphabetical order, which is fine). 
def sort_key(m): m_lower = m.lower() for i, prefix in enumerate(self.RECOMMENDED_PREFIXES): if m_lower.startswith(prefix): return (i, m) return (len(self.RECOMMENDED_PREFIXES), m) - + return sorted(models, key=sort_key) except Exception as e: print(f"[OpenAIProvider] Failed to list models: {e}") @@ -133,17 +163,35 @@ class OpenAIProvider(AIProvider): """ if not self.api_key: raise AIProviderError("API key required for OpenAI") - + payload = { 'model': self.model, 'messages': [ {'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': user_message}, ], - 'max_tokens': max_tokens, - 'temperature': 0.3, } - + + # Reasoning models (o1/o3/o4/gpt-5*, excluding *-chat-latest) use a + # different parameter contract: max_completion_tokens instead of + # max_tokens, and no temperature field. Sending the classic chat + # parameters to them produces HTTP 400 Bad Request. + # + # They also spend output budget on internal reasoning by default, + # which empties the user-visible reply when max_tokens is small + # (like the ~200 we use for notifications). reasoning_effort + # 'minimal' keeps that internal reasoning to a minimum so the + # entire budget is available for the translation, which is + # exactly what this pipeline wants. OpenAI documents 'minimal', + # 'low', 'medium', 'high' — 'minimal' is the right setting for a + # straightforward translate+explain task. 
+ if self._is_reasoning_model(self.model): + payload['max_completion_tokens'] = max_tokens + payload['reasoning_effort'] = 'minimal' + else: + payload['max_tokens'] = max_tokens + payload['temperature'] = 0.3 + headers = { 'Content-Type': 'application/json', 'Authorization': f'Bearer {self.api_key}', diff --git a/AppImage/scripts/flask_notification_routes.py b/AppImage/scripts/flask_notification_routes.py index 7c3294b7..024804a0 100644 --- a/AppImage/scripts/flask_notification_routes.py +++ b/AppImage/scripts/flask_notification_routes.py @@ -220,10 +220,20 @@ def get_provider_models(): # Get all models from provider API api_models = ai_provider.list_models() - + + # OpenAI with a custom base URL means an OpenAI-compatible endpoint + # (LiteLLM, MLX, LM Studio, vLLM, LocalAI, Ollama-proxy...). The + # verified_ai_models.json list only contains official OpenAI IDs + # (gpt-4o-mini etc.), so intersecting against it would strip every + # model the user actually serves. Treat the custom-endpoint case + # like Ollama: return whatever the endpoint advertises, no filter. + is_openai_compat = (provider == 'openai' and bool(openai_base_url)) + if not api_models: - # API failed, fall back to verified list only - if verified_models: + # API failed, fall back to verified list only (but not for + # custom endpoints — we don't know what the endpoint serves, + # so "gpt-4o-mini" as a fallback would be misleading). + if verified_models and not is_openai_compat: models = sorted(verified_models) return jsonify({ 'success': True, @@ -232,27 +242,38 @@ def get_provider_models(): 'message': f'{len(models)} verified models (API unavailable)' }) return jsonify({ - 'success': False, - 'models': [], - 'message': 'Could not retrieve models. Check your API key.' + 'success': False, + 'models': [], + 'message': 'Could not retrieve models. Check your API key and endpoint URL.' }) - + + if is_openai_compat: + # Custom OpenAI-compatible endpoint: surface every model the + # endpoint reports. 
No verified-list intersection. + models = sorted(api_models) + return jsonify({ + 'success': True, + 'models': models, + 'recommended': models[0] if models else '', + 'message': f'Found {len(models)} models on custom endpoint' + }) + # Filter: only models that are BOTH in API and verified list if verified_models: api_models_set = set(api_models) filtered_models = [m for m in verified_models if m in api_models_set] - + if not filtered_models: # No intersection - maybe verified list is outdated # Return verified list anyway (will fail on use if truly unavailable) filtered_models = list(verified_models) - + # Sort with recommended first def sort_key(m): if m == recommended: return (0, m) return (1, m) - + models = sorted(filtered_models, key=sort_key) else: # No verified list for this provider, return all from API diff --git a/AppImage/scripts/flask_server.py b/AppImage/scripts/flask_server.py index 83e01204..7effb6c6 100644 --- a/AppImage/scripts/flask_server.py +++ b/AppImage/scripts/flask_server.py @@ -6151,6 +6151,211 @@ def get_network_hardware_info(pci_slot): return net_info +def _get_sriov_info(slot): + """Return SR-IOV role for a PCI slot via sysfs. + + Reads /sys/bus/pci/devices// for: + - physfn symlink → slot is a Virtual Function; link target is its PF + - sriov_numvfs → active VF count if slot is a Physical Function + - sriov_totalvfs → maximum VFs this PF can spawn + + Returns a dict ready to merge into the GPU object, or {} on any error. + The 'role' key uses the same vocabulary as _pci_sriov_role in the + bash helpers (pci_passthrough_helpers.sh): vf | pf-active | pf-idle | none. 
+ """ + try: + bdf = slot if slot.startswith('0000:') else f'0000:{slot}' + base = f'/sys/bus/pci/devices/{bdf}' + if not os.path.isdir(base): + return {} + + physfn = os.path.join(base, 'physfn') + if os.path.islink(physfn): + parent = os.path.basename(os.path.realpath(physfn)) + return { + 'sriov_role': 'vf', + 'sriov_physfn': parent, + } + + totalvfs_path = os.path.join(base, 'sriov_totalvfs') + if not os.path.isfile(totalvfs_path): + return {'sriov_role': 'none'} + + try: + totalvfs = int((open(totalvfs_path).read() or '0').strip() or 0) + except (ValueError, OSError): + totalvfs = 0 + if totalvfs <= 0: + return {'sriov_role': 'none'} + + try: + numvfs = int((open(os.path.join(base, 'sriov_numvfs')).read() or '0').strip() or 0) + except (ValueError, OSError): + numvfs = 0 + + return { + 'sriov_role': 'pf-active' if numvfs > 0 else 'pf-idle', + 'sriov_vf_count': numvfs, + 'sriov_totalvfs': totalvfs, + } + except Exception: + return {} + + +def _sriov_list_vfs_of_pf(pf_bdf): + """Return sorted list of VF BDFs that belong to a Physical Function. + Reads /sys/bus/pci/devices//virtfn symlinks (one per VF). + """ + try: + pf_full = pf_bdf if pf_bdf.startswith('0000:') else f'0000:{pf_bdf}' + base = f'/sys/bus/pci/devices/{pf_full}' + if not os.path.isdir(base): + return [] + # virtfn links are numbered (virtfn0, virtfn1, ...) and point to the VF. + entries = sorted(glob.glob(f'{base}/virtfn*'), + key=lambda p: int(re.search(r'virtfn(\d+)', p).group(1)) + if re.search(r'virtfn(\d+)', p) else 0) + return [os.path.basename(os.path.realpath(p)) for p in entries] + except Exception: + return [] + + +def _sriov_pci_driver(bdf): + """Return the current driver bound to a PCI BDF, '' if unbound.""" + try: + link = f'/sys/bus/pci/devices/{bdf}/driver' + if os.path.islink(link): + return os.path.basename(os.path.realpath(link)) + except Exception: + pass + return '' + + +def _sriov_pci_render_node(bdf): + """If the device exposes a DRM render node, return '/dev/dri/renderDX'. 
+ LXC containers consume GPUs through these nodes, so this lets us + cross-reference an LXC's `dev: /dev/dri/renderD` config line + back to a specific VF. + """ + try: + drm_dir = f'/sys/bus/pci/devices/{bdf}/drm' + if not os.path.isdir(drm_dir): + return '' + for name in sorted(os.listdir(drm_dir)): + if name.startswith('renderD'): + return f'/dev/dri/{name}' + except Exception: + pass + return '' + + +def _sriov_guest_running(guest_type, gid): + """Best-effort status check. Returns True if running, False otherwise.""" + try: + cmd = ['qm' if guest_type == 'vm' else 'pct', 'status', str(gid)] + r = subprocess.run(cmd, capture_output=True, text=True, timeout=3) + return 'running' in (r.stdout or '').lower() + except Exception: + return False + + +def _sriov_find_guest_consumer(bdf): + """Find the VM or LXC that consumes a given VF (or PF) on the host. + + VMs: scan /etc/pve/qemu-server/*.conf for a `hostpci: ` line that + references the BDF (short or full form, possibly alongside other + ids separated by ';' and trailing options after ','). + LXCs: resolve the BDF to its DRM render node (if any) and scan + /etc/pve/lxc/*.conf for `dev:` or `lxc.mount.entry:` lines that + reference that node. + + Returns {type, id, name, running} or None. 
+ """ + short_bdf = bdf[5:] if bdf.startswith('0000:') else bdf + full_bdf = bdf if bdf.startswith('0000:') else f'0000:{bdf}' + + # ── VM scan ── + try: + for conf in sorted(glob.glob('/etc/pve/qemu-server/*.conf')): + try: + with open(conf, 'r') as f: + text = f.read() + except OSError: + continue + if re.search( + rf'^hostpci\d+:\s*[^\n]*(?:0000:)?{re.escape(short_bdf)}(?:[,;\s]|$)', + text, re.MULTILINE, + ): + vmid = os.path.basename(conf)[:-5] # strip '.conf' + nm = re.search(r'^name:\s*(\S+)', text, re.MULTILINE) + name = nm.group(1) if nm else '' + return { + 'type': 'vm', + 'id': vmid, + 'name': name, + 'running': _sriov_guest_running('vm', vmid), + } + except Exception: + pass + + # ── LXC scan (via render node) ── + render_node = _sriov_pci_render_node(full_bdf) + if render_node: + try: + for conf in sorted(glob.glob('/etc/pve/lxc/*.conf')): + try: + with open(conf, 'r') as f: + text = f.read() + except OSError: + continue + if re.search( + rf'^(?:dev\d+|lxc\.mount\.entry):\s*[^\n]*{re.escape(render_node)}(?:[,;\s]|$)', + text, re.MULTILINE, + ): + ctid = os.path.basename(conf)[:-5] + nm = re.search(r'^hostname:\s*(\S+)', text, re.MULTILINE) + name = nm.group(1) if nm else '' + return { + 'type': 'lxc', + 'id': ctid, + 'name': name, + 'running': _sriov_guest_running('lxc', ctid), + } + except Exception: + pass + + return None + + +def _sriov_enrich_detail(gpu): + """On-demand enrichment for the GPU detail modal. + + For a PF with active VFs, populates gpu['sriov_vfs'] with per-VF driver + and consumer info. For a VF, populates gpu['sriov_consumer'] with the + guest (if any) currently referencing it. Heavier than _get_sriov_info() + because it scans guest configs, so it is NOT called from the hardware + snapshot path — only from the realtime endpoint. 
+ """ + role = gpu.get('sriov_role') + slot = gpu.get('slot', '') + if not slot: + return + full_bdf = slot if slot.startswith('0000:') else f'0000:{slot}' + + if role == 'pf-active': + vf_list = [] + for vf_bdf in _sriov_list_vfs_of_pf(full_bdf): + vf_list.append({ + 'bdf': vf_bdf, + 'driver': _sriov_pci_driver(vf_bdf) or '', + 'render_node': _sriov_pci_render_node(vf_bdf) or '', + 'consumer': _sriov_find_guest_consumer(vf_bdf), + }) + gpu['sriov_vfs'] = vf_list + elif role == 'vf': + gpu['sriov_consumer'] = _sriov_find_guest_consumer(full_bdf) + + def get_gpu_info(): """Detect and return information about GPUs in the system""" gpus = [] @@ -6196,7 +6401,11 @@ def get_gpu_info(): gpu['pci_class'] = pci_info.get('class', '') gpu['pci_driver'] = pci_info.get('driver', '') gpu['pci_kernel_module'] = pci_info.get('kernel_module', '') - + + sriov_fields = _get_sriov_info(slot) + if sriov_fields: + gpu.update(sriov_fields) + # detailed_info = get_detailed_gpu_info(gpu) # Removed this call here # gpu.update(detailed_info) # It will be called later in api_gpu_realtime @@ -10010,7 +10219,12 @@ def api_gpu_realtime(slot): pass detailed_info = get_detailed_gpu_info(gpu) gpu.update(detailed_info) - + + # SR-IOV detail is only relevant when the modal is actually open, + # so we build it on demand here (not in get_gpu_info) to avoid + # scanning every guest config on the hardware snapshot path. 
+ _sriov_enrich_detail(gpu) + # Extract only the monitoring-related fields realtime_data = { 'has_monitoring_tool': gpu.get('has_monitoring_tool', False), @@ -10035,9 +10249,17 @@ def api_gpu_realtime(slot): # Added for NVIDIA/AMD specific engine info if available 'engine_encoder': gpu.get('engine_encoder'), 'engine_decoder': gpu.get('engine_decoder'), - 'driver_version': gpu.get('driver_version') # Added driver_version + 'driver_version': gpu.get('driver_version'), # Added driver_version + # SR-IOV modal detail (populated only when the GPU is an SR-IOV + # Physical Function with active VFs, or a Virtual Function). + 'sriov_role': gpu.get('sriov_role'), + 'sriov_physfn': gpu.get('sriov_physfn'), + 'sriov_vf_count': gpu.get('sriov_vf_count'), + 'sriov_totalvfs': gpu.get('sriov_totalvfs'), + 'sriov_vfs': gpu.get('sriov_vfs'), + 'sriov_consumer': gpu.get('sriov_consumer'), } - + return jsonify(realtime_data) except Exception as e: # print(f"[v0] Error getting real-time GPU data: {e}") diff --git a/AppImage/types/hardware.ts b/AppImage/types/hardware.ts index 751cfe4c..092b1dd1 100644 --- a/AppImage/types/hardware.ts +++ b/AppImage/types/hardware.ts @@ -190,6 +190,34 @@ export interface GPU { }> has_monitoring_tool?: boolean note?: string + // SR-IOV state — populated from sysfs (physfn symlink + sriov_{num,total}vfs). + // "vf" — this slot is a Virtual Function; sriov_physfn is its PF. + // "pf-active" — this slot is a Physical Function with sriov_vf_count > 0. + // "pf-idle" — SR-IOV capable PF but no VFs currently active. + // "none" — not involved in SR-IOV. + sriov_role?: "vf" | "pf-active" | "pf-idle" | "none" + sriov_physfn?: string + sriov_vf_count?: number + sriov_totalvfs?: number + // SR-IOV detail — only populated by the /api/gpu//realtime endpoint + // when the modal is open (scanning guest configs is too expensive for the + // hardware snapshot path). 
+ sriov_vfs?: SriovVfDetail[] // filled when role === "pf-active" + sriov_consumer?: SriovConsumer | null // filled when role === "vf" +} + +export interface SriovVfDetail { + bdf: string // e.g. "0000:00:02.1" + driver: string // current kernel driver (i915, vfio-pci, ...) + render_node: string // "" when the VF does not expose a DRM node + consumer: SriovConsumer | null // which guest is using this VF, if any +} + +export interface SriovConsumer { + type: "vm" | "lxc" + id: string // VMID or CTID + name: string // VM name / LXC hostname + running: boolean } export interface DiskHardwareInfo { diff --git a/images/riov-indicator.png b/images/riov-indicator.png new file mode 100644 index 00000000..a1af122d Binary files /dev/null and b/images/riov-indicator.png differ diff --git a/scripts/global/gpu_hook_guard_helpers.sh b/scripts/global/gpu_hook_guard_helpers.sh index 22dc36ab..3c54f681 100644 --- a/scripts/global/gpu_hook_guard_helpers.sh +++ b/scripts/global/gpu_hook_guard_helpers.sh @@ -138,6 +138,12 @@ if [[ -f "$vm_conf" ]]; then slot_has_gpu=false for dev in /sys/bus/pci/devices/0000:${slot}.*; do [[ -e "$dev" ]] || continue + # SR-IOV: skip Virtual Functions when iterating a whole slot. + # VFs share the slot with their PF but carry their own driver + # state; their vfio-pci rebind is handled by Proxmox at VM + # start. Pre-flighting them would falsely block SR-IOV setups + # where the PF legitimately stays on the native driver. + [[ -L "${dev}/physfn" ]] && continue class_hex="$(cat "$dev/class" 2>/dev/null | sed 's/^0x//')" [[ "${class_hex:0:2}" != "03" ]] && continue slot_has_gpu=true @@ -159,6 +165,14 @@ if [[ -f "$vm_conf" ]]; then details+=$'\n'"- ${id}: PCI device not found" continue fi + # SR-IOV VF: do not pre-flight the driver. Proxmox rebinds the VF + # to vfio-pci as part of VM start; at pre-start time the VF may + # still be on its native driver (i915, etc.) — that is normal, + # not an error. 
Blocking here would prevent every SR-IOV VF + # passthrough from starting. + if [[ -L "${dev_path}/physfn" ]]; then + continue + fi class_hex="$(cat "$dev_path/class" 2>/dev/null | sed 's/^0x//')" # Enforce vfio only for display/3D devices (PCI class 03xx). [[ "${class_hex:0:2}" == "03" ]] || continue diff --git a/scripts/global/pci_passthrough_helpers.sh b/scripts/global/pci_passthrough_helpers.sh index fa29ace6..1317ff32 100644 --- a/scripts/global/pci_passthrough_helpers.sh +++ b/scripts/global/pci_passthrough_helpers.sh @@ -11,6 +11,205 @@ function _pci_is_iommu_active() { find /sys/kernel/iommu_groups -mindepth 1 -maxdepth 1 -type d -print -quit 2>/dev/null | grep -q . } +# Audio-companion cascade helpers (Part 2 of the SR-IOV / audio rework). +# +# When a GPU is detached from a VM (user chooses "Remove GPU from VM +# config" during a mode switch), the historic sed-based cleanup only +# removes hostpci lines that match the GPU's PCI slot (e.g. 00:02). +# That leaves any "companion" audio that lives at a different slot — +# typically the chipset audio at 00:1f.X, which add_gpu_vm.sh now adds +# alongside an Intel iGPU via the checklist from Part 1 — stranded in +# the VM config. On the next VM start, vfio-pci is no longer claiming +# that audio device (its vendor:device was pulled from vfio.conf +# during the switch-back) and either QEMU fails to rebind it or it +# breaks host audio. +# +# _vm_list_orphan_audio_hostpci reports those stranded entries; each +# caller uses its own UI (dialog, whiptail, hybrid_msgbox) to confirm +# removal and then calls _vm_remove_hostpci_index per selected entry. + +# Usage: _vm_list_orphan_audio_hostpci +# gpu_slot_base: the GPU's PCI slot WITHOUT function suffix, e.g. "00:02". +# Output: one line per orphan entry, in the form "idx|bdf|human_name". +# Empty output when the VM has no audio passthrough outside the GPU slot. 
#
# A hostpci audio entry is reported as "orphan" ONLY if the same VM has
# no display/3D-class hostpci at the same slot base. Rationale: the
# audio at e.g. 02:00.1 is the HDMI codec of a dGPU at 02:00.0 — if
# that dGPU is still being passed through to this VM (as a separate
# hostpciN), the audio belongs to it and must not be touched when
# detaching an unrelated GPU (e.g. an Intel iGPU at 00:02.0) from the
# same VM. Without this filter we would strip the HDMI audio of every
# other GPU in the VM, leaving them silent on next start.
function _vm_list_orphan_audio_hostpci() {
    local vmid="$1" gpu_slot="$2"
    if [[ -z "$vmid" || -z "$gpu_slot" ]]; then
        return 1
    fi
    local vm_conf="/etc/pve/qemu-server/${vmid}.conf"
    if [[ ! -f "$vm_conf" ]]; then
        return 1
    fi

    # A PCI address with an explicit function digit, with or without the
    # leading "0000:" domain. Only the first address on each hostpci line
    # is considered.
    local bdf_re='(0000:)?[0-9a-fA-F]{2}:[0-9a-fA-F]{2}\.[0-7]'

    # ── Pass 1 ── remember every slot base (bus:device, no function) whose
    # passed-through device reports a display/3D class (03xx) in sysfs.
    # Audio functions living on one of these slots belong to that GPU.
    local -A display_slot_set=()
    local entry dev_id class_code slot_key
    while IFS= read -r entry; do
        dev_id=$(grep -oE "$bdf_re" <<<"$entry" | head -n1)
        if [[ -z "$dev_id" ]]; then
            continue
        fi
        [[ "$dev_id" == 0000:* ]] || dev_id="0000:${dev_id}"
        class_code=$(sed 's/^0x//' "/sys/bus/pci/devices/${dev_id}/class" 2>/dev/null)
        if [[ "${class_code:0:2}" == "03" ]]; then
            slot_key="${dev_id#0000:}"
            display_slot_set["${slot_key%.*}"]=1
        fi
    done < <(grep -E '^hostpci[0-9]+:' "$vm_conf")

    # ── Pass 2 ── walk the hostpci lines again and print the audio
    # entries that have no owning GPU left in this VM.
    local hostpci_idx label
    while IFS= read -r entry; do
        # "hostpci7: ..." -> "7"
        hostpci_idx="${entry%%:*}"
        hostpci_idx="${hostpci_idx#hostpci}"
        [[ -n "$hostpci_idx" ]] || continue

        dev_id=$(grep -oE "$bdf_re" <<<"$entry" | head -n1)
        [[ -n "$dev_id" ]] || continue
        [[ "$dev_id" == 0000:* ]] || dev_id="0000:${dev_id}"
        slot_key="${dev_id#0000:}"
        slot_key="${slot_key%.*}"

        # Entries on the GPU's own slot are cleaned up by the caller's
        # primary sed/qm-set path, not reported here.
        if [[ "$slot_key" == "$gpu_slot" ]]; then
            continue
        fi

        # Candidates are audio-class devices only (PCI class 04xx).
        class_code=$(sed 's/^0x//' "/sys/bus/pci/devices/${dev_id}/class" 2>/dev/null)
        [[ "${class_code:0:2}" == "04" ]] || continue

        # Display-sibling guard: this audio function is the HDMI/DP codec
        # of a GPU that is still passed through to the VM.
        [[ -z "${display_slot_set[$slot_key]:-}" ]] || continue

        # Human-readable name: lspci line without the address, capped at
        # 52 characters; generic label when lspci yields nothing.
        label=$(lspci -nn -s "${dev_id#0000:}" 2>/dev/null \
            | sed 's/^[^ ]* //' \
            | cut -c1-52)

        printf '%s|%s|%s\n' "$hostpci_idx" "$dev_id" "${label:-PCI audio device}"
    done < <(grep -E '^hostpci[0-9]+:' "$vm_conf")

    return 0
}

# Returns 0 if the given PCI BDF still appears as a hostpci passthrough
# target in any VM config, optionally excluding one or more VM IDs.
# Usage: _pci_bdf_in_any_vm <bdf> [excluded_vmid]...
#
# Used by the switch-mode cascade to decide whether a companion audio
# device's vendor:device pair is safe to remove from /etc/modprobe.d/
# vfio.conf (only if no other VM still references it).
function _pci_bdf_in_any_vm() {
    local bdf="$1"; shift
    [[ -n "$bdf" ]] || return 1
    # Compare on the short form; the pattern below accepts an optional
    # "0000:" domain prefix in the config line.
    local short_bdf="${bdf#0000:}"
    # The BDF is embedded in an extended regex, so turn its '.' into a
    # literal-dot bracket expression; otherwise "00:1f.3" would also
    # match any other character in that position.
    local bdf_re="${short_bdf//./[.]}"
    local conf vmid ex skip
    for conf in /etc/pve/qemu-server/*.conf; do
        # With no configs the glob stays literal; -f filters that out.
        [[ -f "$conf" ]] || continue
        vmid=$(basename "$conf" .conf)
        # Remaining positional arguments are VM IDs to ignore.
        skip=false
        for ex in "$@"; do
            if [[ "$vmid" == "$ex" ]]; then
                skip=true
                break
            fi
        done
        $skip && continue
        # A BDF inside a hostpci value can be followed by ',' (options),
        # ';' (another device in the same entry), whitespace or end of
        # line. ';' must be accepted or multi-device entries are missed
        # and a device still in use would be reported as unreferenced.
        if grep -qE "^hostpci[0-9]+:.*(0000:)?${bdf_re}([,;[:space:]]|$)" "$conf" 2>/dev/null; then
            return 0
        fi
    done
    return 1
}

# Usage: _vm_remove_hostpci_index <vmid> <idx> [log_file]
# Removes hostpci<idx> from the VM config via `qm set --delete` so the
# change goes through Proxmox's own validation path (running VMs get a
# staged update). Returns the exit code of qm set.
# log_file defaults to $LOG_FILE if set, otherwise /dev/null; qm output
# (stdout and stderr) is appended to it.
function _vm_remove_hostpci_index() {
    local vmid="$1" idx="$2"
    local log="${3:-${LOG_FILE:-/dev/null}}"
    [[ -n "$vmid" && -n "$idx" ]] || return 1
    qm set "$vmid" --delete "hostpci${idx}" >>"$log" 2>&1
}

# Robust LXC stop for switch-mode / passthrough flows.
#
# A plain `pct stop` can hang indefinitely when:
#   - the container has a stale lock from a previous aborted operation,
#   - processes inside the container (Plex, Jellyfin, databases) ignore
#     the initial TERM and sit in uninterruptible-sleep (D state) while
#     the GPU they were using is being yanked out,
#   - the host is under load and Proxmox's state polling stalls,
#   - `pct shutdown --timeout` is not always enforced by pct itself
#     (observed field reports of 5+ min waits despite --timeout 30).
+# +# Strategy: +# 1) return 0 immediately if the container is not running, +# 2) clear any stale lock (most common cause of hangs), +# 3) try `pct shutdown --forceStop 1 --timeout 30`, wrapped in an +# external `timeout 45` as belt-and-braces in case pct itself +# blocks on backend I/O, +# 4) verify actual status via `pct status` — do not trust exit codes, +# pct can return non-zero while the container is actually stopped, +# 5) if still running, fall back to `pct stop` wrapped in `timeout 60`, +# 6) verify again and return 1 if the container is truly stuck +# (only happens when processes are in D state — requires manual +# intervention, but the wizard moves on instead of hanging). +# +# Usage: _pmx_stop_lxc [log_file] +# log_file defaults to $LOG_FILE if set, otherwise /dev/null. +# Returns 0 on stopped / already-stopped, non-zero if every attempt failed. +function _pmx_stop_lxc() { + local ctid="$1" + local log="${2:-${LOG_FILE:-/dev/null}}" + + _pmx_lxc_running() { + pct status "$1" 2>/dev/null | grep -q "status: running" + } + + _pmx_lxc_running "$ctid" || return 0 + + # Best-effort unlock — silent on failure because most containers aren't + # actually locked; we only care about the cases where they are. + pct unlock "$ctid" >>"$log" 2>&1 || true + + # Graceful shutdown with forced kill after 30 s. The external `timeout 45` + # guarantees we never wait longer than that for this step, even if pct + # itself is stuck (the cushion over 30 s is to let the internal timeout + # cleanly unwind before we kill pct). + timeout 45 pct shutdown "$ctid" --forceStop 1 --timeout 30 >>"$log" 2>&1 || true + sleep 1 + _pmx_lxc_running "$ctid" || return 0 + + # Fallback: abrupt stop, also externally capped so the wizard does not + # hang the user indefinitely if lxc-stop blocks on D-state processes. 
+ timeout 60 pct stop "$ctid" >>"$log" 2>&1 || true + sleep 1 + _pmx_lxc_running "$ctid" || return 0 + + return 1 +} + function _pci_next_hostpci_index() { local vmid="$1" local idx=0 @@ -50,3 +249,109 @@ function _pci_function_assigned_to_vm() { qm config "$vmid" 2>/dev/null | grep -qE "$pattern" } + +# ========================================================== +# SR-IOV detection helpers +# ========================================================== +# A PCI device participates in SR-IOV when either: +# - It is a Physical Function (PF) with one or more active VFs +# → /sys/bus/pci/devices//sriov_numvfs > 0 +# - It is a Virtual Function (VF) spawned by a PF +# → /sys/bus/pci/devices//physfn is a symlink to the PF +# +# These helpers accept a BDF in either "0000:00:02.0" or "00:02.0" form. +# Return 0 on match, non-zero otherwise (shell convention). + +function _pci_normalize_bdf() { + local id="$1" + [[ -z "$id" ]] && return 1 + [[ "$id" =~ ^0000: ]] || id="0000:${id}" + printf '%s\n' "$id" +} + +function _pci_is_vf() { + local id + id=$(_pci_normalize_bdf "$1") || return 1 + [[ -L "/sys/bus/pci/devices/${id}/physfn" ]] +} + +function _pci_get_pf_of_vf() { + local id + id=$(_pci_normalize_bdf "$1") || return 1 + local link="/sys/bus/pci/devices/${id}/physfn" + [[ -L "$link" ]] || return 1 + basename "$(readlink -f "$link")" +} + +function _pci_is_sriov_capable() { + local id total + id=$(_pci_normalize_bdf "$1") || return 1 + total=$(cat "/sys/bus/pci/devices/${id}/sriov_totalvfs" 2>/dev/null) + [[ -n "$total" && "$total" -gt 0 ]] +} + +function _pci_active_vf_count() { + local id num + id=$(_pci_normalize_bdf "$1") || { echo 0; return 1; } + num=$(cat "/sys/bus/pci/devices/${id}/sriov_numvfs" 2>/dev/null) + [[ -n "$num" ]] || num=0 + echo "$num" +} + +function _pci_has_active_vfs() { + local n + n=$(_pci_active_vf_count "$1") + [[ "$n" -gt 0 ]] +} + +# Filter an array (by name) of PCI BDFs in place, removing entries that +# are SR-IOV Virtual Functions or Physical 
Functions with active VFs — +# i.e. the configurations ProxMenux refuses to operate on today. +# +# Usage: _pci_sriov_filter_array +# Output: one line per removed entry, formatted "BDF|role" where role is +# whatever _pci_sriov_role prints (e.g. "vf 0000:00:02.0" or +# "pf-active 7"). The caller decides how to surface the removals. +# Returns: 0 if the caller should continue (even if some entries were +# filtered); the array mutation happens either way. +function _pci_sriov_filter_array() { + local -n _arr_ref="$1" + local -a _kept=() + local bdf role first + for bdf in "${_arr_ref[@]}"; do + role=$(_pci_sriov_role "$bdf" 2>/dev/null) + first="${role%% *}" + if [[ "$first" == "vf" || "$first" == "pf-active" ]]; then + echo "${bdf}|${role}" + else + _kept+=("$bdf") + fi + done + _arr_ref=("${_kept[@]}") +} + +# Emits a one-line SR-IOV role description for diagnostics/messages. +# Prints one of: +# "pf-active " — PF with N>0 active VFs +# "pf-idle" — SR-IOV capable PF with 0 VFs (benign) +# "vf " — VF (names its parent PF) +# "none" — device not involved in SR-IOV +function _pci_sriov_role() { + local id + id=$(_pci_normalize_bdf "$1") || { echo "none"; return 0; } + if _pci_is_vf "$id"; then + echo "vf $(_pci_get_pf_of_vf "$id")" + return 0 + fi + if _pci_is_sriov_capable "$id"; then + local n + n=$(_pci_active_vf_count "$id") + if [[ "$n" -gt 0 ]]; then + echo "pf-active ${n}" + else + echo "pf-idle" + fi + return 0 + fi + echo "none" +} diff --git a/scripts/gpu_tpu/add_gpu_lxc.sh b/scripts/gpu_tpu/add_gpu_lxc.sh index c765da6b..762c44ae 100644 --- a/scripts/gpu_tpu/add_gpu_lxc.sh +++ b/scripts/gpu_tpu/add_gpu_lxc.sh @@ -28,6 +28,11 @@ NVIDIA_VID_DID="" if [[ -f "$UTILS_FILE" ]]; then source "$UTILS_FILE" fi +if [[ -f "$LOCAL_SCRIPTS/global/pci_passthrough_helpers.sh" ]]; then + source "$LOCAL_SCRIPTS/global/pci_passthrough_helpers.sh" +elif [[ -f "$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. 
&& pwd)/global/pci_passthrough_helpers.sh" ]]; then + source "$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. && pwd)/global/pci_passthrough_helpers.sh" +fi if [[ -f "$LOCAL_SCRIPTS/global/gpu_hook_guard_helpers.sh" ]]; then source "$LOCAL_SCRIPTS/global/gpu_hook_guard_helpers.sh" elif [[ -f "$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. && pwd)/global/gpu_hook_guard_helpers.sh" ]]; then @@ -259,6 +264,67 @@ select_container() { # ============================================================ # GPU checklist selection # ============================================================ +# ============================================================ +# SR-IOV guard — refuse to pass an SR-IOV GPU to an LXC via ProxMenux. +# Although the LXC flow does not rewrite vfio.conf/blacklist (so it is +# not destructive like add_gpu_vm.sh), it blindly globs /dev/dri/card* +# and /dev/dri/renderD* without mapping each node to its BDF. With 7 +# VFs the container may end up holding any/all of them, which is not +# the behavior a user asking for "one VF to this LXC" expects. Until a +# VF-aware LXC flow exists, stop and point to manual configuration — +# matching the policy used in switch_gpu_mode.sh and add_gpu_vm.sh. 
+# ============================================================ +check_sriov_and_block_if_needed() { + declare -F _pci_sriov_role >/dev/null 2>&1 || return 0 + + local gpu_type pci role first_word + local -a offenders=() + + for gpu_type in "${SELECTED_GPUS[@]}"; do + case "$gpu_type" in + intel) pci="$INTEL_PCI" ;; + amd) pci="$AMD_PCI" ;; + nvidia) pci="$NVIDIA_PCI" ;; + *) continue ;; + esac + [[ -n "$pci" ]] || continue + + role=$(_pci_sriov_role "$pci") + first_word="${role%% *}" + case "$first_word" in + vf) + offenders+=("${pci}|vf|${role#vf }") + ;; + pf-active) + offenders+=("${pci}|pf-active|${role#pf-active }") + ;; + esac + done + + [[ ${#offenders[@]} -eq 0 ]] && return 0 + + local msg entry bdf kind info + msg="\n\Zb\Z6$(translate 'SR-IOV Configuration Detected')\Zn\n\n" + for entry in "${offenders[@]}"; do + bdf="${entry%%|*}" + kind="${entry#*|}"; kind="${kind%%|*}" + info="${entry##*|}" + if [[ "$kind" == "vf" ]]; then + msg+=" • \Zb${bdf}\Zn — $(translate 'Virtual Function (parent PF:') ${info})\n" + else + msg+=" • \Zb${bdf}\Zn — $(translate 'Physical Function with') ${info} $(translate 'active VFs')\n" + fi + done + msg+="\n$(translate 'To pass SR-IOV Virtual Functions to a container, edit the LXC configuration manually via the Proxmox web interface. 
The Physical Function will remain bound to the native driver.')" + + dialog --backtitle "ProxMenux" --colors \ + --title "$(translate 'SR-IOV Configuration Detected')" \ + --msgbox "$msg" 16 82 + + exit 0 +} + + select_gpus() { local gpu_items=() $HAS_INTEL && gpu_items+=("intel" "${INTEL_NAME:-Intel iGPU}" "off") @@ -927,6 +993,7 @@ main() { detect_host_gpus select_container select_gpus + check_sriov_and_block_if_needed check_vfio_switch_mode precheck_existing_lxc_gpu_config diff --git a/scripts/gpu_tpu/add_gpu_vm.sh b/scripts/gpu_tpu/add_gpu_vm.sh index f57360e8..19906686 100644 --- a/scripts/gpu_tpu/add_gpu_vm.sh +++ b/scripts/gpu_tpu/add_gpu_vm.sh @@ -71,6 +71,7 @@ SELECTED_GPU_NAME="" declare -a IOMMU_DEVICES=() # all PCI addrs in IOMMU group (endpoint devices) declare -a IOMMU_VFIO_IDS=() # vendor:device for vfio-pci ids= declare -a EXTRA_AUDIO_DEVICES=() # sibling audio function(s), typically *.1 +declare -a EXTRA_AUDIO_INFO=() # parallel to EXTRA_AUDIO_DEVICES — "BDF|current_driver" pairs for the summary dialog IOMMU_GROUP="" IOMMU_PENDING_REBOOT=false @@ -212,28 +213,32 @@ _strip_colors() { printf '%s' "$1" | sed 's/\\Z[0-9a-zA-Z]//g' } -# Msgbox: dialog in standalone mode, whiptail in wizard mode +# Msgbox: dialog in standalone mode, whiptail in wizard mode. +# I/O pinned to /dev/tty so the dialog renders reliably regardless of +# how the caller redirected stdin/stdout, and immune to the SIGTTOU +# trap that fires when this script is resumed as a background job. 
_pmx_msgbox() { local title="$1" msg="$2" h="${3:-10}" w="${4:-72}" if [[ "$WIZARD_CALL" == "true" ]]; then whiptail --backtitle "ProxMenux" --title "$title" \ - --msgbox "$(_strip_colors "$msg")" "$h" "$w" + --msgbox "$(_strip_colors "$msg")" "$h" "$w" < /dev/tty > /dev/tty else dialog --backtitle "ProxMenux" --colors \ - --title "$title" --msgbox "$msg" "$h" "$w" + --title "$title" --msgbox "$msg" "$h" "$w" < /dev/tty > /dev/tty fi } -# Yesno: dialog in standalone mode, whiptail in wizard mode -# Returns 0 for yes, 1 for no (same as dialog/whiptail) +# Yesno: dialog in standalone mode, whiptail in wizard mode. +# Returns 0 for yes, 1 for no (same as dialog/whiptail). +# I/O pinned to /dev/tty — see the note on _pmx_msgbox. _pmx_yesno() { local title="$1" msg="$2" h="${3:-10}" w="${4:-72}" if [[ "$WIZARD_CALL" == "true" ]]; then whiptail --backtitle "ProxMenux" --title "$title" \ - --yesno "$(_strip_colors "$msg")" "$h" "$w" + --yesno "$(_strip_colors "$msg")" "$h" "$w" < /dev/tty > /dev/tty else dialog --backtitle "ProxMenux" --colors \ - --title "$title" --yesno "$msg" "$h" "$w" + --title "$title" --yesno "$msg" "$h" "$w" < /dev/tty > /dev/tty fi return $? } @@ -265,6 +270,27 @@ _pmx_menu() { return $? } +# Checklist: dialog in standalone mode, whiptail in wizard mode. +# Usage: _pmx_checklist title msg h w list_h tag1 desc1 state1 tag2 desc2 state2 ... +# state is "on" or "off". Returns the space-separated list of selected +# tags on stdout (one line). Returns non-zero if the user cancels. +_pmx_checklist() { + local title="$1" msg="$2" h="$3" w="$4" lh="$5" + shift 5 + if [[ "$WIZARD_CALL" == "true" ]]; then + whiptail --backtitle "ProxMenux" \ + --title "$title" \ + --checklist "$(_strip_colors "$msg")" "$h" "$w" "$lh" \ + "$@" 3>&1 1>&2 2>&3 + else + dialog --backtitle "ProxMenux" --colors \ + --title "$title" \ + --checklist "$msg" "$h" "$w" "$lh" \ + "$@" 2>&1 >/dev/tty + fi + return $? 
+} + _file_has_exact_line() { local line="$1" local file="$2" @@ -718,6 +744,48 @@ select_gpu() { } +# ========================================================== +# SR-IOV guard — refuse to assign a Virtual Function or a Physical +# Function with active VFs. Matches the policy in switch_gpu_mode.sh: +# writing this GPU's vendor:device to /etc/modprobe.d/vfio.conf would +# let vfio-pci claim the PF at next boot and destroy the whole VF +# tree. ProxMenux does not yet manage SR-IOV lifecycle, so we stop +# before touching vfio.conf / blacklist.conf. +# ========================================================== +check_sriov_and_block_if_needed() { + declare -F _pci_sriov_role >/dev/null 2>&1 || return 0 + [[ -n "$SELECTED_GPU_PCI" ]] || return 0 + + local role first_word detail="" + role=$(_pci_sriov_role "$SELECTED_GPU_PCI") + first_word="${role%% *}" + + case "$first_word" in + vf) + local parent="${role#vf }" + detail="$(translate 'The selected device') \Zb${SELECTED_GPU_PCI}\Zn $(translate 'is an SR-IOV Virtual Function (VF). Its parent Physical Function is') \Zb${parent}\Zn." + ;; + pf-active) + local n="${role#pf-active }" + detail="$(translate 'The selected device') \Zb${SELECTED_GPU_PCI}\Zn $(translate 'is a Physical Function with') \Zb${n}\Zn $(translate 'active Virtual Functions. Changing its driver binding would destroy every VF.')" + ;; + *) + return 0 + ;; + esac + + local msg + msg="\n\Zb\Z6$(translate 'SR-IOV Configuration Detected')\Zn\n\n" + msg+="${detail}\n\n" + msg+="$(translate 'To assign VFs to VMs or LXCs, edit the configuration manually via the Proxmox web interface. 
The Physical Function will remain bound to the native driver.')" + + _pmx_msgbox "$(translate 'SR-IOV Configuration Detected')" "$msg" 16 82 + + [[ "$WIZARD_CALL" == "true" ]] && _set_wizard_result "cancelled" + exit 0 +} + + # ========================================================== # Phase 1 — Step 4: Single-GPU warning # ========================================================== @@ -1067,30 +1135,39 @@ analyze_iommu_group() { } -detect_optional_gpu_audio() { - EXTRA_AUDIO_DEVICES=() - - local sibling_audio="${SELECTED_GPU_PCI%.*}.1" - local dev_path="/sys/bus/pci/devices/${sibling_audio}" - [[ -d "$dev_path" ]] || return 0 - +# Returns 0 if the BDF at $1 is a real PCI audio device (class 04xx). +_pci_is_audio_device() { + local bdf="$1" + [[ -n "$bdf" ]] || return 1 + local dev_path="/sys/bus/pci/devices/${bdf}" + [[ -d "$dev_path" ]] || return 1 local class_hex class_hex=$(cat "${dev_path}/class" 2>/dev/null | sed 's/^0x//') - [[ "${class_hex:0:2}" == "04" ]] || return 0 + [[ "${class_hex:0:2}" == "04" ]] +} - local already_in_group=false dev +# Registers an audio BDF for passthrough alongside the GPU. +# Idempotent: skips if the BDF was already recorded by analyze_iommu_group +# (IOMMU_DEVICES) or by a previous call here (EXTRA_AUDIO_DEVICES). +# Updates EXTRA_AUDIO_DEVICES, EXTRA_AUDIO_INFO, and IOMMU_VFIO_IDS. 
+_register_gpu_audio_device() { + local bdf="$1" + [[ -n "$bdf" ]] || return 1 + local dev_path="/sys/bus/pci/devices/${bdf}" + [[ -d "$dev_path" ]] || return 1 + + local dev for dev in "${IOMMU_DEVICES[@]}"; do - if [[ "$dev" == "$sibling_audio" ]]; then - already_in_group=true - break - fi + [[ "$dev" == "$bdf" ]] && return 0 + done + for dev in "${EXTRA_AUDIO_DEVICES[@]}"; do + [[ "$dev" == "$bdf" ]] && return 0 done - if [[ "$already_in_group" == "true" ]]; then - return 0 - fi - - EXTRA_AUDIO_DEVICES+=("$sibling_audio") + EXTRA_AUDIO_DEVICES+=("$bdf") + local drv + drv=$(_get_pci_driver "$bdf") + EXTRA_AUDIO_INFO+=("${bdf}|${drv}") local vid did new_id vid=$(cat "${dev_path}/vendor" 2>/dev/null | sed 's/0x//') @@ -1101,6 +1178,98 @@ detect_optional_gpu_audio() { IOMMU_VFIO_IDS+=("$new_id") fi fi + return 0 +} + +# Scans the host for all class-04 PCI audio devices and lets the user +# pick which ones to pass to the VM. Only invoked when the selected GPU +# has no .1 sibling audio function — the dGPU fast path continues to +# auto-include that sibling without prompting. +# +# Devices already in the GPU's IOMMU group are excluded from the list +# (analyze_iommu_group has already queued them). The checklist defaults +# to all-OFF so nothing gets passed through silently. +_prompt_user_for_audio_devices() { + # Collect eligible audio BDFs from sysfs. + local -a candidates=() + local dev_path bdf + for dev_path in /sys/bus/pci/devices/*; do + [[ -d "$dev_path" ]] || continue + bdf=$(basename "$dev_path") + _pci_is_audio_device "$bdf" || continue + # Skip ones already queued by the IOMMU group sweep. + local skip=false dev + for dev in "${IOMMU_DEVICES[@]}"; do + [[ "$dev" == "$bdf" ]] && { skip=true; break; } + done + $skip && continue + candidates+=("$bdf") + done + + [[ ${#candidates[@]} -eq 0 ]] && return 0 + + # Build checklist items: tag=BDF, description=" (driver: X)". 
+ local -a items=() + local name drv label + for bdf in "${candidates[@]}"; do + name=$(lspci -nn -s "${bdf#0000:}" 2>/dev/null \ + | sed 's/^[^ ]* //' \ + | sed 's/ \[0401\]//; s/ \[0403\]//; s/ \[0400\]//' \ + | cut -c1-52) + [[ -z "$name" ]] && name="PCI audio" + drv=$(_get_pci_driver "$bdf") + label="${name} (driver: ${drv})" + items+=("$bdf" "$label" "off") + done + + local prompt selection dialog_h list_h + prompt="$(translate 'The selected GPU has no dedicated .1 audio sibling function.')\n" + prompt+="$(translate 'If you want HDMI/analog audio inside the VM, select the audio controller(s) to pass through along with the GPU.')\n\n" + prompt+="$(translate 'Default is none (video-only passthrough). Use SPACE to toggle selections.')" + + # Give the list area a floor of 4 rows so a single candidate doesn't + # render cramped under the description. Overall dialog height scales + # with that floor + room for the 4-line prompt, blank line, borders + # and button row. + list_h=${#candidates[@]} + (( list_h < 4 )) && list_h=4 + dialog_h=$(( list_h + 14 )) + + selection=$(_pmx_checklist \ + "$(translate 'Add Audio Passthrough')" \ + "$prompt" \ + "$dialog_h" 82 "$list_h" \ + "${items[@]}") || return 0 + + # dialog wraps selected tags in quotes, whiptail does not — _strip them. + selection=$(echo "$selection" | tr -d '"') + [[ -z "$selection" ]] && return 0 + + local picked + for picked in $selection; do + _register_gpu_audio_device "$picked" + done +} + +detect_optional_gpu_audio() { + EXTRA_AUDIO_DEVICES=() + EXTRA_AUDIO_INFO=() + + # Fast path: dGPUs (NVIDIA / AMD discrete) and some APUs expose audio + # as function .1 of the same slot. When present, auto-include it — + # this is the unambiguous, always-safe case because such audio only + # outputs through the GPU's own ports and was never used by the host. 
+ local sibling_audio="${SELECTED_GPU_PCI%.*}.1" + if _pci_is_audio_device "$sibling_audio"; then + _register_gpu_audio_device "$sibling_audio" + return 0 + fi + + # Slow path: no sibling audio (typical for Intel iGPUs whose HDMI + # audio lives on the PCH, or setups with an external sound card). + # Ask the user explicitly via checklist — the decision of whether to + # pass chipset audio alongside an iGPU is intentional, not automatic. + _prompt_user_for_audio_devices } @@ -1375,8 +1544,19 @@ confirm_summary() { else msg+=" • $(translate 'hostpci entries for all IOMMU group devices')\n" fi - [[ ${#EXTRA_AUDIO_DEVICES[@]} -gt 0 ]] && \ - msg+=" • $(translate 'Additional GPU audio function will be added'): ${EXTRA_AUDIO_DEVICES[*]}\n" + if [[ ${#EXTRA_AUDIO_DEVICES[@]} -gt 0 ]]; then + msg+=" • $(translate 'Additional audio function(s) to be added'):\n" + local _audio_info _audio_bdf _audio_drv + for _audio_info in "${EXTRA_AUDIO_INFO[@]}"; do + _audio_bdf="${_audio_info%%|*}" + _audio_drv="${_audio_info#*|}" + if [[ -n "$_audio_drv" && "$_audio_drv" != "none" && "$_audio_drv" != "vfio-pci" ]]; then + msg+=" • ${_audio_bdf} \Zb(${_audio_drv})\Zn\n" + else + msg+=" • ${_audio_bdf}\n" + fi + done + fi [[ "$SELECTED_GPU" == "nvidia" ]] && \ msg+=" • $(translate 'NVIDIA KVM hiding (cpu hidden=1)')\n" if [[ "$SWITCH_FROM_LXC" == "true" ]]; then @@ -1698,7 +1878,7 @@ cleanup_lxc_configs() { [[ "$SWITCH_FROM_LXC" != "true" ]] && return 0 [[ ${#LXC_AFFECTED_CTIDS[@]} -eq 0 ]] && return 0 - msg_info "$(translate 'Applying selected LXC switch action...')" + msg_info2 "$(translate 'Applying selected LXC switch action')" local i for i in "${!LXC_AFFECTED_CTIDS[@]}"; do @@ -1708,7 +1888,11 @@ cleanup_lxc_configs() { if [[ "${LXC_AFFECTED_RUNNING[$i]}" == "1" ]]; then msg_info "$(translate 'Stopping LXC') ${ctid}..." - if pct stop "$ctid" >>"$LOG_FILE" 2>&1; then + # _pmx_stop_lxc: graceful shutdown with forceStop+timeout, then + # fallback to pct stop. 
Avoids the indefinite hang that raw + # `pct stop` produces when the container is locked or has + # unresponsive processes (Plex, databases, etc.). + if _pmx_stop_lxc "$ctid" "$LOG_FILE"; then msg_ok "$(translate 'LXC stopped') ${ctid}" | tee -a "$screen_capture" else msg_warn "$(translate 'Could not stop LXC') ${ctid}" | tee -a "$screen_capture" @@ -1765,8 +1949,73 @@ cleanup_vm_config() { local src_conf="/etc/pve/qemu-server/${SWITCH_VM_SRC}.conf" if [[ -f "$src_conf" ]]; then msg_info "$(translate 'Removing GPU from VM') ${SWITCH_VM_SRC}..." - sed -i "/^hostpci[0-9]\+:.*${pci_slot}/d" "$src_conf" + # Precise regex: slot must be followed by "." and a + # delimiter. Kept in sync with switch_gpu_mode.sh. A looser + # ".*${pci_slot}" would match the slot as a substring and wipe + # unrelated hostpci entries (e.g. slot "00:02" matching inside + # a dGPU BDF 0000:02:00.0). + sed -E -i "/^hostpci[0-9]+:[[:space:]]*(0000:)?${pci_slot}\.[0-7]([,[:space:]]|$)/d" "$src_conf" msg_ok "$(translate 'GPU removed from VM') ${SWITCH_VM_SRC}" | tee -a "$screen_capture" + + # Cascade cleanup: detect audio companions orphaned in the + # source VM after the GPU slot is removed. Typical case: the + # source VM had an Intel iGPU at 00:02.0 paired with chipset + # audio at 00:1f.3 via the Part 1 checklist — the sed above + # only strips 00:02.* entries, leaving the chipset audio + # hostpci pointing at a device the source VM no longer uses. + # + # Unlike switch_gpu_mode (detach flow), we deliberately do NOT + # touch /etc/modprobe.d/vfio.conf here. The GPU is being moved + # to the current target VM, which may select the same audio + # companion in its own Part 1 checklist. Any vendor:device + # orphaned in vfio.conf after this move is inert — the user + # can clean it up later via switch_gpu_mode if they want. 
+ if declare -F _vm_list_orphan_audio_hostpci >/dev/null 2>&1; then + local _orphan_audio + _orphan_audio=$(_vm_list_orphan_audio_hostpci "$SWITCH_VM_SRC" "$pci_slot") + if [[ -n "$_orphan_audio" ]]; then + local -a _orph_items=() + local _oline _o_idx _o_bdf _o_name + while IFS= read -r _oline; do + [[ -z "$_oline" ]] && continue + _o_idx="${_oline%%|*}" + _oline="${_oline#*|}" + _o_bdf="${_oline%%|*}" + _o_name="${_oline#*|}" + _orph_items+=("$_o_idx" "${_o_bdf} ${_o_name}" "on") + done <<< "$_orphan_audio" + + local _prompt + _prompt="\n$(translate 'The GPU has been moved out of VM') \Zb${SWITCH_VM_SRC}\Zn.\n\n" + _prompt+="$(translate 'The source VM also has these audio devices, likely added together with the GPU. Remove them too?')\n\n" + _prompt+="$(translate '(Checked entries will be removed. Uncheck to keep in VM.)')" + + local _selected + _selected=$(_pmx_checklist \ + "$(translate 'Associated Audio Devices')" \ + "$_prompt" \ + 20 84 "$(( ${#_orph_items[@]} / 3 ))" \ + "${_orph_items[@]}") || _selected="" + _selected=$(echo "$_selected" | tr -d '"') + + local _sel _removed="" + for _sel in $_selected; do + if declare -F _vm_remove_hostpci_index >/dev/null 2>&1; then + _vm_remove_hostpci_index "$SWITCH_VM_SRC" "$_sel" "$LOG_FILE" \ + && _removed+=" hostpci${_sel}" + else + qm set "$SWITCH_VM_SRC" --delete "hostpci${_sel}" >>"$LOG_FILE" 2>&1 \ + && _removed+=" hostpci${_sel}" + fi + done + if [[ -n "$_removed" ]]; then + show_proxmenux_logo + msg_title "${run_title}" + msg_ok "$(translate 'Associated audio removed from VM'): ${SWITCH_VM_SRC} —${_removed}" \ + | tee -a "$screen_capture" + fi + fi + fi fi } @@ -1922,6 +2171,7 @@ main() { detect_host_gpus check_iommu_enabled select_gpu + check_sriov_and_block_if_needed warn_single_gpu select_vm ensure_selected_gpu_not_already_in_target_vm @@ -2025,10 +2275,23 @@ main() { rm -f "$screen_capture" + # Final reboot prompt. 
Whiptail is invoked directly (not through + # the _pmx_yesno helper) because the ProxMenux menu chain + # (menu → main_menu → hw_grafics_menu → add_gpu_vm) has been + # verified to work reliably with a bare whiptail here, while the + # dialog-based helper path hits process-group / TTY edge cases in + # that exact chain. + # + # The extra `Press Enter to continue ... read -r` between whiptail + # and `reboot` is deliberate — it gives the user a visible pause + # after the dialog closes so an accidental Enter on the yes button + # cannot trigger an immediate reboot. if [[ "$HOST_CONFIG_CHANGED" == "true" ]]; then whiptail --title "$(translate 'Reboot Required')" \ --yesno "$(translate 'A reboot is required for VFIO binding to take effect. Do you want to restart now?')" 10 68 if [[ $? -eq 0 ]]; then + msg_success "$(translate 'Press Enter to continue...')" + read -r msg_warn "$(translate 'Rebooting the system...')" reboot else diff --git a/scripts/gpu_tpu/switch_gpu_mode.sh b/scripts/gpu_tpu/switch_gpu_mode.sh index 4ac95e23..6a98a95a 100644 --- a/scripts/gpu_tpu/switch_gpu_mode.sh +++ b/scripts/gpu_tpu/switch_gpu_mode.sh @@ -624,6 +624,75 @@ select_gpus() { read -ra SELECTED_GPU_IDX <<< "$sel" } +# ========================================================== +# SR-IOV guard — abort mode switch when SR-IOV is active +# ========================================================== +# Intel i915-sriov-dkms and AMD MxGPU split a Physical Function (PF) into +# multiple Virtual Functions (VFs). Switching the PF's driver destroys +# every VF; switching a VF's driver affects only that VF. ProxMenux does +# not yet manage the SR-IOV lifecycle (create/destroy VFs, track per-VF +# ownership), so operating on a PF with active VFs — or on a VF itself — +# would leave the user's virtualization stack in an inconsistent state. +# We detect the situation early and hand the user back to the Proxmox +# web UI, which understands VFs as first-class PCI devices. 
+check_sriov_and_block_if_needed() { + declare -F _pci_sriov_role >/dev/null 2>&1 || return 0 + + local idx pci role first_word pf_bdf active_count + local -a vf_list=() + local -a pf_list=() + + for idx in "${SELECTED_GPU_IDX[@]}"; do + pci="${ALL_GPU_PCIS[$idx]}" + role=$(_pci_sriov_role "$pci") + first_word="${role%% *}" + case "$first_word" in + vf) + pf_bdf="${role#vf }" + vf_list+=("${pci}|${pf_bdf}") + ;; + pf-active) + active_count="${role#pf-active }" + pf_list+=("${pci}|${active_count}") + ;; + esac + done + + [[ ${#vf_list[@]} -eq 0 && ${#pf_list[@]} -eq 0 ]] && return 0 + + local title msg entry bdf parent cnt + title="$(translate 'SR-IOV Configuration Detected')" + msg="\n" + + if [[ ${#vf_list[@]} -gt 0 ]]; then + msg+="$(translate 'The following selected device(s) are SR-IOV Virtual Functions (VFs):')\n\n" + for entry in "${vf_list[@]}"; do + bdf="${entry%%|*}" + parent="${entry#*|}" + msg+=" • ${bdf} $(translate '(parent PF:') ${parent})\n" + done + msg+="\n" + fi + + if [[ ${#pf_list[@]} -gt 0 ]]; then + msg+="$(translate 'The following selected device(s) are Physical Functions with active Virtual Functions:')\n\n" + for entry in "${pf_list[@]}"; do + bdf="${entry%%|*}" + cnt="${entry#*|}" + msg+=" • ${bdf} — ${cnt} $(translate 'active VF(s)')\n" + done + msg+="\n" + fi + + msg+="$(translate 'To assign VFs to VMs or LXCs, edit the configuration manually via the Proxmox web interface. The Physical Function will remain bound to the native driver.')" + + dialog --backtitle "ProxMenux" \ + --title "$title" \ + --msgbox "$msg" 20 80 + + exit 0 +} + collect_selected_iommu_ids() { SELECTED_IOMMU_IDS=() SELECTED_PCI_SLOTS=() @@ -766,8 +835,14 @@ apply_lxc_action_for_vm_mode() { if [[ "${LXC_AFFECTED_RUNNING[$i]}" == "1" ]]; then msg_info "$(translate 'Stopping LXC') ${ctid}..." 
- pct stop "$ctid" >>"$LOG_FILE" 2>&1 || true - msg_ok "$(translate 'LXC stopped') ${ctid}" | tee -a "$screen_capture" + # _pmx_stop_lxc: unlock + graceful shutdown with forceStop+timeout, + # fallback to pct stop. Prevents the indefinite hang that raw + # `pct stop` triggers on locked / stuck containers. + if _pmx_stop_lxc "$ctid" "$LOG_FILE"; then + msg_ok "$(translate 'LXC stopped') ${ctid}" | tee -a "$screen_capture" + else + msg_warn "$(translate 'Could not stop LXC') ${ctid}" | tee -a "$screen_capture" + fi fi if [[ "$LXC_ACTION" == "keep_gpu_disable_onboot" && "${LXC_AFFECTED_ONBOOT[$i]}" == "1" ]]; then @@ -879,11 +954,102 @@ apply_vm_action_for_lxc_mode() { fi if [[ "$VM_ACTION" == "remove_gpu_keep_onboot" && -f "$conf" ]]; then + # Primary cleanup: strip hostpci lines whose BDF matches any of + # the GPU's selected slots. Matches both the PF function (.0) and + # any sibling audio or HDMI codec that shares the slot (typical + # for discrete NVIDIA/AMD cards where .1 is the HDMI audio). + # + # Precise regex: the slot must be followed by "." and + # either a delimiter or end-of-line. A looser ".*${slot}" would + # match by pure substring and delete unrelated hostpci entries — + # e.g. slot "00:02" would match inside "0000:02:00.0" (a dGPU at + # 02:00) and wipe both the iGPU and the unrelated dGPU. local slot for slot in "${SELECTED_PCI_SLOTS[@]}"; do - sed -i "/^hostpci[0-9]\+:.*${slot}/d" "$conf" + sed -E -i "/^hostpci[0-9]+:[[:space:]]*(0000:)?${slot}\.[0-7]([,[:space:]]|$)/d" "$conf" done msg_ok "$(translate 'GPU removed from VM config') ${vmid}" | tee -a "$screen_capture" + + # Cascade cleanup: Intel iGPU passthrough typically pairs the GPU + # at 00:02.0 with chipset audio at 00:1f.3, which lives at a + # different slot and therefore survives the sed above. If it + # stays in the VM config after the GPU is gone, the VM either + # fails to start (vfio-pci no longer claims 8086:51c8 after the + # switch-back) or it steals host audio unnecessarily. 
Enumerate + # orphan audio hostpci entries and ask the user what to do. + if declare -F _vm_list_orphan_audio_hostpci >/dev/null 2>&1; then + local _orphan_audio + _orphan_audio=$(_vm_list_orphan_audio_hostpci "$vmid" "${SELECTED_PCI_SLOTS[0]}") + if [[ -n "$_orphan_audio" ]]; then + local -a _orph_items=() + local _line _o_idx _o_bdf _o_name + while IFS= read -r _line; do + [[ -z "$_line" ]] && continue + _o_idx="${_line%%|*}" + _line="${_line#*|}" + _o_bdf="${_line%%|*}" + _o_name="${_line#*|}" + _orph_items+=("$_o_idx" "${_o_bdf} ${_o_name}" "on") + done <<< "$_orphan_audio" + + local _prompt _selected + _prompt="\n$(translate 'The GPU is being detached from VM') \Zb${vmid}\Zn.\n\n" + _prompt+="$(translate 'The VM also has these audio devices assigned via PCI passthrough — typically added together with the GPU. Remove them too?')\n\n" + _prompt+="$(translate '(Checked entries will be removed. Uncheck to keep in VM.)')" + + _selected=$(dialog --backtitle "ProxMenux" --colors \ + --title "$(translate 'Associated Audio Devices')" \ + --checklist "$_prompt" 20 84 "$(( ${#_orph_items[@]} / 3 ))" \ + "${_orph_items[@]}" \ + 2>&1 >/dev/tty) || _selected="" + _selected=$(echo "$_selected" | tr -d '"') + + # Cross-reference table so we can recover each selected idx's + # original BDF (we need it for vendor:device lookup below). 
+ declare -A _orphan_bdf_by_idx=() + local _o_line _o_i _o_b + while IFS= read -r _o_line; do + [[ -z "$_o_line" ]] && continue + _o_i="${_o_line%%|*}" + _o_line="${_o_line#*|}" + _o_b="${_o_line%%|*}" + _orphan_bdf_by_idx["$_o_i"]="$_o_b" + done <<< "$_orphan_audio" + + local _sel _removed_audio="" _rem_bdf _vd_hex _dd_hex _vd_id + for _sel in $_selected; do + _rem_bdf="${_orphan_bdf_by_idx[$_sel]:-}" + if _vm_remove_hostpci_index "$vmid" "$_sel" "$LOG_FILE"; then + _removed_audio+=" hostpci${_sel}" + + # Fix B: if the removed audio BDF is not referenced by any + # OTHER VM, its vendor:device can safely come out of + # /etc/modprobe.d/vfio.conf too. Without this step, + # SELECTED_IOMMU_IDS only held the GPU's own IOMMU group + # (e.g. 8086:46a3 for Intel iGPU) and the companion audio + # id (e.g. 8086:51c8 for chipset audio) survived in + # vfio.conf, so vfio-pci kept claiming it at next boot + # even though nothing used it. + [[ -z "$_rem_bdf" ]] && continue + if ! _pci_bdf_in_any_vm "$_rem_bdf" "${VM_AFFECTED_IDS[@]}"; then + _vd_hex=$(cat "/sys/bus/pci/devices/${_rem_bdf}/vendor" 2>/dev/null | sed 's/^0x//') + _dd_hex=$(cat "/sys/bus/pci/devices/${_rem_bdf}/device" 2>/dev/null | sed 's/^0x//') + if [[ -n "$_vd_hex" && -n "$_dd_hex" ]]; then + _vd_id="${_vd_hex}:${_dd_hex}" + if ! _contains_in_array "$_vd_id" "${SELECTED_IOMMU_IDS[@]}"; then + SELECTED_IOMMU_IDS+=("$_vd_id") + fi + fi + fi + fi + done + unset _orphan_bdf_by_idx + if [[ -n "$_removed_audio" ]]; then + msg_ok "$(translate 'Associated audio removed from VM'): ${_removed_audio# }" \ + | tee -a "$screen_capture" + fi + fi + fi fi done } @@ -1164,6 +1330,7 @@ main() { detect_host_gpus while true; do select_gpus + check_sriov_and_block_if_needed select_target_mode [[ $? 
-eq 2 ]] && continue validate_vm_mode_blocked_ids diff --git a/scripts/gpu_tpu/switch_gpu_mode_direct.sh b/scripts/gpu_tpu/switch_gpu_mode_direct.sh index dfe5536e..20e9ce17 100644 --- a/scripts/gpu_tpu/switch_gpu_mode_direct.sh +++ b/scripts/gpu_tpu/switch_gpu_mode_direct.sh @@ -507,6 +507,67 @@ find_gpu_by_slot() { return 1 } +# ========================================================== +# SR-IOV guard — abort mode switch when SR-IOV is active +# ========================================================== +# Same policy as the interactive switch_gpu_mode.sh: refuse to operate on +# a Virtual Function or on a Physical Function that already has active +# VFs, since flipping drivers in that state collapses the VF tree and +# breaks every guest that was consuming a VF. +check_sriov_and_block_if_needed() { + declare -F _pci_sriov_role >/dev/null 2>&1 || return 0 + + local idx pci role first_word pf_bdf active_count + local -a vf_list=() + local -a pf_list=() + + for idx in "${SELECTED_GPU_IDX[@]}"; do + pci="${ALL_GPU_PCIS[$idx]}" + role=$(_pci_sriov_role "$pci") + first_word="${role%% *}" + case "$first_word" in + vf) + pf_bdf="${role#vf }" + vf_list+=("${pci}|${pf_bdf}") + ;; + pf-active) + active_count="${role#pf-active }" + pf_list+=("${pci}|${active_count}") + ;; + esac + done + + [[ ${#vf_list[@]} -eq 0 && ${#pf_list[@]} -eq 0 ]] && return 0 + + local msg entry bdf parent cnt + msg="
$(translate 'SR-IOV Configuration Detected')
" + + if [[ ${#vf_list[@]} -gt 0 ]]; then + msg+="

$(translate 'The following selected device(s) are SR-IOV Virtual Functions (VFs):')

    " + for entry in "${vf_list[@]}"; do + bdf="${entry%%|*}" + parent="${entry#*|}" + msg+="
  • ${bdf} — $(translate 'parent PF:') ${parent}
  • " + done + msg+="
" + fi + + if [[ ${#pf_list[@]} -gt 0 ]]; then + msg+="

$(translate 'The following selected device(s) are Physical Functions with active Virtual Functions:')

    " + for entry in "${pf_list[@]}"; do + bdf="${entry%%|*}" + cnt="${entry#*|}" + msg+="
  • ${bdf} — ${cnt} $(translate 'active VF(s)')
  • " + done + msg+="
" + fi + + msg+="

$(translate 'To assign VFs to VMs or LXCs, edit the configuration manually via the Proxmox web interface. The Physical Function will remain bound to the native driver.')

" + + hybrid_msgbox "$(translate 'SR-IOV Configuration Detected')" "$msg" + return 1 +} + validate_vm_mode_blocked_ids() { [[ "$TARGET_MODE" != "vm" ]] && return 0 @@ -687,8 +748,14 @@ apply_lxc_action_for_vm_mode() { if [[ "${LXC_AFFECTED_RUNNING[$i]}" == "1" ]]; then msg_info "$(translate 'Stopping LXC') ${ctid}..." - pct stop "$ctid" >>"$LOG_FILE" 2>&1 || true - msg_ok "$(translate 'LXC stopped') ${ctid}" | tee -a "$screen_capture" + # _pmx_stop_lxc: unlock + graceful shutdown with forceStop+timeout, + # fallback to pct stop. Prevents the indefinite hang that raw + # `pct stop` triggers on locked / stuck containers. + if _pmx_stop_lxc "$ctid" "$LOG_FILE"; then + msg_ok "$(translate 'LXC stopped') ${ctid}" | tee -a "$screen_capture" + else + msg_warn "$(translate 'Could not stop LXC') ${ctid}" | tee -a "$screen_capture" + fi fi if [[ "$LXC_ACTION" == "keep_gpu_disable_onboot" && "${LXC_AFFECTED_ONBOOT[$i]}" == "1" ]]; then @@ -804,11 +871,67 @@ apply_vm_action_for_lxc_mode() { fi if [[ "$VM_ACTION" == "remove_gpu_keep_onboot" && -f "$conf" ]]; then + # Primary cleanup: strip hostpci lines whose BDF matches any of + # the GPU's selected slots. Matches both the PF function (.0) and + # sibling audio/HDMI codecs (.1, typical for discrete cards). + # + # Precise regex: the slot must be followed by "." and a + # delimiter. Kept in sync with switch_gpu_mode.sh — a looser + # substring match would wipe unrelated hostpci entries (e.g. slot + # "00:02" matching as a substring inside a dGPU BDF 0000:02:00.0). 
local slot for slot in "${SELECTED_PCI_SLOTS[@]}"; do - sed -i "/^hostpci[0-9]\+:.*${slot}/d" "$conf" + sed -E -i "/^hostpci[0-9]+:[[:space:]]*(0000:)?${slot}\.[0-7]([,[:space:]]|$)/d" "$conf" done msg_ok "$(translate 'GPU removed from VM config') ${vmid}" | tee -a "$screen_capture" + + # Cascade cleanup for the web flow: auto-remove any PCI audio + # hostpci entries at a slot DIFFERENT from the GPU (typical Intel + # iGPU case where 00:1f.3 chipset audio was paired with the iGPU + # at 00:02.0). The helper skips audio devices whose slot already + # has a display sibling in the same VM (HDMI codec of another + # still-present dGPU), so those are not touched. The web runner + # has no good way to render a multi-select checklist, so the + # eligible ones are auto-removed and reported verbatim in the log. + if declare -F _vm_list_orphan_audio_hostpci >/dev/null 2>&1; then + local _orphan_audio _line _o_idx _o_bdf _o_name _removed="" + local _vd_hex _dd_hex _vd_id + _orphan_audio=$(_vm_list_orphan_audio_hostpci "$vmid" "${SELECTED_PCI_SLOTS[0]}") + if [[ -n "$_orphan_audio" ]]; then + while IFS= read -r _line; do + [[ -z "$_line" ]] && continue + _o_idx="${_line%%|*}" + _line="${_line#*|}" + _o_bdf="${_line%%|*}" + _o_name="${_line#*|}" + if _vm_remove_hostpci_index "$vmid" "$_o_idx" "$LOG_FILE"; then + _removed+=" • hostpci${_o_idx}: ${_o_bdf} ${_o_name}\n" + + # Fix B: also surface the audio's vendor:device to the + # upcoming vfio.conf cleanup if no other VM still uses + # this BDF. Ensures e.g. 8086:51c8 (Intel chipset audio) + # is stripped from /etc/modprobe.d/vfio.conf when the + # iGPU it was paired with leaves VM mode. + if declare -F _pci_bdf_in_any_vm >/dev/null 2>&1 \ + && ! 
_pci_bdf_in_any_vm "$_o_bdf" "${VM_AFFECTED_IDS[@]}"; then + _vd_hex=$(cat "/sys/bus/pci/devices/${_o_bdf}/vendor" 2>/dev/null | sed 's/^0x//') + _dd_hex=$(cat "/sys/bus/pci/devices/${_o_bdf}/device" 2>/dev/null | sed 's/^0x//') + if [[ -n "$_vd_hex" && -n "$_dd_hex" ]]; then + _vd_id="${_vd_hex}:${_dd_hex}" + if ! _contains_in_array "$_vd_id" "${SELECTED_IOMMU_IDS[@]}"; then + SELECTED_IOMMU_IDS+=("$_vd_id") + fi + fi + fi + fi + done <<< "$_orphan_audio" + if [[ -n "$_removed" ]]; then + msg_ok "$(translate 'Associated audio removed from VM'): ${vmid}" \ + | tee -a "$screen_capture" + echo -e "$_removed" | tee -a "$screen_capture" + fi + fi + fi fi done } @@ -1147,6 +1270,12 @@ main() { exit 1 fi + # SR-IOV guard: refuse to toggle the driver on a VF or on a PF with + # active VFs. Manual handling via Proxmox web UI is required. + if ! check_sriov_and_block_if_needed; then + exit 1 + fi + # Validate if GPU is blocked for VM mode (certain Intel GPUs) if ! validate_vm_mode_blocked_ids; then exit 1 diff --git a/scripts/storage/add_controller_nvme_vm.sh b/scripts/storage/add_controller_nvme_vm.sh index 3b4c6644..3ddf1dfd 100644 --- a/scripts/storage/add_controller_nvme_vm.sh +++ b/scripts/storage/add_controller_nvme_vm.sh @@ -364,6 +364,41 @@ select_controller_nvme() { return 1 fi + # SR-IOV guard: drop VFs / active PFs and inform the user. Same policy + # as add_gpu_vm.sh and the VM creators — refuse to rewrite host VFIO + # config for an SR-IOV device since it would collapse the VF tree. 
+ if declare -F _pci_sriov_filter_array >/dev/null 2>&1; then + local sriov_removed="" + sriov_removed=$(_pci_sriov_filter_array SELECTED_CONTROLLER_PCIS) + if [[ -n "$sriov_removed" ]]; then + local sriov_msg="" + sriov_msg="\n$(translate "The following devices were excluded because they are part of an SR-IOV configuration:")\n" + local entry bdf role first + while IFS= read -r entry; do + [[ -z "$entry" ]] && continue + bdf="${entry%%|*}" + role="${entry#*|}" + first="${role%% *}" + if [[ "$first" == "vf" ]]; then + sriov_msg+="\n • ${bdf} — $(translate "Virtual Function")" + else + sriov_msg+="\n • ${bdf} — $(translate "Physical Function with") ${role#pf-active } $(translate "active VFs")" + fi + done <<< "$sriov_removed" + sriov_msg+="\n\n$(translate "To pass SR-IOV Virtual Functions to a VM, edit the VM configuration manually via the Proxmox web interface.")" + dialog --backtitle "ProxMenux" --colors \ + --title "$(translate "SR-IOV Configuration Detected")" \ + --msgbox "$sriov_msg" 18 82 + fi + + if [[ ${#SELECTED_CONTROLLER_PCIS[@]} -eq 0 ]]; then + dialog --backtitle "ProxMenux" \ + --title "$(translate "Controller + NVMe")" \ + --msgbox "\n$(translate "No eligible controllers remain after SR-IOV filtering.")" 8 70 + return 1 + fi + fi + return 0 } diff --git a/scripts/vm/synology.sh b/scripts/vm/synology.sh index 19984774..10756735 100644 --- a/scripts/vm/synology.sh +++ b/scripts/vm/synology.sh @@ -1255,6 +1255,48 @@ if [[ ${#EFFECTIVE_IMPORT_DISKS[@]} -gt 0 ]]; then done fi +if [[ ${#CONTROLLER_NVME_PCIS[@]} -gt 0 ]]; then + # SR-IOV guard: exclude VFs / active PFs before staging. Mid-flow + # phase-2 output; a whiptail msgbox stops the scrolling so the user + # actually sees which devices were dropped. After the ack, each + # skipped BDF is logged via msg_warn so the action is visible in the + # captured log as well. 
+ if declare -F _pci_sriov_filter_array >/dev/null 2>&1; then + SRIOV_REMOVED=$(_pci_sriov_filter_array CONTROLLER_NVME_PCIS) + if [[ -n "$SRIOV_REMOVED" ]]; then + SRIOV_MSG="" + SRIOV_BDFS=() + SRIOV_NL=$'\n' + SRIOV_MSG="$(translate "The following devices were excluded from Controller/NVMe passthrough because they are part of an SR-IOV configuration:")" + while IFS= read -r SRIOV_ENTRY; do + [[ -z "$SRIOV_ENTRY" ]] && continue + SRIOV_BDF="${SRIOV_ENTRY%%|*}" + SRIOV_ROLE="${SRIOV_ENTRY#*|}" + SRIOV_FIRST="${SRIOV_ROLE%% *}" + SRIOV_BDFS+=("$SRIOV_BDF") + if [[ "$SRIOV_FIRST" == "vf" ]]; then + SRIOV_MSG+="${SRIOV_NL} • ${SRIOV_BDF} — $(translate "Virtual Function")" + else + SRIOV_MSG+="${SRIOV_NL} • ${SRIOV_BDF} — $(translate "Physical Function with") ${SRIOV_ROLE#pf-active } $(translate "active VFs")" + fi + done <<< "$SRIOV_REMOVED" + SRIOV_MSG+="${SRIOV_NL}${SRIOV_NL}$(translate "To pass SR-IOV Virtual Functions to a VM, edit the VM configuration manually via the Proxmox web interface.")" + + whiptail --backtitle "ProxMenux" \ + --title "$(translate "SR-IOV Configuration Detected")" \ + --msgbox "$SRIOV_MSG" 18 82 + + for SRIOV_SKIPPED in "${SRIOV_BDFS[@]}"; do + msg_warn "$(translate "Skipping SR-IOV device"): ${SRIOV_SKIPPED}" + done + fi + fi + + if [[ ${#CONTROLLER_NVME_PCIS[@]} -eq 0 ]]; then + msg_warn "$(translate "No eligible Controller/NVMe devices remain after SR-IOV filtering. Skipping.")" + fi +fi + if [[ ${#CONTROLLER_NVME_PCIS[@]} -gt 0 ]]; then local CONTROLLER_CAN_STAGE=true if declare -F _pci_is_iommu_active >/dev/null 2>&1 && ! _pci_is_iommu_active; then diff --git a/scripts/vm/vm_creator.sh b/scripts/vm/vm_creator.sh index da035e65..3218995a 100644 --- a/scripts/vm/vm_creator.sh +++ b/scripts/vm/vm_creator.sh @@ -468,6 +468,55 @@ fi done fi + if [[ ${#CONTROLLER_NVME_PCIS[@]} -gt 0 ]]; then + # SR-IOV guard: drop Virtual Functions / active-PFs before staging. 
+ # Proxmox's VFIO rebind via qm hostpci would trigger the same VF-tree + # collapse described in the GPU flows, so we exclude them and tell + # the user to manage those passthroughs manually. + # + # UI choice: this runs mid-flow (phase 2 of the wizard, interleaved + # with msg_info/msg_ok output), so a whiptail msgbox is used to force + # the user to acknowledge the exclusion instead of letting the notice + # scroll by with the rest of the processing output. After the user + # clicks OK, a per-device msg_warn is emitted so the skipped BDFs + # remain visible in the captured log. + if declare -F _pci_sriov_filter_array >/dev/null 2>&1; then + local _sriov_removed="" + _sriov_removed=$(_pci_sriov_filter_array CONTROLLER_NVME_PCIS) + if [[ -n "$_sriov_removed" ]]; then + local _sriov_msg="" _entry _bdf _role _first _sb + local -a _sriov_bdfs=() + local _nl=$'\n' + _sriov_msg="$(translate "The following devices were excluded from Controller/NVMe passthrough because they are part of an SR-IOV configuration:")" + while IFS= read -r _entry; do + [[ -z "$_entry" ]] && continue + _bdf="${_entry%%|*}" + _role="${_entry#*|}" + _first="${_role%% *}" + _sriov_bdfs+=("$_bdf") + if [[ "$_first" == "vf" ]]; then + _sriov_msg+="${_nl} • ${_bdf} — $(translate "Virtual Function")" + else + _sriov_msg+="${_nl} • ${_bdf} — $(translate "Physical Function with") ${_role#pf-active } $(translate "active VFs")" + fi + done <<< "$_sriov_removed" + _sriov_msg+="${_nl}${_nl}$(translate "To pass SR-IOV Virtual Functions to a VM, edit the VM configuration manually via the Proxmox web interface.")" + + whiptail --backtitle "ProxMenux" \ + --title "$(translate "SR-IOV Configuration Detected")" \ + --msgbox "$_sriov_msg" 18 82 + + for _sb in "${_sriov_bdfs[@]}"; do + msg_warn "$(translate "Skipping SR-IOV device"): ${_sb}" + done + fi + fi + + if [[ ${#CONTROLLER_NVME_PCIS[@]} -eq 0 ]]; then + msg_warn "$(translate "No eligible Controller/NVMe devices remain after SR-IOV filtering. 
Skipping.")" + fi + fi + if [[ ${#CONTROLLER_NVME_PCIS[@]} -gt 0 ]]; then local CONTROLLER_CAN_STAGE=true if declare -F _pci_is_iommu_active >/dev/null 2>&1 && ! _pci_is_iommu_active; then diff --git a/scripts/vm/zimaos.sh b/scripts/vm/zimaos.sh index 7ec02c54..ef609381 100644 --- a/scripts/vm/zimaos.sh +++ b/scripts/vm/zimaos.sh @@ -1270,6 +1270,48 @@ function create_vm() { done fi + if [[ ${#CONTROLLER_NVME_PCIS[@]} -gt 0 ]]; then + # SR-IOV guard: mirror of the synology.sh/vm_creator.sh block — + # drop VFs and active-PF devices before staging so Proxmox does + # not collapse the VF tree at VM start. Mid-flow, so the notice + # goes through whiptail (blocking acknowledgment) and each + # skipped BDF is then echoed via msg_warn for the log trail. + if declare -F _pci_sriov_filter_array >/dev/null 2>&1; then + SRIOV_REMOVED=$(_pci_sriov_filter_array CONTROLLER_NVME_PCIS) + if [[ -n "$SRIOV_REMOVED" ]]; then + SRIOV_MSG="" + SRIOV_BDFS=() + SRIOV_NL=$'\n' + SRIOV_MSG="$(translate "The following devices were excluded from Controller/NVMe passthrough because they are part of an SR-IOV configuration:")" + while IFS= read -r SRIOV_ENTRY; do + [[ -z "$SRIOV_ENTRY" ]] && continue + SRIOV_BDF="${SRIOV_ENTRY%%|*}" + SRIOV_ROLE="${SRIOV_ENTRY#*|}" + SRIOV_FIRST="${SRIOV_ROLE%% *}" + SRIOV_BDFS+=("$SRIOV_BDF") + if [[ "$SRIOV_FIRST" == "vf" ]]; then + SRIOV_MSG+="${SRIOV_NL} • ${SRIOV_BDF} — $(translate "Virtual Function")" + else + SRIOV_MSG+="${SRIOV_NL} • ${SRIOV_BDF} — $(translate "Physical Function with") ${SRIOV_ROLE#pf-active } $(translate "active VFs")" + fi + done <<< "$SRIOV_REMOVED" + SRIOV_MSG+="${SRIOV_NL}${SRIOV_NL}$(translate "To pass SR-IOV Virtual Functions to a VM, edit the VM configuration manually via the Proxmox web interface.")" + + whiptail --backtitle "ProxMenux" \ + --title "$(translate "SR-IOV Configuration Detected")" \ + --msgbox "$SRIOV_MSG" 18 82 + + for SRIOV_SKIPPED in "${SRIOV_BDFS[@]}"; do + msg_warn "$(translate "Skipping SR-IOV device"): 
${SRIOV_SKIPPED}" + done + fi + fi + + if [[ ${#CONTROLLER_NVME_PCIS[@]} -eq 0 ]]; then + msg_warn "$(translate "No eligible Controller/NVMe devices remain after SR-IOV filtering. Skipping.")" + fi + fi + if [[ ${#CONTROLLER_NVME_PCIS[@]} -gt 0 ]]; then local CONTROLLER_CAN_STAGE=true if declare -F _pci_is_iommu_active >/dev/null 2>&1 && ! _pci_is_iommu_active; then diff --git a/version.txt b/version.txt index 26aaba0e..6085e946 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -1.2.0 +1.2.1