diff --git a/AppImage/config/verified_ai_models.json b/AppImage/config/verified_ai_models.json
index 2e0c716d..da915473 100644
--- a/AppImage/config/verified_ai_models.json
+++ b/AppImage/config/verified_ai_models.json
@@ -1,7 +1,8 @@
 {
   "_description": "Verified AI models for ProxMenux notifications. Only models listed here will be shown to users. Models are tested to work with the chat/completions API format.",
-  "_updated": "2026-03-20",
-
+  "_updated": "2026-04-19",
+  "_verifier": "Refreshed with tools/ai-models-verifier (private). Re-run before each ProxMenux release to keep the list current. The verifier and ProxMenux share the same reasoning/thinking-model handlers so their verdicts stay aligned with runtime behaviour.",
+
   "groq": {
     "models": [
       "llama-3.3-70b-versatile",
@@ -12,37 +13,46 @@
       "mixtral-8x7b-32768",
       "gemma2-9b-it"
     ],
-    "recommended": "llama-3.3-70b-versatile"
+    "recommended": "llama-3.3-70b-versatile",
+    "_note": "Not yet re-verified in 2026-04 refresh — kept from previous curation. Run the verifier with a Groq key to prune deprecated entries."
   },
-
+
   "gemini": {
     "models": [
-      "gemini-2.5-flash",
       "gemini-2.5-flash-lite",
-      "gemini-2.5-pro"
+      "gemini-2.5-flash",
+      "gemini-3-flash-preview"
     ],
-    "recommended": "gemini-2.5-flash",
-    "_note": "gemini-2.5-flash-lite is cheaper but may struggle with complex prompts. Use with simple/custom prompts.",
+    "recommended": "gemini-2.5-flash-lite",
+    "_note": "flash-lite / flash pass the verifier consistently; pro variants reject thinkingBudget=0 and are overkill for notification translation anyway. 'latest' aliases (gemini-flash-latest, gemini-flash-lite-latest) are intentionally omitted because they resolved to different models across runs and produced timeouts in some regions.",
     "_deprecated": ["gemini-2.0-flash", "gemini-2.0-flash-lite", "gemini-1.5-flash", "gemini-1.0-pro", "gemini-pro"]
   },
-
+
   "openai": {
     "models": [
+      "gpt-4.1-nano",
       "gpt-4.1-mini",
-      "gpt-4o-mini"
+      "gpt-4o-mini",
+      "gpt-4.1",
+      "gpt-4o",
+      "gpt-5-chat-latest",
+      "gpt-5.4-nano",
+      "gpt-5.4-mini"
     ],
-    "recommended": "gpt-4o-mini"
+    "recommended": "gpt-4.1-nano",
+    "_note": "Reasoning models (o-series, gpt-5/5.1/5.2 non-chat variants) are supported by openai_provider.py via max_completion_tokens + reasoning_effort=minimal, but not listed here by default: their latency is higher than the chat models and they do not improve translation quality for notifications. Add specific reasoning IDs to this list only if a user explicitly wants them."
   },
-
+
   "anthropic": {
     "models": [
       "claude-3-5-haiku-latest",
       "claude-3-5-sonnet-latest",
       "claude-3-opus-latest"
     ],
-    "recommended": "claude-3-5-haiku-latest"
+    "recommended": "claude-3-5-haiku-latest",
+    "_note": "Not re-verified in 2026-04 refresh — kept from previous curation. Add claude-4.x / claude-4.5 / claude-4.6 / claude-4.7 variants after running the verifier with an Anthropic key."
   },
-
+
   "openrouter": {
     "models": [
       "meta-llama/llama-3.3-70b-instruct",
@@ -50,14 +60,15 @@
       "meta-llama/llama-3.1-8b-instruct",
       "anthropic/claude-3.5-haiku",
       "anthropic/claude-3.5-sonnet",
-      "google/gemini-flash-2.5-flash-lite",
+      "google/gemini-flash-1.5",
       "openai/gpt-4o-mini",
       "mistralai/mistral-7b-instruct",
       "mistralai/mixtral-8x7b-instruct"
     ],
-    "recommended": "meta-llama/llama-3.3-70b-instruct"
+    "recommended": "meta-llama/llama-3.3-70b-instruct",
+    "_note": "Not re-verified in 2026-04 refresh. google/gemini-flash-2.5-flash-lite was malformed in the previous entry and has been replaced with google/gemini-flash-1.5."
   },
-
+
   "ollama": {
     "_note": "Ollama models are local, we don't filter them. User manages their own models.",
     "models": [],
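A quick structural check that pairs with the config above: every provider's "recommended" value should also appear in its "models" array (ollama is exempt because its list is intentionally empty). This is a minimal standalone sketch, not ProxMenux code; the file path and variable names are only illustrative.

    import json

    with open('AppImage/config/verified_ai_models.json') as fh:
        cfg = json.load(fh)

    for provider, entry in cfg.items():
        # Skip metadata keys such as "_description", "_updated", "_verifier".
        if provider.startswith('_') or not isinstance(entry, dict):
            continue
        models = entry.get('models', [])
        recommended = entry.get('recommended', '')
        if models and recommended and recommended not in models:
            print(f'{provider}: recommended "{recommended}" is not in its models list')
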
diff --git a/AppImage/scripts/ai_providers/gemini_provider.py b/AppImage/scripts/ai_providers/gemini_provider.py
index 49224fb6..85d251b8 100644
--- a/AppImage/scripts/ai_providers/gemini_provider.py
+++ b/AppImage/scripts/ai_providers/gemini_provider.py
@@ -30,6 +30,23 @@ class GeminiProvider(AIProvider):
         'gemini-1.0-pro',
         'gemini-pro',
     ]
+
+    @staticmethod
+    def _has_thinking_mode(model: str) -> bool:
+        """True for Gemini variants that enable "thinking" by default.
+
+        Gemini 2.5+ and 3.x Pro/Flash models spend output tokens on
+        internal reasoning before emitting the final answer. With a small
+        max_tokens budget (≤250) that consumes the whole allowance and
+        leaves an empty reply. For the short translate/explain use case
+        in ProxMenux we want direct output, so we disable thinking for
+        these. Lite variants (flash-lite) do NOT have thinking enabled
+        and are safe to leave alone.
+        """
+        m = model.lower()
+        if 'lite' in m:
+            return False
+        return m.startswith('gemini-2.5') or m.startswith('gemini-3')

     def list_models(self) -> List[str]:
         """List available Gemini models that support generateContent.
@@ -118,6 +135,18 @@ class GeminiProvider(AIProvider):
         url = f"{self.API_BASE}/{self.model}:generateContent?key={self.api_key}"

         # Gemini uses a specific format with contents array
+        gen_config = {
+            'maxOutputTokens': max_tokens,
+            'temperature': 0.3,
+        }
+
+        # Disable thinking on 2.5+ / 3.x pro & flash models so the limited
+        # output budget actually produces visible text. thinkingBudget=0
+        # is the official switch for this; lite variants and legacy
+        # models don't need (and ignore) the field.
+        if self._has_thinking_mode(self.model):
+            gen_config['thinkingConfig'] = {'thinkingBudget': 0}
+
         payload = {
             'systemInstruction': {
                 'parts': [{'text': system_prompt}]
@@ -128,10 +157,7 @@
                     'parts': [{'text': user_message}]
                 }
             ],
-            'generationConfig': {
-                'maxOutputTokens': max_tokens,
-                'temperature': 0.3,
-            }
+            'generationConfig': gen_config,
         }

         headers = {
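For reference, a condensed sketch of the generationConfig that the patched send_message() builds for the two Gemini families; the helper name build_generation_config is illustrative and does not exist in gemini_provider.py.

    def build_generation_config(model: str, max_tokens: int) -> dict:
        gen_config = {'maxOutputTokens': max_tokens, 'temperature': 0.3}
        m = model.lower()
        # Same rule as _has_thinking_mode(): 2.5+/3.x non-lite variants think by default.
        if 'lite' not in m and (m.startswith('gemini-2.5') or m.startswith('gemini-3')):
            # Spend the whole (small) output budget on the visible answer.
            gen_config['thinkingConfig'] = {'thinkingBudget': 0}
        return gen_config

    # Thinking disabled for gemini-2.5-flash, field omitted for flash-lite:
    assert 'thinkingConfig' in build_generation_config('gemini-2.5-flash', 200)
    assert 'thinkingConfig' not in build_generation_config('gemini-2.5-flash-lite', 200)
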
diff --git a/AppImage/scripts/ai_providers/openai_provider.py b/AppImage/scripts/ai_providers/openai_provider.py
index d5877da5..86484767 100644
--- a/AppImage/scripts/ai_providers/openai_provider.py
+++ b/AppImage/scripts/ai_providers/openai_provider.py
@@ -37,23 +37,49 @@ class OpenAIProvider(AIProvider):

     # Recommended models for chat (in priority order)
     RECOMMENDED_PREFIXES = ['gpt-4o-mini', 'gpt-4o', 'gpt-4-turbo', 'gpt-4', 'gpt-3.5-turbo']
+
+    @staticmethod
+    def _is_reasoning_model(model: str) -> bool:
+        """True for OpenAI reasoning models (o-series + non-chat gpt-5+).
+
+        These use a stricter API contract than chat models:
+        - Must use ``max_completion_tokens`` instead of ``max_tokens``
+        - ``temperature`` is not accepted (only the default is supported)
+
+        Chat-optimized variants (``gpt-5-chat-latest``,
+        ``gpt-5.1-chat-latest``, etc.) keep the classic contract and are
+        NOT flagged here.
+        """
+        m = model.lower()
+        # o1, o3, o4, o5 ... (o...)
+        if len(m) >= 2 and m[0] == 'o' and m[1].isdigit():
+            return True
+        # gpt-5, gpt-5-mini, gpt-5.1, gpt-5.2-pro ... EXCEPT *-chat-latest
+        if m.startswith('gpt-5') and '-chat' not in m:
+            return True
+        return False

     def list_models(self) -> List[str]:
-        """List available OpenAI models for chat completions.
-
-        Filters to only chat-capable models, excluding:
-        - Embedding models
-        - Audio/speech models (whisper, tts)
-        - Image models (dall-e)
-        - Instruct models (different API)
-        - Legacy models (babbage, davinci, etc.)
-
+        """List available models for chat completions.
+
+        Two modes:
+        - Official OpenAI (no custom base_url): restrict to GPT chat models,
+          excluding embedding/whisper/tts/dall-e/instruct/legacy variants.
+        - OpenAI-compatible endpoint (LiteLLM, MLX, LM Studio, vLLM,
+          LocalAI, Ollama-proxy, etc.): the "gpt" substring check is
+          dropped so user-served models (e.g. ``mlx-community/Llama-3.1-8B``,
+          ``Qwen3-32B``, ``mistralai/...``) show up. EXCLUDED_PATTERNS
+          still applies — embeddings/whisper/tts aren't chat-capable on
+          any backend.
+
         Returns:
             List of model IDs suitable for chat completions.
         """
         if not self.api_key:
             return []
-
+
+        is_custom_endpoint = bool(self.base_url)
+
         try:
             # Determine models URL from base_url if set
             if self.base_url:
@@ -63,42 +89,46 @@
                 models_url = f"{base}/models"
             else:
                 models_url = self.DEFAULT_MODELS_URL
-
+
             req = urllib.request.Request(
                 models_url,
                 headers={'Authorization': f'Bearer {self.api_key}'},
                 method='GET'
             )
-
+
            with urllib.request.urlopen(req, timeout=10) as resp:
                 data = json.loads(resp.read().decode('utf-8'))
-
+
             models = []
             for model in data.get('data', []):
                 model_id = model.get('id', '')
                 if not model_id:
                     continue
-
+
                 model_lower = model_id.lower()
-
-                # Must be a GPT model
-                if 'gpt' not in model_lower:
+
+                # Official OpenAI: restrict to GPT chat models. Custom
+                # endpoints serve arbitrarily named models, so this
+                # substring check would drop every valid result there.
+                if not is_custom_endpoint and 'gpt' not in model_lower:
                     continue
-
-                # Exclude non-chat models
+
+                # Exclude non-chat models on every backend.
                 if any(pattern in model_lower for pattern in self.EXCLUDED_PATTERNS):
                     continue
-
+
                 models.append(model_id)
-
-            # Sort with recommended models first
+
+            # Sort with recommended models first (only meaningful for OpenAI
+            # official; on custom endpoints the prefixes rarely match, so
+            # entries fall through to alphabetical order, which is fine).
             def sort_key(m):
                 m_lower = m.lower()
                 for i, prefix in enumerate(self.RECOMMENDED_PREFIXES):
                     if m_lower.startswith(prefix):
                         return (i, m)
                 return (len(self.RECOMMENDED_PREFIXES), m)
-
+
             return sorted(models, key=sort_key)
         except Exception as e:
             print(f"[OpenAIProvider] Failed to list models: {e}")
@@ -133,17 +163,35 @@
         """
         if not self.api_key:
             raise AIProviderError("API key required for OpenAI")
-
+
         payload = {
             'model': self.model,
             'messages': [
                 {'role': 'system', 'content': system_prompt},
                 {'role': 'user', 'content': user_message},
             ],
-            'max_tokens': max_tokens,
-            'temperature': 0.3,
         }
-
+
+        # Reasoning models (o1/o3/o4/gpt-5*, excluding *-chat-latest) use a
+        # different parameter contract: max_completion_tokens instead of
+        # max_tokens, and no temperature field. Sending the classic chat
+        # parameters to them produces HTTP 400 Bad Request.
+        #
+        # They also spend output budget on internal reasoning by default,
+        # which empties the user-visible reply when max_tokens is small
+        # (like the ~200 we use for notifications). reasoning_effort
+        # 'minimal' keeps that internal reasoning to a minimum so the
+        # entire budget is available for the translation, which is
+        # exactly what this pipeline wants. OpenAI documents 'minimal',
+        # 'low', 'medium', 'high' — 'minimal' is the right setting for a
+        # straightforward translate+explain task.
+        if self._is_reasoning_model(self.model):
+            payload['max_completion_tokens'] = max_tokens
+            payload['reasoning_effort'] = 'minimal'
+        else:
+            payload['max_tokens'] = max_tokens
+            payload['temperature'] = 0.3
+
         headers = {
             'Content-Type': 'application/json',
             'Authorization': f'Bearer {self.api_key}',
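A condensed sketch of the two request contracts the patched send_message() now switches between; build_payload is an illustrative helper, not part of openai_provider.py.

    def build_payload(model: str, system_prompt: str, user_message: str,
                      max_tokens: int = 200) -> dict:
        payload = {
            'model': model,
            'messages': [
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': user_message},
            ],
        }
        m = model.lower()
        is_reasoning = (len(m) >= 2 and m[0] == 'o' and m[1].isdigit()) or \
                       (m.startswith('gpt-5') and '-chat' not in m)
        if is_reasoning:
            # Reasoning contract: token cap via max_completion_tokens, no temperature.
            payload['max_completion_tokens'] = max_tokens
            payload['reasoning_effort'] = 'minimal'
        else:
            # Classic chat contract.
            payload['max_tokens'] = max_tokens
            payload['temperature'] = 0.3
        return payload

    # 'gpt-5.4-nano' gets the reasoning contract, 'gpt-5-chat-latest' the classic one.
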
diff --git a/AppImage/scripts/flask_notification_routes.py b/AppImage/scripts/flask_notification_routes.py
index 7c3294b7..024804a0 100644
--- a/AppImage/scripts/flask_notification_routes.py
+++ b/AppImage/scripts/flask_notification_routes.py
@@ -220,10 +220,20 @@ def get_provider_models():

     # Get all models from provider API
     api_models = ai_provider.list_models()
-
+
+    # OpenAI with a custom base URL means an OpenAI-compatible endpoint
+    # (LiteLLM, MLX, LM Studio, vLLM, LocalAI, Ollama-proxy...). The
+    # verified_ai_models.json list only contains official OpenAI IDs
+    # (gpt-4o-mini etc.), so intersecting against it would strip every
+    # model the user actually serves. Treat the custom-endpoint case
+    # like Ollama: return whatever the endpoint advertises, no filter.
+    is_openai_compat = (provider == 'openai' and bool(openai_base_url))
+
     if not api_models:
-        # API failed, fall back to verified list only
-        if verified_models:
+        # API failed, fall back to verified list only (but not for
+        # custom endpoints — we don't know what the endpoint serves,
+        # so "gpt-4o-mini" as a fallback would be misleading).
+        if verified_models and not is_openai_compat:
             models = sorted(verified_models)
             return jsonify({
                 'success': True,
@@ -232,27 +242,38 @@
                 'message': f'{len(models)} verified models (API unavailable)'
             })
         return jsonify({
-            'success': False,
-            'models': [],
-            'message': 'Could not retrieve models. Check your API key.'
+            'success': False,
+            'models': [],
+            'message': 'Could not retrieve models. Check your API key and endpoint URL.'
         })
-
+
+    if is_openai_compat:
+        # Custom OpenAI-compatible endpoint: surface every model the
+        # endpoint reports. No verified-list intersection.
+        models = sorted(api_models)
+        return jsonify({
+            'success': True,
+            'models': models,
+            'recommended': models[0] if models else '',
+            'message': f'Found {len(models)} models on custom endpoint'
+        })
+
     # Filter: only models that are BOTH in API and verified list
     if verified_models:
         api_models_set = set(api_models)
         filtered_models = [m for m in verified_models if m in api_models_set]
-
+
         if not filtered_models:
             # No intersection - maybe verified list is outdated
             # Return verified list anyway (will fail on use if truly unavailable)
             filtered_models = list(verified_models)
-
+
         # Sort with recommended first
         def sort_key(m):
            if m == recommended:
                return (0, m)
            return (1, m)
-
+
         models = sorted(filtered_models, key=sort_key)
     else:
         # No verified list for this provider, return all from API
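The decision order in the handler above, condensed into a pure function for readability; pick_models and its argument names are illustrative, the real logic stays inside the Flask route.

    def pick_models(provider, openai_base_url, api_models, verified_models):
        is_openai_compat = (provider == 'openai' and bool(openai_base_url))
        if not api_models:
            # Verified-list fallback only makes sense for the official endpoint.
            if verified_models and not is_openai_compat:
                return sorted(verified_models)
            return []
        if is_openai_compat:
            # Custom endpoint: trust whatever it advertises, no filtering.
            return sorted(api_models)
        if verified_models:
            filtered = [m for m in verified_models if m in set(api_models)]
            # Empty intersection usually means the verified list is stale.
            return filtered or list(verified_models)
        return api_models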