diff --git a/main.py b/main.py
deleted file mode 100644
index d5bf1c1..0000000
--- a/main.py
+++ /dev/null
@@ -1,311 +0,0 @@
-# main.py (tuned for faster Phi-3.1-mini / Phi-2-mini style models on CPU)
-from fastapi import FastAPI, Request
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse
-from pydantic import BaseModel
-import os
-import sys
-import threading
-import json
-import time
-import multiprocessing
-from functools import lru_cache
-
-# llama_cpp import
-from llama_cpp import Llama
-
-# -------------------------
-# Configuration (env overrides allowed)
-# -------------------------
-DEFAULT_MODEL = os.environ.get("LLM_MODEL_PATH", "models/Phi-3.1-mini-4k-instruct-Q4_K_M.gguf")
-
-# sensible defaults for your 13GB / 9-core VM
-CPU_CORES = multiprocessing.cpu_count()
-DEFAULT_THREADS = max(1, CPU_CORES - 1)
-
-MODEL_PATH = DEFAULT_MODEL
-N_THREADS = int(os.environ.get("LLM_N_THREADS", DEFAULT_THREADS))
-N_CTX = int(os.environ.get("LLM_N_CTX", 1024))  # lowered from 1536/4096
-N_BATCH = int(os.environ.get("LLM_N_BATCH", 512))  # larger batch for CPU speed
-MAX_TOKENS = int(os.environ.get("LLM_MAX_TOKENS", 150))
-TEMPERATURE = float(os.environ.get("LLM_TEMPERATURE", 0.2))
-TOP_P = float(os.environ.get("LLM_TOP_P", 0.9))
-CONCURRENCY = int(os.environ.get("LLM_CONCURRENCY", 1))
-
-# memory mapping options
-USE_MMAP = os.environ.get("LLM_USE_MMAP", "1") == "1"
-USE_MLOCK = os.environ.get("LLM_USE_MLOCK", "0") == "1"  # usually false on non-root
-
-# optional in-memory cache to speed repeated identical requests
-ENABLE_CACHE = os.environ.get("LLM_ENABLE_CACHE", "0") == "1"
-CACHE_MAXSIZE = int(os.environ.get("LLM_CACHE_MAXSIZE", 256))
-
-# -------------------------
-# FastAPI
-# -------------------------
-app = FastAPI(root_path="/api")
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-
-# -------------------------
-# Model load (with safe guards)
-# -------------------------
-print(f"Loading model from: {MODEL_PATH}", file=sys.stderr)
-if not os.path.exists(MODEL_PATH):
-    raise FileNotFoundError(f"Model file not found at: {MODEL_PATH}. Place the gguf file there or set LLM_MODEL_PATH env var.")
-
-try:
-    llm = Llama(
-        model_path=MODEL_PATH,
-        n_ctx=N_CTX,
-        n_threads=N_THREADS,
-        n_batch=N_BATCH,
-        use_mmap=USE_MMAP,
-        use_mlock=USE_MLOCK,
-    )
-    print("Model loaded successfully.", file=sys.stderr)
-except Exception as e:
-    # print traceback to stderr and raise so process doesn't silently continue
-    import traceback
-    print("ERROR loading model:", file=sys.stderr)
-    traceback.print_exc()
-    raise
-
-# concurrency control (1 by default to avoid OOM)
-llm_sem = threading.Semaphore(CONCURRENCY)
-
-# -------------------------
-# Input schema (unchanged)
-# -------------------------
-class HealthInput(BaseModel):
-    weight: float
-    height: float
-    age: int
-    spo2: int
-    heart_rate: int
-    activity: str
-
-# -------------------------
-# Utility: extract JSON object from model text
-# -------------------------
-def extract_json_object(text: str) -> str:
-    start = text.find("{")
-    if start == -1:
-        return "{}"
-    stack = 0
-    for i in range(start, len(text)):
-        if text[i] == "{":
-            stack += 1
-        elif text[i] == "}":
-            stack -= 1
-            if stack == 0:
-                return text[start:i+1]
-    return text[start:]
-
-# -------------------------
-# Normalization + typo fixes (keeps output schema intact)
-# -------------------------
-def normalize_keys_and_fix_typos(result: dict) -> dict:
-    canonical = {}
-    for k, v in result.items():
-        lk = k.strip().lower()
-        if lk.startswith("sum") or lk.startswith("summ") or lk == "summery":
-            canonical["summary"] = v
-        elif lk.startswith("bmi") and "category" in lk:
-            canonical["bmi_category"] = v
-        elif lk == "bmi":
-            canonical["bmi"] = v
-        elif lk.startswith("spo2") or lk.startswith("sp02"):
-            canonical["spo2_status"] = v
-        elif "heart" in lk and ("rate" in lk or "hr" in lk):
-            canonical["heart_rate_status"] = v
-        elif "risk" in lk:
-            canonical["health_risk"] = v
-        elif lk.startswith("recom") or "recomend" in lk:
-            canonical["recommendation"] = v
-        elif lk.startswith("expl") or "expla" in lk:
-            canonical["explanation"] = v
-        else:
-            canonical[k] = v
-
-    defaults = {
-        "summary": "Unable to produce summary.",
-        "bmi": None,
-        "bmi_category": "normal",
-        "spo2_status": "normal",
-        "heart_rate_status": "normal",
-        "health_risk": "low",
-        "recommendation": "Maintain healthy lifestyle: balanced diet, regular exercise, and follow-up with healthcare if needed.",
-        "explanation": "No detailed explanation provided."
-    }
-
-    for k, dv in defaults.items():
-        if k not in canonical or canonical[k] is None or canonical[k] == "":
-            canonical[k] = dv
-
-    def pick_most_likely(val, allowed):
-        if not isinstance(val, str):
-            return val
-        if "|" in val:
-            parts = [p.strip().lower() for p in val.split("|")]
-            for p in parts:
-                if p in allowed:
-                    return p
-            return parts[0]
-        lv = val.strip().lower()
-        for a in allowed:
-            if a in lv:
-                return a
-        return val
-
-    canonical["bmi_category"] = pick_most_likely(canonical["bmi_category"], ["underweight", "normal", "overweight"])
-    canonical["spo2_status"] = pick_most_likely(canonical["spo2_status"], ["normal", "low", "dangerous"])
-    canonical["heart_rate_status"] = pick_most_likely(canonical["heart_rate_status"], ["normal", "low", "high"])
-    canonical["health_risk"] = pick_most_likely(canonical["health_risk"], ["low", "moderate", "high"])
-
-    return canonical
-
-# -------------------------
-# Prompt builder (short and strict)
-# -------------------------
-SYSTEM_PROMPT = (
-    "You are a concise medical assistant. Output ONLY a single JSON object with keys: "
-    "summary, bmi, bmi_category, spo2_status, heart_rate_status, health_risk, recommendation, explanation."
-)
-
-def build_user_prompt(data: HealthInput, bmi: float) -> str:
-    return (
-        f"Health data:\n"
-        f"weight: {data.weight}\n"
-        f"height: {data.height}\n"
-        f"age: {data.age}\n"
-        f"spo2: {data.spo2}\n"
-        f"heart_rate: {data.heart_rate}\n"
-        f"activity: {data.activity}\n"
-        f"bmi: {bmi}\n\n"
-        "Return only a single JSON object using the exact keys."
-    )
-
-# -------------------------
-# Optional simple LRU cache for identical inputs (disabled by default)
-# -------------------------
-if ENABLE_CACHE:
-    @lru_cache(maxsize=CACHE_MAXSIZE)
-    def cached_infer_cache_key(payload_str: str):
-        # payload_str is JSON string of inputs
-        # to keep function signature compatible with lru_cache, we accept string key
-        payload = json.loads(payload_str)
-        # we call the real infer function below by delegating to a non-cached function
-        return _infer_from_parsed(payload)
-
-# -------------------------
-# Core inference (non-cached)
-# -------------------------
-def _infer_from_parsed(payload: dict) -> dict:
-    # map payload to HealthInput-like structure
-    try:
-        # compute BMI
-        height_m = payload.get("height", 0) / 100.0 if payload.get("height") else None
-        bmi_value = round(payload["weight"] / (height_m ** 2), 1) if height_m and payload.get("weight") else None
-    except Exception:
-        bmi_value = None
-
-    # build prompt
-    class Tmp:
-        pass
-    tmp = Tmp()
-    tmp.weight = payload.get("weight")
-    tmp.height = payload.get("height")
-    tmp.age = payload.get("age")
-    tmp.spo2 = payload.get("spo2")
-    tmp.heart_rate = payload.get("heart_rate")
-    tmp.activity = payload.get("activity")
-
-    user_prompt = build_user_prompt(tmp, bmi_value)
-
-    # run LLM with semaphore
-    with llm_sem:
-        start_t = time.time()
-        kwargs = dict(
-            messages=[
-                {"role": "system", "content": SYSTEM_PROMPT},
-                {"role": "user", "content": user_prompt}
-            ],
-            temperature=TEMPERATURE,
-            max_tokens=MAX_TOKENS,
-            top_p=TOP_P,
-        )
-
-        # prefer response_format if supported; it's faster/cleaner
-        try:
-            kwargs_with_rf = kwargs.copy()
-            kwargs_with_rf["response_format"] = {"type": "json_object"}
-            response = llm.create_chat_completion(**kwargs_with_rf)
-        except TypeError:
-            response = llm.create_chat_completion(**kwargs)
-
-        duration = time.time() - start_t
-        print(f"[inference] elapsed={duration:.2f}s tokens_limit={MAX_TOKENS} threads={N_THREADS} batch={N_BATCH}", file=sys.stderr)
-
-    # extract generated text safely
-    raw_out = ""
-    try:
-        raw_out = response.get("choices", [{}])[0].get("message", {}).get("content", "")
-    except Exception:
-        raw_out = str(response)
-
-    json_text = extract_json_object(raw_out)
-    parsed = {}
-    try:
-        parsed = json.loads(json_text)
-    except Exception:
-        parsed = {}
-
-    normalized = normalize_keys_and_fix_typos(parsed)
-    normalized["bmi"] = bmi_value
-    return normalized
-
-# -------------------------
-# Endpoint: /analyze (keeps existing JSON schema)
-# -------------------------
-@app.post("/analyze")
-async def analyze(request: Request):
-    # parse request body
-    try:
-        payload = await request.json()
-    except Exception:
-        return JSONResponse(status_code=400, content={"error": "invalid JSON body"})
-
-    # validate minimal fields (fast check)
-    required = ["weight", "height", "age", "spo2", "heart_rate", "activity"]
-    for k in required:
-        if k not in payload:
-            return JSONResponse(status_code=400, content={"error": "missing_field", "field": k})
-
-    try:
-        if ENABLE_CACHE:
-            key = json.dumps(payload, sort_keys=True)
-            result = cached_infer_cache_key(key)
-        else:
-            result = _infer_from_parsed(payload)
-    except Exception as e:
-        # log and return JSON error so frontend doesn't get HTML
-        import traceback
-        tb = traceback.format_exc()
-        print("Inference error:", file=sys.stderr)
-        print(tb, file=sys.stderr)
-        return JSONResponse(status_code=500, content={"error": "model_inference_failed", "details": str(e)})
-
-    return JSONResponse(status_code=200, content=result)
-
-# -------------------------
-# Root health check
-# -------------------------
-@app.get("/")
-def root():
-    return {"status": "ok", "model": os.path.basename(MODEL_PATH), "n_ctx": N_CTX, "n_threads": N_THREADS}
\ No newline at end of file
diff --git a/qwen/main.py b/qwen/main.py
deleted file mode 100644
index 38b76d5..0000000
--- a/qwen/main.py
+++ /dev/null
@@ -1,224 +0,0 @@
-# main.py
-# FastAPI backend tuned for Qwen3.5-2B.Q4_K_M.gguf
-from fastapi import FastAPI, Request
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel
-from fastapi.responses import JSONResponse
-import os, sys, threading, traceback, json, time
-from llama_cpp import Llama
-
-# ---------- Config (adjust if needed) ----------
-MODEL_DEFAULT = "models/Qwen3.5-2B.Q4_K_M.gguf"
-MODEL_PATH = os.environ.get("LLM_MODEL_PATH", MODEL_DEFAULT)
-
-# VM: 13GB RAM, 9 cores -> safe defaults
-N_THREADS = int(os.environ.get("LLM_N_THREADS", 9))  # use all cores if desired
-N_CTX = int(os.environ.get("LLM_N_CTX", 1536))  # 2048 is safer than 4096
-N_BATCH = int(os.environ.get("LLM_N_BATCH", 256))  # a larger batch speeds things up
-MAX_TOKENS = int(os.environ.get("LLM_MAX_TOKENS", 450))
-TEMPERATURE = float(os.environ.get("LLM_TEMPERATURE", 0.3))
-TOP_P = float(os.environ.get("LLM_TOP_P", 0.9))
-CONCURRENCY = int(os.environ.get("LLM_CONCURRENCY", 1))
-USE_MMAP = os.environ.get("LLM_USE_MMAP", "1") == "1"
-USE_MLOCK = os.environ.get("LLM_USE_MLOCK", "0") == "1"  # mlock can fail without privileges
-
-# ---------- FastAPI ----------
-app = FastAPI(root_path="/api")
-app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"])
-
-# concurrency control
-llm_sem = threading.Semaphore(CONCURRENCY)
-
-# ---------- HealthInput (unchanged) ----------
-class HealthInput(BaseModel):
-    weight: float
-    height: float
-    age: int
-    spo2: int
-    heart_rate: int
-    activity: str
-
-# ---------- Try to load model (with informative error) ----------
-print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] Loading model from: {MODEL_PATH}", file=sys.stderr)
-if not os.path.exists(MODEL_PATH):
-    raise FileNotFoundError(f"Model file not found: {MODEL_PATH}. Place your .gguf file there or set LLM_MODEL_PATH env var.")
-
-try:
-    llm = Llama(
-        model_path=MODEL_PATH,
-        n_ctx=N_CTX,
-        n_threads=N_THREADS,
-        n_batch=N_BATCH,
-        use_mmap=USE_MMAP,
-        use_mlock=USE_MLOCK
-    )
-    print("Model loaded successfully.", file=sys.stderr)
-except Exception as e:
-    # if the model fails to load, print the stack trace and raise so the operator knows.
-    print("ERROR loading model:", file=sys.stderr)
-    traceback.print_exc()
-    raise
-
-# ---------- Helpers: JSON extraction & normalization ----------
-def extract_json_object(text: str) -> str:
-    start = text.find("{")
-    if start == -1:
-        return "{}"
-    stack = 0
-    for i in range(start, len(text)):
-        if text[i] == "{":
-            stack += 1
-        elif text[i] == "}":
-            stack -= 1
-            if stack == 0:
-                return text[start:i+1]
-    return text[start:]
-
-def sanitize_and_normalize(parsed: dict, bmi_value) -> dict:
-    # map common typos to canonical keys
-    mapping = {}
-    for k, v in parsed.items():
-        lk = k.strip().lower()
-        if "summ" in lk or "summary" in lk:
-            mapping["summary"] = v
-        elif lk == "bmi":
-            mapping["bmi"] = v
-        elif "bmi" in lk and "cat" in lk:
-            mapping["bmi_category"] = v
-        elif "spo2" in lk or "sp02" in lk:
-            mapping["spo2_status"] = v
-        elif "heart" in lk and ("rate" in lk or "hr" in lk):
-            mapping["heart_rate_status"] = v
-        elif "risk" in lk:
-            mapping["health_risk"] = v
-        elif "recom" in lk or "recomend" in lk:
-            mapping["recommendation"] = v
-        elif "expl" in lk or "expla" in lk:
-            mapping["explanation"] = v
-        else:
-            mapping[k] = v
-    # defaults
-    defaults = {
-        "summary": "Unable to produce summary.",
-        "bmi": bmi_value,
-        "bmi_category": "normal",
-        "spo2_status": "normal",
-        "heart_rate_status": "normal",
-        "health_risk": "low",
-        "recommendation": "Maintain balanced diet, regular exercise, and consult a professional if concerned.",
-        "explanation": "No detailed explanation provided."
-    }
-    for dk, dv in defaults.items():
-        if dk not in mapping or mapping[dk] in (None, ""):
-            mapping[dk] = dv
-    # clean templates like "normal|low|dangerous"
-    def pick(val, allowed):
-        if not isinstance(val, str): return val
-        if "|" in val:
-            parts = [p.strip().lower() for p in val.split("|")]
-            for p in parts:
-                if p in allowed:
-                    return p
-            return parts[0]
-        lv = val.strip().lower()
-        for a in allowed:
-            if a in lv:
-                return a
-        return val
-    mapping["bmi_category"] = pick(mapping["bmi_category"], ["underweight", "normal", "overweight"])
-    mapping["spo2_status"] = pick(mapping["spo2_status"], ["normal", "low", "dangerous"])
-    mapping["heart_rate_status"] = pick(mapping["heart_rate_status"], ["normal", "low", "high"])
-    mapping["health_risk"] = pick(mapping["health_risk"], ["low", "moderate", "high"])
-    mapping["bmi"] = bmi_value
-    return mapping
-
-# ---------- Prompt ----------
-SYSTEM_PROMPT = (
-    "You are a conservative medical assistant. Output ONLY a single JSON object with keys: "
-    "summary, bmi, bmi_category, spo2_status, heart_rate_status, health_risk, recommendation, explanation."
-)
-def build_user_prompt(data: HealthInput, bmi):
-    return (
-        f"Health data:\n"
-        f"weight: {data.weight}\n"
-        f"height: {data.height}\n"
-        f"age: {data.age}\n"
-        f"spo2: {data.spo2}\n"
-        f"heart_rate: {data.heart_rate}\n"
-        f"activity: {data.activity}\n"
-        f"bmi: {bmi}\n\n"
-        "Return only a single JSON object with the exact keys."
-    )
-
-# ---------- Endpoints ----------
-@app.get("/")
-def root():
-    return {"status": "ok", "model": os.path.basename(MODEL_PATH), "n_ctx": N_CTX, "n_threads": N_THREADS}
-
-@app.post("/analyze")
-async def analyze(request: Request):
-    # parse request body safely
-    try:
-        payload = await request.json()
-    except Exception:
-        return JSONResponse(status_code=400, content={"error": "invalid JSON body"})
-
-    # validate minimal fields (frontend uses same schema)
-    try:
-        data = HealthInput(**payload)
-    except Exception as e:
-        return JSONResponse(status_code=400, content={"error": "invalid payload", "details": str(e)})
-
-    # compute bmi
-    try:
-        height_m = data.height / 100.0
-        bmi_value = round(data.weight / (height_m ** 2), 1)
-    except Exception:
-        bmi_value = None
-
-    user_prompt = build_user_prompt(data, bmi_value)
-
-    # call model (with semaphore)
-    with llm_sem:
-        try:
-            kwargs = dict(
-                messages=[
-                    {"role": "system", "content": SYSTEM_PROMPT},
-                    {"role": "user", "content": user_prompt}
-                ],
-                temperature=TEMPERATURE,
-                max_tokens=MAX_TOKENS,
-                top_p=TOP_P,
-            )
-            # try to include response_format when supported
-            try:
-                kwargs_rf = kwargs.copy()
-                kwargs_rf["response_format"] = {"type": "json_object"}
-                response = llm.create_chat_completion(**kwargs_rf)
-            except TypeError:
-                response = llm.create_chat_completion(**kwargs)
-        except Exception as e:
-            # log exception and return JSON error (avoid HTML)
-            tb = traceback.format_exc()
-            print("Model inference error:", file=sys.stderr)
-            print(tb, file=sys.stderr)
-            return JSONResponse(status_code=500, content={"error": "model_inference_failed", "details": str(e)})
-
-    # extract text
-    raw_text = ""
-    try:
-        raw_text = response.get("choices", [{}])[0].get("message", {}).get("content", "")
-    except Exception:
-        raw_text = str(response)
-
-    # attempt parse
-    json_text = extract_json_object(raw_text)
-    parsed = {}
-    try:
-        parsed = json.loads(json_text)
-    except Exception:
-        # fallback empty
-        parsed = {}
-
-    out = sanitize_and_normalize(parsed, bmi_value)
-    return JSONResponse(status_code=200, content=out)
\ No newline at end of file
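For reference, a minimal smoke-test sketch for the removed /analyze endpoint, assuming the service was reachable at http://localhost:8000 with the /api prefix applied by a reverse proxy (host, port, and proxy setup are assumptions; they are not recorded in this diff). Stdlib-only Python:

# client_example.py (hypothetical name): exercises the deleted /analyze endpoint.
import json
import urllib.request

# Sample payload matching the HealthInput schema shared by both backends.
payload = {
    "weight": 70.0,      # kg
    "height": 175.0,     # cm (the service converts to metres before computing BMI)
    "age": 30,
    "spo2": 97,
    "heart_rate": 72,
    "activity": "moderate",
}

# Base URL is an assumption; adjust to the actual deployment.
req = urllib.request.Request(
    "http://localhost:8000/api/analyze",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    result = json.load(resp)

# Both backends normalize the model output to these canonical keys.
for key in ("summary", "bmi", "bmi_category", "spo2_status",
            "heart_rate_status", "health_risk", "recommendation", "explanation"):
    print(f"{key}: {result.get(key)}")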