# org/config/ai/call_llm_model.py

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from sympy.printing.pytorch import torch
import torch_directml
# DirectML device handle, created once at import time; get_qwen_model() moves
# the loaded model onto it.
device = torch_directml.device()

# Directory the tokenizer and model weights are loaded from. The path is
# relative, so it is resolved against the process's working directory.
QWEN_MODEL_PATH = "./models/Qwen3-0.6B"


# Module-level cache filled by get_qwen_model() on first use; None until then.
_model = None
_tokenizer = None

def get_qwen_model():
    """
    Return the shared Qwen model and tokenizer, loading them on first use.

    The first successful call loads the tokenizer and the causal-LM weights
    from ``QWEN_MODEL_PATH`` (local files only), moves the model onto the
    module-level DirectML ``device`` and, when ``torch.compile`` exists,
    wraps the model with it. The resulting pair is stored in the module
    globals and returned as-is by every later call.

    The cache is written only after every step has succeeded. A failure
    part-way through (for example while moving the model to the device) is
    therefore retried from scratch on the next call instead of leaving a
    half-initialised model in the cache.

    Returns:
        tuple: (model, tokenizer)

    Raises:
        Exception: Errors from loading the tokenizer or model, or from moving
            the model to the device, are propagated to the caller. An error
            raised by the ``torch.compile`` call itself is reported on stdout
            and the uncompiled model is used instead.
    """
    global _model, _tokenizer
    if _model is not None and _tokenizer is not None:
        return _model, _tokenizer

    # Load the tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(
        QWEN_MODEL_PATH,
        trust_remote_code=True,
        local_files_only=True,
    )

    # # 4-bit quantization settings (GPU)
    # bnb_config = BitsAndBytesConfig(
    #     load_in_4bit=True,
    #     bnb_4bit_use_double_quant=True,      # extra compression
    #     bnb_4bit_quant_type="nf4",           # quantization type with good accuracy
    #     bnb_4bit_compute_dtype=torch.float16 # keeps compute speed
    # )

    # Load the model into a local first; the globals are only updated once
    # the device move and the optional compile step below are done.
    model = AutoModelForCausalLM.from_pretrained(
        QWEN_MODEL_PATH,
        # quantization_config=bnb_config,  # GPU
        dtype=torch.float16,  # bfloat16 recommended on CPU, float16 on GPU
        device_map="auto",
        trust_remote_code=True,
        local_files_only=True,
        low_cpu_mem_usage=True,
    )

    # NOTE(review): device_map="auto" already places the weights, and the
    # model is then moved again explicitly -- confirm both are needed.
    model.to(device)

    # Apply torch.compile() when available (PyTorch 2.0+).
    # NOTE(review): torch.compile is understood to defer the real compilation
    # to the first call, so this try/except presumably only covers errors
    # raised by the wrapping call itself -- confirm inference works on the
    # DirectML backend.
    if hasattr(torch, 'compile'):
        try:
            print("🚀 torch.compile() 적용 중...")
            # mode="reduce-overhead": recommended for inference.
            # dynamic=True: suits RAG workloads where input length varies.
            model = torch.compile(
                model,
                mode="reduce-overhead",
                dynamic=True
            )
            print("✅ torch.compile() 성공!")
        except Exception as e:
            # Best effort: fall back to the uncompiled model.
            print(f"⚠️ torch.compile() 실패, 원본 모델 사용: {e}")

    # Publish both objects together now that initialisation is complete.
    _model, _tokenizer = model, tokenizer
    return _model, _tokenizer