From 353f13978862e02313cc4062f2e5f62d5e127394 Mon Sep 17 00:00:00 2001 From: bangae1 Date: Wed, 5 Nov 2025 21:45:20 +0900 Subject: [PATCH] first --- .gitignore | 5 + __pycache__/manual.cpython-312.pyc | Bin 0 -> 6828 bytes __pycache__/offline_manual.cpython-312.pyc | Bin 0 -> 7056 bytes download_embed.py | 16 +++ manual.py | 147 +++++++++++++++++++ offline_manual.py | 159 +++++++++++++++++++++ requirements.txx | Bin 0 -> 4184 bytes 7 files changed, 327 insertions(+) create mode 100644 .gitignore create mode 100644 __pycache__/manual.cpython-312.pyc create mode 100644 __pycache__/offline_manual.cpython-312.pyc create mode 100644 download_embed.py create mode 100644 manual.py create mode 100644 offline_manual.py create mode 100644 requirements.txx diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fa74f9a --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.venv +.idea +chroma_db +models +*.iml \ No newline at end of file diff --git a/__pycache__/manual.cpython-312.pyc b/__pycache__/manual.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..32cfe82bd6966ad7965e5d837ef68326e634a40c GIT binary patch literal 6828 zcmbVQYj6`+mcA{up4P*XY|F^lG==~o!2}3w!ZHbtfdm#j1jx%_&?xG*k6%E$dTC_>H=matTabd}#! z7X@`qNE6m}X{mA>-9z9 z-Mdg-u=?h#V}@(6DwZxgjMDcqO68FUYaBDpjdF1I-X|XERw}HSbBxto)0BIGuOh_9a=#MZvtA2*m`v=MUGNODX;rmB-rVtB-dUpB65)cw~zNngzhLG z=6G!8!f$gd8;tb$y5o_67>q^)uU0a~f-x=>jBt{*JuXJ~MGtY2U?0azby(dMWx3F{ zDBt0a3;xi~T|mdM%`b@UySMk%w5<*t6l+U*jMWhg;wnfMuIzct=hR^ z)w^p)Dajm0D!zz6%t?l@KN9zc1g}P-l?LYz*$OR0uh0N)zZs#f!BMU&e1_|c-`#W<$dqK>S-SsJM6u2t6YKg5fOTq>novzK`F zoqhIK4s(&!_*l2<>9!3o?gI@uL_RKvKAsClMa~xhrAV|Nlnhp=5u<$IppO-g#yH8$ za_%JJziM$^S$4X|OsG5z(-6rW2a+0At6!nYi z*77Kpfct>?e$|$Z`?s^4e>WfPe;6@X6 z!Xrh#F<4W-DAq6QH{YpmovL43s9&4gn1AKf+x4$OljDgg$NGX}ebLfzr?L4r*7Me) zySdoX`oN@fn)~$+EX3kGyD?wWoTr=b13e;Xd!ug$DjOCXNDcfI^M4N?VE(zj1i1(Y z$&jYH4SK-Xzon8?)g?q7lE9Ln)N$&23;943(Daai=4YUtq=wZh6zPQKA{nC3Xii&b zlGG+ORnzv)IZv5pnQlUx&;l*GLdyuWXjaGS!SxJ-TA*X5VT~j`X1rE$RcgKzriZ0C z3Zp5Up*gI6(T9oG+}T&dz_t<@LG<&Y*A)K;SYN)Hl0P40ra#TeLs>aJSUP!@k>4AW zFN`v!-;c_}$DU%QC;H`q%TF=#&`tSL2Hu|xm2RA8aPY_3(#JQMQZ8Hia8N!q$jG0L zls-?*3}oeVV~jkUhVcsNOHeyCD4)DCb9_)fmtjgD{-JbvXl7ts9=OTO42+ge^_Nav zX^S66jxL@pU4gk+9?9!Q1CR-ZKJq3)8ls+70psymp z+Gkxa&mrR z<~*q5mzPlm$VpXD?8;7$r%IoU;krXvMQg)A_v8(EXaZL#2)6-O@96TlwMdM{UzzmyvU z(x|S>gB6Xfc^axx<&0fwEk z)JM4TrH`j4a$sXvd4-%CDgEXIcn2^&HV*CZLpysbCM+FKGi$((xGHEId%?ayRDS>1 zbnY0mWu`w>^n-dE03D!blvPq*qN?rmAo7I5fcMG2y9qrHvG!|VZ6=!rr7)%IFb}#!{8a0GNs?0R-^}CdvD(Ef)TK%f5^pY@oS=@kjj_&j|3iPX}<*DXxXG|8Q4T zw!6AJQ%uLUpCV7CQ{!OMs`p6rmY;U`B+A!U+Z}}XiBQA$?wYn3+uZ})LYaL5qOD$t z#`yq%k#{*>030OIEEf}@!^B1d@i4$ALDB|UK{ACo(a-utztCqYpT1m#_39*Dj1NXc zNe2L{hm*7+E+Ux&(NG8iDw5OsSs2A*oa7NlNK{ZV0BIl43%|SPAol0KyJse@pHl+H<57G2 zI{M%2*7j#DlJ1>2#~($&5%V1i%z-f&h0#)p+;0G2dKkF%K0FErVM$Ou6icz11PM|A z*eWBE5L;OqHN@)h`B0n%m;-TEpP>67dP9^oLA2H->1xDh4GEnJ{2)m%CiNmhBc)Vf zAka5e=|8%fI=hM(e#Q{8A5ws%L3tXJCg8w?93;&NJ!?)FSbZO?0LY{!VP-8CY1W!B zLd`a0=(EBo{pnVkRC!G3RoZ|F=CBHDueLtQUs(&u8WF9a6g53i_^~9dY%S|xor91% zuyu(VSh+qyA)dju9!=7gu=WBU^Gm`8NsF6p7;99)Z4#2Dgf(FYt1KAvsG8`t&z}jF zv-BcHomW%|lC?m`((9bp4OMnYkR)oaP;3*sFi2gYeoZAE2}fB%v0fdYsO@b~zpL-^ zD0K~%ct|fAab8h%lXRNMLlozdtb`NNOu{xF*W%>frg_t$%C@L$C^mM1_Hg&kD@Z5d z@(|WNwrEaMi{}l3D$A(Ul6637$zw(=o*Pl0s82ZGCH&I=ALog=^ibivA);Y*$%Y@} zKhc0X{J;30)2F-YtBD$w;~(?j>R7flQK#zX$3`dIdmlo*ONcm^)GsBx4)(Dw>3Wl# z{?lP{MEfQ=OnH}fev2E2Z-ocuQOa@T6S*UBgqtFeYYFiXeCiG#TKqwctL4`>$)lp2JwryQ^ALI^w8nd}i zXT}0?<)|QX;rO3nIAV=c!x)@rE}T|^yn;AZLk-X#1EH`SIMc8OgCMeuu+!r+Cno@p zR4_GD;Q(>pg;At7{m&DCJ`e-}QEUSMcMibUX#_e7aKDnRqAwL@K`;jRTR~3{#u2%t zuatfd@fdgk2z?$RHlHN+yoGuo4nDiXxeO+ulB>Bjm)-vU@)Vaxk~jAG<`i>gya1f zL?OS8hGU{+=OX^ML!3`M7>pc(%L_>Zk(!chkp0H_2v|BIM0r87h5bi-5$pF>@OsR*}mlz5#yMr6JX zAIsn&nYaib1mF?keqS)gV?r#s{IOW*s4sBPF9Jv5SO~i{VpP(XQ+df)5sc&E1_y`) zBeA%Ma6rCA}`;G z3)QNshFg%bA6Wocm2C_&5PTisH^NAv5grOqdd#DYom}ND?}zz9JRAlg;r5DyX^m3N=U2ApX?Nc9BEqt#@{Ld4rk|SC6VJjt-EhaVPXXD)B_{Hvb=fWZMs=u;5O|p z+Uhfh3bxkaH3i!f!~TM81)xJcX}<+kg&Dc7cgYtzK5|GNLi{z=!4)J|Bkeo1EQxz014cZs%M zUoH|Ja1^Nts%9NPmujwXV#L5ADM%NB52JBl`-iYo}VDEwny6(Ew8lwVV5OK(lf26v*vf zK4Q*o$ZgH9*qGnFFYjJH>3(&}{Z_&KR(}7_C*A%$T~~B1oN}!#xZtL0()E05XVKuw z^c4(?@7Nrfm8YV^=Fw+H_KZAJV4j|0HWZi*xx^&1rC{4~Li@GN4py_d%bt|&8(A{4 zG{0!|7yI(ImPy-=DckOXZTF;YPwJ&_z)6;;J5^J`8$dh^y-h!@wiRofsm|~2MJdwM zLH+lG-4wAj+#{6ES*&%O)Xi$)>w5w2(#a>5mmBCmktHuTky}kQFMGAOR#5Q#o?89L zO&OrfEXP%WqA`Cp+mr9n?D52{n+0Ej}-aZ(U974$I1-v%Xw~F zH>I`}Ep?gh@|7QtkV`W9AW;iOe2})S@lxL+SuZ1Lq5_Y?OZ7tKHrofco$7VAz~eB< zSiXN|-c+aC+^80aNO3^PHvoMnSNBymwl zX=-XvWAIHCs>y{evW;M$D0z3!N0uDko zeJn?uI^-0wZ#lwfAqFT3BAks8QLzl9nX#Cp1u}}#QIU9~?(->uirn2(dLyS=SY;>xw34(cK0I znC+SSL~GNe%+P7JrSwIsBW1ov*HA69o<^#+Xil6y?h9ohA4lah2b>z4w*+ z-7Okou)QBlX}_lEU%8LD`(5uZoYiQlmQ2Sz0b8;5E8VRC zE@@T}F=HM(5tTTIU^8Zh1bgt-YJ!tBQ%r64$JA1_f25dMYkNp-swRJ8wX4kI$0mPv z&uOVGW0Kvf?XvrE@44rm*S+VvNB>q{ZX|HA+snNFT0_WR@k6~z3NH_ODMBs~mavqM z^yHsZ4+Y;EpT@84(Ng&~x`&1~txxCI_vn>2+E?N?^cei5J*9qQk5OsU`O5sJ9uvIl zd&=389y4p`v1kZK_cQ9m#V2Q(D7_A|Z+nn$cZUVHZ_i%fQ?lJHh;7}w2J9{C{n#z6fB7gE*!1Lv zmMt$1EL&)F`+QIC^#;6q_CC31%ad=n4wSX5_Z;Sfez%K#6Zp^}w|H36dO~5RMlvd# zTmiSAlk_b2w%5Z+C4P4x?Dh%JO)DL?$2IUQwj_p~7XBan0d5zFxX>5d%~DY!VxMyL z)9PFCDeBdCR>Nv0wATyNZdxo;d;a%vqG^H7tiBbWEUoa>UDxH?+>7KB4Juu)W*>3t zy9O+1(nWUN4@R(xd{_`&Jm(LJoXZ2ImnwWgkK5VpR9&Ih@lu4wLUSH5HZfbQ>Jeo?tWygHCxnt|WT`cGB=7as5M-&eF zeEx$4mo7Ymy}cNFAQ`&4C3IXe_i>`@EzrIoTRk#Et_R>2BVU)4$9|rzT0LmITh%yI z)taely=$t=HZ*4IS3WGEtBr%khel#sJ!9RRv2KR0yEXN{F27iwb<`J#%Jeqlp#Gtm zm~8KFO_$ZD>G}u2nn>FI;G3SpmVpffl~Lm9J9vS3TIh?AOR%MlXo}0Q54N=5P*JKF zV`9P2ghVKnPJMQecQqbOAMt2@3Y%zD!)g_dbVPHBjL_#b=S(z-YNMKBO!k+eK#yJ# zDx!^Offrri1yK~GSskm75VmAk3w(?;%n_w03~Dq}OSLFnti^Glrf`JjsQN}9B~D}4 zKp6uoS!4v!&5O>`@ZWXWc{C36 z3hT@8b#_=jb9L_2uzVrG>?7?&6R)$wO1IWj8B1hIO=4opnonk46M;ySB47~+qeNf!G)8G z1rki>E8Rs$pb%gxdFYHhd@DCTxnQ(UlAzyg3N-%=yaA3~@YDx5^W~3br&8c!n7LU_ zjpcrQ8lnS)o|uGo_(MNC3ogu^iZdI*k2ot>9DBjPz*K(c#BAyWv?XRgQS5_u8v+|( zWaLefpQ32(_%Oqr+x@p8~eI#6l3gs{|2F2WAV(8Jlaqgy`%K z39ZvVG!dOqnRi**Soddo`M3A2#PZy?_l?B*ME{GWyzOa|qCxGMKq4LLr{PwmwbN z94YU|fLwA#%xpF5U>y@R*9)jT3WvH#dBg%wObFieA^rI7(T&iuN zwo1M3D|GNf~8h2A%ahq zQMpAcfm74>P^?^_sESlYY;O~O)&EcONR`S-k-R#fVRg~!ACf;(jW+zhGG#k*iQF;R!b<~iP!qyFc-0*}%<&xT?_Pk^2s=JGcV-IkNC8tb1ql%MeK?NX zX8&ml&M)}$x5N&0)@5j=IdYqN2AZfb#xB$nyF^R${IOq?Fl7$Pn z-}G@V@vt{=1PUjT1~N4znc<8a<^$mAfDq&b$?SI@a|O7g`Qx$ReF(1o=D*?cirpH5 zr5E9dH{=pRoW~2Nb!ApTqKpHEZR-`*&kb1BDVTzCTgQph#;*h2_>FjT!c8jW0)P-e zM#%dE-Vl#xW6ACgg?z_dp2KbtB=U!R*sXz+IhG?NLqRK!e?sBl4TQoXVgX4n27M5b zlFl1oxnq*?HMcL!b@F_W#|n&6UeF05?+r;+AQPY!V2{Ut0UpS}S(67Dig*Yeq!Kma zDOR$3f&w1w1+U-hbMs!&%L$TAQSO-NI_ebmov@Jf9|+l zmuY)7{n~*{+fSxfz5egDJ*Uls)akz&Y)@2X&6dQf(}xGOSwropVaz(Vaoj$p8#^@B z{<(JQna|sAYd;U)?npPhoH4*(?YxfEy+J)R047Ttm*1i56#?$hj;y&VaU^4I9Nm~P z|7g^mF*gG=)RUUk(~dP6x-!-|_zVbWtDPrg&Cj*>0FM2gHfwdHSMR#L=JvAmD~Hl9 zHhqXoujgT5$Ur0B$rL8Z{Sa)TtyQZyw8{0jvr**ck zE9{Bg?^#CIj_)7i$M$C$TW1=d&NMzfRSiQj6)z0xfM?Ch8PMCYX3Ut{lIl!1Z%uda zPdnC3J6@e}ypeIdkv{mdX@@&aS7z-zS7kRo;M)PFtfriiKfKB08B zY=!lVZe9y--w6(g;g3x_tLZ#wE|akf zg)d0C3&0zJHo$DmfpA4BmX1(f3d)|kF8ku=+PIW@(-MbbkR`zVyKHOeNatT1?2 zww)tDFCZuNFk^r`3N?2ZR8Q5SyTBuv zWXM;&**7J#`qmN_y*`0Q%Son3Mks&+>#_L?Ye_E@+ub2AkC1@JJxQwU-1}l@N5`(N z9gAzT)({>ip*+lbc{m9M;apVFoo@ok+I?P3nB|4~yC?s->U(}D=v03s*k1UJKr$_= zx(B2hEYAjcmxt#d&%iFK9z6583IQ{ZkT##2w2(lQqa|#t0a0-s98g0cNegTg7opLy zxDQ5GQk)TUx=Yzv`JKoIJ)`Wm-x;>SRv-3p&+#Xr4dIhO!5spYqW(fk|4K|>lFff1 zmM=-mf0FGPvi(a^2b+SuIcCW^TGONwcZkL`v1Tir3Byc3 zX-ngbWk<%cBWr2OmQ_52tw|Fz!PaE1n zoNx@<-&sDd(NYbG_WJ}L541#MRvI8d(5AtGFSN_<>G6mRk({+QpV$rCBVDVQ=zjoH C@k$T? literal 0 HcmV?d00001 diff --git a/download_embed.py b/download_embed.py new file mode 100644 index 0000000..5defd8a --- /dev/null +++ b/download_embed.py @@ -0,0 +1,16 @@ +from sentence_transformers import SentenceTransformer +model = SentenceTransformer('all-MiniLM-L6-v2') +model.save('./all-MiniLM-L6-v2') + +# # HF CLI 설치 (처음 한 번만) +# pip install huggingface_hub +# +# # 로그인 (오프라인 사용 전 한 번만 필요) +# huggingface-cli login # 토큰 입력 (https://huggingface.co/settings/tokens) +# +# # 모델 로컬 저장 +# huggingface-cli download Qwen/Qwen3-0.6B --local-dir ./Qwen3-0.6B --local-dir-use-symlinks False +# +# +# +# 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 \ No newline at end of file diff --git a/manual.py b/manual.py new file mode 100644 index 0000000..2cbb691 --- /dev/null +++ b/manual.py @@ -0,0 +1,147 @@ +import os + +import torch +from sentence_transformers import SentenceTransformer +import chromadb +from chromadb.utils import embedding_functions +from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM +from fastapi import FastAPI + +# 2. 벡터 DB 설정 +persist_directory = "./chroma_db" +chroma_client = chromadb.PersistentClient(path=persist_directory) + +# ✅ Chroma 전용 임베딩 함수 사용 (오류 방지) +sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction( + model_name="all-MiniLM-L6-v2" +) + +# 컬렉션 생성 +collection = chroma_client.get_or_create_collection( + name="manuals", + embedding_function=sentence_transformer_ef # ← 여기가 핵심! +) + +_model = None +_tokenizer = None + +def get_qwen_model() : + global _model, _tokenizer + if _model is None: + model_name = "Qwen/Qwen3-0.6B" + _tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + _model = AutoModelForCausalLM.from_pretrained( + model_name, + torch_dtype=torch.float32, # CPU 안정성 + device_map="auto", + trust_remote_code=True + ) + return _model, _tokenizer + +def init(job: str) : + print(f'{job} init start') + # 1. 문서 준비 (실제로는 PDF/Word 등에서 추출) + manuals = [ + "지출 결의서는 사용 목적, 금액, 일자, 증빙 서류를 반드시 첨부하여 전자 결재 시스템에 등록해야 합니다.", + "월말 마감은 매월 25일부터 시작되며, 모든 부서는 28일까지 비용 집행 내역을 최종 확정해야 합니다.", + "외화 송금은 반드시 외환관리부의 사전 승인을 받은 후 금융팀을 통해 진행되어야 하며, 계약서 사본을 첨부해야 합니다.", + "세금계산서는 발행일로부터 10일 이내에 ERP에 등록되지 않으면 비용 처리가 불가합니다.", + "장기자산(차량, 사무기기 등)은 매년 1월에 정기 감가상각 점검을 받아야 하며, 자산관리부서가 이를 주관합니다.", + "현금 보관은 원칙적으로 금지되며, 불가피한 경우는 금고 보관 후 당일 중 재무팀에 입금 처리해야 합니다.", + "연말 정산 대상 직원은 매년 12월 10일까지 개인 소득공제 자료를 인사 시스템에 제출해야 합니다.", + "예산 초과 지출은 사전에 재무부와 협의 후 예산 조정 승인을 받아야 하며, 미승인 시 결재가 거부됩니다.", + "재무 제표 초안은 분기 마감 후 5영업일 이내에 감사법인에 제출되어야 하며, 최종 승인은 CFO가 담당합니다.", + ] + + # 문서 ID 생성 및 추가 + doc_ids = [f"DOC_{job}_{i}" for i in range(len(manuals))] + # collection.add(documents=manuals, ids=doc_ids) + collection.add( + documents=manuals, + ids=doc_ids, + metadatas=[{"source": "fi_manual_v1.pdf", "version":1.0, "dept": job} for _ in doc_ids] + ) + + print(f'{job} init end') + +# 3. 질의 처리 +def query_and_summarize(job: str, query: str, top_k: int = 3): + + # 관련 문서 검색 + results = collection.query( + query_texts=[query], + n_results=5, + where={"dept": job} + ) + # results = collection.query(query_texts=[query], n_results=top_k) + cosine_similarities = [1 - d for d in results['distances'][0]] + print("유사도:", cosine_similarities) + # 출력 예: [0.610, 0.473, 0.154, 0.142] + + context_with_score = "" + for i, (doc, dist) in enumerate(zip(results['documents'][0], results['distances'][0])): + sim = 1 - dist + context_with_score += f"[문서 {i+1} | 유사도: {sim:.3f}]\n{doc}\n\n" + + print(context_with_score) + print("\n\n\n\n\n") + top_doc = results['documents'][0][0] + + # ✅ 명시적으로 모델과 토크나이저 로드 + model, tokenizer = get_qwen_model() + + messages = [ + {"role": "system", "content": "당신은 회사 재무/회계 업무 전문 어시스턴트입니다. 문서 내용은 그대로 사용자에게 보여 줘야 하며 이를 기반으로 부가설명을 정확하고 상세하게 답변하세요."}, + {"role": "user", "content": f"다음 문서를 참고하세요:\n{top_doc}\n\n질문: {query}"} + ] + + text = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + enable_thinking=False # Switches between thinking and non-thinking modes. Default is True. + ) + model_inputs = tokenizer([text], return_tensors="pt").to(model.device) + + # conduct text completion + generated_ids = model.generate( + **model_inputs, + max_new_tokens=500 + ) + output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() + + # parsing thinking content + try: + # rindex finding 151668 () + index = len(output_ids) - output_ids[::-1].index(151668) + except ValueError: + index = 0 + + thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n") + end_think_id = tokenizer.convert_tokens_to_ids("") + if end_think_id in output_ids: + idx = len(output_ids) - output_ids[::-1].index(end_think_id) + else: + idx = 0 + content = tokenizer.decode(output_ids[idx:], skip_special_tokens=True).strip() + + print(top_doc) + print("\n\n\n\n\n") + print("thinking content:", thinking_content) + print("\n\n\n\n\n") + return content + +app = FastAPI() + +@app.get("/") +def question(query: str) : + user_query = query + answer = query_and_summarize(job="FI", query=user_query) + return {"answer": answer} + +# 예시 사용 +if __name__ == "__main__": + # init(job="FI") + # FI : 재무 HR : 인사 + print('1') +# 실행방법 uvicorn manual:app --reload \ No newline at end of file diff --git a/offline_manual.py b/offline_manual.py new file mode 100644 index 0000000..8ca38cd --- /dev/null +++ b/offline_manual.py @@ -0,0 +1,159 @@ +import os + +import torch +from sentence_transformers import SentenceTransformer +import chromadb +from chromadb.utils import embedding_functions +from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM +from fastapi import FastAPI + +# # === 경로 설정 (모두 로컬) === +QWEN_MODEL_PATH = "./models/Qwen3-0.6B" +EMBEDDING_MODEL_PATH = "./models/all-MiniLM-L6-v2" + +# 2. 벡터 DB 설정 +persist_directory = "./chroma_db" +chroma_client = chromadb.PersistentClient(path=persist_directory) + + +embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction( + model_name=EMBEDDING_MODEL_PATH, # ← 로컬 폴더 경로 가능 + device="cpu" +) + +collection = chroma_client.get_or_create_collection( + name="manuals", + embedding_function=embedding_fn +) + +_model = None +_tokenizer = None + +def get_qwen_model() : + global _model, _tokenizer + if _model is None: + model_name = "Qwen/Qwen3-0.6B" + _tokenizer = AutoTokenizer.from_pretrained( + QWEN_MODEL_PATH, + trust_remote_code=True, + local_files_only=True # 🔒 오프라인 강제 + ) + _model = AutoModelForCausalLM.from_pretrained( + QWEN_MODEL_PATH, + torch_dtype=torch.float32, # CPU 안정성 + device_map="auto", + trust_remote_code=True, + local_files_only=True # 🔒 오프라인 강제 + ) + return _model, _tokenizer + +def init(job: str) : + print(f'{job} init start') + # 1. 문서 준비 (실제로는 PDF/Word 등에서 추출) + manuals = [ + "지출 결의서는 사용 목적, 금액, 일자, 증빙 서류를 반드시 첨부하여 전자 결재 시스템에 등록해야 합니다.", + "월말 마감은 매월 25일부터 시작되며, 모든 부서는 28일까지 비용 집행 내역을 최종 확정해야 합니다.", + "외화 송금은 반드시 외환관리부의 사전 승인을 받은 후 금융팀을 통해 진행되어야 하며, 계약서 사본을 첨부해야 합니다.", + "세금계산서는 발행일로부터 10일 이내에 ERP에 등록되지 않으면 비용 처리가 불가합니다.", + "장기자산(차량, 사무기기 등)은 매년 1월에 정기 감가상각 점검을 받아야 하며, 자산관리부서가 이를 주관합니다.", + "현금 보관은 원칙적으로 금지되며, 불가피한 경우는 금고 보관 후 당일 중 재무팀에 입금 처리해야 합니다.", + "연말 정산 대상 직원은 매년 12월 10일까지 개인 소득공제 자료를 인사 시스템에 제출해야 합니다.", + "예산 초과 지출은 사전에 재무부와 협의 후 예산 조정 승인을 받아야 하며, 미승인 시 결재가 거부됩니다.", + "재무 제표 초안은 분기 마감 후 5영업일 이내에 감사법인에 제출되어야 하며, 최종 승인은 CFO가 담당합니다.", + ] + + # 문서 ID 생성 및 추가 + doc_ids = [f"DOC_{job}_{i}" for i in range(len(manuals))] + # collection.add(documents=manuals, ids=doc_ids) + collection.add( + documents=manuals, + ids=doc_ids, + metadatas=[{"source": "fi_manual_v1.pdf", "version":1.0, "dept": job} for _ in doc_ids] + ) + + print(f'{job} init end') + +# 3. 질의 처리 +def query_and_summarize(job: str, query: str, top_k: int = 3): + + # 관련 문서 검색 + results = collection.query( + query_texts=[query], + n_results=5, + where={"dept": job} + ) + # results = collection.query(query_texts=[query], n_results=top_k) + cosine_similarities = [1 - d for d in results['distances'][0]] + print("유사도:", cosine_similarities) + # 출력 예: [0.610, 0.473, 0.154, 0.142] + + context_with_score = "" + for i, (doc, dist) in enumerate(zip(results['documents'][0], results['distances'][0])): + sim = 1 - dist + context_with_score += f"[문서 {i+1} | 유사도: {sim:.3f}]\n{doc}\n\n" + + print(context_with_score) + print("\n\n\n\n\n") + top_doc = results['documents'][0][0] + + # ✅ 명시적으로 모델과 토크나이저 로드 + model, tokenizer = get_qwen_model() + + messages = [ + {"role": "system", "content": "당신은 회사 재무/회계 업무 전문 어시스턴트입니다. 문서 내용은 그대로 사용자에게 보여 줘야 하며 이를 기반으로 부가설명을 정확하고 상세하게 답변하세요."}, + {"role": "user", "content": f"다음 문서를 참고하세요:\n{top_doc}\n\n질문: {query}"} + ] + + text = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + enable_thinking=True # Switches between thinking and non-thinking modes. Default is True. + ) + model_inputs = tokenizer([text], return_tensors="pt").to(model.device) + + # conduct text completion + generated_ids = model.generate( + **model_inputs, + max_new_tokens=500 + ) + output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() + + # parsing thinking content + try: + # rindex finding 151668 () + index = len(output_ids) - output_ids[::-1].index(151668) + except ValueError: + index = 0 + + thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n") + end_think_id = tokenizer.convert_tokens_to_ids("") + if end_think_id in output_ids: + idx = len(output_ids) - output_ids[::-1].index(end_think_id) + else: + idx = 0 + content = tokenizer.decode(output_ids[idx:], skip_special_tokens=True).strip() + + print(top_doc) + print("\n\n\n\n\n") + print("thinking content:", thinking_content) + print("\n\n\n\n\n") + return content + +app = FastAPI() + +@app.get("/") +def question(query: str) : + print(1) + answer = query_and_summarize(job="FI", query=query) + return {"answer": answer} + +# 예시 사용 +if __name__ == "__main__": + # init(job="FI") + # FI : 재무 HR : 인사 + print(1) + user_query = "외화 송금 방법?" + # answer = query_and_summarize(job="FI", query=user_query) + # print(answer) + # 실행방법 uvicorn manual:app --reload \ No newline at end of file diff --git a/requirements.txx b/requirements.txx new file mode 100644 index 0000000000000000000000000000000000000000..1ef6a41262ad33c324be938082ef3b9937305562 GIT binary patch literal 4184 zcmb`KOK%%T5QOI(AU}nINJ_GO$RXDt1`OnslOR4sQjaCGOInhDe3JUQwmC~#4j>>{ zkg_@bsHv{*nf>SQn=;q4ED!p$<)-ZQ_ZL0ma;6X8=jG?}lU5o%Tdmt>QATB<|Ifqv zJnYWO^oO?l@=|t+-5u>eoWy=98$~1Y#dn$a%G&mxlX9tbY>aCrJo9od-&;NSPqfBo z99U*rZ6o(SR`E7kb~P&p<$cgja?jIaSUlYI7Epc$vv^3$)IMpT1A7@j+(>2D`VC{HR+XjtCcQvXPpvfSQ{f- zDH}T{^GLA|ioOo~=;%To$->am`{?8`m-nL{c0N)zDu!l^3Z^~`_PEmePClu3ucIoX zS<7}jU~jl`tN71CX)WUkw)CS}fB* zjq+UU7dW6NP=bBXew`_AdBIAjNN5$@wV&zMNo4yi_I9V3lL3xfmFiCIqHuF_AH356jeMEiww*>!^kHt9 zAFXBt6Z=WFBSmHkWuJPUruFCE`k?hc^ly4BtK#~yi$_I+$%k0uck9t>k78gNE9NI- zQ%lffhv8EvmUra}lhcTt0!7pIdXkxirj{=XbK zf8HCv&b9Y?c2bJZr}uEiNr_TitGSqBvNh#vUz=fXpuS5@x!JK6?Xk;1YZZ_^)k2h3 z4^t<*hUrl!$WFOAgKm!Vj!5ofR|6$H=%J^~iKlTYf{JtEt@U>m7W9I9jV`H-`Ya;` zlXDt)Ge;M?CKKzP)Yv$}i|}<~rJfi>u;S^ITxDi%qhnFDrv$8dHf1bsQ}BMJ-@T*P z7EIi%2o^pY-HRDzXmcG^%kv1kiyW{xKnDshF<&b4D+`lMN&)Pk741D!>8Q|qkS zOJ8^U((O!?uSZt#b(PrPpXJx`x%?ihK0C9*AoU#we@+KZ-sE9A&?}ZJeLOXXHyw8! zd&1@~?EwW0`JM(pp0clPm{s4;R`Z!kr;j-EtQl*nuIG4SjRkH9Z>l^w2VKz{|MrIM_k?p87D&y@vM=^xg>v z**OUk14^;7(swUrU*E1ihF+}_+i^2sYPgcT6+9l1R`qpScO{Lo&Lz5?I|8?adKa(f zobTA2%P&eln7WFK3jyjplIP(uqT{YJwIj1vF3SGpU<2{1j^OORObg>f zcO)wBTUp-4DwmZBBHHI#yL=AL@f@B|;K+9>F#G1@yNag|9R+XhiGyYZJ%#G5WHdj# z3p|PYeXx~pO?BG2p5%EFqM5MxeF{Zzj*{0}cgN-&d)p)9qE1kFZhcdXogN~9+czkq F{}(>GZe0KX literal 0 HcmV?d00001