Spaces:
Runtime error
Runtime error
| import os | |
| import gradio as gr | |
| from langchain.document_loaders import PyPDFLoader, YoutubeLoader, TextLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_openai import OpenAIEmbeddings | |
| from langchain_community.vectorstores import FAISS | |
| from langchain.chains import RetrievalQA | |
| from langchain.chat_models import init_chat_model | |
# --- API KEY HANDLING ---
# The key is read from either secret name. Surrounding whitespace is stripped
# because secrets pasted into a web form often carry a trailing newline, and a
# whitespace-only value must count as "missing".
OPENAI_API_KEY = (os.getenv("OPENAI_API_KEY") or os.getenv("openai") or "").strip()
if not OPENAI_API_KEY:
    # Fail fast at import time: nothing in this app works without the key.
    raise ValueError(
        "❌ OPENAI API Key not found. Please add it in Hugging Face secrets "
        "as 'OPENAI_API_KEY' or 'openai'."
    )
# --- PROCESSING FUNCTION ---
def _uploaded_path(upload):
    """Return the filesystem path of a Gradio file upload.

    Depending on the Gradio version/configuration the callback receives
    either a plain path or a temp-file wrapper exposing the path as
    ``.name``; both forms are accepted here.
    """
    if isinstance(upload, (str, os.PathLike)):
        return os.fspath(upload)
    return upload.name


def process_inputs(pdf_file, youtube_url, txt_file, query):
    """Answer ``query`` from an uploaded PDF plus an optional transcript.

    The PDF is required. A YouTube transcript is loaded when ``youtube_url``
    is given; the ``.txt`` upload is only used as a fallback when no YouTube
    transcript was loaded. All loaded documents are chunked, embedded into an
    in-memory FAISS index and queried through a ``RetrievalQA`` chain.

    Args:
        pdf_file: Uploaded PDF (path or object with a ``.name`` path).
        youtube_url: Optional YouTube video URL; may be empty or ``None``.
        txt_file: Optional transcript upload, same forms as ``pdf_file``.
        query: The user's question.

    Returns:
        The answer text on success, otherwise a human-readable error
        message. Failures are reported as strings rather than raised so the
        UI can display them in the answer box.
    """
    # Validate cheap preconditions before any file parsing or paid API call.
    if pdf_file is None:
        return "❌ Please upload a PDF before asking a question."
    if not query or not query.strip():
        return "❌ Please enter a question."

    docs = []

    # Load PDF (required)
    try:
        docs.extend(PyPDFLoader(_uploaded_path(pdf_file)).load())
    except Exception as e:
        return f"❌ Failed to load PDF: {e}"

    # Load YouTube transcript (optional). A failure here is deliberately
    # non-fatal so the .txt fallback below still gets a chance.
    yt_loaded = False
    if youtube_url and youtube_url.strip():
        try:
            yt_loader = YoutubeLoader.from_youtube_url(
                youtube_url.strip(), add_video_info=False
            )
            docs.extend(yt_loader.load())
            yt_loaded = True
        except Exception as e:
            print(f"⚠️ YouTube transcript not loaded: {e}")

    # Load text transcript file (optional fallback)
    if not yt_loaded and txt_file is not None:
        try:
            docs.extend(TextLoader(_uploaded_path(txt_file)).load())
        except Exception as e:
            return f"❌ Failed to load transcript file: {e}"

    if not docs:
        return "❌ No documents could be loaded. Please check your inputs."

    # Split text into chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    splits = splitter.split_documents(docs)
    if not splits:
        # e.g. a scanned/image-only PDF: pages load but carry no text.
        return "❌ The loaded documents contain no extractable text."

    # Embed documents and build the QA chain. These steps call the OpenAI
    # API, so report their failures like every other step instead of
    # letting the exception escape to the UI.
    try:
        embedding = OpenAIEmbeddings(
            model="text-embedding-3-large", api_key=OPENAI_API_KEY
        )
        db = FAISS.from_documents(splits, embedding)
        llm = init_chat_model(
            "gpt-4o-mini", model_provider="openai", api_key=OPENAI_API_KEY
        )
        qa = RetrievalQA.from_chain_type(llm, retriever=db.as_retriever())
    except Exception as e:
        return f"❌ Failed to index documents: {e}"

    # Query using RetrievalQA
    try:
        result = qa.invoke({"query": query})
        return result["result"]
    except Exception as e:
        return f"❌ Retrieval failed: {e}"
# --- GRADIO UI ---
# Components render in creation order, so the statement order below is the
# page layout: heading, a row with the three sources, question, answer, button.
with gr.Blocks() as demo:
    gr.Markdown("## π Ask Questions from PDF + YouTube Transcript or .txt Upload")

    # NOTE(review): the original indentation was lost; the three source
    # inputs are assumed to be the children of this row — confirm.
    with gr.Row():
        pdf_input = gr.File(
            label="Upload PDF",
            file_types=[".pdf"],
        )
        yt_input = gr.Textbox(
            label="YouTube URL (Optional)",
            placeholder="https://www.youtube.com/watch?v=...",
        )
        txt_input = gr.File(
            label="Upload Transcript .txt (Optional fallback)",
            file_types=[".txt"],
        )

    query_input = gr.Textbox(
        label="Your Question",
        placeholder="e.g., What did the document say about X?",
    )
    output = gr.Textbox(label="Answer")
    run_button = gr.Button("Get Answer")

    # The inputs are passed positionally to process_inputs in this order.
    run_button.click(
        fn=process_inputs,
        inputs=[pdf_input, yt_input, txt_input, query_input],
        outputs=output,
    )

# Only start the server when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()