diff --git a/README.md b/README.md index 33292e6..cf0002b 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@ # 🧠 AI Lab – Transformers CLI Playground > A **pedagogical and technical project** designed for AI practitioners and students to experiment with Hugging Face Transformers through an **interactive Command‑Line Interface (CLI)**. -> This playground provides ready‑to‑use NLP pipelines (Sentiment Analysis, Named Entity Recognition, Text Generation, Fill‑Mask, Moderation, etc.) in a modular, extensible, and educational codebase. +> This playground provides ready‑to‑use NLP pipelines (Sentiment Analysis, Named Entity Recognition, Text Generation, Fill‑Mask, Question Answering, Moderation, etc.) in a modular, extensible, and educational codebase. --- ## πŸ“š Overview The **AI Lab – Transformers CLI Playground** allows you to explore multiple natural language processing tasks directly from the terminal. -Each task (e.g., sentiment, NER, text generation) is implemented as a **Command Module**, which interacts with a **Pipeline Module** built on top of the `transformers` library. +Each task (e.g., sentiment, NER, text generation, question answering) is implemented as a **Command Module**, which interacts with a **Pipeline Module** built on top of the `transformers` library. The lab is intentionally structured to demonstrate **clean software design for ML codebases** β€” with strict separation between configuration, pipelines, CLI logic, and display formatting. @@ -32,7 +32,8 @@ src/ β”‚ β”œβ”€β”€ fillmask.py # Masked token prediction command β”‚ β”œβ”€β”€ textgen.py # Text generation command β”‚ β”œβ”€β”€ ner.py # Named Entity Recognition command -β”‚ └── moderation.py # Toxicity / content moderation command +β”‚ β”œβ”€β”€ moderation.py # Toxicity / content moderation command +β”‚ └── qa.py # Question Answering command β”‚ β”œβ”€β”€ pipelines/ # Machine learning logic (Hugging Face Transformers) β”‚ β”œβ”€β”€ __init__.py @@ -41,7 +42,8 @@ src/ β”‚ β”œβ”€β”€ fillmask.py β”‚ β”œβ”€β”€ textgen.py β”‚ β”œβ”€β”€ ner.py -β”‚ └── moderation.py +β”‚ β”œβ”€β”€ moderation.py +β”‚ └── qa.py # Question Answering pipeline β”‚ └── config/ β”œβ”€β”€ __init__.py @@ -104,7 +106,7 @@ python -m src.main poetry run python src/main.py ``` -You’ll see an interactive menu listing the available commands: +You'll see an interactive menu listing the available commands: ``` Welcome to AI Lab - Transformers CLI Playground @@ -114,6 +116,7 @@ Available commands: β€’ textgen – Generate text from a prompt β€’ ner – Extract named entities from text β€’ moderation – Detect toxic or unsafe content + β€’ qa – Question Answering on given text context ``` ### Example Sessions @@ -152,6 +155,14 @@ Available commands: - California (LOC) ``` +#### πŸ”Ή Question Answering + +```text +πŸ’¬ Context: Albert Einstein was born in 1879 in Germany. He developed the theory of relativity. +❓ Question: When was Einstein born? +β†’ Answer: 1879 (confidence: 0.95) +``` + #### πŸ”Ή Moderation ```text @@ -173,13 +184,13 @@ The internal structure follows a clean **Command ↔ Pipeline ↔ Display** patt β”‚ β–Ό β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Command Layer β”‚ ← e.g. sentiment.py + β”‚ Command Layer β”‚ ← e.g. sentiment.py, qa.py β”‚ (user commands) β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ β–Ό β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” - β”‚ Pipeline Layer β”‚ ← e.g. pipelines/sentiment.py + β”‚ Pipeline Layer β”‚ ← e.g. 
pipelines/sentiment.py, pipelines/qa.py β”‚ (ML logic) β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β”‚ @@ -195,8 +206,8 @@ The internal structure follows a clean **Command ↔ Pipeline ↔ Display** patt | Layer | Description | | ------------ | -------------------------------------------------------------------------- | | **CLI** | Manages user input/output, help menus, and navigation between commands. | -| **Command** | Encapsulates a single user-facing operation (e.g., run sentiment). | -| **Pipeline** | Wraps Hugging Face’s `transformers.pipeline()` to perform inference. | +| **Command** | Encapsulates a single user-facing operation (e.g., run sentiment, QA). | +| **Pipeline** | Wraps Hugging Face's `transformers.pipeline()` to perform inference. | | **Display** | Handles clean console rendering (colored output, tables, JSON formatting). | | **Config** | Centralizes model names, limits, and global constants. | @@ -215,7 +226,8 @@ class Config: "fillmask": "bert-base-uncased", "textgen": "gpt2", "ner": "dslim/bert-base-NER", - "moderation":"unitary/toxic-bert" + "moderation":"unitary/toxic-bert", + "qa": "distilbert-base-cased-distilled-squad" } MAX_LENGTH = 512 BATCH_SIZE = 8 @@ -260,6 +272,7 @@ Recommended structure: tests/ β”œβ”€β”€ test_sentiment.py β”œβ”€β”€ test_textgen.py +β”œβ”€β”€ test_qa.py └── ... ``` diff --git a/src/cli/display.py b/src/cli/display.py index df2dc27..6379541 100644 --- a/src/cli/display.py +++ b/src/cli/display.py @@ -190,3 +190,78 @@ class DisplayFormatter: output.append(f" β€’ {entity} ({count}x)") return "\n".join(output) + + @staticmethod + def format_qa_result(result: Dict[str, Any]) -> str: + """Format Question Answering result for display""" + if "error" in result: + return f"❌ {result['error']}" + + output = [] + output.append(f"❓ Question: {result['question']}") + + # Confidence indicator + confidence = result['confidence'] + confidence_emoji = "βœ…" if result['is_confident'] else "⚠️" + confidence_bar = "β–ˆ" * int(confidence * 10) + + output.append(f"{confidence_emoji} Answer: {result['answer']}") + output.append(f"πŸ“Š Confidence: {result['confidence_level']} ({confidence:.1%}) {confidence_bar}") + + if not result['is_confident']: + output.append("⚠️ Low confidence - answer might not be reliable") + + output.append(f"\nπŸ“ Position: characters {result['start_position']}-{result['end_position']}") + output.append(f"πŸ“„ Context with answer highlighted:") + output.append(f" {result['highlighted_context']}") + + return "\n".join(output) + + @staticmethod + def format_qa_context_analysis(analysis: Dict[str, Any]) -> str: + """Format QA context analysis for display""" + if "error" in analysis: + return f"❌ {analysis['error']}" + + output = [] + output.append("βœ… Context set successfully!") + output.append(f"πŸ“Š Context Statistics:") + + stats = analysis['context_stats'] + output.append(f" β€’ Words: {stats['word_count']}") + output.append(f" β€’ Sentences: ~{stats['sentence_count']}") + output.append(f" β€’ Characters: {stats['character_count']}") + + if analysis['suggested_questions']: + output.append(f"\nπŸ’‘ Suggested question types:") + for suggestion in analysis['suggested_questions']: + output.append(f" β€’ {suggestion}") + + if analysis['tips']: + output.append(f"\nπŸ“ Tips for good questions:") + for tip in analysis['tips']: + output.append(f" β€’ {tip}") + + return "\n".join(output) + + @staticmethod + def format_qa_multiple_result(result: Dict[str, Any]) -> str: + """Format multiple QA results for display""" + if "error" in 
result: + return f"❌ {result['error']}" + + output = [] + output.append(f"πŸ“Š Multiple Questions Analysis") + output.append("=" * 50) + output.append(f"Total Questions: {result['total_questions']}") + output.append(f"Successfully Processed: {result['processed_questions']}") + output.append(f"Confident Answers: {result['confident_answers']}") + output.append(f"Average Confidence: {result['average_confidence']:.1%}") + + output.append(f"\nπŸ“‹ Results:") + for qa_result in result['results']: + confidence_emoji = "βœ…" if qa_result['is_confident'] else "⚠️" + output.append(f"\n{qa_result['question_number']}. {qa_result['question']}") + output.append(f" {confidence_emoji} {qa_result['answer']} ({qa_result['confidence']:.1%})") + + return "\n".join(output) diff --git a/src/commands/__init__.py b/src/commands/__init__.py index e8ea5d3..344e3d6 100644 --- a/src/commands/__init__.py +++ b/src/commands/__init__.py @@ -6,5 +6,6 @@ from .fillmask import FillMaskCommand from .textgen import TextGenCommand from .moderation import ModerationCommand from .ner import NERCommand +from .qa import QACommand -__all__ = ['SentimentCommand', 'FillMaskCommand', 'TextGenCommand', 'ModerationCommand', 'NERCommand'] +__all__ = ['SentimentCommand', 'FillMaskCommand', 'TextGenCommand', 'ModerationCommand', 'NERCommand', 'QACommand'] diff --git a/src/commands/qa.py b/src/commands/qa.py new file mode 100644 index 0000000..28f4543 --- /dev/null +++ b/src/commands/qa.py @@ -0,0 +1,214 @@ +from src.cli.base import CLICommand +from src.cli.display import DisplayFormatter +from src.pipelines.qa import QuestionAnsweringSystem + + +class QACommand(CLICommand): + """Interactive Question Answering command""" + + def __init__(self): + self.qa_system = None + self.current_context = None + self.session_questions = [] + + @property + def name(self) -> str: + return "qa" + + @property + def description(self) -> str: + return "Question Answering - Ask questions about a given text" + + def _initialize_qa_system(self): + """Lazy initialization of the QA system""" + if self.qa_system is None: + print("πŸ”„ Loading Question Answering model...") + self.qa_system = QuestionAnsweringSystem() + DisplayFormatter.show_success("QA model loaded!") + + def _show_instructions(self): + """Show usage instructions and examples""" + print("\n❓ Question Answering System") + print("Ask questions about a text context and get precise answers.") + print("\nπŸ“ How it works:") + print(" 1. First, provide a context (text containing information)") + print(" 2. Then ask questions about that context") + print(" 3. The system extracts answers directly from the text") + print("\nπŸ’‘ Example context:") + print(" 'Albert Einstein was born in 1879 in Germany. He developed the theory of relativity.'") + print("πŸ’‘ Example questions:") + print(" - When was Einstein born?") + print(" - Where was Einstein born?") + print(" - What theory did Einstein develop?") + print("\nπŸŽ›οΈ Commands:") + print(" 'back' - Return to main menu") + print(" 'help' - Show these instructions") + print(" 'context' - Set new context") + print(" 'multi' - Ask multiple questions at once") + print(" 'session' - Review session history") + print(" 'settings' - Adjust confidence threshold") + print("-" * 70) + + def _set_context(self): + """Allow user to set or change the context""" + print("\nπŸ“„ Set Context") + print("Enter the text that will serve as context for your questions.") + print("You can enter multiple lines. 
Type 'done' when finished.") + print("-" * 50) + + lines = [] + while True: + line = input("πŸ“ ").strip() + if line.lower() == 'done': + break + if line: + lines.append(line) + + if not lines: + DisplayFormatter.show_warning("No context provided") + return False + + self.current_context = " ".join(lines) + + # Analyze context + analysis = self.qa_system.interactive_qa(self.current_context) + if "error" in analysis: + DisplayFormatter.show_error(analysis["error"]) + return False + + formatted_analysis = DisplayFormatter.format_qa_context_analysis(analysis) + print(formatted_analysis) + + return True + + def _ask_single_question(self): + """Ask a single question about the current context""" + if not self.current_context: + DisplayFormatter.show_warning("Please set a context first using 'context' command") + return + + question = input("\n❓ Your question: ").strip() + + if not question: + DisplayFormatter.show_warning("Please enter a question") + return + + DisplayFormatter.show_loading("Finding answer...") + result = self.qa_system.answer(question, self.current_context) + + if "error" not in result: + self.session_questions.append(result) + + formatted_result = DisplayFormatter.format_qa_result(result) + print(formatted_result) + + def _multi_question_mode(self): + """Allow asking multiple questions at once""" + if not self.current_context: + DisplayFormatter.show_warning("Please set a context first using 'context' command") + return + + print("\n❓ Multiple Questions Mode") + print("Enter your questions one by one. Type 'done' when finished.") + print("-" * 50) + + questions = [] + while True: + question = input(f"Question #{len(questions)+1}: ").strip() + if question.lower() == 'done': + break + if question: + questions.append(question) + + if not questions: + DisplayFormatter.show_warning("No questions provided") + return + + DisplayFormatter.show_loading(f"Processing {len(questions)} questions...") + result = self.qa_system.answer_multiple(questions, self.current_context) + + if "error" not in result: + self.session_questions.extend(result["results"]) + + formatted_result = DisplayFormatter.format_qa_multiple_result(result) + print(formatted_result) + + def _show_session_history(self): + """Show the history of questions asked in this session""" + if not self.session_questions: + DisplayFormatter.show_warning("No questions asked in this session yet") + return + + print(f"\nπŸ“š Session History ({len(self.session_questions)} questions)") + print("=" * 60) + + for i, qa in enumerate(self.session_questions, 1): + confidence_emoji = "βœ…" if qa["is_confident"] else "⚠️" + print(f"\n{i}. 
{qa['question']}") + print(f" {confidence_emoji} {qa['answer']} (confidence: {qa['confidence']:.1%})") + + def _adjust_settings(self): + """Allow user to adjust QA settings""" + current_threshold = self.qa_system.confidence_threshold + print(f"\nβš™οΈ Current Settings:") + print(f"Confidence threshold: {current_threshold:.2f}") + print("\nLower threshold = more answers accepted (less strict)") + print("Higher threshold = fewer answers accepted (more strict)") + + try: + new_threshold = input(f"Enter new threshold (0.0-1.0, current: {current_threshold}): ").strip() + if new_threshold: + threshold = float(new_threshold) + self.qa_system.set_confidence_threshold(threshold) + DisplayFormatter.show_success(f"Threshold set to {threshold:.2f}") + except ValueError: + DisplayFormatter.show_error("Invalid threshold value") + + def run(self): + """Run interactive Question Answering""" + self._initialize_qa_system() + self._show_instructions() + + while True: + if self.current_context: + context_preview = (self.current_context[:50] + "...") if len(self.current_context) > 50 else self.current_context + prompt = f"\nπŸ’¬ [{context_preview}] Ask a question: " + else: + prompt = "\nπŸ’¬ Enter command or set context first: " + + user_input = input(prompt).strip() + + if user_input.lower() == 'back': + break + elif user_input.lower() == 'help': + self._show_instructions() + continue + elif user_input.lower() == 'context': + self._set_context() + continue + elif user_input.lower() == 'multi': + self._multi_question_mode() + continue + elif user_input.lower() == 'session': + self._show_session_history() + continue + elif user_input.lower() == 'settings': + self._adjust_settings() + continue + + if not user_input: + DisplayFormatter.show_warning("Please enter a question or command") + continue + + # If we have a context and user input is not a command, treat it as a question + if self.current_context: + DisplayFormatter.show_loading("Finding answer...") + result = self.qa_system.answer(user_input, self.current_context) + + if "error" not in result: + self.session_questions.append(result) + + formatted_result = DisplayFormatter.format_qa_result(result) + print(formatted_result) + else: + DisplayFormatter.show_warning("Please set a context first using 'context' command") diff --git a/src/config/settings.py b/src/config/settings.py index 8d9b2e8..4e585ca 100644 --- a/src/config/settings.py +++ b/src/config/settings.py @@ -19,6 +19,7 @@ class Config: "textgen": "gpt2", "moderation": "unitary/toxic-bert", "ner": "dbmdz/bert-large-cased-finetuned-conll03-english", + "qa": "distilbert-base-cased-distilled-squad", } # Interface diff --git a/src/main.py b/src/main.py index aa16046..494843f 100644 --- a/src/main.py +++ b/src/main.py @@ -13,6 +13,7 @@ from src.commands import ( FillMaskCommand, ModerationCommand, NERCommand, + QACommand, SentimentCommand, TextGenCommand, ) @@ -31,6 +32,7 @@ def main(): TextGenCommand, ModerationCommand, NERCommand, + QACommand, ] for command in commands_to_register: cli.register_command(command()) diff --git a/src/pipelines/__init__.py b/src/pipelines/__init__.py index 5ad9ab6..04a777b 100644 --- a/src/pipelines/__init__.py +++ b/src/pipelines/__init__.py @@ -6,6 +6,7 @@ from .fillmask import FillMaskAnalyzer from .textgen import TextGenerator from .moderation import ContentModerator from .ner import NamedEntityRecognizer +from .qa import QuestionAnsweringSystem from .template import TemplatePipeline -__all__ = ['SentimentAnalyzer', 'FillMaskAnalyzer', 'TextGenerator', 
'ContentModerator', 'NamedEntityRecognizer', 'TemplatePipeline'] +__all__ = ['SentimentAnalyzer', 'FillMaskAnalyzer', 'TextGenerator', 'ContentModerator', 'NamedEntityRecognizer', 'QuestionAnsweringSystem', 'TemplatePipeline'] diff --git a/src/pipelines/qa.py b/src/pipelines/qa.py new file mode 100644 index 0000000..be3aecb --- /dev/null +++ b/src/pipelines/qa.py @@ -0,0 +1,266 @@ +from transformers import pipeline +from typing import Dict, List, Optional, Tuple +from src.config import Config +import re + + +class QuestionAnsweringSystem: + """Question Answering system using transformers""" + + def __init__(self, model_name: Optional[str] = None): + """ + Initialize the question-answering pipeline + + Args: + model_name: Name of the model to use (optional) + """ + self.model_name = model_name or Config.get_model("qa") + print(f"Loading Question Answering model: {self.model_name}") + self.pipeline = pipeline("question-answering", model=self.model_name) + print("QA model loaded successfully!") + + # Default confidence threshold + self.confidence_threshold = 0.1 + + def answer(self, question: str, context: str, max_answer_len: int = 50) -> Dict: + """ + Answer a question based on the given context + + Args: + question: Question to answer + context: Context text containing the answer + max_answer_len: Maximum length of the answer + + Returns: + Dictionary with answer, score, and position information + """ + if not question.strip(): + return {"error": "Empty question"} + + if not context.strip(): + return {"error": "Empty context"} + + try: + result = self.pipeline( + question=question, + context=context, + max_answer_len=max_answer_len + ) + + confidence_level = self._get_confidence_level(result["score"]) + highlighted_context = self._highlight_answer_in_context( + context, result["answer"], result["start"], result["end"] + ) + + return { + "question": question, + "context": context, + "answer": result["answer"], + "confidence": round(result["score"], 4), + "confidence_level": confidence_level, + "start_position": result["start"], + "end_position": result["end"], + "highlighted_context": highlighted_context, + "is_confident": result["score"] >= self.confidence_threshold + } + + except Exception as e: + return {"error": f"QA processing error: {str(e)}"} + + def _get_confidence_level(self, score: float) -> str: + """ + Convert numerical score to confidence level + + Args: + score: Confidence score (0-1) + + Returns: + Confidence level description + """ + if score >= 0.8: + return "Very High" + elif score >= 0.6: + return "High" + elif score >= 0.4: + return "Medium" + elif score >= 0.2: + return "Low" + else: + return "Very Low" + + def _highlight_answer_in_context(self, context: str, answer: str, start: int, end: int) -> str: + """ + Highlight the answer within the context + + Args: + context: Original context + answer: Extracted answer + start: Start position of answer + end: End position of answer + + Returns: + Context with highlighted answer + """ + if start < 0 or end > len(context): + return context + + before = context[:start] + highlighted_answer = f"**{answer}**" + after = context[end:] + + return before + highlighted_answer + after + + def answer_multiple(self, questions: List[str], context: str, max_answer_len: int = 50) -> Dict: + """ + Answer multiple questions for the same context + + Args: + questions: List of questions to answer + context: Context text + max_answer_len: Maximum length of answers + + Returns: + Dictionary with all answers and summary statistics + """ + if not 
questions: + return {"error": "No questions provided"} + + if not context.strip(): + return {"error": "Empty context"} + + results = [] + confident_answers = 0 + total_confidence = 0 + + for i, question in enumerate(questions, 1): + result = self.answer(question, context, max_answer_len) + + if "error" not in result: + results.append({ + "question_number": i, + **result + }) + + if result["is_confident"]: + confident_answers += 1 + total_confidence += result["confidence"] + + if not results: + return {"error": "No valid questions processed"} + + average_confidence = total_confidence / len(results) if results else 0 + + return { + "context": context, + "total_questions": len(questions), + "processed_questions": len(results), + "confident_answers": confident_answers, + "average_confidence": round(average_confidence, 4), + "confidence_threshold": self.confidence_threshold, + "results": results + } + + def interactive_qa(self, context: str) -> Dict: + """ + Prepare context for interactive Q&A session + + Args: + context: Context text for questions + + Returns: + Context analysis and preparation info + """ + if not context.strip(): + return {"error": "Empty context"} + + # Basic context analysis + word_count = len(context.split()) + sentence_count = len([s for s in context.split('.') if s.strip()]) + char_count = len(context) + + # Suggest question types based on content + suggested_questions = self._generate_question_suggestions(context) + + return { + "context": context, + "context_stats": { + "word_count": word_count, + "sentence_count": sentence_count, + "character_count": char_count + }, + "suggested_questions": suggested_questions, + "tips": [ + "Ask specific questions about facts mentioned in the text", + "Use question words: Who, What, When, Where, Why, How", + "Keep questions clear and focused", + "The answer should be present in the provided context" + ] + } + + def _generate_question_suggestions(self, context: str) -> List[str]: + """ + Generate suggested questions based on context analysis + + Args: + context: Context text + + Returns: + List of suggested question templates + """ + suggestions = [] + + # Check for common patterns and suggest relevant questions + if re.search(r'\b\d{4}\b', context): # Years + suggestions.append("When did [event] happen?") + + if re.search(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', context): # Names + suggestions.append("Who is [person name]?") + + if re.search(r'\b(founded|created|established|built)\b', context, re.IGNORECASE): + suggestions.append("Who founded/created [organization]?") + + if re.search(r'\b(located|situated|based)\b', context, re.IGNORECASE): + suggestions.append("Where is [place/organization] located?") + + if re.search(r'\b(because|due to|reason)\b', context, re.IGNORECASE): + suggestions.append("Why did [event] happen?") + + if re.search(r'\b(how|method|process)\b', context, re.IGNORECASE): + suggestions.append("How does [process] work?") + + if not suggestions: + suggestions = [ + "What is the main topic of this text?", + "Who are the key people mentioned?", + "What important events are described?" 
+ ] + + return suggestions[:5] # Limit to 5 suggestions + + def set_confidence_threshold(self, threshold: float): + """ + Set the confidence threshold for answers + + Args: + threshold: Threshold between 0 and 1 + """ + if 0 <= threshold <= 1: + self.confidence_threshold = threshold + else: + raise ValueError("Threshold must be between 0 and 1") + + def answer_batch(self, qa_pairs: List[Tuple[str, str]], max_answer_len: int = 50) -> List[Dict]: + """ + Process multiple question-context pairs + + Args: + qa_pairs: List of (question, context) tuples + max_answer_len: Maximum length of answers + + Returns: + List of QA results + """ + return [ + self.answer(question, context, max_answer_len) + for question, context in qa_pairs + ]
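
For reviewers who want to exercise the new pipeline without going through the CLI layer, below is a minimal usage sketch based only on the API introduced in `src/pipelines/qa.py` above. The context and questions mirror the README example session; the exact confidence scores will vary with the model version, and the first call downloads `distilbert-base-cased-distilled-squad`.

```python
# Minimal sketch: drive the new QuestionAnsweringSystem directly,
# bypassing the CLI/commands layer added in this diff.
from src.pipelines.qa import QuestionAnsweringSystem

qa = QuestionAnsweringSystem()  # defaults to Config.get_model("qa")

context = (
    "Albert Einstein was born in 1879 in Germany. "
    "He developed the theory of relativity."
)

# Single question: returns answer text, confidence, and position info.
result = qa.answer("When was Einstein born?", context)
if "error" not in result:
    print(result["answer"], result["confidence"], result["confidence_level"])

# Several questions against the same context, with summary statistics.
summary = qa.answer_multiple(
    ["Where was Einstein born?", "What theory did Einstein develop?"],
    context,
)
if "error" not in summary:
    for item in summary["results"]:
        print(item["question"], "->", item["answer"])

# Optionally tighten the acceptance threshold (default is 0.1).
qa.set_confidence_threshold(0.4)
```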