from transformers import pipeline
from typing import Dict, List, Optional, Tuple
import re

from src.config import Config


class ContentModerator:
    """Content moderator that detects and replaces inappropriate content"""

    def __init__(self, model_name: Optional[str] = None):
        """
        Initialize the content moderation pipeline

        Args:
            model_name: Name of the model to use (optional)
        """
        self.model_name = model_name or Config.get_model("moderation")
        print(f"Loading moderation model: {self.model_name}")
        self.classifier = pipeline("text-classification", model=self.model_name)
        print("Moderation model loaded successfully!")

        # Threshold for considering content as toxic
        self.toxicity_threshold = 0.5

    def moderate(self, text: str, replacement: str = "***") -> Dict:
        """
        Moderate content by detecting and replacing inappropriate words

        Args:
            text: Text to moderate
            replacement: String to replace inappropriate content with

        Returns:
            Dictionary with original text, moderated text, and detection info
        """
        if not text.strip():
            return {"error": "Empty text"}

        try:
            # First, check overall toxicity
            result = self.classifier(text)

            # Handle different model output formats
            if isinstance(result, list):
                predictions = result
            else:
                predictions = [result]

            # Find toxicity score
            toxic_score = 0.0
            is_toxic = False

            for pred in predictions:
                label = pred["label"].upper()
                score = pred["score"]

                # Check different possible toxic labels
                if label in ["TOXIC", "TOXICITY", "HARMFUL", "1"]:
                    toxic_score = max(toxic_score, score)
                    if score > self.toxicity_threshold:
                        is_toxic = True
                elif label in ["NOT_TOXIC", "CLEAN", "0"]:
                    # For models where a high score means NOT toxic
                    toxic_score = max(toxic_score, 1.0 - score)
                    if (1.0 - score) > self.toxicity_threshold:
                        is_toxic = True

            if not is_toxic:
                return {
                    "original_text": text,
                    "moderated_text": text,
                    "is_modified": False,
                    "toxic_score": float(toxic_score),
                    "words_replaced": 0,
                }

            # If toxic, analyze word by word to find problematic parts
            moderated_text, words_replaced = self._moderate_by_words(text, replacement)

            return {
                "original_text": text,
                "moderated_text": moderated_text,
                "is_modified": True,
                "toxic_score": float(toxic_score),
                "words_replaced": int(words_replaced),
            }

        except Exception as e:
            return {"error": f"Moderation error: {str(e)}"}

    def _moderate_by_words(self, text: str, replacement: str) -> Tuple[str, int]:
        """
        Moderate text by analyzing individual words and phrases

        Args:
            text: Original text
            replacement: Replacement string

        Returns:
            Tuple of (moderated_text, words_replaced_count)
        """
        words = text.split()
        moderated_words = []
        words_replaced = 0

        # Check individual words
        for word in words:
            # Clean word for analysis (remove punctuation)
            clean_word = re.sub(r'[^\w]', '', word)

            if not clean_word:
                moderated_words.append(word)
                continue

            try:
                word_result = self.classifier(clean_word)

                # Handle different model output formats
                if isinstance(word_result, list):
                    predictions = word_result
                else:
                    predictions = [word_result]

                is_word_toxic = False
                for pred in predictions:
                    label = pred["label"].upper()
                    score = pred["score"]

                    if label in ["TOXIC", "TOXICITY", "HARMFUL", "1"]:
                        if score > self.toxicity_threshold:
                            is_word_toxic = True
                            break
                    elif label in ["NOT_TOXIC", "CLEAN", "0"]:
                        if (1.0 - score) > self.toxicity_threshold:
                            is_word_toxic = True
                            break

                if is_word_toxic:
                    # Replace the word's characters with the replacement string,
                    # keeping surrounding punctuation intact
                    moderated_word = re.sub(r'\w+', replacement, word)
                    moderated_words.append(moderated_word)
                    words_replaced += 1
                else:
                    moderated_words.append(word)

            except Exception:
                # If analysis fails for a word, keep it as is
                moderated_words.append(word)

        return " ".join(moderated_words), words_replaced

    def moderate_batch(self, texts: List[str], replacement: str = "***") -> List[Dict]:
        """
        Moderate multiple texts

        Args:
            texts: List of texts to moderate
            replacement: String to replace inappropriate content with

        Returns:
            List of moderation results
        """
        return [self.moderate(text, replacement) for text in texts]

    def set_threshold(self, threshold: float):
        """
        Set the toxicity threshold

        Args:
            threshold: Threshold between 0 and 1
        """
        if 0 <= threshold <= 1:
            self.toxicity_threshold = threshold
        else:
            raise ValueError("Threshold must be between 0 and 1")
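

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the module's API).
# It assumes either that Config.get_model("moderation") in src/config.py
# points at a Hugging Face text-classification model, or that you pass an
# explicit model name; "unitary/toxic-bert" below is just an example choice.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Example toxicity classifier; swap in whatever your config actually uses.
    moderator = ContentModerator(model_name="unitary/toxic-bert")
    moderator.set_threshold(0.7)  # slightly stricter than the 0.5 default

    # Single text: inspect the moderated output and whether it was modified
    single = moderator.moderate("You are a wonderful person!")
    print(single.get("moderated_text"), single.get("is_modified"))

    # Batch of texts: each entry is an independent moderation result
    batch = moderator.moderate_batch(["First comment", "Second comment"])
    for result in batch:
        print(result.get("moderated_text", result.get("error")))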