ai-lab-transformers-playground/src/pipelines/moderation.py

import re
from typing import Dict, List, Optional

from transformers import pipeline

from src.config import Config


class ContentModerator:
    """Content moderator that detects and replaces inappropriate content."""

    def __init__(self, model_name: Optional[str] = None):
        """
        Initialize the content moderation pipeline.

        Args:
            model_name: Name of the model to use (optional)
        """
        self.model_name = model_name or Config.get_model("moderation")
        print(f"Loading moderation model: {self.model_name}")
        self.classifier = pipeline("text-classification", model=self.model_name)
        print("Moderation model loaded successfully!")

        # Threshold above which content is considered toxic
        self.toxicity_threshold = 0.5

    def moderate(self, text: str, replacement: str = "***") -> Dict:
        """
        Moderate content by detecting and replacing inappropriate words.

        Args:
            text: Text to moderate
            replacement: String to replace inappropriate content with

        Returns:
            Dictionary with original text, moderated text, and detection info
        """
        if not text.strip():
            return {"error": "Empty text"}

        try:
            # First, check the overall toxicity of the full text
            result = self.classifier(text)

            # Handle different model output formats (list vs. single dict)
            if isinstance(result, list):
                predictions = result
            else:
                predictions = [result]

            # Find the toxicity score
            toxic_score = 0.0
            is_toxic = False

            for pred in predictions:
                label = pred["label"].upper()
                score = pred["score"]

                # Check different possible toxic labels
                if label in ["TOXIC", "TOXICITY", "HARMFUL", "1"]:
                    toxic_score = max(toxic_score, score)
                    if score > self.toxicity_threshold:
                        is_toxic = True
                elif label in ["NOT_TOXIC", "CLEAN", "0"]:
                    # For models where a high score means NOT toxic
                    toxic_score = max(toxic_score, 1.0 - score)
                    if (1.0 - score) > self.toxicity_threshold:
                        is_toxic = True

            if not is_toxic:
                return {
                    "original_text": text,
                    "moderated_text": text,
                    "is_modified": False,
                    "toxic_score": float(toxic_score),
                    "words_replaced": 0,
                }

            # If toxic, analyze word by word to find the problematic parts
            moderated_text, words_replaced = self._moderate_by_words(text, replacement)

            return {
                "original_text": text,
                "moderated_text": moderated_text,
                "is_modified": True,
                "toxic_score": float(toxic_score),
                "words_replaced": int(words_replaced),
            }
        except Exception as e:
            return {"error": f"Moderation error: {str(e)}"}

    def _moderate_by_words(self, text: str, replacement: str) -> tuple[str, int]:
        """
        Moderate text by analyzing individual words and phrases.

        Args:
            text: Original text
            replacement: Replacement string

        Returns:
            Tuple of (moderated_text, words_replaced_count)
        """
        words = text.split()
        moderated_words = []
        words_replaced = 0

        # Check each word individually
        for word in words:
            # Clean the word for analysis (remove punctuation)
            clean_word = re.sub(r'[^\w]', '', word)
            if not clean_word:
                moderated_words.append(word)
                continue

            try:
                word_result = self.classifier(clean_word)

                # Handle different model output formats (list vs. single dict)
                if isinstance(word_result, list):
                    predictions = word_result
                else:
                    predictions = [word_result]

                is_word_toxic = False
                for pred in predictions:
                    label = pred["label"].upper()
                    score = pred["score"]

                    if label in ["TOXIC", "TOXICITY", "HARMFUL", "1"]:
                        if score > self.toxicity_threshold:
                            is_word_toxic = True
                            break
                    elif label in ["NOT_TOXIC", "CLEAN", "0"]:
                        if (1.0 - score) > self.toxicity_threshold:
                            is_word_toxic = True
                            break

                if is_word_toxic:
                    # Replace the word itself with the replacement string, keep punctuation
                    moderated_word = re.sub(r'\w+', replacement, word)
                    moderated_words.append(moderated_word)
                    words_replaced += 1
                else:
                    moderated_words.append(word)
            except Exception:
                # If analysis fails for a word, keep it as is
                moderated_words.append(word)

        return " ".join(moderated_words), words_replaced

    def moderate_batch(self, texts: List[str], replacement: str = "***") -> List[Dict]:
        """
        Moderate multiple texts.

        Args:
            texts: List of texts to moderate
            replacement: String to replace inappropriate content with

        Returns:
            List of moderation results
        """
        return [self.moderate(text, replacement) for text in texts]

    def set_threshold(self, threshold: float):
        """
        Set the toxicity threshold.

        Args:
            threshold: Threshold between 0 and 1
        """
        if 0 <= threshold <= 1:
            self.toxicity_threshold = threshold
        else:
            raise ValueError("Threshold must be between 0 and 1")
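

# Illustrative usage sketch (not part of the original module): shows how the class
# above is meant to be driven. It assumes Config.get_model("moderation") resolves to
# a binary toxicity text-classification model available locally or on the Hugging Face
# Hub; the sample strings below are placeholders.
if __name__ == "__main__":
    moderator = ContentModerator()
    moderator.set_threshold(0.6)

    samples = [
        "Have a great day!",
        "This is a perfectly harmless sentence.",
    ]
    for report in moderator.moderate_batch(samples):
        print(report)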