ai-lab-transformers-playground/src/pipelines/moderation.py

import re
from typing import Dict, List, Optional

from transformers import pipeline

from src.config import Config


class ContentModerator:
    """Content moderator that detects and replaces inappropriate content."""

    def __init__(self, model_name: Optional[str] = None):
        """
        Initialize the content moderation pipeline.

        Args:
            model_name: Name of the model to use (optional)
        """
        self.model_name = model_name or Config.get_model("moderation")
        print(f"Loading moderation model: {self.model_name}")
        self.classifier = pipeline("text-classification", model=self.model_name)
        print("Moderation model loaded successfully!")

        # Threshold above which content is considered toxic
        self.toxicity_threshold = 0.5

    def moderate(self, text: str, replacement: str = "***") -> Dict:
        """
        Moderate content by detecting and replacing inappropriate words.

        Args:
            text: Text to moderate
            replacement: String to replace inappropriate content with

        Returns:
            Dictionary with original text, moderated text, and detection info
        """
        if not text.strip():
            return {"error": "Empty text"}

        try:
            # First, check the overall toxicity of the full text
            result = self.classifier(text)

            # Handle different model output formats (list vs. single dict)
            if isinstance(result, list):
                predictions = result
            else:
                predictions = [result]

            # Find the toxicity score
            toxic_score = 0.0
            is_toxic = False

            for pred in predictions:
                label = pred["label"].upper()
                score = pred["score"]

                # Check different possible toxic labels
                if label in ["TOXIC", "TOXICITY", "HARMFUL", "1"]:
                    toxic_score = max(toxic_score, score)
                    if score > self.toxicity_threshold:
                        is_toxic = True
                elif label in ["NOT_TOXIC", "CLEAN", "0"]:
                    # For models where a high score means NOT toxic
                    toxic_score = max(toxic_score, 1.0 - score)
                    if (1.0 - score) > self.toxicity_threshold:
                        is_toxic = True

            if not is_toxic:
                return {
                    "original_text": text,
                    "moderated_text": text,
                    "is_modified": False,
                    "toxic_score": float(toxic_score),
                    "words_replaced": 0,
                }

            # If toxic, analyze word by word to find the problematic parts
            moderated_text, words_replaced = self._moderate_by_words(text, replacement)

            return {
                "original_text": text,
                "moderated_text": moderated_text,
                "is_modified": True,
                "toxic_score": float(toxic_score),
                "words_replaced": int(words_replaced),
            }
        except Exception as e:
            return {"error": f"Moderation error: {str(e)}"}

    def _moderate_by_words(self, text: str, replacement: str) -> tuple[str, int]:
        """
        Moderate text by analyzing individual words and phrases.

        Args:
            text: Original text
            replacement: Replacement string

        Returns:
            Tuple of (moderated_text, words_replaced_count)
        """
        words = text.split()
        moderated_words = []
        words_replaced = 0

        # Check each word individually
        for word in words:
            # Clean the word for analysis (remove punctuation)
            clean_word = re.sub(r'[^\w]', '', word)
            if not clean_word:
                moderated_words.append(word)
                continue

            try:
                word_result = self.classifier(clean_word)

                # Handle different model output formats (list vs. single dict)
                if isinstance(word_result, list):
                    predictions = word_result
                else:
                    predictions = [word_result]

                is_word_toxic = False
                for pred in predictions:
                    label = pred["label"].upper()
                    score = pred["score"]

                    if label in ["TOXIC", "TOXICITY", "HARMFUL", "1"]:
                        if score > self.toxicity_threshold:
                            is_word_toxic = True
                            break
                    elif label in ["NOT_TOXIC", "CLEAN", "0"]:
                        if (1.0 - score) > self.toxicity_threshold:
                            is_word_toxic = True
                            break

                if is_word_toxic:
                    # Replace the word itself with the replacement string, keep punctuation
                    moderated_word = re.sub(r'\w+', replacement, word)
                    moderated_words.append(moderated_word)
                    words_replaced += 1
                else:
                    moderated_words.append(word)
            except Exception:
                # If analysis fails for a word, keep it as is
                moderated_words.append(word)

        return " ".join(moderated_words), words_replaced

    def moderate_batch(self, texts: List[str], replacement: str = "***") -> List[Dict]:
        """
        Moderate multiple texts.

        Args:
            texts: List of texts to moderate
            replacement: String to replace inappropriate content with

        Returns:
            List of moderation results
        """
        return [self.moderate(text, replacement) for text in texts]

    def set_threshold(self, threshold: float):
        """
        Set the toxicity threshold.

        Args:
            threshold: Threshold between 0 and 1
        """
        if 0 <= threshold <= 1:
            self.toxicity_threshold = threshold
        else:
            raise ValueError("Threshold must be between 0 and 1")
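

# Illustrative usage sketch (not part of the original module): shows how the class
# above is meant to be driven. It assumes Config.get_model("moderation") resolves to
# a binary toxicity text-classification model available locally or on the Hugging Face
# Hub; the sample strings below are placeholders.
if __name__ == "__main__":
    moderator = ContentModerator()
    moderator.set_threshold(0.6)

    samples = [
        "Have a great day!",
        "This is a perfectly harmless sentence.",
    ]
    for report in moderator.moderate_batch(samples):
        print(report)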