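"""Content moderation utilities built on a Hugging Face text-classification pipeline.

Module docstring added for documentation; it only summarizes what the code
below already does (toxicity detection and word-level masking).
"""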
import re
from typing import Dict, List, Optional

from transformers import pipeline

from src.config import Config


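# Note (assumption): `Config.get_model("moderation")` is expected to return a
# Hugging Face model identifier for a text-classification (toxicity) model;
# this module defines no fallback if that config entry is missing.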
class ContentModerator:
    """Content moderator that detects and replaces inappropriate content"""

    def __init__(self, model_name: Optional[str] = None):
        """
        Initialize the content moderation pipeline

        Args:
            model_name: Name of the model to use (optional)
        """
        self.model_name = model_name or Config.get_model("moderation")
        print(f"Loading moderation model: {self.model_name}")
        self.classifier = pipeline("text-classification", model=self.model_name)
        print("Moderation model loaded successfully!")

        # Threshold above which content is considered toxic
        self.toxicity_threshold = 0.5

    def moderate(self, text: str, replacement: str = "***") -> Dict:
        """
        Moderate content by detecting and replacing inappropriate words

        Args:
            text: Text to moderate
            replacement: String to replace inappropriate content with

        Returns:
            Dictionary with original text, moderated text, and detection info
        """
        if not text.strip():
            return {"error": "Empty text"}

        try:
            # First, check overall toxicity of the full text
            result = self.classifier(text)

            # Handle different model output formats
            if isinstance(result, list):
                predictions = result
            else:
                predictions = [result]

            # Find the toxicity score
            toxic_score = 0.0
            is_toxic = False

            for pred in predictions:
                label = pred["label"].upper()
                score = pred["score"]

                # Check the different possible toxic labels
                if label in ["TOXIC", "TOXICITY", "HARMFUL", "1"]:
                    toxic_score = max(toxic_score, score)
                    if score > self.toxicity_threshold:
                        is_toxic = True
                elif label in ["NOT_TOXIC", "CLEAN", "0"]:
                    # For models where a high score means NOT toxic
                    toxic_score = max(toxic_score, 1.0 - score)
                    if (1.0 - score) > self.toxicity_threshold:
                        is_toxic = True

            if not is_toxic:
                return {
                    "original_text": text,
                    "moderated_text": text,
                    "is_modified": False,
                    "toxic_score": float(toxic_score),
                    "words_replaced": 0
                }

            # If toxic, analyze word by word to find the problematic parts
            moderated_text, words_replaced = self._moderate_by_words(text, replacement)

            return {
                "original_text": text,
                "moderated_text": moderated_text,
                "is_modified": True,
                "toxic_score": float(toxic_score),
                "words_replaced": int(words_replaced)
            }

        except Exception as e:
            return {"error": f"Moderation error: {str(e)}"}

    def _moderate_by_words(self, text: str, replacement: str) -> tuple[str, int]:
        """
        Moderate text by analyzing individual words and phrases

        Args:
            text: Original text
            replacement: Replacement string

        Returns:
            Tuple of (moderated_text, words_replaced_count)
        """
        words = text.split()
        moderated_words = []
        words_replaced = 0

        # Check individual words
        for word in words:
            # Clean the word for analysis (remove punctuation)
            clean_word = re.sub(r'[^\w]', '', word)
            if not clean_word:
                moderated_words.append(word)
                continue

            try:
                word_result = self.classifier(clean_word)

                # Handle different model output formats
                if isinstance(word_result, list):
                    predictions = word_result
                else:
                    predictions = [word_result]

                is_word_toxic = False
                for pred in predictions:
                    label = pred["label"].upper()
                    score = pred["score"]

                    if label in ["TOXIC", "TOXICITY", "HARMFUL", "1"]:
                        if score > self.toxicity_threshold:
                            is_word_toxic = True
                            break
                    elif label in ["NOT_TOXIC", "CLEAN", "0"]:
                        if (1.0 - score) > self.toxicity_threshold:
                            is_word_toxic = True
                            break

                if is_word_toxic:
                    # Replace the word's characters with the replacement string,
                    # keeping surrounding punctuation intact
                    moderated_word = re.sub(r'\w+', replacement, word)
                    moderated_words.append(moderated_word)
                    words_replaced += 1
                else:
                    moderated_words.append(word)

            except Exception:
                # If analysis fails for a word, keep it as is
                moderated_words.append(word)

        return " ".join(moderated_words), words_replaced

    def moderate_batch(self, texts: List[str], replacement: str = "***") -> List[Dict]:
        """
        Moderate multiple texts

        Args:
            texts: List of texts to moderate
            replacement: String to replace inappropriate content with

        Returns:
            List of moderation results
        """
        return [self.moderate(text, replacement) for text in texts]

    def set_threshold(self, threshold: float):
        """
        Set the toxicity threshold

        Args:
            threshold: Threshold between 0 and 1
        """
        if 0 <= threshold <= 1:
            self.toxicity_threshold = threshold
        else:
            raise ValueError("Threshold must be between 0 and 1")
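

# --- Usage sketch (illustrative, not part of the class) ---
# A minimal example of how this class might be exercised, assuming the
# `src.config` module resolves "moderation" to a toxicity text-classification
# model on the Hugging Face Hub. The sample strings below are placeholders,
# not values taken from this repository.
if __name__ == "__main__":
    moderator = ContentModerator()   # or ContentModerator("some/toxicity-model")
    moderator.set_threshold(0.7)     # require higher confidence before masking

    single = moderator.moderate("You are a wonderful person!")
    print(single.get("moderated_text"), single.get("is_modified"))

    batch = moderator.moderate_batch([
        "Have a great day!",
        "Another sample sentence to check.",
    ])
    for item in batch:
        # Each result is either an error dict or a moderation summary
        print(item.get("moderated_text", item.get("error")))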