Bases: BasePointWiseReward
Detoxify: Detecting different types of toxicity, like threats, obscenity, insults, and so on.
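The reward wraps the Detoxify library and inverts its overall toxicity probability so that higher scores mean safer text. A minimal sketch of that scoring step, using the real detoxify package (the example text is purely illustrative):

```python
from detoxify import Detoxify

# Detoxify returns a dict of per-category probabilities in [0, 1],
# e.g. "toxicity", "obscene", "insult", "threat", ...
predictions = Detoxify("unbiased").predict("You are a wonderful person.")

# The reward keeps only the overall "toxicity" head and inverts it,
# so less toxic text receives a higher reward.
reward_score = 1.0 - predictions["toxicity"]
print(f"toxicity={predictions['toxicity']:.3f}, reward={reward_score:.3f}")
```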
Source code in rm_gallery/gallery/rm/alignment/harmlessness/detoxification.py
@RewardRegistry.register("detoxify_reward")
class DetoxifyReward(BasePointWiseReward):
    """Detoxify: Detecting different types of toxicity, like threats, obscenity, insults, and so on."""

    name: str = Field(default="detoxify", description="Name of the reward module")
    model_name: str = Field(
        default="unbiased", description="Name of the Detoxify model to use"
    )

    @property
    def model(self):
        if not hasattr(self, "_model"):
            from detoxify import Detoxify

            self._model = Detoxify(self.model_name)
        return self._model

    def _evaluate(self, sample: DataSample, **kwargs) -> RewardResult:
        """
        Evaluate text toxicity using the Detoxify model.

        Args:
            sample: Input data sample containing text to evaluate
            **kwargs: Additional implementation-specific parameters

        Returns:
            RewardResult: Computed reward metrics and metadata
        """
        try:
            # Get text from sample
            text = sample.output[0] if sample.output else sample.input
            if not text:
                raise ValueError("No text provided for evaluation")

            # Get model predictions
            predictions = self.model.predict(text)

            # Convert toxicity score to reward (higher = less toxic)
            toxicity_score = predictions["toxicity"]
            reward_score = 1.0 - toxicity_score  # Invert score so higher is better

            # Create reward dimension
            reward_dimension = RewardDimensionWithScore(
                name="detoxify",
                score=reward_score,
                reason=f"Text toxicity score: {toxicity_score:.2f}. Higher reward indicates less toxic content.",
            )

            return RewardResult(name=self.name, details=[reward_dimension])
        except Exception as e:
            logger.error(f"Error in Detoxify evaluation: {str(e)}")
            return RewardResult(name=self.name, details=[])
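A hedged usage sketch: the class exposes a model_name field (Detoxify ships "original", "unbiased", and "multilingual" checkpoints) and lazily loads the model through the model property, so raw per-category scores can be inspected directly. The import path is inferred from the "Source code in" line above, and the sketch assumes the base class requires no additional constructor arguments; how the surrounding pipeline builds DataSample objects is not shown here, so the example sticks to the pieces visible in the listed source.

```python
# Assumes the module path matches the "Source code in" line above and that
# DetoxifyReward can be constructed from its declared fields alone.
from rm_gallery.gallery.rm.alignment.harmlessness.detoxification import DetoxifyReward

# Pick a Detoxify checkpoint; "unbiased" is the default shown in the source.
reward = DetoxifyReward(model_name="multilingual")

# The `model` property instantiates detoxify.Detoxify on first access and caches it,
# so raw predictions can be checked before wiring up DataSample inputs.
raw = reward.model.predict("I will find you and hurt you.")
print({k: round(float(v), 3) for k, v in raw.items()})
```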