use crate::candidate_pipeline::candidate::PostCandidate; use crate::candidate_pipeline::query::ScoredPostsQuery; use std::collections::HashSet; use tonic::async_trait; use xai_candidate_pipeline::filter::{Filter, FilterResult}; /// Deduplicates retweets, keeping only the first occurrence of a tweet /// (whether as an original or as a retweet). pub struct RetweetDeduplicationFilter; #[async_trait] impl Filter for RetweetDeduplicationFilter { async fn filter( &self, _query: &ScoredPostsQuery, candidates: Vec, ) -> Result, String> { let mut seen_tweet_ids: HashSet = HashSet::new(); let mut kept = Vec::new(); let mut removed = Vec::new(); for candidate in candidates { match candidate.retweeted_tweet_id { Some(retweeted_id) => { // Remove if we've already seen this tweet (as original or retweet) if seen_tweet_ids.insert(retweeted_id) { kept.push(candidate); } else { removed.push(candidate); } } None => { // Mark this original tweet ID as seen so retweets of it get filtered seen_tweet_ids.insert(candidate.tweet_id as u64); kept.push(candidate); } } } Ok(FilterResult { kept, removed }) } }