Keyword Extraction Algorithms for Reddit Text Analysis
From TF-IDF basics to transformer-based extraction for Reddit-scale text processing
Keyword extraction transforms unstructured Reddit discussions into searchable, analyzable data. Whether you are building a trend detection system, a topic classifier, or a search engine, the extraction algorithm you choose significantly affects results. This guide compares four approaches, from simple statistical methods (TF-IDF, RAKE, YAKE) to transformer-based extraction (KeyBERT), and closes with an ensemble setup for production.
Reddit posts average 150 words but contain only 5-10 truly relevant keywords. Effective extraction reduces noise by 90% while preserving semantic meaning for downstream analysis.
Text Preprocessing for Reddit
Reddit text requires specialized preprocessing to handle platform-specific patterns before keyword extraction.
````python
import re
import html
from typing import List

import emoji


class RedditTextPreprocessor:
    """Preprocess Reddit text for keyword extraction."""

    def __init__(self, keep_entities: bool = True):
        self.keep_entities = keep_entities

        # Reddit-specific patterns
        self.patterns = {
            'url': re.compile(r'https?://\S+'),
            'subreddit': re.compile(r'r/(\w+)'),
            'user': re.compile(r'u/(\w+)'),
            'markdown_link': re.compile(r'\[([^\]]+)\]\([^\)]+\)'),
            'markdown_bold': re.compile(r'\*\*(.*?)\*\*'),
            'markdown_italic': re.compile(r'\*(.*?)\*'),
            'markdown_strike': re.compile(r'~~(.*?)~~'),
            'code_block': re.compile(r'```[\s\S]*?```'),
            'inline_code': re.compile(r'`([^`]+)`'),
            'quote': re.compile(r'^>.*$', re.MULTILINE),
        }

        # Reddit slang expansions
        self.slang_map = {
            'imo': 'in my opinion',
            'imho': 'in my humble opinion',
            'eli5': 'explain like i am five',
            'til': 'today i learned',
            'tl;dr': 'summary',
            'tldr': 'summary',
            'afaik': 'as far as i know',
            'iirc': 'if i recall correctly',
            'fwiw': 'for what it is worth',
            'ymmv': 'your mileage may vary',
        }

    def clean(self, text: str) -> str:
        """Full cleaning pipeline for Reddit text."""
        if not text:
            return ''

        # Decode HTML entities
        text = html.unescape(text)

        # Drop code blocks, keep the content of inline code
        text = self.patterns['code_block'].sub(' ', text)
        text = self.patterns['inline_code'].sub(r'\1', text)

        # Remove quoted replies
        text = self.patterns['quote'].sub('', text)

        # Keep subreddit names as topic signal; usernames are dropped either way
        if self.keep_entities:
            text = self.patterns['subreddit'].sub(r'\1', text)
        else:
            text = self.patterns['subreddit'].sub('', text)
        text = self.patterns['user'].sub('', text)

        # Clean markdown links first so the link text survives URL removal
        text = self.patterns['markdown_link'].sub(r'\1', text)

        # Remove bare URLs
        text = self.patterns['url'].sub('', text)

        # Clean remaining markdown emphasis
        text = self.patterns['markdown_bold'].sub(r'\1', text)
        text = self.patterns['markdown_italic'].sub(r'\1', text)
        text = self.patterns['markdown_strike'].sub(r'\1', text)

        # Convert emoji to text (optional)
        text = emoji.demojize(text, delimiters=(" ", " "))

        # Expand slang
        for slang, expansion in self.slang_map.items():
            text = re.sub(
                rf'\b{slang}\b', expansion, text, flags=re.IGNORECASE
            )

        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def tokenize(self, text: str) -> List[str]:
        """Simple word tokenization."""
        text = self.clean(text)
        # Keep alphanumeric and hyphens for compound words
        tokens = re.findall(r'\b[\w-]+\b', text.lower())
        return tokens


# Usage
preprocessor = RedditTextPreprocessor()
cleaned = preprocessor.clean("""
TIL that r/programming has great resources!
Check out this **amazing** [tutorial](https://example.com)
> Someone quoted this
```python
print("code block")
```
IMO this is really helpful FWIW
""")
print(cleaned)
# Output: "today i learned that programming has great resources! Check out this
# amazing tutorial in my opinion this is really helpful for what it is worth"
````
TF-IDF Keyword Extraction
Term Frequency-Inverse Document Frequency (TF-IDF) identifies keywords that are frequent in a document but rare across the corpus. It remains a strong default for large-scale keyword extraction because it is simple, fast, and effective. The table below breaks down the score, and a short worked example follows it.
| Component | Formula | Purpose |
|---|---|---|
| Term Frequency (TF) | count(t,d) / len(d) | How often the term appears in the document |
| Inverse Document Frequency (IDF) | log(N / df(t)) | How rare the term is across the corpus (N documents in total, df(t) containing the term) |
| TF-IDF Score | TF * IDF | Combined importance of the term to the document |
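To make the formulas concrete, here is a minimal self-contained sketch that computes the scores by hand for a toy corpus. The documents and helper names are illustrative only, not part of any library.

```python
import math

# Toy corpus: each "document" is a tokenized Reddit post (illustrative only)
docs = [
    ["gpu", "prices", "are", "dropping", "fast"],
    ["new", "gpu", "benchmarks", "released"],
    ["prices", "for", "used", "cars", "are", "dropping"],
]

def tf(term: str, doc: list) -> float:
    """Term frequency: count(t, d) / len(d)."""
    return doc.count(term) / len(doc)

def idf(term: str, corpus: list) -> float:
    """Inverse document frequency: log(N / df(t))."""
    df = sum(1 for d in corpus if term in d)
    return math.log(len(corpus) / df) if df else 0.0

def tfidf(term: str, doc: list, corpus: list) -> float:
    return tf(term, doc) * idf(term, corpus)

# "gpu" appears in 2 of 3 documents, "benchmarks" in only 1,
# so "benchmarks" gets the higher weight for document 1.
for term in ["gpu", "benchmarks"]:
    print(term, round(tfidf(term, docs[1], docs), 3))
```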
```python
from typing import List, Tuple, Dict

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


class TFIDFKeywordExtractor:
    """
    TF-IDF based keyword extraction for Reddit posts.

    Best for: large corpus analysis, trend detection across subreddits.
    """

    def __init__(
        self,
        max_features: int = 10000,
        ngram_range: Tuple[int, int] = (1, 2),
        min_df: int = 2,
        max_df: float = 0.95
    ):
        self.vectorizer = TfidfVectorizer(
            max_features=max_features,
            ngram_range=ngram_range,
            min_df=min_df,
            max_df=max_df,
            stop_words='english',
            sublinear_tf=True  # Use 1 + log(tf) to dampen high counts
        )
        self.fitted = False

    def fit(self, documents: List[str]):
        """Fit vectorizer on corpus of documents."""
        self.vectorizer.fit(documents)
        self.feature_names = self.vectorizer.get_feature_names_out()
        self.fitted = True
        return self

    def extract_keywords(
        self,
        text: str,
        top_n: int = 10
    ) -> List[Tuple[str, float]]:
        """Extract top keywords from a single document."""
        if not self.fitted:
            raise ValueError("Call fit() first with corpus")

        # Transform single document
        tfidf_vector = self.vectorizer.transform([text])
        scores = tfidf_vector.toarray()[0]

        # Get top keywords by score
        top_indices = scores.argsort()[-top_n:][::-1]
        keywords = [
            (self.feature_names[i], scores[i])
            for i in top_indices
            if scores[i] > 0
        ]
        return keywords

    def extract_corpus_keywords(
        self,
        documents: List[str],
        top_n: int = 20
    ) -> List[Tuple[str, float]]:
        """Extract most important keywords across entire corpus."""
        tfidf_matrix = self.vectorizer.transform(documents)

        # Average TF-IDF scores across all documents
        mean_scores = np.asarray(tfidf_matrix.mean(axis=0)).flatten()
        top_indices = mean_scores.argsort()[-top_n:][::-1]
        return [
            (self.feature_names[i], mean_scores[i])
            for i in top_indices
        ]

    def compare_subreddits(
        self,
        subreddit_docs: Dict[str, List[str]],
        top_n: int = 10
    ) -> Dict[str, List[Tuple[str, float]]]:
        """Find distinctive keywords for each subreddit."""
        results = {}
        for subreddit, docs in subreddit_docs.items():
            # Combine all docs for this subreddit
            combined = ' '.join(docs)
            results[subreddit] = self.extract_keywords(combined, top_n)
        return results


# Usage example
extractor = TFIDFKeywordExtractor(ngram_range=(1, 3))

# Fit on corpus
corpus = [
    "Machine learning models for sentiment analysis",
    "Deep learning neural networks outperform traditional methods",
    "Python libraries for natural language processing",
    # ... more documents
]
extractor.fit(corpus)

# Extract from new document
keywords = extractor.extract_keywords(
    "The new transformer models achieve state of the art results",
    top_n=5
)
for kw, score in keywords:
    print(f"{kw}: {score:.4f}")
```
RAKE Algorithm
Rapid Automatic Keyword Extraction (RAKE) identifies keywords based on word co-occurrence patterns. Unlike TF-IDF, RAKE works on single documents without requiring a corpus, making it ideal for real-time processing.
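Concretely, RAKE splits text into candidate phrases at stopwords and punctuation, scores each word by its degree-to-frequency ratio (how many words it co-occurs with inside phrases, divided by how often it appears), and sums the word scores per phrase. Below is a simplified sketch of that idea, with a deliberately truncated stopword list for illustration; the rake_nltk library used afterwards handles all of this for you.

```python
import re
from collections import defaultdict

# Truncated stopword list for illustration; rake_nltk uses the full NLTK list
STOPWORDS = {"the", "a", "an", "is", "of", "and", "has", "this", "it"}

def rake_sketch(text: str, top_n: int = 3):
    # 1. Split into candidate phrases at stopwords and punctuation
    words = re.findall(r"[a-zA-Z0-9]+", text.lower())
    phrases, current = [], []
    for w in words:
        if w in STOPWORDS:
            if current:
                phrases.append(current)
            current = []
        else:
            current.append(w)
    if current:
        phrases.append(current)

    # 2. Word scores: degree (co-occurrences within phrases) / frequency
    freq, degree = defaultdict(int), defaultdict(int)
    for phrase in phrases:
        for w in phrase:
            freq[w] += 1
            degree[w] += len(phrase)  # counts co-occurring words incl. itself

    # 3. Phrase score = sum of member word scores
    scored = [
        (" ".join(p), sum(degree[w] / freq[w] for w in p))
        for p in phrases
    ]
    return sorted(scored, key=lambda x: x[1], reverse=True)[:top_n]

print(rake_sketch("The new camera system of the iPhone has a great camera sensor"))
```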
```python
from typing import List, Tuple

import nltk
from nltk.corpus import stopwords
from rake_nltk import Rake

# Download required NLTK data
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)


class RAKEKeywordExtractor:
    """
    RAKE-based keyword extraction.

    Best for: single document analysis, real-time extraction,
    extracting multi-word phrases.
    """

    def __init__(
        self,
        min_length: int = 1,
        max_length: int = 4,
        include_repeated_phrases: bool = False
    ):
        # Add Reddit-specific stopwords on top of the standard English list.
        # They must be passed to the constructor, because Rake builds its
        # ignore set at initialization time.
        reddit_stopwords = {
            'edit', 'update', 'deleted', 'removed', 'reddit',
            'subreddit', 'post', 'comment', 'upvote', 'downvote', 'karma'
        }
        all_stopwords = set(stopwords.words('english')) | reddit_stopwords

        self.rake = Rake(
            stopwords=all_stopwords,
            min_length=min_length,
            max_length=max_length,
            include_repeated_phrases=include_repeated_phrases
        )

    def extract_keywords(
        self,
        text: str,
        top_n: int = 10
    ) -> List[Tuple[str, float]]:
        """Extract keywords from text using the RAKE algorithm."""
        self.rake.extract_keywords_from_text(text)

        # Get ranked phrases with scores (highest score first)
        ranked = self.rake.get_ranked_phrases_with_scores()

        # Return top N (phrase, score) pairs
        return [(phrase, score) for score, phrase in ranked[:top_n]]

    def extract_from_posts(
        self,
        posts: List[str],
        aggregate: bool = True
    ) -> List[Tuple[str, float]]:
        """Extract keywords from multiple posts."""
        if aggregate:
            # Combine all posts for aggregate analysis
            combined = ' '.join(posts)
            return self.extract_keywords(combined)
        else:
            # Return keywords per post
            return [self.extract_keywords(post) for post in posts]


# Usage
rake_extractor = RAKEKeywordExtractor(max_length=4)

text = """
The new iPhone 15 Pro Max has an amazing camera system.
Apple really improved the computational photography this year.
The titanium frame makes it lighter than previous models.
Battery life is significantly better than the iPhone 14.
"""

keywords = rake_extractor.extract_keywords(text, top_n=5)
for phrase, score in keywords:
    print(f"{phrase}: {score:.2f}")

# Example output:
# iphone 15 pro max: 16.0
# amazing camera system: 9.0
# computational photography: 8.0
# titanium frame makes: 7.5
# battery life significantly: 6.5
```
YAKE Algorithm
Yet Another Keyword Extractor (YAKE) relies on local statistical features of the document itself (term position, frequency, casing, and context dispersion) and needs no external knowledge or training. It works across languages and domains, which suits Reddit's diverse content.
```python
from typing import List, Tuple

import yake


class YAKEKeywordExtractor:
    """
    YAKE-based keyword extraction.

    Best for: unsupervised extraction, multilingual content,
    no corpus requirement.
    """

    def __init__(
        self,
        language: str = "en",
        max_ngram_size: int = 3,
        deduplication_threshold: float = 0.9,
        deduplication_algo: str = 'seqm',
        window_size: int = 1
    ):
        self.extractor = yake.KeywordExtractor(
            lan=language,
            n=max_ngram_size,
            dedupLim=deduplication_threshold,
            dedupFunc=deduplication_algo,
            windowsSize=window_size,
            top=20,
            features=None
        )

    def extract_keywords(
        self,
        text: str,
        top_n: int = 10
    ) -> List[Tuple[str, float]]:
        """
        Extract keywords using YAKE.

        Note: raw YAKE scores are inverted (lower = more important).
        We normalize to a 0-1 range with higher = better.
        """
        keywords = self.extractor.extract_keywords(text)

        # Normalize scores (YAKE uses lower = better)
        if keywords:
            max_score = max(score for _, score in keywords)
            normalized = [
                (kw, 1 - (score / (max_score + 0.001)))
                for kw, score in keywords[:top_n]
            ]
            return normalized
        return []

    def batch_extract(
        self,
        texts: List[str],
        top_n: int = 5
    ) -> List[List[Tuple[str, float]]]:
        """Extract keywords from multiple texts."""
        return [self.extract_keywords(text, top_n) for text in texts]


# Usage
yake_extractor = YAKEKeywordExtractor(max_ngram_size=3)

text = """
Google just released Gemini 2.0, their latest large language model.
It reportedly outperforms GPT-4 on several benchmarks including
reasoning and coding tasks. The model is available through
Google Cloud API.
"""

keywords = yake_extractor.extract_keywords(text, top_n=5)
for kw, score in keywords:
    print(f"{kw}: {score:.3f}")

# Example output:
# gemini 2.0: 0.892
# large language model: 0.845
# google cloud api: 0.789
# gpt-4: 0.756
# coding tasks: 0.712
```
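Because YAKE relies only on per-language stopword lists and document-internal statistics, switching languages is a one-parameter change. A small sketch reusing the wrapper above; the Spanish sample text is illustrative.

```python
# YAKE ships stopword lists for many languages; switching is one parameter
spanish_extractor = YAKEKeywordExtractor(language="es", max_ngram_size=2)

spanish_text = (
    "El nuevo modelo de lenguaje de Google supera a sus "
    "competidores en varias pruebas."
)
for kw, score in spanish_extractor.extract_keywords(spanish_text, top_n=3):
    print(f"{kw}: {score:.3f}")
```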
KeyBERT: Semantic Keyword Extraction
KeyBERT ranks candidate phrases by the similarity of their transformer embeddings to the embedding of the whole document, so it captures context and synonyms rather than just surface frequency. That makes it well suited to nuanced Reddit discussions.
KeyBERT provides the highest quality keywords but requires GPU for production speeds. Expect ~50ms/document on GPU vs ~500ms on CPU. Consider batching for efficiency.
```python
from typing import List, Tuple

from keybert import KeyBERT


class KeyBERTExtractor:
    """
    KeyBERT-based semantic keyword extraction.

    Best for: high-quality extraction, semantic understanding,
    handling synonyms and context.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        # Use a fast, reasonably accurate sentence-embedding model
        self.model = KeyBERT(model=model_name)

    def extract_keywords(
        self,
        text: str,
        top_n: int = 10,
        keyphrase_ngram_range: Tuple[int, int] = (1, 2),
        use_mmr: bool = True,
        diversity: float = 0.5
    ) -> List[Tuple[str, float]]:
        """
        Extract keywords using BERT embeddings.

        Args:
            text: Document text
            top_n: Number of keywords to extract
            keyphrase_ngram_range: Min and max n-gram sizes
            use_mmr: Use Maximal Marginal Relevance for diversity
            diversity: Higher = more diverse keywords (0-1)
        """
        keywords = self.model.extract_keywords(
            text,
            keyphrase_ngram_range=keyphrase_ngram_range,
            stop_words='english',
            use_mmr=use_mmr,
            diversity=diversity,
            top_n=top_n
        )
        return keywords

    def extract_with_seed(
        self,
        text: str,
        seed_keywords: List[str],
        top_n: int = 10
    ) -> List[Tuple[str, float]]:
        """
        Extract keywords guided by seed keywords.
        Useful for domain-specific extraction.
        """
        keywords = self.model.extract_keywords(
            text,
            seed_keywords=seed_keywords,
            top_n=top_n,
            stop_words='english'
        )
        return keywords

    def batch_extract(
        self,
        texts: List[str],
        top_n: int = 5
    ) -> List[List[Tuple[str, float]]]:
        """Extract keywords from multiple documents, one at a time."""
        results = []
        for text in texts:
            keywords = self.extract_keywords(text, top_n=top_n)
            results.append(keywords)
        return results


# Usage
keybert = KeyBERTExtractor()

text = """
After using the M3 MacBook Pro for two months, I can confidently say
the Apple Silicon performance is incredible. The unified memory
architecture handles large ML models without issue. Battery lasts
all day even with heavy compilation workloads. Only downside is
the lack of upgradability.
"""

# Standard extraction
keywords = keybert.extract_keywords(text, top_n=5, diversity=0.7)
print("Standard extraction:")
for kw, score in keywords:
    print(f"  {kw}: {score:.3f}")

# Guided extraction with seed keywords
seed_keywords = ['laptop', 'computer', 'performance']
guided = keybert.extract_with_seed(text, seed_keywords, top_n=5)
print("\nGuided extraction:")
for kw, score in guided:
    print(f"  {kw}: {score:.3f}")
```
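To approach the GPU throughput mentioned above, you can pin the embedding model to a CUDA device and pass a list of documents in a single call so they are embedded together. A sketch, assuming a CUDA-capable GPU and the all-MiniLM-L6-v2 model are available locally:

```python
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

# Load the embedding model on GPU (assumes CUDA is available)
embedder = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")
gpu_keybert = KeyBERT(model=embedder)

# Passing a list of documents lets KeyBERT embed them together,
# which is where most of the batching benefit comes from.
posts = [
    "The M3 MacBook Pro handles local LLM inference surprisingly well.",
    "Battery life on the new ultrabooks has improved a lot this year.",
]
batch_keywords = gpu_keybert.extract_keywords(
    posts,
    keyphrase_ngram_range=(1, 2),
    stop_words="english",
    top_n=5
)
for post_keywords in batch_keywords:
    print(post_keywords)
```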
Algorithm Comparison
Choose your extraction algorithm based on your specific requirements. The table below compares the approaches across key dimensions; a small selection heuristic follows it.
| Algorithm | Speed | Accuracy | Corpus Needed | Multi-word | Best Use Case |
|---|---|---|---|---|---|
| TF-IDF | Very Fast | Good | Yes | With n-grams | Large-scale analysis |
| RAKE | Fast | Medium | No | Excellent | Real-time processing |
| YAKE | Fast | Good | No | Good | Multilingual content |
| KeyBERT | Slow (GPU: Fast) | Excellent | No | Excellent | Quality-critical apps |
| Ensemble | Variable | Best | Depends | Excellent | Production systems |
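One way to turn the table into code is a small selection heuristic like the sketch below. The thresholds and requirement names are illustrative, not prescriptive.

```python
def choose_extractor(
    has_corpus: bool,
    latency_budget_ms: float,
    has_gpu: bool,
    multilingual: bool = False
) -> str:
    """Rough heuristic mirroring the comparison table above."""
    if multilingual:
        return "yake"
    if latency_budget_ms < 50 and not has_gpu:
        # Real-time on CPU: RAKE needs no corpus and is fast
        return "rake"
    if has_corpus:
        # Corpus available: TF-IDF scales well and catches trends
        return "tfidf"
    if has_gpu or latency_budget_ms >= 500:
        # Quality-critical with affordable latency: KeyBERT
        return "keybert"
    return "yake"

print(choose_extractor(has_corpus=False, latency_budget_ms=20, has_gpu=False))  # rake
```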
Keyword Extraction Built-In
reddapi.dev automatically extracts keywords from Reddit posts using optimized ensemble methods. Focus on insights, not infrastructure.
Production Optimization
Optimize keyword extraction for production workloads with caching, batching, and hybrid approaches.
```python
import hashlib
from collections import defaultdict
from typing import List, Tuple, Dict


class EnsembleKeywordExtractor:
    """
    Combine multiple extractors for robust keyword extraction.
    Uses voting and score fusion for best results.
    """

    def __init__(
        self,
        extractors: Dict[str, object],
        weights: Dict[str, float] = None
    ):
        self.extractors = extractors
        self.weights = weights or {k: 1.0 for k in extractors}
        self.cache = {}

    def _cache_key(self, text: str) -> str:
        """Generate cache key for text."""
        return hashlib.md5(text.encode()).hexdigest()

    def extract_keywords(
        self,
        text: str,
        top_n: int = 10,
        use_cache: bool = True
    ) -> List[Tuple[str, float]]:
        """
        Extract keywords using an ensemble of extractors.
        Combines results through weighted score fusion.
        """
        cache_key = self._cache_key(text)
        if use_cache and cache_key in self.cache:
            return self.cache[cache_key]

        # Collect keywords from all extractors
        all_keywords = defaultdict(float)

        for name, extractor in self.extractors.items():
            weight = self.weights.get(name, 1.0)
            try:
                keywords = extractor.extract_keywords(text, top_n=top_n * 2)
                # Normalize scores to 0-1 range per extractor
                if keywords:
                    max_score = max(score for _, score in keywords) or 1
                    for kw, score in keywords:
                        normalized = score / max_score
                        all_keywords[kw.lower()] += normalized * weight
            except Exception as e:
                print(f"Extractor {name} failed: {e}")
                continue

        # Sort by combined score
        sorted_keywords = sorted(
            all_keywords.items(),
            key=lambda x: x[1],
            reverse=True
        )[:top_n]

        # Cache results
        if use_cache:
            self.cache[cache_key] = sorted_keywords

        return sorted_keywords

    def extract_with_voting(
        self,
        text: str,
        top_n: int = 10,
        min_votes: int = 2
    ) -> List[Tuple[str, int]]:
        """
        Extract keywords that appear in multiple extractors.
        Returns keywords with vote count.
        """
        keyword_votes = defaultdict(int)

        for extractor in self.extractors.values():
            try:
                keywords = extractor.extract_keywords(text, top_n=top_n * 2)
                for kw, _ in keywords:
                    keyword_votes[kw.lower()] += 1
            except Exception:
                continue

        # Filter by minimum votes
        voted_keywords = [
            (kw, votes) for kw, votes in keyword_votes.items()
            if votes >= min_votes
        ]
        return sorted(voted_keywords, key=lambda x: x[1], reverse=True)[:top_n]


# Setup ensemble with the extractors built in the previous sections
# (the TF-IDF extractor must already be fitted on a corpus)
ensemble = EnsembleKeywordExtractor(
    extractors={
        'tfidf': extractor,
        'rake': rake_extractor,
        'yake': yake_extractor,
    },
    weights={
        'tfidf': 1.2,  # Slightly prefer TF-IDF
        'rake': 1.0,
        'yake': 1.0,
    }
)

# Extract with ensemble
keywords = ensemble.extract_keywords(text, top_n=10)

# High-confidence keywords (appearing in 2+ extractors)
voted = ensemble.extract_with_voting(text, min_votes=2)
```
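The ensemble above treats every post the same. A common hybrid variation is to tier by cost: run a cheap extractor on everything and reserve KeyBERT for the posts that matter most. Below is a sketch reusing the extractors defined earlier, with a hypothetical upvote threshold deciding which path a post takes.

```python
from typing import Dict, List, Tuple


def tiered_extract(
    posts: List[Dict],
    fast_extractor,           # e.g. the RAKEKeywordExtractor from above
    quality_extractor,        # e.g. the KeyBERTExtractor from above
    hot_score: int = 500      # hypothetical upvote threshold for the slow path
) -> List[List[Tuple[str, float]]]:
    """Cheap extraction for everything, expensive extraction for hot posts."""
    results = []
    for post in posts:
        if post.get("score", 0) >= hot_score:
            results.append(quality_extractor.extract_keywords(post["text"], top_n=10))
        else:
            results.append(fast_extractor.extract_keywords(post["text"], top_n=10))
    return results


# Usage sketch (posts is a list of dicts with "text" and "score" keys)
# keywords_per_post = tiered_extract(posts, rake_extractor, keybert)
```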