#!/usr/bin/env python3
"""
Bogostian Ontographical Word Cleaner
Based on Ian Bogost's "Alien Phenomenology" principles
"""

import re
import json
import sys
from collections import Counter, defaultdict
from typing import List, Set, Tuple, Dict
import argparse
import unicodedata
import string

# Function words (articles, prepositions, conjunctions, pronouns, auxiliary
# verbs) that stage4_ontological_filtration discards.
# Entries are lowercase because membership is tested against word.lower().
# A few words are listed under more than one heading ('for', 'since', 'her');
# the set literal simply collapses those duplicates.
CONNECTING_WORDS = {
    # Articles
    'a', 'an', 'the',
    # Prepositions
    'in', 'on', 'at', 'by', 'with', 'from', 'of', 'to', 'for', 'about',
    'into', 'through', 'during', 'before', 'after', 'above', 'below',
    'between', 'under', 'along', 'across', 'behind', 'beyond', 'within',
    'without', 'upon', 'around', 'against', 'among', 'throughout',
    'despite', 'towards', 'per', 'via', 'versus', 'amid', 'amidst',
    'beneath', 'beside', 'besides', 'concerning', 'considering', 'down',
    'except', 'excluding', 'following', 'inside', 'near', 'off', 'onto',
    'opposite', 'outside', 'over', 'past', 'regarding', 'since', 'than',
    'till', 'underneath', 'unlike', 'until', 'up', 'vs',
    # Conjunctions
    'and', 'or', 'but', 'nor', 'yet', 'so', 'for', 'because', 'although',
    'though', 'while', 'if', 'unless', 'since', 'when', 'where', 'whereas',
    'whether', 'either', 'neither', 'both', 'not', 'only', 'also',
    # Pronouns
    'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her',
    'us', 'them', 'my', 'your', 'his', 'her', 'its', 'our', 'their',
    'mine', 'yours', 'hers', 'ours', 'theirs', 'myself', 'yourself',
    'himself', 'herself', 'itself', 'ourselves', 'yourselves', 'themselves',
    'this', 'that', 'these', 'those', 'who', 'whom', 'whose', 'which',
    'what', 'whoever', 'whomever', 'whichever', 'whatever', 'someone',
    # NOTE(review): 'no one' contains a space, so it cannot match a token
    # produced by extract_words_from_text, which splits on whitespace.
    'anyone', 'everyone', 'no one', 'somebody', 'anybody', 'everybody',
    'nobody', 'something', 'anything', 'everything', 'nothing',
    # Auxiliary verbs
    'is', 'are', 'was', 'were', 'been', 'being', 'be', 'am',
    'has', 'have', 'had', 'having', 'do', 'does', 'did', 'doing', 'done',
    'will', 'would', 'shall', 'should', 'may', 'might', 'must', 'can',
    'could', 'ought',
    # Common auxiliary phrases
    'let', 'make', 'need', 'used'
}

# Interface labels and web cruft commonly found in scraped content.
# stage4_ontological_filtration drops any word whose lowercase form is in
# this set, so every entry is lowercase.
# 'share' is listed under both Actions and Social media; 'about' also appears
# in CONNECTING_WORDS and 'processing' in ABSTRACT_PROCESSES. The overlaps
# are harmless because each table is only used for membership tests.
WEB_CRUFT = {
    # Navigation
    'home', 'about', 'contact', 'login', 'logout', 'signup', 'signin',
    'register', 'submit', 'cancel', 'back', 'next', 'previous', 'menu',
    'navigation', 'nav', 'footer', 'header', 'sidebar',
    # Actions
    'click', 'here', 'read', 'more', 'less', 'view', 'download', 'upload',
    'save', 'delete', 'edit', 'update', 'refresh', 'reload', 'search',
    'find', 'filter', 'sort', 'share', 'print', 'export', 'import',
    # Status/Messages
    'error', 'success', 'warning', 'info', 'alert', 'notice', 'message',
    'loading', 'please', 'wait', 'processing', 'pending', 'completed',
    # Form labels
    'username', 'password', 'email', 'phone', 'address', 'city', 'state',
    'country', 'zip', 'zipcode', 'postal', 'code', 'firstname', 'lastname',
    'name', 'title', 'description', 'comment', 'comments', 'reply',
    # Common web terms
    'page', 'post', 'article', 'blog', 'news', 'category', 'tag', 'tags',
    'archive', 'archives', 'link', 'links', 'url', 'copyright', 'privacy',
    'policy', 'terms', 'conditions', 'disclaimer', 'faq', 'help', 'support',
    # Social media
    'like', 'follow', 'subscribe', 'tweet', 'retweet', 'pin', 'share',
    # HTTP/Tech
    'http', 'https', 'www', 'com', 'org', 'net', 'html', 'css', 'js',
    'json', 'xml', 'api', 'sdk', 'docs', 'documentation'
}

# Abstract processes without materiality.
# stage4_ontological_filtration drops a word whose lowercase form is listed
# here, unless the word itself starts with an uppercase letter (it is then
# kept on the chance that it names an institution, e.g. "Administration").
ABSTRACT_PROCESSES = {
    'accessibility', 'administration', 'functionality', 'implementation',
    'management', 'organization', 'processing', 'optimization', 'utilization',
    'coordination', 'integration', 'facilitation', 'evaluation', 'assessment',
    'analysis', 'synthesis', 'development', 'maintenance', 'monitoring',
    'supervision', 'regulation', 'standardization', 'verification', 'validation',
    'authorization', 'authentication', 'configuration', 'customization',
    'deployment', 'distribution', 'execution', 'initialization', 'installation',
    'migration', 'operation', 'performance', 'production', 'provision',
    'replication', 'restoration', 'synchronization', 'transformation',
    'transmission', 'troubleshooting', 'upgrading'
}

# English dictionary used for word validation and suffix stripping.
# The system word list is preferred; when it cannot be used, a small built-in
# vocabulary is substituted (simplified for demonstration).
# In production, use a proper dictionary like NLTK's words corpus
try:
    # Explicit encoding: the default depends on the locale and can fail on
    # the accented entries found in many system word lists.
    with open('/usr/share/dict/words', 'r', encoding='utf-8') as f:
        # Blank lines are skipped so the empty string never counts as a word.
        DICTIONARY = {
            entry
            for entry in (line.strip().lower() for line in f)
            if entry
        }
except (OSError, UnicodeDecodeError):
    # Missing, unreadable or undecodable word list: fall back to a basic
    # dictionary of common English words instead of failing at import time.
    DICTIONARY = set('''
    tree bird fish dog cat human child woman man worker teacher
    lighthouse dragonfly butterfly rainbow keyboard notebook football
    baseball building painting offering wedding meeting morning evening
    spring ring king thing ceiling feeling being lightning clothing
    beloved blessed cursed wicked naked sacred hundred kindred
    hammer storm rock lake lion truck whistle gun bee piranha bear rat
    professor exterminator carbon oxygen steel bronze concrete acrylic
    vinyl wood oil salt glass plastic metal water milk vinegar
    dream marriage election ceremony tornado miracle moment deadline
    book movie letter advertisement painting sculpture photograph
    contract constitution will map mystery elegance decay momentum
    silence darkness light shadow afternoon century river valley ridge
    desert basement pillory altar table factory cathedral locomotive
    marble construction mayor street laboratory scientist quantum particle
    network administrator server rain flower artist ocean grain symphony
    infinite particular consciousness matter relationship sound laughter
    document planning initiative instrument radiation sequence protocol
    performance metric factory abandoned dance wave edge world amber
    velocity architecture city town village forest mountain meadow field
    '''.lower().split())
    # The fallback also accepts the numbers 0-999 and single ASCII letters.
    DICTIONARY.update(str(i) for i in range(1000))
    DICTIONARY.update(string.ascii_lowercase)

def extract_words_from_text(text: str) -> List[str]:
    """Tokenize raw text into words, leaving capitalization untouched.

    HTML tags are replaced by spaces first. Every remaining character that
    is not a word character, whitespace, an apostrophe or a hyphen is then
    blanked out, and the text is split on whitespace.
    """
    without_tags = re.sub(r'<[^>]+>', ' ', text)
    wordish_only = re.sub(r"[^\w\s'-]", ' ', without_tags)
    # str.split() without arguments never yields empty strings.
    return wordish_only.split()

def stage1_technical_cleaning(words: List[str]) -> List[str]:
    """Stage 1: Technical cleaning - normalization, junk removal, deduplication.

    Each word has a small set of HTML entities decoded and its surrounding
    punctuation removed. It is then dropped if it looks like a URL, an
    e-mail address, a file name or a (version) number. Surviving words are
    deduplicated case-insensitively in first-seen order; when a word occurs
    both capitalized and uncapitalized, the capitalized spelling is kept.

    Args:
        words: Raw tokens to clean.

    Returns:
        The cleaned, deduplicated words in order of first appearance.
    """
    # '&amp;' is decoded last so that '&amp;lt;' yields the text '&lt;'
    # instead of being decoded twice.
    entities = (('&nbsp;', ' '), ('&lt;', '<'), ('&gt;', '>'), ('&amp;', '&'))
    # Quotes, backslashes, brackets, underscores and hyphens are removed from
    # both ends; sentence punctuation only from the end.
    edge_junk = '\'"\\[]{}()_-'
    trailing_junk = edge_junk + '.,;:!?'
    file_name = re.compile(
        r'\.(html?|php|js|css|json|xml|pdf|doc|txt|jpg|png|gif)$', re.I)
    # Also matches plain integers such as "42".
    version_number = re.compile(r'^v?\d+(\.\d+)*$')

    position = {}  # lowercase word -> index of its entry in result
    result = []

    for word in words:
        # Decode entities before stripping: stripping first would remove the
        # ';' that terminates a trailing entity and leave it undecoded.
        for entity, char in entities:
            word = word.replace(entity, char)

        # Strip all edge punctuation in a single pass so interleaved marks
        # such as "(word)," are fully removed.
        word = word.strip().lstrip(edge_junk).rstrip(trailing_junk).strip()
        if not word:
            continue

        # Skip URLs and emails
        if '@' in word or 'http' in word or '.com' in word or '.org' in word:
            continue

        # Skip file names and version numbers
        if file_name.search(word) or version_number.match(word):
            continue

        # Case-insensitive deduplication
        key = word.lower()
        index = position.get(key)
        if index is None:
            position[key] = len(result)
            result.append(word)
        elif word[0].isupper() and not result[index][0].isupper():
            # Prefer the capitalized version (likely a proper noun).
            result[index] = word

    return result

def stage2_enjambment_separation(words: List[str]) -> List[str]:
    """Stage 2: Detect and separate compound words and phrases.

    Rules, applied in order to each word:

    * a small list of known closed compounds ("lighthouse", ...) is kept
      as is;
    * a hyphenated word is split only when it has exactly two parts and
      both are longer than two characters; otherwise it is kept whole;
    * a word starting with a capitalized segment followed by another capital
      (CamelCase) is split into its segments, including trailing acronyms
      and digit runs;
    * a word containing underscores is split on them, dropping empty parts;
    * anything else is passed through unchanged.

    Args:
        words: Words to examine.

    Returns:
        The words with compounds replaced by their parts, in input order.
    """
    known_compounds = {
        'lighthouse', 'dragonfly', 'butterfly', 'rainbow',
        'keyboard', 'notebook', 'football', 'baseball',
    }
    camel_case = re.compile(r'^[A-Z][a-z]+[A-Z]')
    # Capitalized word | acronym (uppercase run not followed by lowercase) |
    # lowercase run | digit run. The acronym and digit alternatives keep
    # segments such as "API" or "5" from being silently discarded.
    camel_segment = re.compile(r'[A-Z][a-z]+|[A-Z]+(?![a-z])|[a-z]+|\d+')

    result = []

    for word in words:
        if word.lower() in known_compounds:
            result.append(word)

        elif '-' in word:
            parts = word.split('-')
            # Split only a clean two-part compound; short or multiple parts
            # (e.g. "x-ray", "state-of-the-art") stay together.
            if len(parts) == 2 and all(len(p) > 2 for p in parts):
                result.extend(parts)
            else:
                result.append(word)

        elif camel_case.match(word):
            result.extend(camel_segment.findall(word))

        elif '_' in word:
            result.extend(p for p in word.split('_') if p)

        else:
            result.append(word)

    return result

def stage3_dictionary_validation(words: List[str]) -> Tuple[List[str], List[str]]:
    """Stage 3: Sort words into dictionary words and probable proper nouns.

    A word is accepted as valid when it is at most two characters long or
    its lowercase form is in DICTIONARY. An unknown word that starts with an
    uppercase letter is treated as a proper noun. An unknown lowercase word
    is kept only if it is on a short list of neologisms; otherwise it is
    dropped.

    Returns:
        A ``(valid_words, proper_nouns)`` pair, each in input order.
    """
    accepted_neologisms = (
        'internet', 'cyborg', 'email', 'smartphone', 'wifi',
        'blog', 'podcast', 'hashtag', 'meme', 'emoji',
    )
    valid_words: List[str] = []
    proper_nouns: List[str] = []

    for token in words:
        folded = token.lower()

        if len(folded) <= 2 or folded in DICTIONARY:
            valid_words.append(token)
            continue

        # Capitalization is checked before the neologism list, so a
        # capitalized neologism is classified as a proper noun.
        if token[0].isupper():
            proper_nouns.append(token)
            continue

        if folded in accepted_neologisms:
            valid_words.append(token)

    return valid_words, proper_nouns

def stage4_ontological_filtration(words: List[str], proper_nouns: List[str]) -> List[str]:
    """Stage 4: Apply Bogostian ontological filtering.

    Processes ``words`` followed by ``proper_nouns`` and drops:

    * connecting words, pure digit strings and web cruft;
    * abstract process words, unless the word is capitalized;
    * words ending in "ing" whose stem (or stem + "e") is in DICTIONARY,
      except for a fixed list of noun-like "-ing" words;
    * every word ending in "ed", except for a fixed list of adjectives.

    Returns:
        The surviving words in processing order.
    """
    noun_like_ing = (
        'building', 'painting', 'offering', 'wedding', 'meeting',
        'morning', 'evening', 'spring', 'ring', 'king', 'thing',
        'ceiling', 'feeling', 'being', 'lightning', 'clothing',
    )
    adjectival_ed = (
        'beloved', 'blessed', 'cursed', 'wicked', 'naked',
        'sacred', 'hundred', 'kindred',
    )

    kept = []

    for candidate in words + proper_nouns:
        lowered = candidate.lower()

        # Function words, bare numbers and interface labels carry no object.
        if lowered in CONNECTING_WORDS or candidate.isdigit() or lowered in WEB_CRUFT:
            continue

        if lowered in ABSTRACT_PROCESSES:
            # A capitalized form might name an institution ("Administration").
            if candidate[0].isupper():
                kept.append(candidate)
            continue

        # Gerunds: drop when stripping "ing" leaves a known word.
        if lowered.endswith('ing') and lowered not in noun_like_ing:
            stem = lowered[:-3]
            if stem in DICTIONARY or stem + 'e' in DICTIONARY:
                continue

        # Past participles: dropped unless listed as a distinct adjective.
        is_bare_participle = lowered.endswith('ed') and lowered not in adjectival_ed
        if not is_bare_participle:
            kept.append(candidate)

    return kept

def stage5_morphological_deduplication(words: List[str]) -> List[str]:
    """Stage 5: Collapse morphological variants of the same root.

    Words are grouped by a crude root: the first listed suffix whose removal
    leaves a string that is in DICTIONARY (as is, or with "e" appended) is
    stripped; otherwise the lowercase word is its own root. One
    representative is emitted per group, in order of each root's first
    appearance: the first capitalized member if there is one, else the
    shortest member.
    """
    suffixes = ('ing', 'ed', 'er', 'est', 'ly', 's', 'es',
                'tion', 'ment', 'ness', 'ity', 'able', 'ible')

    def guess_root(lowered):
        # Simple stemming - only used as a grouping key.
        for ending in suffixes:
            if not lowered.endswith(ending) or len(lowered) <= len(ending) + 2:
                continue
            stem = lowered[:-len(ending)]
            if stem in DICTIONARY or stem + 'e' in DICTIONARY:
                return stem
        return lowered

    families = defaultdict(list)
    for word in words:
        families[guess_root(word.lower())].append(word)

    chosen = []
    for members in families.values():
        if len(members) == 1:
            chosen.append(members[0])
            continue

        # Prefer: proper noun > shortest form (usually the base form).
        capitalized = next((m for m in members if m[0].isupper()), None)
        if capitalized is not None:
            chosen.append(capitalized)
        else:
            chosen.append(min(members, key=len))

    return chosen

def stage6_enrichment_check(words: List[str]) -> Dict[str, List[str]]:
    """Stage 6: Bucket words into ontological categories.

    The classification is a simple heuristic. A capitalized word longer than
    three characters is filed under places/spaces. Any other word is looked
    up (lowercased) in small fixed vocabularies and defaults to
    concrete objects when none matches.

    Returns:
        A dict mapping each category name to its words, in input order.
    """
    vocabularies = (
        ('living_beings', ('tree', 'bird', 'fish', 'dog', 'cat', 'human',
                           'child', 'woman', 'man', 'worker', 'teacher')),
        ('materials_substances', ('steel', 'wood', 'stone', 'water', 'oil',
                                  'salt', 'glass', 'plastic', 'metal',
                                  'concrete')),
        ('cultural_artifacts', ('book', 'movie', 'painting', 'song', 'poem',
                                'letter', 'document', 'map', 'photograph')),
        ('conceptual_objects', ('dream', 'marriage', 'election', 'ceremony',
                                'storm', 'tornado', 'miracle', 'moment')),
        ('evocative_abstractions', ('mystery', 'elegance', 'decay',
                                    'momentum', 'silence', 'darkness',
                                    'light', 'shadow')),
    )
    # The vocabularies do not overlap, so a flat word -> category map gives
    # the same answer as testing them one after another.
    category_of = {term: label for label, terms in vocabularies for term in terms}

    # Key order matters to callers that flatten the categories back to a list.
    categories = {
        'concrete_objects': [],
        'places_spaces': [],
        'living_beings': [],
        'materials_substances': [],
        'conceptual_objects': [],
        'cultural_artifacts': [],
        'evocative_abstractions': []
    }

    for word in words:
        lowered = word.lower()
        if word[0].isupper() and len(word) > 3:
            label = 'places_spaces'
        else:
            label = category_of.get(lowered, 'concrete_objects')
        categories[label].append(word)

    return categories

def apply_bogostian_cleaning(text: str, min_words: int = 30, max_words: int = 500, verbose: bool = True) -> Dict:
    """Apply the complete Bogostian cleaning paradigm.

    Runs word extraction and stages 1-6 in order, flattens the categorized
    words back into one list, removes duplicates and enforces ``max_words``.

    Args:
        text: Raw input text (may contain HTML).
        min_words: Recommended minimum size; falling short only produces a
            warning.
        max_words: Maximum size; a longer result is truncated.
        verbose: When true, progress and size messages are written to
            stderr. It does not affect the returned data.

    Returns:
        A dict with the keys ``words`` (final list), ``count``, ``stages``
        (word count after each step), ``categories`` (per-category counts,
        taken before truncation) and ``javascript_array`` (the final list
        serialized as JSON).
    """
    def log(message: str) -> None:
        # Progress output goes to stderr so stdout stays machine-readable.
        if verbose:
            print(message, file=sys.stderr)

    log("Extracting words from text...")
    words = extract_words_from_text(text)
    initial_count = len(words)

    log("Stage 1: Technical cleaning...")
    words = stage1_technical_cleaning(words)
    stage1_count = len(words)

    log("Stage 2: Enjambment separation...")
    words = stage2_enjambment_separation(words)
    stage2_count = len(words)

    log("Stage 3: Dictionary validation...")
    valid_words, proper_nouns = stage3_dictionary_validation(words)
    stage3_count = len(valid_words) + len(proper_nouns)

    log("Stage 4: Ontological filtration...")
    words = stage4_ontological_filtration(valid_words, proper_nouns)
    stage4_count = len(words)

    log("Stage 5: Morphological deduplication...")
    words = stage5_morphological_deduplication(words)
    stage5_count = len(words)

    log("Stage 6: Categorization and enrichment check...")
    categories = stage6_enrichment_check(words)

    # Flatten categories back to a single list, then drop any remaining
    # duplicates while preserving order.
    final_words = [word for group in categories.values() for word in group]
    final_words = list(dict.fromkeys(final_words))

    # Size constraints: only the messages depend on verbose. The truncation
    # itself must always happen, otherwise quiet runs ignore max_words.
    if len(final_words) < min_words:
        log(f"Warning: Only {len(final_words)} words after cleaning (minimum recommended: {min_words})")
    elif len(final_words) > max_words:
        log(f"Note: {len(final_words)} words after cleaning, truncating to {max_words}")
        final_words = final_words[:max_words]

    return {
        'words': final_words,
        'count': len(final_words),
        'stages': {
            'initial': initial_count,
            'stage1_technical': stage1_count,
            'stage2_enjambment': stage2_count,
            'stage3_validation': stage3_count,
            'stage4_filtration': stage4_count,
            'stage5_deduplication': stage5_count,
            'final': len(final_words)
        },
        'categories': {k: len(v) for k, v in categories.items()},
        'javascript_array': json.dumps(final_words, indent=2)
    }

def main():
    """Command-line entry point.

    Reads text from the given file (or stdin when the path is ``-``), runs
    the cleaning pipeline and prints the result to stdout in the requested
    format. Unless ``--quiet`` is given, a short summary is written to
    stderr afterwards.
    """
    cli = argparse.ArgumentParser(
        description='Apply Bogostian Ontographical Word Cleaning to text'
    )
    cli.add_argument('input', help='Input file path or - for stdin')
    cli.add_argument('--min-words', type=int, default=30,
                     help='Minimum words for good litany (default: 30)')
    cli.add_argument('--max-words', type=int, default=500,
                     help='Maximum words before truncation (default: 500)')
    cli.add_argument('--output-format', choices=['json', 'list', 'javascript'],
                     default='json', help='Output format')
    cli.add_argument('--quiet', action='store_true',
                     help='Suppress progress messages')
    options = cli.parse_args()

    # Read input
    if options.input != '-':
        with open(options.input, 'r', encoding='utf-8') as source:
            text = source.read()
    else:
        text = sys.stdin.read()

    # Apply cleaning
    result = apply_bogostian_cleaning(
        text, options.min_words, options.max_words, verbose=not options.quiet
    )

    # Output results
    chosen_format = options.output_format
    if chosen_format == 'list':
        for word in result['words']:
            print(word)
    elif chosen_format == 'javascript':
        print(f"const wordPool = {result['javascript_array']};")
    elif chosen_format == 'json':
        print(json.dumps(result, indent=2))

    if options.quiet:
        return

    # Summary goes to stderr so it never mixes with the stdout payload.
    summary = [
        "\n--- Bogostian Cleaning Summary ---",
        f"Initial words: {result['stages']['initial']}",
        f"Final words: {result['count']}",
        "Categories:",
    ]
    summary.extend(f"  {cat}: {count}" for cat, count in result['categories'].items())
    for line in summary:
        print(line, file=sys.stderr)

# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
