import os
import subprocess
import pdfplumber

# Configuration
# Directory that is scanned for PDFs; "." is the current working directory.
FOLDER_PATH = "."
# Threshold, in characters of extracted text after surrounding whitespace is
# stripped. A PDF counts as already containing text only when it yields
# strictly more than this; at or below it the file is treated as an image scan.
MIN_TEXT_LENGTH = 50

def has_embedded_text(file_path):
    """Report whether a PDF already contains a meaningful amount of text.

    Text is extracted page by page with pdfplumber and accumulated. As soon as
    the whitespace-stripped total exceeds ``MIN_TEXT_LENGTH`` characters the
    function returns ``True`` without reading the remaining pages.

    Args:
        file_path: Path of the PDF to inspect.

    Returns:
        ``True`` if more than ``MIN_TEXT_LENGTH`` characters of text were
        extracted. ``False`` otherwise, including when the file cannot be
        opened or parsed (the error is printed, not raised).
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            collected = ""
            for page in pdf.pages:
                # extract_text() may yield None/empty for pages without text.
                collected += page.extract_text() or ""
                # Adding more pages can only lengthen the stripped text, so the
                # answer is already final once the threshold is exceeded; stop
                # here instead of extracting every page of a long document.
                if len(collected.strip()) > MIN_TEXT_LENGTH:
                    return True
    except Exception as e:
        # Deliberately best-effort: an unreadable file is reported and treated
        # as having no text so one bad PDF does not abort the whole scan.
        print(f"Error reading {file_path}: {e}")
        return False
    return False

def ocr_file(file_path):
    """Run OCRmyPDF on ``file_path``, writing the result next to the input.

    The output is named after the input with its extension replaced by
    ``_OCR.pdf`` (``scan.pdf`` -> ``scan_OCR.pdf``). If that output already
    exists the input is skipped. Progress and failures are reported with
    ``print``; nothing is returned, and neither a missing ``ocrmypdf``
    executable nor a non-zero exit status raises an exception.

    Args:
        file_path: Path of the PDF to convert.
    """
    # splitext only touches the final extension. A plain str.replace would
    # also rewrite ".pdf" inside directory names, and would leave an upper-case
    # ".PDF" path unchanged -- making the output path equal to the input, so
    # the existence check below skipped such files forever.
    base, _ext = os.path.splitext(file_path)
    output_path = f"{base}_OCR.pdf"

    # Don't re-OCR if the output already exists
    if os.path.exists(output_path):
        print(f"Skipping {file_path} (OCR version already exists)")
        return

    print(f"🖼️  Image Detected: Converting {file_path}...")

    # --force-ocr: Process even if it thinks there is some text (often garbage in scans)
    # --deskew: Straighten crooked scans
    command = [
        "ocrmypdf",
        "--force-ocr",
        "--deskew",
        file_path,
        output_path,
    ]

    try:
        # Argument list (no shell), so unusual characters in file names are
        # passed through verbatim. Output is captured to keep the console
        # quiet unless the conversion fails.
        result = subprocess.run(command, capture_output=True, text=True)
    except FileNotFoundError:
        # Raised when the "ocrmypdf" executable itself cannot be found.
        print("❌ Error: 'ocrmypdf' is not installed. Run 'sudo apt install ocrmypdf' first.")
        return

    if result.returncode == 0:
        print(f"✅ Success: Created {output_path}")
    else:
        print(f"❌ Failed to OCR {file_path}")
        print(result.stderr)

def main():
    """Scan ``FOLDER_PATH`` and OCR every PDF that lacks embedded text.

    Candidates are regular files directly inside ``FOLDER_PATH`` whose name
    ends in ``.pdf`` (case-insensitive) and does not contain ``_OCR``, so
    previously generated outputs are not processed again. Each candidate
    without enough embedded text is handed to ``ocr_file``. A summary line is
    printed at the end.
    """
    print("🔍 Scanning for image-based PDFs...")

    # Build the full candidate list before converting anything so files
    # created during this run are never picked up.
    candidates = []
    for filename in os.listdir(FOLDER_PATH):
        if not filename.lower().endswith(".pdf") or "_OCR" in filename:
            continue
        file_path = os.path.join(FOLDER_PATH, filename)
        # listdir also returns directories; one named like "archive.pdf" is
        # not a document and must not be sent to OCR.
        if os.path.isfile(file_path):
            candidates.append(file_path)

    # Counts files handed to ocr_file, whether or not the conversion succeeded.
    count = 0
    for file_path in candidates:
        if not has_embedded_text(file_path):
            ocr_file(file_path)
            count += 1

    if count == 0:
        print("🎉 No image-only PDFs found. All files already have text!")
    else:
        print(f"\n✨ Processed {count} files.")

# Run the scan only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
