import os
import requests
import json
import pdfplumber
import re
from datetime import datetime
import time

# --- OPTIONAL DEPENDENCY: python-docx ---
# Imported defensively so the script still runs (PDF-only) when the
# package is missing; DOCX_AVAILABLE gates all .docx handling below.
DOCX_AVAILABLE = False
try:
    from docx import Document
    DOCX_AVAILABLE = True
except ImportError:
    print("Warning: 'python-docx' not found. .docx files will be skipped.")
    print("To support Word docs, run: pip install python-docx")

# --- CONFIGURATION ---
# Folder to scan for resumes: the directory containing this script.
FOLDER_PATH = os.path.dirname(os.path.abspath(__file__))
# You can change this to "llama3" or "mistral" if installed
OLLAMA_MODEL = "granite3.3:2b" 
# ---------------------

def get_os_creation_date(filepath):
    """Last resort: return the file's OS timestamp formatted as YYMMDD.

    NOTE(review): on Unix, ``os.path.getctime`` is the inode *change* time,
    not true creation time — acceptable here as a dating fallback.

    Args:
        filepath: Path of the file to date.

    Returns:
        str: Six-digit ``%y%m%d`` string; today's date if the file's
        timestamp cannot be read.
    """
    try:
        timestamp = os.path.getctime(filepath)
        return datetime.fromtimestamp(timestamp).strftime('%y%m%d')
    except OSError:
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit). Missing/unreadable file -> "today".
        return datetime.now().strftime('%y%m%d')

def extract_latest_year_heuristic(text):
    """Scan *text* for years 2000-2059 and return the highest as "YY0101".

    Handles both normal years ("2024", "2023-2024") and years whose digits
    were spaced apart by PDF extraction ("2 0 2 4").  Years more than five
    years in the future are treated as OCR noise and ignored.

    Args:
        text: Raw resume text.

    Returns:
        str | None: ``"YY0101"`` (January 1st placeholder) for the latest
        plausible year found, or ``None`` when no usable year exists.
    """
    current_year = datetime.now().year
    found_years = []

    # 1. Standard years, guarded so they are not part of a longer number.
    for match in re.findall(r'(?<!\d)(20[0-5][0-9])(?!\d)', text):
        found_years.append(int(match))

    # 2. Spaced years (e.g. "2 0 2 4").  BUGFIX: the pattern's \s+ also
    # matches tabs/newlines, but the old code only stripped literal spaces
    # before int(), raising ValueError on "2 0\t2 4".  Strip ALL whitespace.
    for match in re.findall(r'(?<!\d)2\s+0\s+[0-5]\s+[0-9](?!\d)', text):
        found_years.append(int(re.sub(r'\s+', '', match)))

    # Discard implausible future years, then keep the latest.
    valid_years = [y for y in found_years if y <= current_year + 5]
    if valid_years:
        short_year = str(max(valid_years))[2:]
        return f"{short_year}0101"

    return None

def extract_text_from_docx(filepath):
    """Pull all text out of a .docx file: paragraphs first, then table cells.

    Returns an empty string when python-docx is unavailable or the file
    cannot be parsed (best-effort; errors are printed, not raised).
    """
    if not DOCX_AVAILABLE:
        return ""
    try:
        document = Document(filepath)
        chunks = [paragraph.text for paragraph in document.paragraphs]
        # Tables often hold education/experience details; flatten every cell.
        chunks.extend(
            cell.text
            for table in document.tables
            for row in table.rows
            for cell in row.cells
        )
        return "\n".join(chunks)
    except Exception as e:
        print(f"[ERROR] Reading DOCX: {e}")
        return ""

def clean_text_for_llm(text):
    """Collapse all whitespace runs to single spaces and cap the length."""
    collapsed = " ".join(text.split())
    # Small local models choke on huge prompts; keep only the first 4000 chars.
    return collapsed[:4000]

def ask_ollama(text):
    """Ask the local Ollama server to extract "Name | Background" from a resume.

    Returns the model's raw reply string, or None when the request fails
    (server down, timeout, bad response) — callers must handle None.
    """
    system_instruction = (
        "You are a data extraction assistant. "
        "Extract the applicant's **Full Name** and **Background**."
        "\n\n**Background Extraction Rules (STRICT):**\n"
        "1. **MANDATORY:** You MUST prefer the **Educational Degree** over any job title.\n"
        "   - Example: If text says 'IT Intern' AND 'Diploma in Information Technology', output 'Diploma in Information Technology'.\n"
        "   - Example: If text says 'Mechanical Engineering Student', output 'Diploma in Mechanical Engineering' (if listed) or 'Mechanical Engineering'.\n"
        "2. **FORBIDDEN:** Do NOT use 'Intern', 'Student', 'Assistant', or 'Worker' as the background unless NO degree is mentioned.\n"
        "\nOutput strictly in this format: Name | Background."
        "\nDo NOT include notes, explanations, or numbered lists."
    )

    prompt = f"Resume Text:\n{text}\n\n{system_instruction}"

    payload = {
        "model": OLLAMA_MODEL,
        "prompt": prompt,
        "stream": False,
        # Low temperature for near-deterministic extraction; the 4k context
        # window matches the truncation done in clean_text_for_llm().
        "options": {"temperature": 0.1, "num_ctx": 4096},
    }

    try:
        # Timeout keeps one stuck file from hanging the whole batch.
        response = requests.post(
            "http://localhost:11434/api/generate", json=payload, timeout=60
        )
        response.raise_for_status()
        return response.json()['response'].strip()
    except Exception as e:
        print(f"    [Warning] Ollama call failed: {e}")
        return None

def fix_spaced_names(text):
    """Collapse letter-spaced names: "J O H N" -> "JOHN"."""
    # Only remove whitespace sandwiched between two ISOLATED letters, so
    # ordinary multi-word names ("John Smith") are left untouched.
    spaced_letter_gap = r'(?<=\b[A-Za-z])\s+(?=[A-Za-z]\b)'
    return re.sub(spaced_letter_gap, '', text)

def clean_extracted_string(s):
    """Normalise an AI-extracted field into a short, Title-Case string."""
    # Strip leading list markers / labels the model sometimes emits.
    without_label = re.sub(r'^(1\.|2\.|Name:|Background:|\d\W)', '', s, flags=re.IGNORECASE)
    rejoined = fix_spaced_names(without_label)
    # Keep only the first line, dropping any trailing "Note: ..." chatter.
    first_line = rejoined.split('\n')[0]
    first_line = re.split(r'(?i)note\s*:', first_line)[0]

    # Cap the length so the value stays a safe filename component.
    truncated = first_line[:60].strip() if len(first_line) > 60 else first_line

    return truncated.strip().title()

def get_name_fallback(text):
    """
    Guess the applicant's name when the AI returns a placeholder.

    Walks the non-empty lines top-down (the name is usually first) and
    returns the first one that looks like a plausible name, else
    "Unknown Applicant".
    """
    ignore_words = ('resume', 'curriculum vitae', 'cv', 'profile', 'bio',
                    'page', 'summary', 'objective', 'name', 'contact')

    for raw_line in text.split('\n'):
        candidate = raw_line.strip()
        if not candidate:
            continue
        lowered = candidate.lower()
        # Skip headings/labels and anything too short to be a name.
        if len(candidate) < 3 or any(word in lowered for word in ignore_words):
            continue
        if len(candidate.split()) > 5:
            continue  # names rarely have >5 words
        if "looking for" in lowered or "seeking" in lowered:
            continue  # objective-style sentence, not a name

        # Accept short lines free of digits and symbol noise.
        if len(candidate) < 50 and not re.search(r'[0-9!@#$%^&*()_+={};"<>?]', candidate):
            print(f"    [Fallback] AI failed. Guessed name from first line: {candidate}")
            return candidate

    return "Unknown Applicant"

def process_folder():
    """Rename every resume (.pdf/.docx) in FOLDER_PATH to
    "YYMMDD Name Background.ext".

    Per-file pipeline: extract text -> pick a date (year heuristic, then OS
    timestamp) -> ask the local LLM for "Name | Background" (first-line
    fallback for the name) -> sanitise both fields -> rename on disk.
    Prints a summary count at the end.
    """
    print(f"--- Resume Renamer (Strict Degree Priority + Resilient) ---")
    print(f"Working in: {FOLDER_PATH}\n")

    count_success = 0
    count_fail = 0
    script_name = os.path.basename(__file__)

    for filename in os.listdir(FOLDER_PATH):
        # 1. Filter: skip this script, .docx without python-docx, and any
        # unsupported extension.
        file_ext = os.path.splitext(filename)[1].lower()
        if filename == script_name:
            continue
        if file_ext == '.docx' and not DOCX_AVAILABLE:
            continue
        if file_ext not in ['.pdf', '.docx']:
            continue

        filepath = os.path.join(FOLDER_PATH, filename)
        text = ""

        # 2. Extract text (the first two PDF pages are enough for name/degree).
        # BUGFIX: the f-string printed a literal placeholder instead of the
        # actual filename being processed.
        print(f"Processing: {filename}...")
        try:
            if file_ext == '.pdf':
                with pdfplumber.open(filepath) as pdf:
                    for i in range(min(2, len(pdf.pages))):
                        text += pdf.pages[i].extract_text() or ""
            elif file_ext == '.docx':
                text = extract_text_from_docx(filepath)

            if len(text) < 50:
                print(f"    [SKIP] Text too short.")
                count_fail += 1
                continue

        except Exception as e:
            print(f"    [ERROR] Reading file: {e}")
            count_fail += 1
            continue

        # 3. Date: prefer a year found in the text, else the OS timestamp.
        date_str = extract_latest_year_heuristic(text)
        if not date_str:
            date_str = get_os_creation_date(filepath)
            print(f"    [Fallback] Using OS Date: {date_str}")

        # 4. Name/background via the LLM; brief pause between requests to
        # give the local Ollama server a breather.
        time.sleep(0.5)
        llm_output = ask_ollama(clean_text_for_llm(text))

        name = None
        bg = "General"

        if llm_output:
            # Expected "Name | Background"; tolerate two-line replies too.
            if "|" in llm_output:
                parts = llm_output.split('|', 1)
                name = parts[0].strip()
                bg = parts[1].strip()
            elif "\n" in llm_output:
                lines = [line.strip() for line in llm_output.split('\n') if line.strip()]
                if len(lines) >= 2:
                    name = lines[0]
                    bg = lines[1]

            # If the model echoed a placeholder, guess from the resume text.
            forbidden_names = ["name", "unknown", "resume", "applicant", "candidate", "full name"]
            if not name or name.strip().lower() in forbidden_names:
                name = get_name_fallback(text)

            if name:
                name = clean_extracted_string(name)
                bg = clean_extracted_string(bg)

                # Drop filesystem-hostile characters before building the name.
                safe_name = re.sub(r'[^\w\s-]', '', name)
                safe_bg = re.sub(r'[^\w\s-]', '', bg)

                new_filename = f"{date_str} {safe_name} {safe_bg}{file_ext}"
                new_filepath = os.path.join(FOLDER_PATH, new_filename)

                if filepath != new_filepath:
                    # Never clobber an existing file with the same target name.
                    if not os.path.exists(new_filepath):
                        os.rename(filepath, new_filepath)
                        print(f"    -> Renamed: [{new_filename}]")
                        count_success += 1
                    else:
                        print(f"    -> Duplicate: [{new_filename}]")
                else:
                    print("    -> No change.")
            else:
                print(f"    -> AI Format Fail: {llm_output}")
                count_fail += 1
        else:
            print("    -> AI returned nothing.")
            count_fail += 1

    print(f"\nDone! Renamed: {count_success} | Failed: {count_fail}")

# Entry point: run the renamer only when executed directly, not on import.
if __name__ == "__main__":
    process_folder()
