|
|
| (3 intermediate revisions by the same user not shown) |
| Line 106: |
Line 106: |
|
| |
|
| # '''Paste the code found in the file block below.''' | | # '''Paste the code found in the file block below.''' |
|
| | [[:File:Rename resume.zip]] |
| {| class="wikitable"
| |
| |+ Caption text
| |
| |-
| |
| ! Header text
| |
| |-
| |
| | import os
| |
| import requests
| |
| | |
| import json
| |
| | |
| import pdfplumber
| |
| | |
| import re
| |
| | |
| from datetime import datetime
| |
| | |
| import time
| |
| | |
| <nowiki>#</nowiki> --- OPTIONAL DEPENDENCY: python-docx ---
| |
| | |
# Word support is optional: when python-docx is missing the script still
# runs, it simply ignores .docx files.
try:
    from docx import Document
except ImportError:
    DOCX_AVAILABLE = False
    print("Warning: 'python-docx' not found. .docx files will be skipped.")
    print("To support Word docs, run: pip install python-docx")
else:
    DOCX_AVAILABLE = True

# --- CONFIGURATION ---
# The folder that is scanned is the one this script file lives in.
FOLDER_PATH = os.path.dirname(os.path.abspath(__file__))
# You can change this to "llama3" or "mistral" if installed
OLLAMA_MODEL = "granite3.3:2b"
| |
| | |
| <nowiki>#</nowiki> ---------------------
| |
| | |
def get_os_creation_date(filepath):
    """Last resort: return the file's OS-level date in YYMMDD format.

    Uses the true creation ("birth") time when the platform exposes it via
    ``st_birthtime``; otherwise falls back to ``st_ctime``, which on Unix is
    the last metadata change rather than the creation time.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        A six-character ``YYMMDD`` string. If the file cannot be inspected
        (missing, unreadable, or with an out-of-range timestamp), today's
        date is returned instead so the caller always gets a usable prefix.
    """
    try:
        info = os.stat(filepath)
        # st_birthtime only exists on some platforms; st_ctime is always there.
        timestamp = getattr(info, 'st_birthtime', info.st_ctime)
        return datetime.fromtimestamp(timestamp).strftime('%y%m%d')
    except (OSError, OverflowError, ValueError):
        # Deliberately best-effort: a bad file must not stop the batch.
        return datetime.now().strftime('%y%m%d')
| |
| | |
def extract_latest_year_heuristic(text):
    """Find the highest plausible year mentioned in *text*.

    Scans for years 2000-2059 written normally ("2024", "2023-2024") or with
    whitespace between the digits ("2 0 2 4"). Years more than five years
    past the current year are discarded.

    Args:
        text: Raw text extracted from a resume.

    Returns:
        ``"YY0101"`` (1 January of the highest accepted year), or ``None``
        when no acceptable year is present.
    """
    latest_allowed = datetime.now().year + 5

    # 1. Standard Years (e.g., "2024", "2023-2024")
    found_years = [
        int(year)
        for year in re.findall(r'(?<!\d)(20[0-5][0-9])(?!\d)', text)
    ]

    # 2. Spaced Years (e.g., "2 0 2 4")
    # The pattern allows ANY whitespace between digits (tabs and newlines are
    # common in PDF text), so all whitespace must be removed before int();
    # removing only " " would leave e.g. "2\t024" and raise ValueError.
    for spaced in re.findall(r'(?<!\d)2\s+0\s+[0-5]\s+[0-9](?!\d)', text):
        found_years.append(int(re.sub(r'\s+', '', spaced)))

    valid_years = [year for year in found_years if year <= latest_allowed]
    if not valid_years:
        return None

    short_year = str(max(valid_years))[2:]
    return f"{short_year}0101"
| |
| | |
def extract_text_from_docx(filepath):
    """Return the text of a .docx file: body paragraphs first, then table cells.

    Args:
        filepath: Path to the Word document.

    Returns:
        All paragraph and table-cell texts joined with newlines, or an empty
        string when python-docx is unavailable or the file cannot be read
        (the error is printed, not raised).
    """
    if not DOCX_AVAILABLE:
        return ""

    try:
        document = Document(filepath)
        pieces = [paragraph.text for paragraph in document.paragraphs]
        # Resumes often keep their content in layout tables, so include cells.
        pieces.extend(
            cell.text
            for table in document.tables
            for row in table.rows
            for cell in row.cells
        )
        return "\n".join(pieces)
    except Exception as e:
        print(f"[ERROR] Reading DOCX: {e}")
        return ""
| |
| | |
def clean_text_for_llm(text):
    """Collapse all whitespace runs to single spaces and cap the length.

    Args:
        text: Raw extracted resume text.

    Returns:
        The normalised text, truncated to at most 4000 characters.
    """
    words = text.split()
    collapsed = " ".join(words)
    # Limit to 4000 chars to prevent choking small models
    return collapsed[:4000]
| |
| | |
def ask_ollama(text):
    """Ask the local Ollama server for the applicant's name and background.

    The resume text is placed first in the prompt, followed by the extraction
    rules, and sent as a single non-streaming ``/api/generate`` request.

    Args:
        text: Cleaned resume text to analyse.

    Returns:
        The model's reply with surrounding whitespace removed (expected shape
        "Name | Background"), or ``None`` if the request or response parsing
        fails for any reason (a warning is printed).
    """
    instruction_parts = (
        "You are a data extraction assistant. ",
        "Extract the applicant's **Full Name** and **Background**.",
        "\n\n**Background Extraction Rules (STRICT):**\n",
        "1. **MANDATORY:** You MUST prefer the **Educational Degree** over any job title.\n",
        " - Example: If text says 'IT Intern' AND 'Diploma in Information Technology', output 'Diploma in Information Technology'.\n",
        " - Example: If text says 'Mechanical Engineering Student', output 'Diploma in Mechanical Engineering' (if listed) or 'Mechanical Engineering'.\n",
        "2. **FORBIDDEN:** Do NOT use 'Intern', 'Student', 'Assistant', or 'Worker' as the background unless NO degree is mentioned.\n",
        "\nOutput strictly in this format: Name | Background.",
        "\nDo NOT include notes, explanations, or numbered lists.",
    )
    system_instruction = "".join(instruction_parts)

    payload = {
        "model": OLLAMA_MODEL,
        "prompt": f"Resume Text:\n{text}\n\n{system_instruction}",
        "stream": False,
        "options": {
            "temperature": 0.1,
            "num_ctx": 4096
        }
    }
    endpoint = "http://localhost:11434/api/generate"

    try:
        # Timeout keeps one slow file from hanging the whole batch.
        reply = requests.post(endpoint, json=payload, timeout=60)
        reply.raise_for_status()
        return reply.json()['response'].strip()
    except Exception as e:
        print(f" [Warning] Ollama call failed: {e}")
        return None
| |
| | |
def fix_spaced_names(text):
    """Join letter-spaced words, e.g. "J O H N" -> "JOHN".

    Whitespace is removed only where it sits between two stand-alone single
    letters, so ordinary words and middle initials next to full words are
    left untouched.

    Args:
        text: String that may contain letter-spaced words.

    Returns:
        The string with such whitespace removed.
    """
    between_single_letters = re.compile(r'(?<=\b[A-Za-z])\s+(?=[A-Za-z]\b)')
    return between_single_letters.sub('', text)
| |
| | |
def clean_extracted_string(s):
    """Normalise one field of the model's answer (name or background).

    Steps, in order: strip a leading list marker and/or a "Name:" /
    "Background:" label, join letter-spaced words, keep only the first line,
    cut anything from a "Note:" onwards, truncate to 60 characters, and
    title-case the result.

    Args:
        s: Raw field text taken from the model output or the fallback.

    Returns:
        The cleaned, title-cased string (possibly empty).
    """
    # Remove a list marker ("1.", "2)", ...) AND a following label in one go.
    # Stripping only a single token would turn "1. Name: John" into
    # "Name: John" and leak the label into the filename.
    s = re.sub(
        r'^(?:\d\W[ \t]*)?(?:(?:Name|Background):)?',
        '',
        s,
        flags=re.IGNORECASE,
    )
    s = fix_spaced_names(s)
    s = s.split('\n')[0]
    s = re.split(r'(?i)note\s*:', s)[0]

    # Truncate to safe filename length
    if len(s) > 60:
        s = s[:60].strip()

    return s.strip().title()
| |
| | |
def get_name_fallback(text):
    """Guess the applicant's name when the AI returns nothing usable.

    Walks the non-empty lines of the resume from the top and returns the
    first one that looks like a name: at least 3 characters, under 50
    characters, at most 5 words, no digits or special symbols, not a section
    heading, and not an objective statement ("looking for", "seeking").

    Headings are matched as whole words. Substring matching would reject
    real names that merely contain a heading term, e.g. "Fabio" ("bio") or
    "McVey" ("cv").

    Args:
        text: Raw resume text with line breaks preserved.

    Returns:
        The guessed name line, or "Unknown Applicant" if no line qualifies.
    """
    lines = [line.strip() for line in text.split('\n') if line.strip()]

    # "biodata"/"biography" are listed explicitly because whole-word matching
    # no longer catches them through the shorter term "bio".
    ignore_list = ['resume', 'curriculum vitae', 'cv', 'profile', 'bio',
                   'biodata', 'biography', 'page', 'summary', 'objective',
                   'name', 'contact']
    # Optional trailing "s" keeps plural headings ("Contacts") covered.
    heading_re = re.compile(
        r'\b(?:' + '|'.join(re.escape(term) for term in ignore_list) + r')s?\b'
    )

    for line in lines:
        lower_line = line.lower()
        if len(line) < 3 or heading_re.search(lower_line):
            continue

        if len(line.split()) > 5:
            continue  # Names rarely have >5 words
        if "looking for" in lower_line or "seeking" in lower_line:
            continue
        if len(line) < 50 and not re.search(r'[0-9!@#$%^&*()_+={};"<>?]', line):
            print(f" [Fallback] AI failed. Guessed name from first line: {line}")
            return line

    return "Unknown Applicant"
| |
| | |
def process_folder():
    """Rename every .pdf/.docx resume in FOLDER_PATH to "YYMMDD Name Background.ext".

    For each file: extract text (first two PDF pages, or the whole Word
    document), derive the date from the highest year in the text (falling
    back to the OS file date), ask the local model for name and background
    (falling back to a first-line guess for the name), sanitise both, and
    rename the file unless the target name already exists.

    Progress and a final success/failure tally are printed; nothing is
    returned. A problem with a single file is reported and counted, and the
    loop moves on to the next file.
    """
    print(f"--- Resume Renamer (Strict Degree Priority + Resilient) ---")
    print(f"Working in: {FOLDER_PATH}\n")

    count_success = 0
    count_fail = 0
    script_name = os.path.basename(__file__)
    # Placeholder answers small models give instead of a real name.
    forbidden_names = ["name", "unknown", "resume", "applicant", "candidate", "full name"]

    for filename in os.listdir(FOLDER_PATH):
        # 1. Check Extension
        file_ext = os.path.splitext(filename)[1].lower()
        if filename == script_name:
            continue
        if file_ext == '.docx' and not DOCX_AVAILABLE:
            continue
        if file_ext not in ['.pdf', '.docx']:
            continue

        filepath = os.path.join(FOLDER_PATH, filename)
        text = ""

        # 2. Extract Text
        print(f"Processing: {filename}...")
        try:
            if file_ext == '.pdf':
                with pdfplumber.open(filepath) as pdf:
                    page_texts = [
                        pdf.pages[i].extract_text() or ""
                        for i in range(min(2, len(pdf.pages)))
                    ]
                # Join with a newline: gluing pages together can fuse the last
                # token of page 1 with the first of page 2 (e.g. two years
                # into one digit run that the date regex then rejects).
                text = "\n".join(page_texts)
            elif file_ext == '.docx':
                text = extract_text_from_docx(filepath)

            if len(text) < 50:
                print(f" [SKIP] Text too short.")
                count_fail += 1
                continue
        except Exception as e:
            print(f" [ERROR] Reading file: {e}")
            count_fail += 1
            continue

        # 3. GET DATE
        date_str = extract_latest_year_heuristic(text)
        if not date_str:
            date_str = get_os_creation_date(filepath)
            print(f" [Fallback] Using OS Date: {date_str}")

        # 4. GET NAME/BG
        # Add a tiny delay to give Ollama a breather between files
        time.sleep(0.5)
        llm_output = ask_ollama(clean_text_for_llm(text))

        if not llm_output:
            print(" -> AI returned nothing.")
            count_fail += 1
            continue

        name = None
        bg = "General"
        if "|" in llm_output:
            parts = llm_output.split('|', 1)
            name = parts[0].strip()
            bg = parts[1].strip()
        elif "\n" in llm_output:
            lines = [line.strip() for line in llm_output.split('\n') if line.strip()]
            if len(lines) >= 2:
                name = lines[0]
                bg = lines[1]

        # --- IMPROVED FALLBACK CHECK ---
        if not name or name.strip().lower() in forbidden_names:
            name = get_name_fallback(text)
        # -------------------------------

        if not name:
            print(f" -> AI Format Fail: {llm_output}")
            count_fail += 1
            continue

        name = clean_extracted_string(name)
        bg = clean_extracted_string(bg)

        safe_name = re.sub(r'[^\w\s-]', '', name)
        safe_bg = re.sub(r'[^\w\s-]', '', bg)

        # Normalise whitespace so an empty or punctuation-only field does not
        # leave doubled spaces or a stray space before the extension.
        stem = " ".join(f"{date_str} {safe_name} {safe_bg}".split())
        new_filename = f"{stem}{file_ext}"
        new_filepath = os.path.join(FOLDER_PATH, new_filename)

        if filepath == new_filepath:
            print(" -> No change.")
        elif os.path.exists(new_filepath):
            print(f" -> Duplicate: [{new_filename}]")
        else:
            try:
                os.rename(filepath, new_filepath)
            except OSError as e:
                # One unrenamable file must not abort the rest of the batch.
                print(f" [ERROR] Renaming file: {e}")
                count_fail += 1
            else:
                print(f" -> Renamed: [{new_filename}]")
                count_success += 1

    print(f"\nDone! Renamed: {count_success} | Failed: {count_fail}")


if __name__ == "__main__":
    process_folder()
| |
| |}
| |
| # Save and exit: Press Ctrl+O, Enter, then Ctrl+X. | | # Save and exit: Press Ctrl+O, Enter, then Ctrl+X. |
|
| |
|
📂 Automated Resume Renamer & Organizer
For Ubuntu 24.04 using Local AI (Ollama)
1. The Problem
As an HR officer or Professor, you know that students and applicants rarely follow file naming conventions. You likely have a folder that looks like this:
- Resume.pdf
- CV_Final_v2.docx
- MyResume(1).pdf
- john_doe.pdf
The Goal: Automatically rename these files based on their content to a standard format:
YYMMDD Name Degree/Background.pdf
Example: 250101 Juan Dela Cruz BS Information Technology.pdf
2. Requirements Checklist
Please ensure you have the following ready before starting.
- [ ] Ubuntu 24.04 System (Updated).
- [ ] Python 3.12+ (Pre-installed on Ubuntu 24.04).
- [ ] Ollama installed locally (The AI engine).
- [ ] A Small Language Model pulled (e.g., granite3.3:2b or llama3.2).
- [ ] Python Libraries: pdfplumber (for PDFs), python-docx (for Word), requests (to talk to Ollama).
- [ ] No Images: The files must have embedded text. This script excludes OCR (Optical Character Recognition) to keep it fast and lightweight. Scanned images will be skipped.
3. How the Script Works (The Logic)
This script acts as a "Project Manager" that hires two distinct specialists to process each file. It does not blindly ask the AI for everything, as small AIs make mistakes with math and dates.
- File Discovery:
- The script looks for .pdf and .docx files in the folder where the script is located.
- Text Extraction:
- It pulls raw text. If the text is less than 50 characters (likely an image scan), it skips the file to prevent errors.
- The Date Specialist (Python Regex):
- Logic: It scans the text for explicit years (e.g., "2023", "2024").
- Rule: It ignores the word "Present". Why? If a resume from 2022 says "2022 - Present", treating "Present" as "Today" (2026) would incorrectly date the old resume. We stick to the highest printed number.
- Output: Sets the date to Jan 1st of the highest year found (e.g., 240101).
- The Content Specialist (Ollama AI):
- Logic: It sends the text to the local AI with strict instructions.
- Rule 1 (Priority): It looks for a Degree (e.g., "BS IT") first. It is forbidden from using "Intern" or "Student" if a degree is found.
- Rule 2 (Fallback): If the AI fails to find a name, the script grabs the first line of the document as a fallback.
- Sanitization & Renaming:
- It fixes "Spaced Names" (e.g., J O H N -> John).
- It ensures the filename isn't too long.
- It renames the file only if the target filename doesn't already exist.
4. Installation Guide (Ubuntu 24.04)
Open your terminal (Ctrl+Alt+T) and follow these steps exactly.
Step A: System Update
Ensure your system tools are fresh to avoid installation conflicts.
sudo apt update && sudo apt upgrade -y
Step B: Install Ollama & The Model
- Install the Ollama Engine:
curl -fsSL https://ollama.com/install.sh | sh
- Download the Brain (The Model): We use granite3.3:2b because it is very fast and follows formatting rules well.
ollama pull granite3.3:2b
- (Note: You can swap this for llama3 if you have a powerful computer, but Granite is sufficient for this task).
Step C: Setup Python Environment
Ubuntu 24.04 blocks system-wide pip installs (the system Python is marked "externally managed"), so install the libraries inside a Virtual Environment (venv) to avoid breaking system tools.
- Create a Project Folder:
mkdir ~/resume_renamer
cd ~/resume_renamer
- Create the Virtual Environment:
python3 -m venv venv
- Activate the Environment:
source venv/bin/activate
- (You should see (venv) at the start of your command line now).
- Install Required Libraries:
pip install requests pdfplumber python-docx
Step D: Create the Script
- Create the python file:
nano rename_resumes.py
- Paste the code found in the file block below.
File:Rename resume.zip
- Save and exit: Press Ctrl+O, Enter, then Ctrl+X.
5. Running the Renamer
This script is portable. It works on the files sitting next to it.
- Copy the Script: Move the rename_resumes.py file into your folder full of PDFs (e.g., ~/Documents/Student_CVs).
- Open Terminal in that folder: cd ~/Documents/Student_CVs
- Activate your Python Environment (Point to where you created it): source ~/resume_renamer/venv/bin/activate
- Run the script: python3 rename_resumes.py