|
| 1 | +"""Service for extracting and parsing resume/CV data from PDF and DOCX files.""" |
| 2 | + |
| 3 | +import io |
| 4 | +import re |
| 5 | + |
| 6 | +import docx |
| 7 | +from PyPDF2 import PdfReader |
| 8 | + |
| 9 | + |
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Return the newline-joined text of every page in a PDF document.

    Pages that yield no extractable text (e.g. scanned images) are skipped.
    """
    reader = PdfReader(io.BytesIO(file_bytes))
    extracted = (page.extract_text() for page in reader.pages)
    return "\n".join(content for content in extracted if content)
| 19 | + |
| 20 | + |
def extract_text_from_docx(file_bytes: bytes) -> str:
    """Return the newline-joined, stripped text of every non-empty paragraph
    in a DOCX document."""
    document = docx.Document(io.BytesIO(file_bytes))
    stripped = (paragraph.text.strip() for paragraph in document.paragraphs)
    return "\n".join(chunk for chunk in stripped if chunk)
| 29 | + |
| 30 | + |
| 31 | +def _extract_email(text: str) -> str: |
| 32 | + """Extract the first email address found in the text.""" |
| 33 | + match = re.search(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}", text) |
| 34 | + return match.group(0) if match else "" |
| 35 | + |
| 36 | + |
| 37 | +def _extract_phone(text: str) -> str: |
| 38 | + """Extract the first phone number found in the text (Brazilian format).""" |
| 39 | + patterns = [ |
| 40 | + r"\+55\s*\(?\d{2}\)?\s*\d{4,5}[\-\s]?\d{4}", |
| 41 | + r"\(?\d{2}\)?\s*\d{4,5}[\-\s]?\d{4}", |
| 42 | + ] |
| 43 | + for pattern in patterns: |
| 44 | + match = re.search(pattern, text) |
| 45 | + if match: |
| 46 | + return match.group(0).strip() |
| 47 | + return "" |
| 48 | + |
| 49 | + |
| 50 | +def _extract_linkedin(text: str) -> str: |
| 51 | + """Extract LinkedIn profile URL from the text.""" |
| 52 | + match = re.search( |
| 53 | + r"(?:https?://)?(?:www\.)?linkedin\.com/in/[a-zA-Z0-9\-_%]+/?", text |
| 54 | + ) |
| 55 | + return match.group(0) if match else "" |
| 56 | + |
| 57 | + |
| 58 | +def _extract_name(text: str) -> str: |
| 59 | + """Extract the candidate's name (typically the first non-empty line).""" |
| 60 | + lines = text.strip().split("\n") |
| 61 | + for line in lines: |
| 62 | + cleaned = line.strip() |
| 63 | + if not cleaned: |
| 64 | + continue |
| 65 | + if "@" in cleaned or "http" in cleaned.lower(): |
| 66 | + continue |
| 67 | + if re.match(r"^[\d\(\)+\-\s]+$", cleaned): |
| 68 | + continue |
| 69 | + if len(cleaned) < 3 or len(cleaned) > 80: |
| 70 | + continue |
| 71 | + if re.match(r"^[A-ZÀ-ÖØ-Ýa-zà-öø-ÿ\s\.]+$", cleaned): |
| 72 | + return cleaned |
| 73 | + return "" |
| 74 | + |
| 75 | + |
| 76 | +def _extract_city_state(text: str) -> tuple[str, str]: |
| 77 | + """Extract city and state (UF) from the text.""" |
| 78 | + uf_list = [ |
| 79 | + "AC", "AL", "AP", "AM", "BA", "CE", "DF", "ES", "GO", "MA", |
| 80 | + "MT", "MS", "MG", "PA", "PB", "PR", "PE", "PI", "RJ", "RN", |
| 81 | + "RS", "RO", "RR", "SC", "SP", "SE", "TO", |
| 82 | + ] |
| 83 | + uf_pattern = "|".join(uf_list) |
| 84 | + |
| 85 | + patterns = [ |
| 86 | + rf"([A-ZÀ-Öa-zà-ö\s]+)\s*[/\-,]\s*({uf_pattern})\b", |
| 87 | + rf"\b({uf_pattern})\s*[/\-,]\s*([A-ZÀ-Öa-zà-ö\s]+)", |
| 88 | + ] |
| 89 | + |
| 90 | + for pattern in patterns: |
| 91 | + match = re.search(pattern, text) |
| 92 | + if match: |
| 93 | + groups = match.groups() |
| 94 | + if groups[0] in uf_list: |
| 95 | + return groups[1].strip(), groups[0] |
| 96 | + return groups[0].strip(), groups[1].strip() |
| 97 | + |
| 98 | + return "", "" |
| 99 | + |
| 100 | + |
| 101 | +def _extract_skills(text: str) -> list[str]: |
| 102 | + """Extract skills from common resume sections.""" |
| 103 | + skills: list[str] = [] |
| 104 | + |
| 105 | + section_patterns = [ |
| 106 | + r"(?:habilidades|competências|skills|tecnologias|conhecimentos)\s*:?\s*\n?(.*?)(?:\n\n|\Z)", |
| 107 | + r"(?:HABILIDADES|COMPETÊNCIAS|SKILLS|TECNOLOGIAS|CONHECIMENTOS)\s*:?\s*\n?(.*?)(?:\n\n|\Z)", |
| 108 | + ] |
| 109 | + |
| 110 | + for pattern in section_patterns: |
| 111 | + match = re.search(pattern, text, re.DOTALL | re.IGNORECASE) |
| 112 | + if match: |
| 113 | + section_text = match.group(1).strip() |
| 114 | + items = re.split(r"[,;•\-\n|]+", section_text) |
| 115 | + for item in items: |
| 116 | + cleaned = item.strip() |
| 117 | + if cleaned and len(cleaned) > 1 and len(cleaned) < 60: |
| 118 | + skills.append(cleaned) |
| 119 | + break |
| 120 | + |
| 121 | + return skills |
| 122 | + |
| 123 | + |
| 124 | +def _extract_education(text: str) -> list[str]: |
| 125 | + """Extract education entries from common resume sections.""" |
| 126 | + education: list[str] = [] |
| 127 | + |
| 128 | + section_patterns = [ |
| 129 | + r"(?:formação|educação|education|formação acadêmica|escolaridade)\s*:?\s*\n?(.*?)(?:\n\n|\Z)", |
| 130 | + r"(?:FORMAÇÃO|EDUCAÇÃO|EDUCATION|FORMAÇÃO ACADÊMICA|ESCOLARIDADE)\s*:?\s*\n?(.*?)(?:\n\n|\Z)", |
| 131 | + ] |
| 132 | + |
| 133 | + for pattern in section_patterns: |
| 134 | + match = re.search(pattern, text, re.DOTALL | re.IGNORECASE) |
| 135 | + if match: |
| 136 | + section_text = match.group(1).strip() |
| 137 | + lines = section_text.split("\n") |
| 138 | + for line in lines: |
| 139 | + cleaned = line.strip().lstrip("•-– ") |
| 140 | + if cleaned and len(cleaned) > 3: |
| 141 | + education.append(cleaned) |
| 142 | + break |
| 143 | + |
| 144 | + return education |
| 145 | + |
| 146 | + |
| 147 | +def _extract_experience(text: str) -> list[str]: |
| 148 | + """Extract work experience entries from common resume sections.""" |
| 149 | + experience: list[str] = [] |
| 150 | + |
| 151 | + section_patterns = [ |
| 152 | + r"(?:experiência|experience|experiência profissional|histórico profissional)\s*:?\s*\n?(.*?)(?:\n\n|\Z)", |
| 153 | + r"(?:EXPERIÊNCIA|EXPERIENCE|EXPERIÊNCIA PROFISSIONAL|HISTÓRICO PROFISSIONAL)\s*:?\s*\n?(.*?)(?:\n\n|\Z)", |
| 154 | + ] |
| 155 | + |
| 156 | + for pattern in section_patterns: |
| 157 | + match = re.search(pattern, text, re.DOTALL | re.IGNORECASE) |
| 158 | + if match: |
| 159 | + section_text = match.group(1).strip() |
| 160 | + lines = section_text.split("\n") |
| 161 | + for line in lines: |
| 162 | + cleaned = line.strip().lstrip("•-– ") |
| 163 | + if cleaned and len(cleaned) > 3: |
| 164 | + experience.append(cleaned) |
| 165 | + break |
| 166 | + |
| 167 | + return experience |
| 168 | + |
| 169 | + |
def parse_resume_text(text: str) -> dict[str, str | list[str]]:
    """Parse raw resume text into a structured dictionary.

    The result maps the keys name, email, phone, city, state, linkedin,
    skills, education, and experience to the corresponding extracted values
    (strings, or lists of strings for the last three keys).
    """
    parsed: dict[str, str | list[str]] = {
        "name": _extract_name(text),
        "email": _extract_email(text),
        "phone": _extract_phone(text),
    }
    parsed["city"], parsed["state"] = _extract_city_state(text)
    parsed["linkedin"] = _extract_linkedin(text)
    parsed["skills"] = _extract_skills(text)
    parsed["education"] = _extract_education(text)
    parsed["experience"] = _extract_experience(text)
    return parsed
0 commit comments