Skip to content

Commit 84610a4

Browse files
Merge pull request #5 from EluminiIT/devin/1774360322-resume-upload-autofill
- Backend: add POST /api/v1/companies/parse-resume endpoint
- Backend: create resume_parser service (PDF/DOCX text extraction + regex parsing)
- Backend: add ResumeData model for structured resume data
- Backend: add PyPDF2 and python-docx dependencies
- Frontend: create ResumeUpload component with file input and loading state
- Frontend: create ResumeConfirmationModal with data preview and apply/cancel
- Frontend: integrate upload + modal into companies.tsx form
- Frontend: map resume fields to form fields (only fill empty fields)
- Frontend: regenerate client SDK with new parse-resume endpoint

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: daniel.resgate <daniel.rider69@gmail.com>
2 parents 7f19d4c + 8eaaaf8 commit 84610a4

11 files changed

Lines changed: 716 additions & 4 deletions

File tree

backend/app/api/routes/companies.py

Lines changed: 62 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,23 @@
1+
import logging
12
from typing import Any
23

3-
from fastapi import APIRouter, HTTPException
4+
from fastapi import APIRouter, HTTPException, UploadFile
45

56
from app.api.deps import CurrentUser, SessionDep
67
from app.crud import create_company, get_company_by_cnpj
7-
from app.models import CompanyCreate, CompanyPublic
8+
from app.models import CompanyCreate, CompanyPublic, ResumeData
9+
from app.resume_parser import (
10+
extract_text_from_docx,
11+
extract_text_from_pdf,
12+
parse_resume_text,
13+
)
14+
15+
logger = logging.getLogger(__name__)
816

917
router = APIRouter(prefix="/companies", tags=["companies"])
1018

19+
ALLOWED_EXTENSIONS = {".pdf", ".docx"}
20+
1121

1222
@router.post("/", response_model=CompanyPublic)
1323
def create_company_route(
@@ -24,3 +34,53 @@ def create_company_route(
2434
)
2535
company = create_company(session=session, company_in=company_in)
2636
return company
37+
38+
39+
@router.post("/parse-resume", response_model=ResumeData)
40+
async def parse_resume(
41+
*, current_user: CurrentUser, file: UploadFile # noqa: ARG001
42+
) -> Any:
43+
"""
44+
Parse a resume file (PDF or DOCX) and extract structured data.
45+
"""
46+
if not file.filename:
47+
raise HTTPException(
48+
status_code=400,
49+
detail="Nenhum arquivo foi enviado.",
50+
)
51+
52+
extension = ""
53+
if "." in file.filename:
54+
extension = "." + file.filename.rsplit(".", 1)[1].lower()
55+
56+
if extension not in ALLOWED_EXTENSIONS:
57+
raise HTTPException(
58+
status_code=400,
59+
detail="Formato de arquivo não suportado. Envie um arquivo PDF ou DOCX.",
60+
)
61+
62+
try:
63+
file_bytes = await file.read()
64+
65+
if extension == ".pdf":
66+
text = extract_text_from_pdf(file_bytes)
67+
else:
68+
text = extract_text_from_docx(file_bytes)
69+
70+
if not text.strip():
71+
raise HTTPException(
72+
status_code=400,
73+
detail="Não foi possível extrair texto do arquivo. Verifique se o arquivo não está vazio ou protegido.",
74+
)
75+
76+
parsed_data = parse_resume_text(text)
77+
return ResumeData(**parsed_data)
78+
79+
except HTTPException:
80+
raise
81+
except Exception as e:
82+
logger.exception("Erro ao processar currículo: %s", e)
83+
raise HTTPException(
84+
status_code=400,
85+
detail="Não foi possível ler o currículo enviado. Verifique o formato do arquivo e tente novamente.",
86+
)

backend/app/models.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,19 @@ class CompanyPublic(CompanyBase):
167167
created_at: datetime | None = None
168168

169169

170+
# Resume parsed data (not a DB table, just a response model)
class ResumeData(SQLModel):
    """Structured data extracted from an uploaded resume (PDF/DOCX).

    All fields are best-effort: a value that could not be extracted is an
    empty string (scalars) or an empty list (collections), never None.
    """

    name: str = ""
    email: str = ""
    phone: str = ""  # Brazilian formats, e.g. "(11) 98765-4321"
    city: str = ""
    state: str = ""  # two-letter Brazilian UF code, e.g. "SP"
    linkedin: str = ""  # profile URL as found in the text (scheme optional)
    # NOTE: pydantic (which SQLModel builds on) copies mutable defaults per
    # instance, so the shared [] literals here are not the classic
    # mutable-default pitfall.
    skills: list[str] = []
    education: list[str] = []
    experience: list[str] = []
181+
182+
170183
# Generic message
171184
class Message(SQLModel):
172185
message: str

backend/app/resume_parser.py

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
"""Service for extracting and parsing resume/CV data from PDF and DOCX files."""
2+
3+
import io
4+
import re
5+
6+
import docx
7+
from PyPDF2 import PdfReader
8+
9+
10+
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Return the newline-joined text of every page of a PDF.

    Pages that yield no text (scanned images, empty pages) are skipped.
    """
    reader = PdfReader(io.BytesIO(file_bytes))
    page_texts = (page.extract_text() for page in reader.pages)
    return "\n".join(chunk for chunk in page_texts if chunk)
19+
20+
21+
def extract_text_from_docx(file_bytes: bytes) -> str:
    """Return the newline-joined text of all non-empty DOCX paragraphs."""
    document = docx.Document(io.BytesIO(file_bytes))
    stripped = (paragraph.text.strip() for paragraph in document.paragraphs)
    return "\n".join(line for line in stripped if line)
29+
30+
31+
def _extract_email(text: str) -> str:
32+
"""Extract the first email address found in the text."""
33+
match = re.search(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}", text)
34+
return match.group(0) if match else ""
35+
36+
37+
def _extract_phone(text: str) -> str:
38+
"""Extract the first phone number found in the text (Brazilian format)."""
39+
patterns = [
40+
r"\+55\s*\(?\d{2}\)?\s*\d{4,5}[\-\s]?\d{4}",
41+
r"\(?\d{2}\)?\s*\d{4,5}[\-\s]?\d{4}",
42+
]
43+
for pattern in patterns:
44+
match = re.search(pattern, text)
45+
if match:
46+
return match.group(0).strip()
47+
return ""
48+
49+
50+
def _extract_linkedin(text: str) -> str:
51+
"""Extract LinkedIn profile URL from the text."""
52+
match = re.search(
53+
r"(?:https?://)?(?:www\.)?linkedin\.com/in/[a-zA-Z0-9\-_%]+/?", text
54+
)
55+
return match.group(0) if match else ""
56+
57+
58+
def _extract_name(text: str) -> str:
59+
"""Extract the candidate's name (typically the first non-empty line)."""
60+
lines = text.strip().split("\n")
61+
for line in lines:
62+
cleaned = line.strip()
63+
if not cleaned:
64+
continue
65+
if "@" in cleaned or "http" in cleaned.lower():
66+
continue
67+
if re.match(r"^[\d\(\)+\-\s]+$", cleaned):
68+
continue
69+
if len(cleaned) < 3 or len(cleaned) > 80:
70+
continue
71+
if re.match(r"^[A-ZÀ-ÖØ-Ýa-zà-öø-ÿ\s\.]+$", cleaned):
72+
return cleaned
73+
return ""
74+
75+
76+
def _extract_city_state(text: str) -> tuple[str, str]:
77+
"""Extract city and state (UF) from the text."""
78+
uf_list = [
79+
"AC", "AL", "AP", "AM", "BA", "CE", "DF", "ES", "GO", "MA",
80+
"MT", "MS", "MG", "PA", "PB", "PR", "PE", "PI", "RJ", "RN",
81+
"RS", "RO", "RR", "SC", "SP", "SE", "TO",
82+
]
83+
uf_pattern = "|".join(uf_list)
84+
85+
patterns = [
86+
rf"([A-ZÀ-Öa-zà-ö\s]+)\s*[/\-,]\s*({uf_pattern})\b",
87+
rf"\b({uf_pattern})\s*[/\-,]\s*([A-ZÀ-Öa-zà-ö\s]+)",
88+
]
89+
90+
for pattern in patterns:
91+
match = re.search(pattern, text)
92+
if match:
93+
groups = match.groups()
94+
if groups[0] in uf_list:
95+
return groups[1].strip(), groups[0]
96+
return groups[0].strip(), groups[1].strip()
97+
98+
return "", ""
99+
100+
101+
def _extract_skills(text: str) -> list[str]:
102+
"""Extract skills from common resume sections."""
103+
skills: list[str] = []
104+
105+
section_patterns = [
106+
r"(?:habilidades|competências|skills|tecnologias|conhecimentos)\s*:?\s*\n?(.*?)(?:\n\n|\Z)",
107+
r"(?:HABILIDADES|COMPETÊNCIAS|SKILLS|TECNOLOGIAS|CONHECIMENTOS)\s*:?\s*\n?(.*?)(?:\n\n|\Z)",
108+
]
109+
110+
for pattern in section_patterns:
111+
match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
112+
if match:
113+
section_text = match.group(1).strip()
114+
items = re.split(r"[,;•\-\n|]+", section_text)
115+
for item in items:
116+
cleaned = item.strip()
117+
if cleaned and len(cleaned) > 1 and len(cleaned) < 60:
118+
skills.append(cleaned)
119+
break
120+
121+
return skills
122+
123+
124+
def _extract_education(text: str) -> list[str]:
125+
"""Extract education entries from common resume sections."""
126+
education: list[str] = []
127+
128+
section_patterns = [
129+
r"(?:formação|educação|education|formação acadêmica|escolaridade)\s*:?\s*\n?(.*?)(?:\n\n|\Z)",
130+
r"(?:FORMAÇÃO|EDUCAÇÃO|EDUCATION|FORMAÇÃO ACADÊMICA|ESCOLARIDADE)\s*:?\s*\n?(.*?)(?:\n\n|\Z)",
131+
]
132+
133+
for pattern in section_patterns:
134+
match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
135+
if match:
136+
section_text = match.group(1).strip()
137+
lines = section_text.split("\n")
138+
for line in lines:
139+
cleaned = line.strip().lstrip("•-– ")
140+
if cleaned and len(cleaned) > 3:
141+
education.append(cleaned)
142+
break
143+
144+
return education
145+
146+
147+
def _extract_experience(text: str) -> list[str]:
148+
"""Extract work experience entries from common resume sections."""
149+
experience: list[str] = []
150+
151+
section_patterns = [
152+
r"(?:experiência|experience|experiência profissional|histórico profissional)\s*:?\s*\n?(.*?)(?:\n\n|\Z)",
153+
r"(?:EXPERIÊNCIA|EXPERIENCE|EXPERIÊNCIA PROFISSIONAL|HISTÓRICO PROFISSIONAL)\s*:?\s*\n?(.*?)(?:\n\n|\Z)",
154+
]
155+
156+
for pattern in section_patterns:
157+
match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
158+
if match:
159+
section_text = match.group(1).strip()
160+
lines = section_text.split("\n")
161+
for line in lines:
162+
cleaned = line.strip().lstrip("•-– ")
163+
if cleaned and len(cleaned) > 3:
164+
experience.append(cleaned)
165+
break
166+
167+
return experience
168+
169+
170+
def parse_resume_text(text: str) -> dict[str, str | list[str]]:
    """Parse resume text and extract structured data.

    Returns a dict with the keys: name, email, phone, city, state,
    linkedin, skills, education, experience. Scalar values are "" and
    list values are [] when nothing could be extracted.
    """
    city, state = _extract_city_state(text)

    parsed: dict[str, str | list[str]] = {
        "name": _extract_name(text),
        "email": _extract_email(text),
        "phone": _extract_phone(text),
        "city": city,
        "state": state,
        "linkedin": _extract_linkedin(text),
        "skills": _extract_skills(text),
        "education": _extract_education(text),
        "experience": _extract_experience(text),
    }
    return parsed

backend/pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ dependencies = [
1919
"sentry-sdk[fastapi]>=2.0.0,<3.0.0",
2020
"pyjwt<3.0.0,>=2.8.0",
2121
"pwdlib[argon2,bcrypt]>=0.3.0",
22+
"pypdf2>=3.0.1",
23+
"python-docx>=1.2.0",
2224
]
2325

2426
[dependency-groups]

frontend/src/client/schemas.gen.ts

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,18 @@
11
// This file is auto-generated by @hey-api/openapi-ts
22

3+
// Request body schema for POST /companies/parse-resume: one required
// "file" field carried as a binary string (multipart upload).
// Auto-generated by @hey-api/openapi-ts — regenerate instead of editing.
export const Body_companies_parse_resumeSchema = {
    properties: {
        file: {
            type: 'string',
            format: 'binary',
            title: 'File'
        }
    },
    type: 'object',
    required: ['file'],
    title: 'Body_companies-parse_resume'
} as const;
15+
316
export const Body_login_login_access_tokenSchema = {
417
properties: {
518
grant_type: {
@@ -686,6 +699,67 @@ export const PrivateUserCreateSchema = {
686699
title: 'PrivateUserCreate'
687700
} as const;
688701

702+
// Response schema for the parse-resume endpoint: best-effort parsed resume
// fields. Every scalar defaults to '' and every list to [], so no property
// is required. Auto-generated by @hey-api/openapi-ts — regenerate instead
// of editing.
export const ResumeDataSchema = {
    properties: {
        name: {
            type: 'string',
            title: 'Name',
            default: ''
        },
        email: {
            type: 'string',
            title: 'Email',
            default: ''
        },
        phone: {
            type: 'string',
            title: 'Phone',
            default: ''
        },
        city: {
            type: 'string',
            title: 'City',
            default: ''
        },
        state: {
            type: 'string',
            title: 'State',
            default: ''
        },
        linkedin: {
            type: 'string',
            title: 'Linkedin',
            default: ''
        },
        skills: {
            items: {
                type: 'string'
            },
            type: 'array',
            title: 'Skills',
            default: []
        },
        education: {
            items: {
                type: 'string'
            },
            type: 'array',
            title: 'Education',
            default: []
        },
        experience: {
            items: {
                type: 'string'
            },
            type: 'array',
            title: 'Experience',
            default: []
        }
    },
    type: 'object',
    title: 'ResumeData'
} as const;
762+
689763
export const TokenSchema = {
690764
properties: {
691765
access_token: {

0 commit comments

Comments
 (0)