|
| 1 | +"""Service for extracting and parsing resume/CV data from PDF and DOCX files.""" |
| 2 | + |
| 3 | +import io |
| 4 | +import re |
| 5 | + |
| 6 | +import docx |
| 7 | +from PyPDF2 import PdfReader |
| 8 | + |
| 9 | + |
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Return the newline-joined text of every page in a PDF document.

    Pages that yield no extractable text (e.g. scanned images) are skipped.
    """
    reader = PdfReader(io.BytesIO(file_bytes))
    extracted = (page.extract_text() for page in reader.pages)
    return "\n".join(content for content in extracted if content)
| 19 | + |
| 20 | + |
def extract_text_from_docx(file_bytes: bytes) -> str:
    """Return the newline-joined, stripped text of every non-empty paragraph
    in a DOCX document."""
    document = docx.Document(io.BytesIO(file_bytes))
    stripped = (paragraph.text.strip() for paragraph in document.paragraphs)
    return "\n".join(chunk for chunk in stripped if chunk)
| 29 | + |
| 30 | + |
| 31 | +def _extract_email(text: str) -> str: |
| 32 | + """Extract the first email address found in the text.""" |
| 33 | + match = re.search(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}", text) |
| 34 | + return match.group(0) if match else "" |
| 35 | + |
| 36 | + |
| 37 | +def _extract_phone(text: str) -> str: |
| 38 | + """Extract the first phone number found in the text (Brazilian format).""" |
| 39 | + patterns = [ |
| 40 | + r"\+55\s*\(?\d{2}\)?\s*\d{4,5}[\-\s]?\d{4}", |
| 41 | + r"\(?\d{2}\)?\s*\d{4,5}[\-\s]?\d{4}", |
| 42 | + ] |
| 43 | + for pattern in patterns: |
| 44 | + match = re.search(pattern, text) |
| 45 | + if match: |
| 46 | + return match.group(0).strip() |
| 47 | + return "" |
| 48 | + |
| 49 | + |
| 50 | +def _extract_linkedin(text: str) -> str: |
| 51 | + """Extract LinkedIn profile URL from the text.""" |
| 52 | + match = re.search( |
| 53 | + r"(?:https?://)?(?:www\.)?linkedin\.com/in/[a-zA-Z0-9\-_%]+/?", text |
| 54 | + ) |
| 55 | + return match.group(0) if match else "" |
| 56 | + |
| 57 | + |
| 58 | +def _extract_name(text: str) -> str: |
| 59 | + """Extract the candidate's name (typically the first non-empty line).""" |
| 60 | + lines = text.strip().split("\n") |
| 61 | + for line in lines: |
| 62 | + cleaned = line.strip() |
| 63 | + if not cleaned: |
| 64 | + continue |
| 65 | + if "@" in cleaned or "http" in cleaned.lower(): |
| 66 | + continue |
| 67 | + if re.match(r"^[\d\(\)+\-\s]+$", cleaned): |
| 68 | + continue |
| 69 | + if len(cleaned) < 3 or len(cleaned) > 80: |
| 70 | + continue |
| 71 | + if re.match(r"^[A-ZÀ-ÖØ-Ýa-zà-öø-ÿ\s\.]+$", cleaned): |
| 72 | + return cleaned |
| 73 | + return "" |
| 74 | + |
| 75 | + |
| 76 | +def _extract_city_state(text: str) -> tuple[str, str]: |
| 77 | + """Extract city and state (UF) from the text.""" |
| 78 | + uf_list = [ |
| 79 | + "AC", "AL", "AP", "AM", "BA", "CE", "DF", "ES", "GO", "MA", |
| 80 | + "MT", "MS", "MG", "PA", "PB", "PR", "PE", "PI", "RJ", "RN", |
| 81 | + "RS", "RO", "RR", "SC", "SP", "SE", "TO", |
| 82 | + ] |
| 83 | + uf_pattern = "|".join(uf_list) |
| 84 | + |
| 85 | + patterns = [ |
| 86 | + rf"([A-ZÀ-Öa-zà-ö\s]+)\s*[/\-,]\s*({uf_pattern})\b", |
| 87 | + rf"\b({uf_pattern})\s*[/\-,]\s*([A-ZÀ-Öa-zà-ö\s]+)", |
| 88 | + ] |
| 89 | + |
| 90 | + for pattern in patterns: |
| 91 | + match = re.search(pattern, text) |
| 92 | + if match: |
| 93 | + groups = match.groups() |
| 94 | + if groups[0] in uf_list: |
| 95 | + return groups[1].strip(), groups[0] |
| 96 | + return groups[0].strip(), groups[1].strip() |
| 97 | + |
| 98 | + return "", "" |
| 99 | + |
| 100 | + |
| 101 | +def _extract_skills(text: str) -> list[str]: |
| 102 | + """Extract skills from common resume sections.""" |
| 103 | + skills: list[str] = [] |
| 104 | + |
| 105 | + section_patterns = [ |
| 106 | + r"(?:habilidades|competências|skills|tecnologias|conhecimentos)\s*:?\s*\n?(.*?)(?:\n\n|\Z)", |
| 107 | + r"(?:HABILIDADES|COMPETÊNCIAS|SKILLS|TECNOLOGIAS|CONHECIMENTOS)\s*:?\s*\n?(.*?)(?:\n\n|\Z)", |
| 108 | + ] |
| 109 | + |
| 110 | + for pattern in section_patterns: |
| 111 | + match = re.search(pattern, text, re.DOTALL | re.IGNORECASE) |
| 112 | + if match: |
| 113 | + section_text = match.group(1).strip() |
| 114 | + items = re.split(r"[,;•\-\n|]+", section_text) |
| 115 | + for item in items: |
| 116 | + cleaned = item.strip() |
| 117 | + if cleaned and len(cleaned) > 1 and len(cleaned) < 60: |
| 118 | + skills.append(cleaned) |
| 119 | + break |
| 120 | + |
| 121 | + return skills |
| 122 | + |
| 123 | + |
| 124 | +def _extract_education(text: str) -> list[str]: |
| 125 | + """Extract education entries from common resume sections.""" |
| 126 | + education: list[str] = [] |
| 127 | + |
| 128 | + section_patterns = [ |
| 129 | + r"(?:formação|educação|education|formação acadêmica|escolaridade)\s*:?\s*\n?(.*?)(?:\n\n|\Z)", |
| 130 | + r"(?:FORMAÇÃO|EDUCAÇÃO|EDUCATION|FORMAÇÃO ACADÊMICA|ESCOLARIDADE)\s*:?\s*\n?(.*?)(?:\n\n|\Z)", |
| 131 | + ] |
| 132 | + |
| 133 | + for pattern in section_patterns: |
| 134 | + match = re.search(pattern, text, re.DOTALL | re.IGNORECASE) |
| 135 | + if match: |
| 136 | + section_text = match.group(1).strip() |
| 137 | + lines = section_text.split("\n") |
| 138 | + for line in lines: |
| 139 | + cleaned = line.strip().lstrip("•-– ") |
| 140 | + if cleaned and len(cleaned) > 3: |
| 141 | + education.append(cleaned) |
| 142 | + break |
| 143 | + |
| 144 | + return education |
| 145 | + |
| 146 | + |
| 147 | +def _extract_experience(text: str) -> list[str]: |
| 148 | + """Extract work experience entries from common resume sections.""" |
| 149 | + experience: list[str] = [] |
| 150 | + |
| 151 | + section_patterns = [ |
| 152 | + r"(?:experiência|experience|experiência profissional|histórico profissional)\s*:?\s*\n?(.*?)(?:\n\n|\Z)", |
| 153 | + r"(?:EXPERIÊNCIA|EXPERIENCE|EXPERIÊNCIA PROFISSIONAL|HISTÓRICO PROFISSIONAL)\s*:?\s*\n?(.*?)(?:\n\n|\Z)", |
| 154 | + ] |
| 155 | + |
| 156 | + for pattern in section_patterns: |
| 157 | + match = re.search(pattern, text, re.DOTALL | re.IGNORECASE) |
| 158 | + if match: |
| 159 | + section_text = match.group(1).strip() |
| 160 | + lines = section_text.split("\n") |
| 161 | + for line in lines: |
| 162 | + cleaned = line.strip().lstrip("•-– ") |
| 163 | + if cleaned and len(cleaned) > 3: |
| 164 | + experience.append(cleaned) |
| 165 | + break |
| 166 | + |
| 167 | + return experience |
| 168 | + |
| 169 | + |
def parse_resume_text(text: str) -> dict[str, str | list[str]]:
    """Parse raw resume text into a structured dictionary.

    The result maps the keys name, email, phone, city, state, linkedin,
    skills, education, and experience to the corresponding extracted values
    (strings, or lists of strings for the last three keys).
    """
    parsed: dict[str, str | list[str]] = {
        "name": _extract_name(text),
        "email": _extract_email(text),
        "phone": _extract_phone(text),
    }
    parsed["city"], parsed["state"] = _extract_city_state(text)
    parsed["linkedin"] = _extract_linkedin(text)
    parsed["skills"] = _extract_skills(text)
    parsed["education"] = _extract_education(text)
    parsed["experience"] = _extract_experience(text)
    return parsed
0 commit comments