Merge branch 'feature/apec-ingestion'

This commit is contained in:
Antoine 2026-06-05 18:01:17 +02:00
commit b4182c9686
23 changed files with 4417 additions and 9 deletions

View File

@ -9,6 +9,8 @@ description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
"beautifulsoup4>=4.12,<5",
"playwright>=1.52,<2",
"pydantic>=2.7,<3",
"pypdf>=5.0,<6",
"pyyaml>=6.0,<7",

View File

@ -0,0 +1,3 @@
from job_research.apec.query_derivation import derive_apec_queries
__all__ = ["derive_apec_queries"]

View File

@ -0,0 +1,317 @@
from __future__ import annotations
import re
from contextlib import contextmanager
from dataclasses import dataclass
from urllib.parse import parse_qsl, urlencode, urlparse, urlsplit, urlunsplit
from playwright.sync_api import Error as PlaywrightError
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
from playwright.sync_api import sync_playwright
from job_research.models import ListingError
# Base URL of the Apec job-search results page; query parameters are appended to it.
_SEARCH_URL = "https://www.apec.fr/candidat/recherche-emploi.html/emploi"
# Value sent as the `lieux` query parameter when the location filter is "France".
_FRANCE_LOCATION_ID = "799"
# Value sent as the `typesContrat` query parameter when the contract filter is "CDI".
_CDI_CONTRACT_ID = "101888"
# Selector awaited after navigation to confirm that the search form rendered.
_SEARCH_INPUT_SELECTOR = 'input[name="keywords"]'
# Selector matching anchors that point at individual offer detail pages.
_RESULT_LINK_SELECTOR = "a[href*='/candidat/recherche-emploi.html/emploi/detail-offre/']"
# A page URL containing this fragment together with "error=true" is treated as "zero results".
_ZERO_RESULTS_URL_FRAGMENT = "/recherche-avancee"
# Captures the path segment that follows /detail-offre/ (used as the source job id).
_DETAIL_JOB_ID_PATTERN = re.compile(r"/detail-offre/([^/?#]+)")
# Hostnames accepted when validating a detail URL before fetching it.
_APEC_HOSTS = {"apec.fr", "www.apec.fr"}
# Upper bound on the number of result pages visited for a single query.
_MAX_PAGES_PER_QUERY = 50
# Pagination of a query stops after this many consecutive pages that add no new result.
_MAX_CONSECUTIVE_NO_PROGRESS_PAGES = 10
@dataclass(slots=True)
class ApecSearchResult:
    """One offer link collected from an Apec search results page."""

    # Link to the offer detail page, as read from the result anchor's href.
    url: str
    # Identifier parsed from the /detail-offre/<id> URL segment; None when the URL has none.
    source_job_id: str | None = None
@dataclass(slots=True)
class ApecSearchFilters:
    """Optional filters applied when an Apec search URL is built."""

    # Only the exact value "France" is translated into a URL parameter by _search_url.
    location: str | None = None
    # Only the exact value "CDI" is translated into a URL parameter by _search_url.
    contract_type: str | None = None
class ApecSearchError(RuntimeError):
    """Raised by ApecAdapter.search when no query produced a usable search page."""

    pass
@contextmanager
def _open_public_page():
    """Yield a page from a throwaway headless Chromium and close the browser on exit."""
    with sync_playwright() as driver:
        chromium = driver.chromium.launch(headless=True)
        try:
            fresh_page = chromium.new_page()
            # Playwright waits on this page default to 15 seconds.
            fresh_page.set_default_timeout(15_000)
            yield fresh_page
        finally:
            chromium.close()
def _extract_source_job_id(url: str) -> str | None:
    """Return the segment that follows ``/detail-offre/`` in *url*, or ``None`` if absent."""
    found = _DETAIL_JOB_ID_PATTERN.search(url)
    return found.group(1) if found is not None else None
def _search_results_url(base_url: str, page_number: int) -> str:
parsed_url = urlsplit(base_url)
params = parse_qsl(parsed_url.query, keep_blank_values=True)
filtered_params = [(key, value) for key, value in params if key != "page"]
filtered_params.append(("page", str(page_number)))
return urlunsplit(parsed_url._replace(query=urlencode(filtered_params, doseq=True)))
def _search_url(query: str, search_filters: ApecSearchFilters, page_number: int = 0) -> str:
    """Build the Apec search URL for *query*, the supported filters and a page index.

    Parameter order in the resulting query string is: ``motsCles``, then
    ``lieux`` (France only), then ``typesContrat`` (CDI only), then ``page``.
    """
    pairs = [("motsCles", query)]
    # Only "France" and "CDI" have a known Apec identifier; other values add nothing.
    if search_filters.location == "France":
        pairs.append(("lieux", _FRANCE_LOCATION_ID))
    if search_filters.contract_type == "CDI":
        pairs.append(("typesContrat", _CDI_CONTRACT_ID))
    pairs.append(("page", str(page_number)))
    return f"{_SEARCH_URL}?{urlencode(pairs)}"
def _accept_cookies_if_present(page) -> None:
    """Best-effort acceptance of the consent controls on *page*.

    Ticks the ``cguAcceptees`` checkbox if it can, then clicks the first
    accept button that responds. Attribute, Playwright and timeout errors are
    swallowed on purpose: a missing control is not a failure.
    """
    try:
        # 2 s timeout: the checkbox is optional, so do not wait the page default.
        page.locator('input[name="cguAcceptees"]').check(timeout=2_000)
    except (AttributeError, PlaywrightError, PlaywrightTimeoutError):
        pass
    # Try each known button label in turn and stop at the first successful click.
    for button_name in ("ACCEPTER", "Accepter tous les cookies"):
        try:
            page.get_by_role("button", name=button_name).click(timeout=2_000)
            return
        except (AttributeError, PlaywrightError, PlaywrightTimeoutError):
            continue
def _goto_and_wait(page, url: str) -> bool:
try:
page.goto(url, wait_until="domcontentloaded")
page.wait_for_load_state("domcontentloaded")
except Exception:
return False
return True
def _is_public_apec_detail_url(url: str) -> bool:
    """Tell whether *url* is an https Apec offer-detail URL on an accepted host."""
    parts = urlparse(url)
    if parts.scheme != "https":
        return False
    if parts.hostname not in _APEC_HOSTS:
        return False
    # The path must be exactly the detail route followed by a single id segment.
    detail_path = re.fullmatch(
        r"/candidat/recherche-emploi\.html/emploi/detail-offre/[^/?#]+",
        parts.path,
    )
    return detail_path is not None
class ApecAdapter:
    """Playwright-backed client that searches Apec and fetches offer detail pages.

    ``search_errors`` holds the non-fatal problems recorded by the most recent
    :meth:`search` call.
    """

    def __init__(self, max_listings: int = 50) -> None:
        # Cap on the number of results a single search() call collects.
        self.max_listings = max_listings
        self.search_errors: list[ListingError] = []
        # Set only while browser_session() is active; None means "no shared browser".
        self._browser_context = None

    @contextmanager
    def browser_session(self):
        """Keep one shared browser context open for the duration of the ``with`` block.

        When a session is already active, a nested call yields immediately and
        leaves teardown to the outer session.
        """
        if self._browser_context is not None:
            yield
            return
        with sync_playwright() as playwright:
            browser = playwright.chromium.launch(headless=True)
            browser_context = browser.new_context()
            self._browser_context = browser_context
            try:
                yield
            finally:
                # Clear the reference before closing so _open_page() never sees a closed context.
                self._browser_context = None
                browser.close()

    @contextmanager
    def _open_page(self):
        """Yield a page from the shared context if a session is active, else from a one-off browser."""
        if self._browser_context is None:
            with _open_public_page() as page:
                yield page
            return
        page = self._browser_context.new_page()
        page.set_default_timeout(15_000)
        try:
            yield page
        finally:
            # Only the page is closed here; the shared context belongs to browser_session().
            page.close()

    def _record_search_error(
        self,
        query: str,
        search_filters: ApecSearchFilters,
        message: str,
        *,
        url: str | None = None,
    ) -> None:
        """Append a "search"-stage ListingError; *url* defaults to the query's first-page search URL."""
        self.search_errors.append(
            ListingError(url=url or _search_url(query, search_filters), stage="search", message=message)
        )

    @staticmethod
    def _is_zero_results_page(page) -> bool:
        """Return True when the page URL contains both the zero-results fragment and ``error=true``."""
        return _ZERO_RESULTS_URL_FRAGMENT in page.url and "error=true" in page.url

    def search(self, queries: list[str], search_filters: ApecSearchFilters) -> list[ApecSearchResult]:
        """Run each query and collect up to ``max_listings`` unique offer links.

        Blank queries are skipped. For every query the first results page is
        loaded, then further pages are requested by rewriting the ``page``
        parameter until the listing cap, the page cap, a repeated URL, a
        failure, or too many consecutive pages without new results ends the
        pagination. Non-fatal problems are appended to ``search_errors``,
        which is reset at the start of each call.

        Raises:
            ApecSearchError: if no query reached a usable search page (either
                rendered results or a recognised zero-results page).
        """
        results: list[ApecSearchResult] = []
        # Dedupe keys shared across all queries: the job id when known, else the href.
        seen_keys: set[str] = set()
        usable_search_page_seen = False
        self.search_errors = []
        with self._open_page() as page:
            for query in queries:
                if not query.strip():
                    continue
                if len(results) >= self.max_listings:
                    break
                if not _goto_and_wait(page, _search_url(query, search_filters)):
                    self._record_search_error(query, search_filters, "search page navigation failed")
                    continue
                _accept_cookies_if_present(page)
                try:
                    page.wait_for_selector(_SEARCH_INPUT_SELECTOR, timeout=5_000)
                except PlaywrightTimeoutError:
                    self._record_search_error(query, search_filters, "search input did not render")
                    continue
                if self._is_zero_results_page(page):
                    # A zero-results page still proves the search page was reachable.
                    usable_search_page_seen = True
                    continue
                try:
                    page.wait_for_selector(_RESULT_LINK_SELECTOR, timeout=5_000)
                except PlaywrightTimeoutError:
                    self._record_search_error(query, search_filters, "search results did not render")
                    continue
                usable_search_page_seen = True
                # URL the browser ended up on for page 0; later pages are derived from it.
                result_page_url = page.url
                seen_page_urls: set[str] = {result_page_url}
                no_progress_pages = 0
                for page_number in range(_MAX_PAGES_PER_QUERY):
                    if len(results) >= self.max_listings:
                        break
                    if page_number > 0:
                        # Page 0 is already loaded above; only later pages need navigation.
                        next_page_url = _search_results_url(result_page_url, page_number)
                        if next_page_url in seen_page_urls:
                            break
                        if not _goto_and_wait(page, next_page_url):
                            self._record_search_error(
                                query,
                                search_filters,
                                f"page {page_number} navigation failed",
                                url=next_page_url,
                            )
                            break
                        try:
                            page.wait_for_selector(_RESULT_LINK_SELECTOR, timeout=5_000)
                        except PlaywrightTimeoutError:
                            self._record_search_error(
                                query,
                                search_filters,
                                f"page {page_number} results did not render",
                                url=next_page_url,
                            )
                            break
                    current_page_url = page.url
                    # Landing on a URL that was already visited means pagination has looped.
                    if page_number > 0 and current_page_url in seen_page_urls:
                        break
                    seen_page_urls.add(current_page_url)
                    try:
                        hrefs = page.locator(_RESULT_LINK_SELECTOR).evaluate_all(
                            "nodes => nodes.map(node => node.href)"
                        )
                    except Exception:
                        self._record_search_error(
                            query,
                            search_filters,
                            f"page {page_number} result links could not be evaluated",
                            url=current_page_url,
                        )
                        break
                    if not hrefs:
                        no_progress_pages += 1
                        if no_progress_pages >= _MAX_CONSECUTIVE_NO_PROGRESS_PAGES:
                            break
                        continue
                    added_any_result = False
                    for href in hrefs:
                        source_job_id = _extract_source_job_id(href)
                        dedupe_key = source_job_id or href
                        if dedupe_key in seen_keys:
                            continue
                        seen_keys.add(dedupe_key)
                        results.append(ApecSearchResult(url=href, source_job_id=source_job_id))
                        added_any_result = True
                        if len(results) >= self.max_listings:
                            break
                    if added_any_result:
                        no_progress_pages = 0
                    else:
                        # A page made only of already-seen links counts as "no progress".
                        no_progress_pages += 1
                        if no_progress_pages >= _MAX_CONSECUTIVE_NO_PROGRESS_PAGES:
                            break
        if not usable_search_page_seen:
            raise ApecSearchError("Apec search page was not reachable for any query")
        return results

    def fetch_listing_html(self, url: str) -> str:
        """Return the rendered HTML of a public Apec offer detail page.

        Raises:
            ValueError: if *url*, or the URL the page ends up on, is not a
                public Apec detail URL.

        Playwright errors raised by navigation or by the readiness wait are
        not caught here and propagate to the caller.
        """
        if not _is_public_apec_detail_url(url):
            raise ValueError("ApecAdapter only fetches public Apec URLs")
        with self._open_page() as page:
            page.goto(url, wait_until="domcontentloaded")
            _accept_cookies_if_present(page)
            # Wait until the title, the offer reference and the details list are all in the DOM.
            page.wait_for_function(
                """
() => {
const title = document.querySelector('.container-details-offer h1, h1');
const reference = document.querySelector('.ref-offre');
const offerList = document.querySelector('.details-offer-list');
return !!title && !!reference && !!offerList;
}
""",
                polling=1000,
                timeout=15_000,
            )
            final_url = page.url
            # Re-validate after navigation: a redirect may have left the detail route.
            if not _is_public_apec_detail_url(final_url):
                raise ValueError(f"ApecAdapter landed on an unexpected URL after redirects: {final_url}")
            return page.content()

View File

@ -0,0 +1,92 @@
from job_research.models import ApecListing
# ApecListing attributes that a surviving listing may inherit from a duplicate
# when its own value is None (see _merge_listing_metadata).
_MERGEABLE_FIELDS = (
    "source_job_id",
    "title",
    "company",
    "location",
    "contract_type",
    "description_text",
    "published_at",
    "refreshed_at",
)
def _merge_listing_metadata(survivor: ApecListing, source: ApecListing) -> None:
    """Fill the gaps of *survivor* from *source* and carry over warnings it lacks.

    Only attributes named in ``_MERGEABLE_FIELDS`` are considered, and a value
    already present on *survivor* is never overwritten.
    """
    for attribute in _MERGEABLE_FIELDS:
        if getattr(survivor, attribute) is not None:
            # Values already set on the survivor always win.
            continue
        candidate = getattr(source, attribute)
        if candidate is None:
            continue
        setattr(survivor, attribute, candidate)

    for extra in source.warnings:
        if extra in survivor.warnings:
            continue
        survivor.warnings.append(extra)
def _register_listing(
url_to_listing: dict[str, ApecListing],
source_job_id_to_listing: dict[str, ApecListing],
listing: ApecListing,
survivor: ApecListing,
) -> None:
url_to_listing[listing.url] = survivor
if listing.source_job_id is not None:
source_job_id_to_listing[listing.source_job_id] = survivor
def _repoint_listing_aliases(
url_to_listing: dict[str, ApecListing],
source_job_id_to_listing: dict[str, ApecListing],
removed: ApecListing,
survivor: ApecListing,
) -> None:
for mapping in (url_to_listing, source_job_id_to_listing):
for alias, listing in list(mapping.items()):
if listing is removed:
mapping[alias] = survivor
def dedupe_apec_listings(listings: list[ApecListing]) -> list[ApecListing]:
    """Collapse listings that share a URL or a source job id.

    The earliest-seen listing of each duplicate group survives, in its
    original position, and inherits missing metadata and warnings from the
    listings merged into it. Surviving listings are mutated in place.
    """
    by_url: dict[str, ApecListing] = {}
    by_job_id: dict[str, ApecListing] = {}
    # id(listing) -> arrival rank, used to pick the oldest survivor of a group.
    first_seen_rank: dict[int, int] = {}
    kept: list[ApecListing] = []
    rank_counter = 0

    for incoming in listings:
        job_id = incoming.source_job_id
        hits: list[ApecListing] = []
        same_url = by_url.get(incoming.url)
        if same_url is not None:
            hits.append(same_url)
        if job_id is not None:
            same_job = by_job_id.get(job_id)
            if not (same_job is None or same_job in hits):
                hits.append(same_job)

        if hits:
            winner = min(hits, key=lambda hit: first_seen_rank[id(hit)])
            for loser in hits:
                if loser is winner:
                    continue
                # The incoming listing bridges two earlier survivors: fold the
                # younger one into the older one and retire it.
                _merge_listing_metadata(winner, loser)
                kept[:] = [entry for entry in kept if entry is not loser]
                _repoint_listing_aliases(by_url, by_job_id, loser, winner)
                first_seen_rank.pop(id(loser), None)
            _merge_listing_metadata(winner, incoming)
            _register_listing(by_url, by_job_id, incoming, winner)
        else:
            kept.append(incoming)
            first_seen_rank[id(incoming)] = rank_counter
            rank_counter += 1
            _register_listing(by_url, by_job_id, incoming, incoming)

    return kept

View File

@ -0,0 +1,328 @@
from __future__ import annotations
import re
import unicodedata
from datetime import datetime
from bs4 import BeautifulSoup
from bs4.element import NavigableString
from job_research.models import ApecListing, ListingWarning
# Captures the dd/mm/yyyy date that follows "Publiée le" / "Publiee le".
_PUBLISHED_AT_PATTERN = re.compile(r"Publi[ée]e le (\d{2}/\d{2}/\d{4})")
# Captures the dd/mm/yyyy date that follows "Actualisée le" / "Actualisee le".
_REFRESHED_AT_PATTERN = re.compile(r"Actualis[ée]e le (\d{2}/\d{2}/\d{4})")
# Captures the path segment that follows /detail-offre/ in a listing URL.
_DETAIL_JOB_ID_PATTERN = re.compile(r"/detail-offre/([^/?#]+)")
# Captures the uppercase alphanumeric reference that follows "Ref. Apec :".
_SOURCE_JOB_ID_PATTERN = re.compile(r"Ref\. Apec\s*:\s*([A-Z0-9]+)")
# Contract labels recognised when the contract type has to be recovered from free text.
_CONTRACT_PATTERN = re.compile(r"\b(CDI|CDD|Alternance|Intérim|Stage|Freelance|Indépendant)\b")
# Tag names treated as section boundaries when extracting section text.
_HEADING_TAG_NAMES = {"h1", "h2", "h3", "h4", "h5", "h6"}
# Tokens (in the form produced by _text_token) that count as "no real value".
_PLACEHOLDER_TEXT_TOKENS = {
    "na",
    "nr",
    "none",
    "null",
    "unknown",
    "tbd",
    "nonrenseigne",
    "nonrenseignee",
    "nondisponible",
}
def _clean_text(value: str | None) -> str | None:
if value is None:
return None
cleaned = " ".join(value.split())
return cleaned or None
def _text_token(value: str) -> str:
normalized = unicodedata.normalize("NFKD", value)
return re.sub(r"[^a-z0-9]+", "", normalized.casefold())
def _has_useful_text(value: str | None) -> bool:
    """Return True when *value* holds real content rather than nothing or a placeholder."""
    collapsed = _clean_text(value)
    if collapsed is None:
        return False
    key = _text_token(collapsed)
    if not key:
        # Punctuation-only text reduces to an empty token.
        return False
    return key not in _PLACEHOLDER_TEXT_TOKENS
def _text_before_heading(node) -> str | None:
    """Collect the text of *node*, stopping at the first heading among its direct children.

    A text node yields its cleaned text, a heading tag yields ``None``, and
    any other node is walked child by child with the same rule applied
    recursively to each child.
    """
    if isinstance(node, NavigableString):
        return _clean_text(str(node))
    if getattr(node, "name", None) in _HEADING_TAG_NAMES:
        return None

    collected: list[str] = []
    for child in getattr(node, "children", []):
        if getattr(child, "name", None) in _HEADING_TAG_NAMES:
            break
        fragment = _clean_text(_text_before_heading(child))
        if fragment:
            collected.append(fragment)
    return _clean_text(" ".join(collected))
def _extract_section_text(block, label: str) -> str | None:
    """Return the text that follows the heading titled *label* inside *block*.

    Text is gathered from the heading's following siblings up to the next
    sibling heading. Returns ``None`` when no such heading exists or when the
    section holds no text.
    """

    def _is_labelled_heading(tag) -> bool:
        if getattr(tag, "name", None) not in _HEADING_TAG_NAMES:
            return False
        return _clean_text(tag.get_text(" ", strip=True)) == label

    heading = block.find(_is_labelled_heading)
    if heading is None:
        return None

    fragments: list[str] = []
    for sibling in heading.next_siblings:
        if getattr(sibling, "name", None) in _HEADING_TAG_NAMES:
            break
        fragment = _text_before_heading(sibling)
        if fragment:
            fragments.append(fragment)
    return _clean_text(" ".join(fragments))
def _detail_block_text(soup: BeautifulSoup, label: str) -> str | None:
    """Return the first non-``None`` section text for *label* among ``.details-post`` blocks.

    Blocks that contain no ``h4`` element are ignored.
    """
    section_texts = (
        _extract_section_text(block, label)
        for block in soup.select(".details-post")
        if block.find("h4") is not None
    )
    return next((text for text in section_texts if text is not None), None)
def _warning(field: str, message: str) -> ListingWarning:
    """Build a ListingWarning for *field* carrying *message*."""
    return ListingWarning(field=field, message=message)
def _extract_source_job_id_from_url(url: str) -> str | None:
    """Return the segment that follows ``/detail-offre/`` in *url*, or ``None`` if absent."""
    found = _DETAIL_JOB_ID_PATTERN.search(url)
    return None if found is None else found.group(1)
def _extract_listing_date(
soup: BeautifulSoup,
pattern: re.Pattern[str],
*,
field: str,
missing_message: str | None = None,
invalid_message: str,
warnings: list[ListingWarning],
warn_on_missing: bool,
) -> str | None:
card_offer = soup.select_one(".card-offer")
if card_offer is None:
if warn_on_missing and missing_message is not None:
warnings.append(_warning(field, missing_message))
return None
match = pattern.search(card_offer.get_text(" ", strip=True))
if match is None:
if warn_on_missing and missing_message is not None:
warnings.append(_warning(field, missing_message))
return None
try:
return datetime.strptime(match.group(1), "%d/%m/%Y").date().isoformat()
except ValueError:
warnings.append(_warning(field, invalid_message))
return None
def _extract_source_job_id(soup: BeautifulSoup) -> str | None:
    """Return the Apec reference found in the ``.ref-offre`` element, or ``None`` if absent."""
    ref_node = soup.select_one(".ref-offre")
    if ref_node is None:
        return None
    found = _SOURCE_JOB_ID_PATTERN.search(ref_node.get_text(" ", strip=True))
    return None if found is None else found.group(1)
def _extract_contract_type(details_offer_list) -> str | None:
    """Read the contract type from the second ``li`` of the details list.

    A ``span`` inside the item is authoritative, even when its text is blank.
    Only when there is no ``span`` is the item text searched for a known
    contract label.
    """
    item = details_offer_list.select_one("li:nth-of-type(2)")
    if item is None:
        return None
    label = item.find("span")
    if label is None:
        found = _CONTRACT_PATTERN.search(item.get_text(" ", strip=True))
        return found.group(1) if found is not None else None
    return _clean_text(label.get_text(" ", strip=True))
def _extract_company(soup: BeautifulSoup, details_offer_list) -> str | None:
    """Return the first non-blank company name among the known locations.

    Lookup order: ``.card-ents .ents-name``, ``.card-ents-quote``, then the
    first ``li`` of *details_offer_list* when that list is given.
    """

    def _candidates():
        # Lazy on purpose: later selectors run only if earlier ones gave nothing.
        for selector in (".card-ents .ents-name", ".card-ents-quote"):
            yield soup.select_one(selector)
        if details_offer_list is not None:
            yield details_offer_list.select_one("li:first-of-type")

    for node in _candidates():
        if node is None:
            continue
        name = _clean_text(node.get_text(" ", strip=True))
        if name is not None:
            return name
    return None
def normalize_apec_listing(
    url: str,
    html: str,
    fetched_at: str,
    *,
    source_job_id: str | None = None,
    published_at: str | None = None,
    refreshed_at: str | None = None,
) -> ApecListing:
    """Parse an Apec offer detail page into an ApecListing.

    Missing fields never raise: every field that is absent, blank, a
    placeholder, or recovered through a fallback selector adds a
    ListingWarning to the result, and unusable values are stored as ``None``.

    Args:
        url: Detail page URL; stored as-is and used as a job-id fallback.
        html: Page markup to parse.
        fetched_at: Fetch timestamp, stored as-is.
        source_job_id: Job id already known to the caller, if any.
        published_at: When truthy, used instead of the date parsed from the page.
        refreshed_at: When truthy, used instead of the date parsed from the page.

    Returns:
        The normalized listing with ``source`` set to ``"apec"``.
    """
    soup = BeautifulSoup(html, "html.parser")
    warnings: list[ListingWarning] = []
    # Title: prefer the h1 inside the offer container, else any h1 on the page.
    title = soup.select_one(".container-details-offer h1")
    if title is None:
        title = soup.find("h1")
        if title is not None:
            warnings.append(_warning("title", "Recovered title from generic h1 fallback"))
        else:
            warnings.append(_warning("title", "Title missing from Apec listing"))
    title_text = _clean_text(title.get_text(" ", strip=True)) if title is not None else None
    if title is not None and not _has_useful_text(title_text):
        warnings.append(_warning("title", "Title is empty or placeholder text"))
    # Location and contract type are read positionally from the details list:
    # third item is the location, second item is the contract type.
    details_offer_list = soup.select_one(".details-offer-list")
    location = None
    contract_type = None
    if details_offer_list is not None:
        location_item = details_offer_list.select_one("li:nth-of-type(3)")
        if location_item is not None:
            location = _clean_text(location_item.get_text(" ", strip=True))
            if not _has_useful_text(location):
                warnings.append(_warning("location", "Location is empty or placeholder text"))
        else:
            warnings.append(_warning("location", "Location missing from details-offer list"))
        contract_item = details_offer_list.select_one("li:nth-of-type(2)")
        if contract_item is None:
            warnings.append(_warning("contract_type", "Contract type missing from details-offer list"))
        else:
            span = contract_item.find("span")
            if span is not None:
                contract_type = _clean_text(span.get_text(" ", strip=True))
                if not _has_useful_text(contract_type):
                    warnings.append(_warning("contract_type", "Contract type is empty or placeholder text"))
            else:
                # No span: look for a known contract label in the item's text.
                match = _CONTRACT_PATTERN.search(contract_item.get_text(" ", strip=True))
                if match is not None:
                    contract_type = match.group(1)
                    warnings.append(_warning("contract_type", "Recovered contract type from text fallback"))
                else:
                    warnings.append(_warning("contract_type", "Contract type missing from details-offer list"))
    else:
        warnings.append(_warning("location", "Location missing from Apec listing"))
        warnings.append(_warning("contract_type", "Contract type missing from Apec listing"))
    description_text = _detail_block_text(soup, "Descriptif du poste")
    if description_text is None:
        warnings.append(_warning("description_text", "Description missing from Apec listing"))
    elif not _has_useful_text(description_text):
        warnings.append(_warning("description_text", "Description is empty or placeholder text"))
        description_text = None
    # Job id resolution. Two page-derived candidates exist: the id in the URL
    # and the reference printed in the page (.ref-offre).
    requested_source_job_id = _extract_source_job_id_from_url(url)
    ref_source_job_id = _extract_source_job_id(soup)
    if source_job_id is not None:
        if (
            requested_source_job_id is not None
            and ref_source_job_id is not None
            and requested_source_job_id != ref_source_job_id
        ):
            # URL id and page reference disagree: the page reference wins over the caller's id.
            warnings.append(_warning("source_job_id", "Recovered source job id from ref-offre fallback"))
            normalized_source_job_id = ref_source_job_id
        else:
            normalized_source_job_id = source_job_id
    else:
        if ref_source_job_id is None:
            if requested_source_job_id is None:
                warnings.append(_warning("source_job_id", "Source job id missing from Apec listing"))
                normalized_source_job_id = None
            else:
                warnings.append(_warning("source_job_id", "Recovered source job id from detail URL fallback"))
                normalized_source_job_id = requested_source_job_id
        else:
            warnings.append(_warning("source_job_id", "Recovered source job id from ref-offre fallback"))
            normalized_source_job_id = ref_source_job_id
    # Company: the fallbacks are tried only when the primary element is absent,
    # not when it is present but blank.
    company = soup.select_one(".card-ents .ents-name")
    if company is None:
        for selector, warning_message in (
            (".card-ents-quote", "Recovered company from .card-ents-quote fallback"),
            (".details-offer-list li:first-of-type", "Recovered company from details-offer-list fallback"),
        ):
            company = soup.select_one(selector)
            if company is not None:
                warnings.append(_warning("company", warning_message))
                break
    company_text = _clean_text(company.get_text(" ", strip=True)) if company is not None else None
    if company_text is None:
        warnings.append(_warning("company", "Company missing from Apec listing"))
    elif not _has_useful_text(company_text):
        warnings.append(_warning("company", "Company is empty or placeholder text"))
        company_text = None
    # Caller-supplied dates short-circuit page parsing (and its warnings).
    published_at_value = published_at or _extract_listing_date(
        soup,
        _PUBLISHED_AT_PATTERN,
        field="published_at",
        missing_message="Published date missing from Apec listing",
        invalid_message="Published date is invalid",
        warnings=warnings,
        warn_on_missing=True,
    )
    # A missing refresh date is not reported; only an invalid one is.
    refreshed_at_value = refreshed_at or _extract_listing_date(
        soup,
        _REFRESHED_AT_PATTERN,
        field="refreshed_at",
        invalid_message="Refreshed date is invalid",
        warnings=warnings,
        warn_on_missing=False,
    )
    return ApecListing(
        source="apec",
        source_job_id=normalized_source_job_id,
        url=url,
        title=title_text if _has_useful_text(title_text) else None,
        company=company_text,
        location=location if _has_useful_text(location) else None,
        contract_type=contract_type if _has_useful_text(contract_type) else None,
        description_text=description_text,
        published_at=published_at_value,
        refreshed_at=refreshed_at_value,
        fetched_at=fetched_at,
        warnings=warnings,
    )

View File

@ -0,0 +1,63 @@
from job_research.apec.adapter import ApecSearchFilters
from job_research.models import CandidateProfileOutput
def _normalize_term(raw_term: str) -> str:
return " ".join(raw_term.split())
def _normalize_constraint(raw_term: str) -> str:
    """Return *raw_term* whitespace-normalized and case-folded, for case-insensitive matching."""
    return _normalize_term(raw_term).casefold()
def derive_apec_search_filters(profile: CandidateProfileOutput) -> ApecSearchFilters:
    """Translate the profile's constraints into Apec search filters.

    The constraint "france only" selects the France location and "cdi only"
    selects the CDI contract type; both are matched after whitespace and case
    normalization. A filter whose constraint is absent is left as ``None``.
    """
    constraints = set()
    for constraint in profile.constraints:
        constraints.add(_normalize_constraint(constraint))

    location = "France" if "france only" in constraints else None
    contract_type = "CDI" if "cdi only" in constraints else None
    return ApecSearchFilters(location=location, contract_type=contract_type)
def derive_apec_queries(profile: CandidateProfileOutput, *, max_queries: int = 5) -> list[str]:
    """Derive an ordered, de-duplicated list of Apec search queries from *profile*.

    Queries are produced in this order until *max_queries* is reached:

    1. each distinct target role;
    2. the first target role combined with each strength, then with each
       skill to emphasize - or, when the profile has no target role, the
       strengths and skills on their own.

    Every term is whitespace-normalized; blank terms and exact duplicates are
    skipped.

    Args:
        profile: Candidate profile providing ``target_roles``, ``strengths``
            and ``skills_to_emphasize``.
        max_queries: Maximum number of queries to return. Defaults to 5,
            the limit that was previously hard-coded. Values below 1 yield
            an empty list.

    Returns:
        At most *max_queries* query strings, in priority order.
    """
    queries: list[str] = []
    if max_queries <= 0:
        return queries
    seen: set[str] = set()

    def add_query(raw_query: str) -> bool:
        """Record the normalized query if it is new; return True once the cap is reached."""
        query = _normalize_term(raw_query)
        if query and query not in seen and len(queries) < max_queries:
            seen.add(query)
            queries.append(query)
        return len(queries) >= max_queries

    # dict.fromkeys keeps first-seen order while dropping duplicate roles.
    normalized_roles = (_normalize_term(role) for role in profile.target_roles)
    unique_roles = list(dict.fromkeys(role for role in normalized_roles if role))

    support_terms = [_normalize_term(term) for term in profile.strengths]
    support_terms.extend(_normalize_term(term) for term in profile.skills_to_emphasize)
    support_terms = [term for term in support_terms if term]

    for role in unique_roles:
        if add_query(role):
            return queries

    # Supporting terms refine the primary role when there is one; otherwise
    # they are searched on their own.
    primary_role = unique_roles[0] if unique_roles else None
    for term in support_terms:
        candidate = term if primary_role is None else f"{primary_role} {term}"
        if add_query(candidate):
            break
    return queries

View File

@ -1,15 +1,79 @@
from contextlib import nullcontext
from datetime import datetime, timezone
import re
from pathlib import Path
from typing import Any
from urllib.parse import unquote, urlparse
import typer
import yaml
from pydantic import ValidationError
from job_research.apec.adapter import ApecAdapter, ApecSearchFilters
from job_research.apec.dedupe import dedupe_apec_listings
from job_research.apec.normalize import normalize_apec_listing
from job_research.apec.query_derivation import derive_apec_queries, derive_apec_search_filters
from job_research.models import ApecRunMeta, ApecSnapshotMeta, CandidateProfileOutput, ListingError
from job_research.profile.cv_extractor import extract_cv_signals, extract_pdf_text
from job_research.profile.merge import build_candidate_profile_output
from job_research.profile.profile_parser import parse_profile_markdown
from job_research.storage import save_candidate_profile_yaml
from job_research.profile.profile_parser import AuthoredProfile, parse_profile_markdown
from job_research.storage import apec_run_paths, load_yaml, save_candidate_profile_yaml
app = typer.Typer(help="Build one canonical candidate profile YAML")
def _utc_now() -> datetime:
return datetime.now(timezone.utc)
def _snapshot_stem(url: str, source_job_id: str | None) -> str:
if source_job_id:
return source_job_id
parsed_url = urlparse(url)
fallback = parsed_url.path.rstrip("/").rsplit("/", 1)[-1] or parsed_url.netloc or "listing"
if parsed_url.query:
fallback = f"{fallback}-{parsed_url.query}"
stem = re.sub(r"[^A-Za-z0-9]+", "-", unquote(fallback)).strip("-")
return stem or "listing"
def _write_yaml(path: Path, payload: Any) -> None:
    """Serialize *payload* as YAML into *path*, creating parent directories first.

    Keys keep their insertion order and non-ASCII characters are written
    unescaped; the file is encoded as UTF-8.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    document = yaml.safe_dump(payload, sort_keys=False, allow_unicode=True)
    path.write_text(document, encoding="utf-8")
def _load_candidate_profile(profile_path: Path) -> CandidateProfileOutput:
    """Load and validate candidate-profile.yaml, reporting every failure as ValueError.

    Raises:
        ValueError: if the file is missing, unreadable, not valid YAML, or
            does not validate as a CandidateProfileOutput. The original
            exception is chained as the cause.
    """
    try:
        return CandidateProfileOutput.model_validate(load_yaml(profile_path))
    # FileNotFoundError is an OSError, so it must be handled before the generic OSError clause.
    except FileNotFoundError as exc:
        raise ValueError(f"candidate-profile.yaml not found at {profile_path}") from exc
    except (OSError, UnicodeDecodeError) as exc:
        raise ValueError(f"candidate-profile.yaml not readable at {profile_path}: {exc}") from exc
    except (yaml.YAMLError, ValidationError, ValueError) as exc:
        raise ValueError(f"invalid candidate-profile.yaml at {profile_path}: {exc}") from exc
def _load_cv_text(cv: Path) -> str:
try:
cv_text = extract_pdf_text(cv) if cv.suffix.lower() == ".pdf" else cv.read_text(encoding="utf-8")
except Exception as exc: # pragma: no cover - defensive boundary
raise ValueError(f"CV input not readable at {cv}: {exc}") from exc
if not cv_text.strip():
raise ValueError("No readable text found in CV input")
return cv_text
def _load_authored_profile(profile: Path) -> AuthoredProfile:
    """Read the markdown profile at *profile* (UTF-8) and parse it.

    Raises:
        ValueError: if reading or parsing fails for any reason; the original
            exception is chained as the cause.
    """
    try:
        return parse_profile_markdown(profile.read_text(encoding="utf-8"))
    except Exception as exc:  # pragma: no cover - defensive boundary
        raise ValueError(f"profile markdown invalid at {profile}: {exc}") from exc
# Intentionally empty application-level callback registered on the Typer app.
# NOTE(review): presumably present so Typer keeps explicit sub-command names - confirm.
@app.callback()
def main_command() -> None:
    pass
@ -41,15 +105,21 @@ def build_profile(
) -> None:
"""Build candidate-profile.yaml from CV and markdown profile."""
cv_text = extract_pdf_text(cv) if cv.suffix.lower() == ".pdf" else cv.read_text(encoding="utf-8")
if not cv_text.strip():
raise ValueError("No readable text found in CV input")
try:
cv_text = _load_cv_text(cv)
authored_profile = _load_authored_profile(profile)
except ValueError as exc:
typer.echo(str(exc), err=True)
raise typer.Exit(code=1)
authored_profile = parse_profile_markdown(profile.read_text(encoding="utf-8"))
cv_signals = extract_cv_signals(cv_text)
candidate_profile = build_candidate_profile_output(cv_signals, authored_profile)
save_candidate_profile_yaml(out, candidate_profile)
try:
save_candidate_profile_yaml(out, candidate_profile)
except OSError as exc:
typer.echo(f"Unable to write candidate profile to {out}: {exc}", err=True)
raise typer.Exit(code=1)
typer.echo(f"candidate profile written to {out}")
warning_count = len(candidate_profile.warnings)
@ -59,6 +129,147 @@ def build_profile(
typer.echo("No warnings included.")
@app.command("fetch-apec")
def fetch_apec(
    data_root: Path = typer.Option(
        Path("data"),
        "--data-root",
        file_okay=False,
        dir_okay=True,
        help="Directory containing candidate-profile.yaml and Apec run artifacts.",
    ),
) -> None:
    """Fetch, normalize, dedupe, and persist Apec listings."""
    # Pipeline: load profile -> derive queries/filters -> search -> fetch each
    # listing -> snapshot + normalize -> dedupe -> write listings and run metadata.
    # Every fatal problem is reported on stderr and ends with exit code 1.
    profile_path = data_root / "candidate-profile.yaml"
    try:
        profile = _load_candidate_profile(profile_path)
    except ValueError as exc:
        typer.echo(str(exc), err=True)
        raise typer.Exit(code=1)
    derived_queries = derive_apec_queries(profile)
    if not derived_queries:
        typer.echo("No usable Apec queries derived from candidate profile", err=True)
        raise typer.Exit(code=1)
    derived_search_filters = derive_apec_search_filters(profile)
    # Filters the profile does not request fall back to France / CDI.
    search_filters = ApecSearchFilters(
        location=derived_search_filters.location or "France",
        contract_type=derived_search_filters.contract_type or "CDI",
    )
    current = _utc_now().astimezone(timezone.utc)
    # run_id keeps microseconds and avoids ":" ; run_started_at is second-precision.
    run_id = current.strftime("%Y-%m-%dT%H-%M-%S-%fZ")
    run_started_at = current.replace(microsecond=0).strftime("%Y-%m-%dT%H:%M:%SZ")
    adapter = ApecAdapter(max_listings=50)
    # NOTE(review): search() runs before browser_session() is entered below, so it
    # does not share that session's browser - confirm this is intended.
    try:
        search_results = adapter.search(derived_queries, search_filters=search_filters)[:50]
    except Exception as exc:  # pragma: no cover - defensive boundary
        typer.echo(f"Unable to fetch Apec search results: {exc}", err=True)
        raise typer.Exit(code=1)
    paths = apec_run_paths(data_root, run_id)
    try:
        paths["snapshots"].mkdir(parents=True, exist_ok=True)
    except OSError as exc:  # pragma: no cover - defensive boundary
        typer.echo(f"Unable to create Apec snapshot directory: {exc}", err=True)
        raise typer.Exit(code=1)
    normalized_listings = []
    # Start from the non-fatal errors recorded during search, when the adapter exposes them.
    listing_errors: list[ListingError] = list(getattr(adapter, "search_errors", []))
    snapshot_metadata: list[ApecSnapshotMeta] = []
    fetched_count = 0
    successful_fetch_count = 0
    # Adapters without a callable browser_session attribute run without a shared session.
    browser_session = getattr(adapter, "browser_session", None)
    session_context = browser_session() if callable(browser_session) else nullcontext()
    with session_context:
        for result in search_results:
            # Counts attempts, including those that fail below.
            fetched_count += 1
            fetched_at = _utc_now().astimezone(timezone.utc).replace(microsecond=0).strftime("%Y-%m-%dT%H:%M:%SZ")
            try:
                html = adapter.fetch_listing_html(result.url)
            except Exception as exc:  # pragma: no cover - defensive boundary
                listing_errors.append(ListingError(url=result.url, stage="fetch_html", message=str(exc)))
                continue
            successful_fetch_count += 1
            snapshot_path = paths["snapshots"] / f"{_snapshot_stem(result.url, result.source_job_id)}.html"
            snapshot_meta = ApecSnapshotMeta(
                url=result.url,
                source_job_id=result.source_job_id,
                snapshot_file=None,
                fetched_at=fetched_at,
            )
            # A failed snapshot write is recorded but does not stop normalization;
            # snapshot_file stays None in that case.
            try:
                snapshot_path.write_text(html, encoding="utf-8")
            except Exception as exc:  # pragma: no cover - defensive boundary
                listing_errors.append(ListingError(url=result.url, stage="snapshot_write", message=str(exc)))
            else:
                snapshot_meta.snapshot_file = snapshot_path.name
            snapshot_metadata.append(snapshot_meta)
            try:
                listing = normalize_apec_listing(
                    url=result.url,
                    html=html,
                    fetched_at=fetched_at,
                    source_job_id=result.source_job_id,
                )
            except Exception as exc:  # pragma: no cover - defensive boundary
                listing_errors.append(ListingError(url=result.url, stage="normalize", message=str(exc)))
                continue
            normalized_listings.append(listing)
    # Results were found but none could be fetched: abort before writing any artifact.
    if search_results and successful_fetch_count == 0:
        typer.echo("No listings could be fetched or normalized from Apec", err=True)
        raise typer.Exit(code=1)
    deduplicated_listings = dedupe_apec_listings(normalized_listings)
    # Number of distinct URLs with at least one recorded error (search errors included).
    failed_count = len({error.url for error in listing_errors})
    run_meta = ApecRunMeta(
        run_id=run_id,
        run_started_at=run_started_at,
        derived_queries=derived_queries,
        snapshots=snapshot_metadata,
        fetched_count=fetched_count,
        normalized_count=len(normalized_listings),
        deduplicated_count=len(deduplicated_listings),
        failed_count=failed_count,
        listing_errors=listing_errors,
    )
    # Attempt both writes even if the first fails, then report all failures together.
    artifact_write_errors: list[str] = []
    try:
        _write_yaml(paths["listings"], [listing.model_dump(mode="json") for listing in deduplicated_listings])
    except OSError as exc:  # pragma: no cover - defensive boundary
        artifact_write_errors.append(f"listings.yaml: {exc}")
    try:
        _write_yaml(paths["run_meta"], run_meta.model_dump(mode="json"))
    except OSError as exc:  # pragma: no cover - defensive boundary
        artifact_write_errors.append(f"run-meta.yaml: {exc}")
    if artifact_write_errors:
        typer.echo(f"Unable to write Apec run artifacts: {'; '.join(artifact_write_errors)}", err=True)
        raise typer.Exit(code=1)
    typer.echo(
        f"query={len(derived_queries)} fetched={fetched_count} normalized={len(normalized_listings)} "
        f"deduplicated={len(deduplicated_listings)} failed={failed_count}"
    )
def main() -> None:
    """Run the Typer application."""
    app()

View File

@ -21,6 +21,51 @@ class WarningItem(BaseModel):
message: str
# Non-fatal note attached to a normalized listing.
class ListingWarning(BaseModel):
    # Name of the listing field the warning refers to (e.g. "title", "company").
    field: str
    # Human-readable explanation.
    message: str
# Failure recorded for one URL during an Apec run.
class ListingError(BaseModel):
    # URL that was being processed when the failure occurred.
    url: str
    # Pipeline stage that failed; callers in this change set use "search",
    # "fetch_html", "snapshot_write" and "normalize".
    stage: str
    # Human-readable failure description.
    message: str
# Bookkeeping entry for one fetched listing page.
class ApecSnapshotMeta(BaseModel):
    url: str
    source_job_id: str | None = None
    # File name of the saved HTML snapshot; None when the snapshot could not be written.
    snapshot_file: str | None = None
    # Timestamp string supplied by the caller for the fetch.
    fetched_at: str
# Normalized job listing. Optional fields are None when the value was missing
# or judged unusable during normalization.
class ApecListing(BaseModel):
    # Origin of the listing; normalize_apec_listing sets it to "apec".
    source: str
    source_job_id: str | None = None
    url: str
    title: str | None = None
    company: str | None = None
    location: str | None = None
    contract_type: str | None = None
    description_text: str | None = None
    # Dates are kept as strings; values parsed from the page are ISO yyyy-mm-dd.
    published_at: str | None = None
    refreshed_at: str | None = None
    fetched_at: str
    # Notes about missing, placeholder or fallback-recovered fields.
    warnings: list[ListingWarning] = Field(default_factory=list)
# Summary of one fetch-apec run, written next to the listings.
class ApecRunMeta(BaseModel):
    run_id: str
    run_started_at: str
    # Search queries that were derived from the candidate profile for this run.
    derived_queries: list[str] = Field(default_factory=list)
    snapshots: list[ApecSnapshotMeta] = Field(default_factory=list)
    # Number of listing fetch attempts (successful or not).
    fetched_count: int = 0
    # Number of listings that were normalized successfully, before de-duplication.
    normalized_count: int = 0
    # Number of listings remaining after de-duplication.
    deduplicated_count: int = 0
    # Number of distinct URLs that have at least one entry in listing_errors.
    failed_count: int = 0
    listing_errors: list[ListingError] = Field(default_factory=list)
class CandidateProfileOutput(BaseModel):
name: str | None = None
summary: str | None = None

View File

@ -5,6 +5,8 @@ from pathlib import Path
from pypdf import PdfReader
from job_research.models import WarningItem
# Space-padded English ("at") and French ("chez", "au", "à") connector words.
# NOTE(review): presumably these separate a role from an employer inside an
# experience line; confirm against the code that consumes this tuple.
EXPERIENCE_LINE_CONNECTORS = (" at ", " chez ", " au ", " à ")
@ -150,6 +152,15 @@ YEARS_OF_EXPERIENCE_PATTERNS = (
re.compile(r"^ann[ée]es d[']exp[ée]rience\s*:\s*(\d+)\s*$", re.IGNORECASE),
)
# Patterns that make a candidate "name" line suspicious. A single hit is
# enough for `_looks_like_low_confidence_name` to flag the line.
LOW_CONFIDENCE_NAME_PATTERNS = (
    # Document-header vocabulary.
    re.compile(r"\b(cv|resume|curriculum vitae|profile)\b", re.IGNORECASE),
    # Characters typical of separators, paths or e-mail addresses.
    re.compile(r"[|/@]"),
    # Job-title vocabulary.
    re.compile(
        r"\b(data engineer|software engineer|developer|analyst|scientist|consultant|architect|manager|product owner|backend|frontend|full stack)\b",
        re.IGNORECASE,
    ),
)
def extract_pdf_text(path: Path) -> str:
reader = PdfReader(str(path))
@ -168,8 +179,17 @@ def extract_pdf_text(path: Path) -> str:
def extract_cv_signals(text: str) -> dict[str, object]:
lines = [_normalize_line(line) for line in text.splitlines()]
non_empty_lines = [line for line in lines if line]
warnings: list[WarningItem] = []
name = non_empty_lines[0] if non_empty_lines else None
if name is not None and _looks_like_low_confidence_name(name):
warnings.append(
WarningItem(
field="name",
message="First CV line looks like a header or tagline; review manually.",
)
)
location = None
languages: list[str] = []
skills: list[str] = []
@ -247,6 +267,7 @@ def extract_cv_signals(text: str) -> dict[str, object]:
"skills": skills,
"experience_entries": experience_entries,
"education_entries": education_entries,
"warnings": warnings,
}
if years_of_experience is not None:
@ -369,3 +390,14 @@ def _looks_like_experience_title(title: str) -> bool:
def _looks_like_prose_company(company: str) -> bool:
    """Return True when any EXPERIENCE_PROSE_COMPANY_PATTERNS entry is found in ``company``."""
    for prose_pattern in EXPERIENCE_PROSE_COMPANY_PATTERNS:
        if prose_pattern.search(company):
            return True
    return False
def _looks_like_low_confidence_name(name: str) -> bool:
normalized = " ".join(name.split())
if not normalized:
return True
if len(normalized.split()) > 4:
return True
return any(pattern.search(normalized) for pattern in LOW_CONFIDENCE_NAME_PATTERNS)

View File

@ -33,6 +33,7 @@ def build_candidate_profile_output(
warnings: list[WarningItem] = []
_append_years_of_experience_warning(cv_signals, authored.notes, warnings)
_append_cv_extraction_warnings(cv_signals, warnings)
_append_missing_cv_fact_warnings(cv_signals, warnings)
merged_skills: list[str] = []
@ -99,6 +100,15 @@ def _append_missing_cv_fact_warnings(
warnings.append(WarningItem(field=field, message=message))
def _append_cv_extraction_warnings(
    cv_signals: dict[str, object], warnings: list[WarningItem]
) -> None:
    """Copy the warnings stored under ``cv_signals["warnings"]`` into ``warnings``.

    A missing or falsy entry contributes nothing. Entries that are not already
    ``WarningItem`` instances are converted with ``WarningItem.model_validate``.
    The target list is extended in place, one entry at a time.
    """
    for entry in cv_signals.get("warnings") or []:
        if not isinstance(entry, WarningItem):
            entry = WarningItem.model_validate(entry)
        warnings.append(entry)
def _note_years_of_experience(note: str) -> int | None:
normalized = note.casefold().replace("", "'")
if not any(marker in normalized for marker in EXPERIENCE_NOTE_MARKERS):

View File

@ -17,14 +17,15 @@ class AuthoredProfile:
notes: list[str] = field(default_factory=list)
REQUIRED_SECTIONS = {
REQUIRED_SECTION_NAMES = (
"summary",
"target roles",
"strengths",
"skills to emphasize",
"constraints",
"notes",
}
)
REQUIRED_SECTIONS = set(REQUIRED_SECTION_NAMES)
def parse_profile_markdown(markdown: str) -> AuthoredProfile:
@ -45,6 +46,10 @@ def parse_profile_markdown(markdown: str) -> AuthoredProfile:
missing_text = ", ".join(sorted(missing))
raise ValueError(f"Missing required markdown sections: {missing_text}")
for section_name in REQUIRED_SECTION_NAMES:
if not _has_usable_section_content(sections[section_name]):
raise ValueError(f"Missing usable content in section '{section_name}'")
return AuthoredProfile(
summary=" ".join(sections["summary"]),
target_roles=_parse_list_section("target roles", sections["target roles"]),
@ -64,6 +69,8 @@ def _parse_list_section(section_name: str, lines: list[str]) -> list[str]:
item = _strip_list_marker(line)
if item is None:
raise ValueError(f"Unsupported content in section '{section_name}': {line}")
if not item:
raise ValueError(f"Missing usable content in section '{section_name}'")
items.append(item)
return items
@ -74,6 +81,8 @@ def _parse_notes_section(lines: list[str]) -> list[str]:
for line in lines:
item = _strip_list_marker(line)
if item == "":
raise ValueError("Missing usable content in section 'notes'")
notes.append(item if item is not None else line)
return notes
@ -81,7 +90,13 @@ def _parse_notes_section(lines: list[str]) -> list[str]:
def _strip_list_marker(line: str) -> str | None:
    """Remove a leading list marker from ``line``.

    Markers are tried in ``LIST_MARKERS`` order. Returns ``""`` when the line
    is nothing but a marker (compared against the marker with its surrounding
    whitespace stripped), the stripped remainder when the line starts with a
    marker, and ``None`` when no marker applies.
    """
    for marker in LIST_MARKERS:
        bare_marker = marker.strip()
        if line == bare_marker:
            # A marker with no text after it.
            return ""
        if line.startswith(marker):
            return line.removeprefix(marker).strip()
    return None
def _has_usable_section_content(lines: list[str]) -> bool:
return any(line not in {"-", "*", "+"} for line in lines)

View File

@ -21,3 +21,13 @@ def load_yaml(path: Path) -> dict[str, Any]:
raise ValueError("candidate-profile YAML root must be a mapping")
return dict(payload)
def apec_run_paths(data_root: Path, run_id: str) -> dict[str, Path]:
    """Build the artifact locations for one Apec run.

    Everything lives under ``<data_root>/apec/runs/<run_id>``. The returned
    mapping has the keys ``run_dir``, ``listings``, ``run_meta`` and
    ``snapshots``, in that order. Nothing is created on disk.
    """
    base = data_root.joinpath("apec", "runs", run_id)
    artifacts: dict[str, Path] = {"run_dir": base}
    for key, leaf in (
        ("listings", "listings.yaml"),
        ("run_meta", "run-meta.yaml"),
        ("snapshots", "snapshots"),
    ):
        artifacts[key] = base / leaf
    return artifacts

701
tests/apec/test_adapter.py Normal file
View File

@ -0,0 +1,701 @@
from contextlib import contextmanager
from urllib.parse import parse_qs, quote_plus, urlparse
import pytest
from playwright.sync_api import Error as PlaywrightError
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
from job_research.apec import adapter as adapter_module
from job_research.apec.adapter import ApecAdapter, ApecSearchFilters
_RESULT_LINK_SELECTOR = "a[href*='/candidat/recherche-emploi.html/emploi/detail-offre/']"
_SEARCH_INPUT_SELECTOR = 'input[name="keywords"]'
class _FakeResultButton:
def __init__(self, page, name: str) -> None:
self.page = page
self.name = name
def click(self, timeout: int | None = None) -> None:
if self.name == "Rechercher":
self.page.url = (
"https://www.apec.fr/candidat/recherche-emploi.html/emploi"
f"?motsCles={quote_plus(self.page.current_query)}&page=0"
)
self.page.current_page = 0
elif self.name in {"ACCEPTER", "Accepter tous les cookies"}:
self.page.consent_button_clicks.append(self.name)
self.page.consent_accepted = True
class _FakeLocator:
def __init__(self, page, selector: str) -> None:
self.page = page
self.selector = selector
def fill(self, value: str) -> None:
self.page.current_query = value
def check(self, timeout: int | None = None) -> None:
if self.selector == 'input[name="cguAcceptees"]':
self.page.cgu_checkbox_checked = True
return None
raise PlaywrightTimeoutError(f"selector not found: {self.selector}")
def evaluate_all(self, function: str):
if self.selector == _RESULT_LINK_SELECTOR:
return list(self.page.current_results())
return []
class _FakeDetailPage:
def __init__(
self,
result_pages: dict[str, dict[int, list[str]]],
*,
rendered_html: str = "<html>rendered</html>",
search_ready: bool = True,
zero_result_queries: set[str] | None = None,
consent_required: bool = False,
) -> None:
self.result_pages = result_pages
self.rendered_html = rendered_html
self.shell_html = "<html>shell</html>"
self.waited_functions: list[tuple[str, int | None]] = []
self.search_ready = search_ready
self.zero_result_queries = zero_result_queries or set()
self.consent_required = consent_required
self.cgu_checkbox_checked = False
self.consent_button_clicks: list[str] = []
self.consent_accepted = not consent_required
self.goto_urls: list[str] = []
self.current_query = ""
self.current_page = 0
self.url = ""
self.rendered = False
self.default_timeout: int | None = None
self.closed = False
def goto(self, url: str, wait_until: str | None = None) -> None:
self.goto_urls.append(url)
self.url = url
parsed_url = urlparse(url)
params = parse_qs(parsed_url.query)
if "motsCles" in params:
self.current_query = params["motsCles"][0]
if "page" in params:
self.current_page = int(params["page"][0])
if self.current_query in self.zero_result_queries and "/detail-offre/" not in parsed_url.path:
self.url = (
f"{parsed_url.scheme}://{parsed_url.netloc}"
f"{parsed_url.path}/recherche-avancee?{parsed_url.query}&error=true"
)
if "/detail-offre/" in parsed_url.path:
self.rendered = False
def wait_for_load_state(self, state: str) -> None:
return None
def set_default_timeout(self, timeout: int) -> None:
self.default_timeout = timeout
def wait_for_function(self, function: str, polling: int | None = None, timeout: int | None = None) -> None:
if self.consent_required and not self.consent_accepted:
raise PlaywrightTimeoutError("consent not accepted")
self.waited_functions.append((function, polling))
self.rendered = True
return None
def wait_for_selector(self, selector: str, timeout: int | None = None) -> None:
if selector == _SEARCH_INPUT_SELECTOR:
if self.search_ready and self.consent_accepted:
return None
raise PlaywrightTimeoutError(f"selector not found: {selector}")
if selector == _RESULT_LINK_SELECTOR and self.current_results():
return None
raise PlaywrightTimeoutError(f"selector not found: {selector}")
def get_by_role(self, role: str, name: str):
return _FakeResultButton(self, name)
def locator(self, selector: str):
return _FakeLocator(self, selector)
def content(self) -> str:
return self.rendered_html if self.rendered else self.shell_html
def current_results(self) -> list[str]:
return self.result_pages.get(self.current_query, {}).get(self.current_page, [])
def close(self) -> None:
self.closed = True
@contextmanager
def _fake_open_public_page(page: _FakeDetailPage):
    """Context manager that yields the given fake page and does no set-up or clean-up."""
    yield page
def test_search_continues_past_duplicate_only_pages(monkeypatch) -> None:
    """Page 0 of "beta" repeats the "alpha" hit; the new link on page 1 is still collected.

    Also checks that the France/CDI filters are encoded into the search URLs
    as ``lieux=799`` and ``typesContrat=101888``.
    """
    offer_111 = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"
    offer_222 = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222?motsCles=beta&page=1&selectedIndex=0"
    fake_page = _FakeDetailPage(
        {
            "alpha": {0: [offer_111], 1: []},
            "beta": {0: [offer_111], 1: [offer_222], 2: []},
        }
    )
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))
    filters = ApecSearchFilters(location="France", contract_type="CDI")

    found = ApecAdapter(max_listings=10).search(["alpha", "beta"], search_filters=filters)

    assert [hit.url for hit in found] == [offer_111, offer_222]
    assert [hit.source_job_id for hit in found] == ["111", "222"]

    first_navigation = fake_page.goto_urls[0]
    assert "motsCles=alpha" in first_navigation
    assert "lieux=799" in first_navigation
    assert "typesContrat=101888" in first_navigation

    beta_page_one_fragments = ("motsCles=beta", "lieux=799", "typesContrat=101888", "page=1")
    assert any(
        all(fragment in url for fragment in beta_page_one_fragments)
        for url in fake_page.goto_urls
    )
    assert any("page=1" in url for url in fake_page.goto_urls)
def test_search_continues_past_duplicate_only_pages_until_a_later_hit(monkeypatch) -> None:
    """Pages 1 and 2 of "alpha" only repeat the first hit; the new link on page 3 is still reached."""
    offer_111 = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"
    offer_222 = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222?motsCles=beta&page=3&selectedIndex=0"
    fake_page = _FakeDetailPage(
        {
            "alpha": {0: [offer_111], 1: [offer_111], 2: [offer_111], 3: [offer_222], 4: []},
            "beta": {0: [offer_111], 1: []},
        }
    )
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))
    filters = ApecSearchFilters(location="France", contract_type="CDI")

    found = ApecAdapter(max_listings=10).search(["alpha", "beta"], search_filters=filters)

    assert [hit.url for hit in found] == [offer_111, offer_222]
    for page_marker in ("page=1", "page=3"):
        assert any(page_marker in url for url in fake_page.goto_urls)
def test_search_continues_after_query_and_pagination_navigation_failures(monkeypatch) -> None:
    """Search survives a failing first navigation and a failing load on page 1.

    The very first ``goto`` raises, and ``wait_for_load_state`` raises whenever
    the fake is on page 1. Only the page-0 "beta" result is expected back.
    """
    beta_page0_offer = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=beta&page=0&selectedIndex=0"
    beta_page1_offer = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222?motsCles=beta&page=1&selectedIndex=0"
    fake_page = _FakeDetailPage(
        {
            "beta": {0: [beta_page0_offer], 1: [beta_page1_offer]},
        }
    )
    # Capture the real bound methods before they are patched.
    real_goto = fake_page.goto
    real_wait_for_load_state = fake_page.wait_for_load_state
    navigation_count = 0

    def flaky_goto(url: str, wait_until: str | None = None) -> None:
        nonlocal navigation_count
        navigation_count += 1
        if navigation_count == 1:
            raise RuntimeError("navigation boom")
        real_goto(url, wait_until=wait_until)

    def flaky_wait_for_load_state(state: str) -> None:
        if fake_page.current_page == 1:
            raise RuntimeError("load boom")
        real_wait_for_load_state(state)

    monkeypatch.setattr(fake_page, "goto", flaky_goto)
    monkeypatch.setattr(fake_page, "wait_for_load_state", flaky_wait_for_load_state)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))
    filters = ApecSearchFilters(location="France", contract_type="CDI")

    found = ApecAdapter(max_listings=10).search(["alpha", "beta"], search_filters=filters)

    assert [hit.url for hit in found] == [beta_page0_offer]
    assert [hit.source_job_id for hit in found] == ["111"]
def test_search_stops_after_max_page_count(monkeypatch) -> None:
    """With ``_MAX_PAGES_PER_QUERY`` patched to 3, pages 0-2 are read and page 3 is never requested."""
    fake_page = _FakeDetailPage(
        {
            "alpha": {
                0: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"],
                1: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222?motsCles=alpha&page=1&selectedIndex=0"],
                2: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/333?motsCles=alpha&page=2&selectedIndex=0"],
                3: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/444?motsCles=alpha&page=3&selectedIndex=0"],
            }
        }
    )
    real_goto = fake_page.goto

    def bounded_goto(url: str, wait_until: str | None = None) -> None:
        # Fail loudly if the adapter ever navigates to page 3 or beyond.
        real_goto(url, wait_until=wait_until)
        if fake_page.current_page >= 3:
            raise AssertionError("pagination should stop before page 3")

    monkeypatch.setattr(fake_page, "goto", bounded_goto)
    monkeypatch.setattr(adapter_module, "_MAX_PAGES_PER_QUERY", 3)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))
    filters = ApecSearchFilters(location="France", contract_type="CDI")

    found = ApecAdapter(max_listings=10).search(["alpha"], search_filters=filters)

    assert [hit.source_job_id for hit in found] == ["111", "222", "333"]
    assert not any("page=3" in url for url in fake_page.goto_urls)
def test_search_stops_after_consecutive_no_progress_pages(monkeypatch) -> None:
    """With the no-progress limit patched to 2, two duplicate-only pages end the query before page 3."""
    fake_page = _FakeDetailPage(
        {
            "alpha": {
                0: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"],
                1: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=1&selectedIndex=0"],
                2: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=2&selectedIndex=0"],
                3: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222?motsCles=alpha&page=3&selectedIndex=0"],
            }
        }
    )
    real_goto = fake_page.goto

    def bounded_goto(url: str, wait_until: str | None = None) -> None:
        # Fail loudly if the adapter ever navigates to page 3 or beyond.
        real_goto(url, wait_until=wait_until)
        if fake_page.current_page >= 3:
            raise AssertionError("pagination should stop before page 3")

    monkeypatch.setattr(fake_page, "goto", bounded_goto)
    monkeypatch.setattr(adapter_module, "_MAX_CONSECUTIVE_NO_PROGRESS_PAGES", 2)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))
    filters = ApecSearchFilters(location="France", contract_type="CDI")

    found = ApecAdapter(max_listings=10).search(["alpha"], search_filters=filters)

    assert [hit.source_job_id for hit in found] == ["111"]
    assert not any("page=3" in url for url in fake_page.goto_urls)
def test_search_stops_when_result_page_url_repeats(monkeypatch) -> None:
    """Pagination stops once a later page resolves to the page-0 URL again.

    The patched ``goto`` remembers the URL reached for page 0 and rewrites
    ``page.url`` back to it on every later page. Page 1 is requested once and
    page 2 never is.
    """
    only_offer = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"
    fake_page = _FakeDetailPage(
        {
            "alpha": {0: [only_offer], 1: [only_offer], 2: [only_offer]},
        }
    )
    real_goto = fake_page.goto
    remembered: dict[str, str] = {}

    def looping_goto(url: str, wait_until: str | None = None) -> None:
        real_goto(url, wait_until=wait_until)
        first_page_url = remembered.get("url")
        if first_page_url is None:
            if fake_page.current_page == 0:
                remembered["url"] = fake_page.url
        elif fake_page.current_page > 0:
            fake_page.url = first_page_url

    monkeypatch.setattr(fake_page, "goto", looping_goto)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))
    filters = ApecSearchFilters(location="France", contract_type="CDI")

    found = ApecAdapter(max_listings=10).search(["alpha"], search_filters=filters)

    assert [hit.url for hit in found] == [only_offer]
    assert any("page=1" in url for url in fake_page.goto_urls)
    assert not any("page=2" in url for url in fake_page.goto_urls)
def test_search_raises_when_every_query_fails_to_load_a_search_page(monkeypatch) -> None:
    """When the search input never appears for any query, ``search`` raises ``ApecSearchError``."""
    fake_page = _FakeDetailPage({"alpha": {0: []}}, search_ready=False)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))
    filters = ApecSearchFilters(location="France", contract_type="CDI")
    adapter = ApecAdapter(max_listings=10)

    with pytest.raises(adapter_module.ApecSearchError):
        adapter.search(["alpha", "beta"], search_filters=filters)
def test_search_treats_zero_results_redirect_as_usable_and_records_other_failures(monkeypatch) -> None:
    """A zero-results redirect is not an error, but a query with no rendered results is.

    "alpha" is redirected by the fake to the advanced-search URL. "beta" just
    has no links. The search returns nothing and records one "search" error,
    whose URL mentions "beta".
    """
    fake_page = _FakeDetailPage(
        {"alpha": {0: []}, "beta": {0: []}},
        zero_result_queries={"alpha"},
    )
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))
    filters = ApecSearchFilters(location="France", contract_type="CDI")
    adapter = ApecAdapter(max_listings=10)

    found = adapter.search(["alpha", "beta"], search_filters=filters)

    assert found == []
    recorded_stages = [error.stage for error in adapter.search_errors]
    assert recorded_stages == ["search"]
    assert "beta" in adapter.search_errors[0].url
def test_search_raises_when_every_query_renders_broken_search_shell(monkeypatch) -> None:
    """If every query loads the search page but shows no results, ``search`` raises and records one error per query."""
    fake_page = _FakeDetailPage({"alpha": {0: []}, "beta": {0: []}}, search_ready=True)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))
    filters = ApecSearchFilters(location="France", contract_type="CDI")
    adapter = ApecAdapter(max_listings=10)

    with pytest.raises(adapter_module.ApecSearchError):
        adapter.search(["alpha", "beta"], search_filters=filters)

    # Checked after the `with` block so the assertion actually runs.
    recorded_stages = [error.stage for error in adapter.search_errors]
    assert recorded_stages == ["search", "search"]
def test_search_accepts_current_cgu_popin_before_waiting_for_results(monkeypatch) -> None:
    """With consent required, the search ticks the CGU checkbox and clicks "ACCEPTER" exactly once."""
    only_offer = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"
    fake_page = _FakeDetailPage({"alpha": {0: [only_offer]}}, consent_required=True)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))
    filters = ApecSearchFilters(location="France", contract_type="CDI")

    found = ApecAdapter(max_listings=10).search(["alpha"], search_filters=filters)

    assert [hit.url for hit in found] == [only_offer]
    assert fake_page.cgu_checkbox_checked is True
    assert fake_page.consent_button_clicks == ["ACCEPTER"]
    assert fake_page.consent_accepted is True
def test_search_ignores_unexpected_consent_widget_playwright_errors(monkeypatch) -> None:
    """A ``PlaywrightError`` from clicking any role-located button does not abort the search."""
    only_offer = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"
    fake_page = _FakeDetailPage({"alpha": {0: [only_offer]}})

    class _ExplodingConsentButton:
        """Button whose click always fails with a generic Playwright error."""

        def click(self, timeout: int | None = None) -> None:
            raise PlaywrightError("consent widget boom")

    def exploding_get_by_role(role: str, name: str):
        return _ExplodingConsentButton()

    monkeypatch.setattr(fake_page, "get_by_role", exploding_get_by_role)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))
    filters = ApecSearchFilters(location="France", contract_type="CDI")

    found = ApecAdapter(max_listings=10).search(["alpha"], search_filters=filters)

    assert [hit.url for hit in found] == [only_offer]
    assert fake_page.cgu_checkbox_checked is True
def test_search_records_pagination_navigation_failures(monkeypatch) -> None:
    """A failing navigation to page 1 keeps the page-0 result and records a "search" error for page 1."""
    page0_offer = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"
    fake_page = _FakeDetailPage(
        {
            "alpha": {
                0: [page0_offer],
                1: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222?motsCles=alpha&page=1&selectedIndex=0"],
            },
        }
    )
    real_goto = fake_page.goto

    def flaky_goto(url: str, wait_until: str | None = None) -> None:
        # Only navigations to page 1 fail.
        if "page=1" in url:
            raise RuntimeError("navigation boom")
        real_goto(url, wait_until=wait_until)

    monkeypatch.setattr(fake_page, "goto", flaky_goto)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))
    filters = ApecSearchFilters(location="France", contract_type="CDI")
    adapter = ApecAdapter(max_listings=10)

    found = adapter.search(["alpha"], search_filters=filters)

    assert [hit.url for hit in found] == [page0_offer]
    assert [error.stage for error in adapter.search_errors] == ["search"]
    recorded = adapter.search_errors[0]
    assert "page=1" in recorded.url
    assert recorded.message == "page 1 navigation failed"
def test_search_records_pagination_render_failures(monkeypatch) -> None:
    """If result links never appear on page 1, the page-0 result is kept and a render error is recorded."""
    page0_offer = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"
    page1_offer = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222?motsCles=alpha&page=1&selectedIndex=0"
    fake_page = _FakeDetailPage(
        {
            "alpha": {
                0: [page0_offer],
                1: [page1_offer],
            },
        }
    )
    real_wait_for_selector = fake_page.wait_for_selector

    def flaky_wait_for_selector(selector: str, timeout: int | None = None) -> None:
        # Result links time out on page 1 only.
        waiting_for_results_on_page_one = (
            selector == _RESULT_LINK_SELECTOR and fake_page.current_page == 1
        )
        if waiting_for_results_on_page_one:
            raise PlaywrightTimeoutError(f"selector not found: {selector}")
        real_wait_for_selector(selector, timeout=timeout)

    monkeypatch.setattr(fake_page, "wait_for_selector", flaky_wait_for_selector)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))
    filters = ApecSearchFilters(location="France", contract_type="CDI")
    adapter = ApecAdapter(max_listings=10)

    found = adapter.search(["alpha"], search_filters=filters)

    assert [hit.url for hit in found] == [page0_offer]
    assert [error.stage for error in adapter.search_errors] == ["search"]
    recorded = adapter.search_errors[0]
    assert "page=1" in recorded.url
    assert recorded.message == "page 1 results did not render"
def test_search_records_evaluate_all_failures_and_continues_to_next_query(monkeypatch) -> None:
    """A failing link extraction on "alpha" page 1 is recorded, and "beta" is still searched."""
    alpha_offer = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"
    beta_offer = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222?motsCles=beta&page=0&selectedIndex=0"
    fake_page = _FakeDetailPage(
        {
            "alpha": {
                0: [alpha_offer],
                1: [alpha_offer],
            },
            "beta": {
                0: [beta_offer],
                1: [beta_offer],
            },
        }
    )
    real_locator = fake_page.locator

    class _FlakyLocator:
        """Wrapper whose ``evaluate_all`` fails for result links on "alpha" page 1."""

        def __init__(self, locator) -> None:
            self._locator = locator

        def evaluate_all(self, function: str):
            should_fail = (
                self._locator.selector == _RESULT_LINK_SELECTOR
                and fake_page.current_query == "alpha"
                and fake_page.current_page == 1
            )
            if should_fail:
                raise RuntimeError("evaluate boom")
            return self._locator.evaluate_all(function)

        def __getattr__(self, name: str):
            # Everything else is delegated to the wrapped locator.
            return getattr(self._locator, name)

    def flaky_locator(selector: str):
        return _FlakyLocator(real_locator(selector))

    monkeypatch.setattr(fake_page, "locator", flaky_locator)
    monkeypatch.setattr(adapter_module, "_MAX_CONSECUTIVE_NO_PROGRESS_PAGES", 1)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))
    filters = ApecSearchFilters(location="France", contract_type="CDI")
    adapter = ApecAdapter(max_listings=10)

    found = adapter.search(["alpha", "beta"], search_filters=filters)

    assert [hit.url for hit in found] == [alpha_offer, beta_offer]
    assert [error.stage for error in adapter.search_errors] == ["search"]
    assert "page=1" in adapter.search_errors[0].url
def test_fetch_listing_html_waits_for_rendered_offer_content(monkeypatch) -> None:
    """Fetching a detail page waits once, with 1000 ms polling, on a script naming the offer selectors."""
    fake_page = _FakeDetailPage({}, rendered_html="<html>rendered offer</html>")
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))

    fetched_html = ApecAdapter().fetch_listing_html(
        "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111"
    )

    assert fetched_html == "<html>rendered offer</html>"
    assert len(fake_page.waited_functions) == 1
    wait_script, polling = fake_page.waited_functions[0]
    for expected_selector in (".container-details-offer h1", ".ref-offre", ".details-offer-list"):
        assert expected_selector in wait_script
    assert "Descriptif du poste" not in wait_script
    assert polling == 1000
def test_fetch_listing_html_accepts_current_cgu_popin_before_waiting_for_detail_content(monkeypatch) -> None:
    """With consent required, fetching a detail page ticks the CGU checkbox and clicks "ACCEPTER" once."""
    fake_page = _FakeDetailPage({}, rendered_html="<html>rendered offer</html>", consent_required=True)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))

    fetched_html = ApecAdapter().fetch_listing_html(
        "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111"
    )

    assert fetched_html == "<html>rendered offer</html>"
    assert fake_page.cgu_checkbox_checked is True
    assert fake_page.consent_button_clicks == ["ACCEPTER"]
    assert fake_page.consent_accepted is True
def test_fetch_listing_html_uses_explicit_company_fallback_chain(monkeypatch) -> None:
    """The wait script names the three offer selectors and does not mention ``companySelectors``."""
    fake_page = _FakeDetailPage({}, rendered_html="<html>rendered offer</html>")
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))

    ApecAdapter().fetch_listing_html(
        "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111"
    )

    wait_script, _polling = fake_page.waited_functions[0]
    assert "companySelectors" not in wait_script
    for expected_selector in (".container-details-offer h1", ".ref-offre", ".details-offer-list"):
        assert expected_selector in wait_script
def test_fetch_listing_html_rejects_redirected_non_apec_urls(monkeypatch) -> None:
    """A detail navigation that ends on the advanced-search URL raises ``ValueError``.

    The patched ``goto`` rewrites ``page.url`` after navigating. The fetch is
    expected to have waited once and navigated once before rejecting the URL.
    """
    detail_url = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111"
    fake_page = _FakeDetailPage({}, rendered_html="<html>redirected</html>")
    real_goto = fake_page.goto

    def redirecting_goto(url: str, wait_until: str | None = None) -> None:
        real_goto(url, wait_until=wait_until)
        fake_page.url = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/recherche-avancee?error=true"

    monkeypatch.setattr(fake_page, "goto", redirecting_goto)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))

    with pytest.raises(ValueError, match="unexpected URL after redirects"):
        ApecAdapter().fetch_listing_html(detail_url)

    # Checked after the `with` block so these assertions actually run.
    assert len(fake_page.waited_functions) == 1
    assert fake_page.goto_urls == ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111"]
def test_fetch_listing_html_rejects_non_apec_hosts() -> None:
    """A look-alike host ("evilapec.fr") is refused with ``ValueError``."""
    lookalike_url = "https://evilapec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111"
    adapter = ApecAdapter()

    with pytest.raises(ValueError):
        adapter.fetch_listing_html(lookalike_url)
def test_fetch_listing_html_reuses_browser_context_across_calls(monkeypatch) -> None:
    """Inside ``browser_session()``, two fetches share one browser launch and open one page each."""

    class FakePage:
        """Page that records navigations and always serves the same HTML."""

        def __init__(self) -> None:
            self.goto_urls: list[str] = []
            self.default_timeout: int | None = None
            self.url = ""

        def set_default_timeout(self, timeout: int) -> None:
            self.default_timeout = timeout

        def goto(self, url: str, wait_until: str | None = None) -> None:
            self.goto_urls.append(url)
            self.url = url

        def wait_for_function(self, function: str, polling: int | None = None, timeout: int | None = None) -> None:
            return None

        def content(self) -> str:
            return "<html>shared</html>"

        def close(self) -> None:
            return None

    class FakeBrowserContext:
        """Context that counts how many pages were opened from it."""

        def __init__(self) -> None:
            self.new_page_calls = 0

        def new_page(self) -> FakePage:
            self.new_page_calls += 1
            return FakePage()

        def close(self) -> None:
            return None

    class FakeBrowser:
        """Browser that always hands out the same context."""

        def __init__(self, browser_context: FakeBrowserContext) -> None:
            self.browser_context = browser_context

        def new_context(self) -> FakeBrowserContext:
            return self.browser_context

        def close(self) -> None:
            return None

    class FakeChromium:
        """Launcher that counts launches and returns the prepared browser."""

        def __init__(self, browser: FakeBrowser) -> None:
            self.browser = browser
            self.launch_calls = 0

        def launch(self, headless: bool = True) -> FakeBrowser:
            self.launch_calls += 1
            return self.browser

    class FakePlaywright:
        def __init__(self, chromium: FakeChromium) -> None:
            self.chromium = chromium

    class FakePlaywrightManager:
        """Context manager mimicking the object returned by ``sync_playwright()``."""

        def __init__(self, chromium: FakeChromium) -> None:
            self.playwright = FakePlaywright(chromium)

        def __enter__(self) -> FakePlaywright:
            return self.playwright

        def __exit__(self, exc_type, exc, tb) -> None:
            return None

    shared_context = FakeBrowserContext()
    fake_browser = FakeBrowser(shared_context)
    fake_chromium = FakeChromium(fake_browser)
    monkeypatch.setattr(adapter_module, "sync_playwright", lambda: FakePlaywrightManager(fake_chromium))
    adapter = ApecAdapter()

    with adapter.browser_session():
        html_one = adapter.fetch_listing_html("https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111")
        html_two = adapter.fetch_listing_html("https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222")

    assert html_one == "<html>shared</html>"
    assert html_two == "<html>shared</html>"
    assert fake_chromium.launch_calls == 1
    assert shared_context.new_page_calls == 2

212
tests/apec/test_dedupe.py Normal file
View File

@ -0,0 +1,212 @@
from job_research.apec.dedupe import dedupe_apec_listings
from job_research.models import ApecListing
def test_dedupe_apec_listings_by_url_preserves_first_listing() -> None:
    """Two listings with the same URL and no job id collapse to the earlier one."""
    shared_url = "https://example.test/job/1"
    earlier = ApecListing(source="apec", url=shared_url, fetched_at="2026-06-01T10:00:00Z")
    later = ApecListing(source="apec", url=shared_url, fetched_at="2026-06-01T10:01:00Z")

    survivors = dedupe_apec_listings([earlier, later])

    assert survivors == [earlier]
def test_dedupe_apec_listings_by_source_job_id_ignores_url_changes() -> None:
    """Listings sharing a job id are duplicates even when their URLs differ."""
    earlier = ApecListing(
        source="apec",
        url="https://example.test/job/1",
        source_job_id="job-123",
        fetched_at="2026-06-01T10:00:00Z",
    )
    later = ApecListing(
        source="apec",
        url="https://example.test/job/2",
        source_job_id="job-123",
        fetched_at="2026-06-01T10:01:00Z",
    )

    survivors = dedupe_apec_listings([earlier, later])

    assert survivors == [earlier]
def test_dedupe_apec_listings_collapses_mixed_key_duplicates() -> None:
    """A listing without a job id is a duplicate of an earlier one that has the same URL and an id."""
    with_job_id = ApecListing(
        source="apec",
        url="https://example.test/job/1",
        source_job_id="job-123",
        fetched_at="2026-06-01T10:00:00Z",
    )
    without_job_id = ApecListing(
        source="apec",
        url="https://example.test/job/1",
        source_job_id=None,
        fetched_at="2026-06-01T10:01:00Z",
    )

    survivors = dedupe_apec_listings([with_job_id, without_job_id])

    assert survivors == [with_job_id]
def test_dedupe_apec_listings_keeps_secondary_ids_from_skipped_rows() -> None:
    """A job id first seen on a skipped duplicate still identifies later duplicates.

    Row 2 shares row 1's URL and introduces "job-123". Row 3 has a new URL but
    the same id. Only the first row is expected to survive.
    """
    url_only = ApecListing(
        source="apec",
        url="url1",
        source_job_id=None,
        fetched_at="2026-06-01T10:00:00Z",
    )
    same_url_with_id = ApecListing(
        source="apec",
        url="url1",
        source_job_id="job-123",
        fetched_at="2026-06-01T10:01:00Z",
    )
    same_id_new_url = ApecListing(
        source="apec",
        url="url2",
        source_job_id="job-123",
        fetched_at="2026-06-01T10:02:00Z",
    )

    survivors = dedupe_apec_listings([url_only, same_url_with_id, same_id_new_url])

    assert survivors == [url_only]
def test_dedupe_apec_listings_merges_metadata_from_duplicate_rows() -> None:
    """The survivor gains the job id and dates that only its duplicate carried."""
    sparse = ApecListing(
        source="apec",
        url="url1",
        source_job_id=None,
        published_at=None,
        refreshed_at=None,
        fetched_at="2026-06-01T10:00:00Z",
    )
    detailed = ApecListing(
        source="apec",
        url="url1",
        source_job_id="job-123",
        published_at="2026-06-01",
        refreshed_at="2026-06-02",
        fetched_at="2026-06-01T10:01:00Z",
    )

    survivors = dedupe_apec_listings([sparse, detailed])

    assert len(survivors) == 1
    merged = survivors[0]
    assert merged.url == "url1"
    assert merged.source_job_id == "job-123"
    assert merged.published_at == "2026-06-01"
    assert merged.refreshed_at == "2026-06-02"
def test_dedupe_apec_listings_merges_metadata_through_alias_chain() -> None:
    """Rows linked only transitively (u1 ~ i2 ~ u2 ~ i4 ~ u6) collapse and share metadata."""
    rows = [
        ApecListing(source="apec", source_job_id=None, url="u1", fetched_at="2026-06-01T10:00:00Z"),
        ApecListing(source="apec", source_job_id="i2", url="u2", fetched_at="2026-06-01T10:01:00Z"),
        ApecListing(source="apec", source_job_id="i4", url="u2", fetched_at="2026-06-01T10:02:00Z"),
        ApecListing(source="apec", source_job_id="i2", url="u1", fetched_at="2026-06-01T10:03:00Z"),
        # Only this last row carries a company value.
        ApecListing(
            source="apec",
            source_job_id="i4",
            url="u6",
            company="NewestCo",
            fetched_at="2026-06-01T10:04:00Z",
        ),
    ]

    deduped = dedupe_apec_listings(rows)

    assert len(deduped) == 1
    survivor = deduped[0]
    assert survivor.url == "u1"
    assert survivor.source_job_id == "i2"
    assert survivor.company == "NewestCo"
def test_dedupe_apec_listings_keeps_one_survivor_for_cluster_alias_chain() -> None:
    """Four rows chained through shared ids and URLs yield one listing carrying the first row's keys."""
    rows = [
        ApecListing(source="apec", source_job_id="id2", url="u2", fetched_at="2026-06-01T10:00:00Z"),
        ApecListing(source="apec", source_job_id="id3", url="u2", fetched_at="2026-06-01T10:01:00Z"),
        ApecListing(source="apec", source_job_id="id2", url="u1", fetched_at="2026-06-01T10:02:00Z"),
        ApecListing(source="apec", source_job_id="id3", url="u3", fetched_at="2026-06-01T10:03:00Z"),
    ]

    deduped = dedupe_apec_listings(rows)

    assert len(deduped) == 1
    survivor = deduped[0]
    assert survivor.url == "u2"
    assert survivor.source_job_id == "id2"
def test_dedupe_apec_listings_keeps_first_listing_as_bridge_survivor() -> None:
    """A third row bridging two previously unrelated rows merges everything into the first."""
    rows = [
        ApecListing(source="apec", source_job_id="id1", url="u1", fetched_at="2026-06-01T10:00:00Z"),
        ApecListing(source="apec", source_job_id="id2", url="u2", fetched_at="2026-06-01T10:01:00Z"),
        # Bridge: id of the first row, URL of the second row.
        ApecListing(
            source="apec",
            source_job_id="id1",
            url="u2",
            company="NewestCo",
            fetched_at="2026-06-01T10:02:00Z",
        ),
    ]

    deduped = dedupe_apec_listings(rows)

    assert len(deduped) == 1
    survivor = deduped[0]
    assert survivor.url == "u1"
    assert survivor.source_job_id == "id1"
    assert survivor.company == "NewestCo"

View File

@ -0,0 +1,372 @@
from job_research.apec.normalize import normalize_apec_listing
def test_normalize_apec_listing_extracts_minimal_shape() -> None:
    """A fully populated detail page yields every expected listing field."""
    page = """
<html>
<body>
<main class="container-details-offer">
<h1>Data Engineer F/H</h1>
<div class="card-offer">
<div class="ref-offre">Ref. Apec : 178554452W</div>
<ul class="details-offer-list mb-20">
<li>CLOUD TEMPLE</li>
<li>1 <span> CDI </span></li>
<li>Puteaux - 92</li>
</ul>
<p>Publiée le 20/04/2026 Actualisée le 02/06/2026</p>
</div>
<article class="card card-ents mb-20">
<div class="list-hzt mb-20">
<span class="ents-name">CLOUD TEMPLE</span>
</div>
</article>
<div class="details-post">
<h4>Salaire</h4>
<span>A partir de 70 k brut annuel</span>
</div>
<div class="details-post">
<h4>Prise de poste</h4>
<span>Dès que possible</span>
</div>
<div class="details-post">
<h4>Expérience</h4>
<span>Minimum 7 ans</span>
</div>
<div class="details-post">
<h4>Descriptif du poste</h4>
<p>Build pipelines</p>
<div class="nested-late-sections">
<h4>Profil recherché</h4>
<p>Python / SQL</p>
<h4>Compétences attendues</h4>
<p>Ignored</p>
<h4>Entreprise</h4>
<p>Ignored</p>
<div class="recruiter">Ignored recruiter info</div>
</div>
</div>
</main>
</body>
</html>
"""

    listing = normalize_apec_listing(
        url="https://example.test/job/123",
        html=page,
        fetched_at="2026-06-01T10:00:00Z",
        source_job_id="job-123",
    )

    # Checked in declaration order so the first mismatch reported is the earliest field.
    expected_fields = {
        "source": "apec",
        "source_job_id": "job-123",
        "url": "https://example.test/job/123",
        "title": "Data Engineer F/H",
        "company": "CLOUD TEMPLE",
        "location": "Puteaux - 92",
        "contract_type": "CDI",
        "description_text": "Build pipelines",
        "published_at": "2026-04-20",
        "refreshed_at": "2026-06-02",
        "fetched_at": "2026-06-01T10:00:00Z",
    }
    for field_name, expected_value in expected_fields.items():
        assert getattr(listing, field_name) == expected_value
def test_normalize_apec_listing_prefers_final_source_job_id_from_detail_page() -> None:
    """The reference on an apec.fr detail page replaces the id the caller asked for."""
    requested_id = "REQUESTED123"
    page = """
<html>
<body>
<main class="container-details-offer">
<h1>Data Engineer F/H</h1>
<div class="card-offer">
<div class="ref-offre">Ref. Apec : FINAL456</div>
<ul class="details-offer-list mb-20">
<li>CLOUD TEMPLE</li>
<li>1 <span> CDI </span></li>
<li>Puteaux - 92</li>
</ul>
<p>Publiée le 20/04/2026 Actualisée le 02/06/2026</p>
</div>
<article class="card card-ents mb-20">
<div class="list-hzt mb-20">
<span class="ents-name">CLOUD TEMPLE</span>
</div>
</article>
<div class="details-post">
<h4>Descriptif du poste</h4>
<p>Build pipelines</p>
</div>
</main>
</body>
</html>
"""

    listing = normalize_apec_listing(
        url="https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/REQUESTED123",
        html=page,
        fetched_at="2026-06-01T10:00:00Z",
        source_job_id=requested_id,
    )

    assert listing.source_job_id == "FINAL456"
def test_normalize_apec_listing_warns_and_returns_none_for_invalid_dates() -> None:
    """Impossible calendar dates produce None values plus one warning per date field."""
    page = """
<html>
<body>
<main class="container-details-offer">
<h1>Data Engineer F/H</h1>
<div class="card-offer">
<div class="ref-offre">Ref. Apec : 178554452W</div>
<ul class="details-offer-list mb-20">
<li>CLOUD TEMPLE</li>
<li>1 <span> CDI </span></li>
<li>Puteaux - 92</li>
</ul>
<p>Publiée le 32/13/2026 Actualisée le 31/02/2026</p>
</div>
<article class="card card-ents mb-20">
<div class="list-hzt mb-20">
<span class="ents-name">CLOUD TEMPLE</span>
</div>
</article>
<div class="details-post">
<h4>Descriptif du poste</h4>
<p>Build pipelines</p>
</div>
</main>
</body>
</html>
"""

    listing = normalize_apec_listing(
        url="https://example.test/job/123",
        html=page,
        fetched_at="2026-06-01T10:00:00Z",
        source_job_id="job-123",
    )

    assert listing.published_at is None
    assert listing.refreshed_at is None
    warned_fields = [item.field for item in listing.warnings]
    assert warned_fields == ["published_at", "refreshed_at"]
def test_normalize_apec_listing_uses_details_offer_list_company_fallback() -> None:
    """Without an ents-name element the company comes from the details-offer-list."""
    page = """
<html>
<body>
<main class="container-details-offer">
<h1>Data Engineer F/H</h1>
<div class="card-offer">
<div class="ref-offre">Ref. Apec : 178554452W</div>
<ul class="details-offer-list mb-20">
<li>Fallback Company</li>
<li>1 <span> CDI </span></li>
<li>Puteaux - 92</li>
</ul>
<p>Publiée le 20/04/2026 Actualisée le 02/06/2026</p>
</div>
<div class="details-post">
<h4>Salaire</h4>
<span>A partir de 70 k brut annuel</span>
</div>
<div class="details-post">
<h4>Prise de poste</h4>
<span>Dès que possible</span>
</div>
<div class="details-post">
<h4>Expérience</h4>
<span>Minimum 7 ans</span>
</div>
<div class="details-post">
<h4>Descriptif du poste</h4>
<p>Build pipelines</p>
<div class="nested-late-sections">
<h4>Profil recherché</h4>
<p>Python / SQL</p>
<h4>Compétences attendues</h4>
<p>Ignored</p>
<h4>Entreprise</h4>
<p>Ignored</p>
<div class="recruiter">Ignored recruiter info</div>
</div>
</div>
</main>
</body>
</html>
"""

    listing = normalize_apec_listing(
        url="https://example.test/job/123",
        html=page,
        fetched_at="2026-06-01T10:00:00Z",
        source_job_id=None,
    )

    assert listing.company == "Fallback Company"
    assert listing.description_text == "Build pipelines"
    assert listing.refreshed_at == "2026-06-02"
def test_normalize_apec_listing_records_warnings_for_fallback_and_missing_fields() -> None:
    """A sparse page yields warnings, in a fixed order, for each fallback or missing field."""
    page = """
<html>
<body>
<main class="container-details-offer">
<div class="card-offer">
<div class="ref-offre">Ref. Apec : 178554452W</div>
<ul class="details-offer-list mb-20">
<li>Fallback Company</li>
<li>1 CDI</li>
</ul>
</div>
<article class="card card-ents mb-20">
<div class="list-hzt mb-20">
<span class="card-ents-quote">Fallback Company</span>
</div>
</article>
</main>
<h1>Fallback Title</h1>
</body>
</html>
"""
    expected_warning_fields = [
        "title",
        "location",
        "contract_type",
        "description_text",
        "source_job_id",
        "company",
        "published_at",
    ]

    listing = normalize_apec_listing(
        url="https://example.test/job/123",
        html=page,
        fetched_at="2026-06-01T10:00:00Z",
        source_job_id=None,
    )

    warned_fields = [item.field for item in listing.warnings]
    assert warned_fields == expected_warning_fields
def test_normalize_apec_listing_records_warnings_for_placeholder_text_values() -> None:
    """Placeholder title, location and contract values become None and are each warned about."""
    page = """
<html>
<body>
<main class="container-details-offer">
<h1> N/A </h1>
<div class="card-offer">
<div class="ref-offre">Ref. Apec : 178554452W</div>
<ul class="details-offer-list mb-20">
<li>Example Corp</li>
<li>1 <span> N/A </span></li>
<li> - </li>
</ul>
<p>Publiée le 20/04/2026 Actualisée le 02/06/2026</p>
</div>
<article class="card card-ents mb-20">
<div class="list-hzt mb-20">
<span class="ents-name">Example Corp</span>
</div>
</article>
<div class="details-post">
<h4>Descriptif du poste</h4>
<p>Build pipelines</p>
</div>
</main>
</body>
</html>
"""

    listing = normalize_apec_listing(
        url="https://example.test/job/123",
        html=page,
        fetched_at="2026-06-01T10:00:00Z",
        source_job_id="178554452W",
    )

    assert listing.title is None
    assert listing.location is None
    assert listing.contract_type is None
    warned_fields = [item.field for item in listing.warnings]
    assert warned_fields == ["title", "location", "contract_type"]
def test_normalize_apec_listing_records_warning_for_placeholder_company() -> None:
    """A placeholder ents-name yields company=None with a single company warning."""
    page = """
<html>
<body>
<main class="container-details-offer">
<h1>Data Engineer F/H</h1>
<div class="card-offer">
<div class="ref-offre">Ref. Apec : 178554452W</div>
<ul class="details-offer-list mb-20">
<li>Example Corp</li>
<li>1 <span> CDI </span></li>
<li>Puteaux - 92</li>
</ul>
<p>Publiée le 20/04/2026 Actualisée le 02/06/2026</p>
</div>
<article class="card card-ents mb-20">
<div class="list-hzt mb-20">
<span class="ents-name">N/A</span>
</div>
</article>
<div class="details-post">
<h4>Descriptif du poste</h4>
<p>Build pipelines</p>
</div>
</main>
</body>
</html>
"""

    listing = normalize_apec_listing(
        url="https://example.test/job/123",
        html=page,
        fetched_at="2026-06-01T10:00:00Z",
        source_job_id="178554452W",
    )

    assert listing.company is None
    assert listing.refreshed_at == "2026-06-02"
    warned_fields = [item.field for item in listing.warnings]
    assert warned_fields == ["company"]
def test_normalize_apec_listing_records_warning_for_placeholder_description_text() -> None:
    """A placeholder job description yields description_text=None with a single warning."""
    page = """
<html>
<body>
<main class="container-details-offer">
<h1>Data Engineer F/H</h1>
<div class="card-offer">
<div class="ref-offre">Ref. Apec : 178554452W</div>
<ul class="details-offer-list mb-20">
<li>CLOUD TEMPLE</li>
<li>1 <span> CDI </span></li>
<li>Puteaux - 92</li>
</ul>
<p>Publiée le 20/04/2026 Actualisée le 02/06/2026</p>
</div>
<article class="card card-ents mb-20">
<div class="list-hzt mb-20">
<span class="ents-name">CLOUD TEMPLE</span>
</div>
</article>
<div class="details-post">
<h4>Descriptif du poste</h4>
<p>N/A</p>
</div>
</main>
</body>
</html>
"""

    listing = normalize_apec_listing(
        url="https://example.test/job/123",
        html=page,
        fetched_at="2026-06-01T10:00:00Z",
        source_job_id="job-123",
    )

    assert listing.description_text is None
    warned_fields = [item.field for item in listing.warnings]
    assert warned_fields == ["description_text"]

View File

@ -0,0 +1,75 @@
from job_research.apec.adapter import ApecSearchFilters
from job_research.apec.query_derivation import derive_apec_queries, derive_apec_search_filters
from job_research.models import CandidateProfileOutput
def test_derive_apec_queries_preserves_order_dedupes_and_caps_at_five() -> None:
    """Seven target roles with one repeat yield the first five distinct roles, in order."""
    roles = [
        "Data Engineer",
        "Analytics Engineer",
        "Data Engineer",
        "BI Engineer",
        "Junior Data Platform Engineer",
        "ML Engineer",
        "Backend Engineer",
    ]
    profile = CandidateProfileOutput(
        target_roles=roles,
        strengths=["Python", "SQL"],
        skills_to_emphasize=["BigQuery", "Terraform"],
    )

    assert derive_apec_queries(profile) == [
        "Data Engineer",
        "Analytics Engineer",
        "BI Engineer",
        "Junior Data Platform Engineer",
        "ML Engineer",
    ]
def test_derive_apec_queries_uses_up_to_five_target_roles_when_no_support_terms_exist() -> None:
    """With only target roles set, the queries are the first five roles unchanged."""
    roles = [
        "Data Engineer",
        "Analytics Engineer",
        "BI Engineer",
        "Junior Data Platform Engineer",
        "ML Engineer",
        "Backend Engineer",
    ]
    profile = CandidateProfileOutput(target_roles=roles)

    assert derive_apec_queries(profile) == [
        "Data Engineer",
        "Analytics Engineer",
        "BI Engineer",
        "Junior Data Platform Engineer",
        "ML Engineer",
    ]
def test_derive_apec_queries_uses_support_terms_without_constraints() -> None:
    """Strengths and emphasized skills extend the role query; constraints never appear in it."""
    profile = CandidateProfileOutput(
        target_roles=["Data Engineer"],
        strengths=["Python"],
        skills_to_emphasize=["BigQuery"],
        constraints=["CDI only", "France only"],
    )
    expected_queries = [
        "Data Engineer",
        "Data Engineer Python",
        "Data Engineer BigQuery",
    ]

    assert derive_apec_queries(profile) == expected_queries
def test_derive_apec_search_filters_from_constraints() -> None:
    """The "CDI only" and "France only" constraints map to contract and location filters."""
    expected_filters = ApecSearchFilters(location="France", contract_type="CDI")
    profile = CandidateProfileOutput(constraints=["CDI only", "France only"])

    assert derive_apec_search_filters(profile) == expected_filters

View File

@ -29,6 +29,23 @@ def test_extract_cv_signals_reads_basic_fields_from_text() -> None:
assert len(extracted["experience_entries"]) == 2
def test_extract_cv_signals_flags_low_confidence_first_line_as_name() -> None:
    """A tagline-like first line is still used as the name, but with a name warning attached."""
    cv_text = dedent(
        """
Data Engineer | Python | GCP
Location: France
Languages: French, English
Skills: Python, SQL
Data Engineer at Company A
"""
    ).strip()

    signals = extract_cv_signals(cv_text)

    assert signals["name"] == "Data Engineer | Python | GCP"
    warned_fields = [item.field for item in signals["warnings"]]
    assert warned_fields == ["name"]
@pytest.mark.parametrize(
("line", "expected"),
[

View File

@ -1,5 +1,6 @@
from job_research.profile.merge import build_candidate_profile_output
from job_research.profile.profile_parser import AuthoredProfile
from job_research.models import WarningItem
def test_build_candidate_profile_output_writes_warning_when_facts_conflict() -> None:
@ -72,3 +73,34 @@ def test_build_candidate_profile_output_warns_on_missing_core_cv_facts() -> None
"skills",
"education_entries",
]
def test_build_candidate_profile_output_propagates_cv_extraction_warnings() -> None:
    """Warnings present in the CV signals appear unchanged on the merged profile output."""
    warning_message = "First CV line looks like a header or tagline; review manually."
    authored = AuthoredProfile(
        summary="Junior data engineer focused on GCP.",
        target_roles=["Data Engineer"],
        strengths=["Python"],
        skills_to_emphasize=["BigQuery", "GCP"],
        constraints=["CDI only"],
        notes=[],
    )
    cv_signals = {
        "name": "Data Engineer | Python | GCP",
        "location": "France",
        "languages": ["French", "English"],
        "skills": ["Python", "SQL"],
        "experience_entries": [{"title": "Data Engineer", "company": "A"}],
        "education_entries": [{"credential": "MSc", "institution": "Example University"}],
        "warnings": [WarningItem(field="name", message=warning_message)],
    }

    output = build_candidate_profile_output(cv_signals, authored)

    # A separately built instance, so the check is by value rather than identity.
    expected_warnings = [WarningItem(field="name", message=warning_message)]
    assert output.warnings == expected_warnings

View File

@ -72,3 +72,38 @@ def test_parse_profile_markdown_rejects_unsupported_list_content() -> None:
with pytest.raises(ValueError, match="Unsupported content in section 'target roles'"):
parse_profile_markdown(markdown)
@pytest.mark.parametrize("section_name", ["Target Roles", "Notes"])
def test_parse_profile_markdown_rejects_blank_bullet_only_required_sections(
    section_name: str,
) -> None:
    """A section holding only an empty bullet is rejected as having no usable content."""
    # Start from valid bodies, then blank out the section under test.
    target_roles = "- Data Engineer"
    notes = "Slight preference for French listings."
    if section_name == "Target Roles":
        target_roles = "- "
    if section_name == "Notes":
        notes = "- "

    markdown = dedent(
        f"""
# Candidate Profile
## Summary
Junior data engineer focused on Python and GCP.
## Target Roles
{target_roles}
## Strengths
- Python
## Skills To Emphasize
- BigQuery
## Constraints
- CDI only
## Notes
{notes}
"""
    ).strip()

    expected_error = f"Missing usable content in section '{section_name.lower()}'"
    with pytest.raises(ValueError, match=expected_error):
        parse_profile_markdown(markdown)

1467
tests/test_apec_cli.py Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,82 @@
from pathlib import Path
from job_research.models import ApecListing, ApecRunMeta, ApecSnapshotMeta, ListingWarning
from job_research.storage import apec_run_paths
# Fixed run identifier reused by the tests in this module so expected paths and metadata are deterministic.
FIXED_RUN_ID = "2026-06-01T10-00-00-123456Z"
def test_apec_models_serialize_expected_listing_shape() -> None:
    """Listing and run-meta models dump to dicts exposing the expected keys and values."""
    location_warning = ListingWarning(
        field="location",
        message="Location inferred from page text",
    )
    listing = ApecListing(
        source="apec",
        source_job_id="123",
        url="https://example.test/job/123",
        title="Data Engineer",
        company="Example",
        location="Paris",
        contract_type="CDI",
        description_text="Build pipelines",
        published_at="2026-06-01",
        refreshed_at="2026-06-02",
        fetched_at="2026-06-01T10:00:00Z",
        warnings=[location_warning],
    )
    snapshot_meta = ApecSnapshotMeta(
        url="https://example.test/job/123",
        source_job_id="123",
        snapshot_file="job-123.html",
        fetched_at="2026-06-01T10:00:00Z",
    )
    run_meta = ApecRunMeta(
        run_id=FIXED_RUN_ID,
        run_started_at="2026-06-01T10:00:00Z",
        derived_queries=["Data Engineer"],
        snapshots=[snapshot_meta],
        fetched_count=1,
        normalized_count=1,
        deduplicated_count=1,
        failed_count=0,
        listing_errors=[],
    )

    listing_dump = listing.model_dump()
    assert listing_dump["source"] == "apec"
    assert listing_dump["warnings"][0]["field"] == "location"
    assert listing_dump["refreshed_at"] == "2026-06-02"

    run_meta_dump = run_meta.model_dump()
    assert run_meta_dump["run_id"] == FIXED_RUN_ID
    assert run_meta_dump["run_started_at"] == "2026-06-01T10:00:00Z"
    assert run_meta_dump["derived_queries"] == ["Data Engineer"]

    # JSON mode is used for snapshots so the comparison is against plain values.
    expected_snapshot = {
        "url": "https://example.test/job/123",
        "source_job_id": "123",
        "snapshot_file": "job-123.html",
        "fetched_at": "2026-06-01T10:00:00Z",
    }
    assert run_meta.model_dump(mode="json")["snapshots"] == [expected_snapshot]
def test_apec_run_paths_builds_expected_layout(tmp_path: Path) -> None:
    """Run paths live under <root>/apec/runs/<run_id> with fixed artifact names."""
    expected_run_dir = tmp_path / "apec" / "runs" / FIXED_RUN_ID

    paths = apec_run_paths(tmp_path, run_id=FIXED_RUN_ID)

    assert paths["run_dir"] == expected_run_dir
    assert paths["listings"] == expected_run_dir / "listings.yaml"
    assert paths["run_meta"] == expected_run_dir / "run-meta.yaml"
    assert paths["snapshots"] == expected_run_dir / "snapshots"
def test_apec_run_artifacts_include_snapshot_and_meta(tmp_path: Path) -> None:
    """A snapshot written into the run's snapshots directory can be read back verbatim."""
    # NOTE(review): despite the name, only the snapshot round-trip is exercised here;
    # no run-meta artifact is written or checked.
    snapshots_dir = apec_run_paths(tmp_path, run_id=FIXED_RUN_ID)["snapshots"]
    snapshots_dir.mkdir(parents=True, exist_ok=True)

    snapshot_file = snapshots_dir / "job-123.html"
    snapshot_file.write_text("<html>snapshot</html>", encoding="utf-8")

    assert snapshot_file.read_text(encoding="utf-8") == "<html>snapshot</html>"

View File

@ -1,6 +1,10 @@
from subprocess import run
from textwrap import dedent
from pathlib import Path
from typer.testing import CliRunner
from job_research.cli import app
from job_research.storage import load_yaml
@ -147,6 +151,78 @@ def test_build_profile_reports_when_no_warnings_are_included(tmp_path) -> None:
assert "No warnings included." in result.stdout
def test_build_profile_reports_output_write_failures_cleanly(tmp_path, monkeypatch) -> None:
    """An OSError while writing the output file exits with code 1 and a readable stderr message."""
    cv_text = dedent(
        """
Tonio Example
Location: France
Languages: French, English
Skills: Python, SQL
Data Engineer at Acme
Education: Master of Science at Example University
"""
    ).strip()
    profile_text = dedent(
        """
# Candidate Profile
## Summary
Junior data engineer focused on Python and GCP.
## Target Roles
- Data Engineer
## Strengths
- Python
- SQL
## Skills To Emphasize
- GCP
- BigQuery
## Constraints
- CDI only
- France only
## Notes
- Slight preference for French listings.
"""
    ).strip()

    cv = tmp_path / "cv.txt"
    cv.write_text(cv_text, encoding="utf-8")
    profile = tmp_path / "profile.md"
    profile.write_text(profile_text, encoding="utf-8")
    out = tmp_path / "candidate-profile.yaml"

    real_write_text = Path.write_text

    def failing_write_text(
        self: Path,
        data: str,
        encoding: str | None = None,
        errors: str | None = None,
        newline: str | None = None,
    ) -> int:
        # Fail only for the output file; every other write goes to the real method.
        if self == out:
            raise OSError("disk full")
        return real_write_text(self, data, encoding=encoding, errors=errors, newline=newline)

    monkeypatch.setattr(Path, "write_text", failing_write_text)

    cli_args = ["build-profile", "--cv", str(cv), "--profile", str(profile), "--out", str(out)]
    result = CliRunner().invoke(app, cli_args)

    assert result.exit_code == 1
    assert not out.exists()
    assert "Unable to write candidate profile to" in result.stderr
    assert "disk full" in result.stderr
    assert "Traceback" not in result.stderr
def test_build_profile_rejects_empty_cv_text_before_writing(tmp_path) -> None:
cv = tmp_path / "cv.txt"
cv.write_text(" \n", encoding="utf-8")
@ -189,3 +265,100 @@ def test_build_profile_rejects_empty_cv_text_before_writing(tmp_path) -> None:
assert result.returncode != 0
assert not out.exists()
assert "No readable text found in CV input" in result.stderr
assert "Traceback" not in result.stderr
def test_build_profile_reports_unreadable_pdf_input_cleanly(tmp_path, monkeypatch) -> None:
    """A PDF extraction failure exits with code 1 and reports the cause without a traceback."""
    profile_text = dedent(
        """
# Candidate Profile
## Summary
Junior data engineer.
## Target Roles
- Data Engineer
## Strengths
- Python
## Skills To Emphasize
- BigQuery
## Constraints
- CDI only
## Notes
- Slight preference for French listings.
"""
    ).strip()

    cv = tmp_path / "cv.pdf"
    cv.write_bytes(b"%PDF-1.4\n")
    profile = tmp_path / "profile.md"
    profile.write_text(profile_text, encoding="utf-8")
    out = tmp_path / "candidate-profile.yaml"

    def raise_broken_pdf(path):
        # Stand-in for the CLI's PDF extractor that always fails.
        raise ValueError("broken pdf")

    monkeypatch.setattr("job_research.cli.extract_pdf_text", raise_broken_pdf)

    cli_args = ["build-profile", "--cv", str(cv), "--profile", str(profile), "--out", str(out)]
    result = CliRunner().invoke(app, cli_args)

    assert result.exit_code == 1
    assert not out.exists()
    assert "CV input not readable" in result.stderr
    assert "broken pdf" in result.stderr
    assert "Traceback" not in result.stderr
def test_build_profile_reports_malformed_profile_markdown_cleanly(tmp_path) -> None:
    """A profile whose Target Roles section is not a bullet list exits with code 1 and no traceback."""
    cv_text = dedent(
        """
Tonio Example
Location: France
Languages: French, English
Skills: Python
"""
    ).strip()
    # "Target Roles" holds a plain line instead of a bullet.
    profile_text = dedent(
        """
# Candidate Profile
## Summary
Junior data engineer.
## Target Roles
Data Engineer
## Strengths
- Python
## Skills To Emphasize
- BigQuery
## Constraints
- CDI only
## Notes
- Slight preference for French listings.
"""
    ).strip()

    cv = tmp_path / "cv.txt"
    cv.write_text(cv_text, encoding="utf-8")
    profile = tmp_path / "profile.md"
    profile.write_text(profile_text, encoding="utf-8")
    out = tmp_path / "candidate-profile.yaml"

    cli_args = ["build-profile", "--cv", str(cv), "--profile", str(profile), "--out", str(out)]
    result = CliRunner().invoke(app, cli_args)

    assert result.exit_code == 1
    assert not out.exists()
    assert "profile markdown invalid" in result.stderr
    assert "Traceback" not in result.stderr

114
uv.lock generated
View File

@ -20,6 +20,19 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },
]
[[package]]
name = "beautifulsoup4"
version = "4.14.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "soupsieve" },
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/c3/b0/1c6a16426d389813b48d95e26898aff79abbde42ad353958ad95cc8c9b21/beautifulsoup4-4.14.3.tar.gz", hash = "sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86", size = 627737, upload-time = "2025-11-30T15:08:26.084Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb", size = 107721, upload-time = "2025-11-30T15:08:24.087Z" },
]
[[package]]
name = "colorama"
version = "0.4.6"
@ -29,6 +42,63 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
]
[[package]]
name = "greenlet"
version = "3.5.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/6d/6e/802acd792aebb2256fbbee8cacf2727faaeb6f240ac11008f09eae4414bc/greenlet-3.5.1.tar.gz", hash = "sha256:5a56aeb7d5d9cc4b3a735efb5095bd4b4f6f0e4f93e5ca876d0e2315137b7829", size = 197356, upload-time = "2026-05-20T15:05:03.917Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/27/69/7f7e5372d998b81001899b1c0823c957aa413ba0f2662e65821611cc31e4/greenlet-3.5.1-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:51518ff74664078fc51bffcc6fc529b0df5ae58da192691cee765d45ce944a2b", size = 285060, upload-time = "2026-05-20T13:08:51.899Z" },
{ url = "https://files.pythonhosted.org/packages/b1/bf/387f9b6b865fd2ae0d0be09e0004827295a01b71be76ed350dd1e28a91a4/greenlet-3.5.1-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1ffdb3c0bb002c99cd8f298957e046c3dbf6006b5b7cdf11a4e19194624a0a0a", size = 604370, upload-time = "2026-05-20T14:00:07.492Z" },
{ url = "https://files.pythonhosted.org/packages/32/f5/169ce3d4e4c67291bd18f8cbe0299c9f3e45102c7f1fb3c14780c93e4532/greenlet-3.5.1-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7715a5a2c3378ba602c3a440558261e13a820bb53a82693aacd7b7f6d964e283", size = 616987, upload-time = "2026-05-20T14:05:44.237Z" },
{ url = "https://files.pythonhosted.org/packages/19/ba/c24110c55dffa55aa6e1d98b45310da33801aeba7686ff0190fe5d46fd32/greenlet-3.5.1-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d40a890035c0058cadbdc4af7569800fd28a0e527a0fdbb7b5f9418f176846ce", size = 622911, upload-time = "2026-05-20T14:09:10.598Z" },
{ url = "https://files.pythonhosted.org/packages/ee/e5/7f2e41d5273be07e77560d61ea4e56485b4d6c316d2a84518c62d1364061/greenlet-3.5.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dc71ff466927a201b08305acac451ebe1aedfcea002f62f1f2f2ac2ac1e6a135", size = 613911, upload-time = "2026-05-20T13:14:27.539Z" },
{ url = "https://files.pythonhosted.org/packages/ec/7b/d20db2e8a5ad6c038702f3179b136f93f0a3d1a21a0c0777f3e470cdf4b2/greenlet-3.5.1-cp313-cp313-manylinux_2_39_riscv64.whl", hash = "sha256:67821bb03e4e98664490edb787ff6af501194c29bbee0f5c1dfdcf1dc3d9d436", size = 425228, upload-time = "2026-05-20T14:01:40.837Z" },
{ url = "https://files.pythonhosted.org/packages/c5/a4/fbdc67579b73615a1f91615e814303cc71e06128f7baaba87be79b8fb90c/greenlet-3.5.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cd443683db272ebaaca03af98c0b063ab30db70ea8a31a1559f35e3f7b744ccd", size = 1570689, upload-time = "2026-05-20T14:02:27.225Z" },
{ url = "https://files.pythonhosted.org/packages/e6/b4/77abbe35078be39718a46cd49caf16bceb35662f97a34101dca28aa98e47/greenlet-3.5.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:089fff7a6ce8d9316d1f65ebc00273a56be258c1725b32b94de90a3a979557e1", size = 1635602, upload-time = "2026-05-20T13:14:36.344Z" },
{ url = "https://files.pythonhosted.org/packages/37/f7/129f27ca700845b8ee8ca88ce7f43435a1239c2eddb7677fc938822762cf/greenlet-3.5.1-cp313-cp313-win_amd64.whl", hash = "sha256:110a1ca7b49b014b097f6078272c3f4ed31af45b254de5228b79adba879f6af9", size = 238683, upload-time = "2026-05-20T13:11:50.57Z" },
{ url = "https://files.pythonhosted.org/packages/6d/5c/a485a36e87df8d8fd0632ee01511244f5156a20ed3746cc6599340326395/greenlet-3.5.1-cp313-cp313-win_arm64.whl", hash = "sha256:f16ba1efc0715b680a18b8123d90dad887c6112ae3555b4b5c32c149540c6b4e", size = 235499, upload-time = "2026-05-20T13:12:42.028Z" },
{ url = "https://files.pythonhosted.org/packages/8a/cb/c62454606daf5640369c94d8a9dd540599b1bfc090e2d2180cb77f4038d2/greenlet-3.5.1-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:d8ab31c9de8651a2facdd5c5bb0011f2380dd1a7af78ce2adf4b56095294fc07", size = 285579, upload-time = "2026-05-20T13:08:56.396Z" },
{ url = "https://files.pythonhosted.org/packages/ec/71/c4270398c2eba968a6071af1dfbdcaeee6ec1c24bc8b435b8cc452700da6/greenlet-3.5.1-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5e300185139abc337ade480c327183adf42a875ac7181bfe66d7d4efea31fbea", size = 651106, upload-time = "2026-05-20T14:00:09.448Z" },
{ url = "https://files.pythonhosted.org/packages/1a/ab/71e34b78a44ec271fb5f550c17bc46d301ddc5953890d935f270b0dcdb5a/greenlet-3.5.1-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7ffdb990dcaa0234cf9845aead5df2e3c3a8b6507d409274dd87e0d5ab05ffc2", size = 663478, upload-time = "2026-05-20T14:05:45.88Z" },
{ url = "https://files.pythonhosted.org/packages/c6/2d/2d80842910da44f78c286532d084b8a5c3717c844ae80ceb3858738ae89a/greenlet-3.5.1-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6c09df69dc1712d131332054a858a3e5cca400967fa3a672e2324fbb0971448c", size = 667767, upload-time = "2026-05-20T14:09:12.15Z" },
{ url = "https://files.pythonhosted.org/packages/77/96/4efd6fa5c62c85426a0c19077a586258ebc3a2a146ff2493e4312a697a22/greenlet-3.5.1-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2f82b3597e9d83b63408affed0b48fd0f54935edac4302237b9a837be0dae33c", size = 660800, upload-time = "2026-05-20T13:14:29.129Z" },
{ url = "https://files.pythonhosted.org/packages/e9/d3/dad2eecedfbb1ed7050a20dcfae40c1442b74bc7423608be2c7e03ee7133/greenlet-3.5.1-cp314-cp314-manylinux_2_39_riscv64.whl", hash = "sha256:a4764e0bfc6a4d114c865b32520805c16a990ef5f286a514413b05d5ecd6a23d", size = 470786, upload-time = "2026-05-20T14:01:42.064Z" },
{ url = "https://files.pythonhosted.org/packages/7a/e0/6c71401a25cac7000261304e866a2f2cc04dc74810d40e2f118aa4799495/greenlet-3.5.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c0141e37414c10164e702b8fb1473304221ad98f71600850c6ef7ff4880feba0", size = 1617518, upload-time = "2026-05-20T14:02:28.662Z" },
{ url = "https://files.pythonhosted.org/packages/41/26/c5c06643e8c0af9e7bf18e16cb51d0ab7625155f0392e1c9015d66d556cd/greenlet-3.5.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:50ae25a67bea74ea41fb14b960bc532df73eb713417b2d61892dced82fe8d3bc", size = 1681593, upload-time = "2026-05-20T13:14:39.417Z" },
{ url = "https://files.pythonhosted.org/packages/8a/bd/e11a108317485075e68af9d23039619b86b28130c3b50d227d42edece64b/greenlet-3.5.1-cp314-cp314-win_amd64.whl", hash = "sha256:8a17c42330e261299766b75ac1ea32caa437a9453c8f65d16a13140db378ecd3", size = 239800, upload-time = "2026-05-20T13:09:30.128Z" },
{ url = "https://files.pythonhosted.org/packages/47/f8/8e8e8417b7bf28639a5a56356ef934d0375e1d0c70a57e04d7701e870ffe/greenlet-3.5.1-cp314-cp314-win_arm64.whl", hash = "sha256:7b5f5fae05b8ac6d176a61b60c394a8cbdc2b5b91b81793066e68745cf165e54", size = 236862, upload-time = "2026-05-20T13:09:10.498Z" },
{ url = "https://files.pythonhosted.org/packages/90/12/41bf27fde4d3605d3773ae57751eda182b8be2f5398011c041173b1d9534/greenlet-3.5.1-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:ea8da1e900d758d078810d4255d8c6aa572181896a31ec79d779eb79c3adc9ad", size = 293637, upload-time = "2026-05-20T13:12:35.529Z" },
{ url = "https://files.pythonhosted.org/packages/44/44/ba14b23e9757707050c2f397d305bbcae62e5d7cad122f8b6baec5ae4a1f/greenlet-3.5.1-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a19570c52a21420dcbc94e661994bc325c0b5b11304540fed514586da5dc8f2e", size = 650840, upload-time = "2026-05-20T14:00:11.079Z" },
{ url = "https://files.pythonhosted.org/packages/a8/37/5ddc2b686a6844f91abecef43411842426da2e1573f60b49ecf2547f4ae1/greenlet-3.5.1-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3d955c89b75eeca4723d7cc14135f393cd47c32e2a6cb4a8e4c6e760a26b0986", size = 656416, upload-time = "2026-05-20T14:05:47.118Z" },
{ url = "https://files.pythonhosted.org/packages/8c/46/5987dcd1a2570ba84f3b187536b2ca3ae97613387e57f5cfa99df068fe5e/greenlet-3.5.1-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ea37d5a157eb9493820d3792ac4ece28619a394391d2b9f2f78057d396ff0f0f", size = 656607, upload-time = "2026-05-20T14:09:13.949Z" },
{ url = "https://files.pythonhosted.org/packages/e1/f0/d17510297c35a2992712f0bf84de3779749999f7d3d63aa1f09db7c62dbe/greenlet-3.5.1-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:de2daaaebd1a5aa88c49045b6baf9310b3263796bd88db713edf37cf53e7bb4e", size = 654397, upload-time = "2026-05-20T13:14:30.696Z" },
{ url = "https://files.pythonhosted.org/packages/2c/c1/6da0a9ddcc29d7e51ef14883fa3dc1e53b3f4ffba00582106c7bf55da1d8/greenlet-3.5.1-cp314-cp314t-manylinux_2_39_riscv64.whl", hash = "sha256:8d8a23250ea3ec7b36de8fa4b541e9e2db3ee82915cc060ab0631609ad8b28de", size = 488287, upload-time = "2026-05-20T14:01:43.143Z" },
{ url = "https://files.pythonhosted.org/packages/37/eb/147387705bb89092645b012586e7273cb5ed3c90ef7eaf3a69173eaf0209/greenlet-3.5.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:3bfbd69cc349e43bf3a8ae1c85548ff0718efc887615c2db16c3833d7b0b072d", size = 1614469, upload-time = "2026-05-20T14:02:30.192Z" },
{ url = "https://files.pythonhosted.org/packages/a6/4e/37ee0da7732b7aa9896f17e15579a9df34b9fcb9dd494f0adfa749af6623/greenlet-3.5.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:4378720dd888136c27215a0214d32a4d37c3852765d45bc37aad0623423cfd78", size = 1675115, upload-time = "2026-05-20T13:14:40.972Z" },
{ url = "https://files.pythonhosted.org/packages/57/f3/97dfcf4a6eb5077f8a672234216fb5923eb89f2cab7081cb10b2cf75b605/greenlet-3.5.1-cp314-cp314t-win_amd64.whl", hash = "sha256:45718441607f9325d948db98cbc691276059316d0358c188c246da4e1d4d23d2", size = 245246, upload-time = "2026-05-20T13:12:22.646Z" },
{ url = "https://files.pythonhosted.org/packages/5d/73/d7f72e34b582f694f4a9b248162db7b09cc458a259ba8f0c0bfa1a34ea7d/greenlet-3.5.1-cp315-cp315-macosx_11_0_universal2.whl", hash = "sha256:2baee5ca02031757ffe8cc3d69f0cc0aec7065ce362622da74f32d3bcab1c541", size = 285575, upload-time = "2026-05-20T13:12:07.043Z" },
{ url = "https://files.pythonhosted.org/packages/df/59/fa9c6e87dc8ad27a95dabe2f29f372b733d05a8a67470f6c901ed9975655/greenlet-3.5.1-cp315-cp315-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9b1ec3274918a81d3ea778b9e75b56b72b33f300edb6cf7f3a7fe1dae56683de", size = 656428, upload-time = "2026-05-20T14:00:12.556Z" },
{ url = "https://files.pythonhosted.org/packages/f6/f9/e753408871eaa61dfe35e619cfc67512b036fde99893685d50eea9e07146/greenlet-3.5.1-cp315-cp315-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:111e2390ffffc47d5840b01711dd7fac07d4c09283d0283e7f3264b14e284c64", size = 667064, upload-time = "2026-05-20T14:05:48.662Z" },
{ url = "https://files.pythonhosted.org/packages/dc/74/807a047255bf1e09303627c46dc043dca596b6958a354d904f32ab382005/greenlet-3.5.1-cp315-cp315-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:10a9a1c0bfbc93d41156ffcb90c75fbc05544054faf15dcc1fdf9765f8b607f0", size = 672962, upload-time = "2026-05-20T14:09:15.532Z" },
{ url = "https://files.pythonhosted.org/packages/96/27/5565b5b40389f1c7753003a07e21892fda8660926787036d5bc0308b8113/greenlet-3.5.1-cp315-cp315-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e630136e905fe5ff43e86945ae41220b6d1470956a39220e708110ac48d01ea5", size = 665697, upload-time = "2026-05-20T13:14:32.943Z" },
{ url = "https://files.pythonhosted.org/packages/76/32/19d4e13225193c29b13e308015223f7d75fd3d8623d49dd19040d2ce8ec1/greenlet-3.5.1-cp315-cp315-manylinux_2_39_riscv64.whl", hash = "sha256:ef08c1567c78074b22d1a200183d52d04a14df447bf70bcbb6a3507a48e776fc", size = 476047, upload-time = "2026-05-20T14:01:44.39Z" },
{ url = "https://files.pythonhosted.org/packages/cf/82/e7de4178c0c2d1c9a5a3be3cc0b33e46a85b3ee4a77c071bf7ad8600e079/greenlet-3.5.1-cp315-cp315-musllinux_1_2_aarch64.whl", hash = "sha256:975eac34b44a7077ca4d421348455b94f0f518246a7f14bc6d2fdcfe5b584368", size = 1621256, upload-time = "2026-05-20T14:02:31.91Z" },
{ url = "https://files.pythonhosted.org/packages/00/10/f2dddcf7dacac17dfc68691809589adad06135eb28930429cf58a6467a2f/greenlet-3.5.1-cp315-cp315-musllinux_1_2_x86_64.whl", hash = "sha256:9ab3c3a0b2ae6198e67c898dad5215a49f9ae0d0081b3c3ec59f333e39eeca26", size = 1685956, upload-time = "2026-05-20T13:14:42.55Z" },
{ url = "https://files.pythonhosted.org/packages/22/17/4a232b32133230ada52f70e9d7f5b65b0caef8772f01849bd8d149e7e4ca/greenlet-3.5.1-cp315-cp315-win_amd64.whl", hash = "sha256:cbfc69be86e10dcfef5b1e6269d1d6926552aa89ee39e1de3353360c1b6989ab", size = 239802, upload-time = "2026-05-20T13:13:15.481Z" },
{ url = "https://files.pythonhosted.org/packages/c2/ae/4e623a7e6d4d2a5f4cb8e4c82de4169fc637942caae68d6e676b8a128ac5/greenlet-3.5.1-cp315-cp315-win_arm64.whl", hash = "sha256:92fd6d44ac5e5a887c8a5dc4a8ba0ba908527c31c12f78c6bc7dcfe8aab279f6", size = 236853, upload-time = "2026-05-20T13:15:37.301Z" },
{ url = "https://files.pythonhosted.org/packages/7a/57/816d9cff29119da3505b3d6a5e14a8af89006ac36f47f891ff293ee05af1/greenlet-3.5.1-cp315-cp315t-macosx_11_0_universal2.whl", hash = "sha256:a6fdf2433a5441ef9a95464f7c3e674775da1c8c1177fff311cee1acad4626ed", size = 293877, upload-time = "2026-05-20T13:10:19.078Z" },
{ url = "https://files.pythonhosted.org/packages/23/a1/59b0a7c7d140ff1a75626680b9a9899b79a9176cab298b394968fb023295/greenlet-3.5.1-cp315-cp315t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7546556f0d649f99f6a361098a55f761181bb2ea12ff150bb16d26092ad88244", size = 655333, upload-time = "2026-05-20T14:00:14.758Z" },
{ url = "https://files.pythonhosted.org/packages/72/1b/5efe127597625042218939d01855109f352779050768b670b52edcc16a6c/greenlet-3.5.1-cp315-cp315t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d5ee3ea898009fa898f85f9982255d35278c477bebe185beca249cab42d4526c", size = 659443, upload-time = "2026-05-20T14:05:50.159Z" },
{ url = "https://files.pythonhosted.org/packages/c9/9d/1dcdf7b95ab3cf8c7b6d7277c18a5e167312f2b362ddfcc5d5e6d8d84b43/greenlet-3.5.1-cp315-cp315t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a57b0d05a0448eed231d59c0ceb287dde984551e54cbc51ac2d4865712838e9c", size = 659998, upload-time = "2026-05-20T14:09:16.912Z" },
{ url = "https://files.pythonhosted.org/packages/6c/6d/c404246ea4d22d097a7426d0efb5b781bd7eb67715f09e79001bd552ab18/greenlet-3.5.1-cp315-cp315t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a5c81f74d204d3edd136ebfd50dce53acbb776995d721a0fe801626cfc93b8cd", size = 658356, upload-time = "2026-05-20T13:14:35.091Z" },
{ url = "https://files.pythonhosted.org/packages/05/7e/c4959664fc231d587d66d8e81f2095e98056ba1954beafdcbe635e251052/greenlet-3.5.1-cp315-cp315t-manylinux_2_39_riscv64.whl", hash = "sha256:b0703c2cef53e01baec47f7a3868009913ad71ec678bbecb42a6f40895e4ce62", size = 494470, upload-time = "2026-05-20T14:01:45.611Z" },
{ url = "https://files.pythonhosted.org/packages/51/02/f8ee37fb6d2219329f350af241c27fcf12df57e723d11f6fc6d3bacdadaa/greenlet-3.5.1-cp315-cp315t-musllinux_1_2_aarch64.whl", hash = "sha256:2c18ef16bf6d4dd410e4dd52996888ea1497be26892fe5bbc73580aba4287b8e", size = 1619216, upload-time = "2026-05-20T14:02:33.403Z" },
{ url = "https://files.pythonhosted.org/packages/93/c5/3dc9475ace2c7a3680da12372cddd7f1ac874eb410a1ac48d3e9dab83782/greenlet-3.5.1-cp315-cp315t-musllinux_1_2_x86_64.whl", hash = "sha256:17d86354f0ae6b61bf9be5148d0dd34e06c3cb7c602c671f79f29ac3b150e659", size = 1678427, upload-time = "2026-05-20T13:14:43.71Z" },
{ url = "https://files.pythonhosted.org/packages/df/4e/750c15c317a41ffb36f0bf40b933e3d744a7dede61889f74443ea69690cf/greenlet-3.5.1-cp315-cp315t-win_amd64.whl", hash = "sha256:e7516cf6ae6b8a582c2770a0caed47b8a48373ed732c33d69a72913ae6ac923e", size = 245225, upload-time = "2026-05-20T13:13:59.366Z" },
{ url = "https://files.pythonhosted.org/packages/4f/fd/d3baea2eeb7b617efd47e87ca06e2ec2c6118d303aa9e918e0ce16eadc10/greenlet-3.5.1-cp315-cp315t-win_arm64.whl", hash = "sha256:5028648bf2253ec4745add746129d3904121fa7fe871a76bed23c5720573ce0a", size = 239590, upload-time = "2026-05-20T13:13:37.382Z" },
]
[[package]]
name = "iniconfig"
version = "2.3.0"
@ -43,6 +113,8 @@ name = "job-research"
version = "0.1.0"
source = { editable = "." }
dependencies = [
{ name = "beautifulsoup4" },
{ name = "playwright" },
{ name = "pydantic" },
{ name = "pypdf" },
{ name = "pyyaml" },
@ -56,6 +128,8 @@ dev = [
[package.metadata]
requires-dist = [
{ name = "beautifulsoup4", specifier = ">=4.12,<5" },
{ name = "playwright", specifier = ">=1.52,<2" },
{ name = "pydantic", specifier = ">=2.7,<3" },
{ name = "pypdf", specifier = ">=5.0,<6" },
{ name = "pyyaml", specifier = ">=6.0,<7" },
@ -95,6 +169,25 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/df/b2/87e62e8c3e2f4b32e5fe99e0b86d576da1312593b39f47d8ceef365e95ed/packaging-26.2-py3-none-any.whl", hash = "sha256:5fc45236b9446107ff2415ce77c807cee2862cb6fac22b8a73826d0693b0980e", size = 100195, upload-time = "2026-04-24T20:15:22.081Z" },
]
[[package]]
name = "playwright"
version = "1.60.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "greenlet" },
{ name = "pyee" },
]
wheels = [
{ url = "https://files.pythonhosted.org/packages/21/f0/832bd9677194908da118064eef20082f2791e3d18215cc6d9391ee2c5a67/playwright-1.60.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:6a8cd0fec171fb3089e95e898c8bc8a6f35dea0b78b399e12fcc19427e91b1d7", size = 43474635, upload-time = "2026-05-18T12:00:31.969Z" },
{ url = "https://files.pythonhosted.org/packages/59/7b/e1d32ae8a3ed937ec2be3721c5f728b13d731a0b7c6442e0b3bec5094ac0/playwright-1.60.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:39b5420ba6145045b69ced4c5c47d4d9fe5bddfc8ff816c518913afcb25ec7a5", size = 42261327, upload-time = "2026-05-18T12:00:35.638Z" },
{ url = "https://files.pythonhosted.org/packages/d7/bc/23de499ded6411c188a20c5a0dea6f0cd4ed5d2b3cc6042a5dbd3ed609aa/playwright-1.60.0-py3-none-macosx_11_0_universal2.whl", hash = "sha256:2581d0e6a3392c71f91b27460c7fd093356818dc430f48153896c8aeeaef7705", size = 43474636, upload-time = "2026-05-18T12:00:39.294Z" },
{ url = "https://files.pythonhosted.org/packages/22/7b/1d679f4fced4ea94efadd17103856d8c565384f68382a1681264e46f5925/playwright-1.60.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:1c2bfae7884fb3fb05b853290eab8f343d524e5016f2f1def702acbbdf14c93e", size = 47467220, upload-time = "2026-05-18T12:00:43.179Z" },
{ url = "https://files.pythonhosted.org/packages/84/c2/1528d267d4442bd2c6b8eaeab819dd52c2030bf80e89293f0ba1f687473b/playwright-1.60.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:43e66564125ee31b07a58cefb21e256d62d67d8d1713e6858df7a3019d8ed353", size = 47154856, upload-time = "2026-05-18T12:00:46.715Z" },
{ url = "https://files.pythonhosted.org/packages/bb/4e/b008b6440a7a1624378041da94829956d4b8f7ab9ef5aad22d0dc3f2e26d/playwright-1.60.0-py3-none-win32.whl", hash = "sha256:ec94e416ea320711e0ad4bf185dcbf41833672961e90773e1885255d7db7b7e7", size = 37902157, upload-time = "2026-05-18T12:00:50.374Z" },
{ url = "https://files.pythonhosted.org/packages/55/f0/0541524133104f9cc20bf900870ff4a736b76a23483f3a55295ddfa58409/playwright-1.60.0-py3-none-win_amd64.whl", hash = "sha256:9566821ce6030a1f9e7146a24e19355ab0d98805fd0f9be50bb3d8fef1750c02", size = 37902159, upload-time = "2026-05-18T12:00:53.728Z" },
{ url = "https://files.pythonhosted.org/packages/80/c8/210f282d278e4709cdd71b12a31af45a30a22ab3207b387e29b37e478713/playwright-1.60.0-py3-none-win_arm64.whl", hash = "sha256:6e4f6700a4c2250efff8e690a81d66e3855754fb587b6b87cf5c784014f91537", size = 34037981, upload-time = "2026-05-18T12:00:57.584Z" },
]
[[package]]
name = "pluggy"
version = "1.6.0"
@ -175,6 +268,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/f6/d2/42dd53d0a85c27606f316d3aa5d2869c4e8470a5ed6dec30e4a1abe19192/pydantic_core-2.46.4-cp314-cp314t-win_arm64.whl", hash = "sha256:4fcbe087dbc2068af7eda3aa87634eba216dbda64d1ae73c8684b621d33f6596", size = 2017325, upload-time = "2026-05-06T13:40:52.723Z" },
]
[[package]]
name = "pyee"
version = "13.0.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/8b/04/e7c1fe4dc78a6fdbfd6c337b1c3732ff543b8a397683ab38378447baa331/pyee-13.0.1.tar.gz", hash = "sha256:0b931f7c14535667ed4c7e0d531716368715e860b988770fc7eb8578d1f67fc8", size = 31655, upload-time = "2026-02-14T21:12:28.044Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a0/c4/b4d4827c93ef43c01f599ef31453ccc1c132b353284fc6c87d535c233129/pyee-13.0.1-py3-none-any.whl", hash = "sha256:af2f8fede4171ef667dfded53f96e2ed0d6e6bd7ee3bb46437f77e3b57689228", size = 15659, upload-time = "2026-02-14T21:12:26.263Z" },
]
[[package]]
name = "pygments"
version = "2.20.0"
@ -267,6 +372,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" },
]
[[package]]
name = "soupsieve"
version = "2.8.4"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/47/2c/0a5f6f8ee0d5589e48c7640213ed5175d52cf540a06725b628cc1a45d6ce/soupsieve-2.8.4.tar.gz", hash = "sha256:e121fd02e975c695e4e9e8774a5ee35d74714b59307868dcc5319ad2d9e3328e", size = 121110, upload-time = "2026-05-24T13:55:57.154Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/5e/f5/0c41cb68dcae6b7de4fac4188a3a9589e21fb31df21ea3a2e888db95e6c9/soupsieve-2.8.4-py3-none-any.whl", hash = "sha256:e7e6b0769c8f51ed59acab6e994b00621096cfb1c640a7509295987388fbaf65", size = 37304, upload-time = "2026-05-24T13:55:55.406Z" },
]
[[package]]
name = "typer"
version = "0.26.2"