Merge branch 'feature/apec-ingestion'

2026-06-05 18:01:17 +02:00 · 2026-06-05 18:01:17 +02:00 · b4182c9686
commit b4182c9686
parent cfbd1943ec e9db6b48d9
23 changed files with 4417 additions and 9 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,6 +9,8 @@ description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.13"
 dependencies = [
+    "beautifulsoup4>=4.12,<5",
+    "playwright>=1.52,<2",
    "pydantic>=2.7,<3",
    "pypdf>=5.0,<6",
    "pyyaml>=6.0,<7",
--- a/src/job_research/apec/__init__.py
+++ b/src/job_research/apec/__init__.py
@@ -0,0 +1,3 @@
+from job_research.apec.query_derivation import derive_apec_queries
+
+__all__ = ["derive_apec_queries"]
--- a/src/job_research/apec/adapter.py
+++ b/src/job_research/apec/adapter.py
@@ -0,0 +1,317 @@
+from __future__ import annotations
+
+import re
+from contextlib import contextmanager
+from dataclasses import dataclass
+from urllib.parse import parse_qsl, urlencode, urlparse, urlsplit, urlunsplit
+
+from playwright.sync_api import Error as PlaywrightError
+from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
+from playwright.sync_api import sync_playwright
+
+from job_research.models import ListingError
+
+
+# Base URL that _search_url() appends the encoded search parameters to.
+_SEARCH_URL = "https://www.apec.fr/candidat/recherche-emploi.html/emploi"
+# Value sent as the "lieux" parameter when the location filter is "France".
+_FRANCE_LOCATION_ID = "799"
+# Value sent as the "typesContrat" parameter when the contract filter is "CDI".
+_CDI_CONTRACT_ID = "101888"
+# Element search() waits for to decide that the search page has rendered.
+_SEARCH_INPUT_SELECTOR = 'input[name="keywords"]'
+# Anchors whose href points at a listing detail page.
+_RESULT_LINK_SELECTOR = "a[href*='/candidat/recherche-emploi.html/emploi/detail-offre/']"
+# A page URL containing this fragment together with "error=true" is treated as a zero-results page.
+_ZERO_RESULTS_URL_FRAGMENT = "/recherche-avancee"
+# Captures the path segment that follows "/detail-offre/" (up to the next "/", "?" or "#").
+_DETAIL_JOB_ID_PATTERN = re.compile(r"/detail-offre/([^/?#]+)")
+# Hostnames accepted by _is_public_apec_detail_url().
+_APEC_HOSTS = {"apec.fr", "www.apec.fr"}
+# Upper bound on the number of result pages visited for one query.
+_MAX_PAGES_PER_QUERY = 50
+# Pagination for a query stops after this many consecutive pages that add no new result.
+_MAX_CONSECUTIVE_NO_PROGRESS_PAGES = 10
+
+
+@dataclass(slots=True)
+class ApecSearchResult:
+    """One listing link collected from an Apec search results page."""
+
+    # href read from the result anchor on the search page.
+    url: str
+    # Identifier parsed from the "/detail-offre/<id>" URL segment, when one was found.
+    source_job_id: str | None = None
+
+
+@dataclass(slots=True)
+class ApecSearchFilters:
+    """Optional filters applied when building an Apec search URL."""
+
+    # Only the exact value "France" is translated into a URL parameter by _search_url().
+    location: str | None = None
+    # Only the exact value "CDI" is translated into a URL parameter by _search_url().
+    contract_type: str | None = None
+
+
+class ApecSearchError(RuntimeError):
+    """Raised by ApecAdapter.search when no query reached a usable search page."""
+
+    pass
+
+
+@contextmanager
+def _open_public_page():
+    with sync_playwright() as playwright:
+        browser = playwright.chromium.launch(headless=True)
+        try:
+            page = browser.new_page()
+            page.set_default_timeout(15_000)
+            yield page
+        finally:
+            browser.close()
+
+
+def _extract_source_job_id(url: str) -> str | None:
+    match = _DETAIL_JOB_ID_PATTERN.search(url)
+    if match is None:
+        return None
+
+    return match.group(1)
+
+
+def _search_results_url(base_url: str, page_number: int) -> str:
+    parsed_url = urlsplit(base_url)
+    params = parse_qsl(parsed_url.query, keep_blank_values=True)
+    filtered_params = [(key, value) for key, value in params if key != "page"]
+    filtered_params.append(("page", str(page_number)))
+    return urlunsplit(parsed_url._replace(query=urlencode(filtered_params, doseq=True)))
+
+
+def _search_url(query: str, search_filters: ApecSearchFilters, page_number: int = 0) -> str:
+    params = [
+        ("motsCles", query),
+        ("page", str(page_number)),
+    ]
+
+    if search_filters.location == "France":
+        params.insert(1, ("lieux", _FRANCE_LOCATION_ID))
+
+    if search_filters.contract_type == "CDI":
+        params.insert(2 if search_filters.location == "France" else 1, ("typesContrat", _CDI_CONTRACT_ID))
+
+    return f"{_SEARCH_URL}?{urlencode(params)}"
+
+
+def _accept_cookies_if_present(page) -> None:
+    try:
+        page.locator('input[name="cguAcceptees"]').check(timeout=2_000)
+    except (AttributeError, PlaywrightError, PlaywrightTimeoutError):
+        pass
+
+    for button_name in ("ACCEPTER", "Accepter tous les cookies"):
+        try:
+            page.get_by_role("button", name=button_name).click(timeout=2_000)
+            return
+        except (AttributeError, PlaywrightError, PlaywrightTimeoutError):
+            continue
+
+
+def _goto_and_wait(page, url: str) -> bool:
+    try:
+        page.goto(url, wait_until="domcontentloaded")
+        page.wait_for_load_state("domcontentloaded")
+    except Exception:
+        return False
+
+    return True
+
+
+def _is_public_apec_detail_url(url: str) -> bool:
+    parsed_url = urlparse(url)
+    return (
+        parsed_url.scheme == "https"
+        and parsed_url.hostname in _APEC_HOSTS
+        and re.fullmatch(r"/candidat/recherche-emploi\.html/emploi/detail-offre/[^/?#]+", parsed_url.path)
+        is not None
+    )
+
+
+class ApecAdapter:
+    """Playwright-based client for Apec search pages and listing detail pages.
+
+    ``search`` collects detail-page links for a list of queries and
+    ``fetch_listing_html`` returns the HTML of one detail page. Non-fatal
+    search problems are accumulated in ``search_errors``.
+    """
+
+    def __init__(self, max_listings: int = 50) -> None:
+        # Upper bound on the number of results returned by search().
+        self.max_listings = max_listings
+        # Replaced with a fresh list at the start of every search() call.
+        self.search_errors: list[ListingError] = []
+        # Non-None only while browser_session() is active.
+        self._browser_context = None
+
+    @contextmanager
+    def browser_session(self):
+        """Keep one browser context open so pages opened inside the block share it.
+
+        Re-entrant: when a session is already active the block simply runs
+        inside the existing one.
+        """
+        if self._browser_context is not None:
+            yield
+            return
+
+        with sync_playwright() as playwright:
+            browser = playwright.chromium.launch(headless=True)
+            browser_context = browser.new_context()
+            self._browser_context = browser_context
+            try:
+                yield
+            finally:
+                # Clear the reference first so later calls fall back to one-off browsers.
+                self._browser_context = None
+                browser.close()
+
+    @contextmanager
+    def _open_page(self):
+        """Yield a page from the active session, or from a one-off browser if none is active."""
+        if self._browser_context is None:
+            with _open_public_page() as page:
+                yield page
+            return
+
+        page = self._browser_context.new_page()
+        page.set_default_timeout(15_000)
+        try:
+            yield page
+        finally:
+            page.close()
+
+    def _record_search_error(
+        self,
+        query: str,
+        search_filters: ApecSearchFilters,
+        message: str,
+        *,
+        url: str | None = None,
+    ) -> None:
+        """Append a "search"-stage ListingError; *url* defaults to the query's first-page URL."""
+        self.search_errors.append(
+            ListingError(url=url or _search_url(query, search_filters), stage="search", message=message)
+        )
+
+    @staticmethod
+    def _is_zero_results_page(page) -> bool:
+        """Return True when the page URL contains both the zero-results fragment and "error=true"."""
+        return _ZERO_RESULTS_URL_FRAGMENT in page.url and "error=true" in page.url
+
+    def search(self, queries: list[str], search_filters: ApecSearchFilters) -> list[ApecSearchResult]:
+        """Collect up to ``max_listings`` unique listing links across *queries*.
+
+        Results are de-duplicated by source job id (or by href when no id can
+        be parsed). Per-query failures are recorded in ``search_errors`` and
+        do not abort the run.
+
+        Raises:
+            ApecSearchError: if no query reached a usable search page
+                (a zero-results page counts as usable).
+        """
+        results: list[ApecSearchResult] = []
+        seen_keys: set[str] = set()
+        usable_search_page_seen = False
+        self.search_errors = []
+
+        with self._open_page() as page:
+            for query in queries:
+                # Blank queries are skipped silently.
+                if not query.strip():
+                    continue
+
+                if len(results) >= self.max_listings:
+                    break
+
+                if not _goto_and_wait(page, _search_url(query, search_filters)):
+                    self._record_search_error(query, search_filters, "search page navigation failed")
+                    continue
+
+                _accept_cookies_if_present(page)
+
+                try:
+                    page.wait_for_selector(_SEARCH_INPUT_SELECTOR, timeout=5_000)
+                except PlaywrightTimeoutError:
+                    self._record_search_error(query, search_filters, "search input did not render")
+                    continue
+
+                # A zero-results page proves the site is reachable but yields nothing to collect.
+                if self._is_zero_results_page(page):
+                    usable_search_page_seen = True
+                    continue
+
+                try:
+                    page.wait_for_selector(_RESULT_LINK_SELECTOR, timeout=5_000)
+                except PlaywrightTimeoutError:
+                    self._record_search_error(query, search_filters, "search results did not render")
+                    continue
+
+                usable_search_page_seen = True
+
+                # URL of the first results page; later pages are derived from it.
+                result_page_url = page.url
+                seen_page_urls: set[str] = {result_page_url}
+                no_progress_pages = 0
+
+                for page_number in range(_MAX_PAGES_PER_QUERY):
+                    if len(results) >= self.max_listings:
+                        break
+
+                    # Page 0 is already loaded; only later pages need a navigation.
+                    if page_number > 0:
+                        next_page_url = _search_results_url(result_page_url, page_number)
+                        if next_page_url in seen_page_urls:
+                            break
+
+                        if not _goto_and_wait(page, next_page_url):
+                            self._record_search_error(
+                                query,
+                                search_filters,
+                                f"page {page_number} navigation failed",
+                                url=next_page_url,
+                            )
+                            break
+
+                        try:
+                            page.wait_for_selector(_RESULT_LINK_SELECTOR, timeout=5_000)
+                        except PlaywrightTimeoutError:
+                            self._record_search_error(
+                                query,
+                                search_filters,
+                                f"page {page_number} results did not render",
+                                url=next_page_url,
+                            )
+                            break
+
+                    # Landing on an already-visited URL ends pagination for this query.
+                    current_page_url = page.url
+                    if page_number > 0 and current_page_url in seen_page_urls:
+                        break
+
+                    seen_page_urls.add(current_page_url)
+
+                    try:
+                        hrefs = page.locator(_RESULT_LINK_SELECTOR).evaluate_all(
+                            "nodes => nodes.map(node => node.href)"
+                        )
+                    except Exception:
+                        self._record_search_error(
+                            query,
+                            search_filters,
+                            f"page {page_number} result links could not be evaluated",
+                            url=current_page_url,
+                        )
+                        break
+
+                    if not hrefs:
+                        no_progress_pages += 1
+                        if no_progress_pages >= _MAX_CONSECUTIVE_NO_PROGRESS_PAGES:
+                            break
+
+                        continue
+
+                    added_any_result = False
+                    for href in hrefs:
+                        source_job_id = _extract_source_job_id(href)
+                        # Prefer the job id as the identity; fall back to the raw href.
+                        dedupe_key = source_job_id or href
+                        if dedupe_key in seen_keys:
+                            continue
+
+                        seen_keys.add(dedupe_key)
+                        results.append(ApecSearchResult(url=href, source_job_id=source_job_id))
+                        added_any_result = True
+
+                        if len(results) >= self.max_listings:
+                            break
+
+                    # Only pages that contributed something new reset the no-progress counter.
+                    if added_any_result:
+                        no_progress_pages = 0
+                    else:
+                        no_progress_pages += 1
+                        if no_progress_pages >= _MAX_CONSECUTIVE_NO_PROGRESS_PAGES:
+                            break
+
+        if not usable_search_page_seen:
+            raise ApecSearchError("Apec search page was not reachable for any query")
+
+        return results
+
+    def fetch_listing_html(self, url: str) -> str:
+        """Return the HTML of the listing detail page at *url*.
+
+        Waits (up to 15 seconds) until a title, the ``.ref-offre`` element and
+        the ``.details-offer-list`` element are present in the document.
+
+        Raises:
+            ValueError: if *url*, or the URL the page ended up on, is not a
+                public Apec detail URL.
+        """
+        if not _is_public_apec_detail_url(url):
+            raise ValueError("ApecAdapter only fetches public Apec URLs")
+
+        with self._open_page() as page:
+            page.goto(url, wait_until="domcontentloaded")
+            _accept_cookies_if_present(page)
+            page.wait_for_function(
+                """
+                () => {
+                    const title = document.querySelector('.container-details-offer h1, h1');
+                    const reference = document.querySelector('.ref-offre');
+                    const offerList = document.querySelector('.details-offer-list');
+                    return !!title && !!reference && !!offerList;
+                }
+                """,
+                polling=1000,
+                timeout=15_000,
+            )
+            # Re-validate after navigation in case the page was redirected elsewhere.
+            final_url = page.url
+            if not _is_public_apec_detail_url(final_url):
+                raise ValueError(f"ApecAdapter landed on an unexpected URL after redirects: {final_url}")
+            return page.content()
--- a/src/job_research/apec/dedupe.py
+++ b/src/job_research/apec/dedupe.py
@@ -0,0 +1,92 @@
+from job_research.models import ApecListing
+
+
+# Listing attributes that _merge_listing_metadata() copies from a duplicate
+# into the survivor when the survivor's own value is None.
+_MERGEABLE_FIELDS = (
+    "source_job_id",
+    "title",
+    "company",
+    "location",
+    "contract_type",
+    "description_text",
+    "published_at",
+    "refreshed_at",
+)
+
+
+def _merge_listing_metadata(survivor: ApecListing, source: ApecListing) -> None:
+    for field_name in _MERGEABLE_FIELDS:
+        if getattr(survivor, field_name) is None:
+            value = getattr(source, field_name)
+            if value is not None:
+                setattr(survivor, field_name, value)
+
+    for warning in source.warnings:
+        if warning not in survivor.warnings:
+            survivor.warnings.append(warning)
+
+
+def _register_listing(
+    url_to_listing: dict[str, ApecListing],
+    source_job_id_to_listing: dict[str, ApecListing],
+    listing: ApecListing,
+    survivor: ApecListing,
+) -> None:
+    url_to_listing[listing.url] = survivor
+
+    if listing.source_job_id is not None:
+        source_job_id_to_listing[listing.source_job_id] = survivor
+
+
+def _repoint_listing_aliases(
+    url_to_listing: dict[str, ApecListing],
+    source_job_id_to_listing: dict[str, ApecListing],
+    removed: ApecListing,
+    survivor: ApecListing,
+) -> None:
+    for mapping in (url_to_listing, source_job_id_to_listing):
+        for alias, listing in list(mapping.items()):
+            if listing is removed:
+                mapping[alias] = survivor
+
+
+def dedupe_apec_listings(listings: list[ApecListing]) -> list[ApecListing]:
+    url_to_listing: dict[str, ApecListing] = {}
+    source_job_id_to_listing: dict[str, ApecListing] = {}
+    survivor_order: dict[int, int] = {}
+    next_order = 0
+    deduped: list[ApecListing] = []
+
+    for listing in listings:
+        source_job_id = listing.source_job_id
+        matches: list[ApecListing] = []
+
+        url_match = url_to_listing.get(listing.url)
+        if url_match is not None:
+            matches.append(url_match)
+
+        if source_job_id is not None:
+            source_job_id_match = source_job_id_to_listing.get(source_job_id)
+            if source_job_id_match is not None and source_job_id_match not in matches:
+                matches.append(source_job_id_match)
+
+        if not matches:
+            deduped.append(listing)
+            survivor_order[id(listing)] = next_order
+            next_order += 1
+            _register_listing(url_to_listing, source_job_id_to_listing, listing, listing)
+            continue
+
+        survivor = min(matches, key=lambda candidate: survivor_order[id(candidate)])
+        for other in matches:
+            if other is survivor:
+                continue
+
+            _merge_listing_metadata(survivor, other)
+            deduped[:] = [item for item in deduped if item is not other]
+            _repoint_listing_aliases(url_to_listing, source_job_id_to_listing, other, survivor)
+            survivor_order.pop(id(other), None)
+
+        _merge_listing_metadata(survivor, listing)
+        _register_listing(url_to_listing, source_job_id_to_listing, listing, survivor)
+
+    return deduped
--- a/src/job_research/apec/normalize.py
+++ b/src/job_research/apec/normalize.py
@@ -0,0 +1,328 @@
+from __future__ import annotations
+
+import re
+import unicodedata
+from datetime import datetime
+
+from bs4 import BeautifulSoup
+from bs4.element import NavigableString
+
+from job_research.models import ApecListing, ListingWarning
+
+
+# "Publiée le dd/mm/yyyy" (accented or unaccented "e"); group 1 is the date.
+_PUBLISHED_AT_PATTERN = re.compile(r"Publi[ée]e le (\d{2}/\d{2}/\d{4})")
+# "Actualisée le dd/mm/yyyy" (accented or unaccented "e"); group 1 is the date.
+_REFRESHED_AT_PATTERN = re.compile(r"Actualis[ée]e le (\d{2}/\d{2}/\d{4})")
+# Captures the path segment that follows "/detail-offre/" in a listing URL.
+_DETAIL_JOB_ID_PATTERN = re.compile(r"/detail-offre/([^/?#]+)")
+# "Ref. Apec : <ID>" where the id is made of upper-case letters and digits.
+_SOURCE_JOB_ID_PATTERN = re.compile(r"Ref\. Apec\s*:\s*([A-Z0-9]+)")
+# Contract labels recognised when the contract item has no <span>.
+_CONTRACT_PATTERN = re.compile(r"\b(CDI|CDD|Alternance|Intérim|Stage|Freelance|Indépendant)\b")
+# Tag names treated as section boundaries when extracting section text.
+_HEADING_TAG_NAMES = {"h1", "h2", "h3", "h4", "h5", "h6"}
+# Tokens (in the form produced by _text_token) that count as "no real value".
+_PLACEHOLDER_TEXT_TOKENS = {
+    "na",
+    "nr",
+    "none",
+    "null",
+    "unknown",
+    "tbd",
+    "nonrenseigne",
+    "nonrenseignee",
+    "nondisponible",
+}
+
+
+def _clean_text(value: str | None) -> str | None:
+    if value is None:
+        return None
+
+    cleaned = " ".join(value.split())
+    return cleaned or None
+
+
+def _text_token(value: str) -> str:
+    normalized = unicodedata.normalize("NFKD", value)
+    return re.sub(r"[^a-z0-9]+", "", normalized.casefold())
+
+
+def _has_useful_text(value: str | None) -> bool:
+    cleaned = _clean_text(value)
+    if cleaned is None:
+        return False
+
+    token = _text_token(cleaned)
+    return bool(token) and token not in _PLACEHOLDER_TEXT_TOKENS
+
+
+def _text_before_heading(node) -> str | None:
+    if isinstance(node, NavigableString):
+        return _clean_text(str(node))
+
+    if getattr(node, "name", None) in _HEADING_TAG_NAMES:
+        return None
+
+    pieces: list[str] = []
+    for child in getattr(node, "children", []):
+        if getattr(child, "name", None) in _HEADING_TAG_NAMES:
+            break
+
+        text = _text_before_heading(child)
+        cleaned = _clean_text(text)
+        if cleaned:
+            pieces.append(cleaned)
+
+    return _clean_text(" ".join(pieces))
+
+
+def _extract_section_text(block, label: str) -> str | None:
+    heading = block.find(lambda tag: getattr(tag, "name", None) in _HEADING_TAG_NAMES and _clean_text(tag.get_text(" ", strip=True)) == label)
+    if heading is None:
+        return None
+
+    pieces: list[str] = []
+    for sibling in heading.next_siblings:
+        if getattr(sibling, "name", None) in _HEADING_TAG_NAMES:
+            break
+
+        text = _text_before_heading(sibling)
+        if text:
+            pieces.append(text)
+
+    return _clean_text(" ".join(pieces))
+
+
+def _detail_block_text(soup: BeautifulSoup, label: str) -> str | None:
+    for block in soup.select(".details-post"):
+        if block.find("h4") is None:
+            continue
+
+        extracted = _extract_section_text(block, label)
+        if extracted is not None:
+            return extracted
+
+    return None
+
+
+def _warning(field: str, message: str) -> ListingWarning:
+    return ListingWarning(field=field, message=message)
+
+
+def _extract_source_job_id_from_url(url: str) -> str | None:
+    match = _DETAIL_JOB_ID_PATTERN.search(url)
+    if match is None:
+        return None
+
+    return match.group(1)
+
+
+def _extract_listing_date(
+    soup: BeautifulSoup,
+    pattern: re.Pattern[str],
+    *,
+    field: str,
+    missing_message: str | None = None,
+    invalid_message: str,
+    warnings: list[ListingWarning],
+    warn_on_missing: bool,
+) -> str | None:
+    card_offer = soup.select_one(".card-offer")
+    if card_offer is None:
+        if warn_on_missing and missing_message is not None:
+            warnings.append(_warning(field, missing_message))
+        return None
+
+    match = pattern.search(card_offer.get_text(" ", strip=True))
+    if match is None:
+        if warn_on_missing and missing_message is not None:
+            warnings.append(_warning(field, missing_message))
+        return None
+
+    try:
+        return datetime.strptime(match.group(1), "%d/%m/%Y").date().isoformat()
+    except ValueError:
+        warnings.append(_warning(field, invalid_message))
+        return None
+
+
+def _extract_source_job_id(soup: BeautifulSoup) -> str | None:
+    ref = soup.select_one(".ref-offre")
+    if ref is None:
+        return None
+
+    match = _SOURCE_JOB_ID_PATTERN.search(ref.get_text(" ", strip=True))
+    if match is None:
+        return None
+
+    return match.group(1)
+
+
+def _extract_contract_type(details_offer_list) -> str | None:
+    contract_item = details_offer_list.select_one("li:nth-of-type(2)")
+    if contract_item is None:
+        return None
+
+    span = contract_item.find("span")
+    if span is not None:
+        return _clean_text(span.get_text(" ", strip=True))
+
+    match = _CONTRACT_PATTERN.search(contract_item.get_text(" ", strip=True))
+    if match is None:
+        return None
+
+    return match.group(1)
+
+
+def _extract_company(soup: BeautifulSoup, details_offer_list) -> str | None:
+    for selector in (".card-ents .ents-name", ".card-ents-quote"):
+        company = soup.select_one(selector)
+        if company is not None:
+            text = _clean_text(company.get_text(" ", strip=True))
+            if text is not None:
+                return text
+
+    if details_offer_list is not None:
+        company = details_offer_list.select_one("li:first-of-type")
+        if company is not None:
+            text = _clean_text(company.get_text(" ", strip=True))
+            if text is not None:
+                return text
+
+    return None
+
+
+def normalize_apec_listing(
+    url: str,
+    html: str,
+    fetched_at: str,
+    *,
+    source_job_id: str | None = None,
+    published_at: str | None = None,
+    refreshed_at: str | None = None,
+) -> ApecListing:
+    """Parse one Apec listing detail page into an ApecListing.
+
+    Every field that is missing, recovered through a fallback, or holds only
+    placeholder text produces a ListingWarning on the returned listing; the
+    function itself does not raise for missing content.
+
+    Args:
+        url: Detail page URL; stored on the listing and used as an id fallback.
+        html: Page HTML to parse.
+        fetched_at: Stored unchanged on the listing.
+        source_job_id: Id already known to the caller, if any.
+        published_at: Takes precedence over the date parsed from the page when truthy.
+        refreshed_at: Takes precedence over the date parsed from the page when truthy.
+    """
+    soup = BeautifulSoup(html, "html.parser")
+    warnings: list[ListingWarning] = []
+
+    # Title: prefer the h1 inside the offer container, fall back to any h1.
+    title = soup.select_one(".container-details-offer h1")
+    if title is None:
+        title = soup.find("h1")
+        if title is not None:
+            warnings.append(_warning("title", "Recovered title from generic h1 fallback"))
+        else:
+            warnings.append(_warning("title", "Title missing from Apec listing"))
+    title_text = _clean_text(title.get_text(" ", strip=True)) if title is not None else None
+    if title is not None and not _has_useful_text(title_text):
+        warnings.append(_warning("title", "Title is empty or placeholder text"))
+
+    details_offer_list = soup.select_one(".details-offer-list")
+
+    # Location is read from the third list item, contract type from the second.
+    location = None
+    contract_type = None
+    if details_offer_list is not None:
+        location_item = details_offer_list.select_one("li:nth-of-type(3)")
+        if location_item is not None:
+            location = _clean_text(location_item.get_text(" ", strip=True))
+            if not _has_useful_text(location):
+                warnings.append(_warning("location", "Location is empty or placeholder text"))
+        else:
+            warnings.append(_warning("location", "Location missing from details-offer list"))
+
+        contract_item = details_offer_list.select_one("li:nth-of-type(2)")
+        if contract_item is None:
+            warnings.append(_warning("contract_type", "Contract type missing from details-offer list"))
+        else:
+            span = contract_item.find("span")
+            if span is not None:
+                contract_type = _clean_text(span.get_text(" ", strip=True))
+                if not _has_useful_text(contract_type):
+                    warnings.append(_warning("contract_type", "Contract type is empty or placeholder text"))
+            else:
+                # No span: look for a known contract label in the item text.
+                match = _CONTRACT_PATTERN.search(contract_item.get_text(" ", strip=True))
+                if match is not None:
+                    contract_type = match.group(1)
+                    warnings.append(_warning("contract_type", "Recovered contract type from text fallback"))
+                else:
+                    warnings.append(_warning("contract_type", "Contract type missing from details-offer list"))
+    else:
+        warnings.append(_warning("location", "Location missing from Apec listing"))
+        warnings.append(_warning("contract_type", "Contract type missing from Apec listing"))
+
+    description_text = _detail_block_text(soup, "Descriptif du poste")
+    if description_text is None:
+        warnings.append(_warning("description_text", "Description missing from Apec listing"))
+    elif not _has_useful_text(description_text):
+        warnings.append(_warning("description_text", "Description is empty or placeholder text"))
+        description_text = None
+
+    # Two page-derived id candidates: one from the URL path, one from the ".ref-offre" text.
+    requested_source_job_id = _extract_source_job_id_from_url(url)
+    ref_source_job_id = _extract_source_job_id(soup)
+
+    if source_job_id is not None:
+        # NOTE(review): the mismatch check compares the URL id with the ref-offre id,
+        # not the caller-supplied source_job_id - confirm this is intended.
+        if (
+            requested_source_job_id is not None
+            and ref_source_job_id is not None
+            and requested_source_job_id != ref_source_job_id
+        ):
+            warnings.append(_warning("source_job_id", "Recovered source job id from ref-offre fallback"))
+            normalized_source_job_id = ref_source_job_id
+        else:
+            normalized_source_job_id = source_job_id
+    else:
+        # No caller-supplied id: prefer the ref-offre id, then the URL id.
+        if ref_source_job_id is None:
+            if requested_source_job_id is None:
+                warnings.append(_warning("source_job_id", "Source job id missing from Apec listing"))
+                normalized_source_job_id = None
+            else:
+                warnings.append(_warning("source_job_id", "Recovered source job id from detail URL fallback"))
+                normalized_source_job_id = requested_source_job_id
+        else:
+            warnings.append(_warning("source_job_id", "Recovered source job id from ref-offre fallback"))
+            normalized_source_job_id = ref_source_job_id
+
+    # NOTE(review): the fallbacks run only when the primary element is absent; a present
+    # but empty ".card-ents .ents-name" ends up as "Company missing" - confirm intended.
+    company = soup.select_one(".card-ents .ents-name")
+    if company is None:
+        for selector, warning_message in (
+            (".card-ents-quote", "Recovered company from .card-ents-quote fallback"),
+            (".details-offer-list li:first-of-type", "Recovered company from details-offer-list fallback"),
+        ):
+            company = soup.select_one(selector)
+            if company is not None:
+                warnings.append(_warning("company", warning_message))
+                break
+
+    company_text = _clean_text(company.get_text(" ", strip=True)) if company is not None else None
+    if company_text is None:
+        warnings.append(_warning("company", "Company missing from Apec listing"))
+    elif not _has_useful_text(company_text):
+        warnings.append(_warning("company", "Company is empty or placeholder text"))
+        company_text = None
+
+    # A truthy caller-supplied date short-circuits page parsing (and its warnings).
+    published_at_value = published_at or _extract_listing_date(
+        soup,
+        _PUBLISHED_AT_PATTERN,
+        field="published_at",
+        missing_message="Published date missing from Apec listing",
+        invalid_message="Published date is invalid",
+        warnings=warnings,
+        warn_on_missing=True,
+    )
+
+    # A missing refreshed date is not reported; only an invalid one is.
+    refreshed_at_value = refreshed_at or _extract_listing_date(
+        soup,
+        _REFRESHED_AT_PATTERN,
+        field="refreshed_at",
+        invalid_message="Refreshed date is invalid",
+        warnings=warnings,
+        warn_on_missing=False,
+    )
+
+    # Placeholder-only title, location and contract type are stored as None.
+    return ApecListing(
+        source="apec",
+        source_job_id=normalized_source_job_id,
+        url=url,
+        title=title_text if _has_useful_text(title_text) else None,
+        company=company_text,
+        location=location if _has_useful_text(location) else None,
+        contract_type=contract_type if _has_useful_text(contract_type) else None,
+        description_text=description_text,
+        published_at=published_at_value,
+        refreshed_at=refreshed_at_value,
+        fetched_at=fetched_at,
+        warnings=warnings,
+    )
--- a/src/job_research/apec/query_derivation.py
+++ b/src/job_research/apec/query_derivation.py
@@ -0,0 +1,63 @@
+from job_research.apec.adapter import ApecSearchFilters
+from job_research.models import CandidateProfileOutput
+
+
+def _normalize_term(raw_term: str) -> str:
+    return " ".join(raw_term.split())
+
+
+def _normalize_constraint(raw_term: str) -> str:
+    return _normalize_term(raw_term).casefold()
+
+
+def derive_apec_search_filters(profile: CandidateProfileOutput) -> ApecSearchFilters:
+    normalized_constraints = {_normalize_constraint(constraint) for constraint in profile.constraints}
+
+    return ApecSearchFilters(
+        location="France" if "france only" in normalized_constraints else None,
+        contract_type="CDI" if "cdi only" in normalized_constraints else None,
+    )
+
+
+def derive_apec_queries(profile: CandidateProfileOutput) -> list[str]:
+    queries: list[str] = []
+    seen: set[str] = set()
+
+    def add_query(raw_query: str) -> None:
+        query = _normalize_term(raw_query)
+        if not query or query in seen or len(queries) == 5:
+            return
+
+        seen.add(query)
+        queries.append(query)
+
+    unique_roles: list[str] = []
+    for target_role in profile.target_roles:
+        query = " ".join(target_role.split())
+        if not query or query in unique_roles:
+            continue
+
+        unique_roles.append(query)
+
+    support_terms = [_normalize_term(term) for term in profile.strengths]
+    support_terms.extend(_normalize_term(term) for term in profile.skills_to_emphasize)
+    support_terms = [term for term in support_terms if term]
+
+    for target_role in unique_roles:
+        add_query(target_role)
+        if len(queries) == 5:
+            return queries
+
+    if unique_roles:
+        primary_role = unique_roles[0]
+        for term in support_terms:
+            add_query(f"{primary_role} {term}")
+            if len(queries) == 5:
+                break
+    else:
+        for term in support_terms:
+            add_query(term)
+            if len(queries) == 5:
+                break
+
+    return queries
--- a/src/job_research/cli.py
+++ b/src/job_research/cli.py
@@ -1,15 +1,79 @@
+from contextlib import nullcontext
+from datetime import datetime, timezone
+import re
 from pathlib import Path
+from typing import Any
+from urllib.parse import unquote, urlparse

 import typer
+import yaml
+from pydantic import ValidationError

+from job_research.apec.adapter import ApecAdapter, ApecSearchFilters
+from job_research.apec.dedupe import dedupe_apec_listings
+from job_research.apec.normalize import normalize_apec_listing
+from job_research.apec.query_derivation import derive_apec_queries, derive_apec_search_filters
+from job_research.models import ApecRunMeta, ApecSnapshotMeta, CandidateProfileOutput, ListingError
 from job_research.profile.cv_extractor import extract_cv_signals, extract_pdf_text
 from job_research.profile.merge import build_candidate_profile_output
-from job_research.profile.profile_parser import parse_profile_markdown
-from job_research.storage import save_candidate_profile_yaml
+from job_research.profile.profile_parser import AuthoredProfile, parse_profile_markdown
+from job_research.storage import apec_run_paths, load_yaml, save_candidate_profile_yaml

 app = typer.Typer(help="Build one canonical candidate profile YAML")


+def _utc_now() -> datetime:
+    return datetime.now(timezone.utc)
+
+
+def _snapshot_stem(url: str, source_job_id: str | None) -> str:
+    if source_job_id:
+        return source_job_id
+
+    parsed_url = urlparse(url)
+    fallback = parsed_url.path.rstrip("/").rsplit("/", 1)[-1] or parsed_url.netloc or "listing"
+    if parsed_url.query:
+        fallback = f"{fallback}-{parsed_url.query}"
+
+    stem = re.sub(r"[^A-Za-z0-9]+", "-", unquote(fallback)).strip("-")
+    return stem or "listing"
+
+
+def _write_yaml(path: Path, payload: Any) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(yaml.safe_dump(payload, sort_keys=False, allow_unicode=True), encoding="utf-8")
+
+
+def _load_candidate_profile(profile_path: Path) -> CandidateProfileOutput:
+    try:
+        return CandidateProfileOutput.model_validate(load_yaml(profile_path))
+    except FileNotFoundError as exc:
+        raise ValueError(f"candidate-profile.yaml not found at {profile_path}") from exc
+    except (OSError, UnicodeDecodeError) as exc:
+        raise ValueError(f"candidate-profile.yaml not readable at {profile_path}: {exc}") from exc
+    except (yaml.YAMLError, ValidationError, ValueError) as exc:
+        raise ValueError(f"invalid candidate-profile.yaml at {profile_path}: {exc}") from exc
+
+
+def _load_cv_text(cv: Path) -> str:
+    try:
+        cv_text = extract_pdf_text(cv) if cv.suffix.lower() == ".pdf" else cv.read_text(encoding="utf-8")
+    except Exception as exc:  # pragma: no cover - defensive boundary
+        raise ValueError(f"CV input not readable at {cv}: {exc}") from exc
+
+    if not cv_text.strip():
+        raise ValueError("No readable text found in CV input")
+
+    return cv_text
+
+
+def _load_authored_profile(profile: Path) -> AuthoredProfile:
+    try:
+        return parse_profile_markdown(profile.read_text(encoding="utf-8"))
+    except Exception as exc:  # pragma: no cover - defensive boundary
+        raise ValueError(f"profile markdown invalid at {profile}: {exc}") from exc
+
+
@app.callback()
 def main_command() -> None:
    pass
@ -41,15 +105,21 @@ def build_profile(
 ) -> None:
    """Build candidate-profile.yaml from CV and markdown profile."""

-    cv_text = extract_pdf_text(cv) if cv.suffix.lower() == ".pdf" else cv.read_text(encoding="utf-8")
-    if not cv_text.strip():
-        raise ValueError("No readable text found in CV input")
+    try:
+        cv_text = _load_cv_text(cv)
+        authored_profile = _load_authored_profile(profile)
+    except ValueError as exc:
+        typer.echo(str(exc), err=True)
+        raise typer.Exit(code=1)

-    authored_profile = parse_profile_markdown(profile.read_text(encoding="utf-8"))
    cv_signals = extract_cv_signals(cv_text)
    candidate_profile = build_candidate_profile_output(cv_signals, authored_profile)

-    save_candidate_profile_yaml(out, candidate_profile)
+    try:
+        save_candidate_profile_yaml(out, candidate_profile)
+    except OSError as exc:
+        typer.echo(f"Unable to write candidate profile to {out}: {exc}", err=True)
+        raise typer.Exit(code=1)

    typer.echo(f"candidate profile written to {out}")
    warning_count = len(candidate_profile.warnings)
@ -59,6 +129,147 @@ def build_profile(
        typer.echo("No warnings included.")


+@app.command("fetch-apec")
+def fetch_apec(
+    data_root: Path = typer.Option(
+        Path("data"),
+        "--data-root",
+        file_okay=False,
+        dir_okay=True,
+        help="Directory containing candidate-profile.yaml and Apec run artifacts.",
+    ),
+    ) -> None:
+    """Fetch, normalize, dedupe, and persist Apec listings."""
+
+    # Flow: load the candidate profile, derive queries and filters, run the
+    # search, fetch + snapshot + normalize each result, then write
+    # listings.yaml and run-meta.yaml under the run directory. Every fatal
+    # problem is reported on stderr and exits with code 1.
+    profile_path = data_root / "candidate-profile.yaml"
+    try:
+        profile = _load_candidate_profile(profile_path)
+    except ValueError as exc:
+        typer.echo(str(exc), err=True)
+        raise typer.Exit(code=1)
+
+    derived_queries = derive_apec_queries(profile)
+
+    if not derived_queries:
+        typer.echo("No usable Apec queries derived from candidate profile", err=True)
+        raise typer.Exit(code=1)
+
+    # Fall back to France / CDI when the profile yields no location or contract type.
+    derived_search_filters = derive_apec_search_filters(profile)
+    search_filters = ApecSearchFilters(
+        location=derived_search_filters.location or "France",
+        contract_type=derived_search_filters.contract_type or "CDI",
+    )
+
+    # run_id keeps microseconds and avoids ":" because it becomes a directory
+    # name (see apec_run_paths); run_started_at is the second-resolution form.
+    current = _utc_now().astimezone(timezone.utc)
+    run_id = current.strftime("%Y-%m-%dT%H-%M-%S-%fZ")
+    run_started_at = current.replace(microsecond=0).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+    # The slice caps the run at 50 results regardless of what the adapter returns.
+    adapter = ApecAdapter(max_listings=50)
+    try:
+        search_results = adapter.search(derived_queries, search_filters=search_filters)[:50]
+    except Exception as exc:  # pragma: no cover - defensive boundary
+        typer.echo(f"Unable to fetch Apec search results: {exc}", err=True)
+        raise typer.Exit(code=1)
+
+    paths = apec_run_paths(data_root, run_id)
+    try:
+        paths["snapshots"].mkdir(parents=True, exist_ok=True)
+    except OSError as exc:  # pragma: no cover - defensive boundary
+        typer.echo(f"Unable to create Apec snapshot directory: {exc}", err=True)
+        raise typer.Exit(code=1)
+
+    normalized_listings = []
+    # Start from errors the adapter recorded during search (if it exposes any),
+    # so they end up in run-meta alongside per-listing failures.
+    listing_errors: list[ListingError] = list(getattr(adapter, "search_errors", []))
+    snapshot_metadata: list[ApecSnapshotMeta] = []
+    fetched_count = 0
+    successful_fetch_count = 0
+
+    # Use the adapter's browser_session() context when it provides one;
+    # otherwise run the loop inside a no-op context.
+    browser_session = getattr(adapter, "browser_session", None)
+    session_context = browser_session() if callable(browser_session) else nullcontext()
+
+    with session_context:
+        for result in search_results:
+            # fetched_count counts attempts, not successes.
+            fetched_count += 1
+            fetched_at = _utc_now().astimezone(timezone.utc).replace(microsecond=0).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+            try:
+                html = adapter.fetch_listing_html(result.url)
+            except Exception as exc:  # pragma: no cover - defensive boundary
+                listing_errors.append(ListingError(url=result.url, stage="fetch_html", message=str(exc)))
+                continue
+
+            successful_fetch_count += 1
+
+            snapshot_path = paths["snapshots"] / f"{_snapshot_stem(result.url, result.source_job_id)}.html"
+            snapshot_meta = ApecSnapshotMeta(
+                url=result.url,
+                source_job_id=result.source_job_id,
+                snapshot_file=None,
+                fetched_at=fetched_at,
+            )
+
+            # A failed snapshot write is recorded but does not stop
+            # normalization; snapshot_file then stays None in the metadata.
+            try:
+                snapshot_path.write_text(html, encoding="utf-8")
+            except Exception as exc:  # pragma: no cover - defensive boundary
+                listing_errors.append(ListingError(url=result.url, stage="snapshot_write", message=str(exc)))
+            else:
+                snapshot_meta.snapshot_file = snapshot_path.name
+
+            snapshot_metadata.append(snapshot_meta)
+
+            try:
+                listing = normalize_apec_listing(
+                    url=result.url,
+                    html=html,
+                    fetched_at=fetched_at,
+                    source_job_id=result.source_job_id,
+                )
+            except Exception as exc:  # pragma: no cover - defensive boundary
+                listing_errors.append(ListingError(url=result.url, stage="normalize", message=str(exc)))
+                continue
+
+            normalized_listings.append(listing)
+
+    # Abort (without writing artifacts) only when there were results and
+    # not a single one could be fetched.
+    if search_results and successful_fetch_count == 0:
+        typer.echo("No listings could be fetched or normalized from Apec", err=True)
+        raise typer.Exit(code=1)
+
+    deduplicated_listings = dedupe_apec_listings(normalized_listings)
+    # Counts distinct URLs with at least one recorded error, including the
+    # search-stage errors seeded above.
+    failed_count = len({error.url for error in listing_errors})
+    run_meta = ApecRunMeta(
+        run_id=run_id,
+        run_started_at=run_started_at,
+        derived_queries=derived_queries,
+        snapshots=snapshot_metadata,
+        fetched_count=fetched_count,
+        normalized_count=len(normalized_listings),
+        deduplicated_count=len(deduplicated_listings),
+        failed_count=failed_count,
+        listing_errors=listing_errors,
+    )
+
+    # Attempt both artifact writes before reporting, so one failure does not
+    # prevent the other file from being written.
+    artifact_write_errors: list[str] = []
+
+    try:
+        _write_yaml(paths["listings"], [listing.model_dump(mode="json") for listing in deduplicated_listings])
+    except OSError as exc:  # pragma: no cover - defensive boundary
+        artifact_write_errors.append(f"listings.yaml: {exc}")
+
+    try:
+        _write_yaml(paths["run_meta"], run_meta.model_dump(mode="json"))
+    except OSError as exc:  # pragma: no cover - defensive boundary
+        artifact_write_errors.append(f"run-meta.yaml: {exc}")
+
+    if artifact_write_errors:
+        typer.echo(f"Unable to write Apec run artifacts: {'; '.join(artifact_write_errors)}", err=True)
+        raise typer.Exit(code=1)
+
+    # One-line summary of the run on stdout.
+    typer.echo(
+        f"query={len(derived_queries)} fetched={fetched_count} normalized={len(normalized_listings)} "
+        f"deduplicated={len(deduplicated_listings)} failed={failed_count}"
+    )
+
+
 def main() -> None:
    app()

--- a/src/job_research/models.py
+++ b/src/job_research/models.py
@ -21,6 +21,51 @@ class WarningItem(BaseModel):
    message: str


+# Non-fatal note about one field of a listing; collected in ApecListing.warnings.
+class ListingWarning(BaseModel):
+    field: str  # presumably the name of the affected listing field - confirm against the normalizer
+    message: str
+
+
+# Failure recorded for one URL during an Apec run; stored in ApecRunMeta.listing_errors.
+class ListingError(BaseModel):
+    url: str
+    # Pipeline step that failed. The fetch-apec command emits "fetch_html",
+    # "snapshot_write" and "normalize"; the adapter tests expect "search".
+    stage: str
+    message: str
+
+
+# Bookkeeping for one fetched listing page and its saved HTML snapshot.
+class ApecSnapshotMeta(BaseModel):
+    url: str
+    source_job_id: str | None = None
+    # File name inside the run's snapshots directory; stays None when the
+    # snapshot could not be written.
+    snapshot_file: str | None = None
+    # Timestamp string; fetch-apec formats it as %Y-%m-%dT%H:%M:%SZ in UTC.
+    fetched_at: str
+
+
+# Normalized listing record; fetch-apec serializes these into listings.yaml.
+# Only source, url and fetched_at are required; every extracted field may be None.
+class ApecListing(BaseModel):
+    source: str
+    source_job_id: str | None = None
+    url: str
+    title: str | None = None
+    company: str | None = None
+    location: str | None = None
+    contract_type: str | None = None
+    description_text: str | None = None
+    published_at: str | None = None
+    refreshed_at: str | None = None
+    fetched_at: str
+    warnings: list[ListingWarning] = Field(default_factory=list)
+
+
+# Summary of one fetch-apec run; serialized into run-meta.yaml.
+class ApecRunMeta(BaseModel):
+    run_id: str  # also used as the run directory name
+    run_started_at: str
+    derived_queries: list[str] = Field(default_factory=list)
+    snapshots: list[ApecSnapshotMeta] = Field(default_factory=list)
+    fetched_count: int = 0  # fetch attempts, including failed ones
+    normalized_count: int = 0  # listings normalized before deduplication
+    deduplicated_count: int = 0  # listings remaining after deduplication
+    failed_count: int = 0  # distinct URLs with at least one recorded error
+    listing_errors: list[ListingError] = Field(default_factory=list)
+
+
 class CandidateProfileOutput(BaseModel):
    name: str | None = None
    summary: str | None = None
--- a/src/job_research/profile/cv_extractor.py
+++ b/src/job_research/profile/cv_extractor.py
@ -5,6 +5,8 @@ from pathlib import Path

 from pypdf import PdfReader

+from job_research.models import WarningItem
+

 EXPERIENCE_LINE_CONNECTORS = (" at ", " chez ", " au ", " à ")

@ -150,6 +152,15 @@ YEARS_OF_EXPERIENCE_PATTERNS = (
    re.compile(r"^ann[ée]es d['’]exp[ée]rience\s*:\s*(\d+)\s*$", re.IGNORECASE),
 )

+# Patterns that make a candidate "name" line suspicious, in order:
+# document-type words, separator/contact characters (| / @), and common
+# job-title keywords. Consulted by _looks_like_low_confidence_name.
+LOW_CONFIDENCE_NAME_PATTERNS = (
+    re.compile(r"\b(cv|resume|curriculum vitae|profile)\b", re.IGNORECASE),
+    re.compile(r"[|/@]"),
+    re.compile(
+        r"\b(data engineer|software engineer|developer|analyst|scientist|consultant|architect|manager|product owner|backend|frontend|full stack)\b",
+        re.IGNORECASE,
+    ),
+)
+

 def extract_pdf_text(path: Path) -> str:
    reader = PdfReader(str(path))
@ -168,8 +179,17 @@ def extract_pdf_text(path: Path) -> str:
 def extract_cv_signals(text: str) -> dict[str, object]:
    lines = [_normalize_line(line) for line in text.splitlines()]
    non_empty_lines = [line for line in lines if line]
+    warnings: list[WarningItem] = []

    name = non_empty_lines[0] if non_empty_lines else None
+    if name is not None and _looks_like_low_confidence_name(name):
+        warnings.append(
+            WarningItem(
+                field="name",
+                message="First CV line looks like a header or tagline; review manually.",
+            )
+        )
+
    location = None
    languages: list[str] = []
    skills: list[str] = []
@ -247,6 +267,7 @@ def extract_cv_signals(text: str) -> dict[str, object]:
        "skills": skills,
        "experience_entries": experience_entries,
        "education_entries": education_entries,
+        "warnings": warnings,
    }

    if years_of_experience is not None:
@ -369,3 +390,14 @@ def _looks_like_experience_title(title: str) -> bool:

 def _looks_like_prose_company(company: str) -> bool:
    return any(pattern.search(company) for pattern in EXPERIENCE_PROSE_COMPANY_PATTERNS)
+
+
+def _looks_like_low_confidence_name(name: str) -> bool:
+    """Return True when ``name`` is unlikely to be a person's name.
+
+    A line is flagged when it is blank, has more than four words, or matches
+    any of ``LOW_CONFIDENCE_NAME_PATTERNS`` after whitespace is collapsed.
+    """
+    words = name.split()
+    if not words or len(words) > 4:
+        return True
+
+    collapsed = " ".join(words)
+    for pattern in LOW_CONFIDENCE_NAME_PATTERNS:
+        if pattern.search(collapsed):
+            return True
+
+    return False
--- a/src/job_research/profile/merge.py
+++ b/src/job_research/profile/merge.py
@ -33,6 +33,7 @@ def build_candidate_profile_output(
    warnings: list[WarningItem] = []

    _append_years_of_experience_warning(cv_signals, authored.notes, warnings)
+    _append_cv_extraction_warnings(cv_signals, warnings)
    _append_missing_cv_fact_warnings(cv_signals, warnings)

    merged_skills: list[str] = []
@ -99,6 +100,15 @@ def _append_missing_cv_fact_warnings(
        warnings.append(WarningItem(field=field, message=message))


+def _append_cv_extraction_warnings(
+    cv_signals: dict[str, object], warnings: list[WarningItem]
+) -> None:
+    """Append the warnings carried in ``cv_signals["warnings"]`` to ``warnings``.
+
+    Entries that are not already ``WarningItem`` instances are validated into
+    one; a missing or empty ``"warnings"`` entry is a no-op.
+    """
+    extracted = cv_signals.get("warnings")
+    if not extracted:
+        return
+
+    for candidate in extracted:
+        if not isinstance(candidate, WarningItem):
+            candidate = WarningItem.model_validate(candidate)
+        warnings.append(candidate)
+
+
 def _note_years_of_experience(note: str) -> int | None:
    normalized = note.casefold().replace("’", "'")
    if not any(marker in normalized for marker in EXPERIENCE_NOTE_MARKERS):
--- a/src/job_research/profile/profile_parser.py
+++ b/src/job_research/profile/profile_parser.py
@ -17,14 +17,15 @@ class AuthoredProfile:
    notes: list[str] = field(default_factory=list)


-REQUIRED_SECTIONS = {
+REQUIRED_SECTION_NAMES = (
    "summary",
    "target roles",
    "strengths",
    "skills to emphasize",
    "constraints",
    "notes",
-}
+)
+REQUIRED_SECTIONS = set(REQUIRED_SECTION_NAMES)


 def parse_profile_markdown(markdown: str) -> AuthoredProfile:
@ -45,6 +46,10 @@ def parse_profile_markdown(markdown: str) -> AuthoredProfile:
        missing_text = ", ".join(sorted(missing))
        raise ValueError(f"Missing required markdown sections: {missing_text}")

+    for section_name in REQUIRED_SECTION_NAMES:
+        if not _has_usable_section_content(sections[section_name]):
+            raise ValueError(f"Missing usable content in section '{section_name}'")
+
    return AuthoredProfile(
        summary=" ".join(sections["summary"]),
        target_roles=_parse_list_section("target roles", sections["target roles"]),
@ -64,6 +69,8 @@ def _parse_list_section(section_name: str, lines: list[str]) -> list[str]:
        item = _strip_list_marker(line)
        if item is None:
            raise ValueError(f"Unsupported content in section '{section_name}': {line}")
+        if not item:
+            raise ValueError(f"Missing usable content in section '{section_name}'")
        items.append(item)

    return items
@ -74,6 +81,8 @@ def _parse_notes_section(lines: list[str]) -> list[str]:

    for line in lines:
        item = _strip_list_marker(line)
+        if item == "":
+            raise ValueError("Missing usable content in section 'notes'")
        notes.append(item if item is not None else line)

    return notes
@ -81,7 +90,13 @@ def _parse_notes_section(lines: list[str]) -> list[str]:

 def _strip_list_marker(line: str) -> str | None:
+    # Returns the item text after a list marker, "" when the line is only a
+    # bare marker, and None when the line starts with no marker at all.
     for marker in LIST_MARKERS:
+        if line == marker.strip():
+            return ""
         if line.startswith(marker):
             return line[len(marker):].strip()
 
     return None
+
+
+def _has_usable_section_content(lines: list[str]) -> bool:
+    """Return True when at least one line is something other than a bare list marker."""
+    bare_markers = ("-", "*", "+")
+    for line in lines:
+        if line not in bare_markers:
+            return True
+    return False
--- a/src/job_research/storage.py
+++ b/src/job_research/storage.py
@ -21,3 +21,13 @@ def load_yaml(path: Path) -> dict[str, Any]:
        raise ValueError("candidate-profile YAML root must be a mapping")

    return dict(payload)
+
+
+def apec_run_paths(data_root: Path, run_id: str) -> dict[str, Path]:
+    """Return the artifact paths for the Apec run ``run_id`` under ``data_root``.
+
+    Keys: ``run_dir``, ``listings``, ``run_meta`` and ``snapshots``. Nothing
+    is created on disk.
+    """
+    base = data_root.joinpath("apec", "runs", run_id)
+    layout: dict[str, Path] = {"run_dir": base}
+    layout["listings"] = base / "listings.yaml"
+    layout["run_meta"] = base / "run-meta.yaml"
+    layout["snapshots"] = base / "snapshots"
+    return layout
--- a/tests/apec/test_adapter.py
+++ b/tests/apec/test_adapter.py
@ -0,0 +1,701 @@
+from contextlib import contextmanager
+from urllib.parse import parse_qs, quote_plus, urlparse
+
+import pytest
+from playwright.sync_api import Error as PlaywrightError
+from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
+
+from job_research.apec import adapter as adapter_module
+from job_research.apec.adapter import ApecAdapter, ApecSearchFilters
+
+
+# Local copies of the selectors the fakes react to; the values match the
+# adapter module's private constants of the same names.
+_RESULT_LINK_SELECTOR = "a[href*='/candidat/recherche-emploi.html/emploi/detail-offre/']"
+_SEARCH_INPUT_SELECTOR = 'input[name="keywords"]'
+
+
+class _FakeResultButton:
+    """Fake button that mutates the owning fake page when clicked.
+
+    "Rechercher" points the page at the first results page for the current
+    query; the two consent button names mark consent as accepted. Any other
+    name is a no-op.
+    """
+
+    def __init__(self, page, name: str) -> None:
+        self.page = page
+        self.name = name
+
+    def click(self, timeout: int | None = None) -> None:
+        page = self.page
+        label = self.name
+
+        if label == "Rechercher":
+            encoded_query = quote_plus(page.current_query)
+            page.url = (
+                "https://www.apec.fr/candidat/recherche-emploi.html/emploi"
+                f"?motsCles={encoded_query}&page=0"
+            )
+            page.current_page = 0
+            return
+
+        if label in ("ACCEPTER", "Accepter tous les cookies"):
+            page.consent_button_clicks.append(label)
+            page.consent_accepted = True
+
+
+class _FakeLocator:
+    """Fake locator bound to a fake page and a selector.
+
+    ``fill`` stores the typed query on the page, ``check`` only succeeds for
+    the CGU checkbox, and ``evaluate_all`` returns the page's current result
+    links for the result-link selector.
+    """
+
+    def __init__(self, page, selector: str) -> None:
+        self.page = page
+        self.selector = selector
+
+    def fill(self, value: str) -> None:
+        self.page.current_query = value
+
+    def check(self, timeout: int | None = None) -> None:
+        if self.selector != 'input[name="cguAcceptees"]':
+            raise PlaywrightTimeoutError(f"selector not found: {self.selector}")
+
+        self.page.cgu_checkbox_checked = True
+
+    def evaluate_all(self, function: str):
+        if self.selector != _RESULT_LINK_SELECTOR:
+            return []
+
+        return list(self.page.current_results())
+
+
+# Stateful stand-in for a browser page, used for both search pages and
+# listing detail pages. result_pages maps query -> page index -> result URLs.
+class _FakeDetailPage:
+    def __init__(
+        self,
+        result_pages: dict[str, dict[int, list[str]]],
+        *,
+        rendered_html: str = "<html>rendered</html>",
+        search_ready: bool = True,
+        zero_result_queries: set[str] | None = None,
+        consent_required: bool = False,
+    ) -> None:
+        self.result_pages = result_pages
+        self.rendered_html = rendered_html
+        self.shell_html = "<html>shell</html>"
+        self.waited_functions: list[tuple[str, int | None]] = []
+        self.search_ready = search_ready
+        self.zero_result_queries = zero_result_queries or set()
+        self.consent_required = consent_required
+        self.cgu_checkbox_checked = False
+        self.consent_button_clicks: list[str] = []
+        # Consent counts as already accepted unless the test asks for the pop-in.
+        self.consent_accepted = not consent_required
+        self.goto_urls: list[str] = []
+        self.current_query = ""
+        self.current_page = 0
+        self.url = ""
+        self.rendered = False
+        self.default_timeout: int | None = None
+        self.closed = False
+
+    def goto(self, url: str, wait_until: str | None = None) -> None:
+        # Records the navigation and derives current_query / current_page from
+        # the motsCles and page query parameters when present.
+        self.goto_urls.append(url)
+        self.url = url
+
+        parsed_url = urlparse(url)
+        params = parse_qs(parsed_url.query)
+        if "motsCles" in params:
+            self.current_query = params["motsCles"][0]
+        if "page" in params:
+            self.current_page = int(params["page"][0])
+
+        # Zero-result queries end up on a /recherche-avancee URL instead of
+        # the requested search URL (detail pages are exempt).
+        if self.current_query in self.zero_result_queries and "/detail-offre/" not in parsed_url.path:
+            self.url = (
+                f"{parsed_url.scheme}://{parsed_url.netloc}"
+                f"{parsed_url.path}/recherche-avancee?{parsed_url.query}&error=true"
+            )
+
+        # A freshly opened detail page serves the shell HTML until
+        # wait_for_function marks it rendered.
+        if "/detail-offre/" in parsed_url.path:
+            self.rendered = False
+
+    def wait_for_load_state(self, state: str) -> None:
+        return None
+
+    def set_default_timeout(self, timeout: int) -> None:
+        self.default_timeout = timeout
+
+    def wait_for_function(self, function: str, polling: int | None = None, timeout: int | None = None) -> None:
+        # Times out while required consent is pending; otherwise logs the
+        # call and flips the page to its rendered HTML.
+        if self.consent_required and not self.consent_accepted:
+            raise PlaywrightTimeoutError("consent not accepted")
+
+        self.waited_functions.append((function, polling))
+        self.rendered = True
+        return None
+
+    def wait_for_selector(self, selector: str, timeout: int | None = None) -> None:
+        # The search input appears only when the page is ready and consent is
+        # accepted; result links appear only when the current page has results.
+        if selector == _SEARCH_INPUT_SELECTOR:
+            if self.search_ready and self.consent_accepted:
+                return None
+
+            raise PlaywrightTimeoutError(f"selector not found: {selector}")
+
+        if selector == _RESULT_LINK_SELECTOR and self.current_results():
+            return None
+
+        raise PlaywrightTimeoutError(f"selector not found: {selector}")
+
+    def get_by_role(self, role: str, name: str):
+        return _FakeResultButton(self, name)
+
+    def locator(self, selector: str):
+        return _FakeLocator(self, selector)
+
+    def content(self) -> str:
+        return self.rendered_html if self.rendered else self.shell_html
+
+    def current_results(self) -> list[str]:
+        # Result URLs for the current query and page index; [] when unknown.
+        return self.result_pages.get(self.current_query, {}).get(self.current_page, [])
+
+    def close(self) -> None:
+        self.closed = True
+
+
+# Replacement for adapter_module._open_public_page in these tests: yields the
+# supplied fake page and performs no setup or teardown.
+@contextmanager
+def _fake_open_public_page(page: _FakeDetailPage):
+    yield page
+
+
+def test_search_continues_past_duplicate_only_pages(monkeypatch) -> None:
+    # "beta" page 0 only repeats the listing already found for "alpha"; the
+    # search must still reach beta page 1 and return listing 222. Also checks
+    # that the France / CDI filters appear as lieux=799 and typesContrat=101888.
+    first_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"
+    second_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222?motsCles=beta&page=1&selectedIndex=0"
+    page = _FakeDetailPage(
+        {
+            "alpha": {0: [first_result], 1: []},
+            "beta": {0: [first_result], 1: [second_result], 2: []},
+        }
+    )
+
+    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(page))
+
+    results = ApecAdapter(max_listings=10).search(
+        ["alpha", "beta"],
+        search_filters=ApecSearchFilters(location="France", contract_type="CDI"),
+    )
+
+    assert [result.url for result in results] == [first_result, second_result]
+    assert [result.source_job_id for result in results] == ["111", "222"]
+    assert "motsCles=alpha" in page.goto_urls[0]
+    assert "lieux=799" in page.goto_urls[0]
+    assert "typesContrat=101888" in page.goto_urls[0]
+    assert any(
+        "motsCles=beta" in url and "lieux=799" in url and "typesContrat=101888" in url and "page=1" in url
+        for url in page.goto_urls
+    )
+    assert any("page=1" in url for url in page.goto_urls)
+
+
+def test_search_continues_past_duplicate_only_pages_until_a_later_hit(monkeypatch) -> None:
+    # "alpha" pages 1 and 2 only repeat listing 111; the new listing 222 is
+    # on page 3, so pagination has to keep going past the duplicate pages.
+    first_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"
+    second_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222?motsCles=beta&page=3&selectedIndex=0"
+    page = _FakeDetailPage(
+        {
+            "alpha": {0: [first_result], 1: [first_result], 2: [first_result], 3: [second_result], 4: []},
+            "beta": {0: [first_result], 1: []},
+        }
+    )
+
+    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(page))
+
+    results = ApecAdapter(max_listings=10).search(
+        ["alpha", "beta"],
+        search_filters=ApecSearchFilters(location="France", contract_type="CDI"),
+    )
+
+    assert [result.url for result in results] == [first_result, second_result]
+    assert any("page=1" in url for url in page.goto_urls)
+    assert any("page=3" in url for url in page.goto_urls)
+
+
+def test_search_continues_after_query_and_pagination_navigation_failures(monkeypatch) -> None:
+    # The very first navigation raises and loading page 1 raises as well;
+    # the search is still expected to return the page-0 listing for "beta".
+    first_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=beta&page=0&selectedIndex=0"
+    second_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222?motsCles=beta&page=1&selectedIndex=0"
+    page = _FakeDetailPage(
+        {
+            "beta": {0: [first_result], 1: [second_result]},
+        }
+    )
+
+    original_goto = page.goto
+    goto_calls = 0
+
+    # Fails only on the first goto call, then delegates to the fake.
+    def flaky_goto(url: str, wait_until: str | None = None) -> None:
+        nonlocal goto_calls
+
+        goto_calls += 1
+        if goto_calls == 1:
+            raise RuntimeError("navigation boom")
+
+        original_goto(url, wait_until=wait_until)
+
+    original_wait_for_load_state = page.wait_for_load_state
+
+    # Fails whenever the fake page is on result page 1.
+    def flaky_wait_for_load_state(state: str) -> None:
+        if page.current_page == 1:
+            raise RuntimeError("load boom")
+
+        original_wait_for_load_state(state)
+
+    monkeypatch.setattr(page, "goto", flaky_goto)
+    monkeypatch.setattr(page, "wait_for_load_state", flaky_wait_for_load_state)
+    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(page))
+
+    results = ApecAdapter(max_listings=10).search(
+        ["alpha", "beta"],
+        search_filters=ApecSearchFilters(location="France", contract_type="CDI"),
+    )
+
+    assert [result.url for result in results] == [first_result]
+    assert [result.source_job_id for result in results] == ["111"]
+
+
+def test_search_stops_after_max_page_count(monkeypatch) -> None:
+    # With _MAX_PAGES_PER_QUERY patched to 3, pages 0-2 are collected and
+    # page 3 must never be requested, even though it has a new listing.
+    page = _FakeDetailPage(
+        {
+            "alpha": {
+                0: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"],
+                1: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222?motsCles=alpha&page=1&selectedIndex=0"],
+                2: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/333?motsCles=alpha&page=2&selectedIndex=0"],
+                3: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/444?motsCles=alpha&page=3&selectedIndex=0"],
+            }
+        }
+    )
+
+    original_goto = page.goto
+
+    # Trips an assertion as soon as the fake lands on page 3 or beyond.
+    def bounded_goto(url: str, wait_until: str | None = None) -> None:
+        original_goto(url, wait_until=wait_until)
+        if page.current_page >= 3:
+            raise AssertionError("pagination should stop before page 3")
+
+    monkeypatch.setattr(page, "goto", bounded_goto)
+    monkeypatch.setattr(adapter_module, "_MAX_PAGES_PER_QUERY", 3)
+    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(page))
+
+    results = ApecAdapter(max_listings=10).search(
+        ["alpha"],
+        search_filters=ApecSearchFilters(location="France", contract_type="CDI"),
+    )
+
+    assert [result.source_job_id for result in results] == ["111", "222", "333"]
+    assert not any("page=3" in url for url in page.goto_urls)
+
+
+def test_search_stops_after_consecutive_no_progress_pages(monkeypatch) -> None:
+    # With _MAX_CONSECUTIVE_NO_PROGRESS_PAGES patched to 2, the two pages
+    # that only repeat listing 111 end pagination before page 3 is requested.
+    page = _FakeDetailPage(
+        {
+            "alpha": {
+                0: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"],
+                1: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=1&selectedIndex=0"],
+                2: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=2&selectedIndex=0"],
+                3: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222?motsCles=alpha&page=3&selectedIndex=0"],
+            }
+        }
+    )
+
+    original_goto = page.goto
+
+    # Trips an assertion as soon as the fake lands on page 3 or beyond.
+    def bounded_goto(url: str, wait_until: str | None = None) -> None:
+        original_goto(url, wait_until=wait_until)
+        if page.current_page >= 3:
+            raise AssertionError("pagination should stop before page 3")
+
+    monkeypatch.setattr(page, "goto", bounded_goto)
+    monkeypatch.setattr(adapter_module, "_MAX_CONSECUTIVE_NO_PROGRESS_PAGES", 2)
+    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(page))
+
+    results = ApecAdapter(max_listings=10).search(
+        ["alpha"],
+        search_filters=ApecSearchFilters(location="France", contract_type="CDI"),
+    )
+
+    assert [result.source_job_id for result in results] == ["111"]
+    assert not any("page=3" in url for url in page.goto_urls)
+
+
+def test_search_stops_when_result_page_url_repeats(monkeypatch) -> None:
+    # Simulates a site that bounces later pages back to the first result
+    # page URL: page 1 is requested once, page 2 never.
+    first_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"
+    page = _FakeDetailPage(
+        {
+            "alpha": {0: [first_result], 1: [first_result], 2: [first_result]},
+        }
+    )
+
+    original_goto = page.goto
+    initial_result_page_url: str | None = None
+
+    # Remembers the page-0 URL and rewrites page.url to it on later pages.
+    def looping_goto(url: str, wait_until: str | None = None) -> None:
+        nonlocal initial_result_page_url
+
+        original_goto(url, wait_until=wait_until)
+
+        if initial_result_page_url is None and page.current_page == 0:
+            initial_result_page_url = page.url
+        elif initial_result_page_url is not None and page.current_page > 0:
+            page.url = initial_result_page_url
+
+    monkeypatch.setattr(page, "goto", looping_goto)
+    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(page))
+
+    results = ApecAdapter(max_listings=10).search(
+        ["alpha"],
+        search_filters=ApecSearchFilters(location="France", contract_type="CDI"),
+    )
+
+    assert [result.url for result in results] == [first_result]
+    assert any("page=1" in url for url in page.goto_urls)
+    assert not any("page=2" in url for url in page.goto_urls)
+
+
+def test_search_raises_when_every_query_fails_to_load_a_search_page(monkeypatch) -> None:
+    # search_ready=False makes the fake time out on the search input for
+    # every query, which must surface as ApecSearchError.
+    page = _FakeDetailPage({"alpha": {0: []}}, search_ready=False)
+    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(page))
+
+    with pytest.raises(adapter_module.ApecSearchError):
+        ApecAdapter(max_listings=10).search(
+            ["alpha", "beta"],
+            search_filters=ApecSearchFilters(location="France", contract_type="CDI"),
+        )
+
+
+def test_search_treats_zero_results_redirect_as_usable_and_records_other_failures(monkeypatch) -> None:
+    # "alpha" lands on the /recherche-avancee URL (a legitimate empty result);
+    # "beta" shows no results without that redirect and is the only query
+    # recorded as a search error.
+    page = _FakeDetailPage(
+        {"alpha": {0: []}, "beta": {0: []}},
+        zero_result_queries={"alpha"},
+    )
+    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(page))
+
+    adapter = ApecAdapter(max_listings=10)
+    results = adapter.search(
+        ["alpha", "beta"],
+        search_filters=ApecSearchFilters(location="France", contract_type="CDI"),
+    )
+
+    assert results == []
+    assert [error.stage for error in adapter.search_errors] == ["search"]
+    assert "beta" in adapter.search_errors[0].url
+
+
+def test_search_raises_when_every_query_renders_broken_search_shell(monkeypatch) -> None:
+    # The search input is available but neither query yields result links or
+    # a zero-results redirect: one error per query, then ApecSearchError.
+    page = _FakeDetailPage({"alpha": {0: []}, "beta": {0: []}}, search_ready=True)
+    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(page))
+
+    adapter = ApecAdapter(max_listings=10)
+
+    with pytest.raises(adapter_module.ApecSearchError):
+        adapter.search(
+            ["alpha", "beta"],
+            search_filters=ApecSearchFilters(location="France", contract_type="CDI"),
+        )
+
+    assert [error.stage for error in adapter.search_errors] == ["search", "search"]
+
+
+def test_search_accepts_current_cgu_popin_before_waiting_for_results(monkeypatch) -> None:
+    # With consent_required=True the fake hides the search input until consent
+    # is accepted; the adapter must tick the CGU checkbox and click "ACCEPTER".
+    first_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"
+    page = _FakeDetailPage({"alpha": {0: [first_result]}}, consent_required=True)
+    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(page))
+
+    results = ApecAdapter(max_listings=10).search(
+        ["alpha"],
+        search_filters=ApecSearchFilters(location="France", contract_type="CDI"),
+    )
+
+    assert [result.url for result in results] == [first_result]
+    assert page.cgu_checkbox_checked is True
+    assert page.consent_button_clicks == ["ACCEPTER"]
+    assert page.consent_accepted is True
+
+
+def test_search_ignores_unexpected_consent_widget_playwright_errors(monkeypatch) -> None:
+    # Every button returned by get_by_role raises PlaywrightError on click;
+    # the search must still return the listing.
+    first_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"
+    page = _FakeDetailPage({"alpha": {0: [first_result]}})
+
+    class _ExplodingConsentButton:
+        def click(self, timeout: int | None = None) -> None:
+            raise PlaywrightError("consent widget boom")
+
+    def exploding_get_by_role(role: str, name: str):
+        return _ExplodingConsentButton()
+
+    monkeypatch.setattr(page, "get_by_role", exploding_get_by_role)
+    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(page))
+
+    results = ApecAdapter(max_listings=10).search(
+        ["alpha"],
+        search_filters=ApecSearchFilters(location="France", contract_type="CDI"),
+    )
+
+    assert [result.url for result in results] == [first_result]
+    assert page.cgu_checkbox_checked is True
+
+
+def test_search_records_pagination_navigation_failures(monkeypatch) -> None:
+    # Navigation to page 1 raises: the page-0 listing is kept and exactly one
+    # search error is recorded for the page=1 URL with a fixed message.
+    first_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"
+    page = _FakeDetailPage(
+        {
+            "alpha": {
+                0: [first_result],
+                1: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222?motsCles=alpha&page=1&selectedIndex=0"],
+            },
+        }
+    )
+
+    original_goto = page.goto
+
+    # Fails only for URLs that request page=1.
+    def flaky_goto(url: str, wait_until: str | None = None) -> None:
+        if "page=1" in url:
+            raise RuntimeError("navigation boom")
+
+        original_goto(url, wait_until=wait_until)
+
+    monkeypatch.setattr(page, "goto", flaky_goto)
+    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(page))
+
+    adapter = ApecAdapter(max_listings=10)
+    results = adapter.search(
+        ["alpha"],
+        search_filters=ApecSearchFilters(location="France", contract_type="CDI"),
+    )
+
+    assert [result.url for result in results] == [first_result]
+    assert [error.stage for error in adapter.search_errors] == ["search"]
+    assert "page=1" in adapter.search_errors[0].url
+    assert adapter.search_errors[0].message == "page 1 navigation failed"
+
+
+def test_search_records_pagination_render_failures(monkeypatch) -> None:
+    """A result-selector timeout on page 1 keeps the page-0 result and records one search error."""
+    page_zero_url = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"
+    page_one_url = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222?motsCles=alpha&page=1&selectedIndex=0"
+    fake_page = _FakeDetailPage({"alpha": {0: [page_zero_url], 1: [page_one_url]}})
+    real_wait_for_selector = fake_page.wait_for_selector
+
+    def timing_out_wait_for_selector(selector: str, timeout: int | None = None) -> None:
+        # Time out only when waiting for result links while the fake is on page 1.
+        if selector == _RESULT_LINK_SELECTOR and fake_page.current_page == 1:
+            raise PlaywrightTimeoutError(f"selector not found: {selector}")
+
+        real_wait_for_selector(selector, timeout=timeout)
+
+    monkeypatch.setattr(fake_page, "wait_for_selector", timing_out_wait_for_selector)
+    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))
+    filters = ApecSearchFilters(location="France", contract_type="CDI")
+
+    apec = ApecAdapter(max_listings=10)
+    found = apec.search(["alpha"], search_filters=filters)
+
+    assert [item.url for item in found] == [page_zero_url]
+    recorded_stages = [error.stage for error in apec.search_errors]
+    assert recorded_stages == ["search"]
+    first_error = apec.search_errors[0]
+    assert "page=1" in first_error.url
+    assert first_error.message == "page 1 results did not render"
+
+
+def test_search_records_evaluate_all_failures_and_continues_to_next_query(monkeypatch) -> None:
+    """An evaluate_all() failure on alpha's page 1 is recorded and the beta query still runs."""
+    alpha_url = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"
+    beta_url = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222?motsCles=beta&page=0&selectedIndex=0"
+    fake_page = _FakeDetailPage(
+        {
+            "alpha": {0: [alpha_url], 1: [alpha_url]},
+            "beta": {0: [beta_url], 1: [beta_url]},
+        }
+    )
+    real_locator = fake_page.locator
+
+    class _FailingLocator:
+        # Proxy around a real fake locator; only evaluate_all() is intercepted.
+        def __init__(self, locator) -> None:
+            self._locator = locator
+
+        def evaluate_all(self, function: str):
+            # Fail solely for the result-link selector on page 1 of the "alpha" query.
+            if self._locator.selector == _RESULT_LINK_SELECTOR:
+                if fake_page.current_query == "alpha" and fake_page.current_page == 1:
+                    raise RuntimeError("evaluate boom")
+
+            return self._locator.evaluate_all(function)
+
+        def __getattr__(self, name: str):
+            # Any other attribute is delegated to the wrapped locator.
+            return getattr(self._locator, name)
+
+    def failing_locator(selector: str):
+        return _FailingLocator(real_locator(selector))
+
+    monkeypatch.setattr(fake_page, "locator", failing_locator)
+    monkeypatch.setattr(adapter_module, "_MAX_CONSECUTIVE_NO_PROGRESS_PAGES", 1)
+    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))
+    filters = ApecSearchFilters(location="France", contract_type="CDI")
+
+    apec = ApecAdapter(max_listings=10)
+    found = apec.search(["alpha", "beta"], search_filters=filters)
+
+    assert [item.url for item in found] == [alpha_url, beta_url]
+    recorded_stages = [error.stage for error in apec.search_errors]
+    assert recorded_stages == ["search"]
+    assert "page=1" in apec.search_errors[0].url
+
+
+def test_fetch_listing_html_waits_for_rendered_offer_content(monkeypatch) -> None:
+    """fetch_listing_html issues exactly one wait whose script targets the offer selectors."""
+    fake_page = _FakeDetailPage({}, rendered_html="<html>rendered offer</html>")
+    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))
+    detail_url = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111"
+
+    fetched = ApecAdapter().fetch_listing_html(detail_url)
+
+    assert fetched == "<html>rendered offer</html>"
+    assert len(fake_page.waited_functions) == 1
+    only_wait = fake_page.waited_functions[0]
+    wait_script = only_wait[0]
+    assert ".container-details-offer h1" in wait_script
+    assert ".ref-offre" in wait_script
+    assert ".details-offer-list" in wait_script
+    # The wait must not depend on the description heading text.
+    assert "Descriptif du poste" not in wait_script
+    assert only_wait[1] == 1000
+
+
+def test_fetch_listing_html_accepts_current_cgu_popin_before_waiting_for_detail_content(monkeypatch) -> None:
+    """Fetching a detail page ticks the CGU checkbox and clicks ACCEPTER when consent is required."""
+    fake_page = _FakeDetailPage({}, rendered_html="<html>rendered offer</html>", consent_required=True)
+    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))
+    detail_url = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111"
+
+    fetched = ApecAdapter().fetch_listing_html(detail_url)
+
+    assert fetched == "<html>rendered offer</html>"
+    assert fake_page.cgu_checkbox_checked is True
+    assert fake_page.consent_button_clicks == ["ACCEPTER"]
+    assert fake_page.consent_accepted is True
+
+
+def test_fetch_listing_html_uses_explicit_company_fallback_chain(monkeypatch) -> None:
+    """The wait script names the offer selectors directly and contains no `companySelectors`."""
+    fake_page = _FakeDetailPage({}, rendered_html="<html>rendered offer</html>")
+    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))
+    detail_url = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111"
+
+    ApecAdapter().fetch_listing_html(detail_url)
+
+    first_wait = fake_page.waited_functions[0]
+    script = first_wait[0]
+    assert "companySelectors" not in script
+    assert ".container-details-offer h1" in script
+    assert ".ref-offre" in script
+    assert ".details-offer-list" in script
+
+
+def test_fetch_listing_html_rejects_redirected_non_apec_urls(monkeypatch) -> None:
+    """A page whose URL changes to the advanced-search path after goto() raises ValueError."""
+    # NOTE(review): despite the test name, the redirect target below is still on www.apec.fr;
+    # what changes is the path (detail-offre -> recherche-avancee). Confirm the intended wording.
+    fake_page = _FakeDetailPage({}, rendered_html="<html>redirected</html>")
+    real_goto = fake_page.goto
+    detail_url = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111"
+
+    def goto_then_redirect(url: str, wait_until: str | None = None) -> None:
+        # Navigate normally, then overwrite the page URL to simulate a redirect.
+        real_goto(url, wait_until=wait_until)
+        fake_page.url = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/recherche-avancee?error=true"
+
+    monkeypatch.setattr(fake_page, "goto", goto_then_redirect)
+    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))
+
+    with pytest.raises(ValueError, match="unexpected URL after redirects"):
+        ApecAdapter().fetch_listing_html(detail_url)
+
+    assert len(fake_page.waited_functions) == 1
+    assert fake_page.goto_urls == ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111"]
+
+
+def test_fetch_listing_html_rejects_non_apec_hosts() -> None:
+    """A look-alike host (evilapec.fr) is refused with ValueError."""
+    apec = ApecAdapter()
+    lookalike_url = "https://evilapec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111"
+
+    with pytest.raises(ValueError):
+        apec.fetch_listing_html(lookalike_url)
+
+
+def test_fetch_listing_html_reuses_browser_context_across_calls(monkeypatch) -> None:
+    """Two fetches inside one browser_session() launch Chromium once and open one page each.
+
+    `sync_playwright` is replaced on the adapter module with a hand-built chain of fakes
+    (manager -> playwright -> chromium -> browser -> context -> page) that only count calls.
+    """
+
+    class FakePage:
+        # Minimal page: records navigations and always returns the same HTML.
+        def __init__(self) -> None:
+            self.goto_urls: list[str] = []
+            self.default_timeout: int | None = None
+            self.url = ""
+
+        def set_default_timeout(self, timeout: int) -> None:
+            self.default_timeout = timeout
+
+        def goto(self, url: str, wait_until: str | None = None) -> None:
+            self.goto_urls.append(url)
+            self.url = url
+
+        def wait_for_function(self, function: str, polling: int | None = None, timeout: int | None = None) -> None:
+            # Pretend the awaited condition is satisfied immediately.
+            return None
+
+        def content(self) -> str:
+            return "<html>shared</html>"
+
+        def close(self) -> None:
+            return None
+
+    class FakeBrowserContext:
+        # Counts how many pages are opened from this context.
+        def __init__(self) -> None:
+            self.new_page_calls = 0
+
+        def new_page(self) -> FakePage:
+            self.new_page_calls += 1
+            return FakePage()
+
+        def close(self) -> None:
+            return None
+
+    class FakeBrowser:
+        # Always hands out the same pre-built context.
+        def __init__(self, browser_context: FakeBrowserContext) -> None:
+            self.browser_context = browser_context
+
+        def new_context(self) -> FakeBrowserContext:
+            return self.browser_context
+
+        def close(self) -> None:
+            return None
+
+    class FakeChromium:
+        # Counts browser launches and always returns the same browser.
+        def __init__(self, browser: FakeBrowser) -> None:
+            self.browser = browser
+            self.launch_calls = 0
+
+        def launch(self, headless: bool = True) -> FakeBrowser:
+            self.launch_calls += 1
+            return self.browser
+
+    class FakePlaywright:
+        # Exposes only the `chromium` attribute.
+        def __init__(self, chromium: FakeChromium) -> None:
+            self.chromium = chromium
+
+    class FakePlaywrightManager:
+        # Context manager standing in for the object returned by sync_playwright().
+        def __init__(self, chromium: FakeChromium) -> None:
+            self.playwright = FakePlaywright(chromium)
+
+        def __enter__(self) -> FakePlaywright:
+            return self.playwright
+
+        def __exit__(self, exc_type, exc, tb) -> None:
+            return None
+
+    browser_context = FakeBrowserContext()
+    browser = FakeBrowser(browser_context)
+    chromium = FakeChromium(browser)
+
+    monkeypatch.setattr(adapter_module, "sync_playwright", lambda: FakePlaywrightManager(chromium))
+
+    adapter = ApecAdapter()
+    with adapter.browser_session():
+        html_one = adapter.fetch_listing_html("https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111")
+        html_two = adapter.fetch_listing_html("https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222")
+
+    assert html_one == "<html>shared</html>"
+    assert html_two == "<html>shared</html>"
+    # One launch for the whole session, but a fresh page per fetch.
+    assert chromium.launch_calls == 1
+    assert browser_context.new_page_calls == 2
--- a/tests/apec/test_dedupe.py
+++ b/tests/apec/test_dedupe.py
@ -0,0 +1,212 @@
+from job_research.apec.dedupe import dedupe_apec_listings
+from job_research.models import ApecListing
+
+
+def test_dedupe_apec_listings_by_url_preserves_first_listing() -> None:
+    """Two listings sharing a URL collapse to the one that came first."""
+    kept = ApecListing(
+        source="apec",
+        url="https://example.test/job/1",
+        fetched_at="2026-06-01T10:00:00Z",
+    )
+    same_url_again = ApecListing(
+        source="apec",
+        url="https://example.test/job/1",
+        fetched_at="2026-06-01T10:01:00Z",
+    )
+
+    survivors = dedupe_apec_listings([kept, same_url_again])
+
+    assert survivors == [kept]
+
+
+def test_dedupe_apec_listings_by_source_job_id_ignores_url_changes() -> None:
+    """Listings with the same source_job_id are duplicates even when their URLs differ."""
+    kept = ApecListing(
+        source="apec",
+        source_job_id="job-123",
+        url="https://example.test/job/1",
+        fetched_at="2026-06-01T10:00:00Z",
+    )
+    same_id_new_url = ApecListing(
+        source="apec",
+        source_job_id="job-123",
+        url="https://example.test/job/2",
+        fetched_at="2026-06-01T10:01:00Z",
+    )
+
+    survivors = dedupe_apec_listings([kept, same_id_new_url])
+
+    assert survivors == [kept]
+
+
+def test_dedupe_apec_listings_collapses_mixed_key_duplicates() -> None:
+    """A row without a job id is still a duplicate of an earlier row with the same URL."""
+    kept = ApecListing(
+        source="apec",
+        source_job_id="job-123",
+        url="https://example.test/job/1",
+        fetched_at="2026-06-01T10:00:00Z",
+    )
+    same_url_no_id = ApecListing(
+        source="apec",
+        source_job_id=None,
+        url="https://example.test/job/1",
+        fetched_at="2026-06-01T10:01:00Z",
+    )
+
+    survivors = dedupe_apec_listings([kept, same_url_no_id])
+
+    assert survivors == [kept]
+
+
+def test_dedupe_apec_listings_keeps_secondary_ids_from_skipped_rows() -> None:
+    """A later row sharing only a job id with an already-dropped duplicate is dropped as well."""
+    kept = ApecListing(
+        source="apec",
+        source_job_id=None,
+        url="url1",
+        fetched_at="2026-06-01T10:00:00Z",
+    )
+    # Duplicate of `kept` by URL; it introduces the job id "job-123".
+    url_duplicate = ApecListing(
+        source="apec",
+        source_job_id="job-123",
+        url="url1",
+        fetched_at="2026-06-01T10:01:00Z",
+    )
+    # Linked to the cluster only through the job id carried by `url_duplicate`.
+    id_duplicate = ApecListing(
+        source="apec",
+        source_job_id="job-123",
+        url="url2",
+        fetched_at="2026-06-01T10:02:00Z",
+    )
+
+    survivors = dedupe_apec_listings([kept, url_duplicate, id_duplicate])
+
+    assert survivors == [kept]
+
+
+def test_dedupe_apec_listings_merges_metadata_from_duplicate_rows() -> None:
+    """The survivor picks up job id and dates that only its duplicate carried."""
+    sparse = ApecListing(
+        source="apec",
+        source_job_id=None,
+        published_at=None,
+        refreshed_at=None,
+        url="url1",
+        fetched_at="2026-06-01T10:00:00Z",
+    )
+    enriched = ApecListing(
+        source="apec",
+        source_job_id="job-123",
+        published_at="2026-06-01",
+        refreshed_at="2026-06-02",
+        url="url1",
+        fetched_at="2026-06-01T10:01:00Z",
+    )
+
+    survivors = dedupe_apec_listings([sparse, enriched])
+
+    assert len(survivors) == 1
+    merged = survivors[0]
+    assert merged.url == "url1"
+    assert merged.source_job_id == "job-123"
+    assert merged.published_at == "2026-06-01"
+    assert merged.refreshed_at == "2026-06-02"
+
+
+def test_dedupe_apec_listings_merges_metadata_through_alias_chain() -> None:
+    """Five rows linked pairwise by URL or job id collapse to one that gains the last row's company."""
+    row_u1 = ApecListing(
+        source="apec",
+        source_job_id=None,
+        url="u1",
+        fetched_at="2026-06-01T10:00:00Z",
+    )
+    row_i2_u2 = ApecListing(
+        source="apec",
+        source_job_id="i2",
+        url="u2",
+        fetched_at="2026-06-01T10:01:00Z",
+    )
+    row_i4_u2 = ApecListing(
+        source="apec",
+        source_job_id="i4",
+        url="u2",
+        fetched_at="2026-06-01T10:02:00Z",
+    )
+    row_i2_u1 = ApecListing(
+        source="apec",
+        source_job_id="i2",
+        url="u1",
+        fetched_at="2026-06-01T10:03:00Z",
+    )
+    # Only this last row carries a company.
+    row_i4_u6 = ApecListing(
+        source="apec",
+        source_job_id="i4",
+        url="u6",
+        company="NewestCo",
+        fetched_at="2026-06-01T10:04:00Z",
+    )
+
+    survivors = dedupe_apec_listings([row_u1, row_i2_u2, row_i4_u2, row_i2_u1, row_i4_u6])
+
+    assert len(survivors) == 1
+    merged = survivors[0]
+    assert merged.url == "u1"
+    assert merged.source_job_id == "i2"
+    assert merged.company == "NewestCo"
+
+
+def test_dedupe_apec_listings_keeps_one_survivor_for_cluster_alias_chain() -> None:
+    """Four rows chained through shared URLs and job ids leave a single survivor (u2 / id2)."""
+    row_id2_u2 = ApecListing(
+        source="apec",
+        source_job_id="id2",
+        url="u2",
+        fetched_at="2026-06-01T10:00:00Z",
+    )
+    row_id3_u2 = ApecListing(
+        source="apec",
+        source_job_id="id3",
+        url="u2",
+        fetched_at="2026-06-01T10:01:00Z",
+    )
+    row_id2_u1 = ApecListing(
+        source="apec",
+        source_job_id="id2",
+        url="u1",
+        fetched_at="2026-06-01T10:02:00Z",
+    )
+    row_id3_u3 = ApecListing(
+        source="apec",
+        source_job_id="id3",
+        url="u3",
+        fetched_at="2026-06-01T10:03:00Z",
+    )
+
+    survivors = dedupe_apec_listings([row_id2_u2, row_id3_u2, row_id2_u1, row_id3_u3])
+
+    assert len(survivors) == 1
+    merged = survivors[0]
+    assert merged.url == "u2"
+    assert merged.source_job_id == "id2"
+
+
+def test_dedupe_apec_listings_keeps_first_listing_as_bridge_survivor() -> None:
+    """When a third row bridges two unrelated rows, the first row survives and gains its company."""
+    row_id1_u1 = ApecListing(
+        source="apec",
+        source_job_id="id1",
+        url="u1",
+        fetched_at="2026-06-01T10:00:00Z",
+    )
+    row_id2_u2 = ApecListing(
+        source="apec",
+        source_job_id="id2",
+        url="u2",
+        fetched_at="2026-06-01T10:01:00Z",
+    )
+    # Shares its job id with the first row and its URL with the second.
+    bridge = ApecListing(
+        source="apec",
+        source_job_id="id1",
+        url="u2",
+        company="NewestCo",
+        fetched_at="2026-06-01T10:02:00Z",
+    )
+
+    survivors = dedupe_apec_listings([row_id1_u1, row_id2_u2, bridge])
+
+    assert len(survivors) == 1
+    merged = survivors[0]
+    assert merged.url == "u1"
+    assert merged.source_job_id == "id1"
+    assert merged.company == "NewestCo"
--- a/tests/apec/test_normalize.py
+++ b/tests/apec/test_normalize.py
@ -0,0 +1,372 @@
+from job_research.apec.normalize import normalize_apec_listing
+
+
+def test_normalize_apec_listing_extracts_minimal_shape() -> None:
+    """A fully populated detail page yields title, company, location, contract, text and dates."""
+    detail_html = """
+    <html>
+      <body>
+        <main class="container-details-offer">
+          <h1>Data Engineer F/H</h1>
+          <div class="card-offer">
+            <div class="ref-offre">Ref. Apec : 178554452W</div>
+            <ul class="details-offer-list mb-20">
+              <li>CLOUD TEMPLE</li>
+              <li>1 <span> CDI </span></li>
+              <li>Puteaux - 92</li>
+            </ul>
+            <p>Publiée le 20/04/2026 Actualisée le 02/06/2026</p>
+          </div>
+          <article class="card card-ents mb-20">
+            <div class="list-hzt mb-20">
+              <span class="ents-name">CLOUD TEMPLE</span>
+            </div>
+          </article>
+          <div class="details-post">
+            <h4>Salaire</h4>
+            <span>A partir de 70 k€ brut annuel</span>
+          </div>
+          <div class="details-post">
+            <h4>Prise de poste</h4>
+            <span>Dès que possible</span>
+          </div>
+          <div class="details-post">
+            <h4>Expérience</h4>
+            <span>Minimum 7 ans</span>
+          </div>
+          <div class="details-post">
+            <h4>Descriptif du poste</h4>
+            <p>Build pipelines</p>
+            <div class="nested-late-sections">
+              <h4>Profil recherché</h4>
+              <p>Python / SQL</p>
+              <h4>Compétences attendues</h4>
+              <p>Ignored</p>
+              <h4>Entreprise</h4>
+              <p>Ignored</p>
+              <div class="recruiter">Ignored recruiter info</div>
+            </div>
+          </div>
+        </main>
+      </body>
+    </html>
+    """
+
+    normalized = normalize_apec_listing(
+        html=detail_html,
+        url="https://example.test/job/123",
+        source_job_id="job-123",
+        fetched_at="2026-06-01T10:00:00Z",
+    )
+
+    # Values passed in by the caller.
+    assert normalized.source == "apec"
+    assert normalized.source_job_id == "job-123"
+    assert normalized.url == "https://example.test/job/123"
+    # Values extracted from the markup.
+    assert normalized.title == "Data Engineer F/H"
+    assert normalized.company == "CLOUD TEMPLE"
+    assert normalized.location == "Puteaux - 92"
+    assert normalized.contract_type == "CDI"
+    assert normalized.description_text == "Build pipelines"
+    assert normalized.published_at == "2026-04-20"
+    assert normalized.refreshed_at == "2026-06-02"
+    assert normalized.fetched_at == "2026-06-01T10:00:00Z"
+
+
+def test_normalize_apec_listing_prefers_final_source_job_id_from_detail_page() -> None:
+    """The reference printed on an apec.fr detail page wins over the requested job id."""
+    detail_html = """
+    <html>
+      <body>
+        <main class="container-details-offer">
+          <h1>Data Engineer F/H</h1>
+          <div class="card-offer">
+            <div class="ref-offre">Ref. Apec : FINAL456</div>
+            <ul class="details-offer-list mb-20">
+              <li>CLOUD TEMPLE</li>
+              <li>1 <span> CDI </span></li>
+              <li>Puteaux - 92</li>
+            </ul>
+            <p>Publiée le 20/04/2026 Actualisée le 02/06/2026</p>
+          </div>
+          <article class="card card-ents mb-20">
+            <div class="list-hzt mb-20">
+              <span class="ents-name">CLOUD TEMPLE</span>
+            </div>
+          </article>
+          <div class="details-post">
+            <h4>Descriptif du poste</h4>
+            <p>Build pipelines</p>
+          </div>
+        </main>
+      </body>
+    </html>
+    """
+    requested_url = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/REQUESTED123"
+
+    normalized = normalize_apec_listing(
+        html=detail_html,
+        url=requested_url,
+        source_job_id="REQUESTED123",
+        fetched_at="2026-06-01T10:00:00Z",
+    )
+
+    assert normalized.source_job_id == "FINAL456"
+
+
+def test_normalize_apec_listing_warns_and_returns_none_for_invalid_dates() -> None:
+    """Impossible calendar dates become None and each produces a warning."""
+    detail_html = """
+    <html>
+      <body>
+        <main class="container-details-offer">
+          <h1>Data Engineer F/H</h1>
+          <div class="card-offer">
+            <div class="ref-offre">Ref. Apec : 178554452W</div>
+            <ul class="details-offer-list mb-20">
+              <li>CLOUD TEMPLE</li>
+              <li>1 <span> CDI </span></li>
+              <li>Puteaux - 92</li>
+            </ul>
+            <p>Publiée le 32/13/2026 Actualisée le 31/02/2026</p>
+          </div>
+          <article class="card card-ents mb-20">
+            <div class="list-hzt mb-20">
+              <span class="ents-name">CLOUD TEMPLE</span>
+            </div>
+          </article>
+          <div class="details-post">
+            <h4>Descriptif du poste</h4>
+            <p>Build pipelines</p>
+          </div>
+        </main>
+      </body>
+    </html>
+    """
+
+    normalized = normalize_apec_listing(
+        html=detail_html,
+        url="https://example.test/job/123",
+        source_job_id="job-123",
+        fetched_at="2026-06-01T10:00:00Z",
+    )
+
+    assert normalized.published_at is None
+    assert normalized.refreshed_at is None
+    warned_fields = [warning.field for warning in normalized.warnings]
+    assert warned_fields == ["published_at", "refreshed_at"]
+
+
+def test_normalize_apec_listing_uses_details_offer_list_company_fallback() -> None:
+    """Without a company card, the first details-offer-list item supplies the company."""
+    detail_html = """
+    <html>
+      <body>
+        <main class="container-details-offer">
+          <h1>Data Engineer F/H</h1>
+          <div class="card-offer">
+            <div class="ref-offre">Ref. Apec : 178554452W</div>
+            <ul class="details-offer-list mb-20">
+              <li>Fallback Company</li>
+              <li>1 <span> CDI </span></li>
+              <li>Puteaux - 92</li>
+            </ul>
+            <p>Publiée le 20/04/2026 Actualisée le 02/06/2026</p>
+          </div>
+          <div class="details-post">
+            <h4>Salaire</h4>
+            <span>A partir de 70 k€ brut annuel</span>
+          </div>
+          <div class="details-post">
+            <h4>Prise de poste</h4>
+            <span>Dès que possible</span>
+          </div>
+          <div class="details-post">
+            <h4>Expérience</h4>
+            <span>Minimum 7 ans</span>
+          </div>
+          <div class="details-post">
+            <h4>Descriptif du poste</h4>
+            <p>Build pipelines</p>
+            <div class="nested-late-sections">
+              <h4>Profil recherché</h4>
+              <p>Python / SQL</p>
+              <h4>Compétences attendues</h4>
+              <p>Ignored</p>
+              <h4>Entreprise</h4>
+              <p>Ignored</p>
+              <div class="recruiter">Ignored recruiter info</div>
+            </div>
+          </div>
+        </main>
+      </body>
+    </html>
+    """
+
+    normalized = normalize_apec_listing(
+        html=detail_html,
+        url="https://example.test/job/123",
+        source_job_id=None,
+        fetched_at="2026-06-01T10:00:00Z",
+    )
+
+    assert normalized.company == "Fallback Company"
+    assert normalized.description_text == "Build pipelines"
+    assert normalized.refreshed_at == "2026-06-02"
+
+
+def test_normalize_apec_listing_records_warnings_for_fallback_and_missing_fields() -> None:
+    """Sparse detail markup produces one warning per affected field, in the asserted order."""
+    detail_html = """
+    <html>
+      <body>
+        <main class="container-details-offer">
+          <div class="card-offer">
+            <div class="ref-offre">Ref. Apec : 178554452W</div>
+            <ul class="details-offer-list mb-20">
+              <li>Fallback Company</li>
+              <li>1 CDI</li>
+            </ul>
+          </div>
+          <article class="card card-ents mb-20">
+            <div class="list-hzt mb-20">
+              <span class="card-ents-quote">Fallback Company</span>
+            </div>
+          </article>
+        </main>
+        <h1>Fallback Title</h1>
+      </body>
+    </html>
+    """
+    expected_fields = [
+        "title",
+        "location",
+        "contract_type",
+        "description_text",
+        "source_job_id",
+        "company",
+        "published_at",
+    ]
+
+    normalized = normalize_apec_listing(
+        html=detail_html,
+        url="https://example.test/job/123",
+        source_job_id=None,
+        fetched_at="2026-06-01T10:00:00Z",
+    )
+
+    warned_fields = [warning.field for warning in normalized.warnings]
+    assert warned_fields == expected_fields
+
+
+def test_normalize_apec_listing_records_warnings_for_placeholder_text_values() -> None:
+    """Placeholder values ("N/A", "-") for title, location and contract become None plus a warning."""
+    detail_html = """
+    <html>
+      <body>
+        <main class="container-details-offer">
+          <h1> N/A </h1>
+          <div class="card-offer">
+            <div class="ref-offre">Ref. Apec : 178554452W</div>
+            <ul class="details-offer-list mb-20">
+              <li>Example Corp</li>
+              <li>1 <span> N/A </span></li>
+              <li> - </li>
+            </ul>
+            <p>Publiée le 20/04/2026 Actualisée le 02/06/2026</p>
+          </div>
+          <article class="card card-ents mb-20">
+            <div class="list-hzt mb-20">
+              <span class="ents-name">Example Corp</span>
+            </div>
+          </article>
+          <div class="details-post">
+            <h4>Descriptif du poste</h4>
+            <p>Build pipelines</p>
+          </div>
+        </main>
+      </body>
+    </html>
+    """
+
+    normalized = normalize_apec_listing(
+        html=detail_html,
+        url="https://example.test/job/123",
+        source_job_id="178554452W",
+        fetched_at="2026-06-01T10:00:00Z",
+    )
+
+    assert normalized.title is None
+    assert normalized.location is None
+    assert normalized.contract_type is None
+    warned_fields = [warning.field for warning in normalized.warnings]
+    assert warned_fields == ["title", "location", "contract_type"]
+
+
+def test_normalize_apec_listing_records_warning_for_placeholder_company() -> None:
+    """An "N/A" company name in the company card yields company=None and a single warning."""
+    detail_html = """
+    <html>
+      <body>
+        <main class="container-details-offer">
+          <h1>Data Engineer F/H</h1>
+          <div class="card-offer">
+            <div class="ref-offre">Ref. Apec : 178554452W</div>
+            <ul class="details-offer-list mb-20">
+              <li>Example Corp</li>
+              <li>1 <span> CDI </span></li>
+              <li>Puteaux - 92</li>
+            </ul>
+            <p>Publiée le 20/04/2026 Actualisée le 02/06/2026</p>
+          </div>
+          <article class="card card-ents mb-20">
+            <div class="list-hzt mb-20">
+              <span class="ents-name">N/A</span>
+            </div>
+          </article>
+          <div class="details-post">
+            <h4>Descriptif du poste</h4>
+            <p>Build pipelines</p>
+          </div>
+        </main>
+      </body>
+    </html>
+    """
+
+    normalized = normalize_apec_listing(
+        html=detail_html,
+        url="https://example.test/job/123",
+        source_job_id="178554452W",
+        fetched_at="2026-06-01T10:00:00Z",
+    )
+
+    assert normalized.company is None
+    assert normalized.refreshed_at == "2026-06-02"
+    warned_fields = [warning.field for warning in normalized.warnings]
+    assert warned_fields == ["company"]
+
+
+def test_normalize_apec_listing_records_warning_for_placeholder_description_text() -> None:
+    """An "N/A" job description yields description_text=None and a single warning."""
+    detail_html = """
+    <html>
+      <body>
+        <main class="container-details-offer">
+          <h1>Data Engineer F/H</h1>
+          <div class="card-offer">
+            <div class="ref-offre">Ref. Apec : 178554452W</div>
+            <ul class="details-offer-list mb-20">
+              <li>CLOUD TEMPLE</li>
+              <li>1 <span> CDI </span></li>
+              <li>Puteaux - 92</li>
+            </ul>
+            <p>Publiée le 20/04/2026 Actualisée le 02/06/2026</p>
+          </div>
+          <article class="card card-ents mb-20">
+            <div class="list-hzt mb-20">
+              <span class="ents-name">CLOUD TEMPLE</span>
+            </div>
+          </article>
+          <div class="details-post">
+            <h4>Descriptif du poste</h4>
+            <p>N/A</p>
+          </div>
+        </main>
+      </body>
+    </html>
+    """
+
+    normalized = normalize_apec_listing(
+        html=detail_html,
+        url="https://example.test/job/123",
+        source_job_id="job-123",
+        fetched_at="2026-06-01T10:00:00Z",
+    )
+
+    assert normalized.description_text is None
+    warned_fields = [warning.field for warning in normalized.warnings]
+    assert warned_fields == ["description_text"]
--- a/tests/apec/test_query_derivation.py
+++ b/tests/apec/test_query_derivation.py
@ -0,0 +1,75 @@
+from job_research.apec.adapter import ApecSearchFilters
+from job_research.apec.query_derivation import derive_apec_queries, derive_apec_search_filters
+from job_research.models import CandidateProfileOutput
+
+
+def test_derive_apec_queries_preserves_order_dedupes_and_caps_at_five() -> None:
+    """Seven roles with one repeat yield the first five distinct roles, in input order."""
+    roles = [
+        "Data Engineer",
+        "Analytics Engineer",
+        "Data Engineer",
+        "BI Engineer",
+        "Junior Data Platform Engineer",
+        "ML Engineer",
+        "Backend Engineer",
+    ]
+    candidate = CandidateProfileOutput(
+        target_roles=roles,
+        strengths=["Python", "SQL"],
+        skills_to_emphasize=["BigQuery", "Terraform"],
+    )
+    expected = [
+        "Data Engineer",
+        "Analytics Engineer",
+        "BI Engineer",
+        "Junior Data Platform Engineer",
+        "ML Engineer",
+    ]
+
+    derived = derive_apec_queries(candidate)
+
+    assert derived == expected
+
+
+def test_derive_apec_queries_uses_up_to_five_target_roles_when_no_support_terms_exist() -> None:
+    """With only target roles set, the first five of six roles become the queries."""
+    roles = [
+        "Data Engineer",
+        "Analytics Engineer",
+        "BI Engineer",
+        "Junior Data Platform Engineer",
+        "ML Engineer",
+        "Backend Engineer",
+    ]
+    candidate = CandidateProfileOutput(target_roles=roles)
+    expected = [
+        "Data Engineer",
+        "Analytics Engineer",
+        "BI Engineer",
+        "Junior Data Platform Engineer",
+        "ML Engineer",
+    ]
+
+    derived = derive_apec_queries(candidate)
+
+    assert derived == expected
+
+
+def test_derive_apec_queries_uses_support_terms_without_constraints() -> None:
+    """Strengths and emphasized skills extend the role query; constraints never appear in it."""
+    candidate = CandidateProfileOutput(
+        target_roles=["Data Engineer"],
+        strengths=["Python"],
+        skills_to_emphasize=["BigQuery"],
+        constraints=["CDI only", "France only"],
+    )
+    expected = [
+        "Data Engineer",
+        "Data Engineer Python",
+        "Data Engineer BigQuery",
+    ]
+
+    derived = derive_apec_queries(candidate)
+
+    assert derived == expected
+
+
+def test_derive_apec_search_filters_from_constraints() -> None:
+    """"CDI only" and "France only" constraints map to the CDI / France search filters."""
+    candidate = CandidateProfileOutput(constraints=["CDI only", "France only"])
+    expected = ApecSearchFilters(location="France", contract_type="CDI")
+
+    derived = derive_apec_search_filters(candidate)
+
+    assert derived == expected
--- a/tests/profile/test_cv_extractor.py
+++ b/tests/profile/test_cv_extractor.py
@ -29,6 +29,23 @@ def test_extract_cv_signals_reads_basic_fields_from_text() -> None:
    assert len(extracted["experience_entries"]) == 2


+def test_extract_cv_signals_flags_low_confidence_first_line_as_name() -> None:
+    """A tagline-style first line is still returned as the name, with a warning on `name`."""
+    raw_cv = dedent(
+        """
+        Data Engineer | Python | GCP
+        Location: France
+        Languages: French, English
+        Skills: Python, SQL
+        Data Engineer at Company A
+        """
+    )
+    cv_text = raw_cv.strip()
+
+    signals = extract_cv_signals(cv_text)
+
+    assert signals["name"] == "Data Engineer | Python | GCP"
+    warned_fields = [warning.field for warning in signals["warnings"]]
+    assert warned_fields == ["name"]
+
+
@pytest.mark.parametrize(
    ("line", "expected"),
    [
--- a/tests/profile/test_merge.py
+++ b/tests/profile/test_merge.py
@ -1,5 +1,6 @@
 from job_research.profile.merge import build_candidate_profile_output
 from job_research.profile.profile_parser import AuthoredProfile
+from job_research.models import WarningItem


 def test_build_candidate_profile_output_writes_warning_when_facts_conflict() -> None:
@ -72,3 +73,34 @@ def test_build_candidate_profile_output_warns_on_missing_core_cv_facts() -> None
        "skills",
        "education_entries",
    ]
+
+
+def test_build_candidate_profile_output_propagates_cv_extraction_warnings() -> None:
+    """Warnings attached to the CV signals must appear unchanged on the merged output."""
+    review_message = "First CV line looks like a header or tagline; review manually."
+    authored = AuthoredProfile(
+        summary="Junior data engineer focused on GCP.",
+        target_roles=["Data Engineer"],
+        strengths=["Python"],
+        skills_to_emphasize=["BigQuery", "GCP"],
+        constraints=["CDI only"],
+        notes=[],
+    )
+    cv_signals = {
+        "name": "Data Engineer | Python | GCP",
+        "location": "France",
+        "languages": ["French", "English"],
+        "skills": ["Python", "SQL"],
+        "experience_entries": [{"title": "Data Engineer", "company": "A"}],
+        "education_entries": [{"credential": "MSc", "institution": "Example University"}],
+        "warnings": [WarningItem(field="name", message=review_message)],
+    }
+
+    merged = build_candidate_profile_output(cv_signals, authored)
+
+    # Compare against a separately constructed item so equality is by value.
+    expected_warnings = [WarningItem(field="name", message=review_message)]
+    assert merged.warnings == expected_warnings
--- a/tests/profile/test_profile_parser.py
+++ b/tests/profile/test_profile_parser.py
@ -72,3 +72,38 @@ def test_parse_profile_markdown_rejects_unsupported_list_content() -> None:

    with pytest.raises(ValueError, match="Unsupported content in section 'target roles'"):
        parse_profile_markdown(markdown)
+
+
+@pytest.mark.parametrize("section_name", ["Target Roles", "Notes"])
+def test_parse_profile_markdown_rejects_blank_bullet_only_required_sections(
+    section_name: str,
+) -> None:
+    """A section whose only content is an empty bullet is rejected with a ValueError."""
+    # Start from valid bodies, then blank out only the section under test.
+    section_bodies = {
+        "Target Roles": "- Data Engineer",
+        "Notes": "Slight preference for French listings.",
+    }
+    section_bodies[section_name] = "- "
+    target_roles = section_bodies["Target Roles"]
+    notes = section_bodies["Notes"]
+
+    markdown = dedent(
+        f"""
+        # Candidate Profile
+
+        ## Summary
+        Junior data engineer focused on Python and GCP.
+
+        ## Target Roles
+        {target_roles}
+
+        ## Strengths
+        - Python
+
+        ## Skills To Emphasize
+        - BigQuery
+
+        ## Constraints
+        - CDI only
+
+        ## Notes
+        {notes}
+        """
+    ).strip()
+
+    # The expected message uses the lower-cased section name.
+    expected_error = f"Missing usable content in section '{section_name.lower()}'"
+    with pytest.raises(ValueError, match=expected_error):
+        parse_profile_markdown(markdown)
--- a/tests/test_apec_cli.py
+++ b/tests/test_apec_cli.py
--- a/tests/test_apec_storage.py
+++ b/tests/test_apec_storage.py
@ -0,0 +1,82 @@
+from pathlib import Path
+
+from job_research.models import ApecListing, ApecRunMeta, ApecSnapshotMeta, ListingWarning
+from job_research.storage import apec_run_paths
+
+
+FIXED_RUN_ID = "2026-06-01T10-00-00-123456Z"
+
+
+def test_apec_models_serialize_expected_listing_shape() -> None:
+    """Listing and run-meta models dump the values they were constructed with."""
+    fetched_at = "2026-06-01T10:00:00Z"
+    job_url = "https://example.test/job/123"
+    # Shared between the model constructor and the expected JSON-mode dump.
+    snapshot_fields = {
+        "url": job_url,
+        "source_job_id": "123",
+        "snapshot_file": "job-123.html",
+        "fetched_at": fetched_at,
+    }
+    location_warning = ListingWarning(
+        field="location",
+        message="Location inferred from page text",
+    )
+
+    listing = ApecListing(
+        source="apec",
+        source_job_id="123",
+        url=job_url,
+        title="Data Engineer",
+        company="Example",
+        location="Paris",
+        contract_type="CDI",
+        description_text="Build pipelines",
+        published_at="2026-06-01",
+        refreshed_at="2026-06-02",
+        fetched_at=fetched_at,
+        warnings=[location_warning],
+    )
+    run_meta = ApecRunMeta(
+        run_id=FIXED_RUN_ID,
+        run_started_at=fetched_at,
+        derived_queries=["Data Engineer"],
+        snapshots=[ApecSnapshotMeta(**snapshot_fields)],
+        fetched_count=1,
+        normalized_count=1,
+        deduplicated_count=1,
+        failed_count=0,
+        listing_errors=[],
+    )
+
+    listing_dump = listing.model_dump()
+    meta_dump = run_meta.model_dump()
+    meta_json_dump = run_meta.model_dump(mode="json")
+
+    assert listing_dump["source"] == "apec"
+    assert listing_dump["warnings"][0]["field"] == "location"
+    assert listing_dump["refreshed_at"] == "2026-06-02"
+    assert meta_dump["run_id"] == FIXED_RUN_ID
+    assert meta_dump["run_started_at"] == "2026-06-01T10:00:00Z"
+    assert meta_dump["derived_queries"] == ["Data Engineer"]
+    assert meta_json_dump["snapshots"] == [snapshot_fields]
+
+
+def test_apec_run_paths_builds_expected_layout(tmp_path: Path) -> None:
+    """Every path returned for a run lives under <base>/apec/runs/<run_id>."""
+    expected_run_dir = tmp_path / "apec" / "runs" / FIXED_RUN_ID
+    expected_layout = {
+        "run_dir": expected_run_dir,
+        "listings": expected_run_dir / "listings.yaml",
+        "run_meta": expected_run_dir / "run-meta.yaml",
+        "snapshots": expected_run_dir / "snapshots",
+    }
+
+    paths = apec_run_paths(tmp_path, run_id=FIXED_RUN_ID)
+
+    for key, expected_path in expected_layout.items():
+        assert paths[key] == expected_path
+
+
+def test_apec_run_artifacts_include_snapshot_and_meta(tmp_path: Path) -> None:
+    """Round-trip an HTML snapshot through the run's snapshots directory."""
+    # NOTE(review): despite the test name, only the snapshot file is exercised
+    # here; no run-meta file is written or read.
+    html = "<html>snapshot</html>"
+    snapshots_dir = apec_run_paths(tmp_path, run_id=FIXED_RUN_ID)["snapshots"]
+    snapshots_dir.mkdir(parents=True, exist_ok=True)
+    snapshot_file = snapshots_dir / "job-123.html"
+
+    snapshot_file.write_text(html, encoding="utf-8")
+
+    assert snapshot_file.read_text(encoding="utf-8") == html
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@ -1,6 +1,10 @@
 from subprocess import run
 from textwrap import dedent
+from pathlib import Path

+from typer.testing import CliRunner
+
+from job_research.cli import app
 from job_research.storage import load_yaml


@ -147,6 +151,78 @@ def test_build_profile_reports_when_no_warnings_are_included(tmp_path) -> None:
    assert "No warnings included." in result.stdout


+def test_build_profile_reports_output_write_failures_cleanly(tmp_path, monkeypatch) -> None:
+    """An OSError while writing the output exits with code 1 and a traceback-free stderr message."""
+    cv_text = dedent(
+        """
+        Tonio Example
+        Location: France
+        Languages: French, English
+        Skills: Python, SQL
+        Data Engineer at Acme
+        Education: Master of Science at Example University
+        """
+    ).strip()
+    profile_markdown = dedent(
+        """
+        # Candidate Profile
+
+        ## Summary
+        Junior data engineer focused on Python and GCP.
+
+        ## Target Roles
+        - Data Engineer
+
+        ## Strengths
+        - Python
+        - SQL
+
+        ## Skills To Emphasize
+        - GCP
+        - BigQuery
+
+        ## Constraints
+        - CDI only
+        - France only
+
+        ## Notes
+        - Slight preference for French listings.
+        """
+    ).strip()
+
+    cv = tmp_path / "cv.txt"
+    profile = tmp_path / "profile.md"
+    out = tmp_path / "candidate-profile.yaml"
+    # Written before the patch below is installed, so these use the real write_text.
+    cv.write_text(cv_text, encoding="utf-8")
+    profile.write_text(profile_markdown, encoding="utf-8")
+
+    original_write_text = Path.write_text
+
+    def flaky_write_text(
+        self: Path,
+        data: str,
+        encoding: str | None = None,
+        errors: str | None = None,
+        newline: str | None = None,
+    ) -> int:
+        # Only the output path fails; every other write is delegated unchanged.
+        if self != out:
+            return original_write_text(self, data, encoding=encoding, errors=errors, newline=newline)
+
+        raise OSError("disk full")
+
+    monkeypatch.setattr(Path, "write_text", flaky_write_text)
+
+    cli_args = ["build-profile", "--cv", str(cv), "--profile", str(profile), "--out", str(out)]
+    result = CliRunner().invoke(app, cli_args)
+
+    assert result.exit_code == 1
+    assert not out.exists()
+    stderr = result.stderr
+    assert "Unable to write candidate profile to" in stderr
+    assert "disk full" in stderr
+    assert "Traceback" not in stderr
+
+
 def test_build_profile_rejects_empty_cv_text_before_writing(tmp_path) -> None:
    cv = tmp_path / "cv.txt"
    cv.write_text("   \n", encoding="utf-8")
@ -189,3 +265,100 @@ def test_build_profile_rejects_empty_cv_text_before_writing(tmp_path) -> None:
    assert result.returncode != 0
    assert not out.exists()
    assert "No readable text found in CV input" in result.stderr
+    assert "Traceback" not in result.stderr
+
+
+def test_build_profile_reports_unreadable_pdf_input_cleanly(tmp_path, monkeypatch) -> None:
+    """A PDF extraction failure exits with code 1 and a traceback-free stderr message."""
+    profile_markdown = dedent(
+        """
+        # Candidate Profile
+
+        ## Summary
+        Junior data engineer.
+
+        ## Target Roles
+        - Data Engineer
+
+        ## Strengths
+        - Python
+
+        ## Skills To Emphasize
+        - BigQuery
+
+        ## Constraints
+        - CDI only
+
+        ## Notes
+        - Slight preference for French listings.
+        """
+    ).strip()
+
+    cv = tmp_path / "cv.pdf"
+    profile = tmp_path / "profile.md"
+    out = tmp_path / "candidate-profile.yaml"
+    cv.write_bytes(b"%PDF-1.4\n")
+    profile.write_text(profile_markdown, encoding="utf-8")
+
+    def broken_extract_pdf_text(path):
+        # Stand-in for the CLI's PDF extractor that always fails.
+        raise ValueError("broken pdf")
+
+    monkeypatch.setattr("job_research.cli.extract_pdf_text", broken_extract_pdf_text)
+
+    cli_args = ["build-profile", "--cv", str(cv), "--profile", str(profile), "--out", str(out)]
+    result = CliRunner().invoke(app, cli_args)
+
+    assert result.exit_code == 1
+    assert not out.exists()
+    stderr = result.stderr
+    assert "CV input not readable" in stderr
+    assert "broken pdf" in stderr
+    assert "Traceback" not in stderr
+
+
+def test_build_profile_reports_malformed_profile_markdown_cleanly(tmp_path) -> None:
+    """Invalid profile markdown exits with code 1 and a traceback-free stderr message."""
+    cv_text = dedent(
+        """
+        Tonio Example
+        Location: France
+        Languages: French, English
+        Skills: Python
+        """
+    ).strip()
+    # "Target Roles" deliberately holds a plain line rather than a bullet.
+    profile_markdown = dedent(
+        """
+        # Candidate Profile
+
+        ## Summary
+        Junior data engineer.
+
+        ## Target Roles
+        Data Engineer
+
+        ## Strengths
+        - Python
+
+        ## Skills To Emphasize
+        - BigQuery
+
+        ## Constraints
+        - CDI only
+
+        ## Notes
+        - Slight preference for French listings.
+        """
+    ).strip()
+
+    cv = tmp_path / "cv.txt"
+    profile = tmp_path / "profile.md"
+    out = tmp_path / "candidate-profile.yaml"
+    cv.write_text(cv_text, encoding="utf-8")
+    profile.write_text(profile_markdown, encoding="utf-8")
+
+    cli_args = ["build-profile", "--cv", str(cv), "--profile", str(profile), "--out", str(out)]
+    result = CliRunner().invoke(app, cli_args)
+
+    assert result.exit_code == 1
+    assert not out.exists()
+    stderr = result.stderr
+    assert "profile markdown invalid" in stderr
+    assert "Traceback" not in stderr
--- a/uv.lock
+++ b/uv.lock
@ -20,6 +20,19 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },
 ]

+[[package]]
+name = "beautifulsoup4"
+version = "4.14.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "soupsieve" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c3/b0/1c6a16426d389813b48d95e26898aff79abbde42ad353958ad95cc8c9b21/beautifulsoup4-4.14.3.tar.gz", hash = "sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86", size = 627737, upload-time = "2025-11-30T15:08:26.084Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb", size = 107721, upload-time = "2025-11-30T15:08:24.087Z" },
+]
+
 [[package]]
 name = "colorama"
 version = "0.4.6"
@ -29,6 +42,63 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
 ]

+[[package]]
+name = "greenlet"
+version = "3.5.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/6d/6e/802acd792aebb2256fbbee8cacf2727faaeb6f240ac11008f09eae4414bc/greenlet-3.5.1.tar.gz", hash = "sha256:5a56aeb7d5d9cc4b3a735efb5095bd4b4f6f0e4f93e5ca876d0e2315137b7829", size = 197356, upload-time = "2026-05-20T15:05:03.917Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/27/69/7f7e5372d998b81001899b1c0823c957aa413ba0f2662e65821611cc31e4/greenlet-3.5.1-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:51518ff74664078fc51bffcc6fc529b0df5ae58da192691cee765d45ce944a2b", size = 285060, upload-time = "2026-05-20T13:08:51.899Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/bf/387f9b6b865fd2ae0d0be09e0004827295a01b71be76ed350dd1e28a91a4/greenlet-3.5.1-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1ffdb3c0bb002c99cd8f298957e046c3dbf6006b5b7cdf11a4e19194624a0a0a", size = 604370, upload-time = "2026-05-20T14:00:07.492Z" },
+    { url = "https://files.pythonhosted.org/packages/32/f5/169ce3d4e4c67291bd18f8cbe0299c9f3e45102c7f1fb3c14780c93e4532/greenlet-3.5.1-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7715a5a2c3378ba602c3a440558261e13a820bb53a82693aacd7b7f6d964e283", size = 616987, upload-time = "2026-05-20T14:05:44.237Z" },
+    { url = "https://files.pythonhosted.org/packages/19/ba/c24110c55dffa55aa6e1d98b45310da33801aeba7686ff0190fe5d46fd32/greenlet-3.5.1-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d40a890035c0058cadbdc4af7569800fd28a0e527a0fdbb7b5f9418f176846ce", size = 622911, upload-time = "2026-05-20T14:09:10.598Z" },
+    { url = "https://files.pythonhosted.org/packages/ee/e5/7f2e41d5273be07e77560d61ea4e56485b4d6c316d2a84518c62d1364061/greenlet-3.5.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dc71ff466927a201b08305acac451ebe1aedfcea002f62f1f2f2ac2ac1e6a135", size = 613911, upload-time = "2026-05-20T13:14:27.539Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/7b/d20db2e8a5ad6c038702f3179b136f93f0a3d1a21a0c0777f3e470cdf4b2/greenlet-3.5.1-cp313-cp313-manylinux_2_39_riscv64.whl", hash = "sha256:67821bb03e4e98664490edb787ff6af501194c29bbee0f5c1dfdcf1dc3d9d436", size = 425228, upload-time = "2026-05-20T14:01:40.837Z" },
+    { url = "https://files.pythonhosted.org/packages/c5/a4/fbdc67579b73615a1f91615e814303cc71e06128f7baaba87be79b8fb90c/greenlet-3.5.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cd443683db272ebaaca03af98c0b063ab30db70ea8a31a1559f35e3f7b744ccd", size = 1570689, upload-time = "2026-05-20T14:02:27.225Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/b4/77abbe35078be39718a46cd49caf16bceb35662f97a34101dca28aa98e47/greenlet-3.5.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:089fff7a6ce8d9316d1f65ebc00273a56be258c1725b32b94de90a3a979557e1", size = 1635602, upload-time = "2026-05-20T13:14:36.344Z" },
+    { url = "https://files.pythonhosted.org/packages/37/f7/129f27ca700845b8ee8ca88ce7f43435a1239c2eddb7677fc938822762cf/greenlet-3.5.1-cp313-cp313-win_amd64.whl", hash = "sha256:110a1ca7b49b014b097f6078272c3f4ed31af45b254de5228b79adba879f6af9", size = 238683, upload-time = "2026-05-20T13:11:50.57Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/5c/a485a36e87df8d8fd0632ee01511244f5156a20ed3746cc6599340326395/greenlet-3.5.1-cp313-cp313-win_arm64.whl", hash = "sha256:f16ba1efc0715b680a18b8123d90dad887c6112ae3555b4b5c32c149540c6b4e", size = 235499, upload-time = "2026-05-20T13:12:42.028Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/cb/c62454606daf5640369c94d8a9dd540599b1bfc090e2d2180cb77f4038d2/greenlet-3.5.1-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:d8ab31c9de8651a2facdd5c5bb0011f2380dd1a7af78ce2adf4b56095294fc07", size = 285579, upload-time = "2026-05-20T13:08:56.396Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/71/c4270398c2eba968a6071af1dfbdcaeee6ec1c24bc8b435b8cc452700da6/greenlet-3.5.1-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5e300185139abc337ade480c327183adf42a875ac7181bfe66d7d4efea31fbea", size = 651106, upload-time = "2026-05-20T14:00:09.448Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/ab/71e34b78a44ec271fb5f550c17bc46d301ddc5953890d935f270b0dcdb5a/greenlet-3.5.1-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7ffdb990dcaa0234cf9845aead5df2e3c3a8b6507d409274dd87e0d5ab05ffc2", size = 663478, upload-time = "2026-05-20T14:05:45.88Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/2d/2d80842910da44f78c286532d084b8a5c3717c844ae80ceb3858738ae89a/greenlet-3.5.1-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6c09df69dc1712d131332054a858a3e5cca400967fa3a672e2324fbb0971448c", size = 667767, upload-time = "2026-05-20T14:09:12.15Z" },
+    { url = "https://files.pythonhosted.org/packages/77/96/4efd6fa5c62c85426a0c19077a586258ebc3a2a146ff2493e4312a697a22/greenlet-3.5.1-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2f82b3597e9d83b63408affed0b48fd0f54935edac4302237b9a837be0dae33c", size = 660800, upload-time = "2026-05-20T13:14:29.129Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/d3/dad2eecedfbb1ed7050a20dcfae40c1442b74bc7423608be2c7e03ee7133/greenlet-3.5.1-cp314-cp314-manylinux_2_39_riscv64.whl", hash = "sha256:a4764e0bfc6a4d114c865b32520805c16a990ef5f286a514413b05d5ecd6a23d", size = 470786, upload-time = "2026-05-20T14:01:42.064Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/e0/6c71401a25cac7000261304e866a2f2cc04dc74810d40e2f118aa4799495/greenlet-3.5.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c0141e37414c10164e702b8fb1473304221ad98f71600850c6ef7ff4880feba0", size = 1617518, upload-time = "2026-05-20T14:02:28.662Z" },
+    { url = "https://files.pythonhosted.org/packages/41/26/c5c06643e8c0af9e7bf18e16cb51d0ab7625155f0392e1c9015d66d556cd/greenlet-3.5.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:50ae25a67bea74ea41fb14b960bc532df73eb713417b2d61892dced82fe8d3bc", size = 1681593, upload-time = "2026-05-20T13:14:39.417Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/bd/e11a108317485075e68af9d23039619b86b28130c3b50d227d42edece64b/greenlet-3.5.1-cp314-cp314-win_amd64.whl", hash = "sha256:8a17c42330e261299766b75ac1ea32caa437a9453c8f65d16a13140db378ecd3", size = 239800, upload-time = "2026-05-20T13:09:30.128Z" },
+    { url = "https://files.pythonhosted.org/packages/47/f8/8e8e8417b7bf28639a5a56356ef934d0375e1d0c70a57e04d7701e870ffe/greenlet-3.5.1-cp314-cp314-win_arm64.whl", hash = "sha256:7b5f5fae05b8ac6d176a61b60c394a8cbdc2b5b91b81793066e68745cf165e54", size = 236862, upload-time = "2026-05-20T13:09:10.498Z" },
+    { url = "https://files.pythonhosted.org/packages/90/12/41bf27fde4d3605d3773ae57751eda182b8be2f5398011c041173b1d9534/greenlet-3.5.1-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:ea8da1e900d758d078810d4255d8c6aa572181896a31ec79d779eb79c3adc9ad", size = 293637, upload-time = "2026-05-20T13:12:35.529Z" },
+    { url = "https://files.pythonhosted.org/packages/44/44/ba14b23e9757707050c2f397d305bbcae62e5d7cad122f8b6baec5ae4a1f/greenlet-3.5.1-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a19570c52a21420dcbc94e661994bc325c0b5b11304540fed514586da5dc8f2e", size = 650840, upload-time = "2026-05-20T14:00:11.079Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/37/5ddc2b686a6844f91abecef43411842426da2e1573f60b49ecf2547f4ae1/greenlet-3.5.1-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3d955c89b75eeca4723d7cc14135f393cd47c32e2a6cb4a8e4c6e760a26b0986", size = 656416, upload-time = "2026-05-20T14:05:47.118Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/46/5987dcd1a2570ba84f3b187536b2ca3ae97613387e57f5cfa99df068fe5e/greenlet-3.5.1-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ea37d5a157eb9493820d3792ac4ece28619a394391d2b9f2f78057d396ff0f0f", size = 656607, upload-time = "2026-05-20T14:09:13.949Z" },
+    { url = "https://files.pythonhosted.org/packages/e1/f0/d17510297c35a2992712f0bf84de3779749999f7d3d63aa1f09db7c62dbe/greenlet-3.5.1-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:de2daaaebd1a5aa88c49045b6baf9310b3263796bd88db713edf37cf53e7bb4e", size = 654397, upload-time = "2026-05-20T13:14:30.696Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/c1/6da0a9ddcc29d7e51ef14883fa3dc1e53b3f4ffba00582106c7bf55da1d8/greenlet-3.5.1-cp314-cp314t-manylinux_2_39_riscv64.whl", hash = "sha256:8d8a23250ea3ec7b36de8fa4b541e9e2db3ee82915cc060ab0631609ad8b28de", size = 488287, upload-time = "2026-05-20T14:01:43.143Z" },
+    { url = "https://files.pythonhosted.org/packages/37/eb/147387705bb89092645b012586e7273cb5ed3c90ef7eaf3a69173eaf0209/greenlet-3.5.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:3bfbd69cc349e43bf3a8ae1c85548ff0718efc887615c2db16c3833d7b0b072d", size = 1614469, upload-time = "2026-05-20T14:02:30.192Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/4e/37ee0da7732b7aa9896f17e15579a9df34b9fcb9dd494f0adfa749af6623/greenlet-3.5.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:4378720dd888136c27215a0214d32a4d37c3852765d45bc37aad0623423cfd78", size = 1675115, upload-time = "2026-05-20T13:14:40.972Z" },
+    { url = "https://files.pythonhosted.org/packages/57/f3/97dfcf4a6eb5077f8a672234216fb5923eb89f2cab7081cb10b2cf75b605/greenlet-3.5.1-cp314-cp314t-win_amd64.whl", hash = "sha256:45718441607f9325d948db98cbc691276059316d0358c188c246da4e1d4d23d2", size = 245246, upload-time = "2026-05-20T13:12:22.646Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/73/d7f72e34b582f694f4a9b248162db7b09cc458a259ba8f0c0bfa1a34ea7d/greenlet-3.5.1-cp315-cp315-macosx_11_0_universal2.whl", hash = "sha256:2baee5ca02031757ffe8cc3d69f0cc0aec7065ce362622da74f32d3bcab1c541", size = 285575, upload-time = "2026-05-20T13:12:07.043Z" },
+    { url = "https://files.pythonhosted.org/packages/df/59/fa9c6e87dc8ad27a95dabe2f29f372b733d05a8a67470f6c901ed9975655/greenlet-3.5.1-cp315-cp315-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9b1ec3274918a81d3ea778b9e75b56b72b33f300edb6cf7f3a7fe1dae56683de", size = 656428, upload-time = "2026-05-20T14:00:12.556Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/f9/e753408871eaa61dfe35e619cfc67512b036fde99893685d50eea9e07146/greenlet-3.5.1-cp315-cp315-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:111e2390ffffc47d5840b01711dd7fac07d4c09283d0283e7f3264b14e284c64", size = 667064, upload-time = "2026-05-20T14:05:48.662Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/74/807a047255bf1e09303627c46dc043dca596b6958a354d904f32ab382005/greenlet-3.5.1-cp315-cp315-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:10a9a1c0bfbc93d41156ffcb90c75fbc05544054faf15dcc1fdf9765f8b607f0", size = 672962, upload-time = "2026-05-20T14:09:15.532Z" },
+    { url = "https://files.pythonhosted.org/packages/96/27/5565b5b40389f1c7753003a07e21892fda8660926787036d5bc0308b8113/greenlet-3.5.1-cp315-cp315-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e630136e905fe5ff43e86945ae41220b6d1470956a39220e708110ac48d01ea5", size = 665697, upload-time = "2026-05-20T13:14:32.943Z" },
+    { url = "https://files.pythonhosted.org/packages/76/32/19d4e13225193c29b13e308015223f7d75fd3d8623d49dd19040d2ce8ec1/greenlet-3.5.1-cp315-cp315-manylinux_2_39_riscv64.whl", hash = "sha256:ef08c1567c78074b22d1a200183d52d04a14df447bf70bcbb6a3507a48e776fc", size = 476047, upload-time = "2026-05-20T14:01:44.39Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/82/e7de4178c0c2d1c9a5a3be3cc0b33e46a85b3ee4a77c071bf7ad8600e079/greenlet-3.5.1-cp315-cp315-musllinux_1_2_aarch64.whl", hash = "sha256:975eac34b44a7077ca4d421348455b94f0f518246a7f14bc6d2fdcfe5b584368", size = 1621256, upload-time = "2026-05-20T14:02:31.91Z" },
+    { url = "https://files.pythonhosted.org/packages/00/10/f2dddcf7dacac17dfc68691809589adad06135eb28930429cf58a6467a2f/greenlet-3.5.1-cp315-cp315-musllinux_1_2_x86_64.whl", hash = "sha256:9ab3c3a0b2ae6198e67c898dad5215a49f9ae0d0081b3c3ec59f333e39eeca26", size = 1685956, upload-time = "2026-05-20T13:14:42.55Z" },
+    { url = "https://files.pythonhosted.org/packages/22/17/4a232b32133230ada52f70e9d7f5b65b0caef8772f01849bd8d149e7e4ca/greenlet-3.5.1-cp315-cp315-win_amd64.whl", hash = "sha256:cbfc69be86e10dcfef5b1e6269d1d6926552aa89ee39e1de3353360c1b6989ab", size = 239802, upload-time = "2026-05-20T13:13:15.481Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/ae/4e623a7e6d4d2a5f4cb8e4c82de4169fc637942caae68d6e676b8a128ac5/greenlet-3.5.1-cp315-cp315-win_arm64.whl", hash = "sha256:92fd6d44ac5e5a887c8a5dc4a8ba0ba908527c31c12f78c6bc7dcfe8aab279f6", size = 236853, upload-time = "2026-05-20T13:15:37.301Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/57/816d9cff29119da3505b3d6a5e14a8af89006ac36f47f891ff293ee05af1/greenlet-3.5.1-cp315-cp315t-macosx_11_0_universal2.whl", hash = "sha256:a6fdf2433a5441ef9a95464f7c3e674775da1c8c1177fff311cee1acad4626ed", size = 293877, upload-time = "2026-05-20T13:10:19.078Z" },
+    { url = "https://files.pythonhosted.org/packages/23/a1/59b0a7c7d140ff1a75626680b9a9899b79a9176cab298b394968fb023295/greenlet-3.5.1-cp315-cp315t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7546556f0d649f99f6a361098a55f761181bb2ea12ff150bb16d26092ad88244", size = 655333, upload-time = "2026-05-20T14:00:14.758Z" },
+    { url = "https://files.pythonhosted.org/packages/72/1b/5efe127597625042218939d01855109f352779050768b670b52edcc16a6c/greenlet-3.5.1-cp315-cp315t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d5ee3ea898009fa898f85f9982255d35278c477bebe185beca249cab42d4526c", size = 659443, upload-time = "2026-05-20T14:05:50.159Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/9d/1dcdf7b95ab3cf8c7b6d7277c18a5e167312f2b362ddfcc5d5e6d8d84b43/greenlet-3.5.1-cp315-cp315t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a57b0d05a0448eed231d59c0ceb287dde984551e54cbc51ac2d4865712838e9c", size = 659998, upload-time = "2026-05-20T14:09:16.912Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/6d/c404246ea4d22d097a7426d0efb5b781bd7eb67715f09e79001bd552ab18/greenlet-3.5.1-cp315-cp315t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a5c81f74d204d3edd136ebfd50dce53acbb776995d721a0fe801626cfc93b8cd", size = 658356, upload-time = "2026-05-20T13:14:35.091Z" },
+    { url = "https://files.pythonhosted.org/packages/05/7e/c4959664fc231d587d66d8e81f2095e98056ba1954beafdcbe635e251052/greenlet-3.5.1-cp315-cp315t-manylinux_2_39_riscv64.whl", hash = "sha256:b0703c2cef53e01baec47f7a3868009913ad71ec678bbecb42a6f40895e4ce62", size = 494470, upload-time = "2026-05-20T14:01:45.611Z" },
+    { url = "https://files.pythonhosted.org/packages/51/02/f8ee37fb6d2219329f350af241c27fcf12df57e723d11f6fc6d3bacdadaa/greenlet-3.5.1-cp315-cp315t-musllinux_1_2_aarch64.whl", hash = "sha256:2c18ef16bf6d4dd410e4dd52996888ea1497be26892fe5bbc73580aba4287b8e", size = 1619216, upload-time = "2026-05-20T14:02:33.403Z" },
+    { url = "https://files.pythonhosted.org/packages/93/c5/3dc9475ace2c7a3680da12372cddd7f1ac874eb410a1ac48d3e9dab83782/greenlet-3.5.1-cp315-cp315t-musllinux_1_2_x86_64.whl", hash = "sha256:17d86354f0ae6b61bf9be5148d0dd34e06c3cb7c602c671f79f29ac3b150e659", size = 1678427, upload-time = "2026-05-20T13:14:43.71Z" },
+    { url = "https://files.pythonhosted.org/packages/df/4e/750c15c317a41ffb36f0bf40b933e3d744a7dede61889f74443ea69690cf/greenlet-3.5.1-cp315-cp315t-win_amd64.whl", hash = "sha256:e7516cf6ae6b8a582c2770a0caed47b8a48373ed732c33d69a72913ae6ac923e", size = 245225, upload-time = "2026-05-20T13:13:59.366Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/fd/d3baea2eeb7b617efd47e87ca06e2ec2c6118d303aa9e918e0ce16eadc10/greenlet-3.5.1-cp315-cp315t-win_arm64.whl", hash = "sha256:5028648bf2253ec4745add746129d3904121fa7fe871a76bed23c5720573ce0a", size = 239590, upload-time = "2026-05-20T13:13:37.382Z" },
+]
+
 [[package]]
 name = "iniconfig"
 version = "2.3.0"
@ -43,6 +113,8 @@ name = "job-research"
 version = "0.1.0"
 source = { editable = "." }
 dependencies = [
+    { name = "beautifulsoup4" },
+    { name = "playwright" },
    { name = "pydantic" },
    { name = "pypdf" },
    { name = "pyyaml" },
@ -56,6 +128,8 @@ dev = [

 [package.metadata]
 requires-dist = [
+    { name = "beautifulsoup4", specifier = ">=4.12,<5" },
+    { name = "playwright", specifier = ">=1.52,<2" },
    { name = "pydantic", specifier = ">=2.7,<3" },
    { name = "pypdf", specifier = ">=5.0,<6" },
    { name = "pyyaml", specifier = ">=6.0,<7" },
@ -95,6 +169,25 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/df/b2/87e62e8c3e2f4b32e5fe99e0b86d576da1312593b39f47d8ceef365e95ed/packaging-26.2-py3-none-any.whl", hash = "sha256:5fc45236b9446107ff2415ce77c807cee2862cb6fac22b8a73826d0693b0980e", size = 100195, upload-time = "2026-04-24T20:15:22.081Z" },
 ]

+[[package]]
+name = "playwright"
+version = "1.60.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "greenlet" },
+    { name = "pyee" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/21/f0/832bd9677194908da118064eef20082f2791e3d18215cc6d9391ee2c5a67/playwright-1.60.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:6a8cd0fec171fb3089e95e898c8bc8a6f35dea0b78b399e12fcc19427e91b1d7", size = 43474635, upload-time = "2026-05-18T12:00:31.969Z" },
+    { url = "https://files.pythonhosted.org/packages/59/7b/e1d32ae8a3ed937ec2be3721c5f728b13d731a0b7c6442e0b3bec5094ac0/playwright-1.60.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:39b5420ba6145045b69ced4c5c47d4d9fe5bddfc8ff816c518913afcb25ec7a5", size = 42261327, upload-time = "2026-05-18T12:00:35.638Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/bc/23de499ded6411c188a20c5a0dea6f0cd4ed5d2b3cc6042a5dbd3ed609aa/playwright-1.60.0-py3-none-macosx_11_0_universal2.whl", hash = "sha256:2581d0e6a3392c71f91b27460c7fd093356818dc430f48153896c8aeeaef7705", size = 43474636, upload-time = "2026-05-18T12:00:39.294Z" },
+    { url = "https://files.pythonhosted.org/packages/22/7b/1d679f4fced4ea94efadd17103856d8c565384f68382a1681264e46f5925/playwright-1.60.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:1c2bfae7884fb3fb05b853290eab8f343d524e5016f2f1def702acbbdf14c93e", size = 47467220, upload-time = "2026-05-18T12:00:43.179Z" },
+    { url = "https://files.pythonhosted.org/packages/84/c2/1528d267d4442bd2c6b8eaeab819dd52c2030bf80e89293f0ba1f687473b/playwright-1.60.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:43e66564125ee31b07a58cefb21e256d62d67d8d1713e6858df7a3019d8ed353", size = 47154856, upload-time = "2026-05-18T12:00:46.715Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/4e/b008b6440a7a1624378041da94829956d4b8f7ab9ef5aad22d0dc3f2e26d/playwright-1.60.0-py3-none-win32.whl", hash = "sha256:ec94e416ea320711e0ad4bf185dcbf41833672961e90773e1885255d7db7b7e7", size = 37902157, upload-time = "2026-05-18T12:00:50.374Z" },
+    { url = "https://files.pythonhosted.org/packages/55/f0/0541524133104f9cc20bf900870ff4a736b76a23483f3a55295ddfa58409/playwright-1.60.0-py3-none-win_amd64.whl", hash = "sha256:9566821ce6030a1f9e7146a24e19355ab0d98805fd0f9be50bb3d8fef1750c02", size = 37902159, upload-time = "2026-05-18T12:00:53.728Z" },
+    { url = "https://files.pythonhosted.org/packages/80/c8/210f282d278e4709cdd71b12a31af45a30a22ab3207b387e29b37e478713/playwright-1.60.0-py3-none-win_arm64.whl", hash = "sha256:6e4f6700a4c2250efff8e690a81d66e3855754fb587b6b87cf5c784014f91537", size = 34037981, upload-time = "2026-05-18T12:00:57.584Z" },
+]
+
 [[package]]
 name = "pluggy"
 version = "1.6.0"
@ -175,6 +268,18 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/f6/d2/42dd53d0a85c27606f316d3aa5d2869c4e8470a5ed6dec30e4a1abe19192/pydantic_core-2.46.4-cp314-cp314t-win_arm64.whl", hash = "sha256:4fcbe087dbc2068af7eda3aa87634eba216dbda64d1ae73c8684b621d33f6596", size = 2017325, upload-time = "2026-05-06T13:40:52.723Z" },
 ]

+[[package]]
+name = "pyee"
+version = "13.0.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/8b/04/e7c1fe4dc78a6fdbfd6c337b1c3732ff543b8a397683ab38378447baa331/pyee-13.0.1.tar.gz", hash = "sha256:0b931f7c14535667ed4c7e0d531716368715e860b988770fc7eb8578d1f67fc8", size = 31655, upload-time = "2026-02-14T21:12:28.044Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a0/c4/b4d4827c93ef43c01f599ef31453ccc1c132b353284fc6c87d535c233129/pyee-13.0.1-py3-none-any.whl", hash = "sha256:af2f8fede4171ef667dfded53f96e2ed0d6e6bd7ee3bb46437f77e3b57689228", size = 15659, upload-time = "2026-02-14T21:12:26.263Z" },
+]
+
 [[package]]
 name = "pygments"
 version = "2.20.0"
@ -267,6 +372,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" },
 ]

+[[package]]
+name = "soupsieve"
+version = "2.8.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/47/2c/0a5f6f8ee0d5589e48c7640213ed5175d52cf540a06725b628cc1a45d6ce/soupsieve-2.8.4.tar.gz", hash = "sha256:e121fd02e975c695e4e9e8774a5ee35d74714b59307868dcc5319ad2d9e3328e", size = 121110, upload-time = "2026-05-24T13:55:57.154Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5e/f5/0c41cb68dcae6b7de4fac4188a3a9589e21fb31df21ea3a2e888db95e6c9/soupsieve-2.8.4-py3-none-any.whl", hash = "sha256:e7e6b0769c8f51ed59acab6e994b00621096cfb1c640a7509295987388fbaf65", size = 37304, upload-time = "2026-05-24T13:55:55.406Z" },
+]
+
 [[package]]
 name = "typer"
 version = "0.26.2"