Merge branch 'feature/apec-ingestion'
This commit is contained in:
commit
b4182c9686
@ -9,6 +9,8 @@ description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.13"
|
||||
dependencies = [
|
||||
"beautifulsoup4>=4.12,<5",
|
||||
"playwright>=1.52,<2",
|
||||
"pydantic>=2.7,<3",
|
||||
"pypdf>=5.0,<6",
|
||||
"pyyaml>=6.0,<7",
|
||||
|
||||
3
src/job_research/apec/__init__.py
Normal file
3
src/job_research/apec/__init__.py
Normal file
@ -0,0 +1,3 @@
|
||||
from job_research.apec.query_derivation import derive_apec_queries
|
||||
|
||||
__all__ = ["derive_apec_queries"]
|
||||
317
src/job_research/apec/adapter.py
Normal file
317
src/job_research/apec/adapter.py
Normal file
@ -0,0 +1,317 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass
|
||||
from urllib.parse import parse_qsl, urlencode, urlparse, urlsplit, urlunsplit
|
||||
|
||||
from playwright.sync_api import Error as PlaywrightError
|
||||
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
from job_research.models import ListingError
|
||||
|
||||
|
||||
_SEARCH_URL = "https://www.apec.fr/candidat/recherche-emploi.html/emploi"
|
||||
_FRANCE_LOCATION_ID = "799"
|
||||
_CDI_CONTRACT_ID = "101888"
|
||||
_SEARCH_INPUT_SELECTOR = 'input[name="keywords"]'
|
||||
_RESULT_LINK_SELECTOR = "a[href*='/candidat/recherche-emploi.html/emploi/detail-offre/']"
|
||||
_ZERO_RESULTS_URL_FRAGMENT = "/recherche-avancee"
|
||||
_DETAIL_JOB_ID_PATTERN = re.compile(r"/detail-offre/([^/?#]+)")
|
||||
_APEC_HOSTS = {"apec.fr", "www.apec.fr"}
|
||||
_MAX_PAGES_PER_QUERY = 50
|
||||
_MAX_CONSECUTIVE_NO_PROGRESS_PAGES = 10
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class ApecSearchResult:
|
||||
url: str
|
||||
source_job_id: str | None = None
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class ApecSearchFilters:
|
||||
location: str | None = None
|
||||
contract_type: str | None = None
|
||||
|
||||
|
||||
class ApecSearchError(RuntimeError):
    """Raised by ``ApecAdapter.search`` when no query reached a usable search page."""
|
||||
|
||||
|
||||
@contextmanager
def _open_public_page():
    """Yield a page from a throwaway headless Chromium instance.

    The browser is closed when the ``with`` block exits, whether it exits
    normally or through an exception.
    """
    with sync_playwright() as pw:
        chromium = pw.chromium.launch(headless=True)
        try:
            tab = chromium.new_page()
            # Default timeout (ms) for Playwright operations on this page.
            tab.set_default_timeout(15_000)
            yield tab
        finally:
            chromium.close()
|
||||
|
||||
|
||||
def _extract_source_job_id(url: str) -> str | None:
    """Return the path segment that follows ``/detail-offre/`` in *url*, or None."""
    found = _DETAIL_JOB_ID_PATTERN.search(url)
    return found.group(1) if found is not None else None
|
||||
|
||||
|
||||
def _search_results_url(base_url: str, page_number: int) -> str:
|
||||
parsed_url = urlsplit(base_url)
|
||||
params = parse_qsl(parsed_url.query, keep_blank_values=True)
|
||||
filtered_params = [(key, value) for key, value in params if key != "page"]
|
||||
filtered_params.append(("page", str(page_number)))
|
||||
return urlunsplit(parsed_url._replace(query=urlencode(filtered_params, doseq=True)))
|
||||
|
||||
|
||||
def _search_url(query: str, search_filters: ApecSearchFilters, page_number: int = 0) -> str:
    """Build the Apec search URL for *query*, *search_filters* and *page_number*.

    Parameter order is always: ``motsCles``, then ``lieux`` (only when the
    location filter is "France"), then ``typesContrat`` (only when the
    contract filter is "CDI"), then ``page``.
    """
    pairs = [("motsCles", query)]
    if search_filters.location == "France":
        pairs.append(("lieux", _FRANCE_LOCATION_ID))
    if search_filters.contract_type == "CDI":
        pairs.append(("typesContrat", _CDI_CONTRACT_ID))
    pairs.append(("page", str(page_number)))
    return f"{_SEARCH_URL}?{urlencode(pairs)}"
|
||||
|
||||
|
||||
def _accept_cookies_if_present(page) -> None:
    """Best-effort dismissal of the consent UI on *page*.

    First tries to tick the ``cguAcceptees`` checkbox, then clicks the first
    of the known accept buttons that responds. Every failure is ignored, so
    the function never raises for a page without a consent prompt.
    """
    try:
        page.locator('input[name="cguAcceptees"]').check(timeout=2_000)
    except (AttributeError, PlaywrightError, PlaywrightTimeoutError):
        pass  # best effort: move on to the buttons regardless

    for label in ("ACCEPTER", "Accepter tous les cookies"):
        try:
            page.get_by_role("button", name=label).click(timeout=2_000)
        except (AttributeError, PlaywrightError, PlaywrightTimeoutError):
            continue
        else:
            # One successful click is enough; do not try the other label.
            return
|
||||
|
||||
|
||||
def _goto_and_wait(page, url: str) -> bool:
|
||||
try:
|
||||
page.goto(url, wait_until="domcontentloaded")
|
||||
page.wait_for_load_state("domcontentloaded")
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def _is_public_apec_detail_url(url: str) -> bool:
    """Return True only for an https offer-detail URL on a known Apec host.

    The path must be exactly the detail-offer prefix followed by one
    non-empty segment.
    """
    parts = urlparse(url)
    if parts.scheme != "https":
        return False
    if parts.hostname not in _APEC_HOSTS:
        return False
    detail_path = r"/candidat/recherche-emploi\.html/emploi/detail-offre/[^/?#]+"
    return re.fullmatch(detail_path, parts.path) is not None
|
||||
|
||||
|
||||
class ApecAdapter:
    """Playwright-backed client for Apec's public search and offer-detail pages.

    ``search`` collects offer links for a list of queries; ``fetch_listing_html``
    returns the rendered HTML of one offer. Both open their own throwaway
    browser unless called inside ``browser_session``, in which case they share
    that session's browser context.
    """

    def __init__(self, max_listings: int = 50) -> None:
        """Store the result cap and initialise empty per-search state."""
        self.max_listings = max_listings
        # Errors recorded by the most recent ``search`` call.
        self.search_errors: list[ListingError] = []
        # Set only while a ``browser_session`` block is active.
        self._browser_context = None

    @contextmanager
    def browser_session(self):
        """Keep one headless browser context open for the enclosed calls.

        Nested use is a no-op: if a session is already active, the existing
        context is reused and left for the outer block to close.
        """
        if self._browser_context is not None:
            yield
            return

        with sync_playwright() as playwright:
            browser = playwright.chromium.launch(headless=True)
            try:
                # Created inside the try so the browser is closed explicitly
                # even if creating the context fails.
                self._browser_context = browser.new_context()
                yield
            finally:
                self._browser_context = None
                browser.close()

    @contextmanager
    def _open_page(self):
        """Yield a page from the active session, or from a throwaway browser."""
        if self._browser_context is None:
            with _open_public_page() as page:
                yield page
            return

        page = self._browser_context.new_page()
        page.set_default_timeout(15_000)
        try:
            yield page
        finally:
            page.close()

    def _record_search_error(
        self,
        query: str,
        search_filters: ApecSearchFilters,
        message: str,
        *,
        url: str | None = None,
    ) -> None:
        """Append a search-stage ``ListingError`` to ``self.search_errors``.

        When *url* is not given, the first-page search URL for *query* is used.
        """
        self.search_errors.append(
            ListingError(url=url or _search_url(query, search_filters), stage="search", message=message)
        )

    @staticmethod
    def _is_zero_results_page(page) -> bool:
        """Return True when *page*'s URL carries both zero-results markers."""
        return _ZERO_RESULTS_URL_FRAGMENT in page.url and "error=true" in page.url

    def _add_new_results(
        self,
        hrefs,
        results: list[ApecSearchResult],
        seen_keys: set[str],
    ) -> bool:
        """Append unseen *hrefs* to *results*; return True if any was added.

        Links are de-duplicated on the job id parsed from the href, falling
        back to the href itself. Stops as soon as ``max_listings`` is reached.
        """
        added_any = False
        for href in hrefs or ():
            source_job_id = _extract_source_job_id(href)
            dedupe_key = source_job_id or href
            if dedupe_key in seen_keys:
                continue

            seen_keys.add(dedupe_key)
            results.append(ApecSearchResult(url=href, source_job_id=source_job_id))
            added_any = True
            if len(results) >= self.max_listings:
                break
        return added_any

    def _collect_query_results(
        self,
        page,
        query: str,
        search_filters: ApecSearchFilters,
        results: list[ApecSearchResult],
        seen_keys: set[str],
    ) -> None:
        """Walk the result pages of one query, starting from the loaded first page.

        Pagination for the query ends when the listing cap is reached, a page
        URL repeats, a navigation/render/evaluation step fails (recorded in
        ``self.search_errors``), or too many consecutive pages add nothing new.
        """
        first_page_url = page.url
        visited_urls: set[str] = {first_page_url}
        stalled_pages = 0

        for page_number in range(_MAX_PAGES_PER_QUERY):
            if len(results) >= self.max_listings:
                return

            # Page 0 is already loaded by the caller; later pages are navigated to.
            if page_number > 0:
                next_page_url = _search_results_url(first_page_url, page_number)
                if next_page_url in visited_urls:
                    return

                if not _goto_and_wait(page, next_page_url):
                    self._record_search_error(
                        query,
                        search_filters,
                        f"page {page_number} navigation failed",
                        url=next_page_url,
                    )
                    return

                try:
                    page.wait_for_selector(_RESULT_LINK_SELECTOR, timeout=5_000)
                except PlaywrightTimeoutError:
                    self._record_search_error(
                        query,
                        search_filters,
                        f"page {page_number} results did not render",
                        url=next_page_url,
                    )
                    return

            # The URL actually reached may differ from the one requested; a
            # repeat means the walk has looped back onto a page already read.
            current_page_url = page.url
            if page_number > 0 and current_page_url in visited_urls:
                return
            visited_urls.add(current_page_url)

            try:
                hrefs = page.locator(_RESULT_LINK_SELECTOR).evaluate_all(
                    "nodes => nodes.map(node => node.href)"
                )
            except Exception:
                self._record_search_error(
                    query,
                    search_filters,
                    f"page {page_number} result links could not be evaluated",
                    url=current_page_url,
                )
                return

            # An empty page and a page of already-seen links both count as
            # "no progress".
            if self._add_new_results(hrefs, results, seen_keys):
                stalled_pages = 0
                continue

            stalled_pages += 1
            if stalled_pages >= _MAX_CONSECUTIVE_NO_PROGRESS_PAGES:
                return

    def search(self, queries: list[str], search_filters: ApecSearchFilters) -> list[ApecSearchResult]:
        """Run each query and return de-duplicated offer links, capped at ``max_listings``.

        ``self.search_errors`` is reset on every call and then filled with one
        entry per failed step. Blank queries are skipped. A zero-results page
        counts as a usable search page.

        Raises:
            ApecSearchError: if no query reached a usable search page.
        """
        results: list[ApecSearchResult] = []
        seen_keys: set[str] = set()
        usable_search_page_seen = False
        self.search_errors = []

        with self._open_page() as page:
            for query in queries:
                if not query.strip():
                    continue

                if len(results) >= self.max_listings:
                    break

                if not _goto_and_wait(page, _search_url(query, search_filters)):
                    self._record_search_error(query, search_filters, "search page navigation failed")
                    continue

                _accept_cookies_if_present(page)

                try:
                    page.wait_for_selector(_SEARCH_INPUT_SELECTOR, timeout=5_000)
                except PlaywrightTimeoutError:
                    self._record_search_error(query, search_filters, "search input did not render")
                    continue

                if self._is_zero_results_page(page):
                    usable_search_page_seen = True
                    continue

                try:
                    page.wait_for_selector(_RESULT_LINK_SELECTOR, timeout=5_000)
                except PlaywrightTimeoutError:
                    self._record_search_error(query, search_filters, "search results did not render")
                    continue

                usable_search_page_seen = True
                self._collect_query_results(page, query, search_filters, results, seen_keys)

        if not usable_search_page_seen:
            raise ApecSearchError("Apec search page was not reachable for any query")

        return results

    def fetch_listing_html(self, url: str) -> str:
        """Return the rendered HTML of the public Apec offer at *url*.

        Waits until the title, reference and details list are present in the
        DOM before reading the content. Playwright errors are not caught here.

        Raises:
            ValueError: if *url*, or the URL reached after navigation, is not
                a public Apec offer-detail URL.
        """
        if not _is_public_apec_detail_url(url):
            raise ValueError("ApecAdapter only fetches public Apec URLs")

        with self._open_page() as page:
            page.goto(url, wait_until="domcontentloaded")
            _accept_cookies_if_present(page)
            page.wait_for_function(
                """
                () => {
                    const title = document.querySelector('.container-details-offer h1, h1');
                    const reference = document.querySelector('.ref-offre');
                    const offerList = document.querySelector('.details-offer-list');
                    return !!title && !!reference && !!offerList;
                }
                """,
                polling=1000,
                timeout=15_000,
            )
            # Re-validate after navigation: a redirect may have left the
            # allowed host or path.
            final_url = page.url
            if not _is_public_apec_detail_url(final_url):
                raise ValueError(f"ApecAdapter landed on an unexpected URL after redirects: {final_url}")
            return page.content()
|
||||
92
src/job_research/apec/dedupe.py
Normal file
92
src/job_research/apec/dedupe.py
Normal file
@ -0,0 +1,92 @@
|
||||
from job_research.models import ApecListing
|
||||
|
||||
|
||||
_MERGEABLE_FIELDS = (
|
||||
"source_job_id",
|
||||
"title",
|
||||
"company",
|
||||
"location",
|
||||
"contract_type",
|
||||
"description_text",
|
||||
"published_at",
|
||||
"refreshed_at",
|
||||
)
|
||||
|
||||
|
||||
def _merge_listing_metadata(survivor: ApecListing, source: ApecListing) -> None:
    """Fill gaps in *survivor* from *source*, mutating *survivor* in place.

    A mergeable field is copied only when it is None on *survivor* and not
    None on *source*. Warnings from *source* are appended unless an equal
    warning is already present.
    """
    for name in _MERGEABLE_FIELDS:
        if getattr(survivor, name) is not None:
            continue
        candidate = getattr(source, name)
        if candidate is not None:
            setattr(survivor, name, candidate)

    known_warnings = survivor.warnings
    for note in source.warnings:
        if note in known_warnings:
            continue
        known_warnings.append(note)
|
||||
|
||||
|
||||
def _register_listing(
|
||||
url_to_listing: dict[str, ApecListing],
|
||||
source_job_id_to_listing: dict[str, ApecListing],
|
||||
listing: ApecListing,
|
||||
survivor: ApecListing,
|
||||
) -> None:
|
||||
url_to_listing[listing.url] = survivor
|
||||
|
||||
if listing.source_job_id is not None:
|
||||
source_job_id_to_listing[listing.source_job_id] = survivor
|
||||
|
||||
|
||||
def _repoint_listing_aliases(
|
||||
url_to_listing: dict[str, ApecListing],
|
||||
source_job_id_to_listing: dict[str, ApecListing],
|
||||
removed: ApecListing,
|
||||
survivor: ApecListing,
|
||||
) -> None:
|
||||
for mapping in (url_to_listing, source_job_id_to_listing):
|
||||
for alias, listing in list(mapping.items()):
|
||||
if listing is removed:
|
||||
mapping[alias] = survivor
|
||||
|
||||
|
||||
def dedupe_apec_listings(listings: list[ApecListing]) -> list[ApecListing]:
    """Collapse listings that share a URL or a source job id.

    The earliest-seen listing of each group survives and is returned in its
    original position. Fields that are None on the survivor are filled from
    later duplicates, and their warnings are merged in. Input listings that
    survive are mutated in place.

    A listing can match two different survivors at once (one by URL, one by
    job id); those survivors are then merged into the older of the two.
    """
    url_to_listing: dict[str, ApecListing] = {}
    source_job_id_to_listing: dict[str, ApecListing] = {}
    # id(survivor) -> first-seen rank, used to pick the oldest survivor.
    survivor_order: dict[int, int] = {}
    next_order = 0
    deduped: list[ApecListing] = []

    for listing in listings:
        source_job_id = listing.source_job_id
        matches: list[ApecListing] = []

        url_match = url_to_listing.get(listing.url)
        if url_match is not None:
            matches.append(url_match)

        if source_job_id is not None:
            source_job_id_match = source_job_id_to_listing.get(source_job_id)
            # Identity, not equality: the question is whether both indexes
            # resolved to the same survivor object, consistent with the
            # ``is`` checks used for removal and alias repointing.
            if source_job_id_match is not None and source_job_id_match is not url_match:
                matches.append(source_job_id_match)

        if not matches:
            deduped.append(listing)
            survivor_order[id(listing)] = next_order
            next_order += 1
            _register_listing(url_to_listing, source_job_id_to_listing, listing, listing)
            continue

        survivor = min(matches, key=lambda candidate: survivor_order[id(candidate)])
        for other in matches:
            if other is survivor:
                continue

            # Fold the younger survivor into the older one and retire it.
            _merge_listing_metadata(survivor, other)
            deduped[:] = [item for item in deduped if item is not other]
            _repoint_listing_aliases(url_to_listing, source_job_id_to_listing, other, survivor)
            survivor_order.pop(id(other), None)

        _merge_listing_metadata(survivor, listing)
        _register_listing(url_to_listing, source_job_id_to_listing, listing, survivor)

    return deduped
|
||||
328
src/job_research/apec/normalize.py
Normal file
328
src/job_research/apec/normalize.py
Normal file
@ -0,0 +1,328 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import unicodedata
|
||||
from datetime import datetime
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import NavigableString
|
||||
|
||||
from job_research.models import ApecListing, ListingWarning
|
||||
|
||||
|
||||
_PUBLISHED_AT_PATTERN = re.compile(r"Publi[ée]e le (\d{2}/\d{2}/\d{4})")
|
||||
_REFRESHED_AT_PATTERN = re.compile(r"Actualis[ée]e le (\d{2}/\d{2}/\d{4})")
|
||||
_DETAIL_JOB_ID_PATTERN = re.compile(r"/detail-offre/([^/?#]+)")
|
||||
_SOURCE_JOB_ID_PATTERN = re.compile(r"Ref\. Apec\s*:\s*([A-Z0-9]+)")
|
||||
_CONTRACT_PATTERN = re.compile(r"\b(CDI|CDD|Alternance|Intérim|Stage|Freelance|Indépendant)\b")
|
||||
_HEADING_TAG_NAMES = {"h1", "h2", "h3", "h4", "h5", "h6"}
|
||||
_PLACEHOLDER_TEXT_TOKENS = {
|
||||
"na",
|
||||
"nr",
|
||||
"none",
|
||||
"null",
|
||||
"unknown",
|
||||
"tbd",
|
||||
"nonrenseigne",
|
||||
"nonrenseignee",
|
||||
"nondisponible",
|
||||
}
|
||||
|
||||
|
||||
def _clean_text(value: str | None) -> str | None:
|
||||
if value is None:
|
||||
return None
|
||||
|
||||
cleaned = " ".join(value.split())
|
||||
return cleaned or None
|
||||
|
||||
|
||||
def _text_token(value: str) -> str:
|
||||
normalized = unicodedata.normalize("NFKD", value)
|
||||
return re.sub(r"[^a-z0-9]+", "", normalized.casefold())
|
||||
|
||||
|
||||
def _has_useful_text(value: str | None) -> bool:
    """Return True when *value* has content that is not a known placeholder."""
    cleaned = _clean_text(value)
    if cleaned is None:
        return False

    token = _text_token(cleaned)
    if not token:
        # Nothing alphanumeric survived tokenisation.
        return False
    return token not in _PLACEHOLDER_TEXT_TOKENS
|
||||
|
||||
|
||||
def _text_before_heading(node) -> str | None:
    """Return the cleaned text of *node* up to its first direct heading child.

    A string node yields its own cleaned text; a heading node yields None.
    For any other node, children are visited in order and collection stops
    at the first child that is a heading tag.
    """
    if isinstance(node, NavigableString):
        return _clean_text(str(node))

    if getattr(node, "name", None) in _HEADING_TAG_NAMES:
        return None

    collected: list[str] = []
    for child in getattr(node, "children", []):
        if getattr(child, "name", None) in _HEADING_TAG_NAMES:
            break

        # The recursive result is already cleaned (or None).
        fragment = _text_before_heading(child)
        if fragment:
            collected.append(fragment)

    return _clean_text(" ".join(collected))
|
||||
|
||||
|
||||
def _extract_section_text(block, label: str) -> str | None:
    """Return the text that follows the heading titled *label* inside *block*.

    Text is gathered from the heading's following siblings until the next
    heading sibling. Returns None when no heading's cleaned text equals
    *label*, or when no text follows it.
    """

    def _is_labelled_heading(tag) -> bool:
        if getattr(tag, "name", None) not in _HEADING_TAG_NAMES:
            return False
        return _clean_text(tag.get_text(" ", strip=True)) == label

    heading = block.find(_is_labelled_heading)
    if heading is None:
        return None

    fragments: list[str] = []
    for sibling in heading.next_siblings:
        if getattr(sibling, "name", None) in _HEADING_TAG_NAMES:
            break

        fragment = _text_before_heading(sibling)
        if fragment:
            fragments.append(fragment)

    return _clean_text(" ".join(fragments))
|
||||
|
||||
|
||||
def _detail_block_text(soup: BeautifulSoup, label: str) -> str | None:
    """Return the first *label* section found in a ``.details-post`` block.

    Blocks without any ``h4`` are skipped. Returns None when no block yields
    a section for *label*.
    """
    blocks_with_h4 = (
        block for block in soup.select(".details-post") if block.find("h4") is not None
    )
    for block in blocks_with_h4:
        section = _extract_section_text(block, label)
        if section is not None:
            return section

    return None
|
||||
|
||||
|
||||
def _warning(field: str, message: str) -> ListingWarning:
    """Build a ``ListingWarning`` for *field* carrying *message*."""
    return ListingWarning(message=message, field=field)
|
||||
|
||||
|
||||
def _extract_source_job_id_from_url(url: str) -> str | None:
    """Return the path segment that follows ``/detail-offre/`` in *url*, or None."""
    found = _DETAIL_JOB_ID_PATTERN.search(url)
    return None if found is None else found.group(1)
|
||||
|
||||
|
||||
def _extract_listing_date(
    soup: BeautifulSoup,
    pattern: re.Pattern[str],
    *,
    field: str,
    missing_message: str | None = None,
    invalid_message: str,
    warnings: list[ListingWarning],
    warn_on_missing: bool,
) -> str | None:
    """Extract a ``dd/mm/yyyy`` date from the ``.card-offer`` text as ISO ``yyyy-mm-dd``.

    Returns None when the card or the date is absent, or when the matched
    text is not a valid calendar date. A "missing" warning is appended only
    when *warn_on_missing* is true and *missing_message* is given; an
    "invalid" warning is always appended for an unparseable date.
    """
    card_offer = soup.select_one(".card-offer")
    found = None
    if card_offer is not None:
        found = pattern.search(card_offer.get_text(" ", strip=True))

    if found is None:
        # Same outcome whether the card or the date inside it is missing.
        if warn_on_missing and missing_message is not None:
            warnings.append(_warning(field, missing_message))
        return None

    try:
        parsed = datetime.strptime(found.group(1), "%d/%m/%Y")
    except ValueError:
        warnings.append(_warning(field, invalid_message))
        return None
    return parsed.date().isoformat()
|
||||
|
||||
|
||||
def _extract_source_job_id(soup: BeautifulSoup) -> str | None:
    """Return the reference parsed from the ``.ref-offre`` element, or None."""
    ref = soup.select_one(".ref-offre")
    if ref is None:
        return None

    found = _SOURCE_JOB_ID_PATTERN.search(ref.get_text(" ", strip=True))
    return None if found is None else found.group(1)
|
||||
|
||||
|
||||
def _extract_contract_type(details_offer_list) -> str | None:
    """Return the contract type from the second ``li`` of the details list.

    The text of the item's first ``span`` is used when a span exists;
    otherwise the item text is searched for a known contract keyword.
    """
    contract_item = details_offer_list.select_one("li:nth-of-type(2)")
    if contract_item is None:
        return None

    span = contract_item.find("span")
    if span is not None:
        return _clean_text(span.get_text(" ", strip=True))

    found = _CONTRACT_PATTERN.search(contract_item.get_text(" ", strip=True))
    return found.group(1) if found is not None else None
|
||||
|
||||
|
||||
def _extract_company(soup: BeautifulSoup, details_offer_list) -> str | None:
    """Return the first non-empty company text from the known locations.

    Tried in order: ``.card-ents .ents-name``, ``.card-ents-quote``, then
    the first ``li`` of *details_offer_list* when that list is not None.
    """

    def _candidate_nodes():
        # Lazy, so later selectors run only if the earlier ones yield no text.
        for selector in (".card-ents .ents-name", ".card-ents-quote"):
            yield soup.select_one(selector)
        if details_offer_list is not None:
            yield details_offer_list.select_one("li:first-of-type")

    for node in _candidate_nodes():
        if node is None:
            continue
        text = _clean_text(node.get_text(" ", strip=True))
        if text is not None:
            return text

    return None
|
||||
|
||||
|
||||
def normalize_apec_listing(
    url: str,
    html: str,
    fetched_at: str,
    *,
    source_job_id: str | None = None,
    published_at: str | None = None,
    refreshed_at: str | None = None,
) -> ApecListing:
    """Parse an Apec offer page into an ``ApecListing``.

    Each field is read from its primary selector, with fallbacks where they
    exist. Every fallback, missing value and placeholder value is reported as
    a ``ListingWarning`` on the result; placeholder or empty values are
    stored as None. Caller-supplied *published_at* / *refreshed_at* take
    precedence over dates parsed from the page.
    """
    soup = BeautifulSoup(html, "html.parser")
    warnings: list[ListingWarning] = []

    # --- title ---------------------------------------------------------
    title_node = soup.select_one(".container-details-offer h1")
    if title_node is None:
        title_node = soup.find("h1")
        title_note = (
            "Title missing from Apec listing"
            if title_node is None
            else "Recovered title from generic h1 fallback"
        )
        warnings.append(_warning("title", title_note))

    title_text = None
    if title_node is not None:
        title_text = _clean_text(title_node.get_text(" ", strip=True))
        if not _has_useful_text(title_text):
            warnings.append(_warning("title", "Title is empty or placeholder text"))

    # --- location and contract type (details list) -----------------------
    details_offer_list = soup.select_one(".details-offer-list")
    location = None
    contract_type = None

    if details_offer_list is None:
        warnings.append(_warning("location", "Location missing from Apec listing"))
        warnings.append(_warning("contract_type", "Contract type missing from Apec listing"))
    else:
        location_item = details_offer_list.select_one("li:nth-of-type(3)")
        if location_item is None:
            warnings.append(_warning("location", "Location missing from details-offer list"))
        else:
            location = _clean_text(location_item.get_text(" ", strip=True))
            if not _has_useful_text(location):
                warnings.append(_warning("location", "Location is empty or placeholder text"))

        contract_item = details_offer_list.select_one("li:nth-of-type(2)")
        contract_span = None if contract_item is None else contract_item.find("span")
        if contract_item is None:
            warnings.append(_warning("contract_type", "Contract type missing from details-offer list"))
        elif contract_span is not None:
            contract_type = _clean_text(contract_span.get_text(" ", strip=True))
            if not _has_useful_text(contract_type):
                warnings.append(_warning("contract_type", "Contract type is empty or placeholder text"))
        else:
            # No span: fall back to spotting a known contract keyword in the item text.
            contract_match = _CONTRACT_PATTERN.search(contract_item.get_text(" ", strip=True))
            if contract_match is None:
                warnings.append(_warning("contract_type", "Contract type missing from details-offer list"))
            else:
                contract_type = contract_match.group(1)
                warnings.append(_warning("contract_type", "Recovered contract type from text fallback"))

    # --- description -----------------------------------------------------
    description_text = _detail_block_text(soup, "Descriptif du poste")
    if description_text is None:
        warnings.append(_warning("description_text", "Description missing from Apec listing"))
    elif not _has_useful_text(description_text):
        warnings.append(_warning("description_text", "Description is empty or placeholder text"))
        description_text = None

    # --- source job id -----------------------------------------------------
    url_job_id = _extract_source_job_id_from_url(url)
    page_job_id = _extract_source_job_id(soup)

    if source_job_id is not None:
        # The caller's id is kept unless the URL id and the on-page reference
        # are both present and disagree, in which case the page reference wins.
        ids_disagree = (
            url_job_id is not None
            and page_job_id is not None
            and url_job_id != page_job_id
        )
        if ids_disagree:
            warnings.append(_warning("source_job_id", "Recovered source job id from ref-offre fallback"))
            normalized_source_job_id = page_job_id
        else:
            normalized_source_job_id = source_job_id
    elif page_job_id is not None:
        warnings.append(_warning("source_job_id", "Recovered source job id from ref-offre fallback"))
        normalized_source_job_id = page_job_id
    elif url_job_id is not None:
        warnings.append(_warning("source_job_id", "Recovered source job id from detail URL fallback"))
        normalized_source_job_id = url_job_id
    else:
        warnings.append(_warning("source_job_id", "Source job id missing from Apec listing"))
        normalized_source_job_id = None

    # --- company -----------------------------------------------------------
    company_node = soup.select_one(".card-ents .ents-name")
    if company_node is None:
        company_fallbacks = (
            (".card-ents-quote", "Recovered company from .card-ents-quote fallback"),
            (".details-offer-list li:first-of-type", "Recovered company from details-offer-list fallback"),
        )
        for selector, fallback_note in company_fallbacks:
            company_node = soup.select_one(selector)
            if company_node is not None:
                warnings.append(_warning("company", fallback_note))
                break

    company_text = None
    if company_node is not None:
        company_text = _clean_text(company_node.get_text(" ", strip=True))

    if company_text is None:
        warnings.append(_warning("company", "Company missing from Apec listing"))
    elif not _has_useful_text(company_text):
        warnings.append(_warning("company", "Company is empty or placeholder text"))
        company_text = None

    # --- dates -------------------------------------------------------------
    if published_at:
        published_at_value = published_at
    else:
        published_at_value = _extract_listing_date(
            soup,
            _PUBLISHED_AT_PATTERN,
            field="published_at",
            missing_message="Published date missing from Apec listing",
            invalid_message="Published date is invalid",
            warnings=warnings,
            warn_on_missing=True,
        )

    if refreshed_at:
        refreshed_at_value = refreshed_at
    else:
        # A missing refresh date is not reported; only an invalid one is.
        refreshed_at_value = _extract_listing_date(
            soup,
            _REFRESHED_AT_PATTERN,
            field="refreshed_at",
            invalid_message="Refreshed date is invalid",
            warnings=warnings,
            warn_on_missing=False,
        )

    return ApecListing(
        source="apec",
        source_job_id=normalized_source_job_id,
        url=url,
        title=title_text if _has_useful_text(title_text) else None,
        company=company_text,
        location=location if _has_useful_text(location) else None,
        contract_type=contract_type if _has_useful_text(contract_type) else None,
        description_text=description_text,
        published_at=published_at_value,
        refreshed_at=refreshed_at_value,
        fetched_at=fetched_at,
        warnings=warnings,
    )
|
||||
63
src/job_research/apec/query_derivation.py
Normal file
63
src/job_research/apec/query_derivation.py
Normal file
@ -0,0 +1,63 @@
|
||||
from job_research.apec.adapter import ApecSearchFilters
|
||||
from job_research.models import CandidateProfileOutput
|
||||
|
||||
|
||||
def _normalize_term(raw_term: str) -> str:
|
||||
return " ".join(raw_term.split())
|
||||
|
||||
|
||||
def _normalize_constraint(raw_term: str) -> str:
    """Return *raw_term* whitespace-normalised and case-folded for comparison."""
    tidy = _normalize_term(raw_term)
    return tidy.casefold()
|
||||
|
||||
|
||||
def derive_apec_search_filters(profile: CandidateProfileOutput) -> ApecSearchFilters:
    """Translate the profile's constraints into Apec search filters.

    "france only" sets the location to "France" and "cdi only" sets the
    contract type to "CDI"; matching ignores case and extra whitespace.
    Any other constraint is ignored.
    """
    wanted = {_normalize_constraint(item) for item in profile.constraints}
    location = "France" if "france only" in wanted else None
    contract_type = "CDI" if "cdi only" in wanted else None
    return ApecSearchFilters(location=location, contract_type=contract_type)
|
||||
|
||||
|
||||
def derive_apec_queries(profile: CandidateProfileOutput, *, max_queries: int = 5) -> list[str]:
    """Derive an ordered, de-duplicated list of Apec search queries.

    Candidates are produced in this order and the first *max_queries*
    distinct ones are returned:

    1. each distinct target role;
    2. if there is at least one role, the first role combined with each
       strength and then each skill to emphasise ("<role> <term>");
       otherwise the strengths and skills on their own.

    All terms are whitespace-normalised and blank terms are dropped.
    De-duplication is exact (case-sensitive).

    Args:
        profile: Candidate profile providing ``target_roles``, ``strengths``
            and ``skills_to_emphasize``.
        max_queries: Upper bound on the number of queries returned; values
            of zero or less yield an empty list.
    """
    # dict.fromkeys gives ordered de-duplication.
    roles = list(dict.fromkeys(filter(None, map(_normalize_term, profile.target_roles))))
    support_terms = [
        term
        for term in map(_normalize_term, [*profile.strengths, *profile.skills_to_emphasize])
        if term
    ]

    if roles:
        primary_role = roles[0]
        candidates = roles + [f"{primary_role} {term}" for term in support_terms]
    else:
        candidates = support_terms

    return list(dict.fromkeys(candidates))[: max(max_queries, 0)]
|
||||
@ -1,15 +1,79 @@
|
||||
from contextlib import nullcontext
|
||||
from datetime import datetime, timezone
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from urllib.parse import unquote, urlparse
|
||||
|
||||
import typer
|
||||
import yaml
|
||||
from pydantic import ValidationError
|
||||
|
||||
from job_research.apec.adapter import ApecAdapter, ApecSearchFilters
|
||||
from job_research.apec.dedupe import dedupe_apec_listings
|
||||
from job_research.apec.normalize import normalize_apec_listing
|
||||
from job_research.apec.query_derivation import derive_apec_queries, derive_apec_search_filters
|
||||
from job_research.models import ApecRunMeta, ApecSnapshotMeta, CandidateProfileOutput, ListingError
|
||||
from job_research.profile.cv_extractor import extract_cv_signals, extract_pdf_text
|
||||
from job_research.profile.merge import build_candidate_profile_output
|
||||
from job_research.profile.profile_parser import parse_profile_markdown
|
||||
from job_research.storage import save_candidate_profile_yaml
|
||||
from job_research.profile.profile_parser import AuthoredProfile, parse_profile_markdown
|
||||
from job_research.storage import apec_run_paths, load_yaml, save_candidate_profile_yaml
|
||||
|
||||
app = typer.Typer(help="Build one canonical candidate profile YAML")
|
||||
|
||||
|
||||
def _utc_now() -> datetime:
|
||||
return datetime.now(timezone.utc)
|
||||
|
||||
|
||||
def _snapshot_stem(url: str, source_job_id: str | None) -> str:
|
||||
if source_job_id:
|
||||
return source_job_id
|
||||
|
||||
parsed_url = urlparse(url)
|
||||
fallback = parsed_url.path.rstrip("/").rsplit("/", 1)[-1] or parsed_url.netloc or "listing"
|
||||
if parsed_url.query:
|
||||
fallback = f"{fallback}-{parsed_url.query}"
|
||||
|
||||
stem = re.sub(r"[^A-Za-z0-9]+", "-", unquote(fallback)).strip("-")
|
||||
return stem or "listing"
|
||||
|
||||
|
||||
def _write_yaml(path: Path, payload: Any) -> None:
    """Serialise *payload* as UTF-8 YAML at *path*, creating parent directories.

    Keys keep their given order and non-ASCII characters are written unescaped.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    rendered = yaml.safe_dump(payload, sort_keys=False, allow_unicode=True)
    path.write_text(rendered, encoding="utf-8")
|
||||
|
||||
|
||||
def _load_candidate_profile(profile_path: Path) -> CandidateProfileOutput:
    """Load and validate the candidate profile YAML at *profile_path*.

    Raises:
        ValueError: with a message distinguishing a missing file, an
            unreadable file, and invalid content. The original exception is
            chained as the cause.
    """
    try:
        raw_profile = load_yaml(profile_path)
        return CandidateProfileOutput.model_validate(raw_profile)
    except FileNotFoundError as exc:
        raise ValueError(f"candidate-profile.yaml not found at {profile_path}") from exc
    except (OSError, UnicodeDecodeError) as exc:
        raise ValueError(f"candidate-profile.yaml not readable at {profile_path}: {exc}") from exc
    except (yaml.YAMLError, ValidationError, ValueError) as exc:
        raise ValueError(f"invalid candidate-profile.yaml at {profile_path}: {exc}") from exc
|
||||
|
||||
|
||||
def _load_cv_text(cv: Path) -> str:
|
||||
try:
|
||||
cv_text = extract_pdf_text(cv) if cv.suffix.lower() == ".pdf" else cv.read_text(encoding="utf-8")
|
||||
except Exception as exc: # pragma: no cover - defensive boundary
|
||||
raise ValueError(f"CV input not readable at {cv}: {exc}") from exc
|
||||
|
||||
if not cv_text.strip():
|
||||
raise ValueError("No readable text found in CV input")
|
||||
|
||||
return cv_text
|
||||
|
||||
|
||||
def _load_authored_profile(profile: Path) -> AuthoredProfile:
    """Read the markdown profile at *profile* as UTF-8 and parse it.

    Raises:
        ValueError: if reading or parsing fails for any reason; the original
            exception is chained as the cause.
    """
    try:
        markdown = profile.read_text(encoding="utf-8")
        return parse_profile_markdown(markdown)
    except Exception as exc:  # pragma: no cover - defensive boundary
        raise ValueError(f"profile markdown invalid at {profile}: {exc}") from exc
|
||||
|
||||
|
||||
@app.callback()
def main_command() -> None:
    # Empty callback registered on the Typer app; it performs no work.
    # NOTE(review): deliberately a comment rather than a docstring, since a
    # callback docstring may be picked up by Typer as help text - confirm.
    return None
|
||||
@ -41,15 +105,21 @@ def build_profile(
|
||||
) -> None:
|
||||
"""Build candidate-profile.yaml from CV and markdown profile."""
|
||||
|
||||
cv_text = extract_pdf_text(cv) if cv.suffix.lower() == ".pdf" else cv.read_text(encoding="utf-8")
|
||||
if not cv_text.strip():
|
||||
raise ValueError("No readable text found in CV input")
|
||||
try:
|
||||
cv_text = _load_cv_text(cv)
|
||||
authored_profile = _load_authored_profile(profile)
|
||||
except ValueError as exc:
|
||||
typer.echo(str(exc), err=True)
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
authored_profile = parse_profile_markdown(profile.read_text(encoding="utf-8"))
|
||||
cv_signals = extract_cv_signals(cv_text)
|
||||
candidate_profile = build_candidate_profile_output(cv_signals, authored_profile)
|
||||
|
||||
save_candidate_profile_yaml(out, candidate_profile)
|
||||
try:
|
||||
save_candidate_profile_yaml(out, candidate_profile)
|
||||
except OSError as exc:
|
||||
typer.echo(f"Unable to write candidate profile to {out}: {exc}", err=True)
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
typer.echo(f"candidate profile written to {out}")
|
||||
warning_count = len(candidate_profile.warnings)
|
||||
@ -59,6 +129,147 @@ def build_profile(
|
||||
typer.echo("No warnings included.")
|
||||
|
||||
|
||||
@app.command("fetch-apec")
def fetch_apec(
    data_root: Path = typer.Option(
        Path("data"),
        "--data-root",
        file_okay=False,
        dir_okay=True,
        help="Directory containing candidate-profile.yaml and Apec run artifacts.",
    ),
) -> None:
    """Fetch, normalize, dedupe, and persist Apec listings."""

    # NOTE(review): the docstring above is kept verbatim because it is
    # presumably shown as the command's help text - confirm before rewording.
    second_precision = "%Y-%m-%dT%H:%M:%SZ"
    listing_limit = 50  # used both as the adapter cap and as a final slice

    # 1. Load the candidate profile and derive the search inputs from it.
    try:
        profile = _load_candidate_profile(data_root / "candidate-profile.yaml")
    except ValueError as exc:
        typer.echo(str(exc), err=True)
        raise typer.Exit(code=1)

    derived_queries = derive_apec_queries(profile)
    if not derived_queries:
        typer.echo("No usable Apec queries derived from candidate profile", err=True)
        raise typer.Exit(code=1)

    profile_filters = derive_apec_search_filters(profile)
    search_filters = ApecSearchFilters(
        # Fall back to fixed defaults when the profile yields no filter value.
        location=profile_filters.location or "France",
        contract_type=profile_filters.contract_type or "CDI",
    )

    # 2. Stamp the run: the id keeps microseconds, the start time does not.
    started = _utc_now().astimezone(timezone.utc)
    run_id = started.strftime("%Y-%m-%dT%H-%M-%S-%fZ")
    run_started_at = started.replace(microsecond=0).strftime(second_precision)

    # 3. Collect search results; any failure here aborts the command.
    adapter = ApecAdapter(max_listings=listing_limit)
    try:
        all_results = adapter.search(derived_queries, search_filters=search_filters)
        search_results = all_results[:listing_limit]
    except Exception as exc:  # pragma: no cover - defensive boundary
        typer.echo(f"Unable to fetch Apec search results: {exc}", err=True)
        raise typer.Exit(code=1)

    paths = apec_run_paths(data_root, run_id)
    snapshot_dir = paths["snapshots"]
    try:
        snapshot_dir.mkdir(parents=True, exist_ok=True)
    except OSError as exc:  # pragma: no cover - defensive boundary
        typer.echo(f"Unable to create Apec snapshot directory: {exc}", err=True)
        raise typer.Exit(code=1)

    normalized_listings = []
    # Errors recorded by the adapter during search (if it exposes any) are
    # carried into the run metadata alongside per-listing errors.
    listing_errors: list[ListingError] = list(getattr(adapter, "search_errors", []))
    snapshot_metadata: list[ApecSnapshotMeta] = []
    fetched_count = 0
    successful_fetch_count = 0

    # Use the adapter's browser session when it offers one; otherwise a no-op
    # context manager keeps the loop below identical.
    browser_session = getattr(adapter, "browser_session", None)
    if callable(browser_session):
        session_context = browser_session()
    else:
        session_context = nullcontext()

    # 4. Fetch, snapshot and normalize each listing; failures are recorded
    #    per stage and never abort the loop.
    with session_context:
        for fetched_count, result in enumerate(search_results, start=1):
            fetched_at = (
                _utc_now()
                .astimezone(timezone.utc)
                .replace(microsecond=0)
                .strftime(second_precision)
            )

            try:
                html = adapter.fetch_listing_html(result.url)
            except Exception as exc:  # pragma: no cover - defensive boundary
                listing_errors.append(
                    ListingError(url=result.url, stage="fetch_html", message=str(exc))
                )
                continue

            successful_fetch_count += 1

            snapshot_stem = _snapshot_stem(result.url, result.source_job_id)
            snapshot_path = snapshot_dir / f"{snapshot_stem}.html"
            snapshot_meta = ApecSnapshotMeta(
                url=result.url,
                source_job_id=result.source_job_id,
                snapshot_file=None,
                fetched_at=fetched_at,
            )

            try:
                snapshot_path.write_text(html, encoding="utf-8")
            except Exception as exc:  # pragma: no cover - defensive boundary
                listing_errors.append(
                    ListingError(url=result.url, stage="snapshot_write", message=str(exc))
                )
            else:
                # Only reference the file once it has really been written.
                snapshot_meta.snapshot_file = snapshot_path.name

            snapshot_metadata.append(snapshot_meta)

            # A failed snapshot write does not prevent normalization.
            try:
                listing = normalize_apec_listing(
                    url=result.url,
                    html=html,
                    fetched_at=fetched_at,
                    source_job_id=result.source_job_id,
                )
            except Exception as exc:  # pragma: no cover - defensive boundary
                listing_errors.append(
                    ListingError(url=result.url, stage="normalize", message=str(exc))
                )
                continue

            normalized_listings.append(listing)

    # There were results, yet not a single page could be downloaded.
    if search_results and successful_fetch_count == 0:
        typer.echo("No listings could be fetched or normalized from Apec", err=True)
        raise typer.Exit(code=1)

    # 5. Dedupe and summarize the run.
    deduplicated_listings = dedupe_apec_listings(normalized_listings)
    failed_urls = {error.url for error in listing_errors}
    failed_count = len(failed_urls)  # distinct URLs with at least one error
    normalized_count = len(normalized_listings)
    deduplicated_count = len(deduplicated_listings)

    run_meta = ApecRunMeta(
        run_id=run_id,
        run_started_at=run_started_at,
        derived_queries=derived_queries,
        snapshots=snapshot_metadata,
        fetched_count=fetched_count,
        normalized_count=normalized_count,
        deduplicated_count=deduplicated_count,
        failed_count=failed_count,
        listing_errors=listing_errors,
    )

    # 6. Persist both artifacts; attempt the second even if the first fails so
    #    that every write problem is reported together.
    artifact_write_errors: list[str] = []

    try:
        listings_payload = [listing.model_dump(mode="json") for listing in deduplicated_listings]
        _write_yaml(paths["listings"], listings_payload)
    except OSError as exc:  # pragma: no cover - defensive boundary
        artifact_write_errors.append(f"listings.yaml: {exc}")

    try:
        run_meta_payload = run_meta.model_dump(mode="json")
        _write_yaml(paths["run_meta"], run_meta_payload)
    except OSError as exc:  # pragma: no cover - defensive boundary
        artifact_write_errors.append(f"run-meta.yaml: {exc}")

    if artifact_write_errors:
        typer.echo(f"Unable to write Apec run artifacts: {'; '.join(artifact_write_errors)}", err=True)
        raise typer.Exit(code=1)

    typer.echo(
        f"query={len(derived_queries)} fetched={fetched_count} normalized={normalized_count} "
        f"deduplicated={deduplicated_count} failed={failed_count}"
    )
|
||||
|
||||
|
||||
def main() -> None:
    """Run the command-line application by invoking ``app``."""
    app()
|
||||
|
||||
|
||||
@ -21,6 +21,51 @@ class WarningItem(BaseModel):
|
||||
message: str
|
||||
|
||||
|
||||
class ListingWarning(BaseModel):
    # A non-fatal remark attached to one field of a listing.
    # (Documented with comments rather than a docstring so the generated
    # model schema stays unchanged.)

    # Name of the field the warning is about.
    field: str
    # Human-readable explanation.
    message: str
|
||||
|
||||
|
||||
class ListingError(BaseModel):
    # A failure recorded while processing one URL during an Apec run.
    # (Comments instead of a docstring keep the model schema unchanged.)

    # URL being processed when the failure happened.
    url: str
    # Pipeline step that failed; callers in this codebase use values such as
    # "search", "fetch_html", "snapshot_write" and "normalize".
    stage: str
    # Text of the underlying error.
    message: str
|
||||
|
||||
|
||||
class ApecSnapshotMeta(BaseModel):
    # Bookkeeping for one fetched listing page and its saved HTML snapshot.
    # (Comments instead of a docstring keep the model schema unchanged.)

    # Listing URL that was fetched.
    url: str
    # Job identifier supplied with the search result, when one is known.
    source_job_id: str | None = None
    # File name of the saved snapshot; stays None when nothing was written.
    snapshot_file: str | None = None
    # Fetch timestamp as a string.
    fetched_at: str
|
||||
|
||||
|
||||
class ApecListing(BaseModel):
    # One job listing in normalized form. Only ``source``, ``url`` and
    # ``fetched_at`` are required; every other field defaults to None (or an
    # empty list for ``warnings``).
    # (Comments instead of a docstring keep the model schema unchanged.)

    # Identification of the listing.
    source: str
    source_job_id: str | None = None
    url: str

    # Descriptive content.
    title: str | None = None
    company: str | None = None
    location: str | None = None
    contract_type: str | None = None
    description_text: str | None = None

    # Timestamps, all carried as strings.
    published_at: str | None = None
    refreshed_at: str | None = None
    fetched_at: str

    # Field-level remarks attached to this listing.
    warnings: list[ListingWarning] = Field(default_factory=list)
|
||||
|
||||
|
||||
class ApecRunMeta(BaseModel):
    # Summary of a single Apec fetch run: identity, inputs, snapshot records,
    # counters and the errors collected along the way.
    # (Comments instead of a docstring keep the model schema unchanged.)

    # Run identity.
    run_id: str
    run_started_at: str

    # Search queries used for the run.
    derived_queries: list[str] = Field(default_factory=list)

    # One entry per listing page for which a snapshot was attempted.
    snapshots: list[ApecSnapshotMeta] = Field(default_factory=list)

    # Counters, all defaulting to zero.
    fetched_count: int = 0
    normalized_count: int = 0
    deduplicated_count: int = 0
    failed_count: int = 0

    # Detailed error records.
    listing_errors: list[ListingError] = Field(default_factory=list)
|
||||
|
||||
|
||||
class CandidateProfileOutput(BaseModel):
|
||||
name: str | None = None
|
||||
summary: str | None = None
|
||||
|
||||
@ -5,6 +5,8 @@ from pathlib import Path
|
||||
|
||||
from pypdf import PdfReader
|
||||
|
||||
from job_research.models import WarningItem
|
||||
|
||||
|
||||
# Space-padded connector words, English (" at ") and French (" chez ", " au ", " à ").
# NOTE(review): presumably used to split "<title> <connector> <company>"
# experience lines - the consuming code is outside this excerpt; confirm.
EXPERIENCE_LINE_CONNECTORS = (" at ", " chez ", " au ", " à ")
|
||||
|
||||
@ -150,6 +152,15 @@ YEARS_OF_EXPERIENCE_PATTERNS = (
|
||||
re.compile(r"^ann[ée]es d['’]exp[ée]rience\s*:\s*(\d+)\s*$", re.IGNORECASE),
|
||||
)
|
||||
|
||||
# Patterns whose match marks a candidate "name" line as suspicious:
#   1. document-header words (cv, resume, curriculum vitae, profile);
#   2. separator / contact characters (|, / or @);
#   3. common job-title phrases.
# The first and third are case-insensitive.
LOW_CONFIDENCE_NAME_PATTERNS = (
    re.compile(
        r"\b(cv|resume|curriculum vitae|profile)\b",
        flags=re.IGNORECASE,
    ),
    re.compile(r"[|/@]"),
    re.compile(
        r"\b(data engineer|software engineer|developer|analyst|scientist|consultant|architect|manager|product owner|backend|frontend|full stack)\b",
        flags=re.IGNORECASE,
    ),
)
|
||||
|
||||
|
||||
def extract_pdf_text(path: Path) -> str:
|
||||
reader = PdfReader(str(path))
|
||||
@ -168,8 +179,17 @@ def extract_pdf_text(path: Path) -> str:
|
||||
def extract_cv_signals(text: str) -> dict[str, object]:
|
||||
lines = [_normalize_line(line) for line in text.splitlines()]
|
||||
non_empty_lines = [line for line in lines if line]
|
||||
warnings: list[WarningItem] = []
|
||||
|
||||
name = non_empty_lines[0] if non_empty_lines else None
|
||||
if name is not None and _looks_like_low_confidence_name(name):
|
||||
warnings.append(
|
||||
WarningItem(
|
||||
field="name",
|
||||
message="First CV line looks like a header or tagline; review manually.",
|
||||
)
|
||||
)
|
||||
|
||||
location = None
|
||||
languages: list[str] = []
|
||||
skills: list[str] = []
|
||||
@ -247,6 +267,7 @@ def extract_cv_signals(text: str) -> dict[str, object]:
|
||||
"skills": skills,
|
||||
"experience_entries": experience_entries,
|
||||
"education_entries": education_entries,
|
||||
"warnings": warnings,
|
||||
}
|
||||
|
||||
if years_of_experience is not None:
|
||||
@ -369,3 +390,14 @@ def _looks_like_experience_title(title: str) -> bool:
|
||||
|
||||
def _looks_like_prose_company(company: str) -> bool:
    """Return True when any ``EXPERIENCE_PROSE_COMPANY_PATTERNS`` entry matches *company*."""
    for pattern in EXPERIENCE_PROSE_COMPANY_PATTERNS:
        if pattern.search(company):
            return True
    return False
|
||||
|
||||
|
||||
def _looks_like_low_confidence_name(name: str) -> bool:
|
||||
normalized = " ".join(name.split())
|
||||
if not normalized:
|
||||
return True
|
||||
|
||||
if len(normalized.split()) > 4:
|
||||
return True
|
||||
|
||||
return any(pattern.search(normalized) for pattern in LOW_CONFIDENCE_NAME_PATTERNS)
|
||||
|
||||
@ -33,6 +33,7 @@ def build_candidate_profile_output(
|
||||
warnings: list[WarningItem] = []
|
||||
|
||||
_append_years_of_experience_warning(cv_signals, authored.notes, warnings)
|
||||
_append_cv_extraction_warnings(cv_signals, warnings)
|
||||
_append_missing_cv_fact_warnings(cv_signals, warnings)
|
||||
|
||||
merged_skills: list[str] = []
|
||||
@ -99,6 +100,15 @@ def _append_missing_cv_fact_warnings(
|
||||
warnings.append(WarningItem(field=field, message=message))
|
||||
|
||||
|
||||
def _append_cv_extraction_warnings(
    cv_signals: dict[str, object], warnings: list[WarningItem]
) -> None:
    """Append the warnings stored under ``cv_signals["warnings"]`` to *warnings*.

    Entries that are already ``WarningItem`` instances are appended as-is;
    anything else is validated into a ``WarningItem`` first. A missing or
    falsy "warnings" value appends nothing. *warnings* is mutated in place.
    """
    extracted = cv_signals.get("warnings") or []
    for entry in extracted:
        if not isinstance(entry, WarningItem):
            entry = WarningItem.model_validate(entry)
        warnings.append(entry)
|
||||
|
||||
|
||||
def _note_years_of_experience(note: str) -> int | None:
|
||||
normalized = note.casefold().replace("’", "'")
|
||||
if not any(marker in normalized for marker in EXPERIENCE_NOTE_MARKERS):
|
||||
|
||||
@ -17,14 +17,15 @@ class AuthoredProfile:
|
||||
notes: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
REQUIRED_SECTIONS = {
|
||||
REQUIRED_SECTION_NAMES = (
|
||||
"summary",
|
||||
"target roles",
|
||||
"strengths",
|
||||
"skills to emphasize",
|
||||
"constraints",
|
||||
"notes",
|
||||
}
|
||||
)
|
||||
REQUIRED_SECTIONS = set(REQUIRED_SECTION_NAMES)
|
||||
|
||||
|
||||
def parse_profile_markdown(markdown: str) -> AuthoredProfile:
|
||||
@ -45,6 +46,10 @@ def parse_profile_markdown(markdown: str) -> AuthoredProfile:
|
||||
missing_text = ", ".join(sorted(missing))
|
||||
raise ValueError(f"Missing required markdown sections: {missing_text}")
|
||||
|
||||
for section_name in REQUIRED_SECTION_NAMES:
|
||||
if not _has_usable_section_content(sections[section_name]):
|
||||
raise ValueError(f"Missing usable content in section '{section_name}'")
|
||||
|
||||
return AuthoredProfile(
|
||||
summary=" ".join(sections["summary"]),
|
||||
target_roles=_parse_list_section("target roles", sections["target roles"]),
|
||||
@ -64,6 +69,8 @@ def _parse_list_section(section_name: str, lines: list[str]) -> list[str]:
|
||||
item = _strip_list_marker(line)
|
||||
if item is None:
|
||||
raise ValueError(f"Unsupported content in section '{section_name}': {line}")
|
||||
if not item:
|
||||
raise ValueError(f"Missing usable content in section '{section_name}'")
|
||||
items.append(item)
|
||||
|
||||
return items
|
||||
@ -74,6 +81,8 @@ def _parse_notes_section(lines: list[str]) -> list[str]:
|
||||
|
||||
for line in lines:
|
||||
item = _strip_list_marker(line)
|
||||
if item == "":
|
||||
raise ValueError("Missing usable content in section 'notes'")
|
||||
notes.append(item if item is not None else line)
|
||||
|
||||
return notes
|
||||
@ -81,7 +90,13 @@ def _parse_notes_section(lines: list[str]) -> list[str]:
|
||||
|
||||
def _strip_list_marker(line: str) -> str | None:
    """Strip a leading list marker from *line*.

    Returns:
        ``""`` when *line* consists solely of a marker (compared against the
        marker with surrounding whitespace removed), the stripped remainder
        when *line* starts with a marker, and ``None`` when no entry of
        ``LIST_MARKERS`` applies.
    """
    for marker in LIST_MARKERS:
        bare_marker = marker.strip()
        if line == bare_marker:
            # A marker with no text after it: an empty list item.
            return ""
        if not line.startswith(marker):
            continue
        remainder = line[len(marker):]
        return remainder.strip()

    return None
|
||||
|
||||
|
||||
def _has_usable_section_content(lines: list[str]) -> bool:
|
||||
return any(line not in {"-", "*", "+"} for line in lines)
|
||||
|
||||
@ -21,3 +21,13 @@ def load_yaml(path: Path) -> dict[str, Any]:
|
||||
raise ValueError("candidate-profile YAML root must be a mapping")
|
||||
|
||||
return dict(payload)
|
||||
|
||||
|
||||
def apec_run_paths(data_root: Path, run_id: str) -> dict[str, Path]:
    """Return the artifact paths of one Apec run.

    Everything lives under ``<data_root>/apec/runs/<run_id>``. The mapping has
    the keys ``run_dir``, ``listings``, ``run_meta`` and ``snapshots`` (in that
    order). Nothing is created on disk.
    """
    run_dir = data_root.joinpath("apec", "runs", run_id)
    artifact_names = {
        "listings": "listings.yaml",
        "run_meta": "run-meta.yaml",
        "snapshots": "snapshots",
    }
    paths: dict[str, Path] = {"run_dir": run_dir}
    for key, file_name in artifact_names.items():
        paths[key] = run_dir / file_name
    return paths
|
||||
|
||||
701
tests/apec/test_adapter.py
Normal file
701
tests/apec/test_adapter.py
Normal file
@ -0,0 +1,701 @@
|
||||
from contextlib import contextmanager
|
||||
from urllib.parse import parse_qs, quote_plus, urlparse
|
||||
|
||||
import pytest
|
||||
from playwright.sync_api import Error as PlaywrightError
|
||||
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
|
||||
|
||||
from job_research.apec import adapter as adapter_module
|
||||
from job_research.apec.adapter import ApecAdapter, ApecSearchFilters
|
||||
|
||||
|
||||
# CSS selector for links to listing detail pages; the fake page and locator
# below serve their canned results for this selector. Same value as the
# adapter module's constant of the same name.
_RESULT_LINK_SELECTOR = "a[href*='/candidat/recherche-emploi.html/emploi/detail-offre/']"
|
||||
# CSS selector of the keyword search input; the fake page treats its presence
# as "search page is ready". Same value as the adapter module's constant.
_SEARCH_INPUT_SELECTOR = 'input[name="keywords"]'
|
||||
|
||||
|
||||
class _FakeResultButton:
|
||||
def __init__(self, page, name: str) -> None:
|
||||
self.page = page
|
||||
self.name = name
|
||||
|
||||
def click(self, timeout: int | None = None) -> None:
|
||||
if self.name == "Rechercher":
|
||||
self.page.url = (
|
||||
"https://www.apec.fr/candidat/recherche-emploi.html/emploi"
|
||||
f"?motsCles={quote_plus(self.page.current_query)}&page=0"
|
||||
)
|
||||
self.page.current_page = 0
|
||||
elif self.name in {"ACCEPTER", "Accepter tous les cookies"}:
|
||||
self.page.consent_button_clicks.append(self.name)
|
||||
self.page.consent_accepted = True
|
||||
|
||||
|
||||
class _FakeLocator:
|
||||
def __init__(self, page, selector: str) -> None:
|
||||
self.page = page
|
||||
self.selector = selector
|
||||
|
||||
def fill(self, value: str) -> None:
|
||||
self.page.current_query = value
|
||||
|
||||
def check(self, timeout: int | None = None) -> None:
|
||||
if self.selector == 'input[name="cguAcceptees"]':
|
||||
self.page.cgu_checkbox_checked = True
|
||||
return None
|
||||
|
||||
raise PlaywrightTimeoutError(f"selector not found: {self.selector}")
|
||||
|
||||
def evaluate_all(self, function: str):
|
||||
if self.selector == _RESULT_LINK_SELECTOR:
|
||||
return list(self.page.current_results())
|
||||
|
||||
return []
|
||||
|
||||
|
||||
class _FakeDetailPage:
|
||||
def __init__(
|
||||
self,
|
||||
result_pages: dict[str, dict[int, list[str]]],
|
||||
*,
|
||||
rendered_html: str = "<html>rendered</html>",
|
||||
search_ready: bool = True,
|
||||
zero_result_queries: set[str] | None = None,
|
||||
consent_required: bool = False,
|
||||
) -> None:
|
||||
self.result_pages = result_pages
|
||||
self.rendered_html = rendered_html
|
||||
self.shell_html = "<html>shell</html>"
|
||||
self.waited_functions: list[tuple[str, int | None]] = []
|
||||
self.search_ready = search_ready
|
||||
self.zero_result_queries = zero_result_queries or set()
|
||||
self.consent_required = consent_required
|
||||
self.cgu_checkbox_checked = False
|
||||
self.consent_button_clicks: list[str] = []
|
||||
self.consent_accepted = not consent_required
|
||||
self.goto_urls: list[str] = []
|
||||
self.current_query = ""
|
||||
self.current_page = 0
|
||||
self.url = ""
|
||||
self.rendered = False
|
||||
self.default_timeout: int | None = None
|
||||
self.closed = False
|
||||
|
||||
def goto(self, url: str, wait_until: str | None = None) -> None:
|
||||
self.goto_urls.append(url)
|
||||
self.url = url
|
||||
|
||||
parsed_url = urlparse(url)
|
||||
params = parse_qs(parsed_url.query)
|
||||
if "motsCles" in params:
|
||||
self.current_query = params["motsCles"][0]
|
||||
if "page" in params:
|
||||
self.current_page = int(params["page"][0])
|
||||
|
||||
if self.current_query in self.zero_result_queries and "/detail-offre/" not in parsed_url.path:
|
||||
self.url = (
|
||||
f"{parsed_url.scheme}://{parsed_url.netloc}"
|
||||
f"{parsed_url.path}/recherche-avancee?{parsed_url.query}&error=true"
|
||||
)
|
||||
|
||||
if "/detail-offre/" in parsed_url.path:
|
||||
self.rendered = False
|
||||
|
||||
def wait_for_load_state(self, state: str) -> None:
|
||||
return None
|
||||
|
||||
def set_default_timeout(self, timeout: int) -> None:
|
||||
self.default_timeout = timeout
|
||||
|
||||
def wait_for_function(self, function: str, polling: int | None = None, timeout: int | None = None) -> None:
|
||||
if self.consent_required and not self.consent_accepted:
|
||||
raise PlaywrightTimeoutError("consent not accepted")
|
||||
|
||||
self.waited_functions.append((function, polling))
|
||||
self.rendered = True
|
||||
return None
|
||||
|
||||
def wait_for_selector(self, selector: str, timeout: int | None = None) -> None:
|
||||
if selector == _SEARCH_INPUT_SELECTOR:
|
||||
if self.search_ready and self.consent_accepted:
|
||||
return None
|
||||
|
||||
raise PlaywrightTimeoutError(f"selector not found: {selector}")
|
||||
|
||||
if selector == _RESULT_LINK_SELECTOR and self.current_results():
|
||||
return None
|
||||
|
||||
raise PlaywrightTimeoutError(f"selector not found: {selector}")
|
||||
|
||||
def get_by_role(self, role: str, name: str):
|
||||
return _FakeResultButton(self, name)
|
||||
|
||||
def locator(self, selector: str):
|
||||
return _FakeLocator(self, selector)
|
||||
|
||||
def content(self) -> str:
|
||||
return self.rendered_html if self.rendered else self.shell_html
|
||||
|
||||
def current_results(self) -> list[str]:
|
||||
return self.result_pages.get(self.current_query, {}).get(self.current_page, [])
|
||||
|
||||
def close(self) -> None:
|
||||
self.closed = True
|
||||
|
||||
|
||||
@contextmanager
def _fake_open_public_page(page: _FakeDetailPage):
    """Context manager yielding the supplied fake *page*; no setup or teardown is performed."""
    yield page
|
||||
|
||||
|
||||
def test_search_continues_past_duplicate_only_pages(monkeypatch) -> None:
    """The second query's first page only repeats a known result; page 1 must still be visited."""
    first_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"
    second_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222?motsCles=beta&page=1&selectedIndex=0"
    fake_page = _FakeDetailPage(
        {
            "alpha": {0: [first_result], 1: []},
            "beta": {0: [first_result], 1: [second_result], 2: []},
        }
    )
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))

    filters = ApecSearchFilters(location="France", contract_type="CDI")
    found = ApecAdapter(max_listings=10).search(["alpha", "beta"], search_filters=filters)

    assert [item.url for item in found] == [first_result, second_result]
    assert [item.source_job_id for item in found] == ["111", "222"]

    # The very first navigation carries the query plus both filter ids.
    first_navigation = fake_page.goto_urls[0]
    assert "motsCles=alpha" in first_navigation
    assert "lieux=799" in first_navigation
    assert "typesContrat=101888" in first_navigation

    # The filters are still present when paginating the second query.
    beta_page_one_fragments = ("motsCles=beta", "lieux=799", "typesContrat=101888", "page=1")
    assert any(
        all(fragment in url for fragment in beta_page_one_fragments)
        for url in fake_page.goto_urls
    )
    assert any("page=1" in url for url in fake_page.goto_urls)
|
||||
|
||||
|
||||
def test_search_continues_past_duplicate_only_pages_until_a_later_hit(monkeypatch) -> None:
    """Pages 1 and 2 only repeat the first result; the new result on page 3 must still be found."""
    first_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"
    second_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222?motsCles=beta&page=3&selectedIndex=0"
    fake_page = _FakeDetailPage(
        {
            "alpha": {0: [first_result], 1: [first_result], 2: [first_result], 3: [second_result], 4: []},
            "beta": {0: [first_result], 1: []},
        }
    )
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))

    filters = ApecSearchFilters(location="France", contract_type="CDI")
    found = ApecAdapter(max_listings=10).search(["alpha", "beta"], search_filters=filters)

    assert [item.url for item in found] == [first_result, second_result]
    for page_fragment in ("page=1", "page=3"):
        assert any(page_fragment in url for url in fake_page.goto_urls)
|
||||
|
||||
|
||||
def test_search_continues_after_query_and_pagination_navigation_failures(monkeypatch) -> None:
    """A failed first navigation and a failed page-1 load must not lose the results already reachable."""
    first_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=beta&page=0&selectedIndex=0"
    second_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222?motsCles=beta&page=1&selectedIndex=0"
    fake_page = _FakeDetailPage(
        {
            "beta": {0: [first_result], 1: [second_result]},
        }
    )

    # Capture the real methods before they get patched.
    real_goto = fake_page.goto
    real_wait_for_load_state = fake_page.wait_for_load_state
    navigation_attempts = 0

    def flaky_goto(url: str, wait_until: str | None = None) -> None:
        # Only the very first navigation (the "alpha" query) blows up.
        nonlocal navigation_attempts

        navigation_attempts += 1
        if navigation_attempts == 1:
            raise RuntimeError("navigation boom")

        real_goto(url, wait_until=wait_until)

    def flaky_wait_for_load_state(state: str) -> None:
        # Loading fails once the fake has moved on to result page 1.
        if fake_page.current_page == 1:
            raise RuntimeError("load boom")

        real_wait_for_load_state(state)

    monkeypatch.setattr(fake_page, "goto", flaky_goto)
    monkeypatch.setattr(fake_page, "wait_for_load_state", flaky_wait_for_load_state)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))

    filters = ApecSearchFilters(location="France", contract_type="CDI")
    found = ApecAdapter(max_listings=10).search(["alpha", "beta"], search_filters=filters)

    assert [item.url for item in found] == [first_result]
    assert [item.source_job_id for item in found] == ["111"]
|
||||
|
||||
|
||||
def test_search_stops_after_max_page_count(monkeypatch) -> None:
    """With the per-query page cap patched to 3, page index 3 must never be requested."""
    canned_pages = {
        "alpha": {
            0: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"],
            1: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222?motsCles=alpha&page=1&selectedIndex=0"],
            2: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/333?motsCles=alpha&page=2&selectedIndex=0"],
            3: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/444?motsCles=alpha&page=3&selectedIndex=0"],
        }
    }
    fake_page = _FakeDetailPage(canned_pages)

    real_goto = fake_page.goto

    def bounded_goto(url: str, wait_until: str | None = None) -> None:
        # Navigate for real, then fail loudly if pagination went too far.
        real_goto(url, wait_until=wait_until)
        if fake_page.current_page >= 3:
            raise AssertionError("pagination should stop before page 3")

    monkeypatch.setattr(fake_page, "goto", bounded_goto)
    monkeypatch.setattr(adapter_module, "_MAX_PAGES_PER_QUERY", 3)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))

    filters = ApecSearchFilters(location="France", contract_type="CDI")
    found = ApecAdapter(max_listings=10).search(["alpha"], search_filters=filters)

    assert [item.source_job_id for item in found] == ["111", "222", "333"]
    assert not any("page=3" in url for url in fake_page.goto_urls)
|
||||
|
||||
|
||||
def test_search_stops_after_consecutive_no_progress_pages(monkeypatch) -> None:
    """With the no-progress limit patched to 2, two pages repeating job 111 end pagination before page 3."""
    canned_pages = {
        "alpha": {
            0: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"],
            1: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=1&selectedIndex=0"],
            2: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=2&selectedIndex=0"],
            3: ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222?motsCles=alpha&page=3&selectedIndex=0"],
        }
    }
    fake_page = _FakeDetailPage(canned_pages)

    real_goto = fake_page.goto

    def bounded_goto(url: str, wait_until: str | None = None) -> None:
        # Navigate for real, then fail loudly if pagination went too far.
        real_goto(url, wait_until=wait_until)
        if fake_page.current_page >= 3:
            raise AssertionError("pagination should stop before page 3")

    monkeypatch.setattr(fake_page, "goto", bounded_goto)
    monkeypatch.setattr(adapter_module, "_MAX_CONSECUTIVE_NO_PROGRESS_PAGES", 2)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))

    filters = ApecSearchFilters(location="France", contract_type="CDI")
    found = ApecAdapter(max_listings=10).search(["alpha"], search_filters=filters)

    assert [item.source_job_id for item in found] == ["111"]
    assert not any("page=3" in url for url in fake_page.goto_urls)
|
||||
|
||||
|
||||
def test_search_stops_when_result_page_url_repeats(monkeypatch) -> None:
    """When navigating to page 1 lands back on the page-0 URL, page 2 must not be requested."""
    first_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"
    fake_page = _FakeDetailPage(
        {
            "alpha": {0: [first_result], 1: [first_result], 2: [first_result]},
        }
    )

    real_goto = fake_page.goto
    page_zero_url: str | None = None

    def looping_goto(url: str, wait_until: str | None = None) -> None:
        # Remember the URL of the first page-0 visit; every later page then
        # reports that same URL, as if the site had bounced back to the start.
        nonlocal page_zero_url

        real_goto(url, wait_until=wait_until)

        if page_zero_url is None and fake_page.current_page == 0:
            page_zero_url = fake_page.url
        elif page_zero_url is not None and fake_page.current_page > 0:
            fake_page.url = page_zero_url

    monkeypatch.setattr(fake_page, "goto", looping_goto)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))

    filters = ApecSearchFilters(location="France", contract_type="CDI")
    found = ApecAdapter(max_listings=10).search(["alpha"], search_filters=filters)

    assert [item.url for item in found] == [first_result]
    assert any("page=1" in url for url in fake_page.goto_urls)
    assert not any("page=2" in url for url in fake_page.goto_urls)
|
||||
|
||||
|
||||
def test_search_raises_when_every_query_fails_to_load_a_search_page(monkeypatch) -> None:
    """If the search input never appears for any query, ``ApecSearchError`` is raised."""
    fake_page = _FakeDetailPage({"alpha": {0: []}}, search_ready=False)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))

    adapter = ApecAdapter(max_listings=10)
    filters = ApecSearchFilters(location="France", contract_type="CDI")

    with pytest.raises(adapter_module.ApecSearchError):
        adapter.search(["alpha", "beta"], search_filters=filters)
|
||||
|
||||
|
||||
def test_search_treats_zero_results_redirect_as_usable_and_records_other_failures(monkeypatch) -> None:
    """A zero-results redirect is not an error; an empty page without that redirect is recorded as one."""
    fake_page = _FakeDetailPage(
        {"alpha": {0: []}, "beta": {0: []}},
        zero_result_queries={"alpha"},
    )
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))

    adapter = ApecAdapter(max_listings=10)
    filters = ApecSearchFilters(location="France", contract_type="CDI")
    found = adapter.search(["alpha", "beta"], search_filters=filters)

    assert found == []
    # Only "beta" (no redirect, no results) ends up in the error list.
    recorded_stages = [error.stage for error in adapter.search_errors]
    assert recorded_stages == ["search"]
    assert "beta" in adapter.search_errors[0].url
|
||||
|
||||
|
||||
def test_search_raises_when_every_query_renders_broken_search_shell(monkeypatch) -> None:
    """Search input present but no results and no zero-results redirect for any query: raise and record each."""
    fake_page = _FakeDetailPage({"alpha": {0: []}, "beta": {0: []}}, search_ready=True)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))

    adapter = ApecAdapter(max_listings=10)
    filters = ApecSearchFilters(location="France", contract_type="CDI")

    with pytest.raises(adapter_module.ApecSearchError):
        adapter.search(["alpha", "beta"], search_filters=filters)

    # One "search" error per query is kept on the adapter even though it raised.
    recorded_stages = [error.stage for error in adapter.search_errors]
    assert recorded_stages == ["search", "search"]
|
||||
|
||||
|
||||
def test_search_accepts_current_cgu_popin_before_waiting_for_results(monkeypatch) -> None:
    """With consent required, the checkbox is ticked and "ACCEPTER" clicked before results are read."""
    first_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"
    fake_page = _FakeDetailPage({"alpha": {0: [first_result]}}, consent_required=True)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))

    filters = ApecSearchFilters(location="France", contract_type="CDI")
    found = ApecAdapter(max_listings=10).search(["alpha"], search_filters=filters)

    assert [item.url for item in found] == [first_result]
    assert fake_page.cgu_checkbox_checked is True
    assert fake_page.consent_button_clicks == ["ACCEPTER"]
    assert fake_page.consent_accepted is True
|
||||
|
||||
|
||||
def test_search_ignores_unexpected_consent_widget_playwright_errors(monkeypatch) -> None:
    """A Playwright error raised by the consent button must not stop the search."""
    first_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"
    fake_page = _FakeDetailPage({"alpha": {0: [first_result]}})

    class _ExplodingConsentButton:
        # Every click fails with a generic Playwright error.
        def click(self, timeout: int | None = None) -> None:
            raise PlaywrightError("consent widget boom")

    def exploding_get_by_role(role: str, name: str):
        return _ExplodingConsentButton()

    monkeypatch.setattr(fake_page, "get_by_role", exploding_get_by_role)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(fake_page))

    filters = ApecSearchFilters(location="France", contract_type="CDI")
    found = ApecAdapter(max_listings=10).search(["alpha"], search_filters=filters)

    assert [item.url for item in found] == [first_result]
    assert fake_page.cgu_checkbox_checked is True
|
||||
|
||||
|
||||
def test_search_records_pagination_navigation_failures(monkeypatch) -> None:
    """A goto failure on the second results page is recorded as a search error.

    Navigation to any URL containing ``page=1`` raises RuntimeError; the first
    page's result must still be returned and exactly one error recorded.
    """
    first_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"
    second_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222?motsCles=alpha&page=1&selectedIndex=0"
    page = _FakeDetailPage({"alpha": {0: [first_result], 1: [second_result]}})

    real_goto = page.goto

    def flaky_goto(url: str, wait_until: str | None = None) -> None:
        # Only first-page navigations reach the real fake implementation.
        if "page=1" not in url:
            real_goto(url, wait_until=wait_until)
            return
        raise RuntimeError("navigation boom")

    monkeypatch.setattr(page, "goto", flaky_goto)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(page))

    adapter = ApecAdapter(max_listings=10)
    filters = ApecSearchFilters(location="France", contract_type="CDI")
    results = adapter.search(["alpha"], search_filters=filters)

    returned_urls = [result.url for result in results]
    assert returned_urls == [first_result]

    recorded_stages = [error.stage for error in adapter.search_errors]
    assert recorded_stages == ["search"]

    failure = adapter.search_errors[0]
    assert "page=1" in failure.url
    assert failure.message == "page 1 navigation failed"
|
||||
|
||||
|
||||
def test_search_records_pagination_render_failures(monkeypatch) -> None:
    """A result-selector timeout on page 1 is recorded as a render failure.

    Waiting for the result-link selector raises PlaywrightTimeoutError while the
    fake reports ``current_page == 1``; the first page's result is still returned.
    """
    first_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"
    second_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222?motsCles=alpha&page=1&selectedIndex=0"
    pages_by_query = {"alpha": {0: [first_result], 1: [second_result]}}
    page = _FakeDetailPage(pages_by_query)

    real_wait_for_selector = page.wait_for_selector

    def flaky_wait_for_selector(selector: str, timeout: int | None = None) -> None:
        waiting_for_results = selector == _RESULT_LINK_SELECTOR
        if waiting_for_results and page.current_page == 1:
            raise PlaywrightTimeoutError(f"selector not found: {selector}")
        real_wait_for_selector(selector, timeout=timeout)

    monkeypatch.setattr(page, "wait_for_selector", flaky_wait_for_selector)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(page))

    adapter = ApecAdapter(max_listings=10)
    filters = ApecSearchFilters(location="France", contract_type="CDI")
    results = adapter.search(["alpha"], search_filters=filters)

    returned_urls = [result.url for result in results]
    assert returned_urls == [first_result]

    recorded_stages = [error.stage for error in adapter.search_errors]
    assert recorded_stages == ["search"]

    failure = adapter.search_errors[0]
    assert "page=1" in failure.url
    assert failure.message == "page 1 results did not render"
|
||||
|
||||
|
||||
def test_search_records_evaluate_all_failures_and_continues_to_next_query(monkeypatch) -> None:
    """An evaluate_all failure on one query is recorded and the next query still runs.

    ``evaluate_all`` on the result-link locator raises for query "alpha" on page 1.
    With ``_MAX_CONSECUTIVE_NO_PROGRESS_PAGES`` patched to 1, both queries' first
    results are expected, plus a single search error whose URL contains ``page=1``.
    """
    first_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111?motsCles=alpha&page=0&selectedIndex=0"
    second_result = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222?motsCles=beta&page=0&selectedIndex=0"
    pages_by_query = {
        "alpha": {0: [first_result], 1: [first_result]},
        "beta": {0: [second_result], 1: [second_result]},
    }
    page = _FakeDetailPage(pages_by_query)

    real_locator = page.locator

    class _FlakyLocator:
        """Locator proxy that fails evaluate_all for alpha/page 1 and delegates the rest."""

        def __init__(self, locator) -> None:
            self._locator = locator

        def evaluate_all(self, function: str):
            should_fail = (
                self._locator.selector == _RESULT_LINK_SELECTOR
                and page.current_query == "alpha"
                and page.current_page == 1
            )
            if should_fail:
                raise RuntimeError("evaluate boom")
            return self._locator.evaluate_all(function)

        def __getattr__(self, name: str):
            # Anything not overridden here is served by the wrapped locator.
            return getattr(self._locator, name)

    def flaky_locator(selector: str):
        wrapped = real_locator(selector)
        return _FlakyLocator(wrapped)

    monkeypatch.setattr(page, "locator", flaky_locator)
    monkeypatch.setattr(adapter_module, "_MAX_CONSECUTIVE_NO_PROGRESS_PAGES", 1)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(page))

    adapter = ApecAdapter(max_listings=10)
    filters = ApecSearchFilters(location="France", contract_type="CDI")
    results = adapter.search(["alpha", "beta"], search_filters=filters)

    returned_urls = [result.url for result in results]
    assert returned_urls == [first_result, second_result]

    recorded_stages = [error.stage for error in adapter.search_errors]
    assert recorded_stages == ["search"]
    assert "page=1" in adapter.search_errors[0].url
|
||||
|
||||
|
||||
def test_fetch_listing_html_waits_for_rendered_offer_content(monkeypatch) -> None:
    """fetch_listing_html returns the page HTML after exactly one wait_for_function call.

    The recorded wait script must mention the three offer selectors and must not
    mention the "Descriptif du poste" heading text.
    """
    page = _FakeDetailPage({}, rendered_html="<html>rendered offer</html>")
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(page))

    detail_url = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111"
    html = ApecAdapter().fetch_listing_html(detail_url)

    assert html == "<html>rendered offer</html>"
    assert len(page.waited_functions) == 1

    recorded_wait = page.waited_functions[0]
    wait_script = recorded_wait[0]
    for selector in (".container-details-offer h1", ".ref-offre", ".details-offer-list"):
        assert selector in wait_script
    assert "Descriptif du poste" not in wait_script
    assert recorded_wait[1] == 1000
|
||||
|
||||
|
||||
def test_fetch_listing_html_accepts_current_cgu_popin_before_waiting_for_detail_content(monkeypatch) -> None:
    """Fetching a detail page built with ``consent_required=True`` still returns its HTML.

    The fake page must end up with the checkbox ticked, one "ACCEPTER" click
    recorded, and the accepted flag set.
    """
    rendered = "<html>rendered offer</html>"
    page = _FakeDetailPage({}, rendered_html=rendered, consent_required=True)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(page))

    detail_url = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111"
    html = ApecAdapter().fetch_listing_html(detail_url)

    assert html == "<html>rendered offer</html>"
    assert page.cgu_checkbox_checked is True
    assert page.consent_button_clicks == ["ACCEPTER"]
    assert page.consent_accepted is True
|
||||
|
||||
|
||||
def test_fetch_listing_html_uses_explicit_company_fallback_chain(monkeypatch) -> None:
    """The wait script names the offer selectors and contains no ``companySelectors`` token."""
    page = _FakeDetailPage({}, rendered_html="<html>rendered offer</html>")
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(page))

    detail_url = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111"
    ApecAdapter().fetch_listing_html(detail_url)

    wait_script = page.waited_functions[0][0]
    assert "companySelectors" not in wait_script
    for selector in (".container-details-offer h1", ".ref-offre", ".details-offer-list"):
        assert selector in wait_script
|
||||
|
||||
|
||||
def test_fetch_listing_html_rejects_redirected_non_apec_urls(monkeypatch) -> None:
    """fetch_listing_html raises ValueError when the page URL changes after navigation.

    The patched ``goto`` navigates normally, then rewrites ``page.url`` to the
    advanced-search error URL to simulate a redirect.
    """
    page = _FakeDetailPage({}, rendered_html="<html>redirected</html>")
    detail_url = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111"

    real_goto = page.goto

    def redirecting_goto(url: str, wait_until: str | None = None) -> None:
        real_goto(url, wait_until=wait_until)
        page.url = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/recherche-avancee?error=true"

    monkeypatch.setattr(page, "goto", redirecting_goto)
    monkeypatch.setattr(adapter_module, "_open_public_page", lambda: _fake_open_public_page(page))

    adapter = ApecAdapter()
    with pytest.raises(ValueError, match="unexpected URL after redirects"):
        adapter.fetch_listing_html(detail_url)

    assert len(page.waited_functions) == 1
    assert page.goto_urls == ["https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111"]
|
||||
|
||||
|
||||
def test_fetch_listing_html_rejects_non_apec_hosts() -> None:
    """A look-alike host (``evilapec.fr``) is rejected with ValueError."""
    lookalike_url = "https://evilapec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111"
    adapter = ApecAdapter()

    with pytest.raises(ValueError):
        adapter.fetch_listing_html(lookalike_url)
|
||||
|
||||
|
||||
def test_fetch_listing_html_reuses_browser_context_across_calls(monkeypatch) -> None:
    """Two fetches inside one ``browser_session`` launch once and open two pages.

    ``sync_playwright`` is replaced by a chain of minimal fakes
    (manager -> playwright -> chromium -> browser -> context -> page) that
    count launches and page creations.
    """

    class FakePage:
        """Page stand-in that records navigations and serves fixed HTML."""

        def __init__(self) -> None:
            self.url = ""
            self.goto_urls: list[str] = []
            self.default_timeout: int | None = None

        def set_default_timeout(self, timeout: int) -> None:
            self.default_timeout = timeout

        def goto(self, url: str, wait_until: str | None = None) -> None:
            self.goto_urls.append(url)
            self.url = url

        def wait_for_function(self, function: str, polling: int | None = None, timeout: int | None = None) -> None:
            # The fake is always "ready"; nothing to wait for.
            return None

        def content(self) -> str:
            return "<html>shared</html>"

        def close(self) -> None:
            return None

    class FakeBrowserContext:
        """Context stand-in that counts how many pages were opened."""

        def __init__(self) -> None:
            self.new_page_calls = 0

        def new_page(self) -> FakePage:
            self.new_page_calls += 1
            return FakePage()

        def close(self) -> None:
            return None

    class FakeBrowser:
        """Browser stand-in that always hands out the same context."""

        def __init__(self, browser_context: FakeBrowserContext) -> None:
            self.browser_context = browser_context

        def new_context(self) -> FakeBrowserContext:
            return self.browser_context

        def close(self) -> None:
            return None

    class FakeChromium:
        """Browser-type stand-in that counts launches."""

        def __init__(self, browser: FakeBrowser) -> None:
            self.browser = browser
            self.launch_calls = 0

        def launch(self, headless: bool = True) -> FakeBrowser:
            self.launch_calls += 1
            return self.browser

    class FakePlaywright:
        """Holder exposing only the ``chromium`` attribute."""

        def __init__(self, chromium: FakeChromium) -> None:
            self.chromium = chromium

    class FakePlaywrightManager:
        """Context manager standing in for the value returned by ``sync_playwright()``."""

        def __init__(self, chromium: FakeChromium) -> None:
            self.playwright = FakePlaywright(chromium)

        def __enter__(self) -> FakePlaywright:
            return self.playwright

        def __exit__(self, exc_type, exc, tb) -> None:
            # Returning None lets any exception propagate.
            return None

    browser_context = FakeBrowserContext()
    chromium = FakeChromium(FakeBrowser(browser_context))

    monkeypatch.setattr(adapter_module, "sync_playwright", lambda: FakePlaywrightManager(chromium))

    adapter = ApecAdapter()
    with adapter.browser_session():
        html_one = adapter.fetch_listing_html("https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/111")
        html_two = adapter.fetch_listing_html("https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/222")

    assert html_one == "<html>shared</html>"
    assert html_two == "<html>shared</html>"
    assert chromium.launch_calls == 1
    assert browser_context.new_page_calls == 2
|
||||
212
tests/apec/test_dedupe.py
Normal file
212
tests/apec/test_dedupe.py
Normal file
@ -0,0 +1,212 @@
|
||||
from job_research.apec.dedupe import dedupe_apec_listings
|
||||
from job_research.models import ApecListing
|
||||
|
||||
|
||||
def test_dedupe_apec_listings_by_url_preserves_first_listing() -> None:
    """Two listings sharing a URL collapse to the earlier one."""
    earlier = ApecListing(source="apec", url="https://example.test/job/1", fetched_at="2026-06-01T10:00:00Z")
    later = ApecListing(source="apec", url="https://example.test/job/1", fetched_at="2026-06-01T10:01:00Z")

    deduped = dedupe_apec_listings([earlier, later])

    assert deduped == [earlier]
|
||||
|
||||
|
||||
def test_dedupe_apec_listings_by_source_job_id_ignores_url_changes() -> None:
    """Listings with the same source_job_id but different URLs collapse to the earlier one."""
    earlier = ApecListing(
        source="apec", source_job_id="job-123", url="https://example.test/job/1", fetched_at="2026-06-01T10:00:00Z"
    )
    later = ApecListing(
        source="apec", source_job_id="job-123", url="https://example.test/job/2", fetched_at="2026-06-01T10:01:00Z"
    )

    deduped = dedupe_apec_listings([earlier, later])

    assert deduped == [earlier]
|
||||
|
||||
|
||||
def test_dedupe_apec_listings_collapses_mixed_key_duplicates() -> None:
    """A row with a job id and a row without one, sharing a URL, collapse to the earlier row."""
    with_job_id = ApecListing(
        source="apec", source_job_id="job-123", url="https://example.test/job/1", fetched_at="2026-06-01T10:00:00Z"
    )
    without_job_id = ApecListing(
        source="apec", source_job_id=None, url="https://example.test/job/1", fetched_at="2026-06-01T10:01:00Z"
    )

    deduped = dedupe_apec_listings([with_job_id, without_job_id])

    assert deduped == [with_job_id]
|
||||
|
||||
|
||||
def test_dedupe_apec_listings_keeps_secondary_ids_from_skipped_rows() -> None:
    """A job id seen only on a skipped duplicate still links a later row to the survivor.

    Row 2 shares row 1's URL and introduces "job-123"; row 3 shares only that job
    id. All three are expected to collapse to row 1.
    """
    url_only = ApecListing(source="apec", source_job_id=None, url="url1", fetched_at="2026-06-01T10:00:00Z")
    same_url_with_id = ApecListing(source="apec", source_job_id="job-123", url="url1", fetched_at="2026-06-01T10:01:00Z")
    same_id_other_url = ApecListing(source="apec", source_job_id="job-123", url="url2", fetched_at="2026-06-01T10:02:00Z")

    rows = [url_only, same_url_with_id, same_id_other_url]
    deduped = dedupe_apec_listings(rows)

    assert deduped == [url_only]
|
||||
|
||||
|
||||
def test_dedupe_apec_listings_merges_metadata_from_duplicate_rows() -> None:
    """Fields missing on the first row are filled in from its duplicate."""
    sparse = ApecListing(
        source="apec",
        source_job_id=None,
        published_at=None,
        refreshed_at=None,
        url="url1",
        fetched_at="2026-06-01T10:00:00Z",
    )
    detailed = ApecListing(
        source="apec",
        source_job_id="job-123",
        published_at="2026-06-01",
        refreshed_at="2026-06-02",
        url="url1",
        fetched_at="2026-06-01T10:01:00Z",
    )

    deduped = dedupe_apec_listings([sparse, detailed])

    assert len(deduped) == 1
    survivor = deduped[0]
    assert survivor.url == "url1"
    assert survivor.source_job_id == "job-123"
    assert survivor.published_at == "2026-06-01"
    assert survivor.refreshed_at == "2026-06-02"
|
||||
|
||||
|
||||
def test_dedupe_apec_listings_merges_metadata_through_alias_chain() -> None:
    """Rows linked only transitively (u1 - i2 - u2 - i4 - u6) end up in one survivor.

    The survivor keeps the first row's URL, takes "i2" as job id, and picks up
    the company that appears only on the last row.
    """
    rows = [
        ApecListing(source="apec", source_job_id=None, url="u1", fetched_at="2026-06-01T10:00:00Z"),
        ApecListing(source="apec", source_job_id="i2", url="u2", fetched_at="2026-06-01T10:01:00Z"),
        ApecListing(source="apec", source_job_id="i4", url="u2", fetched_at="2026-06-01T10:02:00Z"),
        ApecListing(source="apec", source_job_id="i2", url="u1", fetched_at="2026-06-01T10:03:00Z"),
        ApecListing(
            source="apec",
            source_job_id="i4",
            url="u6",
            company="NewestCo",
            fetched_at="2026-06-01T10:04:00Z",
        ),
    ]

    deduped = dedupe_apec_listings(rows)

    assert len(deduped) == 1
    survivor = deduped[0]
    assert survivor.url == "u1"
    assert survivor.source_job_id == "i2"
    assert survivor.company == "NewestCo"
|
||||
|
||||
|
||||
def test_dedupe_apec_listings_keeps_one_survivor_for_cluster_alias_chain() -> None:
    """Four rows chained through shared URLs and job ids leave only the first row's identity."""
    rows = [
        ApecListing(source="apec", source_job_id="id2", url="u2", fetched_at="2026-06-01T10:00:00Z"),
        ApecListing(source="apec", source_job_id="id3", url="u2", fetched_at="2026-06-01T10:01:00Z"),
        ApecListing(source="apec", source_job_id="id2", url="u1", fetched_at="2026-06-01T10:02:00Z"),
        ApecListing(source="apec", source_job_id="id3", url="u3", fetched_at="2026-06-01T10:03:00Z"),
    ]

    deduped = dedupe_apec_listings(rows)

    assert len(deduped) == 1
    survivor = deduped[0]
    assert survivor.url == "u2"
    assert survivor.source_job_id == "id2"
|
||||
|
||||
|
||||
def test_dedupe_apec_listings_keeps_first_listing_as_bridge_survivor() -> None:
    """When a third row bridges two unrelated rows, the earliest row survives.

    Row 3 shares row 1's job id and row 2's URL; the survivor keeps row 1's URL
    and job id and takes the company from row 3.
    """
    rows = [
        ApecListing(source="apec", source_job_id="id1", url="u1", fetched_at="2026-06-01T10:00:00Z"),
        ApecListing(source="apec", source_job_id="id2", url="u2", fetched_at="2026-06-01T10:01:00Z"),
        ApecListing(
            source="apec",
            source_job_id="id1",
            url="u2",
            company="NewestCo",
            fetched_at="2026-06-01T10:02:00Z",
        ),
    ]

    deduped = dedupe_apec_listings(rows)

    assert len(deduped) == 1
    survivor = deduped[0]
    assert survivor.url == "u1"
    assert survivor.source_job_id == "id1"
    assert survivor.company == "NewestCo"
|
||||
372
tests/apec/test_normalize.py
Normal file
372
tests/apec/test_normalize.py
Normal file
@ -0,0 +1,372 @@
|
||||
from job_research.apec.normalize import normalize_apec_listing
|
||||
|
||||
|
||||
def test_normalize_apec_listing_extracts_minimal_shape() -> None:
    """A fully populated detail page yields every expected field of the listing."""
    page_html = """
<html>
<body>
<main class="container-details-offer">
<h1>Data Engineer F/H</h1>
<div class="card-offer">
<div class="ref-offre">Ref. Apec : 178554452W</div>
<ul class="details-offer-list mb-20">
<li>CLOUD TEMPLE</li>
<li>1 <span> CDI </span></li>
<li>Puteaux - 92</li>
</ul>
<p>Publiée le 20/04/2026 Actualisée le 02/06/2026</p>
</div>
<article class="card card-ents mb-20">
<div class="list-hzt mb-20">
<span class="ents-name">CLOUD TEMPLE</span>
</div>
</article>
<div class="details-post">
<h4>Salaire</h4>
<span>A partir de 70 k€ brut annuel</span>
</div>
<div class="details-post">
<h4>Prise de poste</h4>
<span>Dès que possible</span>
</div>
<div class="details-post">
<h4>Expérience</h4>
<span>Minimum 7 ans</span>
</div>
<div class="details-post">
<h4>Descriptif du poste</h4>
<p>Build pipelines</p>
<div class="nested-late-sections">
<h4>Profil recherché</h4>
<p>Python / SQL</p>
<h4>Compétences attendues</h4>
<p>Ignored</p>
<h4>Entreprise</h4>
<p>Ignored</p>
<div class="recruiter">Ignored recruiter info</div>
</div>
</div>
</main>
</body>
</html>
"""

    listing = normalize_apec_listing(
        url="https://example.test/job/123",
        html=page_html,
        fetched_at="2026-06-01T10:00:00Z",
        source_job_id="job-123",
    )

    # Checked in this order so the first mismatch reported matches the field order below.
    expected_fields = {
        "source": "apec",
        "source_job_id": "job-123",
        "url": "https://example.test/job/123",
        "title": "Data Engineer F/H",
        "company": "CLOUD TEMPLE",
        "location": "Puteaux - 92",
        "contract_type": "CDI",
        "description_text": "Build pipelines",
        "published_at": "2026-04-20",
        "refreshed_at": "2026-06-02",
        "fetched_at": "2026-06-01T10:00:00Z",
    }
    for field_name, expected_value in expected_fields.items():
        assert getattr(listing, field_name) == expected_value
|
||||
|
||||
|
||||
def test_normalize_apec_listing_prefers_final_source_job_id_from_detail_page() -> None:
    """The reference shown on the page ("FINAL456") wins over the requested job id."""
    page_html = """
<html>
<body>
<main class="container-details-offer">
<h1>Data Engineer F/H</h1>
<div class="card-offer">
<div class="ref-offre">Ref. Apec : FINAL456</div>
<ul class="details-offer-list mb-20">
<li>CLOUD TEMPLE</li>
<li>1 <span> CDI </span></li>
<li>Puteaux - 92</li>
</ul>
<p>Publiée le 20/04/2026 Actualisée le 02/06/2026</p>
</div>
<article class="card card-ents mb-20">
<div class="list-hzt mb-20">
<span class="ents-name">CLOUD TEMPLE</span>
</div>
</article>
<div class="details-post">
<h4>Descriptif du poste</h4>
<p>Build pipelines</p>
</div>
</main>
</body>
</html>
"""
    requested_url = "https://www.apec.fr/candidat/recherche-emploi.html/emploi/detail-offre/REQUESTED123"

    listing = normalize_apec_listing(
        url=requested_url,
        html=page_html,
        fetched_at="2026-06-01T10:00:00Z",
        source_job_id="REQUESTED123",
    )

    assert listing.source_job_id == "FINAL456"
|
||||
|
||||
|
||||
def test_normalize_apec_listing_warns_and_returns_none_for_invalid_dates() -> None:
    """Impossible calendar dates produce None for both date fields plus one warning each."""
    page_html = """
<html>
<body>
<main class="container-details-offer">
<h1>Data Engineer F/H</h1>
<div class="card-offer">
<div class="ref-offre">Ref. Apec : 178554452W</div>
<ul class="details-offer-list mb-20">
<li>CLOUD TEMPLE</li>
<li>1 <span> CDI </span></li>
<li>Puteaux - 92</li>
</ul>
<p>Publiée le 32/13/2026 Actualisée le 31/02/2026</p>
</div>
<article class="card card-ents mb-20">
<div class="list-hzt mb-20">
<span class="ents-name">CLOUD TEMPLE</span>
</div>
</article>
<div class="details-post">
<h4>Descriptif du poste</h4>
<p>Build pipelines</p>
</div>
</main>
</body>
</html>
"""

    listing = normalize_apec_listing(
        url="https://example.test/job/123",
        html=page_html,
        fetched_at="2026-06-01T10:00:00Z",
        source_job_id="job-123",
    )

    assert listing.published_at is None
    assert listing.refreshed_at is None
    warned_fields = [warning.field for warning in listing.warnings]
    assert warned_fields == ["published_at", "refreshed_at"]
|
||||
|
||||
|
||||
def test_normalize_apec_listing_uses_details_offer_list_company_fallback() -> None:
    """Without a company card, the first ``details-offer-list`` item supplies the company."""
    page_html = """
<html>
<body>
<main class="container-details-offer">
<h1>Data Engineer F/H</h1>
<div class="card-offer">
<div class="ref-offre">Ref. Apec : 178554452W</div>
<ul class="details-offer-list mb-20">
<li>Fallback Company</li>
<li>1 <span> CDI </span></li>
<li>Puteaux - 92</li>
</ul>
<p>Publiée le 20/04/2026 Actualisée le 02/06/2026</p>
</div>
<div class="details-post">
<h4>Salaire</h4>
<span>A partir de 70 k€ brut annuel</span>
</div>
<div class="details-post">
<h4>Prise de poste</h4>
<span>Dès que possible</span>
</div>
<div class="details-post">
<h4>Expérience</h4>
<span>Minimum 7 ans</span>
</div>
<div class="details-post">
<h4>Descriptif du poste</h4>
<p>Build pipelines</p>
<div class="nested-late-sections">
<h4>Profil recherché</h4>
<p>Python / SQL</p>
<h4>Compétences attendues</h4>
<p>Ignored</p>
<h4>Entreprise</h4>
<p>Ignored</p>
<div class="recruiter">Ignored recruiter info</div>
</div>
</div>
</main>
</body>
</html>
"""

    listing = normalize_apec_listing(
        url="https://example.test/job/123",
        html=page_html,
        fetched_at="2026-06-01T10:00:00Z",
        source_job_id=None,
    )

    assert listing.company == "Fallback Company"
    assert listing.description_text == "Build pipelines"
    assert listing.refreshed_at == "2026-06-02"
|
||||
|
||||
|
||||
def test_normalize_apec_listing_records_warnings_for_fallback_and_missing_fields() -> None:
    """A sparse page yields warnings for seven fields, in a fixed order.

    The only ``<h1>`` sits outside ``<main>``, and the page has no location item,
    no description section, and no dates paragraph.
    """
    page_html = """
<html>
<body>
<main class="container-details-offer">
<div class="card-offer">
<div class="ref-offre">Ref. Apec : 178554452W</div>
<ul class="details-offer-list mb-20">
<li>Fallback Company</li>
<li>1 CDI</li>
</ul>
</div>
<article class="card card-ents mb-20">
<div class="list-hzt mb-20">
<span class="card-ents-quote">Fallback Company</span>
</div>
</article>
</main>
<h1>Fallback Title</h1>
</body>
</html>
"""

    listing = normalize_apec_listing(
        url="https://example.test/job/123",
        html=page_html,
        fetched_at="2026-06-01T10:00:00Z",
        source_job_id=None,
    )

    expected_warned_fields = [
        "title",
        "location",
        "contract_type",
        "description_text",
        "source_job_id",
        "company",
        "published_at",
    ]
    warned_fields = [warning.field for warning in listing.warnings]
    assert warned_fields == expected_warned_fields
|
||||
|
||||
|
||||
def test_normalize_apec_listing_records_warnings_for_placeholder_text_values() -> None:
    """Placeholder text ("N/A", "-") in title, contract and location becomes None with a warning."""
    page_html = """
<html>
<body>
<main class="container-details-offer">
<h1> N/A </h1>
<div class="card-offer">
<div class="ref-offre">Ref. Apec : 178554452W</div>
<ul class="details-offer-list mb-20">
<li>Example Corp</li>
<li>1 <span> N/A </span></li>
<li> - </li>
</ul>
<p>Publiée le 20/04/2026 Actualisée le 02/06/2026</p>
</div>
<article class="card card-ents mb-20">
<div class="list-hzt mb-20">
<span class="ents-name">Example Corp</span>
</div>
</article>
<div class="details-post">
<h4>Descriptif du poste</h4>
<p>Build pipelines</p>
</div>
</main>
</body>
</html>
"""

    listing = normalize_apec_listing(
        url="https://example.test/job/123",
        html=page_html,
        fetched_at="2026-06-01T10:00:00Z",
        source_job_id="178554452W",
    )

    assert listing.title is None
    assert listing.location is None
    assert listing.contract_type is None
    warned_fields = [warning.field for warning in listing.warnings]
    assert warned_fields == ["title", "location", "contract_type"]
|
||||
|
||||
|
||||
def test_normalize_apec_listing_records_warning_for_placeholder_company() -> None:
    """An "N/A" company name in the company card becomes None with a single warning."""
    page_html = """
<html>
<body>
<main class="container-details-offer">
<h1>Data Engineer F/H</h1>
<div class="card-offer">
<div class="ref-offre">Ref. Apec : 178554452W</div>
<ul class="details-offer-list mb-20">
<li>Example Corp</li>
<li>1 <span> CDI </span></li>
<li>Puteaux - 92</li>
</ul>
<p>Publiée le 20/04/2026 Actualisée le 02/06/2026</p>
</div>
<article class="card card-ents mb-20">
<div class="list-hzt mb-20">
<span class="ents-name">N/A</span>
</div>
</article>
<div class="details-post">
<h4>Descriptif du poste</h4>
<p>Build pipelines</p>
</div>
</main>
</body>
</html>
"""

    listing = normalize_apec_listing(
        url="https://example.test/job/123",
        html=page_html,
        fetched_at="2026-06-01T10:00:00Z",
        source_job_id="178554452W",
    )

    assert listing.company is None
    assert listing.refreshed_at == "2026-06-02"
    warned_fields = [warning.field for warning in listing.warnings]
    assert warned_fields == ["company"]
|
||||
|
||||
|
||||
def test_normalize_apec_listing_records_warning_for_placeholder_description_text() -> None:
    """An "N/A" job description becomes None with a single ``description_text`` warning."""
    page_html = """
<html>
<body>
<main class="container-details-offer">
<h1>Data Engineer F/H</h1>
<div class="card-offer">
<div class="ref-offre">Ref. Apec : 178554452W</div>
<ul class="details-offer-list mb-20">
<li>CLOUD TEMPLE</li>
<li>1 <span> CDI </span></li>
<li>Puteaux - 92</li>
</ul>
<p>Publiée le 20/04/2026 Actualisée le 02/06/2026</p>
</div>
<article class="card card-ents mb-20">
<div class="list-hzt mb-20">
<span class="ents-name">CLOUD TEMPLE</span>
</div>
</article>
<div class="details-post">
<h4>Descriptif du poste</h4>
<p>N/A</p>
</div>
</main>
</body>
</html>
"""

    listing = normalize_apec_listing(
        url="https://example.test/job/123",
        html=page_html,
        fetched_at="2026-06-01T10:00:00Z",
        source_job_id="job-123",
    )

    assert listing.description_text is None
    warned_fields = [warning.field for warning in listing.warnings]
    assert warned_fields == ["description_text"]
|
||||
75
tests/apec/test_query_derivation.py
Normal file
75
tests/apec/test_query_derivation.py
Normal file
@ -0,0 +1,75 @@
|
||||
from job_research.apec.adapter import ApecSearchFilters
|
||||
from job_research.apec.query_derivation import derive_apec_queries, derive_apec_search_filters
|
||||
from job_research.models import CandidateProfileOutput
|
||||
|
||||
|
||||
def test_derive_apec_queries_preserves_order_dedupes_and_caps_at_five() -> None:
    """Seven roles with one repeat yield the first five distinct roles, in input order."""
    roles = [
        "Data Engineer",
        "Analytics Engineer",
        "Data Engineer",
        "BI Engineer",
        "Junior Data Platform Engineer",
        "ML Engineer",
        "Backend Engineer",
    ]
    profile = CandidateProfileOutput(
        target_roles=roles,
        strengths=["Python", "SQL"],
        skills_to_emphasize=["BigQuery", "Terraform"],
    )

    queries = derive_apec_queries(profile)

    expected_queries = [
        "Data Engineer",
        "Analytics Engineer",
        "BI Engineer",
        "Junior Data Platform Engineer",
        "ML Engineer",
    ]
    assert queries == expected_queries
|
||||
|
||||
|
||||
def test_derive_apec_queries_uses_up_to_five_target_roles_when_no_support_terms_exist() -> None:
    """With only target roles set, the first five of six roles become the queries."""
    roles = [
        "Data Engineer",
        "Analytics Engineer",
        "BI Engineer",
        "Junior Data Platform Engineer",
        "ML Engineer",
        "Backend Engineer",
    ]
    profile = CandidateProfileOutput(target_roles=roles)

    queries = derive_apec_queries(profile)

    expected_queries = [
        "Data Engineer",
        "Analytics Engineer",
        "BI Engineer",
        "Junior Data Platform Engineer",
        "ML Engineer",
    ]
    assert queries == expected_queries
|
||||
|
||||
|
||||
def test_derive_apec_queries_uses_support_terms_without_constraints() -> None:
    """Strengths and emphasized skills extend the role query; constraints never appear."""
    profile = CandidateProfileOutput(
        target_roles=["Data Engineer"],
        strengths=["Python"],
        skills_to_emphasize=["BigQuery"],
        constraints=["CDI only", "France only"],
    )

    queries = derive_apec_queries(profile)

    expected_queries = ["Data Engineer", "Data Engineer Python", "Data Engineer BigQuery"]
    assert queries == expected_queries
|
||||
|
||||
|
||||
def test_derive_apec_search_filters_from_constraints() -> None:
    """The "CDI only" and "France only" constraints map to the matching search filters."""
    profile = CandidateProfileOutput(constraints=["CDI only", "France only"])

    derived = derive_apec_search_filters(profile)

    expected = ApecSearchFilters(location="France", contract_type="CDI")
    assert derived == expected
|
||||
@ -29,6 +29,23 @@ def test_extract_cv_signals_reads_basic_fields_from_text() -> None:
|
||||
assert len(extracted["experience_entries"]) == 2
|
||||
|
||||
|
||||
def test_extract_cv_signals_flags_low_confidence_first_line_as_name() -> None:
    """A tagline-like first line is still returned as the name, with a "name" warning."""
    cv_text = dedent(
        """
        Data Engineer | Python | GCP
        Location: France
        Languages: French, English
        Skills: Python, SQL
        Data Engineer at Company A
        """
    ).strip()

    signals = extract_cv_signals(cv_text)

    # The first line is kept verbatim as the name...
    assert signals["name"] == "Data Engineer | Python | GCP"
    # ...and exactly one warning, attached to the "name" field, is reported.
    flagged_fields = [item.field for item in signals["warnings"]]
    assert flagged_fields == ["name"]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("line", "expected"),
|
||||
[
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
from job_research.profile.merge import build_candidate_profile_output
|
||||
from job_research.profile.profile_parser import AuthoredProfile
|
||||
from job_research.models import WarningItem
|
||||
|
||||
|
||||
def test_build_candidate_profile_output_writes_warning_when_facts_conflict() -> None:
|
||||
@ -72,3 +73,34 @@ def test_build_candidate_profile_output_warns_on_missing_core_cv_facts() -> None
|
||||
"skills",
|
||||
"education_entries",
|
||||
]
|
||||
|
||||
|
||||
def test_build_candidate_profile_output_propagates_cv_extraction_warnings() -> None:
    """A warning carried in the CV signals is the only warning on the merged output."""
    warning_text = "First CV line looks like a header or tagline; review manually."
    cv_signals = {
        "name": "Data Engineer | Python | GCP",
        "location": "France",
        "languages": ["French", "English"],
        "skills": ["Python", "SQL"],
        "experience_entries": [{"title": "Data Engineer", "company": "A"}],
        "education_entries": [{"credential": "MSc", "institution": "Example University"}],
        "warnings": [WarningItem(field="name", message=warning_text)],
    }
    authored = AuthoredProfile(
        summary="Junior data engineer focused on GCP.",
        target_roles=["Data Engineer"],
        strengths=["Python"],
        skills_to_emphasize=["BigQuery", "GCP"],
        constraints=["CDI only"],
        notes=[],
    )

    merged = build_candidate_profile_output(cv_signals, authored)

    # Compared against a freshly built item so equality, not identity, is checked.
    expected_warnings = [WarningItem(field="name", message=warning_text)]
    assert merged.warnings == expected_warnings
|
||||
|
||||
@ -72,3 +72,38 @@ def test_parse_profile_markdown_rejects_unsupported_list_content() -> None:
|
||||
|
||||
with pytest.raises(ValueError, match="Unsupported content in section 'target roles'"):
|
||||
parse_profile_markdown(markdown)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("section_name", ["Target Roles", "Notes"])
def test_parse_profile_markdown_rejects_blank_bullet_only_required_sections(
    section_name: str,
) -> None:
    """A section whose only content is an empty bullet ("- ") is rejected."""
    # Start from valid content for both sections, then blank out the one under test.
    section_bodies = {
        "Target Roles": "- Data Engineer",
        "Notes": "Slight preference for French listings.",
    }
    section_bodies[section_name] = "- "
    target_roles = section_bodies["Target Roles"]
    notes = section_bodies["Notes"]

    markdown = dedent(
        f"""
        # Candidate Profile

        ## Summary
        Junior data engineer focused on Python and GCP.

        ## Target Roles
        {target_roles}

        ## Strengths
        - Python

        ## Skills To Emphasize
        - BigQuery

        ## Constraints
        - CDI only

        ## Notes
        {notes}
        """
    ).strip()

    # The error message names the section in lower case.
    expected_error = f"Missing usable content in section '{section_name.lower()}'"
    with pytest.raises(ValueError, match=expected_error):
        parse_profile_markdown(markdown)
|
||||
|
||||
1467
tests/test_apec_cli.py
Normal file
1467
tests/test_apec_cli.py
Normal file
File diff suppressed because it is too large
Load Diff
82
tests/test_apec_storage.py
Normal file
82
tests/test_apec_storage.py
Normal file
@ -0,0 +1,82 @@
|
||||
from pathlib import Path
|
||||
|
||||
from job_research.models import ApecListing, ApecRunMeta, ApecSnapshotMeta, ListingWarning
|
||||
from job_research.storage import apec_run_paths
|
||||
|
||||
|
||||
FIXED_RUN_ID = "2026-06-01T10-00-00-123456Z"
|
||||
|
||||
|
||||
def test_apec_models_serialize_expected_listing_shape() -> None:
    """Dumped ApecListing / ApecRunMeta models expose the values they were built with."""
    job_url = "https://example.test/job/123"
    fetched_at = "2026-06-01T10:00:00Z"

    location_warning = ListingWarning(
        field="location",
        message="Location inferred from page text",
    )
    listing = ApecListing(
        source="apec",
        source_job_id="123",
        url=job_url,
        title="Data Engineer",
        company="Example",
        location="Paris",
        contract_type="CDI",
        description_text="Build pipelines",
        published_at="2026-06-01",
        refreshed_at="2026-06-02",
        fetched_at=fetched_at,
        warnings=[location_warning],
    )
    snapshot_meta = ApecSnapshotMeta(
        url=job_url,
        source_job_id="123",
        snapshot_file="job-123.html",
        fetched_at=fetched_at,
    )
    run_meta = ApecRunMeta(
        run_id=FIXED_RUN_ID,
        run_started_at="2026-06-01T10:00:00Z",
        derived_queries=["Data Engineer"],
        snapshots=[snapshot_meta],
        fetched_count=1,
        normalized_count=1,
        deduplicated_count=1,
        failed_count=0,
        listing_errors=[],
    )

    # Listing: nested warnings are dumped as plain dicts.
    listing_dump = listing.model_dump()
    assert listing_dump["source"] == "apec"
    assert listing_dump["warnings"][0]["field"] == "location"
    assert listing_dump["refreshed_at"] == "2026-06-02"

    # Run metadata in the default (python) dump mode.
    run_meta_dump = run_meta.model_dump()
    assert run_meta_dump["run_id"] == FIXED_RUN_ID
    assert run_meta_dump["run_started_at"] == "2026-06-01T10:00:00Z"
    assert run_meta_dump["derived_queries"] == ["Data Engineer"]

    # Snapshots are checked in JSON mode so every value is a plain string.
    expected_snapshots = [
        {
            "url": "https://example.test/job/123",
            "source_job_id": "123",
            "snapshot_file": "job-123.html",
            "fetched_at": "2026-06-01T10:00:00Z",
        }
    ]
    assert run_meta.model_dump(mode="json")["snapshots"] == expected_snapshots
|
||||
|
||||
|
||||
def test_apec_run_paths_builds_expected_layout(tmp_path: Path) -> None:
    """apec_run_paths places every artifact under <base>/apec/runs/<run_id>."""
    run_dir = tmp_path.joinpath("apec", "runs", FIXED_RUN_ID)
    expected_layout = {
        "run_dir": run_dir,
        "listings": run_dir / "listings.yaml",
        "run_meta": run_dir / "run-meta.yaml",
        "snapshots": run_dir / "snapshots",
    }

    paths = apec_run_paths(tmp_path, run_id=FIXED_RUN_ID)

    # Checked key by key, in this order, so a failure points at one entry.
    for key, expected_path in expected_layout.items():
        assert paths[key] == expected_path
|
||||
|
||||
|
||||
def test_apec_run_artifacts_include_snapshot_and_meta(tmp_path: Path) -> None:
    """A snapshot and the run-meta file can both be written inside one run directory.

    The test name promises coverage of the snapshot *and* the run metadata, so
    both artifacts are written and read back, and both are checked to live
    under the same run directory.
    """
    paths = apec_run_paths(tmp_path, run_id=FIXED_RUN_ID)
    # Creating the snapshots directory with parents=True also creates the run
    # directory, which is where the run-meta file is written below.
    paths["snapshots"].mkdir(parents=True, exist_ok=True)

    snapshot = paths["snapshots"] / "job-123.html"
    snapshot.write_text("<html>snapshot</html>", encoding="utf-8")

    run_meta_text = f"run_id: {FIXED_RUN_ID}\n"
    paths["run_meta"].write_text(run_meta_text, encoding="utf-8")

    assert snapshot.read_text(encoding="utf-8") == "<html>snapshot</html>"
    assert paths["run_meta"].read_text(encoding="utf-8") == run_meta_text
    # Both artifacts belong to the same run directory.
    assert snapshot.parent.parent == paths["run_dir"]
    assert paths["run_meta"].parent == paths["run_dir"]
|
||||
@ -1,6 +1,10 @@
|
||||
from subprocess import run
|
||||
from textwrap import dedent
|
||||
from pathlib import Path
|
||||
|
||||
from typer.testing import CliRunner
|
||||
|
||||
from job_research.cli import app
|
||||
from job_research.storage import load_yaml
|
||||
|
||||
|
||||
@ -147,6 +151,78 @@ def test_build_profile_reports_when_no_warnings_are_included(tmp_path) -> None:
|
||||
assert "No warnings included." in result.stdout
|
||||
|
||||
|
||||
def test_build_profile_reports_output_write_failures_cleanly(tmp_path, monkeypatch) -> None:
    """An OSError while writing the output exits with code 1 and a clean stderr message."""
    cv_body = dedent(
        """
        Tonio Example
        Location: France
        Languages: French, English
        Skills: Python, SQL
        Data Engineer at Acme
        Education: Master of Science at Example University
        """
    ).strip()
    profile_body = dedent(
        """
        # Candidate Profile

        ## Summary
        Junior data engineer focused on Python and GCP.

        ## Target Roles
        - Data Engineer

        ## Strengths
        - Python
        - SQL

        ## Skills To Emphasize
        - GCP
        - BigQuery

        ## Constraints
        - CDI only
        - France only

        ## Notes
        - Slight preference for French listings.
        """
    ).strip()

    # Input files are written BEFORE write_text is patched, so they succeed.
    cv = tmp_path / "cv.txt"
    cv.write_text(cv_body, encoding="utf-8")
    profile = tmp_path / "profile.md"
    profile.write_text(profile_body, encoding="utf-8")
    out = tmp_path / "candidate-profile.yaml"

    original_write_text = Path.write_text

    def flaky_write_text(
        self: Path,
        data: str,
        encoding: str | None = None,
        errors: str | None = None,
        newline: str | None = None,
    ) -> int:
        # Only the CLI's output path fails; every other write is delegated.
        if self != out:
            return original_write_text(self, data, encoding=encoding, errors=errors, newline=newline)
        raise OSError("disk full")

    monkeypatch.setattr(Path, "write_text", flaky_write_text)

    cli_args = ["build-profile", "--cv", str(cv), "--profile", str(profile), "--out", str(out)]
    result = CliRunner().invoke(app, cli_args)

    assert result.exit_code == 1
    assert not out.exists()
    assert "Unable to write candidate profile to" in result.stderr
    assert "disk full" in result.stderr
    assert "Traceback" not in result.stderr
|
||||
|
||||
|
||||
def test_build_profile_rejects_empty_cv_text_before_writing(tmp_path) -> None:
|
||||
cv = tmp_path / "cv.txt"
|
||||
cv.write_text(" \n", encoding="utf-8")
|
||||
@ -189,3 +265,100 @@ def test_build_profile_rejects_empty_cv_text_before_writing(tmp_path) -> None:
|
||||
assert result.returncode != 0
|
||||
assert not out.exists()
|
||||
assert "No readable text found in CV input" in result.stderr
|
||||
assert "Traceback" not in result.stderr
|
||||
|
||||
|
||||
def test_build_profile_reports_unreadable_pdf_input_cleanly(tmp_path, monkeypatch) -> None:
    """A ValueError from PDF extraction exits with code 1 and a clean stderr message."""
    profile_body = dedent(
        """
        # Candidate Profile

        ## Summary
        Junior data engineer.

        ## Target Roles
        - Data Engineer

        ## Strengths
        - Python

        ## Skills To Emphasize
        - BigQuery

        ## Constraints
        - CDI only

        ## Notes
        - Slight preference for French listings.
        """
    ).strip()

    cv = tmp_path / "cv.pdf"
    cv.write_bytes(b"%PDF-1.4\n")
    profile = tmp_path / "profile.md"
    profile.write_text(profile_body, encoding="utf-8")
    out = tmp_path / "candidate-profile.yaml"

    def broken_extract_pdf_text(path):
        # Stand-in for the CLI's PDF extractor that always fails.
        raise ValueError("broken pdf")

    monkeypatch.setattr("job_research.cli.extract_pdf_text", broken_extract_pdf_text)

    cli_args = ["build-profile", "--cv", str(cv), "--profile", str(profile), "--out", str(out)]
    result = CliRunner().invoke(app, cli_args)

    assert result.exit_code == 1
    assert not out.exists()
    assert "CV input not readable" in result.stderr
    assert "broken pdf" in result.stderr
    assert "Traceback" not in result.stderr
|
||||
|
||||
|
||||
def test_build_profile_reports_malformed_profile_markdown_cleanly(tmp_path) -> None:
    """A profile whose Target Roles section is plain text exits 1 with a clean message."""
    cv_body = dedent(
        """
        Tonio Example
        Location: France
        Languages: French, English
        Skills: Python
        """
    ).strip()
    # "Target Roles" deliberately holds a bare line instead of a bullet list.
    profile_body = dedent(
        """
        # Candidate Profile

        ## Summary
        Junior data engineer.

        ## Target Roles
        Data Engineer

        ## Strengths
        - Python

        ## Skills To Emphasize
        - BigQuery

        ## Constraints
        - CDI only

        ## Notes
        - Slight preference for French listings.
        """
    ).strip()

    cv = tmp_path / "cv.txt"
    cv.write_text(cv_body, encoding="utf-8")
    profile = tmp_path / "profile.md"
    profile.write_text(profile_body, encoding="utf-8")
    out = tmp_path / "candidate-profile.yaml"

    cli_args = ["build-profile", "--cv", str(cv), "--profile", str(profile), "--out", str(out)]
    result = CliRunner().invoke(app, cli_args)

    assert result.exit_code == 1
    assert not out.exists()
    assert "profile markdown invalid" in result.stderr
    assert "Traceback" not in result.stderr
|
||||
|
||||
114
uv.lock
generated
114
uv.lock
generated
@ -20,6 +20,19 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "beautifulsoup4"
|
||||
version = "4.14.3"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "soupsieve" },
|
||||
{ name = "typing-extensions" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/c3/b0/1c6a16426d389813b48d95e26898aff79abbde42ad353958ad95cc8c9b21/beautifulsoup4-4.14.3.tar.gz", hash = "sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86", size = 627737, upload-time = "2025-11-30T15:08:26.084Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb", size = 107721, upload-time = "2025-11-30T15:08:24.087Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "colorama"
|
||||
version = "0.4.6"
|
||||
@ -29,6 +42,63 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "greenlet"
|
||||
version = "3.5.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/6d/6e/802acd792aebb2256fbbee8cacf2727faaeb6f240ac11008f09eae4414bc/greenlet-3.5.1.tar.gz", hash = "sha256:5a56aeb7d5d9cc4b3a735efb5095bd4b4f6f0e4f93e5ca876d0e2315137b7829", size = 197356, upload-time = "2026-05-20T15:05:03.917Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/27/69/7f7e5372d998b81001899b1c0823c957aa413ba0f2662e65821611cc31e4/greenlet-3.5.1-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:51518ff74664078fc51bffcc6fc529b0df5ae58da192691cee765d45ce944a2b", size = 285060, upload-time = "2026-05-20T13:08:51.899Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b1/bf/387f9b6b865fd2ae0d0be09e0004827295a01b71be76ed350dd1e28a91a4/greenlet-3.5.1-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1ffdb3c0bb002c99cd8f298957e046c3dbf6006b5b7cdf11a4e19194624a0a0a", size = 604370, upload-time = "2026-05-20T14:00:07.492Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/32/f5/169ce3d4e4c67291bd18f8cbe0299c9f3e45102c7f1fb3c14780c93e4532/greenlet-3.5.1-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7715a5a2c3378ba602c3a440558261e13a820bb53a82693aacd7b7f6d964e283", size = 616987, upload-time = "2026-05-20T14:05:44.237Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/19/ba/c24110c55dffa55aa6e1d98b45310da33801aeba7686ff0190fe5d46fd32/greenlet-3.5.1-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d40a890035c0058cadbdc4af7569800fd28a0e527a0fdbb7b5f9418f176846ce", size = 622911, upload-time = "2026-05-20T14:09:10.598Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ee/e5/7f2e41d5273be07e77560d61ea4e56485b4d6c316d2a84518c62d1364061/greenlet-3.5.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dc71ff466927a201b08305acac451ebe1aedfcea002f62f1f2f2ac2ac1e6a135", size = 613911, upload-time = "2026-05-20T13:14:27.539Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ec/7b/d20db2e8a5ad6c038702f3179b136f93f0a3d1a21a0c0777f3e470cdf4b2/greenlet-3.5.1-cp313-cp313-manylinux_2_39_riscv64.whl", hash = "sha256:67821bb03e4e98664490edb787ff6af501194c29bbee0f5c1dfdcf1dc3d9d436", size = 425228, upload-time = "2026-05-20T14:01:40.837Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c5/a4/fbdc67579b73615a1f91615e814303cc71e06128f7baaba87be79b8fb90c/greenlet-3.5.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cd443683db272ebaaca03af98c0b063ab30db70ea8a31a1559f35e3f7b744ccd", size = 1570689, upload-time = "2026-05-20T14:02:27.225Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e6/b4/77abbe35078be39718a46cd49caf16bceb35662f97a34101dca28aa98e47/greenlet-3.5.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:089fff7a6ce8d9316d1f65ebc00273a56be258c1725b32b94de90a3a979557e1", size = 1635602, upload-time = "2026-05-20T13:14:36.344Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/37/f7/129f27ca700845b8ee8ca88ce7f43435a1239c2eddb7677fc938822762cf/greenlet-3.5.1-cp313-cp313-win_amd64.whl", hash = "sha256:110a1ca7b49b014b097f6078272c3f4ed31af45b254de5228b79adba879f6af9", size = 238683, upload-time = "2026-05-20T13:11:50.57Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/6d/5c/a485a36e87df8d8fd0632ee01511244f5156a20ed3746cc6599340326395/greenlet-3.5.1-cp313-cp313-win_arm64.whl", hash = "sha256:f16ba1efc0715b680a18b8123d90dad887c6112ae3555b4b5c32c149540c6b4e", size = 235499, upload-time = "2026-05-20T13:12:42.028Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/8a/cb/c62454606daf5640369c94d8a9dd540599b1bfc090e2d2180cb77f4038d2/greenlet-3.5.1-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:d8ab31c9de8651a2facdd5c5bb0011f2380dd1a7af78ce2adf4b56095294fc07", size = 285579, upload-time = "2026-05-20T13:08:56.396Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ec/71/c4270398c2eba968a6071af1dfbdcaeee6ec1c24bc8b435b8cc452700da6/greenlet-3.5.1-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5e300185139abc337ade480c327183adf42a875ac7181bfe66d7d4efea31fbea", size = 651106, upload-time = "2026-05-20T14:00:09.448Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/1a/ab/71e34b78a44ec271fb5f550c17bc46d301ddc5953890d935f270b0dcdb5a/greenlet-3.5.1-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7ffdb990dcaa0234cf9845aead5df2e3c3a8b6507d409274dd87e0d5ab05ffc2", size = 663478, upload-time = "2026-05-20T14:05:45.88Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c6/2d/2d80842910da44f78c286532d084b8a5c3717c844ae80ceb3858738ae89a/greenlet-3.5.1-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6c09df69dc1712d131332054a858a3e5cca400967fa3a672e2324fbb0971448c", size = 667767, upload-time = "2026-05-20T14:09:12.15Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/77/96/4efd6fa5c62c85426a0c19077a586258ebc3a2a146ff2493e4312a697a22/greenlet-3.5.1-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2f82b3597e9d83b63408affed0b48fd0f54935edac4302237b9a837be0dae33c", size = 660800, upload-time = "2026-05-20T13:14:29.129Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e9/d3/dad2eecedfbb1ed7050a20dcfae40c1442b74bc7423608be2c7e03ee7133/greenlet-3.5.1-cp314-cp314-manylinux_2_39_riscv64.whl", hash = "sha256:a4764e0bfc6a4d114c865b32520805c16a990ef5f286a514413b05d5ecd6a23d", size = 470786, upload-time = "2026-05-20T14:01:42.064Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7a/e0/6c71401a25cac7000261304e866a2f2cc04dc74810d40e2f118aa4799495/greenlet-3.5.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c0141e37414c10164e702b8fb1473304221ad98f71600850c6ef7ff4880feba0", size = 1617518, upload-time = "2026-05-20T14:02:28.662Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/41/26/c5c06643e8c0af9e7bf18e16cb51d0ab7625155f0392e1c9015d66d556cd/greenlet-3.5.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:50ae25a67bea74ea41fb14b960bc532df73eb713417b2d61892dced82fe8d3bc", size = 1681593, upload-time = "2026-05-20T13:14:39.417Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/8a/bd/e11a108317485075e68af9d23039619b86b28130c3b50d227d42edece64b/greenlet-3.5.1-cp314-cp314-win_amd64.whl", hash = "sha256:8a17c42330e261299766b75ac1ea32caa437a9453c8f65d16a13140db378ecd3", size = 239800, upload-time = "2026-05-20T13:09:30.128Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/47/f8/8e8e8417b7bf28639a5a56356ef934d0375e1d0c70a57e04d7701e870ffe/greenlet-3.5.1-cp314-cp314-win_arm64.whl", hash = "sha256:7b5f5fae05b8ac6d176a61b60c394a8cbdc2b5b91b81793066e68745cf165e54", size = 236862, upload-time = "2026-05-20T13:09:10.498Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/90/12/41bf27fde4d3605d3773ae57751eda182b8be2f5398011c041173b1d9534/greenlet-3.5.1-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:ea8da1e900d758d078810d4255d8c6aa572181896a31ec79d779eb79c3adc9ad", size = 293637, upload-time = "2026-05-20T13:12:35.529Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/44/44/ba14b23e9757707050c2f397d305bbcae62e5d7cad122f8b6baec5ae4a1f/greenlet-3.5.1-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a19570c52a21420dcbc94e661994bc325c0b5b11304540fed514586da5dc8f2e", size = 650840, upload-time = "2026-05-20T14:00:11.079Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a8/37/5ddc2b686a6844f91abecef43411842426da2e1573f60b49ecf2547f4ae1/greenlet-3.5.1-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3d955c89b75eeca4723d7cc14135f393cd47c32e2a6cb4a8e4c6e760a26b0986", size = 656416, upload-time = "2026-05-20T14:05:47.118Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/8c/46/5987dcd1a2570ba84f3b187536b2ca3ae97613387e57f5cfa99df068fe5e/greenlet-3.5.1-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ea37d5a157eb9493820d3792ac4ece28619a394391d2b9f2f78057d396ff0f0f", size = 656607, upload-time = "2026-05-20T14:09:13.949Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e1/f0/d17510297c35a2992712f0bf84de3779749999f7d3d63aa1f09db7c62dbe/greenlet-3.5.1-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:de2daaaebd1a5aa88c49045b6baf9310b3263796bd88db713edf37cf53e7bb4e", size = 654397, upload-time = "2026-05-20T13:14:30.696Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/2c/c1/6da0a9ddcc29d7e51ef14883fa3dc1e53b3f4ffba00582106c7bf55da1d8/greenlet-3.5.1-cp314-cp314t-manylinux_2_39_riscv64.whl", hash = "sha256:8d8a23250ea3ec7b36de8fa4b541e9e2db3ee82915cc060ab0631609ad8b28de", size = 488287, upload-time = "2026-05-20T14:01:43.143Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/37/eb/147387705bb89092645b012586e7273cb5ed3c90ef7eaf3a69173eaf0209/greenlet-3.5.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:3bfbd69cc349e43bf3a8ae1c85548ff0718efc887615c2db16c3833d7b0b072d", size = 1614469, upload-time = "2026-05-20T14:02:30.192Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/a6/4e/37ee0da7732b7aa9896f17e15579a9df34b9fcb9dd494f0adfa749af6623/greenlet-3.5.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:4378720dd888136c27215a0214d32a4d37c3852765d45bc37aad0623423cfd78", size = 1675115, upload-time = "2026-05-20T13:14:40.972Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/57/f3/97dfcf4a6eb5077f8a672234216fb5923eb89f2cab7081cb10b2cf75b605/greenlet-3.5.1-cp314-cp314t-win_amd64.whl", hash = "sha256:45718441607f9325d948db98cbc691276059316d0358c188c246da4e1d4d23d2", size = 245246, upload-time = "2026-05-20T13:12:22.646Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/5d/73/d7f72e34b582f694f4a9b248162db7b09cc458a259ba8f0c0bfa1a34ea7d/greenlet-3.5.1-cp315-cp315-macosx_11_0_universal2.whl", hash = "sha256:2baee5ca02031757ffe8cc3d69f0cc0aec7065ce362622da74f32d3bcab1c541", size = 285575, upload-time = "2026-05-20T13:12:07.043Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/df/59/fa9c6e87dc8ad27a95dabe2f29f372b733d05a8a67470f6c901ed9975655/greenlet-3.5.1-cp315-cp315-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9b1ec3274918a81d3ea778b9e75b56b72b33f300edb6cf7f3a7fe1dae56683de", size = 656428, upload-time = "2026-05-20T14:00:12.556Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f6/f9/e753408871eaa61dfe35e619cfc67512b036fde99893685d50eea9e07146/greenlet-3.5.1-cp315-cp315-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:111e2390ffffc47d5840b01711dd7fac07d4c09283d0283e7f3264b14e284c64", size = 667064, upload-time = "2026-05-20T14:05:48.662Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/dc/74/807a047255bf1e09303627c46dc043dca596b6958a354d904f32ab382005/greenlet-3.5.1-cp315-cp315-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:10a9a1c0bfbc93d41156ffcb90c75fbc05544054faf15dcc1fdf9765f8b607f0", size = 672962, upload-time = "2026-05-20T14:09:15.532Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/96/27/5565b5b40389f1c7753003a07e21892fda8660926787036d5bc0308b8113/greenlet-3.5.1-cp315-cp315-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e630136e905fe5ff43e86945ae41220b6d1470956a39220e708110ac48d01ea5", size = 665697, upload-time = "2026-05-20T13:14:32.943Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/76/32/19d4e13225193c29b13e308015223f7d75fd3d8623d49dd19040d2ce8ec1/greenlet-3.5.1-cp315-cp315-manylinux_2_39_riscv64.whl", hash = "sha256:ef08c1567c78074b22d1a200183d52d04a14df447bf70bcbb6a3507a48e776fc", size = 476047, upload-time = "2026-05-20T14:01:44.39Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/cf/82/e7de4178c0c2d1c9a5a3be3cc0b33e46a85b3ee4a77c071bf7ad8600e079/greenlet-3.5.1-cp315-cp315-musllinux_1_2_aarch64.whl", hash = "sha256:975eac34b44a7077ca4d421348455b94f0f518246a7f14bc6d2fdcfe5b584368", size = 1621256, upload-time = "2026-05-20T14:02:31.91Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/00/10/f2dddcf7dacac17dfc68691809589adad06135eb28930429cf58a6467a2f/greenlet-3.5.1-cp315-cp315-musllinux_1_2_x86_64.whl", hash = "sha256:9ab3c3a0b2ae6198e67c898dad5215a49f9ae0d0081b3c3ec59f333e39eeca26", size = 1685956, upload-time = "2026-05-20T13:14:42.55Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/22/17/4a232b32133230ada52f70e9d7f5b65b0caef8772f01849bd8d149e7e4ca/greenlet-3.5.1-cp315-cp315-win_amd64.whl", hash = "sha256:cbfc69be86e10dcfef5b1e6269d1d6926552aa89ee39e1de3353360c1b6989ab", size = 239802, upload-time = "2026-05-20T13:13:15.481Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c2/ae/4e623a7e6d4d2a5f4cb8e4c82de4169fc637942caae68d6e676b8a128ac5/greenlet-3.5.1-cp315-cp315-win_arm64.whl", hash = "sha256:92fd6d44ac5e5a887c8a5dc4a8ba0ba908527c31c12f78c6bc7dcfe8aab279f6", size = 236853, upload-time = "2026-05-20T13:15:37.301Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7a/57/816d9cff29119da3505b3d6a5e14a8af89006ac36f47f891ff293ee05af1/greenlet-3.5.1-cp315-cp315t-macosx_11_0_universal2.whl", hash = "sha256:a6fdf2433a5441ef9a95464f7c3e674775da1c8c1177fff311cee1acad4626ed", size = 293877, upload-time = "2026-05-20T13:10:19.078Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/23/a1/59b0a7c7d140ff1a75626680b9a9899b79a9176cab298b394968fb023295/greenlet-3.5.1-cp315-cp315t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7546556f0d649f99f6a361098a55f761181bb2ea12ff150bb16d26092ad88244", size = 655333, upload-time = "2026-05-20T14:00:14.758Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/72/1b/5efe127597625042218939d01855109f352779050768b670b52edcc16a6c/greenlet-3.5.1-cp315-cp315t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d5ee3ea898009fa898f85f9982255d35278c477bebe185beca249cab42d4526c", size = 659443, upload-time = "2026-05-20T14:05:50.159Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c9/9d/1dcdf7b95ab3cf8c7b6d7277c18a5e167312f2b362ddfcc5d5e6d8d84b43/greenlet-3.5.1-cp315-cp315t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a57b0d05a0448eed231d59c0ceb287dde984551e54cbc51ac2d4865712838e9c", size = 659998, upload-time = "2026-05-20T14:09:16.912Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/6c/6d/c404246ea4d22d097a7426d0efb5b781bd7eb67715f09e79001bd552ab18/greenlet-3.5.1-cp315-cp315t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a5c81f74d204d3edd136ebfd50dce53acbb776995d721a0fe801626cfc93b8cd", size = 658356, upload-time = "2026-05-20T13:14:35.091Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/05/7e/c4959664fc231d587d66d8e81f2095e98056ba1954beafdcbe635e251052/greenlet-3.5.1-cp315-cp315t-manylinux_2_39_riscv64.whl", hash = "sha256:b0703c2cef53e01baec47f7a3868009913ad71ec678bbecb42a6f40895e4ce62", size = 494470, upload-time = "2026-05-20T14:01:45.611Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/51/02/f8ee37fb6d2219329f350af241c27fcf12df57e723d11f6fc6d3bacdadaa/greenlet-3.5.1-cp315-cp315t-musllinux_1_2_aarch64.whl", hash = "sha256:2c18ef16bf6d4dd410e4dd52996888ea1497be26892fe5bbc73580aba4287b8e", size = 1619216, upload-time = "2026-05-20T14:02:33.403Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/93/c5/3dc9475ace2c7a3680da12372cddd7f1ac874eb410a1ac48d3e9dab83782/greenlet-3.5.1-cp315-cp315t-musllinux_1_2_x86_64.whl", hash = "sha256:17d86354f0ae6b61bf9be5148d0dd34e06c3cb7c602c671f79f29ac3b150e659", size = 1678427, upload-time = "2026-05-20T13:14:43.71Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/df/4e/750c15c317a41ffb36f0bf40b933e3d744a7dede61889f74443ea69690cf/greenlet-3.5.1-cp315-cp315t-win_amd64.whl", hash = "sha256:e7516cf6ae6b8a582c2770a0caed47b8a48373ed732c33d69a72913ae6ac923e", size = 245225, upload-time = "2026-05-20T13:13:59.366Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/4f/fd/d3baea2eeb7b617efd47e87ca06e2ec2c6118d303aa9e918e0ce16eadc10/greenlet-3.5.1-cp315-cp315t-win_arm64.whl", hash = "sha256:5028648bf2253ec4745add746129d3904121fa7fe871a76bed23c5720573ce0a", size = 239590, upload-time = "2026-05-20T13:13:37.382Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "iniconfig"
|
||||
version = "2.3.0"
|
||||
@ -43,6 +113,8 @@ name = "job-research"
|
||||
version = "0.1.0"
|
||||
source = { editable = "." }
|
||||
dependencies = [
|
||||
{ name = "beautifulsoup4" },
|
||||
{ name = "playwright" },
|
||||
{ name = "pydantic" },
|
||||
{ name = "pypdf" },
|
||||
{ name = "pyyaml" },
|
||||
@ -56,6 +128,8 @@ dev = [
|
||||
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "beautifulsoup4", specifier = ">=4.12,<5" },
|
||||
{ name = "playwright", specifier = ">=1.52,<2" },
|
||||
{ name = "pydantic", specifier = ">=2.7,<3" },
|
||||
{ name = "pypdf", specifier = ">=5.0,<6" },
|
||||
{ name = "pyyaml", specifier = ">=6.0,<7" },
|
||||
@ -95,6 +169,25 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/df/b2/87e62e8c3e2f4b32e5fe99e0b86d576da1312593b39f47d8ceef365e95ed/packaging-26.2-py3-none-any.whl", hash = "sha256:5fc45236b9446107ff2415ce77c807cee2862cb6fac22b8a73826d0693b0980e", size = 100195, upload-time = "2026-04-24T20:15:22.081Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "playwright"
|
||||
version = "1.60.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "greenlet" },
|
||||
{ name = "pyee" },
|
||||
]
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/21/f0/832bd9677194908da118064eef20082f2791e3d18215cc6d9391ee2c5a67/playwright-1.60.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:6a8cd0fec171fb3089e95e898c8bc8a6f35dea0b78b399e12fcc19427e91b1d7", size = 43474635, upload-time = "2026-05-18T12:00:31.969Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/59/7b/e1d32ae8a3ed937ec2be3721c5f728b13d731a0b7c6442e0b3bec5094ac0/playwright-1.60.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:39b5420ba6145045b69ced4c5c47d4d9fe5bddfc8ff816c518913afcb25ec7a5", size = 42261327, upload-time = "2026-05-18T12:00:35.638Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d7/bc/23de499ded6411c188a20c5a0dea6f0cd4ed5d2b3cc6042a5dbd3ed609aa/playwright-1.60.0-py3-none-macosx_11_0_universal2.whl", hash = "sha256:2581d0e6a3392c71f91b27460c7fd093356818dc430f48153896c8aeeaef7705", size = 43474636, upload-time = "2026-05-18T12:00:39.294Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/22/7b/1d679f4fced4ea94efadd17103856d8c565384f68382a1681264e46f5925/playwright-1.60.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:1c2bfae7884fb3fb05b853290eab8f343d524e5016f2f1def702acbbdf14c93e", size = 47467220, upload-time = "2026-05-18T12:00:43.179Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/84/c2/1528d267d4442bd2c6b8eaeab819dd52c2030bf80e89293f0ba1f687473b/playwright-1.60.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:43e66564125ee31b07a58cefb21e256d62d67d8d1713e6858df7a3019d8ed353", size = 47154856, upload-time = "2026-05-18T12:00:46.715Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/bb/4e/b008b6440a7a1624378041da94829956d4b8f7ab9ef5aad22d0dc3f2e26d/playwright-1.60.0-py3-none-win32.whl", hash = "sha256:ec94e416ea320711e0ad4bf185dcbf41833672961e90773e1885255d7db7b7e7", size = 37902157, upload-time = "2026-05-18T12:00:50.374Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/55/f0/0541524133104f9cc20bf900870ff4a736b76a23483f3a55295ddfa58409/playwright-1.60.0-py3-none-win_amd64.whl", hash = "sha256:9566821ce6030a1f9e7146a24e19355ab0d98805fd0f9be50bb3d8fef1750c02", size = 37902159, upload-time = "2026-05-18T12:00:53.728Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/80/c8/210f282d278e4709cdd71b12a31af45a30a22ab3207b387e29b37e478713/playwright-1.60.0-py3-none-win_arm64.whl", hash = "sha256:6e4f6700a4c2250efff8e690a81d66e3855754fb587b6b87cf5c784014f91537", size = 34037981, upload-time = "2026-05-18T12:00:57.584Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pluggy"
|
||||
version = "1.6.0"
|
||||
@ -175,6 +268,18 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/f6/d2/42dd53d0a85c27606f316d3aa5d2869c4e8470a5ed6dec30e4a1abe19192/pydantic_core-2.46.4-cp314-cp314t-win_arm64.whl", hash = "sha256:4fcbe087dbc2068af7eda3aa87634eba216dbda64d1ae73c8684b621d33f6596", size = 2017325, upload-time = "2026-05-06T13:40:52.723Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyee"
|
||||
version = "13.0.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "typing-extensions" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/8b/04/e7c1fe4dc78a6fdbfd6c337b1c3732ff543b8a397683ab38378447baa331/pyee-13.0.1.tar.gz", hash = "sha256:0b931f7c14535667ed4c7e0d531716368715e860b988770fc7eb8578d1f67fc8", size = 31655, upload-time = "2026-02-14T21:12:28.044Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/a0/c4/b4d4827c93ef43c01f599ef31453ccc1c132b353284fc6c87d535c233129/pyee-13.0.1-py3-none-any.whl", hash = "sha256:af2f8fede4171ef667dfded53f96e2ed0d6e6bd7ee3bb46437f77e3b57689228", size = 15659, upload-time = "2026-02-14T21:12:26.263Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pygments"
|
||||
version = "2.20.0"
|
||||
@ -267,6 +372,15 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "soupsieve"
|
||||
version = "2.8.4"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/47/2c/0a5f6f8ee0d5589e48c7640213ed5175d52cf540a06725b628cc1a45d6ce/soupsieve-2.8.4.tar.gz", hash = "sha256:e121fd02e975c695e4e9e8774a5ee35d74714b59307868dcc5319ad2d9e3328e", size = 121110, upload-time = "2026-05-24T13:55:57.154Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/5e/f5/0c41cb68dcae6b7de4fac4188a3a9589e21fb31df21ea3a2e888db95e6c9/soupsieve-2.8.4-py3-none-any.whl", hash = "sha256:e7e6b0769c8f51ed59acab6e994b00621096cfb1c640a7509295987388fbaf65", size = 37304, upload-time = "2026-05-24T13:55:55.406Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typer"
|
||||
version = "0.26.2"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user