From 0cdeabc0e64ff822ab9cd40838329999e05e7e6c Mon Sep 17 00:00:00 2001 From: Ivo Oskamp Date: Tue, 26 May 2026 15:12:35 +0200 Subject: [PATCH] Dev build 2026-05-26 15:12 --- containers/clearview/alembic.ini | 45 + containers/clearview/requirements.txt | 1 + containers/clearview/site/app.js | 67 +- containers/clearview/site/index.html | 27 + containers/clearview/site/styles.css | 56 +- .../src/clearview_app/api_helpers.py | 321 +++++ .../clearview/src/clearview_app/api_jobs.py | 645 ++++++++++ .../src/clearview_app/api_onboarding.py | 76 ++ .../src/clearview_app/api_tenants.py | 86 ++ .../clearview/src/clearview_app/cert.py | 6 +- .../clearview/src/clearview_app/db_migrate.py | 53 + .../clearview/src/clearview_app/main.py | 1122 +---------------- .../src/clearview_app/migrations/env.py | 58 + .../clearview_app/migrations/script.py.mako | 26 + .../migrations/versions/0001_baseline.py | 31 + .../migrations/versions/0002_timestamptz.py | 63 + .../clearview/src/clearview_app/models.py | 35 +- .../src/clearview_app/scanners/sharepoint.py | 63 +- .../clearview/src/clearview_app/schemas.py | 7 +- .../clearview/src/clearview_app/version.py | 2 +- .../clearview/src/clearview_app/worker.py | 24 +- docs/changelog-develop.md | 36 + 22 files changed, 1639 insertions(+), 1211 deletions(-) create mode 100644 containers/clearview/alembic.ini create mode 100644 containers/clearview/src/clearview_app/api_helpers.py create mode 100644 containers/clearview/src/clearview_app/api_jobs.py create mode 100644 containers/clearview/src/clearview_app/api_onboarding.py create mode 100644 containers/clearview/src/clearview_app/api_tenants.py create mode 100644 containers/clearview/src/clearview_app/db_migrate.py create mode 100644 containers/clearview/src/clearview_app/migrations/env.py create mode 100644 containers/clearview/src/clearview_app/migrations/script.py.mako create mode 100644 containers/clearview/src/clearview_app/migrations/versions/0001_baseline.py create mode 100644 containers/clearview/src/clearview_app/migrations/versions/0002_timestamptz.py diff --git a/containers/clearview/alembic.ini b/containers/clearview/alembic.ini new file mode 100644 index 0000000..4c8bc67 --- /dev/null +++ b/containers/clearview/alembic.ini @@ -0,0 +1,45 @@ +# Alembic config for manual CLI use during development, e.g.: +# cd containers/clearview && DATABASE_URL=postgresql://... PYTHONPATH=src alembic revision -m "msg" +# +# The application itself does NOT read this file: clearview_app.db_migrate builds +# an Alembic Config programmatically and env.py takes the database URL from +# DATABASE_URL via clearview_app.config. sqlalchemy.url is therefore left blank. + +[alembic] +script_location = src/clearview_app/migrations +prepend_sys_path = src +sqlalchemy.url = + +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARNING +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARNING +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/containers/clearview/requirements.txt b/containers/clearview/requirements.txt index fdc85c8..7da64eb 100644 --- a/containers/clearview/requirements.txt +++ b/containers/clearview/requirements.txt @@ -1,6 +1,7 @@ fastapi==0.115.0 uvicorn[standard]==0.30.6 sqlalchemy==2.0.36 +alembic==1.14.0 psycopg[binary]==3.2.3 python-multipart==0.0.12 requests==2.32.3 diff --git a/containers/clearview/site/app.js b/containers/clearview/site/app.js index e5ce1ac..8920b97 100644 --- a/containers/clearview/site/app.js +++ b/containers/clearview/site/app.js @@ -104,6 +104,8 @@ statTenants: document.getElementById('statTenants'), statJobs: document.getElementById('statJobs'), statRunning: document.getElementById('statRunning'), + statErrors: document.getElementById('statErrors'), + dashRecentJobs: document.getElementById('dashRecentJobs'), }; // ------------------------------------------------------------------------- @@ -610,6 +612,36 @@ // Jobs list // ------------------------------------------------------------------------- + function renderDashRecent(jobs) { + if (!els.dashRecentJobs) return; + if (!jobs.length) { + els.dashRecentJobs.innerHTML = 'No jobs yet.'; + return; + } + els.dashRecentJobs.innerHTML = jobs.slice(0, 5).map(function (job) { + var jobIdSafe = escHtml(job.id); + var tenantLabel = job.tenant_name + ? escHtml(job.tenant_name) + : 'manual'; + var progress = job.total_targets > 0 ? (job.processed_targets + '/' + job.total_targets) : '0/0'; + return '' + + '' + jobIdSafe + '' + + '' + escHtml(job.scan_type || 'sharepoint') + '' + + '' + tenantLabel + '' + + '' + statusBadge(job.status) + '' + + '' + progress + '' + + '' + formatDate(job.updated_at) + '' + + ''; + }).join(''); + els.dashRecentJobs.querySelectorAll('[data-dash-job]').forEach(function (row) { + row.addEventListener('click', function () { + state.selectedJobId = row.getAttribute('data-dash-job'); + navigateTo('jobs'); + refreshSelectedJob().catch(function () {}); + }); + }); + } + async function refreshJobs() { const filterTenant = els.jobTenantFilter.value; const filterType = els.jobTypeFilter ? els.jobTypeFilter.value : ''; @@ -626,6 +658,12 @@ els.statRunning.textContent = String(jobs.filter(function (j) { return j.status === 'running' || j.status === 'queued'; }).length); + if (els.statErrors) { + els.statErrors.textContent = String(jobs.filter(function (j) { + return j.status === 'completed_with_errors' || (j.failed_targets || 0) > 0; + }).length); + } + renderDashRecent(jobs); if (!jobs.length) { els.jobsTableBody.innerHTML = 'No jobs yet.'; @@ -654,22 +692,23 @@ } else { typeLabel = 'SharePoint'; } + const jobIdSafe = escHtml(job.id); return ( '' + - '' + job.id + '' + + '' + jobIdSafe + '' + '' + typeLabel + '' + '' + tenantLabel + '' + - '' + job.source_type + '' + + '' + escHtml(job.source_type) + '' + '' + statusBadge(job.status) + '' + '' + progress + '' + '' + (job.items_scanned > 0 ? job.items_scanned : '-') + '' + '' + formatDate(job.updated_at) + '' + '' + '
' + - '' + + '' + (job.status === 'queued' || job.status === 'running' - ? '' - : '') + + ? '' + : '') + '
' + '' + '' @@ -1527,14 +1566,16 @@ return hash; } - function applyRoute(route) { + function applyRoute(route, moveFocus) { if (!ROUTE_TITLES[route]) { route = 'dashboard'; } state.currentRoute = route; + var activePage = null; document.querySelectorAll('.route-page').forEach(function (page) { if (page.getAttribute('data-route-page') === route) { page.removeAttribute('hidden'); + activePage = page; } else { page.setAttribute('hidden', ''); } @@ -1549,6 +1590,16 @@ if (els.contentTitle) { els.contentTitle.textContent = ROUTE_TITLES[route]; } + document.title = 'Clearview | ' + ROUTE_TITLES[route]; + // On user navigation, move focus to the new page's first heading so + // screen-reader and keyboard users land in the freshly shown content. + if (moveFocus && activePage) { + var heading = activePage.querySelector('h1, h2'); + if (heading) { + heading.setAttribute('tabindex', '-1'); + heading.focus(); + } + } } function navigateTo(route) { @@ -1560,12 +1611,12 @@ if (window.location.hash !== hash) { window.location.hash = hash; } else { - applyRoute(route); + applyRoute(route, true); } } window.addEventListener('hashchange', function () { - applyRoute(parseRoute()); + applyRoute(parseRoute(), true); }); applyRoute(parseRoute()); diff --git a/containers/clearview/site/index.html b/containers/clearview/site/index.html index 2f63b0c..d9e9187 100644 --- a/containers/clearview/site/index.html +++ b/containers/clearview/site/index.html @@ -73,6 +73,33 @@ 0 Active Jobs +
+ 0 + With errors +
+ + + +
+
+

Recent jobs

+
+
+ + + + + + + + + + + + + + +
Job IDTypeTenantStatusTargetsUpdated
No jobs yet.
diff --git a/containers/clearview/site/styles.css b/containers/clearview/site/styles.css index 456d659..44f6172 100644 --- a/containers/clearview/site/styles.css +++ b/containers/clearview/site/styles.css @@ -55,38 +55,12 @@ body { background: radial-gradient(circle at center, rgba(3, 105, 161, 0.2), rgba(3, 105, 161, 0)); } -.topbar { - width: min(1100px, calc(100% - 2rem)); - margin: 1.1rem auto 0; - padding: 0.95rem 1.1rem; - border: 1px solid var(--cv-border); - border-radius: 18px; - background: rgba(255, 255, 255, 0.75); - backdrop-filter: blur(8px); - display: flex; - align-items: center; - justify-content: space-between; - box-shadow: 0 10px 24px rgba(20, 20, 19, 0.08); -} - .brand-logo { height: 42px; width: auto; display: block; } -.topbar-actions { - display: flex; - gap: 0.6rem; -} - -.layout { - width: min(1100px, calc(100% - 2rem)); - margin: 1rem auto 2.5rem; - display: grid; - gap: 1rem; -} - .hero, .panel { border-radius: 22px; @@ -131,7 +105,7 @@ h2 { .hero-stats { margin-top: 1.3rem; display: grid; - grid-template-columns: repeat(3, minmax(0, 1fr)); + grid-template-columns: repeat(4, minmax(0, 1fr)); gap: 0.75rem; } @@ -291,8 +265,9 @@ textarea { input:focus, select:focus, textarea:focus, -button:focus { - outline: 2px solid rgba(14, 165, 233, 0.38); +button:focus, +a:focus-visible { + outline: 2px solid var(--cv-accent); outline-offset: 2px; } @@ -533,8 +508,8 @@ strong { } .risk.warn { - background: rgba(14, 165, 233, 0.15); - color: var(--cv-accent-dark); + background: rgba(234, 179, 8, 0.18); + color: #854d0e; } .risk.high { @@ -584,12 +559,6 @@ strong { } @media (max-width: 930px) { - .topbar { - flex-direction: column; - align-items: flex-start; - gap: 0.8rem; - } - .hero-stats { grid-template-columns: 1fr; } @@ -616,11 +585,6 @@ strong { } @media (max-width: 640px) { - .layout, - .topbar { - width: calc(100% - 1rem); - } - .hero, .panel { border-radius: 16px; @@ -633,14 +597,6 @@ strong { .hero h1 { max-width: none; } - - .topbar-actions { - width: 100%; - } - - .topbar-actions .btn { - flex: 1; - } } /* =========================================================================== diff --git a/containers/clearview/src/clearview_app/api_helpers.py b/containers/clearview/src/clearview_app/api_helpers.py new file mode 100644 index 0000000..721426a --- /dev/null +++ b/containers/clearview/src/clearview_app/api_helpers.py @@ -0,0 +1,321 @@ +"""Shared helpers for the API route modules. + +Extracted verbatim from the original monolithic ``main.py`` so the route +modules (``api_tenants``, ``api_jobs``) can share credential resolution, job +creation, response mapping, and export helpers without circular imports. +""" +from __future__ import annotations + +import re +import uuid +from datetime import datetime, timezone + +from fastapi import HTTPException +from sqlalchemy import select +from sqlalchemy.orm import joinedload + +from .db import SessionLocal +from .default_sites import is_default_site, normalize_site_url +from .models import ScanJob, ScanTarget, TenantProfile +from .scanners import AuthConfig +from .schemas import ScanJobCreateResponse, ScanJobSummary, TenantProfileItem + + +def _extract_sharing_link_group_and_type(principal: str) -> tuple[str, str] | None: + """ + Extract (group_name, link_type) from principal values such as: + - SharingLinks... + - c:0o.c|federateddirectoryclaimprovider|SharingLinks... + """ + if not principal: + return None + + text = principal.strip() + segments = [s.strip() for s in text.split("|") if s.strip()] + + candidate = "" + for segment in reversed(segments): + if segment.lower().startswith("sharinglinks."): + candidate = segment + break + if not candidate and text.lower().startswith("sharinglinks."): + candidate = text + if not candidate: + return None + + parts = candidate.split(".") + if len(parts) < 3: + return None + return candidate, parts[2] + + +_SCAN_TYPE_LABELS = { + "sharepoint": "Deviations", + "sharepoint_root": "Root", + "mailbox": "Mailbox", + "entra_groups": "EntraGroups", +} + + +def _build_export_filename(job: ScanJob, job_id: str) -> str: + tenant_label = (job.tenant_profile.name if job.tenant_profile else None) or "Manual" + safe_tenant = re.sub(r"[^A-Za-z0-9_-]+", "_", tenant_label).strip("_") or "Manual" + scan_type = job.scan_type or "sharepoint" + type_label = _SCAN_TYPE_LABELS.get(scan_type, scan_type) + short_id = job_id.replace("-", "")[-12:] + return f"ClearView_{safe_tenant}_{type_label}_{short_id}.xlsx" + + +def _enumerate_all_entra_groups( + tenant_id: str, + client_id: str, + client_secret: str | None, + profile_id: str | None, +) -> list[str]: + cert_private_key: str | None = None + cert_thumbprint: str | None = None + cert_public_pem: str | None = None + if profile_id: + with SessionLocal() as db: + profile = db.get(TenantProfile, profile_id) + if profile: + cert_private_key = profile.cert_private_key + cert_thumbprint = profile.cert_thumbprint + cert_public_pem = profile.cert_public_pem + + auth = AuthConfig( + tenant_id=tenant_id, + client_id=client_id, + client_secret=client_secret or "", + cert_private_key=cert_private_key, + cert_thumbprint=cert_thumbprint, + cert_public_pem=cert_public_pem, + ) + + from .scanners import entra as _entra + + try: + return _entra.list_all_groups(auth) + except Exception as exc: # noqa: BLE001 + raise HTTPException(status_code=400, detail=f"Group enumeration failed: {exc}") from exc + + +def _enumerate_all_mailboxes( + organization: str | None, + tenant_id: str, + client_id: str, + client_secret: str | None, + profile_id: str | None, +) -> list[str]: + if not organization or "." not in organization: + raise HTTPException( + status_code=400, + detail="organization (e.g. contoso.onmicrosoft.com) is required when scan_all_mailboxes is true", + ) + + cert_private_key: str | None = None + cert_thumbprint: str | None = None + cert_public_pem: str | None = None + if profile_id: + with SessionLocal() as db: + profile = db.get(TenantProfile, profile_id) + if profile: + cert_private_key = profile.cert_private_key + cert_thumbprint = profile.cert_thumbprint + cert_public_pem = profile.cert_public_pem + + auth = AuthConfig( + tenant_id=tenant_id, + client_id=client_id, + client_secret=client_secret or "", + cert_private_key=cert_private_key, + cert_thumbprint=cert_thumbprint, + cert_public_pem=cert_public_pem, + ) + + from .scanners import mailbox as _mailbox + + try: + return _mailbox.list_mailboxes(organization=organization.strip().lower(), auth=auth) + except Exception as exc: # noqa: BLE001 + raise HTTPException(status_code=400, detail=f"Mailbox enumeration failed: {exc}") from exc + + +def _resolve_credentials( + db, + tenant_profile_id: str | None, + tenant_id: str | None, + client_id: str | None, + client_secret: str | None, +) -> tuple[str, str, str | None, str | None]: + if tenant_profile_id: + profile = db.get(TenantProfile, tenant_profile_id) + if not profile: + raise HTTPException(status_code=404, detail="Tenant profile not found") + if not profile.client_secret and not profile.cert_thumbprint: + raise HTTPException( + status_code=400, + detail="Tenant profile has no client secret and no certificate. Generate a certificate first.", + ) + return profile.tenant_id, profile.client_id, profile.client_secret, tenant_profile_id + if tenant_id and client_id and client_secret: + return tenant_id.strip(), client_id.strip(), client_secret.strip(), None + raise HTTPException( + status_code=400, + detail="Provide either tenant_profile_id or all of tenant_id, client_id, and client_secret.", + ) + + +def _create_job_from_targets( + raw_targets: list[str], + scan_type: str, + skip_default_sites: bool, + source_type: str, + tenant_id: str, + client_id: str, + client_secret: str, + tenant_profile_id: str | None = None, +) -> ScanJobCreateResponse: + accepted: list[str] = [] + skipped_default_urls: list[str] = [] + invalid: list[str] = [] + + seen: set[str] = set() + + for raw in raw_targets: + if scan_type == "mailbox": + normalized = (raw or "").strip().lower() + if not normalized or "@" not in normalized: + invalid.append(raw) + continue + elif scan_type == "entra_groups": + normalized = (raw or "").strip() + if not normalized: + invalid.append(raw) + continue + else: + normalized = normalize_site_url(raw) or "" + if not normalized: + invalid.append(raw) + continue + + if normalized in seen: + continue + seen.add(normalized) + + if scan_type in ("sharepoint", "sharepoint_root") and skip_default_sites and is_default_site(normalized): + skipped_default_urls.append(normalized) + continue + + accepted.append(normalized) + + with SessionLocal() as db: + now = datetime.now(timezone.utc) + job = ScanJob( + id=str(uuid.uuid4()), + source_type=source_type, + scan_type=scan_type, + status="queued" if accepted else "completed", + skip_default_sites=skip_default_sites, + tenant_profile_id=tenant_profile_id, + auth_tenant_id=tenant_id, + auth_client_id=client_id, + auth_client_secret=client_secret, + total_targets=len(accepted), + skipped_targets=len(skipped_default_urls), + warning_message=None, + error_message=None, + created_at=now, + updated_at=now, + finished_at=now if not accepted else None, + ) + + if not accepted: + if scan_type == "mailbox": + job.warning_message = "No scannable mailboxes after validation" + else: + job.warning_message = "No scannable sites after validation and default-site filtering" + + db.add(job) + db.flush() + + for index, target in enumerate(accepted, start=1): + db.add( + ScanTarget( + job_id=job.id, + site_url=target, + source_row=index, + status="queued", + attempts=0, + created_at=now, + updated_at=now, + ) + ) + + db.commit() + + stmt = select(ScanJob).options(joinedload(ScanJob.tenant_profile)).where(ScanJob.id == job.id) + job = db.execute(stmt).unique().scalar_one() + + return ScanJobCreateResponse( + job=_to_job_summary(job), + accepted_urls=accepted, + skipped_default_urls=skipped_default_urls, + invalid_urls=invalid, + ) + + +def _to_job_summary(job: ScanJob) -> ScanJobSummary: + return ScanJobSummary( + id=job.id, + status=job.status, + source_type=job.source_type, + scan_type=job.scan_type or "sharepoint", + skip_default_sites=job.skip_default_sites, + tenant_profile_id=job.tenant_profile_id, + tenant_name=job.tenant_profile.name if job.tenant_profile else None, + total_targets=job.total_targets, + processed_targets=job.processed_targets, + successful_targets=job.successful_targets, + failed_targets=job.failed_targets, + skipped_targets=job.skipped_targets, + items_scanned=job.items_scanned, + scan_activity=job.scan_activity if job.status == "running" else None, + warning_message=job.warning_message, + error_message=job.error_message, + created_at=job.created_at, + updated_at=job.updated_at, + started_at=job.started_at, + finished_at=job.finished_at, + ) + + +def _to_tenant_item(profile: TenantProfile) -> TenantProfileItem: + return TenantProfileItem( + id=profile.id, + name=profile.name, + tenant_id=profile.tenant_id, + primary_domain=profile.primary_domain, + client_id=profile.client_id, + has_certificate=bool(profile.cert_thumbprint), + cert_thumbprint=profile.cert_thumbprint, + cert_expires_at=profile.cert_expires_at, + created_at=profile.created_at, + updated_at=profile.updated_at, + ) + + +def _sharing_link_risk_label(principal: str) -> str: + if not principal.startswith("SharingLinks."): + return "" + parts = principal.split(".", 3) + link_type = parts[2] if len(parts) >= 3 else "" + if link_type.startswith("Anonymous"): + return "Critical" + if link_type == "Flexible": + return "High" + if link_type.startswith("Organization"): + return "Low" + if link_type.startswith("Direct"): + return "Low" + return "Unknown" diff --git a/containers/clearview/src/clearview_app/api_jobs.py b/containers/clearview/src/clearview_app/api_jobs.py new file mode 100644 index 0000000..c9cb6a4 --- /dev/null +++ b/containers/clearview/src/clearview_app/api_jobs.py @@ -0,0 +1,645 @@ +"""Scan-job routes: create, list, inspect, cancel, delete, resolve, export.""" +from __future__ import annotations + +import io +from datetime import datetime, timezone + +from fastapi import APIRouter, File, Form, HTTPException, UploadFile +from fastapi.responses import Response, StreamingResponse +from sqlalchemy import select +from sqlalchemy.orm import joinedload + +from .api_helpers import ( + _build_export_filename, + _create_job_from_targets, + _enumerate_all_entra_groups, + _enumerate_all_mailboxes, + _extract_sharing_link_group_and_type, + _resolve_credentials, + _sharing_link_risk_label, + _to_job_summary, +) +from .csv_import import parse_entra_groups_csv, parse_mailboxes_csv, parse_sites_csv +from .db import SessionLocal +from .models import PermissionDeviation, ScanJob, ScanTarget, TenantProfile +from .scanners import AuthConfig, probe +from .schemas import ( + CreateScanJobRequest, + PermissionDeviationItem, + ProbeResultResponse, + ResolveGroupsResponse, + ResolveSharingLinksRequest, + ResolveSharingLinksResponse, + ScanJobCreateResponse, + ScanJobDetail, + ScanJobSummary, + ScanTargetItem, + SharingLinkTypesResponse, +) + +router = APIRouter() + + +@router.post("/api/scan-jobs", response_model=ScanJobCreateResponse) +def create_scan_job(payload: CreateScanJobRequest) -> ScanJobCreateResponse: + with SessionLocal() as db: + tenant_id, client_id, client_secret, profile_id = _resolve_credentials( + db=db, + tenant_profile_id=payload.tenant_profile_id, + tenant_id=payload.tenant_id, + client_id=payload.client_id, + client_secret=payload.client_secret, + ) + source_type = "manual" + if payload.scan_type == "entra_groups": + if payload.scan_all_groups: + raw_targets = _enumerate_all_entra_groups( + tenant_id=tenant_id, + client_id=client_id, + client_secret=client_secret, + profile_id=profile_id, + ) + source_type = "tenant_all" + else: + raw_targets = [str(g) for g in payload.group_ids] + elif payload.scan_type == "mailbox": + if payload.scan_all_mailboxes: + organization = payload.organization + if (not organization) and profile_id: + with SessionLocal() as db: + profile = db.get(TenantProfile, profile_id) + if profile and profile.primary_domain: + organization = profile.primary_domain + raw_targets = _enumerate_all_mailboxes( + organization=organization, + tenant_id=tenant_id, + client_id=client_id, + client_secret=client_secret, + profile_id=profile_id, + ) + source_type = "tenant_all" + else: + raw_targets = [str(m) for m in payload.mailboxes] + else: + raw_targets = [str(item) for item in payload.site_urls] + return _create_job_from_targets( + raw_targets=raw_targets, + scan_type=payload.scan_type, + skip_default_sites=payload.skip_default_sites, + source_type=source_type, + tenant_id=tenant_id, + client_id=client_id, + client_secret=client_secret, + tenant_profile_id=profile_id, + ) + + +@router.post("/api/scan-jobs/import-csv", response_model=ScanJobCreateResponse) +def create_scan_job_from_csv( + skip_default_sites: bool = True, + scan_type: str = Form("sharepoint"), + tenant_profile_id: str | None = Form(None), + tenant_id: str | None = Form(None), + client_id: str | None = Form(None), + client_secret: str | None = Form(None), + file: UploadFile = File(...), +) -> ScanJobCreateResponse: + with SessionLocal() as db: + resolved_tenant_id, resolved_client_id, resolved_client_secret, profile_id = _resolve_credentials( + db=db, + tenant_profile_id=tenant_profile_id, + tenant_id=tenant_id, + client_id=client_id, + client_secret=client_secret, + ) + content = file.file.read() + if scan_type == "mailbox": + parsed = parse_mailboxes_csv(content) + targets = parsed.mailboxes + elif scan_type == "entra_groups": + parsed = parse_entra_groups_csv(content) + targets = parsed.urls + else: + parsed = parse_sites_csv(content) + targets = parsed.urls + response = _create_job_from_targets( + raw_targets=targets, + scan_type=scan_type, + skip_default_sites=skip_default_sites, + source_type="csv", + tenant_id=resolved_tenant_id, + client_id=resolved_client_id, + client_secret=resolved_client_secret, + tenant_profile_id=profile_id, + ) + + if parsed.invalid_rows: + csv_warning = f"CSV issues: {len(parsed.invalid_rows)}" + with SessionLocal() as db: + job = db.get(ScanJob, response.job.id) + if job: + if job.warning_message: + job.warning_message = f"{job.warning_message} | {csv_warning}" + else: + job.warning_message = csv_warning + job.updated_at = datetime.now(timezone.utc) + db.commit() + db.refresh(job) + response.job.warning_message = job.warning_message + + return response + + +@router.post("/api/scan-jobs/{job_id}/cancel", response_model=ScanJobSummary) +def cancel_scan_job(job_id: str) -> ScanJobSummary: + with SessionLocal() as db: + stmt = select(ScanJob).options(joinedload(ScanJob.tenant_profile)).where(ScanJob.id == job_id) + job = db.execute(stmt).unique().scalar_one_or_none() + if not job: + raise HTTPException(status_code=404, detail="Job not found") + if job.status not in ("queued", "running"): + raise HTTPException(status_code=409, detail="Job is not queued or running") + now = datetime.now(timezone.utc) + job.status = "cancelled" + job.updated_at = now + job.finished_at = now + job.scan_activity = None + db.commit() + db.refresh(job) + stmt = select(ScanJob).options(joinedload(ScanJob.tenant_profile)).where(ScanJob.id == job_id) + job = db.execute(stmt).unique().scalar_one() + return _to_job_summary(job) + + +@router.delete("/api/scan-jobs/{job_id}", status_code=204, response_class=Response) +def delete_scan_job(job_id: str) -> Response: + with SessionLocal() as db: + job = db.get(ScanJob, job_id) + if not job: + raise HTTPException(status_code=404, detail="Job not found") + if job.status in ("queued", "running"): + raise HTTPException(status_code=409, detail="Cannot delete a job that is queued or running") + db.delete(job) + db.commit() + return Response(status_code=204) + + +@router.get("/api/scan-jobs", response_model=list[ScanJobSummary]) +def list_scan_jobs( + limit: int = 20, + tenant_profile_id: str | None = None, + scan_type: str | None = None, +) -> list[ScanJobSummary]: + with SessionLocal() as db: + stmt = ( + select(ScanJob) + .options(joinedload(ScanJob.tenant_profile)) + .order_by(ScanJob.created_at.desc()) + .limit(max(1, min(limit, 100))) + ) + if tenant_profile_id: + stmt = stmt.where(ScanJob.tenant_profile_id == tenant_profile_id) + if scan_type: + stmt = stmt.where(ScanJob.scan_type == scan_type) + jobs = list(db.execute(stmt).unique().scalars()) + return [_to_job_summary(job) for job in jobs] + + +@router.get("/api/scan-jobs/{job_id}/sharing-link-types", response_model=SharingLinkTypesResponse) +def get_sharing_link_types(job_id: str) -> SharingLinkTypesResponse: + with SessionLocal() as db: + job = db.get(ScanJob, job_id) + if not job: + raise HTTPException(status_code=404, detail="Job not found") + + principals = list( + db.execute( + select(PermissionDeviation.principal).where(PermissionDeviation.job_id == job_id) + ).scalars() + ) + + type_counts: dict[str, int] = {} + for principal in principals: + parsed = _extract_sharing_link_group_and_type(str(principal or "")) + if not parsed: + continue + _group_name, link_type = parsed + type_counts[link_type] = type_counts.get(link_type, 0) + 1 + + return SharingLinkTypesResponse(type_counts=type_counts) + + +@router.post("/api/scan-jobs/{job_id}/resolve-sharing-links", response_model=ResolveSharingLinksResponse) +def resolve_sharing_links_endpoint(job_id: str, payload: ResolveSharingLinksRequest) -> ResolveSharingLinksResponse: + from .scanner import resolve_sharing_link_members + + with SessionLocal() as db: + job = db.get(ScanJob, job_id) + if not job: + raise HTTPException(status_code=404, detail="Job not found") + if job.status in ("queued", "running"): + raise HTTPException(status_code=409, detail="Job is still running") + + cert_private_key: str | None = None + cert_thumbprint: str | None = None + cert_public_pem: str | None = None + if job.tenant_profile_id: + profile = db.get(TenantProfile, job.tenant_profile_id) + if profile: + cert_private_key = profile.cert_private_key + cert_thumbprint = profile.cert_thumbprint + cert_public_pem = profile.cert_public_pem + + auth = AuthConfig( + tenant_id=job.auth_tenant_id or "", + client_id=job.auth_client_id or "", + client_secret=job.auth_client_secret or "", + cert_private_key=cert_private_key, + cert_thumbprint=cert_thumbprint, + cert_public_pem=cert_public_pem, + ) + + all_deviations = list( + db.execute(select(PermissionDeviation).where(PermissionDeviation.job_id == job_id)).scalars() + ) + + # Group by (site_url, principal) so each unique group is resolved once + groups: dict[tuple[str, str], list[int]] = {} + for dev in all_deviations: + parsed = _extract_sharing_link_group_and_type(dev.principal) + if not parsed: + continue + group_name, link_type = parsed + if link_type not in payload.link_types: + continue + key = (dev.site_url, group_name) + groups.setdefault(key, []).append(dev.id) + + updated_deviations = 0 + for (site_url, group_name), dev_ids in groups.items(): + members = resolve_sharing_link_members(site_url, group_name, auth) + resolved_members = ", ".join(members) if members else "" + with SessionLocal() as db: + for dev_id in dev_ids: + dev = db.get(PermissionDeviation, dev_id) + if dev: + dev.resolved_members = resolved_members + db.commit() + updated_deviations += len(dev_ids) + + return ResolveSharingLinksResponse( + resolved_groups=len(groups), + updated_deviations=updated_deviations, + ) + + +@router.post("/api/scan-jobs/{job_id}/resolve-groups", response_model=ResolveGroupsResponse) +def resolve_groups_endpoint(job_id: str) -> ResolveGroupsResponse: + """ + Expand group principals on this job's deviations and write each group's + member list to permission_deviations.resolved_members. Handles both + classic SharePoint groups (via getbyname) and Entra/AAD or M365 groups + assigned directly at root (via Microsoft Graph). Skips email-shape users + and SharingLinks groups (those have their own resolver). + """ + from .scanners.sharepoint import ( + is_aad_group_principal, + is_sharepoint_group_principal, + resolve_aad_group_members, + resolve_sharing_link_members, + ) + + with SessionLocal() as db: + job = db.get(ScanJob, job_id) + if not job: + raise HTTPException(status_code=404, detail="Job not found") + if job.status in ("queued", "running"): + raise HTTPException(status_code=409, detail="Job is still running") + if (job.scan_type or "sharepoint") == "mailbox": + raise HTTPException(status_code=400, detail="Group resolution is only available for SharePoint jobs") + + cert_private_key: str | None = None + cert_thumbprint: str | None = None + cert_public_pem: str | None = None + if job.tenant_profile_id: + profile = db.get(TenantProfile, job.tenant_profile_id) + if profile: + cert_private_key = profile.cert_private_key + cert_thumbprint = profile.cert_thumbprint + cert_public_pem = profile.cert_public_pem + + auth = AuthConfig( + tenant_id=job.auth_tenant_id or "", + client_id=job.auth_client_id or "", + client_secret=job.auth_client_secret or "", + cert_private_key=cert_private_key, + cert_thumbprint=cert_thumbprint, + cert_public_pem=cert_public_pem, + ) + + all_deviations = list( + db.execute(select(PermissionDeviation).where(PermissionDeviation.job_id == job_id)).scalars() + ) + + # Group deviations by (site_url, principal) so each unique group is resolved once + groups: dict[tuple[str, str], list[int]] = {} + for dev in all_deviations: + if not (is_sharepoint_group_principal(dev.principal) or is_aad_group_principal(dev.principal)): + continue + key = (dev.site_url, dev.principal) + groups.setdefault(key, []).append(dev.id) + + resolved = 0 + skipped = 0 + updated = 0 + for (site_url, group_name), dev_ids in groups.items(): + try: + if is_aad_group_principal(group_name): + members = resolve_aad_group_members(group_name, auth) + else: + members = resolve_sharing_link_members(site_url, group_name, auth) + except Exception: # noqa: BLE001 + members = [] + + if not members: + skipped += 1 + continue + + resolved_text = ", ".join(members) + with SessionLocal() as db: + for dev_id in dev_ids: + dev = db.get(PermissionDeviation, dev_id) + if dev: + dev.resolved_members = resolved_text + db.commit() + resolved += 1 + updated += len(dev_ids) + + return ResolveGroupsResponse( + resolved_groups=resolved, + skipped_groups=skipped, + updated_deviations=updated, + ) + + +@router.post("/api/scan-jobs/{job_id}/targets/{target_id}/test-connection", response_model=ProbeResultResponse) +def test_target_connection(job_id: str, target_id: int) -> ProbeResultResponse: + with SessionLocal() as db: + job = db.get(ScanJob, job_id) + if not job: + raise HTTPException(status_code=404, detail="Job not found") + target = db.get(ScanTarget, target_id) + if not target or target.job_id != job_id: + raise HTTPException(status_code=404, detail="Target not found") + if job.status in ("queued", "running"): + raise HTTPException(status_code=409, detail="Job is still running") + + cert_private_key: str | None = None + cert_thumbprint: str | None = None + cert_public_pem: str | None = None + if job.tenant_profile_id: + profile = db.get(TenantProfile, job.tenant_profile_id) + if profile: + cert_private_key = profile.cert_private_key + cert_thumbprint = profile.cert_thumbprint + cert_public_pem = profile.cert_public_pem + + auth = AuthConfig( + tenant_id=job.auth_tenant_id or "", + client_id=job.auth_client_id or "", + client_secret=job.auth_client_secret or "", + cert_private_key=cert_private_key, + cert_thumbprint=cert_thumbprint, + cert_public_pem=cert_public_pem, + ) + site_url = target.site_url + job_scan_type = job.scan_type or "sharepoint" + + result = probe(job_scan_type, site_url, auth) + + with SessionLocal() as db: + target = db.get(ScanTarget, target_id) + if not target: + raise HTTPException(status_code=404, detail="Target not found") + now = datetime.now(timezone.utc) + target.last_probe_at = now + target.last_probe_ok = result.ok + target.last_probe_message = result.message + target.updated_at = now + db.commit() + db.refresh(target) + return ProbeResultResponse( + target_id=target.id, + ok=result.ok, + message=result.message, + last_probe_at=target.last_probe_at, + ) + + +@router.get("/api/scan-jobs/{job_id}/export") +def export_scan_job(job_id: str, site_url: str | None = None) -> StreamingResponse: + import openpyxl + from openpyxl.styles import Font, PatternFill + + with SessionLocal() as db: + job = db.get(ScanJob, job_id, options=[joinedload(ScanJob.tenant_profile)]) + if not job: + raise HTTPException(status_code=404, detail="Job not found") + + targets_q = select(ScanTarget).where(ScanTarget.job_id == job.id).order_by(ScanTarget.id.asc()) + if site_url: + targets_q = targets_q.where(ScanTarget.site_url == site_url) + targets = list(db.execute(targets_q).scalars()) + + deviations_q = ( + select(PermissionDeviation) + .where(PermissionDeviation.job_id == job.id) + .order_by(PermissionDeviation.id.desc()) + ) + if site_url: + deviations_q = deviations_q.where(PermissionDeviation.site_url == site_url) + deviations = list(db.execute(deviations_q).scalars()) + + wb = openpyxl.Workbook() + header_fill = PatternFill(start_color="1E2A3A", end_color="1E2A3A", fill_type="solid") + header_font_white = Font(bold=True, color="FFFFFF") + + _risk_styles: dict[str, tuple] = { + "Critical": ( + PatternFill(start_color="FDDEDE", end_color="FDDEDE", fill_type="solid"), + Font(bold=True, color="7B0000"), + ), + "High": ( + PatternFill(start_color="FEE8D3", end_color="FEE8D3", fill_type="solid"), + Font(bold=True, color="7C2D00"), + ), + "Low": ( + PatternFill(start_color="D6EEF8", end_color="D6EEF8", fill_type="solid"), + Font(bold=True, color="0C4A6E"), + ), + "Unknown": ( + PatternFill(start_color="F0F0F0", end_color="F0F0F0", fill_type="solid"), + Font(bold=True, color="555555"), + ), + } + + def _style_header(ws, headers): + ws.append(headers) + for cell in ws[1]: + cell.font = header_font_white + cell.fill = header_fill + + scan_type = job.scan_type or "sharepoint" + + target_label = { + "sharepoint": "Site URL", + "sharepoint_root": "Site URL", + "mailbox": "Mailbox", + "entra_groups": "Group", + }.get(scan_type, "Target") + + # Targets sheet + ws_targets = wb.active + ws_targets.title = "Targets" + _style_header(ws_targets, [target_label, "Status", "Attempts", "Error", "Started", "Finished"]) + for t in targets: + ws_targets.append([ + t.site_url, + t.status, + t.attempts, + t.error_message or "", + t.started_at.isoformat() if t.started_at else "", + t.finished_at.isoformat() if t.finished_at else "", + ]) + for col in ws_targets.columns: + ws_targets.column_dimensions[col[0].column_letter].width = max(len(str(c.value or "")) for c in col) + 4 + + # Results sheet — name and columns depend on scan type + if scan_type == "mailbox": + ws_dev = wb.create_sheet("Mailbox Permissions") + _style_header(ws_dev, ["Mailbox", "Object", "Permission Type", "Principal", "Access Rights"]) + deviations.sort(key=lambda d: (d.site_url or "", d.permission_type or "", d.principal or "")) + for d in deviations: + ws_dev.append([ + d.site_url, + d.object_url, + d.permission_type or d.object_type, + d.principal, + d.role_name, + ]) + elif scan_type == "entra_groups": + ws_dev = wb.create_sheet("Group Memberships") + _style_header(ws_dev, ["Group", "Group Type", "User", "Role"]) + deviations.sort(key=lambda d: (d.object_url or "", d.role_name or "", d.principal or "")) + for d in deviations: + ws_dev.append([ + d.object_url, + d.permission_type or "", + d.principal, + d.role_name, + ]) + elif scan_type == "sharepoint_root": + ws_dev = wb.create_sheet("Root Permissions") + _style_header(ws_dev, ["Site URL", "Principal", "Resolved Members", "Role"]) + deviations.sort(key=lambda d: (d.site_url or "", d.principal or "", d.role_name or "")) + for d in deviations: + ws_dev.append([ + d.site_url, + d.principal, + d.resolved_members or "", + d.role_name, + ]) + else: + ws_dev = wb.create_sheet("Deviations") + _style_header(ws_dev, ["Site URL", "Object URL", "Object Type", "Principal", "Link Risk", "Resolved Members", "Role", "Delta"]) + deviations.sort(key=lambda d: (d.site_url or "", d.object_url or "", d.principal or "")) + for d in deviations: + base = (d.site_url or "").rstrip("/") + obj_rel = d.object_url[len(base):] if base and d.object_url.startswith(base) else d.object_url + link_risk = _sharing_link_risk_label(d.principal) + ws_dev.append([ + d.site_url, + obj_rel, + d.object_type, + d.principal, + link_risk, + d.resolved_members or "", + d.role_name, + d.delta_type, + ]) + if link_risk in _risk_styles: + risk_fill, risk_font = _risk_styles[link_risk] + risk_cell = ws_dev.cell(row=ws_dev.max_row, column=5) + risk_cell.fill = risk_fill + risk_cell.font = risk_font + for col in ws_dev.columns: + ws_dev.column_dimensions[col[0].column_letter].width = max(len(str(c.value or "")) for c in col) + 4 + + buf = io.BytesIO() + wb.save(buf) + buf.seek(0) + + filename = _build_export_filename(job, job_id) + return StreamingResponse( + buf, + media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + headers={"Content-Disposition": f'attachment; filename="{filename}"'}, + ) + + +@router.get("/api/scan-jobs/{job_id}", response_model=ScanJobDetail) +def get_scan_job(job_id: str, site_url: str | None = None) -> ScanJobDetail: + with SessionLocal() as db: + job = db.get(ScanJob, job_id, options=[joinedload(ScanJob.tenant_profile)]) + if not job: + raise HTTPException(status_code=404, detail="Job not found") + + targets_q = select(ScanTarget).where(ScanTarget.job_id == job.id).order_by(ScanTarget.id.asc()) + if site_url: + targets_q = targets_q.where(ScanTarget.site_url == site_url) + targets = list(db.execute(targets_q).scalars()) + + deviations_q = ( + select(PermissionDeviation) + .where(PermissionDeviation.job_id == job.id) + .order_by(PermissionDeviation.site_url.asc(), PermissionDeviation.object_url.asc(), PermissionDeviation.id.asc()) + ) + if site_url: + deviations_q = deviations_q.where(PermissionDeviation.site_url == site_url) + else: + deviations_q = deviations_q.limit(1000) + deviations = list(db.execute(deviations_q).scalars()) + + return ScanJobDetail( + **_to_job_summary(job).model_dump(), + targets=[ + ScanTargetItem( + id=t.id, + site_url=t.site_url, + status=t.status, + attempts=t.attempts, + error_message=t.error_message, + started_at=t.started_at, + finished_at=t.finished_at, + last_probe_at=t.last_probe_at, + last_probe_ok=t.last_probe_ok, + last_probe_message=t.last_probe_message, + ) + for t in targets + ], + deviations=[ + PermissionDeviationItem( + id=d.id, + site_url=d.site_url, + object_url=d.object_url, + object_type=d.object_type, + principal=d.principal, + role_name=d.role_name, + delta_type=d.delta_type, + permission_type=d.permission_type, + resolved_members=d.resolved_members, + created_at=d.created_at, + ) + for d in deviations + ], + ) diff --git a/containers/clearview/src/clearview_app/api_onboarding.py b/containers/clearview/src/clearview_app/api_onboarding.py new file mode 100644 index 0000000..979107c --- /dev/null +++ b/containers/clearview/src/clearview_app/api_onboarding.py @@ -0,0 +1,76 @@ +"""Microsoft onboarding routes (admin-consent connect + scan-app creation).""" +from __future__ import annotations + +from fastapi import APIRouter, HTTPException +from fastapi.responses import RedirectResponse + +from .onboarding import ( + OnboardingError, + consume_callback_state, + create_connect_url, + create_scan_app_for_tenant, +) +from .schemas import ( + ConnectMicrosoftResponse, + CreateScanAppRequest, + CreateScanAppResponse, +) + +router = APIRouter() + + +@router.post("/api/onboarding/create-scan-app", response_model=CreateScanAppResponse) +def onboarding_create_scan_app(payload: CreateScanAppRequest) -> CreateScanAppResponse: + try: + result = create_scan_app_for_tenant( + tenant_id=payload.tenant_id, + display_name=payload.display_name, + ) + except OnboardingError as exc: + raise HTTPException(status_code=400, detail=str(exc)) from exc + except Exception as exc: # noqa: BLE001 + raise HTTPException(status_code=500, detail=f"Unexpected onboarding error: {exc}") from exc + + return CreateScanAppResponse( + tenant_id=result.tenant_id, + client_id=result.client_id, + client_secret=result.client_secret, + app_object_id=result.app_object_id, + service_principal_id=result.service_principal_id, + display_name=result.display_name, + ) + + +@router.get("/api/onboarding/microsoft/connect-url", response_model=ConnectMicrosoftResponse) +def onboarding_microsoft_connect_url() -> ConnectMicrosoftResponse: + try: + return ConnectMicrosoftResponse(connect_url=create_connect_url()) + except OnboardingError as exc: + raise HTTPException(status_code=400, detail=str(exc)) from exc + + +@router.get("/api/onboarding/microsoft/callback") +def onboarding_microsoft_callback( + tenant: str | None = None, + state: str | None = None, + error: str | None = None, + error_description: str | None = None, +) -> RedirectResponse: + if error: + message = (error_description or error).replace(" ", "+") + return RedirectResponse(url=f"/?onboarding_status=error&onboarding_message={message}") + + if not state or not consume_callback_state(state): + return RedirectResponse(url="/?onboarding_status=error&onboarding_message=invalid_or_expired_state") + + if not tenant: + return RedirectResponse(url="/?onboarding_status=error&onboarding_message=missing_tenant") + + return RedirectResponse(url=f"/?onboarding_status=connected&tenant_id={tenant}") + + +@router.get("/api/onboarding/status") +def onboarding_status() -> dict[str, bool]: + from . import config + automated = bool(config.ONBOARDING_CLIENT_ID and config.ONBOARDING_CLIENT_SECRET and config.ONBOARDING_REDIRECT_URI) + return {"automated_available": automated} diff --git a/containers/clearview/src/clearview_app/api_tenants.py b/containers/clearview/src/clearview_app/api_tenants.py new file mode 100644 index 0000000..f8cb473 --- /dev/null +++ b/containers/clearview/src/clearview_app/api_tenants.py @@ -0,0 +1,86 @@ +"""Tenant profile + certificate routes.""" +from __future__ import annotations + +import uuid +from datetime import datetime, timezone + +from fastapi import APIRouter, HTTPException +from fastapi.responses import Response +from sqlalchemy import select, text + +from .api_helpers import _to_tenant_item +from .cert import generate_tenant_certificate +from .db import SessionLocal +from .models import TenantProfile +from .schemas import ( + CreateTenantProfileRequest, + TenantCertificateResponse, + TenantProfileItem, +) + +router = APIRouter() + + +@router.get("/api/tenants", response_model=list[TenantProfileItem]) +def list_tenants() -> list[TenantProfileItem]: + with SessionLocal() as db: + profiles = list( + db.execute(select(TenantProfile).order_by(TenantProfile.created_at.asc())).scalars() + ) + return [_to_tenant_item(p) for p in profiles] + + +@router.post("/api/tenants", response_model=TenantProfileItem, status_code=201) +def create_tenant(payload: CreateTenantProfileRequest) -> TenantProfileItem: + with SessionLocal() as db: + now = datetime.now(timezone.utc) + profile = TenantProfile( + id=str(uuid.uuid4()), + name=payload.name.strip(), + tenant_id=payload.tenant_id.strip(), + primary_domain=payload.primary_domain.strip().lower() if payload.primary_domain else None, + client_id=payload.client_id.strip(), + client_secret=payload.client_secret.strip() if payload.client_secret else None, + created_at=now, + updated_at=now, + ) + db.add(profile) + db.commit() + db.refresh(profile) + return _to_tenant_item(profile) + + +@router.post("/api/tenants/{profile_id}/generate-certificate", response_model=TenantCertificateResponse) +def generate_certificate(profile_id: str) -> TenantCertificateResponse: + with SessionLocal() as db: + profile = db.get(TenantProfile, profile_id) + if not profile: + raise HTTPException(status_code=404, detail="Tenant profile not found") + result = generate_tenant_certificate() + profile.cert_private_key = result.private_key_pem + profile.cert_public_pem = result.public_cert_pem + profile.cert_thumbprint = result.thumbprint + profile.cert_expires_at = result.expires_at + profile.updated_at = datetime.now(timezone.utc) + db.commit() + return TenantCertificateResponse( + thumbprint=result.thumbprint, + expires_at=result.expires_at, + public_cert_pem=result.public_cert_pem, + ) + + +@router.delete("/api/tenants/{profile_id}", status_code=204, response_class=Response) +def delete_tenant(profile_id: str) -> Response: + with SessionLocal() as db: + profile = db.get(TenantProfile, profile_id) + if not profile: + raise HTTPException(status_code=404, detail="Tenant profile not found") + # Detach jobs from this profile before deleting + db.execute( + text("UPDATE scan_jobs SET tenant_profile_id = NULL WHERE tenant_profile_id = :pid"), + {"pid": profile_id}, + ) + db.delete(profile) + db.commit() + return Response(status_code=204) diff --git a/containers/clearview/src/clearview_app/cert.py b/containers/clearview/src/clearview_app/cert.py index accbb54..85f035f 100644 --- a/containers/clearview/src/clearview_app/cert.py +++ b/containers/clearview/src/clearview_app/cert.py @@ -2,7 +2,7 @@ from __future__ import annotations import hashlib from dataclasses import dataclass -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from cryptography import x509 from cryptography.hazmat.primitives import hashes, serialization @@ -30,7 +30,7 @@ def generate_tenant_certificate(valid_years: int = 2) -> GeneratedCertificate: subject = x509.Name([ x509.NameAttribute(NameOID.COMMON_NAME, "Clearview Scan App"), ]) - expires_at = datetime.utcnow() + timedelta(days=365 * valid_years) + expires_at = datetime.now(timezone.utc) + timedelta(days=365 * valid_years) cert = ( x509.CertificateBuilder() @@ -38,7 +38,7 @@ def generate_tenant_certificate(valid_years: int = 2) -> GeneratedCertificate: .issuer_name(subject) .public_key(private_key.public_key()) .serial_number(x509.random_serial_number()) - .not_valid_before(datetime.utcnow()) + .not_valid_before(datetime.now(timezone.utc)) .not_valid_after(expires_at) .sign(private_key, hashes.SHA256()) ) diff --git a/containers/clearview/src/clearview_app/db_migrate.py b/containers/clearview/src/clearview_app/db_migrate.py new file mode 100644 index 0000000..02bfefd --- /dev/null +++ b/containers/clearview/src/clearview_app/db_migrate.py @@ -0,0 +1,53 @@ +"""Database migration bootstrap. + +Replaces the previous ``Base.metadata.create_all`` + ``_ensure_schema_columns`` +startup path with Alembic. The bootstrap is idempotent and handles three cases: + +* **Fresh database** (no tables): run ``upgrade head`` to create the schema and + record the Alembic version. +* **Existing pre-Alembic database** (tables present, no ``alembic_version``): + ``stamp head`` — adopt the baseline without re-creating existing tables. +* **Already under Alembic**: run ``upgrade head`` to apply any new revisions. +""" +from __future__ import annotations + +import logging +from pathlib import Path + +from alembic import command +from alembic.config import Config +from sqlalchemy import inspect + +from .db import engine + +log = logging.getLogger(__name__) + +_MIGRATIONS_DIR = Path(__file__).resolve().parent / "migrations" +# A table that exists in every pre-Alembic Clearview database; its presence +# (without alembic_version) marks a database that predates Alembic adoption. +_SENTINEL_TABLE = "scan_jobs" + + +def _alembic_config() -> Config: + cfg = Config() + cfg.set_main_option("script_location", str(_MIGRATIONS_DIR)) + return cfg + + +_BASELINE_REVISION = "0001_baseline" + + +def run_migrations() -> None: + """Bring the database schema up to date (see module docstring).""" + cfg = _alembic_config() + tables = set(inspect(engine).get_table_names()) + + if "alembic_version" not in tables and _SENTINEL_TABLE in tables: + # Pre-Alembic DB: it already matches the baseline, so adopt that + # revision without re-creating tables, then let upgrade apply any + # later migrations (e.g. the timestamptz conversion in 0002). + log.info("Existing pre-Alembic schema detected; stamping baseline %s.", _BASELINE_REVISION) + command.stamp(cfg, _BASELINE_REVISION) + + log.info("Applying Alembic migrations (upgrade head).") + command.upgrade(cfg, "head") diff --git a/containers/clearview/src/clearview_app/main.py b/containers/clearview/src/clearview_app/main.py index 7a8ff30..bb1da71 100644 --- a/containers/clearview/src/clearview_app/main.py +++ b/containers/clearview/src/clearview_app/main.py @@ -1,43 +1,21 @@ +"""Clearview API composition root. + +Routes live in the ``api_tenants``, ``api_jobs``, and ``api_onboarding`` modules +(shared helpers in ``api_helpers``). This module only wires the FastAPI app, +the scan worker lifecycle, health/version endpoints, and static file serving. +""" from __future__ import annotations -import io -import re -import uuid -from datetime import datetime from pathlib import Path -from fastapi import FastAPI, File, Form, HTTPException, UploadFile -from fastapi.responses import FileResponse, RedirectResponse, Response, StreamingResponse +from fastapi import FastAPI +from fastapi.responses import FileResponse from fastapi.staticfiles import StaticFiles -from sqlalchemy import select, text -from sqlalchemy.orm import joinedload -from .csv_import import parse_entra_groups_csv, parse_mailboxes_csv, parse_sites_csv -from .db import SessionLocal, engine -from .default_sites import is_default_site, normalize_site_url -from .models import Base, PermissionDeviation, ScanJob, ScanTarget, TenantProfile -from .onboarding import OnboardingError, consume_callback_state, create_connect_url, create_scan_app_for_tenant -from .cert import generate_tenant_certificate -from .schemas import ( - ConnectMicrosoftResponse, - CreateScanAppRequest, - CreateScanAppResponse, - CreateScanJobRequest, - CreateTenantProfileRequest, - PermissionDeviationItem, - ProbeResultResponse, - ResolveGroupsResponse, - ResolveSharingLinksRequest, - ResolveSharingLinksResponse, - SharingLinkTypesResponse, - ScanJobCreateResponse, - ScanJobDetail, - ScanJobSummary, - ScanTargetItem, - TenantCertificateResponse, - TenantProfileItem, -) -from .scanners import AuthConfig, probe +from .api_jobs import router as jobs_router +from .api_onboarding import router as onboarding_router +from .api_tenants import router as tenants_router +from .db_migrate import run_migrations from .version import display_version from .worker import ScanWorker @@ -47,38 +25,9 @@ worker = ScanWorker() SITE_DIR = Path(__file__).resolve().parents[2] / "site" -def _extract_sharing_link_group_and_type(principal: str) -> tuple[str, str] | None: - """ - Extract (group_name, link_type) from principal values such as: - - SharingLinks... - - c:0o.c|federateddirectoryclaimprovider|SharingLinks... - """ - if not principal: - return None - - text = principal.strip() - segments = [s.strip() for s in text.split("|") if s.strip()] - - candidate = "" - for segment in reversed(segments): - if segment.lower().startswith("sharinglinks."): - candidate = segment - break - if not candidate and text.lower().startswith("sharinglinks."): - candidate = text - if not candidate: - return None - - parts = candidate.split(".") - if len(parts) < 3: - return None - return candidate, parts[2] - - @app.on_event("startup") def on_startup() -> None: - Base.metadata.create_all(bind=engine) - _ensure_schema_columns() + run_migrations() worker.start() @@ -98,747 +47,13 @@ def version() -> dict[str, str]: return {"version": display_version()} -# --------------------------------------------------------------------------- -# Tenant profiles -# --------------------------------------------------------------------------- - -@app.get("/api/tenants", response_model=list[TenantProfileItem]) -def list_tenants() -> list[TenantProfileItem]: - with SessionLocal() as db: - profiles = list( - db.execute(select(TenantProfile).order_by(TenantProfile.created_at.asc())).scalars() - ) - return [_to_tenant_item(p) for p in profiles] - - -@app.post("/api/tenants", response_model=TenantProfileItem, status_code=201) -def create_tenant(payload: CreateTenantProfileRequest) -> TenantProfileItem: - with SessionLocal() as db: - now = datetime.utcnow() - profile = TenantProfile( - id=str(uuid.uuid4()), - name=payload.name.strip(), - tenant_id=payload.tenant_id.strip(), - primary_domain=payload.primary_domain.strip().lower() if payload.primary_domain else None, - client_id=payload.client_id.strip(), - client_secret=payload.client_secret.strip() if payload.client_secret else None, - created_at=now, - updated_at=now, - ) - db.add(profile) - db.commit() - db.refresh(profile) - return _to_tenant_item(profile) - - -@app.post("/api/tenants/{profile_id}/generate-certificate", response_model=TenantCertificateResponse) -def generate_certificate(profile_id: str) -> TenantCertificateResponse: - with SessionLocal() as db: - profile = db.get(TenantProfile, profile_id) - if not profile: - raise HTTPException(status_code=404, detail="Tenant profile not found") - result = generate_tenant_certificate() - profile.cert_private_key = result.private_key_pem - profile.cert_public_pem = result.public_cert_pem - profile.cert_thumbprint = result.thumbprint - profile.cert_expires_at = result.expires_at - profile.updated_at = datetime.utcnow() - db.commit() - return TenantCertificateResponse( - thumbprint=result.thumbprint, - expires_at=result.expires_at, - public_cert_pem=result.public_cert_pem, - ) - - -@app.delete("/api/tenants/{profile_id}", status_code=204, response_class=Response) -def delete_tenant(profile_id: str) -> Response: - with SessionLocal() as db: - profile = db.get(TenantProfile, profile_id) - if not profile: - raise HTTPException(status_code=404, detail="Tenant profile not found") - # Detach jobs from this profile before deleting - db.execute( - text("UPDATE scan_jobs SET tenant_profile_id = NULL WHERE tenant_profile_id = :pid"), - {"pid": profile_id}, - ) - db.delete(profile) - db.commit() - return Response(status_code=204) +app.include_router(tenants_router) +app.include_router(jobs_router) +app.include_router(onboarding_router) # --------------------------------------------------------------------------- -# Scan jobs -# --------------------------------------------------------------------------- - -@app.post("/api/scan-jobs", response_model=ScanJobCreateResponse) -def create_scan_job(payload: CreateScanJobRequest) -> ScanJobCreateResponse: - with SessionLocal() as db: - tenant_id, client_id, client_secret, profile_id = _resolve_credentials( - db=db, - tenant_profile_id=payload.tenant_profile_id, - tenant_id=payload.tenant_id, - client_id=payload.client_id, - client_secret=payload.client_secret, - ) - source_type = "manual" - if payload.scan_type == "entra_groups": - if payload.scan_all_groups: - raw_targets = _enumerate_all_entra_groups( - tenant_id=tenant_id, - client_id=client_id, - client_secret=client_secret, - profile_id=profile_id, - ) - source_type = "tenant_all" - else: - raw_targets = [str(g) for g in payload.group_ids] - elif payload.scan_type == "mailbox": - if payload.scan_all_mailboxes: - organization = payload.organization - if (not organization) and profile_id: - with SessionLocal() as db: - profile = db.get(TenantProfile, profile_id) - if profile and profile.primary_domain: - organization = profile.primary_domain - raw_targets = _enumerate_all_mailboxes( - organization=organization, - tenant_id=tenant_id, - client_id=client_id, - client_secret=client_secret, - profile_id=profile_id, - ) - source_type = "tenant_all" - else: - raw_targets = [str(m) for m in payload.mailboxes] - else: - raw_targets = [str(item) for item in payload.site_urls] - return _create_job_from_targets( - raw_targets=raw_targets, - scan_type=payload.scan_type, - skip_default_sites=payload.skip_default_sites, - source_type=source_type, - tenant_id=tenant_id, - client_id=client_id, - client_secret=client_secret, - tenant_profile_id=profile_id, - ) - - -@app.post("/api/scan-jobs/import-csv", response_model=ScanJobCreateResponse) -def create_scan_job_from_csv( - skip_default_sites: bool = True, - scan_type: str = Form("sharepoint"), - tenant_profile_id: str | None = Form(None), - tenant_id: str | None = Form(None), - client_id: str | None = Form(None), - client_secret: str | None = Form(None), - file: UploadFile = File(...), -) -> ScanJobCreateResponse: - with SessionLocal() as db: - resolved_tenant_id, resolved_client_id, resolved_client_secret, profile_id = _resolve_credentials( - db=db, - tenant_profile_id=tenant_profile_id, - tenant_id=tenant_id, - client_id=client_id, - client_secret=client_secret, - ) - content = file.file.read() - if scan_type == "mailbox": - parsed = parse_mailboxes_csv(content) - targets = parsed.mailboxes - elif scan_type == "entra_groups": - parsed = parse_entra_groups_csv(content) - targets = parsed.urls - else: - parsed = parse_sites_csv(content) - targets = parsed.urls - response = _create_job_from_targets( - raw_targets=targets, - scan_type=scan_type, - skip_default_sites=skip_default_sites, - source_type="csv", - tenant_id=resolved_tenant_id, - client_id=resolved_client_id, - client_secret=resolved_client_secret, - tenant_profile_id=profile_id, - ) - - if parsed.invalid_rows: - csv_warning = f"CSV issues: {len(parsed.invalid_rows)}" - with SessionLocal() as db: - job = db.get(ScanJob, response.job.id) - if job: - if job.warning_message: - job.warning_message = f"{job.warning_message} | {csv_warning}" - else: - job.warning_message = csv_warning - job.updated_at = datetime.utcnow() - db.commit() - db.refresh(job) - response.job.warning_message = job.warning_message - - return response - - -@app.post("/api/scan-jobs/{job_id}/cancel", response_model=ScanJobSummary) -def cancel_scan_job(job_id: str) -> ScanJobSummary: - with SessionLocal() as db: - stmt = select(ScanJob).options(joinedload(ScanJob.tenant_profile)).where(ScanJob.id == job_id) - job = db.execute(stmt).unique().scalar_one_or_none() - if not job: - raise HTTPException(status_code=404, detail="Job not found") - if job.status not in ("queued", "running"): - raise HTTPException(status_code=409, detail="Job is not queued or running") - now = datetime.utcnow() - job.status = "cancelled" - job.updated_at = now - job.finished_at = now - job.scan_activity = None - db.commit() - db.refresh(job) - stmt = select(ScanJob).options(joinedload(ScanJob.tenant_profile)).where(ScanJob.id == job_id) - job = db.execute(stmt).unique().scalar_one() - return _to_job_summary(job) - - -@app.delete("/api/scan-jobs/{job_id}", status_code=204, response_class=Response) -def delete_scan_job(job_id: str) -> Response: - with SessionLocal() as db: - job = db.get(ScanJob, job_id) - if not job: - raise HTTPException(status_code=404, detail="Job not found") - if job.status in ("queued", "running"): - raise HTTPException(status_code=409, detail="Cannot delete a job that is queued or running") - db.delete(job) - db.commit() - return Response(status_code=204) - - -@app.get("/api/scan-jobs", response_model=list[ScanJobSummary]) -def list_scan_jobs( - limit: int = 20, - tenant_profile_id: str | None = None, - scan_type: str | None = None, -) -> list[ScanJobSummary]: - with SessionLocal() as db: - stmt = ( - select(ScanJob) - .options(joinedload(ScanJob.tenant_profile)) - .order_by(ScanJob.created_at.desc()) - .limit(max(1, min(limit, 100))) - ) - if tenant_profile_id: - stmt = stmt.where(ScanJob.tenant_profile_id == tenant_profile_id) - if scan_type: - stmt = stmt.where(ScanJob.scan_type == scan_type) - jobs = list(db.execute(stmt).unique().scalars()) - return [_to_job_summary(job) for job in jobs] - - -@app.get("/api/scan-jobs/{job_id}/sharing-link-types", response_model=SharingLinkTypesResponse) -def get_sharing_link_types(job_id: str) -> SharingLinkTypesResponse: - with SessionLocal() as db: - job = db.get(ScanJob, job_id) - if not job: - raise HTTPException(status_code=404, detail="Job not found") - - principals = list( - db.execute( - select(PermissionDeviation.principal).where(PermissionDeviation.job_id == job_id) - ).scalars() - ) - - type_counts: dict[str, int] = {} - for principal in principals: - parsed = _extract_sharing_link_group_and_type(str(principal or "")) - if not parsed: - continue - _group_name, link_type = parsed - type_counts[link_type] = type_counts.get(link_type, 0) + 1 - - return SharingLinkTypesResponse(type_counts=type_counts) - - -@app.post("/api/scan-jobs/{job_id}/resolve-sharing-links", response_model=ResolveSharingLinksResponse) -def resolve_sharing_links_endpoint(job_id: str, payload: ResolveSharingLinksRequest) -> ResolveSharingLinksResponse: - from .scanner import resolve_sharing_link_members - - with SessionLocal() as db: - job = db.get(ScanJob, job_id) - if not job: - raise HTTPException(status_code=404, detail="Job not found") - if job.status in ("queued", "running"): - raise HTTPException(status_code=409, detail="Job is still running") - - cert_private_key: str | None = None - cert_thumbprint: str | None = None - cert_public_pem: str | None = None - if job.tenant_profile_id: - profile = db.get(TenantProfile, job.tenant_profile_id) - if profile: - cert_private_key = profile.cert_private_key - cert_thumbprint = profile.cert_thumbprint - cert_public_pem = profile.cert_public_pem - - auth = AuthConfig( - tenant_id=job.auth_tenant_id or "", - client_id=job.auth_client_id or "", - client_secret=job.auth_client_secret or "", - cert_private_key=cert_private_key, - cert_thumbprint=cert_thumbprint, - cert_public_pem=cert_public_pem, - ) - - all_deviations = list( - db.execute(select(PermissionDeviation).where(PermissionDeviation.job_id == job_id)).scalars() - ) - - # Group by (site_url, principal) so each unique group is resolved once - groups: dict[tuple[str, str], list[int]] = {} - for dev in all_deviations: - parsed = _extract_sharing_link_group_and_type(dev.principal) - if not parsed: - continue - group_name, link_type = parsed - if link_type not in payload.link_types: - continue - key = (dev.site_url, group_name) - groups.setdefault(key, []).append(dev.id) - - updated_deviations = 0 - for (site_url, group_name), dev_ids in groups.items(): - members = resolve_sharing_link_members(site_url, group_name, auth) - resolved_members = ", ".join(members) if members else "" - with SessionLocal() as db: - for dev_id in dev_ids: - dev = db.get(PermissionDeviation, dev_id) - if dev: - dev.resolved_members = resolved_members - db.commit() - updated_deviations += len(dev_ids) - - return ResolveSharingLinksResponse( - resolved_groups=len(groups), - updated_deviations=updated_deviations, - ) - - -@app.post("/api/scan-jobs/{job_id}/resolve-groups", response_model=ResolveGroupsResponse) -def resolve_groups_endpoint(job_id: str) -> ResolveGroupsResponse: - """ - Expand group principals on this job's deviations and write each group's - member list to permission_deviations.resolved_members. Handles both - classic SharePoint groups (via getbyname) and Entra/AAD or M365 groups - assigned directly at root (via Microsoft Graph). Skips email-shape users - and SharingLinks groups (those have their own resolver). - """ - from .scanners.sharepoint import ( - is_aad_group_principal, - is_sharepoint_group_principal, - resolve_aad_group_members, - resolve_sharing_link_members, - ) - - with SessionLocal() as db: - job = db.get(ScanJob, job_id) - if not job: - raise HTTPException(status_code=404, detail="Job not found") - if job.status in ("queued", "running"): - raise HTTPException(status_code=409, detail="Job is still running") - if (job.scan_type or "sharepoint") == "mailbox": - raise HTTPException(status_code=400, detail="Group resolution is only available for SharePoint jobs") - - cert_private_key: str | None = None - cert_thumbprint: str | None = None - cert_public_pem: str | None = None - if job.tenant_profile_id: - profile = db.get(TenantProfile, job.tenant_profile_id) - if profile: - cert_private_key = profile.cert_private_key - cert_thumbprint = profile.cert_thumbprint - cert_public_pem = profile.cert_public_pem - - auth = AuthConfig( - tenant_id=job.auth_tenant_id or "", - client_id=job.auth_client_id or "", - client_secret=job.auth_client_secret or "", - cert_private_key=cert_private_key, - cert_thumbprint=cert_thumbprint, - cert_public_pem=cert_public_pem, - ) - - all_deviations = list( - db.execute(select(PermissionDeviation).where(PermissionDeviation.job_id == job_id)).scalars() - ) - - # Group deviations by (site_url, principal) so each unique group is resolved once - groups: dict[tuple[str, str], list[int]] = {} - for dev in all_deviations: - if not (is_sharepoint_group_principal(dev.principal) or is_aad_group_principal(dev.principal)): - continue - key = (dev.site_url, dev.principal) - groups.setdefault(key, []).append(dev.id) - - resolved = 0 - skipped = 0 - updated = 0 - for (site_url, group_name), dev_ids in groups.items(): - try: - if is_aad_group_principal(group_name): - members = resolve_aad_group_members(group_name, auth) - else: - members = resolve_sharing_link_members(site_url, group_name, auth) - except Exception: # noqa: BLE001 - members = [] - - if not members: - skipped += 1 - continue - - resolved_text = ", ".join(members) - with SessionLocal() as db: - for dev_id in dev_ids: - dev = db.get(PermissionDeviation, dev_id) - if dev: - dev.resolved_members = resolved_text - db.commit() - resolved += 1 - updated += len(dev_ids) - - return ResolveGroupsResponse( - resolved_groups=resolved, - skipped_groups=skipped, - updated_deviations=updated, - ) - - -@app.post("/api/scan-jobs/{job_id}/targets/{target_id}/test-connection", response_model=ProbeResultResponse) -def test_target_connection(job_id: str, target_id: int) -> ProbeResultResponse: - with SessionLocal() as db: - job = db.get(ScanJob, job_id) - if not job: - raise HTTPException(status_code=404, detail="Job not found") - target = db.get(ScanTarget, target_id) - if not target or target.job_id != job_id: - raise HTTPException(status_code=404, detail="Target not found") - if job.status in ("queued", "running"): - raise HTTPException(status_code=409, detail="Job is still running") - - cert_private_key: str | None = None - cert_thumbprint: str | None = None - cert_public_pem: str | None = None - if job.tenant_profile_id: - profile = db.get(TenantProfile, job.tenant_profile_id) - if profile: - cert_private_key = profile.cert_private_key - cert_thumbprint = profile.cert_thumbprint - cert_public_pem = profile.cert_public_pem - - auth = AuthConfig( - tenant_id=job.auth_tenant_id or "", - client_id=job.auth_client_id or "", - client_secret=job.auth_client_secret or "", - cert_private_key=cert_private_key, - cert_thumbprint=cert_thumbprint, - cert_public_pem=cert_public_pem, - ) - site_url = target.site_url - job_scan_type = job.scan_type or "sharepoint" - - result = probe(job_scan_type, site_url, auth) - - with SessionLocal() as db: - target = db.get(ScanTarget, target_id) - if not target: - raise HTTPException(status_code=404, detail="Target not found") - now = datetime.utcnow() - target.last_probe_at = now - target.last_probe_ok = result.ok - target.last_probe_message = result.message - target.updated_at = now - db.commit() - db.refresh(target) - return ProbeResultResponse( - target_id=target.id, - ok=result.ok, - message=result.message, - last_probe_at=target.last_probe_at, - ) - - -@app.get("/api/scan-jobs/{job_id}/export") -def export_scan_job(job_id: str, site_url: str | None = None) -> StreamingResponse: - import openpyxl - from openpyxl.styles import Font, PatternFill - - with SessionLocal() as db: - job = db.get(ScanJob, job_id, options=[joinedload(ScanJob.tenant_profile)]) - if not job: - raise HTTPException(status_code=404, detail="Job not found") - - targets_q = select(ScanTarget).where(ScanTarget.job_id == job.id).order_by(ScanTarget.id.asc()) - if site_url: - targets_q = targets_q.where(ScanTarget.site_url == site_url) - targets = list(db.execute(targets_q).scalars()) - - deviations_q = ( - select(PermissionDeviation) - .where(PermissionDeviation.job_id == job.id) - .order_by(PermissionDeviation.id.desc()) - ) - if site_url: - deviations_q = deviations_q.where(PermissionDeviation.site_url == site_url) - deviations = list(db.execute(deviations_q).scalars()) - - wb = openpyxl.Workbook() - header_fill = PatternFill(start_color="1E2A3A", end_color="1E2A3A", fill_type="solid") - header_font_white = Font(bold=True, color="FFFFFF") - - _risk_styles: dict[str, tuple] = { - "Critical": ( - PatternFill(start_color="FDDEDE", end_color="FDDEDE", fill_type="solid"), - Font(bold=True, color="7B0000"), - ), - "High": ( - PatternFill(start_color="FEE8D3", end_color="FEE8D3", fill_type="solid"), - Font(bold=True, color="7C2D00"), - ), - "Low": ( - PatternFill(start_color="D6EEF8", end_color="D6EEF8", fill_type="solid"), - Font(bold=True, color="0C4A6E"), - ), - "Unknown": ( - PatternFill(start_color="F0F0F0", end_color="F0F0F0", fill_type="solid"), - Font(bold=True, color="555555"), - ), - } - - def _style_header(ws, headers): - ws.append(headers) - for cell in ws[1]: - cell.font = header_font_white - cell.fill = header_fill - - scan_type = job.scan_type or "sharepoint" - - target_label = { - "sharepoint": "Site URL", - "sharepoint_root": "Site URL", - "mailbox": "Mailbox", - "entra_groups": "Group", - }.get(scan_type, "Target") - - # Targets sheet - ws_targets = wb.active - ws_targets.title = "Targets" - _style_header(ws_targets, [target_label, "Status", "Attempts", "Error", "Started", "Finished"]) - for t in targets: - ws_targets.append([ - t.site_url, - t.status, - t.attempts, - t.error_message or "", - t.started_at.isoformat() if t.started_at else "", - t.finished_at.isoformat() if t.finished_at else "", - ]) - for col in ws_targets.columns: - ws_targets.column_dimensions[col[0].column_letter].width = max(len(str(c.value or "")) for c in col) + 4 - - # Results sheet — name and columns depend on scan type - if scan_type == "mailbox": - ws_dev = wb.create_sheet("Mailbox Permissions") - _style_header(ws_dev, ["Mailbox", "Object", "Permission Type", "Principal", "Access Rights"]) - deviations.sort(key=lambda d: (d.site_url or "", d.permission_type or "", d.principal or "")) - for d in deviations: - ws_dev.append([ - d.site_url, - d.object_url, - d.permission_type or d.object_type, - d.principal, - d.role_name, - ]) - elif scan_type == "entra_groups": - ws_dev = wb.create_sheet("Group Memberships") - _style_header(ws_dev, ["Group", "Group Type", "User", "Role"]) - deviations.sort(key=lambda d: (d.object_url or "", d.role_name or "", d.principal or "")) - for d in deviations: - ws_dev.append([ - d.object_url, - d.permission_type or "", - d.principal, - d.role_name, - ]) - elif scan_type == "sharepoint_root": - ws_dev = wb.create_sheet("Root Permissions") - _style_header(ws_dev, ["Site URL", "Principal", "Resolved Members", "Role"]) - deviations.sort(key=lambda d: (d.site_url or "", d.principal or "", d.role_name or "")) - for d in deviations: - ws_dev.append([ - d.site_url, - d.principal, - d.resolved_members or "", - d.role_name, - ]) - else: - ws_dev = wb.create_sheet("Deviations") - _style_header(ws_dev, ["Site URL", "Object URL", "Object Type", "Principal", "Link Risk", "Resolved Members", "Role", "Delta"]) - deviations.sort(key=lambda d: (d.site_url or "", d.object_url or "", d.principal or "")) - for d in deviations: - base = (d.site_url or "").rstrip("/") - obj_rel = d.object_url[len(base):] if base and d.object_url.startswith(base) else d.object_url - link_risk = _sharing_link_risk_label(d.principal) - ws_dev.append([ - d.site_url, - obj_rel, - d.object_type, - d.principal, - link_risk, - d.resolved_members or "", - d.role_name, - d.delta_type, - ]) - if link_risk in _risk_styles: - risk_fill, risk_font = _risk_styles[link_risk] - risk_cell = ws_dev.cell(row=ws_dev.max_row, column=5) - risk_cell.fill = risk_fill - risk_cell.font = risk_font - for col in ws_dev.columns: - ws_dev.column_dimensions[col[0].column_letter].width = max(len(str(c.value or "")) for c in col) + 4 - - buf = io.BytesIO() - wb.save(buf) - buf.seek(0) - - filename = _build_export_filename(job, job_id) - return StreamingResponse( - buf, - media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - headers={"Content-Disposition": f'attachment; filename="{filename}"'}, - ) - - -@app.get("/api/scan-jobs/{job_id}", response_model=ScanJobDetail) -def get_scan_job(job_id: str, site_url: str | None = None) -> ScanJobDetail: - with SessionLocal() as db: - job = db.get(ScanJob, job_id, options=[joinedload(ScanJob.tenant_profile)]) - if not job: - raise HTTPException(status_code=404, detail="Job not found") - - targets_q = select(ScanTarget).where(ScanTarget.job_id == job.id).order_by(ScanTarget.id.asc()) - if site_url: - targets_q = targets_q.where(ScanTarget.site_url == site_url) - targets = list(db.execute(targets_q).scalars()) - - deviations_q = ( - select(PermissionDeviation) - .where(PermissionDeviation.job_id == job.id) - .order_by(PermissionDeviation.site_url.asc(), PermissionDeviation.object_url.asc(), PermissionDeviation.id.asc()) - ) - if site_url: - deviations_q = deviations_q.where(PermissionDeviation.site_url == site_url) - else: - deviations_q = deviations_q.limit(1000) - deviations = list(db.execute(deviations_q).scalars()) - - return ScanJobDetail( - **_to_job_summary(job).model_dump(), - targets=[ - ScanTargetItem( - id=t.id, - site_url=t.site_url, - status=t.status, - attempts=t.attempts, - error_message=t.error_message, - started_at=t.started_at, - finished_at=t.finished_at, - last_probe_at=t.last_probe_at, - last_probe_ok=t.last_probe_ok, - last_probe_message=t.last_probe_message, - ) - for t in targets - ], - deviations=[ - PermissionDeviationItem( - id=d.id, - site_url=d.site_url, - object_url=d.object_url, - object_type=d.object_type, - principal=d.principal, - role_name=d.role_name, - delta_type=d.delta_type, - permission_type=d.permission_type, - resolved_members=d.resolved_members, - created_at=d.created_at, - ) - for d in deviations - ], - ) - - -# --------------------------------------------------------------------------- -# Onboarding -# --------------------------------------------------------------------------- - -@app.post("/api/onboarding/create-scan-app", response_model=CreateScanAppResponse) -def onboarding_create_scan_app(payload: CreateScanAppRequest) -> CreateScanAppResponse: - try: - result = create_scan_app_for_tenant( - tenant_id=payload.tenant_id, - display_name=payload.display_name, - ) - except OnboardingError as exc: - raise HTTPException(status_code=400, detail=str(exc)) from exc - except Exception as exc: # noqa: BLE001 - raise HTTPException(status_code=500, detail=f"Unexpected onboarding error: {exc}") from exc - - return CreateScanAppResponse( - tenant_id=result.tenant_id, - client_id=result.client_id, - client_secret=result.client_secret, - app_object_id=result.app_object_id, - service_principal_id=result.service_principal_id, - display_name=result.display_name, - ) - - -@app.get("/api/onboarding/microsoft/connect-url", response_model=ConnectMicrosoftResponse) -def onboarding_microsoft_connect_url() -> ConnectMicrosoftResponse: - try: - return ConnectMicrosoftResponse(connect_url=create_connect_url()) - except OnboardingError as exc: - raise HTTPException(status_code=400, detail=str(exc)) from exc - - -@app.get("/api/onboarding/microsoft/callback") -def onboarding_microsoft_callback( - tenant: str | None = None, - state: str | None = None, - error: str | None = None, - error_description: str | None = None, -) -> RedirectResponse: - if error: - message = (error_description or error).replace(" ", "+") - return RedirectResponse(url=f"/?onboarding_status=error&onboarding_message={message}") - - if not state or not consume_callback_state(state): - return RedirectResponse(url="/?onboarding_status=error&onboarding_message=invalid_or_expired_state") - - if not tenant: - return RedirectResponse(url="/?onboarding_status=error&onboarding_message=missing_tenant") - - return RedirectResponse(url=f"/?onboarding_status=connected&tenant_id={tenant}") - - -@app.get("/api/onboarding/status") -def onboarding_status() -> dict[str, bool]: - from . import config - automated = bool(config.ONBOARDING_CLIENT_ID and config.ONBOARDING_CLIENT_SECRET and config.ONBOARDING_REDIRECT_URI) - return {"automated_available": automated} - - -# --------------------------------------------------------------------------- -# Static files +# Static files (mounted last so explicit API routes take precedence) # --------------------------------------------------------------------------- @app.get("/") @@ -847,306 +62,3 @@ def index() -> FileResponse: app.mount("/", StaticFiles(directory=SITE_DIR, html=True), name="site") - - -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - -_SCAN_TYPE_LABELS = { - "sharepoint": "Deviations", - "sharepoint_root": "Root", - "mailbox": "Mailbox", - "entra_groups": "EntraGroups", -} - - -def _build_export_filename(job: ScanJob, job_id: str) -> str: - tenant_label = (job.tenant_profile.name if job.tenant_profile else None) or "Manual" - safe_tenant = re.sub(r"[^A-Za-z0-9_-]+", "_", tenant_label).strip("_") or "Manual" - scan_type = job.scan_type or "sharepoint" - type_label = _SCAN_TYPE_LABELS.get(scan_type, scan_type) - short_id = job_id.replace("-", "")[-12:] - return f"ClearView_{safe_tenant}_{type_label}_{short_id}.xlsx" - - -def _enumerate_all_entra_groups( - tenant_id: str, - client_id: str, - client_secret: str | None, - profile_id: str | None, -) -> list[str]: - cert_private_key: str | None = None - cert_thumbprint: str | None = None - cert_public_pem: str | None = None - if profile_id: - with SessionLocal() as db: - profile = db.get(TenantProfile, profile_id) - if profile: - cert_private_key = profile.cert_private_key - cert_thumbprint = profile.cert_thumbprint - cert_public_pem = profile.cert_public_pem - - auth = AuthConfig( - tenant_id=tenant_id, - client_id=client_id, - client_secret=client_secret or "", - cert_private_key=cert_private_key, - cert_thumbprint=cert_thumbprint, - cert_public_pem=cert_public_pem, - ) - - from .scanners import entra as _entra - - try: - return _entra.list_all_groups(auth) - except Exception as exc: # noqa: BLE001 - raise HTTPException(status_code=400, detail=f"Group enumeration failed: {exc}") from exc - - -def _enumerate_all_mailboxes( - organization: str | None, - tenant_id: str, - client_id: str, - client_secret: str | None, - profile_id: str | None, -) -> list[str]: - if not organization or "." not in organization: - raise HTTPException( - status_code=400, - detail="organization (e.g. contoso.onmicrosoft.com) is required when scan_all_mailboxes is true", - ) - - cert_private_key: str | None = None - cert_thumbprint: str | None = None - cert_public_pem: str | None = None - if profile_id: - with SessionLocal() as db: - profile = db.get(TenantProfile, profile_id) - if profile: - cert_private_key = profile.cert_private_key - cert_thumbprint = profile.cert_thumbprint - cert_public_pem = profile.cert_public_pem - - auth = AuthConfig( - tenant_id=tenant_id, - client_id=client_id, - client_secret=client_secret or "", - cert_private_key=cert_private_key, - cert_thumbprint=cert_thumbprint, - cert_public_pem=cert_public_pem, - ) - - from .scanners import mailbox as _mailbox - - try: - return _mailbox.list_mailboxes(organization=organization.strip().lower(), auth=auth) - except Exception as exc: # noqa: BLE001 - raise HTTPException(status_code=400, detail=f"Mailbox enumeration failed: {exc}") from exc - - -def _resolve_credentials( - db, - tenant_profile_id: str | None, - tenant_id: str | None, - client_id: str | None, - client_secret: str | None, -) -> tuple[str, str, str | None, str | None]: - if tenant_profile_id: - profile = db.get(TenantProfile, tenant_profile_id) - if not profile: - raise HTTPException(status_code=404, detail="Tenant profile not found") - if not profile.client_secret and not profile.cert_thumbprint: - raise HTTPException( - status_code=400, - detail="Tenant profile has no client secret and no certificate. Generate a certificate first.", - ) - return profile.tenant_id, profile.client_id, profile.client_secret, tenant_profile_id - if tenant_id and client_id and client_secret: - return tenant_id.strip(), client_id.strip(), client_secret.strip(), None - raise HTTPException( - status_code=400, - detail="Provide either tenant_profile_id or all of tenant_id, client_id, and client_secret.", - ) - - -def _create_job_from_targets( - raw_targets: list[str], - scan_type: str, - skip_default_sites: bool, - source_type: str, - tenant_id: str, - client_id: str, - client_secret: str, - tenant_profile_id: str | None = None, -) -> ScanJobCreateResponse: - accepted: list[str] = [] - skipped_default_urls: list[str] = [] - invalid: list[str] = [] - - seen: set[str] = set() - - for raw in raw_targets: - if scan_type == "mailbox": - normalized = (raw or "").strip().lower() - if not normalized or "@" not in normalized: - invalid.append(raw) - continue - elif scan_type == "entra_groups": - normalized = (raw or "").strip() - if not normalized: - invalid.append(raw) - continue - else: - normalized = normalize_site_url(raw) or "" - if not normalized: - invalid.append(raw) - continue - - if normalized in seen: - continue - seen.add(normalized) - - if scan_type in ("sharepoint", "sharepoint_root") and skip_default_sites and is_default_site(normalized): - skipped_default_urls.append(normalized) - continue - - accepted.append(normalized) - - with SessionLocal() as db: - now = datetime.utcnow() - job = ScanJob( - id=str(uuid.uuid4()), - source_type=source_type, - scan_type=scan_type, - status="queued" if accepted else "completed", - skip_default_sites=skip_default_sites, - tenant_profile_id=tenant_profile_id, - auth_tenant_id=tenant_id, - auth_client_id=client_id, - auth_client_secret=client_secret, - total_targets=len(accepted), - skipped_targets=len(skipped_default_urls), - warning_message=None, - error_message=None, - created_at=now, - updated_at=now, - finished_at=now if not accepted else None, - ) - - if not accepted: - if scan_type == "mailbox": - job.warning_message = "No scannable mailboxes after validation" - else: - job.warning_message = "No scannable sites after validation and default-site filtering" - - db.add(job) - db.flush() - - for index, target in enumerate(accepted, start=1): - db.add( - ScanTarget( - job_id=job.id, - site_url=target, - source_row=index, - status="queued", - attempts=0, - created_at=now, - updated_at=now, - ) - ) - - db.commit() - - stmt = select(ScanJob).options(joinedload(ScanJob.tenant_profile)).where(ScanJob.id == job.id) - job = db.execute(stmt).unique().scalar_one() - - return ScanJobCreateResponse( - job=_to_job_summary(job), - accepted_urls=accepted, - skipped_default_urls=skipped_default_urls, - invalid_urls=invalid, - ) - - -def _to_job_summary(job: ScanJob) -> ScanJobSummary: - return ScanJobSummary( - id=job.id, - status=job.status, - source_type=job.source_type, - scan_type=job.scan_type or "sharepoint", - skip_default_sites=job.skip_default_sites, - tenant_profile_id=job.tenant_profile_id, - tenant_name=job.tenant_profile.name if job.tenant_profile else None, - total_targets=job.total_targets, - processed_targets=job.processed_targets, - successful_targets=job.successful_targets, - failed_targets=job.failed_targets, - skipped_targets=job.skipped_targets, - items_scanned=job.items_scanned, - scan_activity=job.scan_activity if job.status == "running" else None, - warning_message=job.warning_message, - error_message=job.error_message, - created_at=job.created_at, - updated_at=job.updated_at, - started_at=job.started_at, - finished_at=job.finished_at, - ) - - -def _to_tenant_item(profile: TenantProfile) -> TenantProfileItem: - return TenantProfileItem( - id=profile.id, - name=profile.name, - tenant_id=profile.tenant_id, - primary_domain=profile.primary_domain, - client_id=profile.client_id, - has_certificate=bool(profile.cert_thumbprint), - cert_thumbprint=profile.cert_thumbprint, - cert_expires_at=profile.cert_expires_at, - created_at=profile.created_at, - updated_at=profile.updated_at, - ) - - -def _sharing_link_risk_label(principal: str) -> str: - if not principal.startswith("SharingLinks."): - return "" - parts = principal.split(".", 3) - link_type = parts[2] if len(parts) >= 3 else "" - if link_type.startswith("Anonymous"): - return "Critical" - if link_type == "Flexible": - return "High" - if link_type.startswith("Organization"): - return "Low" - if link_type.startswith("Direct"): - return "Low" - return "Unknown" - - -def _ensure_schema_columns() -> None: - stmts = [ - "ALTER TABLE scan_jobs ADD COLUMN IF NOT EXISTS auth_tenant_id VARCHAR(128)", - "ALTER TABLE scan_jobs ADD COLUMN IF NOT EXISTS auth_client_id VARCHAR(128)", - "ALTER TABLE scan_jobs ADD COLUMN IF NOT EXISTS auth_client_secret TEXT", - "ALTER TABLE scan_jobs ADD COLUMN IF NOT EXISTS tenant_profile_id VARCHAR(36)", - "ALTER TABLE scan_jobs ADD COLUMN IF NOT EXISTS items_scanned INTEGER NOT NULL DEFAULT 0", - "ALTER TABLE scan_jobs ADD COLUMN IF NOT EXISTS scan_activity TEXT", - "ALTER TABLE scan_jobs ADD COLUMN IF NOT EXISTS scan_type VARCHAR(32) NOT NULL DEFAULT 'sharepoint'", - "ALTER TABLE permission_deviations ADD COLUMN IF NOT EXISTS permission_type VARCHAR(32)", - "ALTER TABLE tenant_profiles ADD COLUMN IF NOT EXISTS primary_domain VARCHAR(256)", - "ALTER TABLE tenant_profiles ADD COLUMN IF NOT EXISTS client_secret TEXT", - "ALTER TABLE tenant_profiles ALTER COLUMN client_secret DROP NOT NULL", - "ALTER TABLE tenant_profiles ADD COLUMN IF NOT EXISTS cert_private_key TEXT", - "ALTER TABLE tenant_profiles ADD COLUMN IF NOT EXISTS cert_public_pem TEXT", - "ALTER TABLE tenant_profiles ADD COLUMN IF NOT EXISTS cert_thumbprint VARCHAR(64)", - "ALTER TABLE tenant_profiles ADD COLUMN IF NOT EXISTS cert_expires_at TIMESTAMP", - "ALTER TABLE permission_deviations ADD COLUMN IF NOT EXISTS resolved_members TEXT", - "ALTER TABLE scan_targets ADD COLUMN IF NOT EXISTS last_probe_at TIMESTAMP", - "ALTER TABLE scan_targets ADD COLUMN IF NOT EXISTS last_probe_ok BOOLEAN", - "ALTER TABLE scan_targets ADD COLUMN IF NOT EXISTS last_probe_message TEXT", - ] - with engine.begin() as conn: - for stmt in stmts: - conn.execute(text(stmt)) diff --git a/containers/clearview/src/clearview_app/migrations/env.py b/containers/clearview/src/clearview_app/migrations/env.py new file mode 100644 index 0000000..c07a2c4 --- /dev/null +++ b/containers/clearview/src/clearview_app/migrations/env.py @@ -0,0 +1,58 @@ +"""Alembic environment for Clearview. + +Reuses the application's SQLAlchemy engine (already configured with the +normalized DATABASE_URL and pool_pre_ping) so migrations run against exactly +the same database the app uses. Logging config from alembic.ini is applied +only when Alembic is invoked through the CLI; programmatic invocation from +``clearview_app.db_migrate`` passes a Config without a file. +""" +from __future__ import annotations + +from logging.config import fileConfig + +from alembic import context + +from clearview_app.config import DATABASE_URL +from clearview_app.db import _normalize_database_url, engine as app_engine +from clearview_app.models import Base + +config = context.config + +if config.config_file_name is not None: + try: + fileConfig(config.config_file_name) + except Exception: # noqa: BLE001 - logging config is best-effort + pass + +target_metadata = Base.metadata + + +def run_migrations_offline() -> None: + """Emit SQL to stdout without a live DB connection.""" + context.configure( + url=_normalize_database_url(DATABASE_URL), + target_metadata=target_metadata, + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + compare_type=True, + ) + with context.begin_transaction(): + context.run_migrations() + + +def run_migrations_online() -> None: + """Run migrations against the live database via the app engine.""" + with app_engine.connect() as connection: + context.configure( + connection=connection, + target_metadata=target_metadata, + compare_type=True, + ) + with context.begin_transaction(): + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/containers/clearview/src/clearview_app/migrations/script.py.mako b/containers/clearview/src/clearview_app/migrations/script.py.mako new file mode 100644 index 0000000..da29ede --- /dev/null +++ b/containers/clearview/src/clearview_app/migrations/script.py.mako @@ -0,0 +1,26 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from __future__ import annotations + +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. +revision = ${repr(up_revision)} +down_revision = ${repr(down_revision)} +branch_labels = ${repr(branch_labels)} +depends_on = ${repr(depends_on)} + + +def upgrade() -> None: + ${upgrades if upgrades else "pass"} + + +def downgrade() -> None: + ${downgrades if downgrades else "pass"} diff --git a/containers/clearview/src/clearview_app/migrations/versions/0001_baseline.py b/containers/clearview/src/clearview_app/migrations/versions/0001_baseline.py new file mode 100644 index 0000000..fe959d5 --- /dev/null +++ b/containers/clearview/src/clearview_app/migrations/versions/0001_baseline.py @@ -0,0 +1,31 @@ +"""baseline schema + +Captures the full Clearview schema as defined by the SQLAlchemy models at the +time Alembic was adopted. Creating it via ``Base.metadata.create_all`` keeps the +baseline guaranteed-identical to the models (the same DDL the app emitted before +Alembic). Existing databases are ``stamp``-ed to this revision rather than +re-running ``upgrade`` (see ``clearview_app.db_migrate``). + +Revision ID: 0001_baseline +Revises: +Create Date: 2026-05-26 +""" +from __future__ import annotations + +from alembic import op + +from clearview_app.models import Base + +# revision identifiers, used by Alembic. +revision = "0001_baseline" +down_revision = None +branch_labels = None +depends_on = None + + +def upgrade() -> None: + Base.metadata.create_all(bind=op.get_bind()) + + +def downgrade() -> None: + Base.metadata.drop_all(bind=op.get_bind()) diff --git a/containers/clearview/src/clearview_app/migrations/versions/0002_timestamptz.py b/containers/clearview/src/clearview_app/migrations/versions/0002_timestamptz.py new file mode 100644 index 0000000..c32a973 --- /dev/null +++ b/containers/clearview/src/clearview_app/migrations/versions/0002_timestamptz.py @@ -0,0 +1,63 @@ +"""convert timestamp columns to timestamptz + +The app now uses timezone-aware UTC datetimes (DateTime(timezone=True)). +Existing databases store naive ``timestamp without time zone`` values that were +written as UTC, so we reinterpret them as UTC while converting. The conversion +is guarded per column on the current type, so it is a no-op on databases whose +columns are already ``timestamptz`` (e.g. a fresh DB created from the updated +baseline models). + +Revision ID: 0002_timestamptz +Revises: 0001_baseline +Create Date: 2026-05-26 +""" +from __future__ import annotations + +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. +revision = "0002_timestamptz" +down_revision = "0001_baseline" +branch_labels = None +depends_on = None + +# Table -> datetime columns (names come from our own models, never user input). +_COLUMNS: dict[str, tuple[str, ...]] = { + "tenant_profiles": ("cert_expires_at", "created_at", "updated_at"), + "scan_jobs": ("created_at", "updated_at", "started_at", "finished_at", "heartbeat_at"), + "scan_targets": ("last_probe_at", "created_at", "updated_at", "started_at", "finished_at"), + "permission_deviations": ("created_at",), +} + + +def _column_type(bind, table: str, column: str) -> str | None: + return bind.execute( + sa.text( + "SELECT data_type FROM information_schema.columns " + "WHERE table_name = :t AND column_name = :c" + ), + {"t": table, "c": column}, + ).scalar() + + +def upgrade() -> None: + bind = op.get_bind() + for table, columns in _COLUMNS.items(): + for column in columns: + if _column_type(bind, table, column) == "timestamp without time zone": + op.execute( + f'ALTER TABLE {table} ALTER COLUMN {column} ' + f"TYPE timestamptz USING {column} AT TIME ZONE 'UTC'" + ) + + +def downgrade() -> None: + bind = op.get_bind() + for table, columns in _COLUMNS.items(): + for column in columns: + if _column_type(bind, table, column) == "timestamp with time zone": + op.execute( + f'ALTER TABLE {table} ALTER COLUMN {column} ' + f"TYPE timestamp USING {column} AT TIME ZONE 'UTC'" + ) diff --git a/containers/clearview/src/clearview_app/models.py b/containers/clearview/src/clearview_app/models.py index ceccf26..e67483e 100644 --- a/containers/clearview/src/clearview_app/models.py +++ b/containers/clearview/src/clearview_app/models.py @@ -1,11 +1,16 @@ from __future__ import annotations -from datetime import datetime +from datetime import datetime, timezone from sqlalchemy import Boolean, DateTime, ForeignKey, Integer, String, Text from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship +def _utcnow() -> datetime: + """Timezone-aware UTC now, used as the default for timestamp columns.""" + return datetime.now(timezone.utc) + + class Base(DeclarativeBase): pass @@ -22,9 +27,9 @@ class TenantProfile(Base): cert_private_key: Mapped[str | None] = mapped_column(Text, nullable=True) cert_public_pem: Mapped[str | None] = mapped_column(Text, nullable=True) cert_thumbprint: Mapped[str | None] = mapped_column(String(64), nullable=True) - cert_expires_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True) - created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow) - updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow) + cert_expires_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True) + created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=_utcnow) + updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=_utcnow) jobs: Mapped[list["ScanJob"]] = relationship(back_populates="tenant_profile") @@ -56,11 +61,11 @@ class ScanJob(Base): warning_message: Mapped[str | None] = mapped_column(Text, nullable=True) error_message: Mapped[str | None] = mapped_column(Text, nullable=True) - created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow) - updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow) - started_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True) - finished_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True) - heartbeat_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True) + created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=_utcnow) + updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=_utcnow) + started_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True) + finished_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True) + heartbeat_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True) tenant_profile: Mapped["TenantProfile | None"] = relationship(back_populates="jobs") targets: Mapped[list["ScanTarget"]] = relationship(back_populates="job", cascade="all,delete-orphan") @@ -79,14 +84,14 @@ class ScanTarget(Base): attempts: Mapped[int] = mapped_column(Integer, default=0) error_message: Mapped[str | None] = mapped_column(Text, nullable=True) - last_probe_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True) + last_probe_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True) last_probe_ok: Mapped[bool | None] = mapped_column(Boolean, nullable=True) last_probe_message: Mapped[str | None] = mapped_column(Text, nullable=True) - created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow) - updated_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow) - started_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True) - finished_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True) + created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=_utcnow) + updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=_utcnow) + started_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True) + finished_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True) job: Mapped[ScanJob] = relationship(back_populates="targets") deviations: Mapped[list["PermissionDeviation"]] = relationship(back_populates="target", cascade="all,delete-orphan") @@ -108,7 +113,7 @@ class PermissionDeviation(Base): permission_type: Mapped[str | None] = mapped_column(String(32), nullable=True) resolved_members: Mapped[str | None] = mapped_column(Text, nullable=True) - created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow) + created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=_utcnow) job: Mapped[ScanJob] = relationship(back_populates="deviations") target: Mapped[ScanTarget] = relationship(back_populates="deviations") diff --git a/containers/clearview/src/clearview_app/scanners/sharepoint.py b/containers/clearview/src/clearview_app/scanners/sharepoint.py index 097aefa..db4db24 100644 --- a/containers/clearview/src/clearview_app/scanners/sharepoint.py +++ b/containers/clearview/src/clearview_app/scanners/sharepoint.py @@ -1,6 +1,7 @@ from __future__ import annotations import re +import threading import time from dataclasses import dataclass from urllib.parse import urlparse @@ -32,7 +33,13 @@ class PermissionEntry: role_name: str -_TOKEN_CACHE: dict[str, str] = {} +# Cache maps cache_key -> (access_token, expires_at_epoch). Guarded by +# _TOKEN_LOCK because the worker acquires tokens from multiple threads. +_TOKEN_CACHE: dict[str, tuple[str, float]] = {} +_TOKEN_LOCK = threading.Lock() +# Reuse one MSAL app per (tenant, client, auth_method) so MSAL's own token +# cache works and refreshes app tokens automatically. +_MSAL_APPS: dict[str, "msal.ConfidentialClientApplication"] = {} def scan_site_for_deviations( @@ -612,18 +619,20 @@ def _probe_hint(error: str, stage: str) -> str: return error[:220] -def _get_token_for_host(host: str, auth: AuthConfig) -> str: - auth_method = "cert" if auth.cert_thumbprint and auth.cert_private_key else "secret" - cache_key = f"{host}|{auth.tenant_id}|{auth.client_id}|{auth_method}" - cached = _TOKEN_CACHE.get(cache_key) - if cached: - return cached +def _get_msal_app(auth: AuthConfig, auth_method: str) -> "msal.ConfidentialClientApplication": + """Return a cached ConfidentialClientApplication for these credentials. + + Reusing the app object lets MSAL's built-in token cache serve and refresh + app-only tokens instead of re-authenticating on every call. + """ + app_key = f"{auth.tenant_id}|{auth.client_id}|{auth_method}" + app = _MSAL_APPS.get(app_key) + if app is not None: + return app - scope = f"https://{host}/.default" authority = f"https://login.microsoftonline.com/{auth.tenant_id}" - if auth_method == "cert": - client_credential = { + client_credential: dict[str, str | None] | str | None = { "thumbprint": auth.cert_thumbprint, "private_key": auth.cert_private_key, } @@ -635,16 +644,34 @@ def _get_token_for_host(host: str, auth: AuthConfig) -> str: authority=authority, client_credential=client_credential, ) - result = app.acquire_token_for_client(scopes=[scope]) + _MSAL_APPS[app_key] = app + return app - if "access_token" not in result: - error = result.get("error", "unknown") - description = result.get("error_description", "") - raise RuntimeError(f"Token request failed ({error}): {description[:300]}") - token = str(result["access_token"]) - _TOKEN_CACHE[cache_key] = token - return token +def _get_token_for_host(host: str, auth: AuthConfig) -> str: + auth_method = "cert" if auth.cert_thumbprint and auth.cert_private_key else "secret" + cache_key = f"{host}|{auth.tenant_id}|{auth.client_id}|{auth_method}" + + with _TOKEN_LOCK: + cached = _TOKEN_CACHE.get(cache_key) + if cached is not None and time.time() < cached[1]: + return cached[0] + + scope = f"https://{host}/.default" + app = _get_msal_app(auth, auth_method) + result = app.acquire_token_for_client(scopes=[scope]) + + if "access_token" not in result: + error = result.get("error", "unknown") + description = result.get("error_description", "") + raise RuntimeError(f"Token request failed ({error}): {description[:300]}") + + token = str(result["access_token"]) + # expires_in is seconds-from-now; refresh 60s early to avoid edge expiry. + expires_in = int(result.get("expires_in", 3600)) + expires_at = time.time() + max(expires_in - 60, 0) + _TOKEN_CACHE[cache_key] = (token, expires_at) + return token def _iter_paged(url: str, headers: dict[str, str]): diff --git a/containers/clearview/src/clearview_app/schemas.py b/containers/clearview/src/clearview_app/schemas.py index 19cfe25..910ba14 100644 --- a/containers/clearview/src/clearview_app/schemas.py +++ b/containers/clearview/src/clearview_app/schemas.py @@ -1,9 +1,14 @@ from __future__ import annotations from datetime import datetime +from typing import Literal from pydantic import BaseModel, Field, HttpUrl +# Valid scan types, mirrored by the frontend scan-type dropdowns. Used to +# validate incoming job requests (FastAPI returns 422 on anything else). +ScanType = Literal["sharepoint", "sharepoint_root", "mailbox", "entra_groups"] + class CreateTenantProfileRequest(BaseModel): name: str @@ -33,7 +38,7 @@ class TenantCertificateResponse(BaseModel): class CreateScanJobRequest(BaseModel): - scan_type: str = "sharepoint" + scan_type: ScanType = "sharepoint" site_urls: list[HttpUrl] = Field(default_factory=list) mailboxes: list[str] = Field(default_factory=list) scan_all_mailboxes: bool = False diff --git a/containers/clearview/src/clearview_app/version.py b/containers/clearview/src/clearview_app/version.py index b31ae7a..c0cd924 100644 --- a/containers/clearview/src/clearview_app/version.py +++ b/containers/clearview/src/clearview_app/version.py @@ -7,7 +7,7 @@ history, so operators can see exactly which image build is running. from __future__ import annotations VERSION = "v0.1.0" -BUILD = 1 +BUILD = 2 def display_version() -> str: diff --git a/containers/clearview/src/clearview_app/worker.py b/containers/clearview/src/clearview_app/worker.py index a78ae63..3d3f6ca 100644 --- a/containers/clearview/src/clearview_app/worker.py +++ b/containers/clearview/src/clearview_app/worker.py @@ -4,7 +4,7 @@ import logging import threading import time from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError -from datetime import datetime +from datetime import datetime, timezone from sqlalchemy import select @@ -47,17 +47,21 @@ class ScanWorker: def _process_next_job(self) -> bool: with SessionLocal() as db: + # Atomic claim: lock the chosen queued row and skip rows already + # locked by another worker, so multiple workers/replicas never grab + # the same job. The status flip is committed in this transaction. job = db.execute( select(ScanJob) .where(ScanJob.status == "queued") .order_by(ScanJob.created_at.asc()) .limit(1) + .with_for_update(skip_locked=True) ).scalar_one_or_none() if job is None: return False - now = datetime.utcnow() + now = datetime.now(timezone.utc) job.status = "running" job.started_at = now job.heartbeat_at = now @@ -96,7 +100,7 @@ class ScanWorker: job = db.get(ScanJob, job_id) if not job: return - now = datetime.utcnow() + now = datetime.now(timezone.utc) job.heartbeat_at = now job.updated_at = now job.finished_at = now @@ -113,7 +117,7 @@ class ScanWorker: if not job or not target: return - now = datetime.utcnow() + now = datetime.now(timezone.utc) target.status = "running" target.started_at = now target.updated_at = now @@ -128,7 +132,7 @@ class ScanWorker: target = db.get(ScanTarget, target_id) if not job or not target: return - now = datetime.utcnow() + now = datetime.now(timezone.utc) target.status = "failed" target.attempts = 1 target.error_message = f"Preflight: {probe.message}" @@ -173,7 +177,7 @@ class ScanWorker: ) ) - now = datetime.utcnow() + now = datetime.now(timezone.utc) target.status = "completed" target.attempts = attempt target.error_message = None @@ -203,7 +207,7 @@ class ScanWorker: if not job or not target: return - now = datetime.utcnow() + now = datetime.now(timezone.utc) target.status = "failed" target.attempts = max_attempts target.error_message = last_error @@ -252,7 +256,7 @@ class ScanWorker: with SessionLocal() as db: target = db.get(ScanTarget, target_id) if target: - now = datetime.utcnow() + now = datetime.now(timezone.utc) target.last_probe_at = now target.last_probe_ok = result.ok target.last_probe_message = result.message @@ -298,8 +302,8 @@ class ScanWorker: job.scan_activity = activity if items > 0: job.items_scanned += items - job.heartbeat_at = datetime.utcnow() - job.updated_at = datetime.utcnow() + job.heartbeat_at = datetime.now(timezone.utc) + job.updated_at = datetime.now(timezone.utc) db.commit() except Exception: # noqa: BLE001 pass diff --git a/docs/changelog-develop.md b/docs/changelog-develop.md index 1097da5..8d76c14 100644 --- a/docs/changelog-develop.md +++ b/docs/changelog-develop.md @@ -2,6 +2,42 @@ This file documents changes on the develop branch of this project. +## 2026-05-26 — UI/UX: dead CSS removal, a11y, distinct risk colours, richer dashboard + +### Added +- **Dashboard enrichment** — a fourth KPI card **With errors** (`#statErrors`, counts jobs that are `completed_with_errors` or have `failed_targets > 0`) and a **Recent jobs** panel (`#dashRecentJobs`, last 5 jobs, each row clickable to jump to its details). Populated from the existing `/api/scan-jobs` list in `refreshJobs()` via a new `renderDashRecent()`; all interpolated fields run through `escHtml()`. + +### Changed +- **Removed dead CSS** — the pre-sidebar `.topbar`, `.topbar-actions`, and `.layout` rules (and their now-orphaned references inside the 930px/640px media queries) were deleted; the layout has used `.app-shell`/`.sidebar`/`.content` since the sidebar refactor. +- **Accessibility** — focus outline strengthened from `rgba(14,165,233,0.38)` to a solid `var(--cv-accent)` (meets WCAG non-text 3:1) and now also covers `a:focus-visible`. On route changes (`applyRoute`), focus now moves to the new page's first heading (`h1/h2`, `tabindex=-1`) and `document.title` updates, so screen-reader/keyboard users land in the freshly shown content. +- **Distinct risk colours** — the `risk.warn` badge changed from accent-blue (indistinguishable from `info`/`low`) to amber (`#854d0e` on `rgba(234,179,8,.18)`), giving a real low→high colour gradient. +- **Consistent XSS escaping** — `job.id` and `job.source_type` in the Scan Jobs table are now passed through `escHtml()` (previously interpolated raw), matching the rest of the table. + +## 2026-05-26 — Split monolithic main.py into route modules + +### Changed +- **`main.py` reduced from 1152 to 64 lines** — now a composition root that only wires the FastAPI app, scan-worker lifecycle, `/healthz`, `/api/version`, the `/` index + static mount, and `include_router` for the new route modules. All endpoint logic moved out verbatim (behaviour-preserving). +- **New route modules** (flat modules at package level so existing single-dot relative imports stay unchanged — lower risk than a `routers/` subpackage): `api_tenants.py` (tenant profiles + certificate), `api_jobs.py` (all scan-job routes incl. CSV import, cancel/delete, resolve-sharing-links, resolve-groups, test-connection, Excel export, detail), `api_onboarding.py` (Microsoft connect/callback/scan-app). Shared helpers (`_resolve_credentials`, `_create_job_from_targets`, `_enumerate_all_*`, `_to_job_summary`, `_to_tenant_item`, `_build_export_filename`, `_sharing_link_risk_label`, `_extract_sharing_link_group_and_type`) extracted to `api_helpers.py`. +- **Verified behaviour-preserving** — captured the OpenAPI route set before/after; both expose the identical 22 endpoints (`diff` empty). Built the image, booted against a fresh DB: `/healthz`, `/api/version`, `/api/tenants`, `/api/scan-jobs` all respond, invalid `scan_type` still returns 422, no startup errors. + +## 2026-05-26 — Correctness P1: token cache, atomic job claim, timezone-aware datetimes, scan_type validation + +### Changed +- **Token cache now has TTL + thread lock + MSAL app reuse** (`scanners/sharepoint.py`) — `_TOKEN_CACHE` previously stored access tokens as plain strings forever, so long scans started failing with 401s once the ~1h token expired. It now stores `(token, expires_at)` and refreshes 60s before expiry, guarded by a new `_TOKEN_LOCK` (the worker fetches tokens from multiple threads). New `_get_msal_app()` caches one `ConfidentialClientApplication` per `(tenant, client, auth_method)` so MSAL's own token cache is reused instead of building a fresh app on every call. +- **Atomic job claim** (`worker.py`) — the queued-job selection now uses `.with_for_update(skip_locked=True)` (`SELECT … FOR UPDATE SKIP LOCKED`), so multiple worker threads/replicas can never claim the same job. Behaviour is unchanged for the current single worker but is now replica-safe. +- **Timezone-aware datetimes everywhere** — replaced all 24 `datetime.utcnow()` (naive, deprecated) with `datetime.now(timezone.utc)` across `models.py`, `worker.py`, `main.py`, and `cert.py`. SQLAlchemy datetime columns are now `DateTime(timezone=True)`; model defaults use a new `_utcnow()` helper. New Alembic migration `0002_timestamptz` converts existing `timestamp without time zone` columns to `timestamptz` (reinterpreting stored values as UTC), guarded per-column so it is a no-op on databases already timestamptz. **Behaviour note:** API datetimes now carry a UTC offset, so the frontend renders them correctly in local time (previously stored UTC was shown as if local). +- **`scan_type` request validation** (`schemas.py`) — `CreateScanJobRequest.scan_type` is now `Literal["sharepoint","sharepoint_root","mailbox","entra_groups"]` instead of free `str`; invalid values return HTTP 422. The response model keeps `str` so legacy rows never trigger a serialization error. Verified: `scan_type=bogus` → 422, valid type passes schema validation. + +## 2026-05-26 — Alembic migrations replace startup `create_all` + raw ALTERs + +### Added +- **Alembic introduced (`alembic==1.14.0`)** — schema is now version-controlled instead of being patched at every startup. New `clearview_app/migrations/` package (`env.py` reuses the app's SQLAlchemy engine and `Base.metadata`; `versions/0001_baseline.py` baseline) and dev-only `containers/clearview/alembic.ini` for manual CLI use. The app builds the Alembic `Config` programmatically, so `alembic.ini` is not shipped in the image. +- **Baseline migration `0001_baseline`** — creates the full current schema via `Base.metadata.create_all`, guaranteed identical to the models (the same DDL the app emitted before). Future schema changes become explicit Alembic revisions. +- **Startup bootstrap `clearview_app/db_migrate.run_migrations()`** — idempotent, three cases: fresh DB → `upgrade head`; existing pre-Alembic DB (tables present, no `alembic_version`) → `stamp head` (adopt baseline without re-creating); already under Alembic → `upgrade head`. Verified end-to-end against throwaway databases (fresh upgrade, existing-DB stamp, re-run no-op) and a local image boot test (`/healthz` OK, schema + `alembic_version=0001_baseline`). + +### Changed +- **`main.py` startup** — `on_startup()` now calls `run_migrations()` instead of `Base.metadata.create_all(bind=engine)` + `_ensure_schema_columns()`. The 18-statement raw `ALTER TABLE ... ADD COLUMN IF NOT EXISTS` block (`_ensure_schema_columns`) is removed; unused `Base`/`engine` imports dropped. The existing dev/prod database is adopted automatically (stamped to baseline) on first start of the new build — no manual migration step required. + ## 2026-05-26 — Build/version number in the UI (Dropkeep-style) ### Added