#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script      : analyse_scraper.py
Chemin      : /var/www/html/analyses/analyse_scraper.py
Description : Récupère les analyses de biotoxines sur le site INTECMAR
              (https://www.intecmar.gal/Informacion/biotoxinas/ratos/)
              pour chaque site présent dans la table `analyses_sites`, puis insère
              ou met à jour les données dans la base MySQL `analyses`
              (table `analyses`), avec fusion et priorisation des valeurs
              en cas de doublons (même id_site / date / profondeur).
Options     : aucune (exécution simple, prévue pour un cron)
Exemple     : python3 analyse_scraper.py
Prérequis   : - Python 3
              - Modules : requests, bs4, mysql-connector-python (ou mysqlclient)
              - Fichier .env dans le même répertoire avec :
                    DB_HOST=localhost
                    DB_PORT=3306
                    DB_USER=...
                    DB_PASSWORD=...
                    DB_NAME=analyses
                    LOG_MODE=normal   (ou debug)
              - Accès HTTP sortant vers intecmar.gal
Auteur      : Sylvain SCATTOLINI
Date        : 27/11/2025
Version     : 1.3
"""

import re
import sys
import math
import logging
import traceback
from pathlib import Path
from datetime import datetime

import requests
from bs4 import BeautifulSoup
import mysql.connector


# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------

BASE_DIR = Path(__file__).resolve().parent
ENV_PATH = BASE_DIR / ".env"
LOG_DIR = BASE_DIR / "logs"
LOG_DIR.mkdir(parents=True, exist_ok=True)

LOG_FILE = LOG_DIR / "analyse_scraper.log"

BASE_URL = "https://www.intecmar.gal"
START_URL = BASE_URL + "/Informacion/biotoxinas/ratos/Default.aspx?sm=a4"

DROPDOWN_NAME = "ctl00$Contenido$dpZonas2"
GRID_ID = "ctl00_Contenido_GridView2"


# ---------------------------------------------------------------------------
# Minimal .env loader
# ---------------------------------------------------------------------------

def load_env(env_path: Path) -> dict:
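    """Parse KEY=VALUE lines from a .env file into a dict.

    Blank lines, comment lines and lines without '=' are ignored;
    values are kept as plain strings.
    """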
    env = {}
    if not env_path.is_file():
        return env
    with env_path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            k, v = line.split("=", 1)
            env[k.strip()] = v.strip()
    return env


ENV = load_env(ENV_PATH)


# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------

def setup_logging():
    logger = logging.getLogger("analyse_scraper")
    logger.setLevel(logging.DEBUG)
    logger.handlers.clear()

    # ------------------------------------------------------------------
    # LOG 1: detailed log -> overwritten on every run
    # ------------------------------------------------------------------
    fh = logging.FileHandler(LOG_FILE, mode="w", encoding="utf-8")
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(logging.Formatter(
        "%(asctime)s [%(levelname)s] %(name)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    ))
    logger.addHandler(fh)

    # ------------------------------------------------------------------
    # LOG 2: intecmar.log -> warnings, errors and summaries only
    # ------------------------------------------------------------------
    essential_log_path = LOG_DIR / "intecmar.log"
    eh = logging.FileHandler(essential_log_path, mode="a", encoding="utf-8")
    eh.setLevel(logging.WARNING)  # WARNING and ERROR only
    eh.setFormatter(logging.Formatter(
        "%(asctime)s [%(levelname)s] %(name)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    ))
    logger.addHandler(eh)

    # ------------------------------------------------------------------
    # Console (stdout)
    # ------------------------------------------------------------------
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.INFO)
    ch.setFormatter(logging.Formatter(
        "%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    ))
    logger.addHandler(ch)

    return logger

logger = setup_logging()

# ---------------------------------------------------------------------------
# MySQL
# ---------------------------------------------------------------------------

def get_db_connection():
    try:
        conn = mysql.connector.connect(
            host=ENV.get("DB_HOST", "localhost"),
            port=int(ENV.get("DB_PORT", "3306")),
            user=ENV.get("DB_USER", "root"),
            password=ENV.get("DB_PASSWORD", ""),
            database=ENV.get("DB_NAME", "analyses"),
            charset="utf8mb4",
            use_unicode=True,
        )
        return conn
    except Exception as e:
        logger.error("Erreur de connexion MySQL : %s", e)
        raise


# ---------------------------------------------------------------------------
# Numeric helpers
# ---------------------------------------------------------------------------

def truncate(value: float, decimals: int = 3) -> float:
    factor = 10 ** decimals
    if value >= 0:
        return math.floor(value * factor) / factor
    return math.ceil(value * factor) / factor
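
# Illustrative behaviour (truncation, not rounding):
#   truncate(2.0456, 3) -> 2.045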


def parse_value_with_tolerance(raw: str, context: str):
    """
    Retourne (valeur, tolérance) à partir d'une chaîne brute.

    Règles :
      - '---'             -> (None, None)
      - ''                -> (None, None)
      - '< LQ', '< 2,0'   -> (0.0, None)
      - '146,8 ±30,1'     -> (146.8, 30.1)
      - '21,8 ­±2,7'      -> (21.8, 2.7)
      - '0,071'           -> (0.071, None)
      - '600 - 1000'      -> (600.0, None)   (fourchette : on garde la 1ère valeur)
    """
    if raw is None:
        return None, None

    txt = raw.strip()
    if not txt:
        return None, None

    # Missing value
    if txt == "---":
        return None, None

    # "< LQ", "< 2,0", etc. => on force à 0
    if txt.startswith("<"):
        return 0.0, None

    # Basic cleanup
    txt = txt.replace("\xa0", " ")     # non-breaking space
    txt = txt.replace("\u00ad", "")    # soft hyphen, if present

    # Split into value / tolerance (± or +/-)
    if "±" in txt:
        v_part, t_part = txt.split("±", 1)
    elif "+/-" in txt:
        v_part, t_part = txt.split("+/-", 1)
    else:
        v_part, t_part = txt, None

    def to_float(part: str):
        if part is None:
            return None
        part = part.strip()
        if not part:
            return None

        # Range handling: "600 - 1000" -> keep 600
        range_match = re.match(r"^\s*([\d.,]+)\s*-\s*([\d.,]+)\s*$", part)
        if range_match:
            part = range_match.group(1)

        # Normalisation: keep only digits, ',', '.' and '-'
        cleaned = re.sub(r"[^0-9,.\-]", "", part)
        if not cleaned:
            return None

        # Decimal comma -> dot
        cleaned = cleaned.replace(",", ".")

        try:
            val = float(cleaned)
            return truncate(val, 3)
        except Exception:
            logger.warning(
                "Échec conversion numérique pour '%s' (%s)", raw, context
            )
            return None

    value = to_float(v_part)
    tolerance = to_float(t_part) if t_part else None
    return value, tolerance
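
# Illustrative calls (expected results follow the rules in the docstring above):
#   parse_value_with_tolerance("146,8 ±30,1", "demo") -> (146.8, 30.1)
#   parse_value_with_tolerance("< LQ", "demo")        -> (0.0, None)
#   parse_value_with_tolerance("---", "demo")         -> (None, None)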


def merge_numeric(existing, new):
    """
    Règles de fusion :
      - None + valeur => valeur
      - 0 et >0       => >0 prioritaire
      - >0 et 0       => >0 prioritaire
      - deux >0       => on garde l'existant (priorité à l'ordre de lecture)
    """
    if existing is None:
        return new
    if new is None:
        return existing

    if existing == 0 and new > 0:
        return new
    if existing > 0 and new == 0:
        return existing

    return existing
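
# Illustrative merges (per the rules above):
#   merge_numeric(None, 0.5) -> 0.5
#   merge_numeric(0.0, 1.2)  -> 1.2
#   merge_numeric(1.2, 0.0)  -> 1.2
#   merge_numeric(1.2, 3.4)  -> 1.2   (the value read first wins)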


def merge_rows(existing: dict, new: dict) -> dict:
    # Numeric values
    for field in ("psp", "asp", "ao", "aza", "ytx"):
        existing[field] = merge_numeric(existing.get(field), new.get(field))

    # Tolerances: keep the first non-None one
    for field in ("psp_tolerance", "asp_tolerance",
                  "ao_tolerance", "aza_tolerance", "ytx_tolerance"):
        if existing.get(field) is None and new.get(field) is not None:
            existing[field] = new[field]

    # Overall result: positive if at least one row is positive
    if existing.get("resultat", 0) == 1 or new.get("resultat", 0) == 1:
        existing["resultat"] = 1
        existing["resultat_texte"] = "positif"
    else:
        existing["resultat"] = 0
        existing["resultat_texte"] = "négatif"

    return existing


# ---------------------------------------------------------------------------
# HTTP / ASP.NET
# ---------------------------------------------------------------------------

def get_datos_url(session: requests.Session) -> str:
    resp = session.get(START_URL, timeout=20)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    link = soup.find("a", href=True, string=lambda t: t and "biotoxinas por zonas" in t)
    if not link:
        link = soup.find("a", href=lambda h: h and "DatosMSZona.aspx" in h)

    if not link:
        raise RuntimeError("Impossible de trouver le lien vers DatosMSZona.aspx")

    href = link["href"]
    if href.startswith("http"):
        return href
    return requests.compat.urljoin(START_URL, href)


def get_zone_page(session: requests.Session, datos_url: str, zone_value: str) -> BeautifulSoup:
    """
    zone_value = code_site venant de la base (ex: 'BAIONA-A').
    On doit l'envoyer en version 10 caractères comme dans le <select> :
    'BAIONA-A  ', 'BUEU-A1   ', etc.
    """
    posted_value = zone_value.ljust(10)

    resp = session.get(datos_url, timeout=20)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    form = soup.find("form")
    if not form:
        raise RuntimeError("Formulaire ASP.NET introuvable sur DatosMSZona.aspx")

    def get_input_value(name: str) -> str:
        tag = form.find("input", {"name": name})
        return tag["value"] if tag and tag.has_attr("value") else ""

    viewstate = get_input_value("__VIEWSTATE")
    event_validation = get_input_value("__EVENTVALIDATION")
    viewstate_generator = get_input_value("__VIEWSTATEGENERATOR")

    payload = {
        "__EVENTTARGET": DROPDOWN_NAME,
        "__EVENTARGUMENT": "",
        "__VIEWSTATE": viewstate,
        "__EVENTVALIDATION": event_validation,
        "__VIEWSTATEGENERATOR": viewstate_generator,
        DROPDOWN_NAME: posted_value,
    }

    # Add the remaining form fields
    for inp in form.find_all("input"):
        name = inp.get("name")
        if not name or name in payload:
            continue
        payload.setdefault(name, inp.get("value", ""))

    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; analyses_scraper/1.3)",
        "Referer": datos_url,
    }

    resp_post = session.post(datos_url, data=payload, headers=headers, timeout=20)
    resp_post.raise_for_status()
    return BeautifulSoup(resp_post.text, "html.parser")


# ---------------------------------------------------------------------------
# Parsing of the analyses table
# ---------------------------------------------------------------------------

def parse_grid_for_site(soup: BeautifulSoup, id_site: int, code_site: str, nom_site: str):
    table = soup.find("table", id=GRID_ID)
    if not table:
        logger.warning(
            "Tableau des analyses introuvable pour le site %s (%s)", code_site, nom_site
        )
        return []

    rows = table.find_all("tr")
    if len(rows) <= 1:
        logger.info(
            "Aucune donnée (hors en-tête) pour le site %s (%s)", code_site, nom_site
        )
        return []

    data_rows = []

    for tr in rows[1:]:
        tds = tr.find_all("td")
        if len(tds) < 9:
            continue
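
        # Column layout used below (as read from GridView2):
        #   tds[0] date, tds[2] depth(s), tds[3] PSP, tds[4] ASP,
        #   tds[5] AO, tds[6] AZA, tds[7] YTX, tds[8] overall result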

        # Date
        date_txt = tds[0].get_text(strip=True)
        try:
            d = datetime.strptime(date_txt, "%d/%m/%Y").date()
        except Exception:
            logger.warning(
                "Date invalide '%s' pour le site %s (%s)", date_txt, code_site, nom_site
            )
            continue

        # Depth(s)
        prof_txt = tds[2].get_text(strip=True)
        prof_numbers = re.findall(r"\d+", prof_txt)
        if not prof_numbers:
            logger.warning(
                "Profondeur introuvable '%s' pour le site %s (%s) - date %s",
                prof_txt, code_site, nom_site, d,
            )
            continue
        profondeurs = [int(x) for x in prof_numbers]

        # Result
        res_txt = tds[8].get_text(strip=True).lower()
        if "positiv" in res_txt:
            resultat = 1
            resultat_texte = "positif"
        else:
            resultat = 0
            resultat_texte = "négatif"

        context_base = f"site={code_site}, date={d}"

        # Toxin columns
        psp_val, psp_tol = parse_value_with_tolerance(
            tds[3].get_text(strip=True), context_base + ", toxine=PSP"
        )
        asp_val, asp_tol = parse_value_with_tolerance(
            tds[4].get_text(strip=True), context_base + ", toxine=ASP"
        )
        ao_val, ao_tol = parse_value_with_tolerance(
            tds[5].get_text(strip=True), context_base + ", toxine=AO"
        )
        aza_val, aza_tol = parse_value_with_tolerance(
            tds[6].get_text(strip=True), context_base + ", toxine=AZA"
        )
        ytx_val, ytx_tol = parse_value_with_tolerance(
            tds[7].get_text(strip=True), context_base + ", toxine=YTX"
        )

        for prof in profondeurs:
            data_rows.append(
                {
                    "id_site": id_site,
                    "date_analyse": d,
                    "profondeur": prof,
                    "psp": psp_val,
                    "psp_tolerance": psp_tol,
                    "asp": asp_val,
                    "asp_tolerance": asp_tol,
                    "ao": ao_val,
                    "ao_tolerance": ao_tol,
                    "aza": aza_val,
                    "aza_tolerance": aza_tol,
                    "ytx": ytx_val,
                    "ytx_tolerance": ytx_tol,
                    "resultat": resultat,
                    "resultat_texte": resultat_texte,
                }
            )

    return data_rows


# ---------------------------------------------------------------------------
# Fetching the sites
# ---------------------------------------------------------------------------

def fetch_sites(conn):
    """
    On lit la table `analyses_sites` (les 64 valeurs de la liste déroulante).
    code_site doit correspondre EXACTEMENT à la value du <option>.
    """
    sql = """
        SELECT id_site, code_site, nom
        FROM analyses_sites
        ORDER BY id_site
    """
    with conn.cursor(dictionary=True) as cur:
        cur.execute(sql)
        return cur.fetchall()


# ---------------------------------------------------------------------------
# Insertion / update
# ---------------------------------------------------------------------------

UPSERT_SQL = """
INSERT INTO analyses_releves (
    id_site, date_analyse, profondeur,
    psp, psp_tolerance,
    asp, asp_tolerance,
    ao, ao_tolerance,
    aza, aza_tolerance,
    ytx, ytx_tolerance,
    resultat, resultat_texte,
    created_at, updated_at
)
VALUES (
    %(id_site)s, %(date_analyse)s, %(profondeur)s,
    %(psp)s, %(psp_tolerance)s,
    %(asp)s, %(asp_tolerance)s,
    %(ao)s, %(ao_tolerance)s,
    %(aza)s, %(aza_tolerance)s,
    %(ytx)s, %(ytx_tolerance)s,
    %(resultat)s, %(resultat_texte)s,
    NOW(), NOW()
)
ON DUPLICATE KEY UPDATE
    psp = VALUES(psp),
    psp_tolerance = VALUES(psp_tolerance),
    asp = VALUES(asp),
    asp_tolerance = VALUES(asp_tolerance),
    ao = VALUES(ao),
    ao_tolerance = VALUES(ao_tolerance),
    aza = VALUES(aza),
    aza_tolerance = VALUES(aza_tolerance),
    ytx = VALUES(ytx),
    ytx_tolerance = VALUES(ytx_tolerance),
    resultat = VALUES(resultat),
    resultat_texte = VALUES(resultat_texte),
    updated_at = NOW();
"""


def save_analyses(conn, analyses_list):
    if not analyses_list:
        return 0
    inserted = 0
    with conn.cursor() as cur:
        for rec in analyses_list:
            cur.execute(UPSERT_SQL, rec)
            inserted += 1
        conn.commit()
    return inserted


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    start_time = datetime.now()
    logger.info(
        "Démarrage du script analyses_scraper (mode logs=%s)",
        ENV.get("LOG_MODE", "normal")
    )

    try:
        conn = get_db_connection()
    except Exception:
        logger.error("Arrêt du script : impossible de se connecter à la base.")
        return

    try:
        sites = fetch_sites(conn)
        if not sites:
            logger.warning("Aucun site trouvé dans la table `sites`.")
            return

        logger.info("Nombre de sites à traiter : %d", len(sites))

        session = requests.Session()
        datos_url = get_datos_url(session)
        logger.debug("URL DatosMSZona.aspx : %s", datos_url)

        for site in sites:
            id_site = site["id_site"]
            code_site = site["code_site"]
            nom_site = site["nom"]

            logger.info("Traitement du site %s (%s)", code_site, nom_site)

            try:
                soup_zone = get_zone_page(session, datos_url, code_site)
            except Exception as e:
                logger.error(
                    "Erreur HTTP/ASP.NET pour le site %s (%s) : %s",
                    code_site, nom_site, e
                )
                logger.debug("Traceback : %s", traceback.format_exc())
                continue

            raw_rows = parse_grid_for_site(soup_zone, id_site, code_site, nom_site)

            # Merge by (id_site, date_analyse, profondeur)
            merged = {}
            for row in raw_rows:
                key = (row["id_site"], row["date_analyse"], row["profondeur"])
                if key in merged:
                    merged[key] = merge_rows(merged[key], row)
                else:
                    merged[key] = row

            final_rows = list(merged.values())

            logger.info(
                "Site %s (%s) : %d lignes brutes, %d lignes fusionnées",
                code_site, nom_site, len(raw_rows), len(final_rows)
            )

            try:
                count = save_analyses(conn, final_rows)
                logger.info(
                    "Site %s (%s) : %d lignes insérées/mises à jour",
                    code_site, nom_site, count
                )
            except Exception as e:
                logger.error(
                    "Erreur lors de l'insertion en base pour le site %s (%s) : %s",
                    code_site, nom_site, e
                )
                logger.debug("Traceback : %s", traceback.format_exc())

        session.close()

    finally:
        try:
            conn.close()
        except Exception:
            pass

        end_time = datetime.now()
        duration = (end_time - start_time).total_seconds()
        logger.info(
            "Fin du script analyses_scraper - début=%s fin=%s durée=%.2f s",
            start_time.strftime("%Y-%m-%d %H:%M:%S"),
            end_time.strftime("%Y-%m-%d %H:%M:%S"),
            duration,
        )
        
        # Final summary into the essential log (intecmar.log)
        logger.warning(
            "Résumé analyse_scraper : debut=%s fin=%s durée=%.2fs",
            start_time.strftime("%Y-%m-%d %H:%M:%S"),
            end_time.strftime("%Y-%m-%d %H:%M:%S"),
            duration,
        )

if __name__ == "__main__":
    main()