# Source code for scrape_leafly

"""Leafly Strain Scraper -- Harvest ALL strains into terpene_profiles.yaml.

Extracts strain data directly from Leafly's __NEXT_DATA__ JSON embedded
in listing pages. Each listing page contains ~18 strains with full
terpene profiles, effects, cannabinoids, and metadata.

Total: ~9000 strains across ~500 pages.

# 💀🔥 scraping the entire weed bible 🌿
#
# Usage:
#   python scrape_leafly.py                          # scrape ALL strains
#   python scrape_leafly.py --pages 5                # first 5 pages only
#   python scrape_leafly.py --merge                  # merge into terpene_profiles.yaml
#   python scrape_leafly.py --output my_strains.yaml
"""

from __future__ import annotations

import argparse
import jsonutil as json
import logging
import os
import re
import time
from typing import Any, Dict, List, Optional, Set

import requests
import yaml

# Configure root logging at import time: INFO level, timestamped lines with the
# level name. ``basicConfig`` is a no-op when the root logger already has
# handlers, so an embedding application's own logging setup takes precedence.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
# Module-scoped logger used by every function in this file.
logger = logging.getLogger(__name__)

# ═══════════════════════════════════════════════════════════════════════
# CONFIG
# ═══════════════════════════════════════════════════════════════════════

# Site root; the listing URL below is derived from it.
LEAFLY_BASE = "https://www.leafly.com"
# Paginated strain listing; the scraper requests it as ``?page=N``.
STRAINS_LIST_URL = f"{LEAFLY_BASE}/strains"
# Desktop Chrome user-agent string sent with every request.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/131.0.0.0 Safari/537.36"
)
# Default headers applied to the shared session by ``_create_session``.
# NOTE(review): "br" is advertised in Accept-Encoding; requests/urllib3 only
# decode Brotli bodies when a brotli package is installed -- confirm it is
# available wherever this script runs, otherwise ``resp.text`` may be garbage.
HEADERS = {
    "User-Agent": USER_AGENT,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
}

# Rate limiting
PAGE_DELAY_S = 1.5  # delay between listing pages
MAX_RETRIES = 3  # default attempts per URL in _fetch_with_retry
RETRY_BACKOFF = 2.0  # base of the exponential retry sleep, in seconds

# 🌿 Terpene name normalization
# Maps lowercase, hyphen-separated terpene names (including isomer prefixes and
# abbreviations such as "beta-" / "b-") to one canonical uppercase key. Several
# spellings intentionally collapse onto the same key (e.g. alpha- and
# beta-pinene both become PINENE). Read by ``_normalize_terpene``.
TERPENE_NORMALIZE = {
    "myrcene": "MYRCENE",
    "beta-myrcene": "MYRCENE",
    "b-myrcene": "MYRCENE",
    "limonene": "LIMONENE",
    "d-limonene": "LIMONENE",
    "linalool": "LINALOOL",
    "caryophyllene": "CARYOPHYLLENE",
    "beta-caryophyllene": "CARYOPHYLLENE",
    "b-caryophyllene": "CARYOPHYLLENE",
    "pinene": "PINENE",
    "alpha-pinene": "PINENE",
    "a-pinene": "PINENE",
    "beta-pinene": "PINENE",
    "humulene": "HUMULENE",
    "alpha-humulene": "HUMULENE",
    "a-humulene": "HUMULENE",
    "terpinolene": "TERPINOLENE",
    "bisabolol": "BISABOLOL",
    "alpha-bisabolol": "BISABOLOL",
    "a-bisabolol": "BISABOLOL",
    "valencene": "VALENCENE",
    "eucalyptol": "EUCALYPTOL",
    "geraniol": "GERANIOL",
    "phytol": "PHYTOL",
    "camphene": "CAMPHENE",
    "borneol": "BORNEOL",
    "ocimene": "OCIMENE",
    "nerolidol": "NEROLIDOL",
    "trans-nerolidol": "NEROLIDOL",
    "guaiol": "GUAIOL",
    "carene": "CARENE",
    "delta-3-carene": "CARENE",
    "terpineol": "TERPINEOL",
    "alpha-terpineol": "TERPINEOL",
    "phellandrene": "PHELLANDRENE",
    "sabinene": "SABINENE",
    "cymene": "CYMENE",
    "p-cymene": "CYMENE",
    "fenchol": "FENCHOL",
    "pulegone": "PULEGONE",
    "cedrene": "CEDRENE",
    "isopulegol": "ISOPULEGOL",
    "maltol": "MALTOL",
}

# Classification -> strain_gradient base
# Starting point on a 0..1 scale where higher values lean sativa and lower
# values lean indica. ``parse_listing_strain`` looks the lowercase category up
# here (defaulting to 0.50 for unknown categories) and then nudges the value
# according to the strain's reported effects.
CLASSIFICATION_GRADIENT = {
    "sativa": 0.85,
    "indica": 0.15,
    "hybrid": 0.50,
}


# ═══════════════════════════════════════════════════════════════════════
# HTTP HELPERS
# ═══════════════════════════════════════════════════════════════════════


def _create_session() -> requests.Session:
    """Return a fresh ``requests.Session`` carrying the scraper's default headers.

    The module-level ``HEADERS`` dict is merged into the session's own header
    set, so every request issued through the session sends the browser-like
    User-Agent, Accept and encoding headers. Nothing is sent over the network
    here; the session is only constructed and configured.

    ``scrape_all_strains`` creates one session per run and passes it to each
    ``_fetch_with_retry`` call.

    Returns:
        requests.Session: The newly created, header-configured session.
    """
    session = requests.Session()
    session.headers.update(HEADERS)
    return session


def _fetch_with_retry(
    session: requests.Session,
    url: str,
    max_retries: int = MAX_RETRIES,
) -> Optional[requests.Response]:
    """GET ``url`` with bounded retries and exponential backoff.

    Each attempt issues ``session.get(url, timeout=20)`` and reacts to the
    outcome:

    * ``200`` -- the response is returned immediately.
    * ``404`` -- ``None`` is returned immediately; the page does not exist so
      retrying is pointless.
    * ``429`` -- rate limited; the next attempt is delayed by
      ``RETRY_BACKOFF ** (attempt + 1) * 3`` seconds.
    * any other status, or a ``requests.RequestException`` -- a warning is
      logged and the next attempt is delayed by ``RETRY_BACKOFF ** attempt``
      seconds.

    No sleep is performed after the final attempt: once there is nothing left
    to retry the function returns ``None`` straight away instead of stalling
    the caller for one more backoff period.

    Args:
        session (requests.Session): Session used to issue the request
            (normally the one built by ``_create_session``).
        url (str): Fully-qualified URL to fetch.
        max_retries (int): Maximum number of attempts; defaults to the
            module-level ``MAX_RETRIES``. Zero or negative means no attempt
            is made.

    Returns:
        Optional[requests.Response]: The ``200`` response, or ``None`` when
        the page 404s or every attempt failed.
    """
    for attempt in range(max_retries):
        final_attempt = attempt + 1 >= max_retries
        # Default pause for transport errors and unexpected status codes.
        wait = RETRY_BACKOFF**attempt
        try:
            resp = session.get(url, timeout=20)
        except requests.RequestException as e:
            logger.warning("Request error: %s (attempt %d)", e, attempt + 1)
        else:
            if resp.status_code == 200:
                return resp
            if resp.status_code == 404:
                return None
            if resp.status_code == 429:
                # Rate limiting gets a longer pause than ordinary failures.
                wait = RETRY_BACKOFF ** (attempt + 1) * 3
                if final_attempt:
                    logger.warning(
                        "Rate limited on final attempt (%d/%d), giving up",
                        attempt + 1,
                        max_retries,
                    )
                else:
                    logger.warning("Rate limited, waiting %.1fs...", wait)
            else:
                logger.warning(
                    "HTTP %d for %s (attempt %d/%d)",
                    resp.status_code,
                    url,
                    attempt + 1,
                    max_retries,
                )
        # Only back off when another attempt will actually follow.
        if not final_attempt:
            time.sleep(wait)
    return None


def _normalize_terpene(name: str) -> str:
    """Canonicalize a raw terpene name to a stable uppercase key.

    Surrounding whitespace is removed first. The lookup key is the stripped
    name lowercased with spaces turned into hyphens; it is resolved through
    the module-level ``TERPENE_NORMALIZE`` map, which collapses synonyms such
    as ``beta-myrcene`` / ``b-myrcene`` onto ``MYRCENE``. Names missing from
    the map fall back to the stripped name uppercased with hyphens and spaces
    replaced by underscores.

    The fallback is derived from the *stripped* name so that padded input
    (``" Foo "``) yields the same key as unpadded input (``"FOO"``) instead of
    a key wrapped in stray underscores.

    Called by ``parse_listing_strain`` while building ``terpene_weights``.

    Args:
        name (str): Raw terpene name as it appears in the scraped data.

    Returns:
        str: The canonical terpene key, e.g. ``"MYRCENE"``.
    """
    stripped = name.strip()
    lookup_key = stripped.lower().replace(" ", "-")
    fallback = stripped.upper().replace("-", "_").replace(" ", "_")
    return TERPENE_NORMALIZE.get(lookup_key, fallback)


# ═══════════════════════════════════════════════════════════════════════
# __NEXT_DATA__ EXTRACTION
# ═══════════════════════════════════════════════════════════════════════


def _extract_next_data(html: str) -> Optional[dict]:
    """Extract and parse the embedded ``__NEXT_DATA__`` JSON blob from a Leafly page.

    Leafly is a Next.js site that ships each page's hydration state in a
    ``<script id="__NEXT_DATA__">`` tag; this regex-finds that script body and
    parses it so the strain list can be read without scraping the rendered DOM.
    Returns ``None`` when the tag is absent or the JSON fails to parse, so the
    caller can treat the page as empty rather than raise. Pure in-memory parsing
    (regex plus ``json.loads``) with no network I/O. Called by
    :func:`scrape_all_strains` on each fetched listing page; no other callers.

    Args:
        html: The raw HTML body of a Leafly listing page.

    Returns:
        The decoded ``__NEXT_DATA__`` object as a dict, or ``None`` when the
        script tag is missing or its contents are not valid JSON.
    """
    match = re.search(
        r'<script\s+id="__NEXT_DATA__"\s+type="application/json">(.*?)</script>',
        html,
        re.DOTALL,
    )
    if not match:
        return None
    try:
        return json.loads(match.group(1))
    except json.JSONDecodeError:
        return None


def _extract_strains_from_page(next_data: dict) -> List[dict]:
    """Pull the list of raw strain objects out of a parsed ``__NEXT_DATA__`` blob.

    Walks the nested ``props.pageProps.data.strains`` path of the Next.js
    hydration data to reach the page's strain array, guarding every step so a
    missing or restructured payload yields an empty list instead of raising.
    Pure dict traversal with no I/O. Called by :func:`scrape_all_strains` after
    :func:`_extract_next_data`; an empty return feeds the caller's
    consecutive-empty-page counter. No other callers.

    Args:
        next_data: The parsed ``__NEXT_DATA__`` object for one listing page.

    Returns:
        The list of raw per-strain dicts for the page, or an empty list when
        the expected path is absent or not a list.
    """
    try:
        # Path: props.pageProps.data.strains
        page_props = next_data.get("props", {}).get("pageProps", {})
        data = page_props.get("data", {})
        strains = data.get("strains", [])
        if isinstance(strains, list):
            return strains
    except Exception:
        pass
    return []


def _extract_total_count(next_data: dict) -> int:
    """Read the total strain count from a page's ``__NEXT_DATA__`` metadata.

    Reaches into ``props.pageProps.data.metadata.totalCount`` to recover how
    many strains Leafly reports overall, which the scraper logs once (from the
    first page) as a progress target. Every lookup is guarded so a malformed
    payload yields ``0`` rather than an exception. Pure dict traversal with no
    I/O. Called by :func:`scrape_all_strains` on the first listing page only; no
    other callers.

    Args:
        next_data: The parsed ``__NEXT_DATA__`` object for one listing page.

    Returns:
        The reported total strain count, or ``0`` when the metadata is missing
        or non-numeric.
    """
    try:
        page_props = next_data.get("props", {}).get("pageProps", {})
        metadata = page_props.get("data", {}).get("metadata", {})
        return int(metadata.get("totalCount", 0))
    except Exception:
        return 0


# ═══════════════════════════════════════════════════════════════════════
# STRAIN DATA PARSING
# ═══════════════════════════════════════════════════════════════════════



# [docs]  (documentation-link marker left over from HTML extraction)
# Effects that nudge ``strain_gradient`` toward the sativa (higher) end.
_SATIVA_EFFECTS = frozenset(
    {
        "energetic",
        "focused",
        "creative",
        "uplifted",
        "euphoric",
        "happy",
        "talkative",
        "giggly",
    }
)
# Effects that nudge ``strain_gradient`` toward the indica (lower) end.
_INDICA_EFFECTS = frozenset(
    {
        "relaxed",
        "sleepy",
        "hungry",
        "sedated",
        "calm",
        "aroused",
        "tingly",
    }
)


def _effect_score(value: Any) -> float:
    """Return a sortable numeric score for one raw effect entry (0.0 if unusable)."""
    if isinstance(value, dict):
        value = value.get("score", 0)
    try:
        return float(value or 0)
    except (TypeError, ValueError):
        return 0.0


def parse_listing_strain(raw: dict) -> Optional[Dict[str, Any]]:
    """Parse a single strain from listing page __NEXT_DATA__.

    Each strain object in the listing contains:
    - slug, name, category
    - terps: {terpene_name: {score: float}}
    - effects: {effect_name: {score: float}}
    - cannabinoids: {thc: {percentile50: float}, ...}

    Processing steps:

    * ``name`` falls back to a title-cased ``slug`` when it is missing, null
      or empty; a strain with neither is rejected.
    * Dict-shaped ``terps`` are normalized so the weights sum to 1. Scores of
      names that ``_normalize_terpene`` collapses onto the same key are added
      together rather than overwriting each other. List-shaped ``terps`` keep
      their raw scores (1.0 for bare strings).
    * Up to five effects are kept, highest score first for dict-shaped
      ``effects``; unusable scores sort as 0 and empty names are skipped.
    * ``strain_gradient`` starts from ``CLASSIFICATION_GRADIENT`` (0.50 for an
      unknown category) and is shifted by at most 0.1 according to the balance
      of sativa-like and indica-like effects, clamped to ``[0.02, 0.98]``.
    * The YAML key is the uppercased name (or slug) reduced to ``A-Z0-9_``.

    Args:
        raw: One raw strain dict from the listing payload.

    Returns:
        A dict with ``_yaml_key``, ``strain_gradient``, ``classification`` and
        ``thc_pct``, plus ``cbd_pct`` (only above 0.5), ``terpene_weights`` and
        ``effects`` when available. ``None`` when the strain has no usable
        name/slug/key or parsing fails; failures are logged at debug level.
    """
    try:
        # ``or`` (not a .get default) so explicit nulls also fall back.
        slug = raw.get("slug") or ""
        name = raw.get("name") or slug.replace("-", " ").title()
        if not slug and not name:
            return None
        category = (raw.get("category") or "Hybrid").lower()

        # ── Cannabinoids ──
        cannabinoids = raw.get("cannabinoids", {}) or {}
        thc_data = cannabinoids.get("thc", {}) or {}
        cbd_data = cannabinoids.get("cbd", {}) or {}
        thc_pct = thc_data.get("percentile50", 0) or 0
        cbd_pct = cbd_data.get("percentile50", 0) or 0

        # ── Terpenes ──
        terps_raw = raw.get("terps", {}) or {}
        terpene_weights: Dict[str, float] = {}

        if isinstance(terps_raw, dict):
            # Format: {terpene_name: {score: float}}
            scores: Dict[str, float] = {}
            for tname, tdata in terps_raw.items():
                score = 1.0
                if isinstance(tdata, dict):
                    score = float(tdata.get("score", 1.0) or 1.0)
                elif isinstance(tdata, (int, float)):
                    score = float(tdata)
                canonical = _normalize_terpene(tname)
                # Accumulate: several raw names may share one canonical key.
                scores[canonical] = scores.get(canonical, 0.0) + score

            total_score = sum(scores.values())
            if total_score > 0:
                terpene_weights = {
                    tname: round(tscore / total_score, 4)
                    for tname, tscore in scores.items()
                }
            elif scores:
                # No usable magnitudes: spread the weight evenly.
                terpene_weights = dict.fromkeys(scores, round(1.0 / len(scores), 4))

        elif isinstance(terps_raw, list):
            # Alternate format: list of terpene objects/strings
            for t in terps_raw:
                if isinstance(t, str):
                    terpene_weights[_normalize_terpene(t)] = 1.0
                elif isinstance(t, dict):
                    tname = t.get("name") or ""
                    tscore = float(t.get("score", 1.0) or 1.0)
                    if tname:
                        terpene_weights[_normalize_terpene(tname)] = tscore

        # ── Effects ──
        effects_raw = raw.get("effects", {}) or {}
        effect_list: List[str] = []
        if isinstance(effects_raw, dict):
            # Sort by score descending, take top 5
            ranked = sorted(
                effects_raw.items(),
                key=lambda item: _effect_score(item[1]),
                reverse=True,
            )
            effect_list = [ename.lower() for ename, _ in ranked[:5]]
        elif isinstance(effects_raw, list):
            for entry in effects_raw[:5]:
                label = entry.get("name") if isinstance(entry, dict) else entry
                if isinstance(label, str) and label:
                    effect_list.append(label.lower())

        # ── Strain gradient ──
        base_gradient = CLASSIFICATION_GRADIENT.get(category, 0.50)

        # Refine with effects: bias lies in [-0.5, 0.5], so the shift is at
        # most +/-0.1 before clamping.
        s_score = sum(1 for e in effect_list if e in _SATIVA_EFFECTS)
        i_score = sum(1 for e in effect_list if e in _INDICA_EFFECTS)
        if s_score + i_score > 0:
            bias = (s_score - i_score) / ((s_score + i_score) * 2)
            base_gradient = max(0.02, min(0.98, base_gradient + bias * 0.2))

        # ── YAML key ──
        yaml_key = re.sub(
            r"[^A-Z0-9_]", "", name.upper().replace(" ", "_").replace("-", "_")
        )
        if not yaml_key:
            yaml_key = re.sub(r"[^A-Z0-9_]", "", slug.upper().replace("-", "_"))
        if not yaml_key:
            # Nothing usable as a mapping key; an empty key would corrupt
            # the generated YAML.
            return None

        result = {
            "_yaml_key": yaml_key,
            "strain_gradient": round(base_gradient, 2),
            "classification": category,
            "thc_pct": round(float(thc_pct), 1) if thc_pct else 0,
        }

        if cbd_pct and float(cbd_pct) > 0.5:
            result["cbd_pct"] = round(float(cbd_pct), 1)

        if terpene_weights:
            result["terpene_weights"] = terpene_weights

        if effect_list:
            result["effects"] = effect_list

        return result

    except Exception as e:
        # Deliberate best-effort boundary: one malformed strain must not
        # abort the whole page.
        logger.debug("Failed to parse strain: %s", e)
        return None



# ═══════════════════════════════════════════════════════════════════════
# MAIN SCRAPER
# ═══════════════════════════════════════════════════════════════════════



# [docs]  (documentation-link marker left over from HTML extraction)
def _yaml_scalar(text: Any) -> str:
    """Return ``text`` as a YAML scalar, quoted unless it is safe to write plain."""
    text = str(text)
    try:
        # Round-trip the text as both key and value: plain scalars such as
        # 1024, NO or null would otherwise load back as int/bool/None.
        plain_ok = yaml.safe_load(f"{text}: {text}") == {text: text}
    except (yaml.YAMLError, ValueError):
        plain_ok = False
    return text if plain_ok else json.dumps(text)


def _write_strains_yaml(
    all_strains: Dict[str, Dict[str, Any]],
    total_expected: int,
    output_path: str,
) -> None:
    """Write the collected strains to ``output_path`` as a commented YAML file."""
    lines = [
        "# ======================================================================\n",
        "# LEAFLY STRAIN DATABASE -- Scraped from leafly.com\n",
        f"# Total strains: {len(all_strains)}\n",
        f"# Expected: {total_expected}\n",
        "# Format compatible with terpene_profiles.yaml / TerpeneEngine\n",
        "# ======================================================================\n\n",
        # A bare "strains:" loads as null; emit an explicit empty mapping.
        "strains:\n\n" if all_strains else "strains: {}\n",
    ]

    for key in sorted(all_strains):
        data = all_strains[key]
        classification = _yaml_scalar(data.get("classification", "hybrid"))
        lines.append(f"  {_yaml_scalar(key)}:\n")
        lines.append(f"    strain_gradient: {data.get('strain_gradient', 0.5)}\n")
        lines.append(f"    classification: {classification}\n")
        lines.append(f"    thc_pct: {data.get('thc_pct', 0)}\n")
        if "cbd_pct" in data:
            lines.append(f"    cbd_pct: {data['cbd_pct']}\n")
        if "terpene_weights" in data:
            lines.append("    terpene_weights:\n")
            # Heaviest terpene first.
            for tname, tweight in sorted(
                data["terpene_weights"].items(),
                key=lambda x: -x[1],
            ):
                lines.append(f"      {_yaml_scalar(tname)}: {tweight}\n")
        if "effects" in data:
            # A JSON array is a valid YAML flow sequence.
            effects_str = json.dumps(data["effects"])
            lines.append(f"    effects: {effects_str}\n")
        lines.append("\n")

    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(lines)


def scrape_all_strains(
    max_pages: Optional[int] = None,
    output_path: str = "leafly_strains.yaml",
    page_delay: float = PAGE_DELAY_S,
) -> int:
    """Scrape all Leafly strains from listing pages.

    Each listing page's __NEXT_DATA__ contains ~18 strains with
    terpene profiles, effects, and cannabinoid data. No need to
    visit individual strain pages.

    Pages are requested sequentially starting at ``?page=1`` with
    ``page_delay`` seconds between them. A page counts as empty when the
    fetch fails, no ``__NEXT_DATA__`` payload is found, or the payload holds
    no strains; three empty pages in a row end the scrape. Strains whose key
    is already taken are stored under ``KEY_2``, ``KEY_3``, ... so nothing is
    overwritten. The result is always written to ``output_path``, even when
    no strain was collected.

    Args:
        max_pages: Maximum number of listing pages to request. ``None``
            means no limit; ``0`` (or a negative value) requests no pages.
        output_path: Destination of the generated YAML file.
        page_delay: Seconds to sleep between listing pages.

    Returns:
        The number of strains written to ``output_path``.
    """
    max_consecutive_empty = 3

    session = _create_session()
    all_strains: Dict[str, Dict[str, Any]] = {}
    total_expected = 0
    page = 1
    consecutive_empty = 0

    logger.info("Starting Leafly strain scrape... 🌿")

    while True:
        # Compare against None so that an explicit limit of 0 is honoured.
        if max_pages is not None and page > max_pages:
            logger.info("Reached page limit (%d)", max_pages)
            break

        url = f"{STRAINS_LIST_URL}?page={page}"
        logger.info("Fetching page %d...", page)

        strains: List[dict] = []
        stop_message = "3 consecutive failures, stopping at page %d"

        resp = _fetch_with_retry(session, url)
        if resp is not None:
            next_data = _extract_next_data(resp.text)
            if not next_data:
                logger.warning("No __NEXT_DATA__ found on page %d", page)
                stop_message = (
                    "3 consecutive pages without __NEXT_DATA__, stopping at page %d"
                )
            else:
                # Normally read from page 1; retried on later pages if the
                # first usable page did not report a count.
                if total_expected == 0:
                    total_expected = _extract_total_count(next_data)
                    logger.info("Total strains on Leafly: %d", total_expected)
                strains = _extract_strains_from_page(next_data)
                stop_message = "No more strains found, stopping at page %d"

        if not strains:
            consecutive_empty += 1
            if consecutive_empty >= max_consecutive_empty:
                logger.info(stop_message, page)
                break
            page += 1
            time.sleep(page_delay)
            continue

        consecutive_empty = 0
        new_count = 0

        for raw_strain in strains:
            parsed = parse_listing_strain(raw_strain)
            if not parsed:
                continue

            key = parsed.pop("_yaml_key")
            if key in all_strains:
                # Handle duplicates by appending a suffix
                suffix = 2
                while f"{key}_{suffix}" in all_strains:
                    suffix += 1
                key = f"{key}_{suffix}"

            all_strains[key] = parsed
            new_count += 1

        logger.info(
            "Page %d: %d strains (%d new, %d total)",
            page,
            len(strains),
            new_count,
            len(all_strains),
        )

        page += 1
        time.sleep(page_delay)

    # ── Write YAML ──
    logger.info("Writing %d strains to %s...", len(all_strains), output_path)
    _write_strains_yaml(all_strains, total_expected, output_path)

    logger.info(
        "DONE 🔥 %d strains scraped. Output: %s",
        len(all_strains),
        output_path,
    )
    return len(all_strains)



# ═══════════════════════════════════════════════════════════════════════
# MERGE INTO TERPENE_PROFILES.YAML
# ═══════════════════════════════════════════════════════════════════════



# [docs]  (documentation-link marker left over from HTML extraction)
def merge_into_terpene_profiles(
    leafly_yaml_path: str,
    terpene_profiles_path: str,
) -> int:
    """Merge scraped Leafly strains into terpene_profiles.yaml.

    Only adds strains not already in the curated database; entries that
    already exist there are never modified. Both files are read with
    ``yaml.safe_load``. A missing or null ``strains`` section in either file
    is treated as an empty mapping.

    The curated file is rewritten only when at least one strain was added.
    ``yaml.dump`` re-serializes the whole document (comments in the existing
    file are not carried over), so skipping the write when there is nothing
    to add leaves a hand-maintained file untouched.

    Args:
        leafly_yaml_path: Path of the scraped YAML produced by
            ``scrape_all_strains``.
        terpene_profiles_path: Path of the curated YAML to extend in place.

    Returns:
        Count of new strains added.
    """
    with open(terpene_profiles_path, "r", encoding="utf-8") as f:
        existing = yaml.safe_load(f) or {}
    # ``or {}`` also covers an explicit ``strains:`` with no value (null).
    existing_strains = existing.get("strains") or {}

    with open(leafly_yaml_path, "r", encoding="utf-8") as f:
        scraped = yaml.safe_load(f) or {}
    scraped_strains = scraped.get("strains") or {}

    added = 0
    for key, data in scraped_strains.items():
        if key not in existing_strains:
            existing_strains[key] = data
            added += 1

    if added:
        existing["strains"] = existing_strains
        with open(terpene_profiles_path, "w", encoding="utf-8") as f:
            yaml.dump(
                existing,
                f,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,
                width=120,
            )

    logger.info("Merged %d new strains into %s", added, terpene_profiles_path)
    return added



# ═══════════════════════════════════════════════════════════════════════
# CLI
# ═══════════════════════════════════════════════════════════════════════



# [docs]  (documentation-link marker left over from HTML extraction)
def main():
    """Command-line entry point: scrape, then optionally merge.

    Recognized options are ``--pages`` (page limit), ``--output`` (YAML
    destination), ``--merge`` (flag) and ``--delay`` (seconds between pages).
    The parsed values are forwarded to ``scrape_all_strains``.

    When ``--merge`` is given and the scrape produced at least one strain,
    the output file is merged into ``terpene_profiles.yaml`` located in the
    same directory as this module via ``merge_into_terpene_profiles``. If
    that file does not exist a warning is logged and the merge is skipped.
    """
    cli = argparse.ArgumentParser(
        description="Scrape ALL strains from Leafly into YAML",
    )
    cli.add_argument(
        "--pages",
        type=int,
        default=None,
        help="Max listing pages to scrape (default: all ~500 pages)",
    )
    cli.add_argument(
        "--output",
        type=str,
        default="leafly_strains.yaml",
        help="Output YAML file path (default: leafly_strains.yaml)",
    )
    cli.add_argument(
        "--merge",
        action="store_true",
        help="After scraping, merge new strains into terpene_profiles.yaml",
    )
    cli.add_argument(
        "--delay",
        type=float,
        default=PAGE_DELAY_S,
        help=f"Delay between pages in seconds (default: {PAGE_DELAY_S})",
    )
    opts = cli.parse_args()

    scraped_total = scrape_all_strains(
        max_pages=opts.pages,
        output_path=opts.output,
        page_delay=opts.delay,
    )

    # Nothing further to do unless a merge was requested and there is data.
    if not opts.merge or scraped_total <= 0:
        return

    module_dir = os.path.dirname(os.path.abspath(__file__))
    terp_path = os.path.join(module_dir, "terpene_profiles.yaml")
    if not os.path.exists(terp_path):
        logger.warning("terpene_profiles.yaml not found, skipping merge")
        return

    merge_into_terpene_profiles(opts.output, terp_path)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()