# Source code for scrape_leafly

"""Leafly Strain Scraper -- Harvest ALL strains into terpene_profiles.yaml.

Extracts strain data directly from Leafly's __NEXT_DATA__ JSON embedded
in listing pages. Each listing page contains ~18 strains with full
terpene profiles, effects, cannabinoids, and metadata.

Total: ~9000 strains across ~500 pages.

# 💀🔥 scraping the entire weed bible 🌿
#
# Usage:
#   python scrape_leafly.py                          # scrape ALL strains
#   python scrape_leafly.py --pages 5                # first 5 pages only
#   python scrape_leafly.py --merge                  # merge into terpene_profiles.yaml
#   python scrape_leafly.py --output my_strains.yaml
"""

from __future__ import annotations

import argparse
import json
import logging
import os
import re
import sys
import time
from typing import Any, Dict, List, Optional, Set, Tuple

import requests
import yaml

# Module-wide logging: timestamped INFO-level lines, used by all helpers below.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
logger = logging.getLogger(__name__)

# ═══════════════════════════════════════════════════════════════════════
# CONFIG
# ═══════════════════════════════════════════════════════════════════════

LEAFLY_BASE = "https://www.leafly.com"
STRAINS_LIST_URL = f"{LEAFLY_BASE}/strains"
# Desktop Chrome UA; the listing pages serve the __NEXT_DATA__ payload to
# ordinary browser-looking requests.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/131.0.0.0 Safari/537.36"
)
HEADERS = {
    "User-Agent": USER_AGENT,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
}

# Rate limiting
PAGE_DELAY_S = 1.5           # delay between listing pages
MAX_RETRIES = 3              # attempts per URL before giving up
RETRY_BACKOFF = 2.0          # exponential backoff base, in seconds

# 🌿 Terpene name normalization: maps the many aliases / isomer prefixes
# (beta-, alpha-, d-, trans-, ...) onto one canonical UPPER_SNAKE key
# per terpene. Keys are pre-lowercased with spaces replaced by hyphens
# (see _normalize_terpene).
TERPENE_NORMALIZE = {
    "myrcene": "MYRCENE",
    "beta-myrcene": "MYRCENE",
    "b-myrcene": "MYRCENE",
    "limonene": "LIMONENE",
    "d-limonene": "LIMONENE",
    "linalool": "LINALOOL",
    "caryophyllene": "CARYOPHYLLENE",
    "beta-caryophyllene": "CARYOPHYLLENE",
    "b-caryophyllene": "CARYOPHYLLENE",
    "pinene": "PINENE",
    "alpha-pinene": "PINENE",
    "a-pinene": "PINENE",
    "beta-pinene": "PINENE",
    "humulene": "HUMULENE",
    "alpha-humulene": "HUMULENE",
    "a-humulene": "HUMULENE",
    "terpinolene": "TERPINOLENE",
    "bisabolol": "BISABOLOL",
    "alpha-bisabolol": "BISABOLOL",
    "a-bisabolol": "BISABOLOL",
    "valencene": "VALENCENE",
    "eucalyptol": "EUCALYPTOL",
    "geraniol": "GERANIOL",
    "phytol": "PHYTOL",
    "camphene": "CAMPHENE",
    "borneol": "BORNEOL",
    "ocimene": "OCIMENE",
    "nerolidol": "NEROLIDOL",
    "trans-nerolidol": "NEROLIDOL",
    "guaiol": "GUAIOL",
    "carene": "CARENE",
    "delta-3-carene": "CARENE",
    "terpineol": "TERPINEOL",
    "alpha-terpineol": "TERPINEOL",
    "phellandrene": "PHELLANDRENE",
    "sabinene": "SABINENE",
    "cymene": "CYMENE",
    "p-cymene": "CYMENE",
    "fenchol": "FENCHOL",
    "pulegone": "PULEGONE",
    "cedrene": "CEDRENE",
    "isopulegol": "ISOPULEGOL",
    "maltol": "MALTOL",
}

# Classification -> strain_gradient base
# The scale appears to run 0.0 (indica) .. 1.0 (sativa) with hybrid at the
# midpoint — inferred from these values; confirm against the downstream
# TerpeneEngine consumer.
CLASSIFICATION_GRADIENT = {
    "sativa": 0.85,
    "indica": 0.15,
    "hybrid": 0.50,
}

# ═══════════════════════════════════════════════════════════════════════
# HTTP HELPERS
# ═══════════════════════════════════════════════════════════════════════

def _create_session() -> requests.Session:
    """Build a requests session preloaded with the browser-like headers."""
    session = requests.Session()
    session.headers.update(HEADERS)
    return session


def _fetch_with_retry(
    session: requests.Session,
    url: str,
    max_retries: int = MAX_RETRIES,
) -> Optional[requests.Response]:
    """GET *url* with exponential backoff.

    Returns the response on HTTP 200, or None on a 404 or once all
    retries are exhausted. A 429 triggers an extended cool-down before
    the next attempt; other statuses and transport errors back off by
    RETRY_BACKOFF ** attempt seconds.
    """
    attempt = 0
    while attempt < max_retries:
        try:
            response = session.get(url, timeout=20)
        except requests.RequestException as exc:
            logger.warning("Request error: %s (attempt %d)", exc, attempt + 1)
        else:
            if response.status_code == 200:
                return response
            if response.status_code == 429:
                # Rate-limited: wait longer than the normal backoff and
                # skip straight to the next attempt.
                cooldown = RETRY_BACKOFF ** (attempt + 1) * 3
                logger.warning("Rate limited, waiting %.1fs...", cooldown)
                time.sleep(cooldown)
                attempt += 1
                continue
            if response.status_code == 404:
                return None
            logger.warning(
                "HTTP %d for %s (attempt %d/%d)",
                response.status_code, url, attempt + 1, max_retries,
            )
        time.sleep(RETRY_BACKOFF ** attempt)
        attempt += 1
    return None


def _normalize_terpene(name: str) -> str:
    """Map a raw terpene label onto its canonical UPPER_SNAKE key.

    Known aliases resolve via TERPENE_NORMALIZE; unknown names fall back
    to an uppercased, underscore-joined form of the original label.
    """
    lookup_key = name.lower().strip().replace(" ", "-")
    fallback = name.upper().replace("-", "_").replace(" ", "_")
    return TERPENE_NORMALIZE.get(lookup_key, fallback)


# ═══════════════════════════════════════════════════════════════════════
# __NEXT_DATA__ EXTRACTION
# ═══════════════════════════════════════════════════════════════════════

def _extract_next_data(html: str) -> Optional[dict]:
    """Extract __NEXT_DATA__ JSON from an HTML page."""
    match = re.search(
        r'<script\s+id="__NEXT_DATA__"\s+type="application/json">(.*?)</script>',
        html, re.DOTALL,
    )
    if not match:
        return None
    try:
        return json.loads(match.group(1))
    except json.JSONDecodeError:
        return None


def _extract_strains_from_page(next_data: dict) -> List[dict]:
    """Extract strain list from __NEXT_DATA__ pageProps."""
    try:
        # Path: props.pageProps.data.strains
        page_props = next_data.get("props", {}).get("pageProps", {})
        data = page_props.get("data", {})
        strains = data.get("strains", [])
        if isinstance(strains, list):
            return strains
    except Exception:
        pass
    return []


def _extract_total_count(next_data: dict) -> int:
    """Get total strain count from metadata."""
    try:
        page_props = next_data.get("props", {}).get("pageProps", {})
        metadata = page_props.get("data", {}).get("metadata", {})
        return int(metadata.get("totalCount", 0))
    except Exception:
        return 0


# ═══════════════════════════════════════════════════════════════════════
# STRAIN DATA PARSING
# ═══════════════════════════════════════════════════════════════════════

def parse_listing_strain(raw: dict) -> Optional[Dict[str, Any]]:
    """Parse a single strain from listing page __NEXT_DATA__.

    Each strain object in the listing contains:
    - slug, name, category
    - terps: {terpene_name: {score: float}}
    - effects: {effect_name: {score: float}}
    - cannabinoids: {thc: {percentile50: float}, ...}

    Returns a dict ready for YAML serialization, including a
    ``_yaml_key`` entry naming the strain, or None when the record is
    unusable.
    """
    try:
        slug = raw.get("slug", "")
        # `or` (rather than a .get default) also covers an explicit
        # null/empty "name" field in the payload; previously that made
        # name.upper() raise and silently dropped the strain.
        name = raw.get("name") or slug.replace("-", " ").title()
        category = (raw.get("category") or "Hybrid").lower()
        if not slug and not name:
            return None

        # ── Cannabinoids ──
        cannabinoids = raw.get("cannabinoids", {}) or {}
        thc_data = cannabinoids.get("thc", {}) or {}
        cbd_data = cannabinoids.get("cbd", {}) or {}
        thc_pct = thc_data.get("percentile50", 0) or 0
        cbd_pct = cbd_data.get("percentile50", 0) or 0

        # ── Terpenes ──
        terps_raw = raw.get("terps", {}) or {}
        terpene_weights: Dict[str, float] = {}
        if isinstance(terps_raw, dict):
            # Format: {terpene_name: {score: float}} — weights are
            # normalized so they sum to ~1.0.
            total_score = 0.0
            raw_terps = []
            for tname, tdata in terps_raw.items():
                score = 1.0
                if isinstance(tdata, dict):
                    score = float(tdata.get("score", 1.0) or 1.0)
                elif isinstance(tdata, (int, float)):
                    score = float(tdata)
                normalized = _normalize_terpene(tname)
                raw_terps.append((normalized, score))
                total_score += score
            if total_score > 0:
                for tname, tscore in raw_terps:
                    terpene_weights[tname] = round(tscore / total_score, 4)
            elif raw_terps:
                # All scores zero: fall back to a uniform split.
                for tname, _ in raw_terps:
                    terpene_weights[tname] = round(1.0 / len(raw_terps), 4)
        elif isinstance(terps_raw, list):
            # Alternate format: list of terpene objects/strings
            for t in terps_raw:
                if isinstance(t, str):
                    terpene_weights[_normalize_terpene(t)] = 1.0
                elif isinstance(t, dict):
                    tname = t.get("name", "")
                    tscore = float(t.get("score", 1.0) or 1.0)
                    if tname:
                        terpene_weights[_normalize_terpene(tname)] = tscore

        # ── Effects ──
        effects_raw = raw.get("effects", {}) or {}
        effect_list = []
        if isinstance(effects_raw, dict):
            # Sort by score descending, take top 5
            sorted_effects = sorted(
                effects_raw.items(),
                key=lambda x: (
                    x[1].get("score", 0)
                    if isinstance(x[1], dict) else float(x[1] or 0)
                ),
                reverse=True,
            )
            for ename, _ in sorted_effects[:5]:
                effect_list.append(ename.lower())
        elif isinstance(effects_raw, list):
            for e in effects_raw[:5]:
                if isinstance(e, str):
                    effect_list.append(e.lower())
                elif isinstance(e, dict):
                    effect_list.append(e.get("name", "").lower())

        # ── Strain gradient ──
        base_gradient = CLASSIFICATION_GRADIENT.get(category, 0.50)
        # Refine with effects: nudge the classification-based gradient
        # toward sativa or indica depending on which effect family
        # dominates the top-5 list.
        sativa_fx = {"energetic", "focused", "creative", "uplifted",
                     "euphoric", "happy", "talkative", "giggly"}
        indica_fx = {"relaxed", "sleepy", "hungry", "sedated", "calm",
                     "aroused", "tingly"}
        s_score = sum(1 for e in effect_list if e in sativa_fx)
        i_score = sum(1 for e in effect_list if e in indica_fx)
        if s_score + i_score > 0:
            # bias ∈ [-0.5, 0.5]; scaled to at most ±0.1 of gradient,
            # clamped away from the extremes.
            bias = (s_score - i_score) / ((s_score + i_score) * 2)
            base_gradient = max(0.02, min(0.98, base_gradient + bias * 0.2))

        # ── YAML key ──
        yaml_key = name.upper().replace(" ", "_").replace("-", "_").replace("'", "")
        yaml_key = re.sub(r"[^A-Z0-9_]", "", yaml_key)
        if not yaml_key:
            yaml_key = slug.upper().replace("-", "_")

        result = {
            "_yaml_key": yaml_key,
            "strain_gradient": round(base_gradient, 2),
            "classification": category,
            "thc_pct": round(float(thc_pct), 1) if thc_pct else 0,
        }
        # CBD is only worth recording above trace amounts.
        if cbd_pct and float(cbd_pct) > 0.5:
            result["cbd_pct"] = round(float(cbd_pct), 1)
        if terpene_weights:
            result["terpene_weights"] = terpene_weights
        if effect_list:
            result["effects"] = effect_list
        return result
    except Exception as e:
        # Best-effort parser: a malformed record is skipped, not fatal.
        logger.debug("Failed to parse strain: %s", e)
        return None
# ═══════════════════════════════════════════════════════════════════════
# MAIN SCRAPER
# ═══════════════════════════════════════════════════════════════════════
def scrape_all_strains(
    max_pages: Optional[int] = None,
    output_path: str = "leafly_strains.yaml",
    page_delay: float = PAGE_DELAY_S,
) -> int:
    """Scrape all Leafly strains from listing pages.

    Each listing page's __NEXT_DATA__ contains ~18 strains with terpene
    profiles, effects, and cannabinoid data. No need to visit individual
    strain pages.

    Args:
        max_pages: stop after this many listing pages (None = all).
        output_path: destination YAML file.
        page_delay: seconds to sleep between page fetches.

    Returns:
        Number of strains written to *output_path*.
    """
    session = _create_session()
    all_strains: Dict[str, Dict[str, Any]] = {}
    seen_keys: Set[str] = set()
    total_expected = 0
    page = 1
    # Three failures in a row (fetch error, missing payload, or an empty
    # strain list) is treated as the end of the listing.
    consecutive_empty = 0

    logger.info("Starting Leafly strain scrape... 🌿")

    while True:
        if max_pages and page > max_pages:
            logger.info("Reached page limit (%d)", max_pages)
            break

        url = f"{STRAINS_LIST_URL}?page={page}"
        logger.info("Fetching page %d...", page)
        resp = _fetch_with_retry(session, url)
        if not resp:
            consecutive_empty += 1
            if consecutive_empty >= 3:
                logger.info("3 consecutive failures, stopping at page %d", page)
                break
            page += 1
            time.sleep(page_delay)
            continue

        next_data = _extract_next_data(resp.text)
        if not next_data:
            logger.warning("No __NEXT_DATA__ found on page %d", page)
            consecutive_empty += 1
            if consecutive_empty >= 3:
                break
            page += 1
            time.sleep(page_delay)
            continue

        # Get total count on first page
        if page == 1:
            total_expected = _extract_total_count(next_data)
            logger.info("Total strains on Leafly: %d", total_expected)

        strains = _extract_strains_from_page(next_data)
        if not strains:
            consecutive_empty += 1
            if consecutive_empty >= 3:
                logger.info("No more strains found, stopping at page %d", page)
                break
            page += 1
            time.sleep(page_delay)
            continue

        consecutive_empty = 0
        new_count = 0
        for raw_strain in strains:
            parsed = parse_listing_strain(raw_strain)
            if not parsed:
                continue
            key = parsed.pop("_yaml_key")
            if key in seen_keys:
                # Handle duplicates by appending a suffix
                suffix = 2
                while f"{key}_{suffix}" in seen_keys:
                    suffix += 1
                key = f"{key}_{suffix}"
            seen_keys.add(key)
            all_strains[key] = parsed
            new_count += 1

        logger.info(
            "Page %d: %d strains (%d new, %d total)",
            page, len(strains), new_count, len(all_strains),
        )
        page += 1
        time.sleep(page_delay)

    # ── Write YAML ──
    # Hand-written rather than yaml.dump to control key order, comments,
    # and terpene weight ordering (highest weight first).
    logger.info("Writing %d strains to %s...", len(all_strains), output_path)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("# ======================================================================\n")
        f.write("# LEAFLY STRAIN DATABASE -- Scraped from leafly.com\n")
        f.write(f"# Total strains: {len(all_strains)}\n")
        f.write(f"# Expected: {total_expected}\n")
        f.write("# Format compatible with terpene_profiles.yaml / TerpeneEngine\n")
        f.write("# ======================================================================\n\n")
        f.write("strains:\n\n")
        for key in sorted(all_strains.keys()):
            data = all_strains[key]
            f.write(f"  {key}:\n")
            f.write(f"    strain_gradient: {data.get('strain_gradient', 0.5)}\n")
            f.write(f"    classification: {data.get('classification', 'hybrid')}\n")
            f.write(f"    thc_pct: {data.get('thc_pct', 0)}\n")
            if "cbd_pct" in data:
                f.write(f"    cbd_pct: {data['cbd_pct']}\n")
            if "terpene_weights" in data:
                f.write("    terpene_weights:\n")
                for tname, tweight in sorted(
                    data["terpene_weights"].items(), key=lambda x: -x[1],
                ):
                    f.write(f"      {tname}: {tweight}\n")
            if "effects" in data:
                # JSON list syntax is valid YAML flow-sequence syntax.
                effects_str = json.dumps(data["effects"])
                f.write(f"    effects: {effects_str}\n")
            f.write("\n")

    logger.info(
        "DONE 🔥 %d strains scraped. Output: %s",
        len(all_strains), output_path,
    )
    return len(all_strains)
# ═══════════════════════════════════════════════════════════════════════
# MERGE INTO TERPENE_PROFILES.YAML
# ═══════════════════════════════════════════════════════════════════════
def merge_into_terpene_profiles(
    leafly_yaml_path: str,
    terpene_profiles_path: str,
) -> int:
    """Merge scraped Leafly strains into terpene_profiles.yaml.

    Only adds strains not already present in the curated database —
    curated entries are never overwritten. Returns the count of new
    strains added.
    """
    with open(terpene_profiles_path, "r", encoding="utf-8") as fh:
        curated = yaml.safe_load(fh) or {}
    curated_strains = curated.get("strains", {})

    with open(leafly_yaml_path, "r", encoding="utf-8") as fh:
        scraped = yaml.safe_load(fh) or {}
    scraped_strains = scraped.get("strains", {})

    # Only genuinely new keys are copied over.
    new_keys = [key for key in scraped_strains if key not in curated_strains]
    for key in new_keys:
        curated_strains[key] = scraped_strains[key]

    curated["strains"] = curated_strains
    with open(terpene_profiles_path, "w", encoding="utf-8") as fh:
        yaml.dump(
            curated,
            fh,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
            width=120,
        )

    logger.info("Merged %d new strains into %s", len(new_keys), terpene_profiles_path)
    return len(new_keys)
# ═══════════════════════════════════════════════════════════════════════
# CLI
# ═══════════════════════════════════════════════════════════════════════
def main():
    """CLI entry point: scrape strains, then optionally merge into the curated DB."""
    parser = argparse.ArgumentParser(
        description="Scrape ALL strains from Leafly into YAML",
    )
    parser.add_argument(
        "--pages",
        type=int,
        default=None,
        help="Max listing pages to scrape (default: all ~500 pages)",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="leafly_strains.yaml",
        help="Output YAML file path (default: leafly_strains.yaml)",
    )
    parser.add_argument(
        "--merge",
        action="store_true",
        help="After scraping, merge new strains into terpene_profiles.yaml",
    )
    parser.add_argument(
        "--delay",
        type=float,
        default=PAGE_DELAY_S,
        help=f"Delay between pages in seconds (default: {PAGE_DELAY_S})",
    )
    args = parser.parse_args()

    count = scrape_all_strains(
        max_pages=args.pages,
        output_path=args.output,
        page_delay=args.delay,
    )

    if args.merge and count > 0:
        # terpene_profiles.yaml is expected to live next to this script.
        terp_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "terpene_profiles.yaml",
        )
        if os.path.exists(terp_path):
            merge_into_terpene_profiles(args.output, terp_path)
        else:
            logger.warning("terpene_profiles.yaml not found, skipping merge")
# Script entry point.
if __name__ == "__main__":
    main()