"""Leafly Strain Scraper -- Harvest ALL strains into terpene_profiles.yaml.
Extracts strain data directly from Leafly's __NEXT_DATA__ JSON embedded
in listing pages. Each listing page contains ~18 strains with full
terpene profiles, effects, cannabinoids, and metadata.
Total: ~9000 strains across ~500 pages.
# 💀🔥 scraping the entire weed bible 🌿
#
# Usage:
# python scrape_leafly.py # scrape ALL strains
# python scrape_leafly.py --pages 5 # first 5 pages only
# python scrape_leafly.py --merge # scrape, then merge new strains into terpene_profiles.yaml
# python scrape_leafly.py --output my_strains.yaml
"""
from __future__ import annotations
import argparse
import jsonutil as json
import logging
import os
import re
import time
from typing import Any, Dict, List, Optional, Set
import requests
import yaml
# Configure logging once at import time: INFO threshold and a
# "timestamp [LEVEL] message" line format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
# Module-level logger; the functions in this file log through it.
logger = logging.getLogger(__name__)
# ═══════════════════════════════════════════════════════════════════════
# CONFIG
# ═══════════════════════════════════════════════════════════════════════
# Site root and the paginated strain-listing endpoint derived from it.
LEAFLY_BASE = "https://www.leafly.com"
STRAINS_LIST_URL = f"{LEAFLY_BASE}/strains"

# Desktop-Chrome User-Agent string sent with every request.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/131.0.0.0 Safari/537.36"
)

# Default headers merged into the shared session by ``_create_session``.
#
# "Accept-Encoding" is deliberately NOT set here. Forcing "gzip, deflate, br"
# advertises Brotli even when no Brotli decoder is installed; a br-encoded
# response would then reach the parser undecoded and the __NEXT_DATA__ regex
# would never match. Leaving the header out keeps the session's own default,
# which only lists encodings the HTTP stack can actually decode.
HEADERS = {
    "User-Agent": USER_AGENT,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
}

# Rate limiting
PAGE_DELAY_S = 1.5  # delay between listing pages, in seconds
MAX_RETRIES = 3  # default number of attempts per URL
RETRY_BACKOFF = 2.0  # base of the exponential retry sleep, in seconds
# 🌿 Terpene name normalization
# Canonical terpene key -> extra spellings that collapse onto it. The
# lowercase form of the canonical key itself is always accepted as well.
_TERPENE_ALIASES = (
    ("MYRCENE", ("beta-myrcene", "b-myrcene")),
    ("LIMONENE", ("d-limonene",)),
    ("LINALOOL", ()),
    ("CARYOPHYLLENE", ("beta-caryophyllene", "b-caryophyllene")),
    ("PINENE", ("alpha-pinene", "a-pinene", "beta-pinene")),
    ("HUMULENE", ("alpha-humulene", "a-humulene")),
    ("TERPINOLENE", ()),
    ("BISABOLOL", ("alpha-bisabolol", "a-bisabolol")),
    ("VALENCENE", ()),
    ("EUCALYPTOL", ()),
    ("GERANIOL", ()),
    ("PHYTOL", ()),
    ("CAMPHENE", ()),
    ("BORNEOL", ()),
    ("OCIMENE", ()),
    ("NEROLIDOL", ("trans-nerolidol",)),
    ("GUAIOL", ()),
    ("CARENE", ("delta-3-carene",)),
    ("TERPINEOL", ("alpha-terpineol",)),
    ("PHELLANDRENE", ()),
    ("SABINENE", ()),
    ("CYMENE", ("p-cymene",)),
    ("FENCHOL", ()),
    ("PULEGONE", ()),
    ("CEDRENE", ()),
    ("ISOPULEGOL", ()),
    ("MALTOL", ()),
)

# Flat lookup used by ``_normalize_terpene``: lowercase, hyphenated spelling
# -> canonical uppercase key.
TERPENE_NORMALIZE = {
    spelling: canonical
    for canonical, extra_spellings in _TERPENE_ALIASES
    for spelling in (canonical.lower(), *extra_spellings)
}
# Classification -> strain_gradient base
# Starting strain_gradient for each Leafly category (indica low, sativa high,
# hybrid in the middle). ``parse_listing_strain`` nudges this value according
# to the strain's reported effects.
CLASSIFICATION_GRADIENT = dict(
    sativa=0.85,
    indica=0.15,
    hybrid=0.50,
)
# ═══════════════════════════════════════════════════════════════════════
# HTTP HELPERS
# ═══════════════════════════════════════════════════════════════════════
def _create_session() -> requests.Session:
    """Return a new ``requests.Session`` carrying the scraper's default headers.

    The module-level ``HEADERS`` dict is merged into the session's headers so
    that every request issued through the session sends them. Nothing is
    fetched here; the function only constructs and configures the object.

    ``scrape_all_strains`` calls this once per run and hands the session to
    ``_fetch_with_retry`` for each listing page.

    Returns:
        requests.Session: The configured session.
    """
    session = requests.Session()
    session.headers.update(HEADERS)
    return session
def _fetch_with_retry(
    session: requests.Session,
    url: str,
    max_retries: int = MAX_RETRIES,
) -> Optional[requests.Response]:
    """GET ``url`` with retries and backoff; return the 200 response or ``None``.

    Each attempt issues ``session.get(url, timeout=20)`` and reacts to the
    outcome:

    * ``200`` -- the response is returned immediately.
    * ``404`` -- ``None`` is returned immediately; the page is not retried.
    * ``429`` -- a warning is logged and the next attempt is delayed by
      ``RETRY_BACKOFF ** (attempt + 1) * 3`` seconds.
    * any other status, or a ``requests.RequestException`` -- a warning is
      logged and the next attempt is delayed by ``RETRY_BACKOFF ** attempt``
      seconds.

    No sleep happens after the final attempt: there is nothing left to wait
    for, so the function returns ``None`` straight away instead of stalling
    the caller for one more backoff period.

    Args:
        session: Session used to issue the request (see ``_create_session``).
        url: Fully-qualified URL to fetch.
        max_retries: Maximum number of attempts; defaults to ``MAX_RETRIES``.
            A value of ``0`` or less performs no request and returns ``None``.

    Returns:
        The ``200`` response, or ``None`` when the page returned 404 or every
        attempt failed.
    """
    for attempt in range(max_retries):
        is_last_attempt = attempt == max_retries - 1
        # Default delay before the next attempt; overridden for rate limiting.
        wait = RETRY_BACKOFF**attempt
        try:
            resp = session.get(url, timeout=20)
        except requests.RequestException as e:
            logger.warning("Request error: %s (attempt %d)", e, attempt + 1)
        else:
            status = resp.status_code
            if status == 200:
                return resp
            if status == 404:
                return None
            if status == 429:
                # Rate limited: back off considerably longer than for other errors.
                wait = RETRY_BACKOFF ** (attempt + 1) * 3
                if is_last_attempt:
                    logger.warning("Rate limited on final attempt for %s", url)
                else:
                    logger.warning("Rate limited, waiting %.1fs...", wait)
            else:
                logger.warning(
                    "HTTP %d for %s (attempt %d/%d)",
                    status,
                    url,
                    attempt + 1,
                    max_retries,
                )
        if not is_last_attempt:
            time.sleep(wait)
    return None
def _normalize_terpene(name: str) -> str:
    """Map a raw terpene name onto its canonical uppercase key.

    The name is stripped, lowercased, and has spaces and underscores turned
    into hyphens before being looked up in ``TERPENE_NORMALIZE`` (so
    ``"Beta Myrcene"``, ``"beta-myrcene"`` and ``"beta_myrcene"`` all resolve
    to ``"MYRCENE"``). Names missing from the table fall back to the stripped
    name uppercased, with hyphens and spaces replaced by underscores.

    The fallback is built from the *stripped* name, so surrounding whitespace
    never leaks into the key as leading/trailing underscores.

    Args:
        name: Raw terpene name as found in the scraped data.

    Returns:
        The canonical key from ``TERPENE_NORMALIZE`` or the uppercased
        fallback form.
    """
    stripped = name.strip()
    lookup_key = stripped.lower().replace(" ", "-").replace("_", "-")
    try:
        return TERPENE_NORMALIZE[lookup_key]
    except KeyError:
        return stripped.upper().replace("-", "_").replace(" ", "_")
# ═══════════════════════════════════════════════════════════════════════
# __NEXT_DATA__ EXTRACTION
# ═══════════════════════════════════════════════════════════════════════
def _extract_next_data(html: str) -> Optional[dict]:
"""Extract and parse the embedded ``__NEXT_DATA__`` JSON blob from a Leafly page.
Leafly is a Next.js site that ships each page's hydration state in a
``<script id="__NEXT_DATA__">`` tag; this regex-finds that script body and
parses it so the strain list can be read without scraping the rendered DOM.
Returns ``None`` when the tag is absent or the JSON fails to parse, so the
caller can treat the page as empty rather than raise. Pure in-memory parsing
(regex plus ``json.loads``) with no network I/O. Called by
:func:`scrape_all_strains` on each fetched listing page; no other callers.
Args:
html: The raw HTML body of a Leafly listing page.
Returns:
The decoded ``__NEXT_DATA__`` object as a dict, or ``None`` when the
script tag is missing or its contents are not valid JSON.
"""
match = re.search(
r'<script\s+id="__NEXT_DATA__"\s+type="application/json">(.*?)</script>',
html,
re.DOTALL,
)
if not match:
return None
try:
return json.loads(match.group(1))
except json.JSONDecodeError:
return None
def _extract_strains_from_page(next_data: dict) -> List[dict]:
"""Pull the list of raw strain objects out of a parsed ``__NEXT_DATA__`` blob.
Walks the nested ``props.pageProps.data.strains`` path of the Next.js
hydration data to reach the page's strain array, guarding every step so a
missing or restructured payload yields an empty list instead of raising.
Pure dict traversal with no I/O. Called by :func:`scrape_all_strains` after
:func:`_extract_next_data`; an empty return feeds the caller's
consecutive-empty-page counter. No other callers.
Args:
next_data: The parsed ``__NEXT_DATA__`` object for one listing page.
Returns:
The list of raw per-strain dicts for the page, or an empty list when
the expected path is absent or not a list.
"""
try:
# Path: props.pageProps.data.strains
page_props = next_data.get("props", {}).get("pageProps", {})
data = page_props.get("data", {})
strains = data.get("strains", [])
if isinstance(strains, list):
return strains
except Exception:
pass
return []
def _extract_total_count(next_data: dict) -> int:
"""Read the total strain count from a page's ``__NEXT_DATA__`` metadata.
Reaches into ``props.pageProps.data.metadata.totalCount`` to recover how
many strains Leafly reports overall, which the scraper logs once (from the
first page) as a progress target. Every lookup is guarded so a malformed
payload yields ``0`` rather than an exception. Pure dict traversal with no
I/O. Called by :func:`scrape_all_strains` on the first listing page only; no
other callers.
Args:
next_data: The parsed ``__NEXT_DATA__`` object for one listing page.
Returns:
The reported total strain count, or ``0`` when the metadata is missing
or non-numeric.
"""
try:
page_props = next_data.get("props", {}).get("pageProps", {})
metadata = page_props.get("data", {}).get("metadata", {})
return int(metadata.get("totalCount", 0))
except Exception:
return 0
# ═══════════════════════════════════════════════════════════════════════
# STRAIN DATA PARSING
# ═══════════════════════════════════════════════════════════════════════
# [docs] -- stray documentation-viewer link label (not code); commented out so it is not evaluated
# Effect names that pull a strain's gradient toward the sativa end.
_SATIVA_EFFECTS = frozenset(
    {
        "energetic",
        "focused",
        "creative",
        "uplifted",
        "euphoric",
        "happy",
        "talkative",
        "giggly",
    }
)
# Effect names that pull a strain's gradient toward the indica end.
_INDICA_EFFECTS = frozenset(
    {
        "relaxed",
        "sleepy",
        "hungry",
        "sedated",
        "calm",
        "aroused",
        "tingly",
    }
)


def _score_of(value: Any, default: float) -> float:
    """Read a score from a bare number or a ``{"score": ...}`` dict.

    A missing/``None`` score or an unconvertible value yields ``default``; an
    explicit ``0`` is kept as ``0.0`` rather than being replaced.
    """
    if isinstance(value, dict):
        value = value.get("score")
    if value is None:
        return default
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _parse_terpene_weights(terps_raw: Any) -> Dict[str, float]:
    """Turn the dict or list terpene payload into weights that sum to ~1."""
    if isinstance(terps_raw, dict):
        scored = [(tname, _score_of(tdata, 1.0)) for tname, tdata in terps_raw.items()]
    elif isinstance(terps_raw, list):
        scored = []
        for entry in terps_raw:
            if isinstance(entry, str):
                scored.append((entry, 1.0))
            elif isinstance(entry, dict) and entry.get("name"):
                scored.append((entry["name"], _score_of(entry, 1.0)))
    else:
        return {}

    # Several raw names can normalize to one key (e.g. alpha-/beta-pinene);
    # add their scores together instead of letting the last one win.
    totals: Dict[str, float] = {}
    for raw_name, score in scored:
        key = _normalize_terpene(str(raw_name))
        totals[key] = totals.get(key, 0.0) + score
    if not totals:
        return {}

    grand_total = sum(totals.values())
    if grand_total > 0:
        return {key: round(score / grand_total, 4) for key, score in totals.items()}
    # No usable scores at all: spread the weight evenly.
    even_share = round(1.0 / len(totals), 4)
    return {key: even_share for key in totals}


def _parse_effects(effects_raw: Any, limit: int = 5) -> List[str]:
    """Return up to ``limit`` lowercase effect names, highest score first for dicts."""
    if isinstance(effects_raw, dict):
        ranked = sorted(
            effects_raw.items(),
            key=lambda item: _score_of(item[1], 0.0),
            reverse=True,
        )
        names = [ename for ename, _ in ranked]
    elif isinstance(effects_raw, list):
        names = [
            entry if isinstance(entry, str) else entry.get("name")
            for entry in effects_raw
            if isinstance(entry, (str, dict))
        ]
    else:
        return []
    # Skip entries without a usable name instead of emitting "".
    return [str(ename).lower() for ename in names if ename][:limit]


def _strain_gradient(category: str, effects: List[str]) -> float:
    """Start from the category's base gradient and nudge it by the effects."""
    gradient = CLASSIFICATION_GRADIENT.get(category, 0.50)
    sativa_hits = sum(1 for effect in effects if effect in _SATIVA_EFFECTS)
    indica_hits = sum(1 for effect in effects if effect in _INDICA_EFFECTS)
    hits = sativa_hits + indica_hits
    if hits > 0:
        # bias is in [-0.5, 0.5], so the adjustment is at most +/-0.1.
        bias = (sativa_hits - indica_hits) / (hits * 2)
        gradient = max(0.02, min(0.98, gradient + bias * 0.2))
    return gradient


def parse_listing_strain(raw: dict) -> Optional[Dict[str, Any]]:
    """Parse one raw strain object from a listing page's ``__NEXT_DATA__``.

    Fields read from ``raw`` (all optional; ``null`` values are treated like
    missing ones):

    - ``slug``, ``name``, ``category``
    - ``terps``: ``{terpene_name: {score: float}}``, ``{terpene_name: float}``,
      or a list of names / ``{name, score}`` dicts
    - ``effects``: ``{effect_name: {score: float}}`` or a list of names /
      ``{name}`` dicts
    - ``cannabinoids``: ``{thc: {percentile50: float}, cbd: {...}}``

    Terpene names are canonicalized with ``_normalize_terpene``; names that
    collapse onto the same key have their scores summed, and weights are
    normalized to sum to ~1 for both input shapes (an even split is used when
    no positive total exists). An explicit score of ``0`` stays ``0``; only a
    missing score defaults to ``1.0``.

    Args:
        raw: One element of the listing page's strain array.

    Returns:
        A dict with ``_yaml_key``, ``strain_gradient``, ``classification``,
        ``thc_pct`` and, when available, ``cbd_pct`` (only above 0.5),
        ``terpene_weights`` and ``effects`` -- or ``None`` when the entry has
        neither slug nor name, yields no usable key, or cannot be parsed.
    """
    try:
        slug = str(raw.get("slug") or "")
        name = str(raw.get("name") or slug.replace("-", " ").title())
        if not slug and not name:
            return None
        category = str(raw.get("category") or "Hybrid").lower()

        # ── Cannabinoids ──
        cannabinoids = raw.get("cannabinoids") or {}
        thc_pct = (cannabinoids.get("thc") or {}).get("percentile50") or 0
        cbd_pct = (cannabinoids.get("cbd") or {}).get("percentile50") or 0

        # ── Terpenes / effects / gradient ──
        terpene_weights = _parse_terpene_weights(raw.get("terps") or {})
        effect_list = _parse_effects(raw.get("effects") or {})
        gradient = _strain_gradient(category, effect_list)

        # ── YAML key ──
        yaml_key = re.sub(
            r"[^A-Z0-9_]", "", name.upper().replace(" ", "_").replace("-", "_")
        )
        if not yaml_key:
            yaml_key = re.sub(r"[^A-Z0-9_]", "", slug.upper().replace("-", "_"))
        if not yaml_key:
            # Nothing usable to key the entry by (e.g. a name made only of symbols).
            return None

        result: Dict[str, Any] = {
            "_yaml_key": yaml_key,
            "strain_gradient": round(gradient, 2),
            "classification": category,
            "thc_pct": round(float(thc_pct), 1) if thc_pct else 0,
        }
        if cbd_pct and float(cbd_pct) > 0.5:
            result["cbd_pct"] = round(float(cbd_pct), 1)
        if terpene_weights:
            result["terpene_weights"] = terpene_weights
        if effect_list:
            result["effects"] = effect_list
        return result
    except Exception as e:
        # Deliberate best-effort: one malformed entry must not abort the scrape.
        logger.debug("Failed to parse strain: %s", e)
        return None
# ═══════════════════════════════════════════════════════════════════════
# MAIN SCRAPER
# ═══════════════════════════════════════════════════════════════════════
# [docs] -- stray documentation-viewer link label (not code); commented out so it is not evaluated
# A plain (unquoted) YAML scalar is only emitted for identifier-like text.
_YAML_PLAIN_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_]*\Z")
# Identifier-like words that YAML loaders may still read as bool/null.
_YAML_RESERVED_WORDS = frozenset(
    {"y", "yes", "n", "no", "on", "off", "true", "false", "null"}
)


def _yaml_scalar(text: str) -> str:
    """Return ``text`` as a YAML scalar, double-quoting anything ambiguous.

    Keys such as ``1024`` or ``NO`` would otherwise be loaded as an int or a
    bool instead of a string.
    """
    if _YAML_PLAIN_RE.match(text) and text.lower() not in _YAML_RESERVED_WORDS:
        return text
    # A JSON string literal is also a valid double-quoted YAML scalar.
    return json.dumps(text)


def _render_strains_yaml(
    all_strains: Dict[str, Dict[str, Any]],
    total_expected: int,
) -> List[str]:
    """Build the output file as a list of lines (header, then one mapping per strain)."""
    rule = "# ======================================================================\n"
    lines = [
        rule,
        "# LEAFLY STRAIN DATABASE -- Scraped from leafly.com\n",
        f"# Total strains: {len(all_strains)}\n",
        f"# Expected: {total_expected}\n",
        "# Format compatible with terpene_profiles.yaml / TerpeneEngine\n",
        rule,
        "\n",
    ]
    if not all_strains:
        # An empty mapping rather than a bare "strains:" (which loads as null).
        lines.append("strains: {}\n")
        return lines

    lines.append("strains:\n\n")
    for key in sorted(all_strains):
        data = all_strains[key]
        classification = _yaml_scalar(str(data.get("classification", "hybrid")))
        # Nesting: strain key at 2 spaces, its fields at 4, terpenes at 6.
        lines.append(f"  {_yaml_scalar(key)}:\n")
        lines.append(f"    strain_gradient: {data.get('strain_gradient', 0.5)}\n")
        lines.append(f"    classification: {classification}\n")
        lines.append(f"    thc_pct: {data.get('thc_pct', 0)}\n")
        if "cbd_pct" in data:
            lines.append(f"    cbd_pct: {data['cbd_pct']}\n")
        if "terpene_weights" in data:
            lines.append("    terpene_weights:\n")
            by_weight = sorted(
                data["terpene_weights"].items(),
                key=lambda item: -item[1],
            )
            for tname, tweight in by_weight:
                lines.append(f"      {_yaml_scalar(str(tname))}: {tweight}\n")
        if "effects" in data:
            effects_str = json.dumps(data["effects"])
            lines.append(f"    effects: {effects_str}\n")
        lines.append("\n")
    return lines


def scrape_all_strains(
    max_pages: Optional[int] = None,
    output_path: str = "leafly_strains.yaml",
    page_delay: float = PAGE_DELAY_S,
) -> int:
    """Scrape Leafly listing pages and write every parsed strain to a YAML file.

    Pages ``?page=1, 2, ...`` of ``STRAINS_LIST_URL`` are fetched in order.
    For each page the embedded ``__NEXT_DATA__`` is decoded, its strain array
    is parsed with ``parse_listing_strain``, and the results are collected
    under their YAML key. Individual strain pages are never visited.

    A strain whose ``slug`` was already seen is skipped, so a repeated page
    contributes nothing. Distinct strains that map to the same YAML key get a
    numeric suffix (``KEY_2``, ``KEY_3``, ...). The loop stops at the page
    limit, or after three consecutive pages that added no new strain (fetch
    failure, missing data, empty list, or only duplicates).

    The output file is written even when nothing was scraped; in that case it
    contains an empty ``strains`` mapping.

    Args:
        max_pages: Maximum number of listing pages to fetch; ``None`` means no
            limit. ``0`` (or a negative value) fetches no pages.
        output_path: Path of the YAML file to write (overwritten if present).
        page_delay: Seconds to sleep between consecutive page fetches.

    Returns:
        The number of strains written.
    """
    all_strains: Dict[str, Dict[str, Any]] = {}
    seen_slugs: Set[str] = set()
    total_expected = 0
    consecutive_empty = 0
    page = 0
    logger.info("Starting Leafly strain scrape... 🌿")

    # The context manager closes the session's connections when the scrape ends.
    with _create_session() as session:
        while True:
            page += 1
            if max_pages is not None and page > max_pages:
                logger.info("Reached page limit (%d)", max_pages)
                break
            if page > 1:
                # Pause between pages only; never after the last one.
                time.sleep(page_delay)

            logger.info("Fetching page %d...", page)
            resp = _fetch_with_retry(session, f"{STRAINS_LIST_URL}?page={page}")

            strains: List[dict] = []
            if resp is None:
                stop_message = "3 consecutive failures, stopping at page %d"
            else:
                stop_message = "No more strains found, stopping at page %d"
                next_data = _extract_next_data(resp.text)
                if not next_data:
                    logger.warning("No __NEXT_DATA__ found on page %d", page)
                else:
                    if not total_expected:
                        # Usually available on page 1; retried later if that page failed.
                        total_expected = _extract_total_count(next_data)
                        if total_expected:
                            logger.info("Total strains on Leafly: %d", total_expected)
                    strains = _extract_strains_from_page(next_data)

            new_count = 0
            for raw_strain in strains:
                parsed = parse_listing_strain(raw_strain)
                if not parsed:
                    continue
                slug = raw_strain.get("slug") if isinstance(raw_strain, dict) else None
                if slug:
                    if slug in seen_slugs:
                        continue  # same strain listed again
                    seen_slugs.add(slug)
                key = parsed.pop("_yaml_key")
                if key in all_strains:
                    # Different strain, same key: append the first free suffix.
                    suffix = 2
                    while f"{key}_{suffix}" in all_strains:
                        suffix += 1
                    key = f"{key}_{suffix}"
                all_strains[key] = parsed
                new_count += 1

            if new_count == 0:
                consecutive_empty += 1
                if consecutive_empty >= 3:
                    logger.info(stop_message, page)
                    break
                continue

            consecutive_empty = 0
            logger.info(
                "Page %d: %d strains (%d new, %d total)",
                page,
                len(strains),
                new_count,
                len(all_strains),
            )

    # ── Write YAML ──
    logger.info("Writing %d strains to %s...", len(all_strains), output_path)
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(_render_strains_yaml(all_strains, total_expected))
    logger.info(
        "DONE 🔥 %d strains scraped. Output: %s",
        len(all_strains),
        output_path,
    )
    return len(all_strains)
# ═══════════════════════════════════════════════════════════════════════
# MERGE INTO TERPENE_PROFILES.YAML
# ═══════════════════════════════════════════════════════════════════════
# [docs] -- stray documentation-viewer link label (not code); commented out so it is not evaluated
def merge_into_terpene_profiles(
    leafly_yaml_path: str,
    terpene_profiles_path: str,
) -> int:
    """Merge scraped Leafly strains into ``terpene_profiles.yaml``.

    Both files are loaded with ``yaml.safe_load``. Every key under the scraped
    file's ``strains`` mapping that is not already present under the target's
    ``strains`` mapping is added; existing entries are never modified. A
    missing or null ``strains`` section in either file is treated as empty.

    The target file is rewritten only when at least one strain was added, and
    the document is serialized in memory first so that a serialization error
    cannot leave the target truncated.

    NOTE(review): rewriting goes through ``yaml.dump``, so comments and manual
    formatting in the target file are presumably not preserved -- confirm this
    is acceptable for the curated database.

    Args:
        leafly_yaml_path: Path of the scraped YAML file to read.
        terpene_profiles_path: Path of the YAML file to merge into.

    Returns:
        The number of strains added.
    """
    with open(terpene_profiles_path, "r", encoding="utf-8") as f:
        existing = yaml.safe_load(f) or {}
    with open(leafly_yaml_path, "r", encoding="utf-8") as f:
        scraped = yaml.safe_load(f) or {}

    # ``or {}`` also covers a key that is present but null ("strains:").
    existing_strains = existing.get("strains") or {}
    scraped_strains = scraped.get("strains") or {}

    additions = {
        key: data
        for key, data in scraped_strains.items()
        if key not in existing_strains
    }
    added = len(additions)

    if added:
        existing_strains.update(additions)
        existing["strains"] = existing_strains
        serialized = yaml.dump(
            existing,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
            width=120,
        )
        with open(terpene_profiles_path, "w", encoding="utf-8") as f:
            f.write(serialized)

    logger.info("Merged %d new strains into %s", added, terpene_profiles_path)
    return added
# ═══════════════════════════════════════════════════════════════════════
# CLI
# ═══════════════════════════════════════════════════════════════════════
# [docs] -- stray documentation-viewer link label (not code); commented out so it is not evaluated
def main():
    """Command-line entry point: scrape, then optionally merge.

    Recognized options are ``--pages`` (page limit), ``--output`` (YAML file
    to write), ``--merge`` (merge afterwards) and ``--delay`` (seconds between
    pages). The parsed values are passed to ``scrape_all_strains``.

    When ``--merge`` is given and the scrape produced at least one strain,
    ``merge_into_terpene_profiles`` is called with the output file and the
    ``terpene_profiles.yaml`` located in this module's directory. If that file
    does not exist, a warning is logged and the merge is skipped.
    """
    cli = argparse.ArgumentParser(
        description="Scrape ALL strains from Leafly into YAML",
    )
    cli.add_argument(
        "--pages",
        type=int,
        default=None,
        help="Max listing pages to scrape (default: all ~500 pages)",
    )
    cli.add_argument(
        "--output",
        type=str,
        default="leafly_strains.yaml",
        help="Output YAML file path (default: leafly_strains.yaml)",
    )
    cli.add_argument(
        "--merge",
        action="store_true",
        help="After scraping, merge new strains into terpene_profiles.yaml",
    )
    cli.add_argument(
        "--delay",
        type=float,
        default=PAGE_DELAY_S,
        help=f"Delay between pages in seconds (default: {PAGE_DELAY_S})",
    )
    opts = cli.parse_args()

    scraped_total = scrape_all_strains(
        max_pages=opts.pages,
        output_path=opts.output,
        page_delay=opts.delay,
    )

    # Nothing more to do unless a merge was requested and there is data to merge.
    if not opts.merge or scraped_total <= 0:
        return

    module_dir = os.path.dirname(os.path.abspath(__file__))
    terp_path = os.path.join(module_dir, "terpene_profiles.yaml")
    if not os.path.exists(terp_path):
        logger.warning("terpene_profiles.yaml not found, skipping merge")
        return
    merge_into_terpene_profiles(opts.output, terp_path)


if __name__ == "__main__":
    main()