"""Leafly Strain Scraper -- Harvest ALL strains into terpene_profiles.yaml.
Extracts strain data directly from Leafly's __NEXT_DATA__ JSON embedded
in listing pages. Each listing page contains ~18 strains with full
terpene profiles, effects, cannabinoids, and metadata.
Total: ~9000 strains across ~500 pages.
# 💀🔥 scraping the entire weed bible 🌿
#
# Usage:
# python scrape_leafly.py # scrape ALL strains
# python scrape_leafly.py --pages 5 # first 5 pages only
# python scrape_leafly.py --merge # merge into terpene_profiles.yaml
# python scrape_leafly.py --output my_strains.yaml
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import re
import sys
import time
from typing import Any, Dict, List, Optional, Set, Tuple
import requests
import yaml
# Root logging config: timestamped INFO-level messages for scrape progress.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
logger = logging.getLogger(__name__)
# ═══════════════════════════════════════════════════════════════════════
# CONFIG
# ═══════════════════════════════════════════════════════════════════════
LEAFLY_BASE = "https://www.leafly.com"
STRAINS_LIST_URL = f"{LEAFLY_BASE}/strains"
# Desktop-Chrome user agent so the site serves the normal HTML listing page.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/131.0.0.0 Safari/537.36"
)
# Browser-like request headers applied to every session request.
HEADERS = {
    "User-Agent": USER_AGENT,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
}
# Rate limiting
PAGE_DELAY_S = 1.5  # delay between listing pages (seconds)
MAX_RETRIES = 3  # attempts per URL in _fetch_with_retry
RETRY_BACKOFF = 2.0  # exponential backoff base (seconds)
# 🌿 Terpene name normalization: lowercase, hyphenated labels as they appear
# on Leafly -> canonical UPPER_SNAKE keys. Isomer variants (alpha-/beta-/d-)
# collapse onto a single canonical terpene name.
TERPENE_NORMALIZE = {
    "myrcene": "MYRCENE",
    "beta-myrcene": "MYRCENE",
    "b-myrcene": "MYRCENE",
    "limonene": "LIMONENE",
    "d-limonene": "LIMONENE",
    "linalool": "LINALOOL",
    "caryophyllene": "CARYOPHYLLENE",
    "beta-caryophyllene": "CARYOPHYLLENE",
    "b-caryophyllene": "CARYOPHYLLENE",
    "pinene": "PINENE",
    "alpha-pinene": "PINENE",
    "a-pinene": "PINENE",
    "beta-pinene": "PINENE",
    "humulene": "HUMULENE",
    "alpha-humulene": "HUMULENE",
    "a-humulene": "HUMULENE",
    "terpinolene": "TERPINOLENE",
    "bisabolol": "BISABOLOL",
    "alpha-bisabolol": "BISABOLOL",
    "a-bisabolol": "BISABOLOL",
    "valencene": "VALENCENE",
    "eucalyptol": "EUCALYPTOL",
    "geraniol": "GERANIOL",
    "phytol": "PHYTOL",
    "camphene": "CAMPHENE",
    "borneol": "BORNEOL",
    "ocimene": "OCIMENE",
    "nerolidol": "NEROLIDOL",
    "trans-nerolidol": "NEROLIDOL",
    "guaiol": "GUAIOL",
    "carene": "CARENE",
    "delta-3-carene": "CARENE",
    "terpineol": "TERPINEOL",
    "alpha-terpineol": "TERPINEOL",
    "phellandrene": "PHELLANDRENE",
    "sabinene": "SABINENE",
    "cymene": "CYMENE",
    "p-cymene": "CYMENE",
    "fenchol": "FENCHOL",
    "pulegone": "PULEGONE",
    "cedrene": "CEDRENE",
    "isopulegol": "ISOPULEGOL",
    "maltol": "MALTOL",
}
# Classification -> strain_gradient base (higher = more sativa-leaning:
# sativa 0.85, indica 0.15, hybrid 0.50).
CLASSIFICATION_GRADIENT = {
    "sativa": 0.85,
    "indica": 0.15,
    "hybrid": 0.50,
}
# ═══════════════════════════════════════════════════════════════════════
# HTTP HELPERS
# ═══════════════════════════════════════════════════════════════════════
def _create_session() -> requests.Session:
    """Build a requests session pre-loaded with browser-like headers."""
    session = requests.Session()
    session.headers.update(HEADERS)
    return session
def _fetch_with_retry(
    session: requests.Session,
    url: str,
    max_retries: int = MAX_RETRIES,
) -> Optional[requests.Response]:
    """GET *url* with retries.

    Returns the response on HTTP 200, or ``None`` on a 404 or after
    *max_retries* failed attempts.  A 429 triggers an extended wait
    before the next attempt; other failures back off exponentially.
    """
    for attempt in range(max_retries):
        try:
            resp = session.get(url, timeout=20)
            if resp.status_code == 200:
                return resp
            if resp.status_code == 429:
                # Rate limited: wait substantially longer than normal backoff.
                wait = RETRY_BACKOFF ** (attempt + 1) * 3
                logger.warning("Rate limited, waiting %.1fs...", wait)
                time.sleep(wait)
                continue
            if resp.status_code == 404:
                return None  # page does not exist; retrying won't help
            logger.warning(
                "HTTP %d for %s (attempt %d/%d)",
                resp.status_code, url, attempt + 1, max_retries,
            )
            # FIX: back off before retrying other HTTP errors (e.g. 5xx) too;
            # the original looped back immediately, hammering the server.
            time.sleep(RETRY_BACKOFF ** attempt)
        except requests.RequestException as e:
            logger.warning("Request error: %s (attempt %d)", e, attempt + 1)
            time.sleep(RETRY_BACKOFF ** attempt)
    return None
def _normalize_terpene(name: str) -> str:
    """Map a raw terpene label onto its canonical UPPER_SNAKE key."""
    key = name.strip().lower().replace(" ", "-")
    if key in TERPENE_NORMALIZE:
        return TERPENE_NORMALIZE[key]
    # Unknown terpene: synthesize an UPPER_SNAKE key from the raw name.
    return name.upper().replace("-", "_").replace(" ", "_")
# ═══════════════════════════════════════════════════════════════════════
# __NEXT_DATA__ EXTRACTION
# ═══════════════════════════════════════════════════════════════════════
def _extract_next_data(html: str) -> Optional[dict]:
"""Extract __NEXT_DATA__ JSON from an HTML page."""
match = re.search(
r'<script\s+id="__NEXT_DATA__"\s+type="application/json">(.*?)</script>',
html, re.DOTALL,
)
if not match:
return None
try:
return json.loads(match.group(1))
except json.JSONDecodeError:
return None
def _extract_strains_from_page(next_data: dict) -> List[dict]:
"""Extract strain list from __NEXT_DATA__ pageProps."""
try:
# Path: props.pageProps.data.strains
page_props = next_data.get("props", {}).get("pageProps", {})
data = page_props.get("data", {})
strains = data.get("strains", [])
if isinstance(strains, list):
return strains
except Exception:
pass
return []
def _extract_total_count(next_data: dict) -> int:
"""Get total strain count from metadata."""
try:
page_props = next_data.get("props", {}).get("pageProps", {})
metadata = page_props.get("data", {}).get("metadata", {})
return int(metadata.get("totalCount", 0))
except Exception:
return 0
# ═══════════════════════════════════════════════════════════════════════
# STRAIN DATA PARSING
# ═══════════════════════════════════════════════════════════════════════
# Effect names used to nudge strain_gradient toward sativa (higher) or
# indica (lower).  Module-level frozensets so they are built once, not
# rebuilt for each of ~9000 strains.
_SATIVA_EFFECTS = frozenset({
    "energetic", "focused", "creative", "uplifted",
    "euphoric", "happy", "talkative", "giggly",
})
_INDICA_EFFECTS = frozenset({
    "relaxed", "sleepy", "hungry", "sedated",
    "calm", "aroused", "tingly",
})


def _parse_terpene_weights(terps_raw: Any) -> Dict[str, float]:
    """Convert Leafly's ``terps`` payload into normalized terpene weights.

    Dict form ({terpene_name: {score: float}}) is normalized so weights
    sum to ~1.0 (uniform fallback when every score is 0).  List form
    (names or {name, score} objects) keeps raw scores.
    """
    weights: Dict[str, float] = {}
    if isinstance(terps_raw, dict):
        total_score = 0.0
        scored: List[Tuple[str, float]] = []
        for tname, tdata in terps_raw.items():
            score = 1.0
            if isinstance(tdata, dict):
                score = float(tdata.get("score", 1.0) or 1.0)
            elif isinstance(tdata, (int, float)):
                score = float(tdata)
            scored.append((_normalize_terpene(tname), score))
            total_score += score
        if total_score > 0:
            for tname, tscore in scored:
                weights[tname] = round(tscore / total_score, 4)
        elif scored:
            # All scores were zero: fall back to a uniform distribution.
            for tname, _ in scored:
                weights[tname] = round(1.0 / len(scored), 4)
    elif isinstance(terps_raw, list):
        for t in terps_raw:
            if isinstance(t, str):
                weights[_normalize_terpene(t)] = 1.0
            elif isinstance(t, dict):
                tname = t.get("name", "")
                tscore = float(t.get("score", 1.0) or 1.0)
                if tname:
                    weights[_normalize_terpene(tname)] = tscore
    return weights


def _parse_effects(effects_raw: Any) -> List[str]:
    """Extract up to 5 lowercased effect names, strongest first."""
    effects: List[str] = []
    if isinstance(effects_raw, dict):
        # Sort by score descending, take top 5.
        sorted_effects = sorted(
            effects_raw.items(),
            key=lambda item: (
                item[1].get("score", 0)
                if isinstance(item[1], dict) else float(item[1] or 0)
            ),
            reverse=True,
        )
        for ename, _ in sorted_effects[:5]:
            effects.append(ename.lower())
    elif isinstance(effects_raw, list):
        for e in effects_raw[:5]:
            if isinstance(e, str):
                effects.append(e.lower())
            elif isinstance(e, dict):
                effects.append(e.get("name", "").lower())
    return effects


def parse_listing_strain(raw: dict) -> Optional[Dict[str, Any]]:
    """Parse a single strain from listing page __NEXT_DATA__.

    Each strain object in the listing contains:
      - slug, name, category
      - terps: {terpene_name: {score: float}}
      - effects: {effect_name: {score: float}}
      - cannabinoids: {thc: {percentile50: float}, ...}

    Returns a dict carrying a ``_yaml_key`` plus strain fields, or
    ``None`` when the record is unusable.
    """
    try:
        slug = raw.get("slug", "")
        # FIX: use `or` instead of .get's default so a present-but-null
        # "name" falls back to a title-cased slug instead of aborting.
        name = raw.get("name") or slug.replace("-", " ").title()
        category = (raw.get("category") or "Hybrid").lower()
        if not slug and not name:
            return None
        # ── Cannabinoids (median percentiles; null-safe) ──
        cannabinoids = raw.get("cannabinoids", {}) or {}
        thc_pct = (cannabinoids.get("thc", {}) or {}).get("percentile50", 0) or 0
        cbd_pct = (cannabinoids.get("cbd", {}) or {}).get("percentile50", 0) or 0
        # ── Terpenes & effects ──
        terpene_weights = _parse_terpene_weights(raw.get("terps", {}) or {})
        effect_list = _parse_effects(raw.get("effects", {}) or {})
        # ── Strain gradient: classification base, refined by effects ──
        base_gradient = CLASSIFICATION_GRADIENT.get(category, 0.50)
        s_score = sum(1 for e in effect_list if e in _SATIVA_EFFECTS)
        i_score = sum(1 for e in effect_list if e in _INDICA_EFFECTS)
        if s_score + i_score > 0:
            # Bias in [-0.5, 0.5], scaled to at most ±0.1 gradient shift.
            bias = (s_score - i_score) / ((s_score + i_score) * 2)
            base_gradient = max(0.02, min(0.98, base_gradient + bias * 0.2))
        # ── YAML key: UPPER_SNAKE of the name, slug as fallback ──
        yaml_key = name.upper().replace(" ", "_").replace("-", "_").replace("'", "")
        yaml_key = re.sub(r"[^A-Z0-9_]", "", yaml_key)
        if not yaml_key:
            yaml_key = slug.upper().replace("-", "_")
        result: Dict[str, Any] = {
            "_yaml_key": yaml_key,
            "strain_gradient": round(base_gradient, 2),
            "classification": category,
            "thc_pct": round(float(thc_pct), 1) if thc_pct else 0,
        }
        # CBD only recorded when meaningfully present (> 0.5%).
        if cbd_pct and float(cbd_pct) > 0.5:
            result["cbd_pct"] = round(float(cbd_pct), 1)
        if terpene_weights:
            result["terpene_weights"] = terpene_weights
        if effect_list:
            result["effects"] = effect_list
        return result
    except Exception as e:
        logger.debug("Failed to parse strain: %s", e)
        return None
# ═══════════════════════════════════════════════════════════════════════
# MAIN SCRAPER
# ═══════════════════════════════════════════════════════════════════════
def _write_strains_yaml(
    all_strains: Dict[str, Dict[str, Any]],
    total_expected: int,
    output_path: str,
) -> None:
    """Serialize scraped strains to *output_path* in terpene_profiles.yaml format."""
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("# ======================================================================\n")
        f.write("# LEAFLY STRAIN DATABASE -- Scraped from leafly.com\n")
        f.write(f"# Total strains: {len(all_strains)}\n")
        f.write(f"# Expected: {total_expected}\n")
        f.write("# Format compatible with terpene_profiles.yaml / TerpeneEngine\n")
        f.write("# ======================================================================\n\n")
        f.write("strains:\n\n")
        for key in sorted(all_strains.keys()):
            data = all_strains[key]
            f.write(f"  {key}:\n")
            f.write(f"    strain_gradient: {data.get('strain_gradient', 0.5)}\n")
            f.write(f"    classification: {data.get('classification', 'hybrid')}\n")
            f.write(f"    thc_pct: {data.get('thc_pct', 0)}\n")
            if "cbd_pct" in data:
                f.write(f"    cbd_pct: {data['cbd_pct']}\n")
            if "terpene_weights" in data:
                f.write("    terpene_weights:\n")
                # Heaviest terpenes first for readability.
                for tname, tweight in sorted(
                    data["terpene_weights"].items(),
                    key=lambda x: -x[1],
                ):
                    f.write(f"      {tname}: {tweight}\n")
            if "effects" in data:
                # JSON list literal is valid YAML flow-sequence syntax.
                effects_str = json.dumps(data["effects"])
                f.write(f"    effects: {effects_str}\n")
            f.write("\n")


def scrape_all_strains(
    max_pages: Optional[int] = None,
    output_path: str = "leafly_strains.yaml",
    page_delay: float = PAGE_DELAY_S,
) -> int:
    """Scrape all Leafly strains from listing pages into a YAML file.

    Each listing page's __NEXT_DATA__ contains ~18 strains with terpene
    profiles, effects, and cannabinoid data, so individual strain pages
    are never fetched.  Stops at *max_pages* (when given) or after three
    consecutive failed/empty pages.

    Returns the number of strains written.
    """
    session = _create_session()
    all_strains: Dict[str, Dict[str, Any]] = {}
    seen_keys: Set[str] = set()
    total_expected = 0
    page = 1
    consecutive_empty = 0
    logger.info("Starting Leafly strain scrape... 🌿")
    while True:
        if max_pages and page > max_pages:
            logger.info("Reached page limit (%d)", max_pages)
            break
        url = f"{STRAINS_LIST_URL}?page={page}"
        logger.info("Fetching page %d...", page)
        resp = _fetch_with_retry(session, url)
        if not resp:
            consecutive_empty += 1
            if consecutive_empty >= 3:
                logger.info("3 consecutive failures, stopping at page %d", page)
                break
            page += 1
            time.sleep(page_delay)
            continue
        next_data = _extract_next_data(resp.text)
        if not next_data:
            logger.warning("No __NEXT_DATA__ found on page %d", page)
            consecutive_empty += 1
            if consecutive_empty >= 3:
                break
            page += 1
            time.sleep(page_delay)
            continue
        if page == 1:
            # Site-wide total is only read once, from the first page.
            total_expected = _extract_total_count(next_data)
            logger.info("Total strains on Leafly: %d", total_expected)
        strains = _extract_strains_from_page(next_data)
        if not strains:
            consecutive_empty += 1
            if consecutive_empty >= 3:
                logger.info("No more strains found, stopping at page %d", page)
                break
            page += 1
            time.sleep(page_delay)
            continue
        consecutive_empty = 0
        new_count = 0
        for raw_strain in strains:
            parsed = parse_listing_strain(raw_strain)
            if not parsed:
                continue
            key = parsed.pop("_yaml_key")
            if key in seen_keys:
                # Duplicate name: disambiguate with a numeric suffix (_2, _3, ...).
                suffix = 2
                while f"{key}_{suffix}" in seen_keys:
                    suffix += 1
                key = f"{key}_{suffix}"
            seen_keys.add(key)
            all_strains[key] = parsed
            new_count += 1
        logger.info(
            "Page %d: %d strains (%d new, %d total)",
            page, len(strains), new_count, len(all_strains),
        )
        page += 1
        time.sleep(page_delay)
    # ── Write YAML ──
    logger.info("Writing %d strains to %s...", len(all_strains), output_path)
    _write_strains_yaml(all_strains, total_expected, output_path)
    logger.info(
        "DONE 🔥 %d strains scraped. Output: %s",
        len(all_strains), output_path,
    )
    return len(all_strains)
# ═══════════════════════════════════════════════════════════════════════
# MERGE INTO TERPENE_PROFILES.YAML
# ═══════════════════════════════════════════════════════════════════════
def merge_into_terpene_profiles(
    leafly_yaml_path: str,
    terpene_profiles_path: str,
) -> int:
    """Merge scraped Leafly strains into terpene_profiles.yaml.

    Only strains whose key is absent from the curated database are
    added; existing entries are never overwritten.

    Returns the count of new strains added.
    """
    with open(terpene_profiles_path, "r", encoding="utf-8") as f:
        existing = yaml.safe_load(f) or {}
    # FIX: a bare "strains:" key loads as None; the original would then
    # crash on .keys().  Coerce to an empty dict in both files.
    existing_strains = existing.get("strains", {}) or {}
    with open(leafly_yaml_path, "r", encoding="utf-8") as f:
        scraped = yaml.safe_load(f) or {}
    scraped_strains = scraped.get("strains", {}) or {}
    added = 0
    for key, data in scraped_strains.items():
        if key not in existing_strains:
            existing_strains[key] = data
            added += 1
    existing["strains"] = existing_strains
    with open(terpene_profiles_path, "w", encoding="utf-8") as f:
        yaml.dump(
            existing, f,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
            width=120,
        )
    logger.info("Merged %d new strains into %s", added, terpene_profiles_path)
    return added
# ═══════════════════════════════════════════════════════════════════════
# CLI
# ═══════════════════════════════════════════════════════════════════════
def main():
    """CLI entry point: parse arguments, run the scraper, optionally merge."""
    parser = argparse.ArgumentParser(
        description="Scrape ALL strains from Leafly into YAML",
    )
    parser.add_argument(
        "--pages", type=int, default=None,
        help="Max listing pages to scrape (default: all ~500 pages)",
    )
    parser.add_argument(
        "--output", type=str, default="leafly_strains.yaml",
        help="Output YAML file path (default: leafly_strains.yaml)",
    )
    parser.add_argument(
        "--merge", action="store_true",
        help="After scraping, merge new strains into terpene_profiles.yaml",
    )
    parser.add_argument(
        "--delay", type=float, default=PAGE_DELAY_S,
        help=f"Delay between pages in seconds (default: {PAGE_DELAY_S})",
    )
    args = parser.parse_args()
    count = scrape_all_strains(
        max_pages=args.pages,
        output_path=args.output,
        page_delay=args.delay,
    )
    if args.merge and count > 0:
        # terpene_profiles.yaml is expected to live next to this script.
        terp_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "terpene_profiles.yaml",
        )
        if os.path.exists(terp_path):
            merge_into_terpene_profiles(args.output, terp_path)
        else:
            logger.warning("terpene_profiles.yaml not found, skipping merge")


if __name__ == "__main__":
    main()