"""
Yelp Business Details Scraper

Usage:
    python3 yelp_business.py "https://www.yelp.com/biz/raw-sugar-factory-san-francisco"
    python3 yelp_business.py "https://www.yelp.com/biz/raw-sugar-factory-san-francisco" -o details.json

Proxy: Set PROXY_URL in .env file (e.g., PROXY_URL=http://user:pass@host:port)
"""

import argparse
import json
import os
import random
import re
import time
from html import unescape
from pathlib import Path

from bs4 import BeautifulSoup
from curl_cffi import CurlOpt, requests
from curl_cffi.requests.exceptions import RequestException
from dotenv import load_dotenv

# Load .env from the project directory so PROXY_URL is set before any request is made.
load_dotenv(Path(__file__).parent / ".env")


# Extra headers sent with every page fetch; a Google referer makes the visit
# look like organic search traffic.
BROWSER_HEADERS = {
    "Referer": "https://www.google.com/",
}

# Browser fingerprints cycled by _request_with_retry when a request is blocked
# (403/429) or fails at the network level.
RETRY_IMPERSONATIONS = ["safari2601", "safari184", "safari260", "chrome133a", "safari170"]


def _get_proxy() -> str | None:
    """Get proxy URL from environment."""
    return os.environ.get("PROXY_URL")


def _get_session(impersonate: str | None = None) -> requests.Session:
    """Build a curl_cffi session with a browser TLS fingerprint and optional proxy.

    Defaults to the newest Safari fingerprint when *impersonate* is not given.
    """
    fingerprint = impersonate if impersonate else "safari2601"
    # TCP keep-alive plus a DNS cache keep repeated requests on warm connections.
    keepalive_options = {
        CurlOpt.TCP_KEEPALIVE: 1,
        CurlOpt.TCP_KEEPIDLE: 60,
        CurlOpt.TCP_KEEPINTVL: 30,
        CurlOpt.DNS_CACHE_TIMEOUT: 300,
    }
    return requests.Session(
        impersonate=fingerprint,
        proxy=_get_proxy(),
        timeout=(10, 30),
        curl_options=keepalive_options,
    )


def _request_with_retry(session, method, url, max_retries=5, **kwargs):
    """Issue an HTTP request with retries on network errors and 403/429 responses.

    Each retry waits with linear backoff plus random jitter and switches to the
    next fingerprint in RETRY_IMPERSONATIONS. Returns the last response (or None
    when max_retries <= 0); the final network error is re-raised.
    """
    response = None
    n_fingerprints = len(RETRY_IMPERSONATIONS)
    for attempt in range(max_retries):
        is_last = attempt == max_retries - 1
        fingerprint = RETRY_IMPERSONATIONS[attempt % n_fingerprints]
        try:
            response = getattr(session, method)(url, impersonate=fingerprint, **kwargs)
        except RequestException as err:
            if is_last:
                raise
            delay = (attempt + 1) * 3 + random.uniform(1, 3)
            print(f"  Network error ({err}), retrying in {delay:.1f}s (attempt {attempt + 1}/{max_retries})...")
            time.sleep(delay)
            continue

        status = response.status_code
        if status in (200, 201):
            return response

        if status in (403, 429) and not is_last:
            delay = (attempt + 1) * 3 + random.uniform(1, 3)
            upcoming = RETRY_IMPERSONATIONS[(attempt + 1) % n_fingerprints]
            print(f"  HTTP {status}, retrying in {delay:.1f}s as {upcoming} (attempt {attempt + 1}/{max_retries})...")
            time.sleep(delay)
            continue

        # Non-retryable status (or retries exhausted): hand back the response.
        return response

    return response


def _fetch_biz_page(url: str) -> BeautifulSoup:
    """Download a Yelp business page and return parsed HTML, rejecting captchas."""
    session = _get_session("safari2601")
    resp = _request_with_retry(session, "get", url, headers=BROWSER_HEADERS)
    if resp.status_code != 200:
        raise RuntimeError(f"Failed to fetch {url} (HTTP {resp.status_code})")

    parsed = BeautifulSoup(resp.text, "html.parser")

    # A genuine business page embeds a yelp-biz-id meta tag; a short page
    # lacking it is assumed to be a captcha or redirect interstitial.
    biz_meta = parsed.find("meta", attrs={"name": "yelp-biz-id"})
    if not biz_meta and len(resp.text) < 50000:
        raise RuntimeError(f"Got blocked — page is a captcha/redirect ({len(resp.text)} bytes)")

    return parsed


def _save_json(data: dict | list, filepath: str):
    """Save data to JSON."""
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"Saved to {filepath}")


# ─── Business-specific logic ───────────────────────────────────────────────────


def _parse_apollo_cache(soup: BeautifulSoup) -> dict | None:
    """Extract and parse the Apollo state cache from page HTML."""
    for tag in soup.find_all("script", type="application/json"):
        text = tag.string or ""
        if len(text) > 50000 and "ROOT_QUERY" in text:
            clean = unescape(text).strip()
            if clean.startswith("<!--"):
                clean = clean[4:]
            if clean.endswith("-->"):
                clean = clean[:-3]
            return json.loads(clean.strip())
    return None


def _resolve_ref(cache: dict, ref_or_val):
    """Resolve a {'__ref': 'Key'} reference to the actual cache entity."""
    if isinstance(ref_or_val, dict) and "__ref" in ref_or_val:
        return cache.get(ref_or_val["__ref"], {})
    return ref_or_val


def _find_biz_entity(cache: dict, enc_biz_id: str) -> dict | None:
    """Find the Business entity in the cache."""
    key = f"Business:{enc_biz_id}"
    if key in cache:
        return cache[key]
    # Fallback: search for any Business: key
    for k, v in cache.items():
        if k.startswith("Business:") and isinstance(v, dict) and v.get("name"):
            return v
    return None


def _find_location_entity(cache: dict, enc_biz_id: str) -> dict | None:
    """Find the BusinessLocation entity in the cache."""
    key = f"BusinessLocation:{enc_biz_id}"
    if key in cache:
        return cache[key]
    for k, v in cache.items():
        if k.startswith("BusinessLocation:") and isinstance(v, dict):
            return v
    return None


def _extract_coords(biz: dict) -> dict | None:
    """Extract coordinates from the map image URL."""
    for key in biz:
        if key.startswith("map("):
            val = biz[key]
            if isinstance(val, dict):
                src = val.get("src", "")
                match = re.search(r"center=([\d.-]+)%2C([\d.-]+)", src)
                if match:
                    return {"latitude": float(match.group(1)), "longitude": float(match.group(2))}
    return None


def _extract_hours(biz: dict) -> list[dict]:
    """Extract operating hours from the business entity."""
    op = biz.get("operationHours")
    if not op:
        return []
    weekly = op.get("regularHoursMergedWithSpecialHoursForCurrentWeek", [])
    hours = []
    for day in weekly:
        day_hours = day.get("regularHours", [])
        is_open = bool(day_hours) and day_hours != ["Closed"]
        hours.append({
            "day": day.get("dayOfWeekShort", ""),
            "hours": day_hours,
            "is_open": is_open,
        })
    return hours


def _extract_categories(biz: dict, cache: dict) -> list[dict]:
    """Return the business categories as {title, alias} dicts, resolving Apollo refs."""
    result = []
    for raw in biz.get("categories", []):
        cat = _resolve_ref(cache, raw)
        if not isinstance(cat, dict):
            continue
        result.append({
            "title": cat.get("title", ""),
            # Some entities carry only an encid instead of an alias.
            "alias": cat.get("alias", cat.get("encid", "")),
        })
    return result


def _extract_attributes(biz: dict) -> list[dict]:
    """Extract business attributes/amenities."""
    attrs = []
    for key in biz:
        if key.startswith("organizedProperties("):
            props_list = biz[key]
            if isinstance(props_list, list):
                for group in props_list:
                    if isinstance(group, dict):
                        for prop in group.get("properties", []):
                            attrs.append({
                                "name": prop.get("displayText", ""),
                                "alias": prop.get("alias", ""),
                                "is_active": prop.get("isActive", False),
                            })
    return attrs


def _extract_photos(biz: dict, cache: dict, limit: int = 10) -> list[dict]:
    """Extract business photos from the media entity."""
    photos = []

    # Photos live under biz["media"]["orderedMediaItems(...)"]
    media = biz.get("media", {})
    if not isinstance(media, dict):
        return photos

    for key in media:
        if "orderedMediaItems" not in key:
            continue
        container = media[key]
        if not isinstance(container, dict):
            continue
        edges = container.get("edges", [])
        for edge in edges:
            node = _resolve_ref(cache, edge.get("node", {}))
            if not isinstance(node, dict):
                continue

            # Skip non-photo items (e.g. BusinessVideo)
            if node.get("__typename") != "BusinessPhoto":
                continue

            # Extract photo URL — keys like url({"size":"LARGE"})
            photo_url = ""
            photo_url_obj = node.get("photoUrl", {})
            if isinstance(photo_url_obj, dict):
                for url_key, url_val in photo_url_obj.items():
                    if isinstance(url_val, str) and url_val.startswith("http"):
                        if "LARGE" in url_key or "ORIGINAL" in url_key:
                            photo_url = url_val
                            break
                # Fallback to any URL
                if not photo_url:
                    for url_val in photo_url_obj.values():
                        if isinstance(url_val, str) and url_val.startswith("http"):
                            photo_url = url_val
                            break

            photos.append({
                "id": node.get("encid", ""),
                "caption": node.get("caption") or "",
                "url": photo_url,
            })

            if len(photos) >= limit:
                break
    return photos


def scrape_business_details(url: str) -> dict:
    """Scrape full business details from a Yelp business page."""
    print(f"Proxy: {'enabled' if _get_proxy() else 'disabled (set PROXY_URL in .env)'}")
    print(f"Fetching: {url}")
    soup = _fetch_biz_page(url)

    # The encrypted business id lives in a meta tag on the page.
    meta = soup.find("meta", attrs={"name": "yelp-biz-id"})
    if not meta:
        raise RuntimeError("Could not find yelp-biz-id meta tag.")
    enc_biz_id = str(meta["content"])

    # All structured data is embedded as an Apollo GraphQL state cache.
    cache = _parse_apollo_cache(soup)
    if not cache:
        raise RuntimeError("Could not find Apollo cache in page HTML.")

    biz = _find_biz_entity(cache, enc_biz_id)
    if not biz:
        raise RuntimeError(f"Could not find Business entity for {enc_biz_id}")

    loc = _find_location_entity(cache, enc_biz_id) or {}
    loc_is_dict = isinstance(loc, dict)
    address = loc.get("address", {}) if loc_is_dict else {}

    # Rating is stored under a parameterized key such as "rating(...)".
    rating = next((biz[k] for k in biz if k.startswith("rating(")), None)

    # External resources (website / menu links), guarded against null values.
    ext = biz.get("externalResources", {}) or {}
    website = (ext.get("website") or {}).get("url", "")
    menu_url = (ext.get("menu") or {}).get("url", "")

    phone_obj = biz.get("phoneNumber", {}) or {}
    price_obj = biz.get("priceRange", {}) or {}

    details = {
        "biz_id": enc_biz_id,
        "name": biz.get("name", ""),
        "alias": biz.get("alias", ""),
        "url": f"https://www.yelp.com/biz/{biz.get('alias', '')}",
        "is_closed": biz.get("isClosed", False),
        "rating": rating,
        "review_count": biz.get("reviewCount"),
        "price_range": price_obj.get("display", ""),
        "phone": phone_obj.get("formatted", ""),
        "address": {
            "line1": address.get("addressLine1", ""),
            "line2": address.get("addressLine2", ""),
            "city": address.get("city", ""),
            "state": address.get("regionCode", ""),
            "postal_code": address.get("postalCode", ""),
            "country": (loc.get("country") or {}).get("code", "") if loc_is_dict else "",
            "formatted": address.get("formatted", ""),
        },
        "neighborhoods": loc.get("neighborhoods", []) if loc_is_dict else [],
        "coordinates": _extract_coords(biz),
        "categories": _extract_categories(biz, cache),
        "hours": _extract_hours(biz),
        "website": website,
        "menu_url": menu_url,
        "specialties": biz.get("specialties", ""),
        "year_established": (biz.get("history") or {}).get("yearEstablished", ""),
        "attributes": _extract_attributes(biz),
        "photos": _extract_photos(biz, cache),
    }

    print(f"Extracted details for: {details['name']}")
    return details


def main():
    """CLI entry point: scrape one business URL and write the result to JSON."""
    arg_parser = argparse.ArgumentParser(description="Scrape Yelp business details")
    arg_parser.add_argument("url", help="Yelp business page URL")
    arg_parser.add_argument("--output", "-o", help="Output file path")
    args = arg_parser.parse_args()

    details = scrape_business_details(args.url)

    # Default output file name is derived from the URL slug.
    slug = args.url.rstrip("/").split("/")[-1]
    destination = args.output if args.output else f"{slug}_details.json"
    _save_json(details, destination)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
