"""
Yelp Search Scraper

Usage:
    python3 yelp_search.py "pizza" "San Francisco, CA"
    python3 yelp_search.py "pizza" "San Francisco, CA" --max-pages 3
    python3 yelp_search.py "pizza" "San Francisco, CA" --format csv

Proxy: Set PROXY_URL in .env file (e.g., PROXY_URL=http://user:pass@host:port)
"""

import argparse
import csv
import json
import os
import re
import sys
import time
import random
from html import unescape
from pathlib import Path
from urllib.parse import quote_plus, unquote

from bs4 import BeautifulSoup
from curl_cffi import CurlOpt, requests
from curl_cffi.requests.exceptions import RequestException
from dotenv import load_dotenv

# Load .env from the project directory
load_dotenv(Path(__file__).parent / ".env")


SEARCH_URL = "https://www.yelp.com/search"
# Yelp paginates search results via a `start` offset in steps of 10
# (see _build_search_url / scrape_search).
RESULTS_PER_PAGE = 10

# Base request headers. The Google referer makes the first request look like a
# click-through from a search result; during pagination the Referer is
# overridden with the previous page's URL.
BROWSER_HEADERS = {
    "Referer": "https://www.google.com/",
}

# Browser TLS fingerprints cycled through when a request is retried
# (see _request_with_retry).
RETRY_IMPERSONATIONS = ["safari2601", "safari184", "safari260", "chrome133a", "safari170"]


def _get_proxy() -> str | None:
    """Get proxy URL from environment."""
    return os.environ.get("PROXY_URL")


def _get_session(impersonate: str | None = None) -> requests.Session:
    """Build a curl_cffi session impersonating a browser, with the optional
    PROXY_URL proxy and keep-alive/DNS-cache tuning for long-lived reuse.
    """
    # Keep TCP connections warm and cache DNS so one session can serve
    # many sequential page fetches.
    transport_options = {
        CurlOpt.TCP_KEEPALIVE: 1,
        CurlOpt.TCP_KEEPIDLE: 60,
        CurlOpt.TCP_KEEPINTVL: 30,
        CurlOpt.DNS_CACHE_TIMEOUT: 300,
    }
    return requests.Session(
        impersonate=impersonate or "safari2601",
        proxy=_get_proxy(),
        timeout=(10, 30),
        curl_options=transport_options,
    )


def _request_with_retry(session, method, url, max_retries=5, **kwargs):
    """Make a request with automatic retry on 403/429/network errors, cycling impersonation."""
    resp = None
    for attempt in range(max_retries):
        imp = RETRY_IMPERSONATIONS[attempt % len(RETRY_IMPERSONATIONS)]
        try:
            resp = getattr(session, method)(url, impersonate=imp, **kwargs)
        except RequestException as e:
            if attempt < max_retries - 1:
                wait = (attempt + 1) * 3 + random.uniform(1, 3)
                print(f"  Network error ({e}), retrying in {wait:.1f}s (attempt {attempt + 1}/{max_retries})...")
                time.sleep(wait)
                continue
            raise

        if resp.status_code in (200, 201):
            return resp

        if resp.status_code in (403, 429) and attempt < max_retries - 1:
            wait = (attempt + 1) * 3 + random.uniform(1, 3)
            next_imp = RETRY_IMPERSONATIONS[(attempt + 1) % len(RETRY_IMPERSONATIONS)]
            print(f"  HTTP {resp.status_code}, retrying in {wait:.1f}s as {next_imp} (attempt {attempt + 1}/{max_retries})...")
            time.sleep(wait)
            continue

        return resp

    return resp


def _save_json(data: list | dict, filepath: str):
    """Save data to JSON."""
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"Saved to {filepath}")


def _save_csv(rows: list[dict], filepath: str):
    """Save data to CSV."""
    if not rows:
        print("No data to save.")
        return

    with open(filepath, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=rows[0].keys())
        writer.writeheader()
        writer.writerows(rows)
    print(f"Saved to {filepath}")


# ─── Search-specific logic ─────────────────────────────────────────────────────


def _build_search_url(query: str, location: str, offset: int = 0) -> str:
    """Build the search URL with params."""
    url = f"{SEARCH_URL}?find_desc={quote_plus(query)}&find_loc={quote_plus(location)}"
    if offset > 0:
        url += f"&start={offset}"
    return url


def _parse_hypernova(soup: BeautifulSoup) -> dict | None:
    """Extract and parse the Hypernova JSON blob from search page."""
    script = soup.find("script", attrs={"data-hypernova-key": True})
    if not script or not script.string:
        return None

    text = script.string.strip()
    # Strip HTML comment wrappers
    if text.startswith("<!--"):
        text = text[4:]
    if text.endswith("-->"):
        text = text[:-3]

    return json.loads(text)


def _parse_apollo_addresses(soup: BeautifulSoup) -> dict[str, dict]:
    """Extract address data from Apollo state cache, keyed by business alias."""
    addresses = {}
    tag = soup.find("script", attrs={"data-apollo-state": True})
    if not tag or not tag.string:
        return addresses

    raw = unescape(tag.string).strip()
    if raw.startswith("<!--"):
        raw = raw[4:]
    if raw.endswith("-->"):
        raw = raw[:-3]

    try:
        cache = json.loads(raw)
    except json.JSONDecodeError:
        return addresses

    # Map encid -> alias from Business entities
    id_to_alias = {}
    for key, val in cache.items():
        if key.startswith("Business:") and isinstance(val, dict):
            alias = val.get("alias", "")
            encid = key.split(":", 1)[1]
            if alias:
                id_to_alias[encid] = alias

    # Extract addresses from BusinessLocation entities
    for key, val in cache.items():
        if key.startswith("BusinessLocation:") and isinstance(val, dict):
            encid = key.split(":", 1)[1]
            alias = id_to_alias.get(encid, "")
            addr = val.get("address", {})
            if alias and addr:
                addresses[alias] = {
                    "line1": addr.get("addressLine1", ""),
                    "city": addr.get("city", ""),
                    "state": addr.get("regionCode", ""),
                    "postal_code": addr.get("postalCode", ""),
                }

    return addresses


def _extract_businesses(data: dict, addresses: dict) -> list[dict]:
    """Extract business results from Hypernova data."""
    components = (
        data.get("legacyProps", {})
        .get("searchAppProps", {})
        .get("searchPageProps", {})
        .get("mainContentComponentsListProps", [])
    )

    businesses = []
    for comp in components:
        srb = comp.get("searchResultBusiness")
        if not srb:
            continue

        alias = srb.get("alias", "")
        is_ad = comp.get("isAd", False)

        # Get clean URL
        biz_url = srb.get("businessUrl", "")
        if "/adredir" in biz_url:
            match = re.search(r"redirect_url=([^&]+)", biz_url)
            if match:
                redirect = unquote(match.group(1))
                alias = redirect.split("/biz/")[-1].split("?")[0]

        # Get address from Apollo cache
        addr = addresses.get(alias, {})
        addr_str = f"{addr.get('line1', '')}, {addr.get('city', '')}".strip(", ") if addr else ""

        categories = [unescape(c.get("title", "")) for c in (srb.get("categories") or [])]

        businesses.append({
            "rank": comp.get("ranking"),
            "name": unescape(srb.get("name", "")),
            "alias": alias,
            "url": f"https://www.yelp.com/biz/{alias}",
            "biz_id": comp.get("bizId", ""),
            "rating": srb.get("rating"),
            "review_count": srb.get("reviewCount"),
            "price_range": srb.get("priceRange", ""),
            "categories": categories,
            "phone": srb.get("phone", ""),
            "neighborhoods": srb.get("neighborhoods", []),
            "address": addr_str,
            "is_ad": is_ad,
            "is_closed": srb.get("isClosed", False),
        })

    return businesses


def scrape_search(
    query: str,
    location: str,
    max_pages: int | None = None,
    include_ads: bool = False,
    delay: float = 5.0,
) -> list[dict]:
    """
    Search Yelp and return matching businesses.

    Uses a single persistent session (with cookies) to mimic real browser
    navigation. This is critical — Yelp blocks page 2+ if you create new
    sessions for each request.

    Args:
        query: Search term (e.g., "pizza")
        location: Location (e.g., "San Francisco, CA")
        max_pages: Max pages to scrape (None = all)
        include_ads: Whether to include sponsored results
        delay: Base delay in seconds between page requests; up to +30%
            random jitter is added (5s+ recommended for search)

    Returns:
        List of business dicts as built by _extract_businesses, across all
        scraped pages. Sponsored results are removed unless include_ads.

    Raises:
        RuntimeError: If the first page fails (non-200) or contains no
            Hypernova data. Failures on later pages only stop pagination
            and return what was collected so far.
    """
    proxy = _get_proxy()
    print(f'Searching Yelp for "{query}" in "{location}"')
    print(f"Proxy: {'enabled' if proxy else 'disabled (set PROXY_URL in .env)'}")

    # Single session for the entire search — cookies carry over like a real browser
    session = _get_session("safari2601")

    # First page — Referer is Google (as if user clicked a search result)
    first_url = _build_search_url(query, location, 0)
    headers = {**BROWSER_HEADERS}
    resp = _request_with_retry(session, "get", first_url, headers=headers)
    if resp.status_code != 200:
        raise RuntimeError(f"Search failed (HTTP {resp.status_code})")

    soup = BeautifulSoup(resp.text, "html.parser")
    data = _parse_hypernova(soup)
    if not data:
        raise RuntimeError("Could not find Hypernova data in search page.")

    addresses = _parse_apollo_addresses(soup)

    # The total result count lives in searchContext inside the Hypernova payload.
    search_ctx = (
        data.get("legacyProps", {})
        .get("searchAppProps", {})
        .get("searchPageProps", {})
        .get("searchContext", {})
    )
    total_results = search_ctx.get("totalResults", 0)
    # Ceiling division: e.g. 25 results at 10 per page -> 3 pages.
    total_pages = (total_results + RESULTS_PER_PAGE - 1) // RESULTS_PER_PAGE

    if max_pages:
        total_pages = min(total_pages, max_pages)

    print(f"Total results: {total_results} ({total_pages} pages to scrape)")

    all_businesses = _extract_businesses(data, addresses)
    organic = [b for b in all_businesses if not b["is_ad"]]
    print(f"  Page 1/{total_pages}: {len(organic)} organic results")

    prev_url = first_url

    # Paginate — same session, Referer = previous page URL
    for page in range(2, total_pages + 1):
        # Base delay plus up to 30% random jitter, to avoid a robotic cadence.
        jitter = delay + random.uniform(0, delay * 0.3)
        time.sleep(jitter)
        offset = (page - 1) * RESULTS_PER_PAGE

        # curl_cffi session maintenance between requests (keeps pooled
        # connections alive — pairs with the TCP keep-alive options set
        # in _get_session).
        session.upkeep()

        try:
            page_url = _build_search_url(query, location, offset)
            headers = {**BROWSER_HEADERS, "Referer": prev_url}
            resp = _request_with_retry(session, "get", page_url, headers=headers)

            if resp.status_code != 200:
                print(f"  Page {page} failed (HTTP {resp.status_code}), stopping.")
                break

            # Real result pages are large; a tiny body is a captcha/block page.
            if len(resp.text) < 5000:
                print(f"  Page {page}: got captcha/block page, stopping.")
                break

            soup = BeautifulSoup(resp.text, "html.parser")
            data = _parse_hypernova(soup)
            if not data:
                print(f"  Page {page}: no Hypernova data, stopping.")
                break

            addresses = _parse_apollo_addresses(soup)
            page_biz = _extract_businesses(data, addresses)
            organic = [b for b in page_biz if not b["is_ad"]]
            all_businesses.extend(page_biz)
            print(f"  Page {page}/{total_pages}: {len(organic)} organic results")

            prev_url = page_url

            # A page with only ads (no organic results) means we've run dry.
            if not organic:
                break
        except Exception as e:
            print(f"  Page {page} error: {e}")
            break

    if not include_ads:
        all_businesses = [b for b in all_businesses if not b["is_ad"]]

    print(f"\nDone! Found {len(all_businesses)} businesses.")
    return all_businesses


def main():
    """Command-line entry point: parse args, run the search, save JSON/CSV."""
    parser = argparse.ArgumentParser(description="Search Yelp businesses")
    parser.add_argument("query", help="Search term (e.g., 'pizza')")
    parser.add_argument("location", help="Location (e.g., 'San Francisco, CA')")
    parser.add_argument("--format", choices=["json", "csv"], default="json")
    parser.add_argument("--output", "-o", help="Output file path")
    parser.add_argument("--max-pages", type=int, default=None)
    parser.add_argument("--include-ads", action="store_true", help="Include sponsored results")
    parser.add_argument("--delay", type=float, default=5.0, help="Delay between requests (default: 5.0)")
    args = parser.parse_args()

    results = scrape_search(
        args.query,
        args.location,
        max_pages=args.max_pages,
        include_ads=args.include_ads,
        delay=args.delay,
    )

    if not results:
        print("No results found.")
        sys.exit(1)

    # Derive a filesystem-friendly default filename from query + location.
    query_part = args.query.replace(" ", "_")
    location_part = args.location.replace(" ", "_").replace(",", "")
    outpath = args.output or f"search_{query_part}_{location_part}.{args.format}"

    if args.format == "csv":
        # CSV cells can't hold lists — flatten them to comma-joined strings.
        for row in results:
            row["categories"] = ", ".join(row["categories"])
            row["neighborhoods"] = ", ".join(row["neighborhoods"])
        _save_csv(results, outpath)
    else:
        _save_json(results, outpath)


# Run the CLI only when executed as a script (not when imported).
if __name__ == "__main__":
    main()
