"""
Yelp Review Scraper

Usage:
    python3 yelp_scraper.py "https://www.yelp.com/biz/raw-sugar-factory-san-francisco"
    python3 yelp_scraper.py "https://www.yelp.com/biz/raw-sugar-factory-san-francisco" --format csv
    python3 yelp_scraper.py "https://www.yelp.com/biz/raw-sugar-factory-san-francisco" --max-pages 5
    python3 yelp_scraper.py "https://www.yelp.com/biz/raw-sugar-factory-san-francisco" --biz-id jLjbntnlmxsX8gsOxZVUcQ

Proxy: Set PROXY_URL in .env file (e.g., PROXY_URL=http://user:pass@host:port)
"""

import argparse
import csv
import json
import os
import random
import sys
import time
from base64 import b64encode
from pathlib import Path

from bs4 import BeautifulSoup
from curl_cffi import CurlOpt, requests
from curl_cffi.requests.exceptions import RequestException
from dotenv import load_dotenv

# Load .env from the project directory
load_dotenv(Path(__file__).parent / ".env")


# Yelp's GraphQL batch endpoint used by the business page's review widget.
GQL_URL = "https://www.yelp.com/gql/batch"
# Page size the review feed serves; also used for offset math in pagination.
REVIEWS_PER_PAGE = 10
# Persisted-query document ID for the GetBusinessReviewFeed operation.
# NOTE(review): tied to Yelp's deployed frontend build — may rotate over time.
DOC_ID = "ef51f33d1b0eccc958dddbf6cde15739c48b34637a00ebe316441031d4bf7681"

# Extra headers for page fetches; a Google referer mimics organic traffic.
BROWSER_HEADERS = {
    "Referer": "https://www.google.com/",
}

# Browser fingerprints cycled on retry (see _request_with_retry).
RETRY_IMPERSONATIONS = ["safari2601", "safari184", "safari260", "chrome133a", "safari170"]


def _get_proxy() -> str | None:
    """Get proxy URL from environment."""
    return os.environ.get("PROXY_URL")


def _get_session(impersonate: str | None = None) -> requests.Session:
    """Build a curl_cffi session that impersonates a browser TLS fingerprint.

    Keep-alive and DNS-cache options are tuned so a single session can be
    reused across many paginated requests. The proxy, when configured, comes
    from the PROXY_URL environment variable.
    """
    # TCP keep-alive + DNS caching so long pagination runs reuse connections.
    transport_opts = {
        CurlOpt.TCP_KEEPALIVE: 1,
        CurlOpt.TCP_KEEPIDLE: 60,
        CurlOpt.TCP_KEEPINTVL: 30,
        CurlOpt.DNS_CACHE_TIMEOUT: 300,
    }
    return requests.Session(
        impersonate=impersonate or "safari2601",
        proxy=_get_proxy(),
        timeout=(10, 30),
        curl_options=transport_opts,
    )


def _request_with_retry(session, method, url, max_retries=5, **kwargs):
    """Issue a request, retrying on network errors and HTTP 403/429.

    Each attempt uses the next fingerprint from RETRY_IMPERSONATIONS, with a
    linearly growing, jittered back-off between attempts. Other HTTP statuses
    are returned to the caller as-is; a network failure on the final attempt
    is re-raised.
    """
    resp = None
    final = max_retries - 1
    for attempt in range(max_retries):
        fingerprint = RETRY_IMPERSONATIONS[attempt % len(RETRY_IMPERSONATIONS)]
        try:
            resp = getattr(session, method)(url, impersonate=fingerprint, **kwargs)
        except RequestException as e:
            if attempt == final:
                raise
            wait = (attempt + 1) * 3 + random.uniform(1, 3)
            print(f"  Network error ({e}), retrying in {wait:.1f}s (attempt {attempt + 1}/{max_retries})...")
            time.sleep(wait)
            continue

        if resp.status_code in (200, 201):
            return resp
        # Non-throttle status, or throttled on the last attempt: give up.
        if resp.status_code not in (403, 429) or attempt == final:
            return resp

        wait = (attempt + 1) * 3 + random.uniform(1, 3)
        next_fp = RETRY_IMPERSONATIONS[(attempt + 1) % len(RETRY_IMPERSONATIONS)]
        print(f"  HTTP {resp.status_code}, retrying in {wait:.1f}s as {next_fp} (attempt {attempt + 1}/{max_retries})...")
        time.sleep(wait)

    return resp


def _fetch_biz_page(url: str, session: requests.Session | None = None) -> BeautifulSoup:
    """Fetch and parse a Yelp business page, rejecting captcha/block pages.

    A fresh impersonating session is created when none is supplied. The page
    is treated as blocked when the yelp-biz-id meta tag is absent AND the
    body is suspiciously small (< 50 KB).
    """
    sess = session if session is not None else _get_session("safari2601")
    resp = _request_with_retry(sess, "get", url, headers=BROWSER_HEADERS)
    if resp.status_code != 200:
        raise RuntimeError(f"Failed to fetch {url} (HTTP {resp.status_code})")

    soup = BeautifulSoup(resp.text, "html.parser")

    # Real business pages carry this meta tag; captcha pages are tiny and lack it.
    biz_id_tag = soup.find("meta", attrs={"name": "yelp-biz-id"})
    if not biz_id_tag and len(resp.text) < 50000:
        raise RuntimeError(f"Got blocked — page is a captcha/redirect ({len(resp.text)} bytes)")

    return soup


def extract_biz_id(session: requests.Session, url: str) -> tuple[str, str]:
    """Resolve a business URL to its (encBizId, display name) pair.

    Raises RuntimeError when the yelp-biz-id meta tag cannot be located,
    which usually means Yelp changed the markup or served a block page.
    """
    soup = _fetch_biz_page(url, session=session)

    id_tag = soup.find("meta", attrs={"name": "yelp-biz-id"})
    if not id_tag:
        raise RuntimeError("Could not find yelp-biz-id meta tag. Page may have changed.")

    # Page title looks like "Business Name - Neighborhood - City"; keep the name.
    biz_name = "unknown"
    title_tag = soup.find("title")
    if title_tag:
        biz_name = title_tag.get_text(strip=True).split(" - ")[0]

    return str(id_tag["content"]), biz_name


def build_gql_payload(enc_biz_id: str, offset: int = 0) -> list[dict]:
    """Assemble the GraphQL batch body for one page of the review feed.

    For pages past the first, Yelp expects an ``after`` cursor: a
    base64-encoded JSON offset token rather than a plain integer.
    """
    variables = {
        "encBizId": enc_biz_id,
        "reviewsPerPage": REVIEWS_PER_PAGE,
        "selectedReviewEncId": "",
        "hasSelectedReview": False,
        "sortBy": "DATE_DESC",
        "languageCode": "en",
        "ratings": [5, 4, 3, 2, 1],
        "isSearching": False,
        "isTranslating": False,
        "translateLanguageCode": "en",
        "reactionsSourceFlow": "businessPageReviewSection",
        "minConfidenceLevel": "HIGH_CONFIDENCE",
        "highlightType": "",
        "highlightIdentifier": "",
        "isHighlighting": False,
    }

    if offset > 0:
        cursor = {"version": 1, "type": "offset", "offset": offset}
        variables["after"] = b64encode(json.dumps(cursor).encode()).decode()

    operation = {
        "operationName": "GetBusinessReviewFeed",
        "variables": variables,
        "extensions": {
            "operationType": "query",
            "documentId": DOC_ID,
        },
    }
    return [operation]


def fetch_reviews_page(session: requests.Session, enc_biz_id: str, offset: int, referer: str) -> dict:
    """POST one GraphQL page request and return the ``reviews`` payload.

    Raises RuntimeError on a non-200 response, or when the GraphQL layer
    reports errors inside an otherwise-successful HTTP response.
    """
    gql_headers = {
        "Content-Type": "application/json",
        "Accept": "*/*",
        "Origin": "https://www.yelp.com",
        "Referer": referer,
        "x-apollo-operation-name": "GetBusinessReviewFeed",
    }

    body = build_gql_payload(enc_biz_id, offset)
    resp = _request_with_retry(session, "post", GQL_URL, json=body, headers=gql_headers)
    if resp.status_code != 200:
        raise RuntimeError(f"GraphQL request failed (HTTP {resp.status_code}): {resp.text[:300]}")

    data = resp.json()
    # The batch endpoint may wrap the single operation's result in a list.
    if isinstance(data, list):
        data = data[0]
    if "errors" in data:
        raise RuntimeError(f"GraphQL errors: {json.dumps(data['errors'])[:500]}")

    return data["data"]["business"]["reviews"]


def parse_review(node: dict) -> dict:
    """Flatten one GraphQL review node into a dict of scalar fields.

    Nested objects (author, text, feedback, createdAt) can be present but
    null in the API response, so each is coalesced with ``or {}`` before
    field access — ``dict.get(key, {})`` alone returns None when the key
    exists with a null value (the original only guarded businessPhotos).
    Missing string fields default to "", counters to 0.
    """
    author = node.get("author") or {}
    text_obj = node.get("text") or {}
    feedback = node.get("feedback") or {}
    created = node.get("createdAt") or {}
    photos = node.get("businessPhotos") or []

    return {
        "review_id": node.get("encid", ""),
        "rating": node.get("rating"),
        "date": created.get("utcDateTime", ""),
        "text": text_obj.get("full", ""),
        "language": text_obj.get("language", ""),
        "author_name": author.get("displayName", ""),
        "author_location": author.get("displayLocation", ""),
        "useful_count": feedback.get("usefulCount", 0),
        "funny_count": feedback.get("funnyCount", 0),
        "cool_count": feedback.get("coolCount", 0),
        "photo_count": len(photos),
    }


def scrape_reviews(
    url: str,
    biz_id: str | None = None,
    max_pages: int | None = None,
    delay: float = 1.5,
) -> list[dict]:
    """
    Scrape all reviews from a Yelp business URL.

    Args:
        url: Yelp business page URL
        biz_id: Pre-known encBizId (skips initial page fetch if provided)
        max_pages: Limit number of pages to scrape (None = all)
        delay: Seconds to wait between page requests

    Returns:
        A list of flat review dicts (see parse_review). The list may be
        partial: pagination stops at the first page that fails or comes
        back empty, returning whatever was collected so far.
    """
    session = _get_session()

    proxy = _get_proxy()
    print(f"Proxy: {'enabled' if proxy else 'disabled (set PROXY_URL in .env)'}")

    if biz_id:
        enc_biz_id = biz_id
        # No page fetch happens in this branch, so the real business name is
        # unavailable; fall back to the URL slug as a display name.
        biz_name = url.rstrip("/").split("/")[-1]
        print(f"Using provided biz ID: {enc_biz_id}")
    else:
        print(f"Fetching business page: {url}")
        enc_biz_id, biz_name = extract_biz_id(session, url)

    print(f"Business: {biz_name}")
    print(f"Business ID: {enc_biz_id}")

    # First page (offset 0) also carries the total review count.
    reviews_data = fetch_reviews_page(session, enc_biz_id, 0, url)
    total_count = reviews_data.get("totalCount", 0)
    # Ceiling division: the last page may hold fewer than REVIEWS_PER_PAGE.
    total_pages = (total_count + REVIEWS_PER_PAGE - 1) // REVIEWS_PER_PAGE

    if max_pages:
        total_pages = min(total_pages, max_pages)

    print(f"Total reviews: {total_count} ({total_pages} pages to scrape)")

    all_reviews = []

    # Parse first page
    edges = reviews_data.get("edges", [])
    for edge in edges:
        all_reviews.append(parse_review(edge["node"]))

    print(f"  Page 1/{total_pages}: got {len(edges)} reviews")

    # Paginate through remaining pages
    for page in range(2, total_pages + 1):
        # Randomized wait in [delay, 1.5*delay) to avoid a robotic cadence.
        jitter = delay + random.uniform(0, delay * 0.5)
        time.sleep(jitter)
        offset = (page - 1) * REVIEWS_PER_PAGE

        # Keep HTTP/2 connections alive
        session.upkeep()

        try:
            reviews_data = fetch_reviews_page(session, enc_biz_id, offset, url)
            edges = reviews_data.get("edges", [])
            for edge in edges:
                all_reviews.append(parse_review(edge["node"]))
            print(f"  Page {page}/{total_pages}: got {len(edges)} reviews")

            # Server reported fewer reviews than totalCount implied; stop early.
            if not edges:
                print("  No more reviews, stopping.")
                break
        except RuntimeError as e:
            # Best-effort: keep what we already scraped rather than crashing.
            print(f"  Page {page} failed: {e}")
            break

    print(f"\nDone! Scraped {len(all_reviews)} reviews total.")
    return all_reviews


def save_json(reviews: list[dict], filepath: str):
    """Write *reviews* to *filepath* as pretty-printed UTF-8 JSON."""
    with open(filepath, "w", encoding="utf-8") as fh:
        fh.write(json.dumps(reviews, indent=2, ensure_ascii=False))
    print(f"Saved to {filepath}")


def save_csv(reviews: list[dict], filepath: str):
    """Write *reviews* to *filepath* as CSV; headers come from the first row.

    Prints a notice and writes nothing when the list is empty.
    """
    if not reviews:
        print("No reviews to save.")
        return

    columns = list(reviews[0].keys())
    with open(filepath, "w", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=columns)
        writer.writeheader()
        for row in reviews:
            writer.writerow(row)
    print(f"Saved to {filepath}")


def main():
    """CLI entry point: parse arguments, scrape, then save as JSON or CSV.

    Exits with status 1 when the scrape yields no reviews.
    """
    parser = argparse.ArgumentParser(description="Scrape Yelp reviews")
    parser.add_argument("url", help="Yelp business page URL")
    parser.add_argument("--format", choices=["json", "csv"], default="json", help="Output format (default: json)")
    parser.add_argument("--output", "-o", help="Output file path (auto-generated if not set)")
    parser.add_argument("--max-pages", type=int, default=None, help="Max pages to scrape (default: all)")
    parser.add_argument("--delay", type=float, default=1.5, help="Delay between requests in seconds (default: 1.5)")
    parser.add_argument("--biz-id", default=None, help="Pre-known encBizId (skips initial page fetch)")
    args = parser.parse_args()

    reviews = scrape_reviews(args.url, biz_id=args.biz_id, max_pages=args.max_pages, delay=args.delay)
    if not reviews:
        print("No reviews found.")
        sys.exit(1)

    # Default output filename is derived from the business URL slug.
    slug = args.url.rstrip("/").split("/")[-1]
    outpath = args.output or f"{slug}_reviews.{args.format}"

    writer = save_csv if args.format == "csv" else save_json
    writer(reviews, outpath)


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
