"""
Yelp Not-Recommended Reviews Scraper

Usage:
    python3 yelp_not_recommended.py "raw-sugar-factory-san-francisco"
    python3 yelp_not_recommended.py "https://www.yelp.com/biz/raw-sugar-factory-san-francisco"
    python3 yelp_not_recommended.py "raw-sugar-factory-san-francisco" --format csv

Proxy: Set PROXY_URL in .env file (e.g., PROXY_URL=http://user:pass@host:port)
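
Programmatic use (a minimal sketch):
    from yelp_not_recommended import scrape_not_recommended
    reviews = scrape_not_recommended("raw-sugar-factory-san-francisco", max_pages=2)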
"""

import argparse
import csv
import json
import os
import re
import sys
import time
import random
from pathlib import Path

from bs4 import BeautifulSoup
from curl_cffi import CurlOpt, requests
from curl_cffi.requests.exceptions import RequestException
from dotenv import load_dotenv

# Load .env from the project directory
load_dotenv(Path(__file__).parent / ".env")


NR_URL_BASE = "https://www.yelp.com/not_recommended_reviews"
PER_PAGE = 10

BROWSER_HEADERS = {
    "Referer": "https://www.google.com/",
}

# Browser fingerprints cycled through on retries when Yelp blocks a request.
# Note: which impersonation names are valid depends on the installed curl_cffi version.
RETRY_IMPERSONATIONS = ["safari2601", "safari184", "safari260", "chrome133a", "safari170"]


def _get_proxy() -> str | None:
    """Get proxy URL from environment."""
    return os.environ.get("PROXY_URL")


def _get_session(impersonate: str | None = None) -> requests.Session:
    """Create a curl_cffi session with browser impersonation and optional proxy."""
    choice = impersonate or "safari2601"
    proxy_url = _get_proxy()

    return requests.Session(
        impersonate=choice,
        proxy=proxy_url,
        timeout=(10, 30),
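        # libcurl socket options: keep the TCP connection alive and cache DNS
        # results so paginated requests can reuse the same connection.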
        curl_options={
            CurlOpt.TCP_KEEPALIVE: 1,
            CurlOpt.TCP_KEEPIDLE: 60,
            CurlOpt.TCP_KEEPINTVL: 30,
            CurlOpt.DNS_CACHE_TIMEOUT: 300,
        },
    )


def _request_with_retry(session, method, url, max_retries=5, **kwargs):
    """Make a request with automatic retry on 403/429/network errors, cycling impersonation."""
    resp = None
    for attempt in range(max_retries):
        imp = RETRY_IMPERSONATIONS[attempt % len(RETRY_IMPERSONATIONS)]
        try:
            resp = getattr(session, method)(url, impersonate=imp, **kwargs)
        except RequestException as e:
            if attempt < max_retries - 1:
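                # Linear backoff with jitter: roughly 4-6s, then 7-9s, 10-12s, ...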
                wait = (attempt + 1) * 3 + random.uniform(1, 3)
                print(f"  Network error ({e}), retrying in {wait:.1f}s (attempt {attempt + 1}/{max_retries})...")
                time.sleep(wait)
                continue
            raise

        if resp.status_code in (200, 201):
            return resp

        if resp.status_code in (403, 429) and attempt < max_retries - 1:
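            # Blocked or rate-limited: back off and retry the same URL with the
            # next browser fingerprint in the rotation.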
            wait = (attempt + 1) * 3 + random.uniform(1, 3)
            next_imp = RETRY_IMPERSONATIONS[(attempt + 1) % len(RETRY_IMPERSONATIONS)]
            print(f"  HTTP {resp.status_code}, retrying in {wait:.1f}s as {next_imp} (attempt {attempt + 1}/{max_retries})...")
            time.sleep(wait)
            continue

        return resp

    return resp


def _save_json(data: list | dict, filepath: str):
    """Save data to JSON."""
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"Saved to {filepath}")


def _save_csv(rows: list[dict], filepath: str):
    """Save data to CSV."""
    if not rows:
        print("No data to save.")
        return

    with open(filepath, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=rows[0].keys())
        writer.writeheader()
        writer.writerows(rows)
    print(f"Saved to {filepath}")


# ─── Not-recommended-specific logic ────────────────────────────────────────────


def _extract_alias(url_or_alias: str) -> str:
    """Extract business alias from a URL or plain alias string."""
    if "/biz/" in url_or_alias:
        return url_or_alias.split("/biz/")[-1].split("?")[0].split("#")[0]
    if "/not_recommended_reviews/" in url_or_alias:
        return url_or_alias.split("/not_recommended_reviews/")[-1].split("?")[0]
    return url_or_alias.strip("/")


def _parse_review_div(div, section_type: str = "not_recommended") -> dict | None:
    """Parse a single review div into a flat dict."""
    review_id = div.get("data-review-id", "")
    if not review_id:
        return None

    # Rating from i-stars title or img alt
    rating = None
    stars_div = div.find("div", class_=re.compile(r"i-stars"))
    if stars_div:
        title = stars_div.get("title", "")
        match = re.search(r"([\d.]+)\s*star", title)
        if match:
            rating = float(match.group(1))
    if rating is None:
        stars_img = div.find("img", class_="offscreen")
        if stars_img:
            match = re.search(r"([\d.]+)\s*star", stars_img.get("alt", ""))
            if match:
                rating = float(match.group(1))

    # Date — extract just the M/D/YYYY portion (span may contain extra text like "Updated review")
    date_str = ""
    date_span = div.find("span", class_="rating-qualifier")
    if date_span:
        raw_date = date_span.get_text(strip=True)
        date_match = re.match(r"(\d{1,2}/\d{1,2}/\d{4})", raw_date)
        date_str = date_match.group(1) if date_match else raw_date

    # Review text
    text = ""
    text_p = div.find("p", attrs={"lang": True})
    if text_p:
        text = text_p.get_text(strip=True)
    language = text_p.get("lang", "en") if text_p else ""

    # Author
    author_name = ""
    author_span = div.find("span", class_="user-display-name")
    if author_span:
        author_name = author_span.get_text(strip=True)

    author_location = ""
    loc_li = div.find("li", class_="user-location")
    if loc_li:
        loc_b = loc_li.find("b")
        if loc_b:
            author_location = loc_b.get_text(strip=True)

    # Author stats
    def _get_stat(class_name):
        li = div.find("li", class_=class_name)
        if li:
            b = li.find("b")
            if b:
                try:
                    return int(b.get_text(strip=True))
                except ValueError:
                    pass
        return 0

    # Skip empty placeholder reviews (Yelp stripped all content)
    if rating is None and not text:
        return None

    return {
        "review_id": review_id,
        "section": section_type,
        "rating": rating,
        "date": date_str,
        "text": text,
        "language": language,
        "author_name": author_name,
        "author_location": author_location,
        "author_friends": _get_stat("friend-count"),
        "author_reviews": _get_stat("review-count"),
        "author_photos": _get_stat("photo-count"),
    }


def _parse_nr_page(html: str) -> tuple[list[dict], list[dict], int, int]:
    """
    Parse a not-recommended reviews page.
    Returns: (nr_reviews, removed_reviews, nr_total, removed_total)
    """
    soup = BeautifulSoup(html, "html.parser")

    # Extract total counts from page text
    nr_total = 0
    removed_total = 0
    page_text = soup.get_text()

    nr_match = re.search(r"(\d+)\s+reviews?\s+.*?not\s+currently\s+recommended", page_text, re.I)
    if nr_match:
        nr_total = int(nr_match.group(1))

    rm_match = re.search(r"(\d+)\s+Reviews?\s+Removed", page_text, re.I)
    if rm_match:
        removed_total = int(rm_match.group(1))

    # Parse the not-recommended reviews section (must match the section div itself, not the whole-page container)
    nr_reviews = []
    nr_section = soup.find("div", class_=lambda c: c and "review-list-wide" in c and "not-recommended" in c)
    if nr_section:
        for div in nr_section.find_all("div", attrs={"data-review-id": True}):
            review = _parse_review_div(div, "not_recommended")
            if review:
                nr_reviews.append(review)

    # Parse removed reviews section
    removed_reviews = []
    rm_section = soup.find("div", class_=lambda c: c and "removed-reviews" in c)
    if rm_section:
        for div in rm_section.find_all("div", attrs={"data-review-id": True}):
            review = _parse_review_div(div, "removed")
            if review:
                removed_reviews.append(review)

    return nr_reviews, removed_reviews, nr_total, removed_total


def scrape_not_recommended(
    url_or_alias: str,
    include_removed: bool = True,
    max_pages: int | None = None,
    delay: float = 1.5,
) -> list[dict]:
    """
    Scrape all not-recommended (and optionally removed) reviews.

    Args:
        url_or_alias: Business URL or alias
        include_removed: Also scrape removed reviews
        max_pages: Max pages per section (None = all)
        delay: Base delay between requests, in seconds (a small random jitter is added)
    """
    alias = _extract_alias(url_or_alias)
    base_url = f"{NR_URL_BASE}/{alias}"

    proxy = _get_proxy()
    print(f"Fetching not-recommended reviews: {alias}")
    print(f"Proxy: {'enabled' if proxy else 'disabled (set PROXY_URL in .env)'}")
    session = _get_session()

    # First page
    resp = _request_with_retry(session, "get", base_url, headers=BROWSER_HEADERS)
    if resp.status_code != 200:
        raise RuntimeError(f"Failed to fetch {base_url} (HTTP {resp.status_code})")

    nr_reviews, rm_reviews, nr_total, rm_total = _parse_nr_page(resp.text)
    print(f"Not-recommended: {nr_total} total | Removed: {rm_total} total")
    print(f"  Page 1: {len(nr_reviews)} NR + {len(rm_reviews)} removed")

    # Deduplicate by review_id
    seen_nr = {r["review_id"] for r in nr_reviews}
    seen_rm = {r["review_id"] for r in rm_reviews}
    all_nr = list(nr_reviews)
    all_rm = list(rm_reviews)

    # Paginate not-recommended reviews
    nr_pages = (nr_total + PER_PAGE - 1) // PER_PAGE if nr_total else 1
    if max_pages:
        nr_pages = min(nr_pages, max_pages)
    for page in range(2, nr_pages + 1):
        time.sleep(delay + random.uniform(0, 0.5))
        offset = (page - 1) * PER_PAGE
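        # Yelp paginates this section with the not_recommended_start offset parameter.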
        url = f"{base_url}?not_recommended_start={offset}"

        session.upkeep()
        resp = _request_with_retry(session, "get", url, headers=BROWSER_HEADERS)
        if resp.status_code != 200:
            print(f"  NR page {page} failed (HTTP {resp.status_code})")
            break

        nr_page, _, _, _ = _parse_nr_page(resp.text)
        if not nr_page:
            break
        new = [r for r in nr_page if r["review_id"] not in seen_nr]
        seen_nr.update(r["review_id"] for r in new)
        all_nr.extend(new)
        print(f"  NR page {page}/{nr_pages}: {len(new)} new reviews")

    # Paginate removed reviews
    if include_removed and rm_total > PER_PAGE:
        rm_pages = (rm_total + PER_PAGE - 1) // PER_PAGE
        if max_pages:
            rm_pages = min(rm_pages, max_pages)
        for page in range(2, rm_pages + 1):
            time.sleep(delay + random.uniform(0, 0.5))
            offset = (page - 1) * PER_PAGE
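            # The removed section paginates separately via the removed_start offset parameter.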
            url = f"{base_url}?removed_start={offset}"

            session.upkeep()
            resp = _request_with_retry(session, "get", url, headers=BROWSER_HEADERS)
            if resp.status_code != 200:
                print(f"  Removed page {page} failed (HTTP {resp.status_code})")
                break

            _, rm_page, _, _ = _parse_nr_page(resp.text)
            if not rm_page:
                break
            new = [r for r in rm_page if r["review_id"] not in seen_rm]
            seen_rm.update(r["review_id"] for r in new)
            all_rm.extend(new)
            print(f"  Removed page {page}/{rm_pages}: {len(new)} new reviews")

    combined = all_nr + (all_rm if include_removed else [])
    removed_count = len(all_rm) if include_removed else 0
    print(f"\nDone! {len(all_nr)} not-recommended + {removed_count} removed = {len(combined)} total.")
    return combined


def main():
    parser = argparse.ArgumentParser(description="Scrape Yelp not-recommended reviews")
    parser.add_argument("url", help="Yelp business URL or alias")
    parser.add_argument("--format", choices=["json", "csv"], default="json")
    parser.add_argument("--output", "-o", help="Output file path")
    parser.add_argument("--no-removed", action="store_true", help="Skip removed reviews")
    parser.add_argument("--max-pages", type=int, default=None, help="Max pages per section (default: all)")
    parser.add_argument("--delay", type=float, default=1.5)
    args = parser.parse_args()

    reviews = scrape_not_recommended(
        args.url,
        include_removed=not args.no_removed,
        max_pages=args.max_pages,
        delay=args.delay,
    )

    if not reviews:
        print("No reviews found.")
        sys.exit(1)

    alias = _extract_alias(args.url)
    outpath = args.output or f"{alias}_not_recommended.{args.format}"

    if args.format == "csv":
        _save_csv(reviews, outpath)
    else:
        _save_json(reviews, outpath)


if __name__ == "__main__":
    main()
