#!/usr/bin/env python3
"""
Enhanced Image Scraper Script
Crawls all pages of a website and downloads all images found using aggressive methods.
"""

import argparse
import hashlib
import json
import logging
import os
import re
import sys
import time
from collections import deque
from pathlib import Path
from typing import Dict, List, Optional, Set
from urllib.parse import unquote, urldefrag, urljoin, urlparse
from urllib.robotparser import RobotFileParser

import requests
from bs4 import BeautifulSoup

class EnhancedImageScraper:
    def __init__(self, base_url: str, output_dir: str = "scraped_images",
                 delay: float = 1.0, max_pages: Optional[int] = None, respect_robots: bool = True):
        """
        Initialize the enhanced image scraper.

        Besides storing the configuration, this opens a shared HTTP session
        with a browser-like User-Agent, configures root logging at INFO level,
        creates the output directory and, when requested, loads robots.txt.

        Args:
            base_url: The base URL of the website to scrape
            output_dir: Directory to save images (created with any missing parents)
            delay: Delay between requests in seconds
            max_pages: Maximum number of pages to crawl (None for unlimited)
            respect_robots: Whether to respect robots.txt
        """
        # Trailing slashes are stripped so paths can be appended as f"{base_url}/..."
        self.base_url = base_url.rstrip('/')
        # Host part of the URL; the crawler only follows links whose netloc equals it
        self.domain = urlparse(base_url).netloc
        self.output_dir = Path(output_dir)
        self.delay = delay
        self.max_pages = max_pages
        self.respect_robots = respect_robots

        # Sets to track visited URLs and downloaded images
        self.visited_pages: Set[str] = set()
        self.downloaded_images: Set[str] = set()
        self.image_hashes: Set[str] = set()  # MD5 digests of saved content, for de-duplication
        self.failed_urls: Set[str] = set()
        self.successful_downloads: List[Dict] = []

        # Session for connection pooling
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

        # Setup logging (basicConfig does nothing if the root logger already has handlers)
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
        self.logger = logging.getLogger(__name__)

        # Create output directory; parents=True lets nested paths such as
        # "out/site/images" work even when "out/site" does not exist yet
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Check robots.txt if requested
        self.robots_parser: Optional[RobotFileParser] = None
        if self.respect_robots:
            self._check_robots_txt()
    
    def _check_robots_txt(self):
        """Check and parse robots.txt file."""
        try:
            robots_url = f"{self.base_url}/robots.txt"
            self.robots_parser = RobotFileParser()
            self.robots_parser.set_url(robots_url)
            self.robots_parser.read()
            self.logger.info(f"Loaded robots.txt from {robots_url}")
        except Exception as e:
            self.logger.warning(f"Could not load robots.txt: {e}")
    
    def _can_fetch(self, url: str) -> bool:
        """Check if we can fetch the URL according to robots.txt."""
        if not self.robots_parser:
            return True
        return self.robots_parser.can_fetch('*', url)
    
    def _is_valid_image_url(self, url: str) -> bool:
        """Tell whether the path of ``url`` ends in a known image extension.

        Only the path component is inspected (case-insensitively), so query
        strings and fragments do not affect the result.
        """
        known_suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg', '.ico')
        lowered_path = urlparse(url).path.lower()
        # str.endswith accepts a tuple and is True if any suffix matches
        return lowered_path.endswith(known_suffixes)
    
    def _get_image_filename(self, url: str, content_type: Optional[str] = None) -> str:
        """Generate a filename for the image."""
        parsed_url = urlparse(url)
        filename = os.path.basename(unquote(parsed_url.path))
        
        # If no filename or extension, generate one
        if not filename or '.' not in filename:
            # Try to get extension from content type
            ext = '.jpg'  # default
            if content_type:
                if 'png' in content_type:
                    ext = '.png'
                elif 'gif' in content_type:
                    ext = '.gif'
                elif 'webp' in content_type:
                    ext = '.webp'
                elif 'svg' in content_type:
                    ext = '.svg'
            
            # Generate filename from URL hash
            url_hash = hashlib.md5(url.encode()).hexdigest()[:8]
            filename = f"image_{url_hash}{ext}"
        
        # Clean filename
        filename = re.sub(r'[^\w\-_\.]', '_', filename)
        return filename
    
    def _download_image(self, img_url: str) -> bool:
        """Download a single image into the output directory.

        Args:
            img_url: Absolute URL of the image to fetch.

        Returns:
            True if the image was saved, was already downloaded, or has the
            same content as an image saved earlier; False if the URL failed
            (now or previously) or the response is not an image.

        Any exception is logged and recorded in ``self.failed_urls`` rather
        than raised, so one bad URL never stops a scrape.
        """
        # Skip if already downloaded
        if img_url in self.downloaded_images:
            return True

        # Skip if already failed
        if img_url in self.failed_urls:
            return False

        try:
            # stream=True defers the body until the headers have been checked.
            # The context manager releases the pooled connection on every exit
            # path, including the early "not an image" return below.
            with self.session.get(img_url, timeout=15, stream=True) as response:
                response.raise_for_status()

                # Accept either an image content type or an image-looking URL
                content_type = response.headers.get('content-type', '').lower()
                if not (content_type.startswith('image/') or self._is_valid_image_url(img_url)):
                    self.logger.debug(f"Skipping non-image: {img_url}")
                    return False

                content = response.content

            # Check for duplicate content using hash
            content_hash = hashlib.md5(content).hexdigest()
            if content_hash in self.image_hashes:
                self.logger.debug(f"Skipping duplicate image: {img_url}")
                return True

            # Generate filename and save
            filename = self._get_image_filename(img_url, content_type)
            filepath = self.output_dir / filename

            # Handle filename conflicts by appending _1, _2, ... to the stem
            counter = 1
            original_stem = filepath.stem
            while filepath.exists():
                filepath = self.output_dir / f"{original_stem}_{counter}{filepath.suffix}"
                counter += 1

            with open(filepath, 'wb') as f:
                f.write(content)

            self.downloaded_images.add(img_url)
            self.image_hashes.add(content_hash)
            self.successful_downloads.append({
                'url': img_url,
                'filename': filepath.name,
                'size': len(content),
                'content_type': content_type
            })

            self.logger.info(f"✅ Downloaded: {filepath.name} ({len(content)} bytes) from {img_url}")
            return True

        except Exception as e:
            # Deliberately broad: network, HTTP and filesystem errors are all
            # treated as a failed download for this URL
            self.logger.error(f"❌ Failed to download {img_url}: {e}")
            self.failed_urls.add(img_url)
            return False
    
    def _extract_images_from_html(self, html_content: str, base_url: str) -> List[str]:
        """Extract image URLs from HTML content.

        Collects the ``src`` and common lazy-loading attributes of ``<img>``
        tags, plus ``background-image: url(...)`` values in inline ``style``
        attributes of div/section/header/span/a elements.

        Args:
            html_content: Markup of the page.
            base_url: URL of the page, used to resolve relative references.

        Returns:
            Absolute URLs without duplicates, in order of first appearance.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        found: Dict[str, None] = {}  # dict keys keep insertion order and de-duplicate

        def remember(raw_url: str) -> None:
            candidate = raw_url.strip()
            # data: URIs embed the image itself; there is nothing to download
            if not candidate or candidate.lower().startswith('data:'):
                return
            found[urljoin(base_url, candidate)] = None

        # Find img tags with multiple attributes
        for img in soup.find_all('img'):
            for attr in ['src', 'data-src', 'data-original', 'data-lazy-src']:
                src = img.get(attr)
                if src:
                    remember(src)

        # Find CSS background images.
        # The capture group excludes ')' so an unquoted url(...) stops at its
        # own closing parenthesis instead of running on to a later one.
        background_pattern = r'background-image:\s*url\(\s*["\']?([^"\')]+?)\s*["\']?\s*\)'
        for element in soup.find_all(['div', 'section', 'header', 'span', 'a'], style=True):
            style = element.get('style')
            if style and 'background-image' in style:
                for match in re.findall(background_pattern, style):
                    remember(match)

        return list(found)
    
    def _extract_images_from_css(self, css_content: str, base_url: str) -> List[str]:
        """Extract image URLs from CSS content.

        Every ``url(...)`` reference (quoted or unquoted) whose path ends in a
        known image extension is resolved against ``base_url``.

        Args:
            css_content: Text of the stylesheet.
            base_url: URL of the stylesheet, used to resolve relative references.

        Returns:
            Absolute image URLs without duplicates, in order of first appearance.
        """
        # One pattern covers background, background-image and any other
        # property. The capture group excludes ')' so an unquoted url(a.png)
        # ends at its own parenthesis rather than swallowing following rules.
        url_pattern = r'url\(\s*["\']?([^"\')]+?)\s*["\']?\s*\)'

        found: Dict[str, None] = {}  # dict keys keep insertion order and de-duplicate
        for match in re.findall(url_pattern, css_content, re.IGNORECASE):
            if self._is_valid_image_url(match):
                found[urljoin(base_url, match)] = None

        return list(found)
    
    def _try_common_image_paths(self) -> List[str]:
        """Probe well-known image directories and files under the base URL.

        Each candidate that robots.txt allows is requested once. A 200
        response with an HTML content type is treated as a directory listing
        and scanned for links to image files; any other 200 response is taken
        to be the image itself. Request failures are logged at debug level and
        skipped.

        Returns:
            Candidate image URLs without duplicates, in discovery order.
        """
        common_paths = [
            # Main images directory
            f"{self.base_url}/images/",
            f"{self.base_url}/img/",
            f"{self.base_url}/assets/",
            f"{self.base_url}/media/",
            f"{self.base_url}/photos/",
            f"{self.base_url}/gallery/",

            # Specific image files
            f"{self.base_url}/images/logo.png",
            f"{self.base_url}/images/logo.jpg",
            f"{self.base_url}/images/logo.gif",
            f"{self.base_url}/images/favicon.ico",
            f"{self.base_url}/favicon.ico",

            # Gallery images
            f"{self.base_url}/images/gallery/",
            f"{self.base_url}/gallery/images/",
            f"{self.base_url}/images/photos/",

            # Menu images
            f"{self.base_url}/images/menu/",
            f"{self.base_url}/menu/images/",

            # Page-specific images
            f"{self.base_url}/images/about/",
            f"{self.base_url}/images/private/",
            f"{self.base_url}/images/specials/",
            f"{self.base_url}/images/stampede/",
        ]

        image_urls: List[str] = []
        for path in common_paths:
            # Honor robots.txt like the page crawler does
            if not self._can_fetch(path):
                self.logger.debug(f"Robots.txt disallows fetching: {path}")
                continue

            try:
                response = self.session.get(path, timeout=10)
                if response.status_code == 200:
                    # If it's a directory listing, parse for images
                    if 'text/html' in response.headers.get('content-type', ''):
                        soup = BeautifulSoup(response.content, 'html.parser')
                        for link in soup.find_all('a', href=True):
                            href = link['href']
                            if self._is_valid_image_url(href):
                                image_urls.append(urljoin(path, href))
                    else:
                        # Direct image file
                        image_urls.append(path)
            except Exception as e:
                self.logger.debug(f"Failed to check {path}: {e}")

            # Keep the configured pause between consecutive requests
            time.sleep(self.delay)

        # Several listings can link to the same file; keep the first occurrence
        return list(dict.fromkeys(image_urls))
    
    def _generate_gallery_urls(self) -> List[str]:
        """Build guessed gallery image URLs from numbered filename patterns.

        Six path templates are each expanded with the numbers 1 through 50,
        giving 300 URLs grouped by template. Nothing is requested here; the
        URLs are only candidates.
        """
        root = self.base_url
        # "{{}}" leaves a literal "{}" placeholder for str.format below
        templates = (
            f"{root}/images/gallery/g{{}}.jpg",
            f"{root}/gallery/images/g{{}}.jpg",
            f"{root}/images/gallery/g-{{}}.jpg",
            f"{root}/gallery/images/g-{{}}.jpg",
            f"{root}/images/gallery/g{{}}.png",
            f"{root}/gallery/images/g{{}}.png",
        )

        return [
            template.format(number)
            for template in templates
            for number in range(1, 51)
        ]
    
    def _try_alternative_image_paths(self) -> List[str]:
        """List guessed URLs for commonly named images under the base URL.

        Covers logo, hero, banner and menu images in the site root and a few
        usual asset folders. No request is made here; the caller decides which
        of the candidates actually exist.
        """
        relative_paths = (
            # Logo variations
            '/logo.png',
            '/logo.jpg',
            '/logo.gif',
            '/images/logo.png',
            '/img/logo.png',
            '/assets/logo.png',

            # Hero and banner variations
            '/hero.jpg',
            '/hero.png',
            '/banner.jpg',
            '/banner.png',
            '/images/hero.jpg',
            '/images/banner.jpg',

            # Menu images
            '/menu.png',
            '/menu.jpg',
            '/images/menu.png',
            '/images/menu.jpg',
        )

        return [f"{self.base_url}{suffix}" for suffix in relative_paths]
    
    def _extract_images_from_page(self, url: str) -> List[str]:
        """Extract all image URLs referenced by a page.

        Sources scanned: the page's HTML, its inline ``<style>`` blocks, and
        every linked stylesheet. Results for each stylesheet URL are cached on
        the instance, so a stylesheet shared by many pages is downloaded once
        per scraper rather than once per page.

        Args:
            url: Absolute URL of the page.

        Returns:
            Image URLs without duplicates. An empty list if robots.txt
            disallows the page or the page cannot be fetched.
        """
        try:
            if not self._can_fetch(url):
                self.logger.warning(f"Robots.txt disallows fetching: {url}")
                return []

            response = self.session.get(url, timeout=15)
            response.raise_for_status()
            html = response.text

            collected: Set[str] = set()

            # Extract images from HTML
            collected.update(self._extract_images_from_html(html, url))

            soup = BeautifulSoup(html, 'html.parser')

            # Extract images from inline <style> blocks (relative to the page)
            for style_tag in soup.find_all('style'):
                collected.update(self._extract_images_from_css(style_tag.get_text(), url))

            # Created lazily so this method does not depend on __init__
            css_cache = getattr(self, '_css_image_cache', None)
            if css_cache is None:
                css_cache = {}
                self._css_image_cache = css_cache

            # Extract images from linked stylesheets
            for link in soup.find_all('link', rel='stylesheet'):
                href = link.get('href')
                if not href:
                    continue

                css_url = urljoin(url, href)
                if css_url not in css_cache:
                    try:
                        css_response = self.session.get(css_url, timeout=10)
                        if css_response.status_code == 200:
                            # Only successful fetches are cached, so a
                            # transient failure is retried from a later page
                            css_cache[css_url] = self._extract_images_from_css(
                                css_response.text, css_url)
                    except Exception as e:
                        self.logger.debug(f"Failed to fetch CSS {css_url}: {e}")

                collected.update(css_cache.get(css_url, ()))

            return list(collected)

        except Exception as e:
            self.logger.error(f"Failed to extract images from {url}: {e}")
            return []
    
    def _find_all_pages(self, start_url: str) -> List[str]:
        """Find pages of the website by breadth-first crawling.

        Starting at ``start_url``, follows ``<a href>`` links that stay on
        ``self.domain``. URL fragments are dropped, so ``/page`` and
        ``/page#top`` count as one page. Each URL is attempted at most once,
        including those that fail or that robots.txt disallows.

        Args:
            start_url: URL at which crawling begins.

        Returns:
            URLs of the successfully fetched pages in crawl order, at most
            ``self.max_pages`` of them when a limit is set.
        """
        # Links ending in these extensions are files, not pages to crawl
        skip_suffixes = (
            '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg', '.ico',
            '.pdf', '.zip', '.exe',
        )

        start_url = urldefrag(start_url).url
        pages_to_visit = deque([start_url])
        # Everything ever queued; gives O(1) duplicate checks and prevents
        # re-queueing URLs that failed or were blocked earlier
        seen = {start_url}
        all_pages: List[str] = []

        while pages_to_visit and (self.max_pages is None or len(all_pages) < self.max_pages):
            current_url = pages_to_visit.popleft()

            try:
                if not self._can_fetch(current_url):
                    continue

                response = self.session.get(current_url, timeout=15)
                response.raise_for_status()

                all_pages.append(current_url)
                self.logger.info(f"Found page: {current_url}")

                # Parse page for more links
                soup = BeautifulSoup(response.content, 'html.parser')

                for link in soup.find_all('a', href=True):
                    full_url = urldefrag(urljoin(current_url, link['href'])).url
                    parsed_url = urlparse(full_url)

                    # Only follow links within the same domain, once each
                    if parsed_url.netloc != self.domain or full_url in seen:
                        continue

                    # Skip common non-page URLs
                    if parsed_url.path.lower().endswith(skip_suffixes):
                        continue

                    seen.add(full_url)
                    pages_to_visit.append(full_url)

                time.sleep(self.delay)

            except Exception as e:
                self.logger.error(f"Failed to crawl {current_url}: {e}")

        return all_pages
    
    def scrape(self) -> dict:
        """Run the full scrape: discover image URLs, download them, write a report.

        Candidate URLs come from four sources: crawled pages, probes of common
        image directories, generated gallery filenames and guessed
        logo/hero/banner/menu paths. Candidates are downloaded in sorted order
        so that filename collision suffixes and the report are reproducible
        between runs.

        Returns:
            The report that is also written to ``scraping_report.json`` in the
            output directory, with keys ``scraping_info``,
            ``downloaded_images`` and ``failed_urls``. The
            ``successful_downloads`` figure counts every URL for which
            ``_download_image`` returned True.
        """
        self.logger.info(f"🚀 Starting enhanced image scraping for: {self.base_url}")
        self.logger.info(f"📁 Output directory: {self.output_dir}")

        all_image_urls: Set[str] = set()

        # Method 1: Crawl pages for image references
        self.logger.info("📄 Method 1: Crawling pages for image references...")
        all_pages = self._find_all_pages(self.base_url)
        total_pages = len(all_pages)
        self.logger.info(f"Found {total_pages} pages to scrape")

        for i, page_url in enumerate(all_pages, 1):
            self.logger.info(f"Processing page {i}/{total_pages}: {page_url}")
            page_images = self._extract_images_from_page(page_url)
            all_image_urls.update(page_images)
            self.logger.info(f"Found {len(page_images)} images on this page")
            time.sleep(self.delay)

        # Method 2: Try common image paths
        self.logger.info("🔍 Method 2: Trying common image paths...")
        common_images = self._try_common_image_paths()
        all_image_urls.update(common_images)
        self.logger.info(f"Found {len(common_images)} images from common paths")

        # Method 3: Generate gallery URLs
        self.logger.info("🖼️ Method 3: Generating gallery image URLs...")
        gallery_images = self._generate_gallery_urls()
        all_image_urls.update(gallery_images)
        self.logger.info(f"Generated {len(gallery_images)} potential gallery URLs")

        # Method 4: Try alternative image paths
        self.logger.info("🔄 Method 4: Trying alternative image paths...")
        alternative_paths = self._try_alternative_image_paths()
        all_image_urls.update(alternative_paths)
        self.logger.info(f"Added {len(alternative_paths)} alternative paths")

        # Download all found images
        total_urls = len(all_image_urls)
        self.logger.info(f"🎯 Total unique image URLs to try: {total_urls}")

        # Set iteration order for strings varies between interpreter runs;
        # sorting makes the download order deterministic
        successful_downloads = 0
        for i, img_url in enumerate(sorted(all_image_urls), 1):
            self.logger.info(f"📥 Downloading {i}/{total_urls}: {img_url}")
            if self._download_image(img_url):
                successful_downloads += 1
            time.sleep(self.delay)

        # Generate comprehensive report
        results = {
            'scraping_info': {
                'website_url': self.base_url,
                'scraping_date': time.strftime('%Y-%m-%d %H:%M:%S'),
                'pages_crawled': total_pages,
                'total_urls_tried': total_urls,
                'successful_downloads': successful_downloads,
                'failed_downloads': len(self.failed_urls),
                'unique_images': len(self.image_hashes),
                'output_directory': str(self.output_dir)
            },
            'downloaded_images': self.successful_downloads,
            'failed_urls': sorted(self.failed_urls)
        }

        # Save detailed report
        report_file = self.output_dir / 'scraping_report.json'
        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

        self.logger.info("✅ Scraping completed!")
        self.logger.info(f"📊 Total URLs tried: {total_urls}")
        self.logger.info(f"✅ Successful downloads: {successful_downloads}")
        self.logger.info(f"❌ Failed downloads: {len(self.failed_urls)}")
        self.logger.info(f"📁 Report saved to: {report_file}")

        return results

def main():
    """Command-line entry point: parse arguments, run the scraper, print a summary.

    Exits with status 1 if scraping is interrupted or fails, and with
    argparse's usage error if ``--delay`` is negative.
    """
    parser = argparse.ArgumentParser(description='Enhanced Image Scraper - Download all images from a website using aggressive methods')
    parser.add_argument('url', help='Base URL of the website to scrape')
    parser.add_argument('-o', '--output', default='scraped_images',
                       help='Output directory for images (default: scraped_images)')
    parser.add_argument('-d', '--delay', type=float, default=1.0,
                       help='Delay between requests in seconds (default: 1.0)')
    parser.add_argument('-m', '--max-pages', type=int, default=None,
                       help='Maximum number of pages to crawl (default: unlimited)')
    parser.add_argument('--ignore-robots', action='store_true',
                       help='Ignore robots.txt restrictions')
    parser.add_argument('-v', '--verbose', action='store_true',
                       help='Enable verbose logging')

    args = parser.parse_args()

    # time.sleep() rejects negative values, so fail early with a clear message
    if args.delay < 0:
        parser.error('--delay must be zero or greater')

    # Validate URL
    if not args.url.startswith(('http://', 'https://')):
        args.url = 'https://' + args.url

    try:
        # Create scraper and run
        scraper = EnhancedImageScraper(
            base_url=args.url,
            output_dir=args.output,
            delay=args.delay,
            max_pages=args.max_pages,
            respect_robots=not args.ignore_robots
        )

        # The constructor configures root logging at INFO level, so the level
        # has to be raised afterwards for --verbose to have any effect
        if args.verbose:
            logging.getLogger().setLevel(logging.DEBUG)

        results = scraper.scrape()

        print("\n" + "="*60)
        print("🎯 ENHANCED IMAGE SCRAPING COMPLETED!")
        print("="*60)
        print(f"🌐 Website: {results['scraping_info']['website_url']}")
        print(f"📅 Date: {results['scraping_info']['scraping_date']}")
        print(f"📄 Pages crawled: {results['scraping_info']['pages_crawled']}")
        print(f"🔍 Total URLs tried: {results['scraping_info']['total_urls_tried']}")
        print(f"✅ Successful downloads: {results['scraping_info']['successful_downloads']}")
        print(f"❌ Failed downloads: {results['scraping_info']['failed_downloads']}")
        print(f"📁 Output directory: {results['scraping_info']['output_directory']}")

        if results['downloaded_images']:
            print("\n📸 Downloaded Images:")
            for img in results['downloaded_images']:
                print(f"  • {img['filename']} ({img['size']} bytes) - {img['url']}")

    except KeyboardInterrupt:
        print("\n⚠️ Scraping interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"❌ Error: {e}")
        sys.exit(1)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()