#!/usr/bin/env python3
"""
Website Navigation Crawler
Crawls an entire website and creates a comprehensive navigation structure in JSON format.
"""

import requests
from urllib.parse import urljoin, urlparse, urlunparse
from bs4 import BeautifulSoup
import json
import time
import re
from collections import defaultdict, deque
import argparse
import logging
from typing import Dict, List, Set, Optional, Any
import os

# Configure the root logger once at import time: INFO and above, each record
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by the crawler and by main() below.
logger = logging.getLogger(__name__)

class WebsiteCrawler:
    def __init__(self, base_url: str, max_depth: int = 3, delay: float = 1.0, 
                 max_pages: int = 100, output_file: str = "navigation_structure.json"):
        """
        Initialize the website crawler.
        
        Args:
            base_url: The starting URL for crawling
            max_depth: Maximum depth to crawl (0 = only base URL)
            delay: Delay between requests in seconds
            max_pages: Maximum number of pages to crawl
            output_file: Output JSON file name
        """
        self.base_url = base_url.rstrip('/')
        self.domain = urlparse(base_url).netloc
        self.max_depth = max_depth
        self.delay = delay
        self.max_pages = max_pages
        self.output_file = output_file
        
        # Crawling state
        self.visited_urls: Set[str] = set()
        self.url_structure: Dict[str, Any] = {}
        self.navigation_links: Dict[str, List[str]] = defaultdict(list)
        self.page_count = 0
        
        # Session for requests
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        
    def is_valid_url(self, url: str) -> bool:
        """Check if URL is valid and should be crawled."""
        try:
            parsed = urlparse(url)
            
            # Must be HTTP/HTTPS
            if parsed.scheme not in ['http', 'https']:
                return False
                
            # Must be same domain (unless external links are allowed)
            if parsed.netloc != self.domain:
                return False
                
            # Skip common non-content URLs
            skip_patterns = [
                r'\.(pdf|doc|docx|xls|xlsx|ppt|pptx|zip|rar|tar|gz)$',
                r'\.(jpg|jpeg|png|gif|svg|ico|css|js|woff|woff2|ttf|eot)$',
                r'#.*$',  # Skip anchors
                r'mailto:',  # Skip email links
                r'tel:',  # Skip phone links
                r'javascript:',  # Skip JavaScript
            ]
            
            for pattern in skip_patterns:
                if re.search(pattern, url, re.IGNORECASE):
                    return False
                    
            return True
            
        except Exception:
            return False
    
    def normalize_url(self, url: str) -> str:
        """Normalize URL by removing fragments and query parameters."""
        parsed = urlparse(url)
        return urlunparse((
            parsed.scheme,
            parsed.netloc,
            parsed.path,
            parsed.params,
            '',  # Remove query
            ''   # Remove fragment
        )).rstrip('/')
    
    def extract_page_info(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
        """Extract comprehensive page information."""
        page_info = {
            'url': url,
            'title': '',
            'description': '',
            'keywords': '',
            'h1': [],
            'h2': [],
            'h3': [],
            'navigation_links': [],
            'content_links': [],
            'external_links': [],
            'images': [],
            'forms': [],
            'meta_tags': {},
            'word_count': 0,
            'last_modified': None
        }
        
        # Title
        title_tag = soup.find('title')
        if title_tag:
            page_info['title'] = title_tag.get_text(strip=True)
        
        # Meta tags
        for meta in soup.find_all('meta'):
            name = meta.get('name', meta.get('property', ''))
            content = meta.get('content', '')
            if name and content:
                page_info['meta_tags'][name] = content
                if name.lower() == 'description':
                    page_info['description'] = content
                elif name.lower() == 'keywords':
                    page_info['keywords'] = content
        
        # Headings
        for tag in ['h1', 'h2', 'h3']:
            headings = soup.find_all(tag)
            page_info[tag] = [h.get_text(strip=True) for h in headings if h.get_text(strip=True)]
        
        # Links
        for link in soup.find_all('a', href=True):
            href = link.get('href')
            link_text = link.get_text(strip=True)
            
            if not href:
                continue
                
            absolute_url = urljoin(url, href)
            normalized_url = self.normalize_url(absolute_url)
            
            link_info = {
                'url': normalized_url,
                'text': link_text,
                'title': link.get('title', ''),
                'is_external': urlparse(absolute_url).netloc != self.domain
            }
            
            # Categorize links
            if self.is_navigation_link(link):
                page_info['navigation_links'].append(link_info)
            elif link_info['is_external']:
                page_info['external_links'].append(link_info)
            else:
                page_info['content_links'].append(link_info)
        
        # Images
        for img in soup.find_all('img'):
            src = img.get('src', '')
            if src:
                absolute_src = urljoin(url, src)
                page_info['images'].append({
                    'src': absolute_src,
                    'alt': img.get('alt', ''),
                    'title': img.get('title', '')
                })
        
        # Forms
        for form in soup.find_all('form'):
            form_info = {
                'action': form.get('action', ''),
                'method': form.get('method', 'GET'),
                'inputs': []
            }
            
            for input_tag in form.find_all(['input', 'textarea', 'select']):
                input_info = {
                    'type': input_tag.get('type', input_tag.name),
                    'name': input_tag.get('name', ''),
                    'id': input_tag.get('id', ''),
                    'required': input_tag.get('required') is not None
                }
                form_info['inputs'].append(input_info)
            
            page_info['forms'].append(form_info)
        
        # Word count (approximate)
        text_content = soup.get_text()
        page_info['word_count'] = len(text_content.split())
        
        return page_info
    
    def is_navigation_link(self, link_tag) -> bool:
        """Determine if a link is likely a navigation link."""
        # Check for common navigation indicators
        nav_indicators = [
            'nav', 'menu', 'navigation', 'header', 'footer',
            'breadcrumb', 'sidebar', 'main-menu', 'top-menu'
        ]
        
        # Check parent elements
        parent = link_tag.parent
        for _ in range(5):  # Check up to 5 levels up
            if parent:
                parent_classes = parent.get('class', [])
                parent_id = parent.get('id', '')
                
                for indicator in nav_indicators:
                    if (indicator in ' '.join(parent_classes).lower() or 
                        indicator in parent_id.lower()):
                        return True
                
                parent = parent.parent
        
        # Check link text for common navigation words
        link_text = link_tag.get_text(strip=True).lower()
        nav_words = ['home', 'about', 'contact', 'services', 'products', 
                    'blog', 'news', 'help', 'support', 'login', 'register']
        
        return any(word in link_text for word in nav_words)
    
    def crawl_page(self, url: str, depth: int = 0) -> Optional[Dict[str, Any]]:
        """Crawl a single page and extract information."""
        if depth > self.max_depth or self.page_count >= self.max_pages:
            return None
            
        normalized_url = self.normalize_url(url)
        
        if normalized_url in self.visited_urls:
            return None
            
        logger.info(f"Crawling: {normalized_url} (depth: {depth})")
        
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            
            # Check if it's HTML content
            content_type = response.headers.get('content-type', '').lower()
            if 'text/html' not in content_type:
                logger.info(f"Skipping non-HTML content: {content_type}")
                return None
            
            soup = BeautifulSoup(response.content, 'html.parser')
            page_info = self.extract_page_info(soup, normalized_url)
            
            self.visited_urls.add(normalized_url)
            self.page_count += 1
            
            # Store page info
            self.url_structure[normalized_url] = page_info
            
            # Follow links if not at max depth
            if depth < self.max_depth:
                for link_info in page_info['content_links']:
                    link_url = link_info['url']
                    if (self.is_valid_url(link_url) and 
                        link_url not in self.visited_urls):
                        time.sleep(self.delay)  # Be respectful
                        self.crawl_page(link_url, depth + 1)
            
            return page_info
            
        except requests.RequestException as e:
            logger.error(f"Error crawling {url}: {e}")
            return None
        except Exception as e:
            logger.error(f"Unexpected error crawling {url}: {e}")
            return None
    
    def build_navigation_structure(self) -> Dict[str, Any]:
        """Build a comprehensive navigation structure from crawled data."""
        navigation = {
            'site_info': {
                'base_url': self.base_url,
                'domain': self.domain,
                'total_pages': len(self.url_structure),
                'crawl_date': time.strftime('%Y-%m-%d %H:%M:%S'),
                'crawl_settings': {
                    'max_depth': self.max_depth,
                    'max_pages': self.max_pages,
                    'delay': self.delay
                }
            },
            'pages': {},
            'navigation': {
                'main_menu': [],
                'footer_links': [],
                'breadcrumbs': {},
                'sitemap': []
            },
            'statistics': {
                'total_links': 0,
                'internal_links': 0,
                'external_links': 0,
                'images': 0,
                'forms': 0,
                'avg_word_count': 0
            }
        }
        
        # Process each page
        total_links = 0
        total_images = 0
        total_forms = 0
        total_words = 0
        
        for url, page_info in self.url_structure.items():
            # Add page to structure
            navigation['pages'][url] = {
                'title': page_info['title'],
                'description': page_info['description'],
                'h1': page_info['h1'],
                'h2': page_info['h2'],
                'navigation_links': page_info['navigation_links'],
                'content_links': page_info['content_links'],
                'external_links': page_info['external_links'],
                'images': len(page_info['images']),
                'forms': len(page_info['forms']),
                'word_count': page_info['word_count'],
                'meta_tags': page_info['meta_tags']
            }
            
            # Add to sitemap
            navigation['navigation']['sitemap'].append({
                'url': url,
                'title': page_info['title'],
                'description': page_info['description']
            })
            
            # Update statistics
            total_links += len(page_info['navigation_links']) + len(page_info['content_links'])
            total_images += len(page_info['images'])
            total_forms += len(page_info['forms'])
            total_words += page_info['word_count']
        
        # Calculate averages
        if self.url_structure:
            navigation['statistics']['avg_word_count'] = total_words // len(self.url_structure)
        
        navigation['statistics']['total_links'] = total_links
        navigation['statistics']['images'] = total_images
        navigation['statistics']['forms'] = total_forms
        
        # Build main navigation from most common navigation links
        nav_links = defaultdict(int)
        for page_info in self.url_structure.values():
            for link in page_info['navigation_links']:
                nav_links[link['url']] += 1
        
        # Sort by frequency and add to main menu
        sorted_nav = sorted(nav_links.items(), key=lambda x: x[1], reverse=True)
        navigation['navigation']['main_menu'] = [
            {
                'url': url,
                'frequency': count,
                'title': self.url_structure.get(url, {}).get('title', '')
            }
            for url, count in sorted_nav[:10]  # Top 10 most common
        ]
        
        return navigation
    
    def crawl(self) -> Dict[str, Any]:
        """Main crawling method."""
        logger.info(f"Starting crawl of {self.base_url}")
        logger.info(f"Max depth: {self.max_depth}, Max pages: {self.max_pages}")
        
        # Start crawling from base URL
        self.crawl_page(self.base_url, 0)
        
        # Build navigation structure
        navigation_structure = self.build_navigation_structure()
        
        # Save to JSON file
        with open(self.output_file, 'w', encoding='utf-8') as f:
            json.dump(navigation_structure, f, indent=2, ensure_ascii=False)
        
        logger.info(f"Crawl completed. Found {len(self.url_structure)} pages.")
        logger.info(f"Results saved to {self.output_file}")
        
        return navigation_structure

def main():
    """Command-line entry point: parse arguments, run the crawl, print a summary.

    Raises:
        SystemExit: with status 1 when the crawl fails and 130 when the user
            interrupts it, so shells and schedulers can tell an unsuccessful
            run from a successful one. (``argparse`` also exits on invalid
            arguments.)
    """
    parser = argparse.ArgumentParser(description='Crawl website and create navigation structure')
    parser.add_argument('url', help='Base URL to crawl')
    parser.add_argument('--max-depth', type=int, default=3, help='Maximum crawl depth (default: 3)')
    parser.add_argument('--max-pages', type=int, default=100, help='Maximum pages to crawl (default: 100)')
    parser.add_argument('--delay', type=float, default=1.0, help='Delay between requests in seconds (default: 1.0)')
    parser.add_argument('--output', default='navigation_structure.json', help='Output JSON file (default: navigation_structure.json)')

    args = parser.parse_args()

    # Create crawler and start crawling
    crawler = WebsiteCrawler(
        base_url=args.url,
        max_depth=args.max_depth,
        max_pages=args.max_pages,
        delay=args.delay,
        output_file=args.output
    )

    try:
        navigation_structure = crawler.crawl()
        statistics = navigation_structure['statistics']

        # Print summary
        print("\n" + "="*50)
        print("CRAWL SUMMARY")
        print("="*50)
        print(f"Base URL: {args.url}")
        print(f"Pages crawled: {len(crawler.url_structure)}")
        print(f"Total links found: {statistics['total_links']}")
        print(f"Total images: {statistics['images']}")
        print(f"Total forms: {statistics['forms']}")
        print(f"Average word count: {statistics['avg_word_count']}")
        print(f"Output file: {args.output}")
        print("="*50)

    except KeyboardInterrupt:
        logger.info("Crawl interrupted by user")
        # 128 + SIGINT(2): the conventional status for a Ctrl-C'd process.
        raise SystemExit(130)
    except Exception as e:
        logger.error(f"Crawl failed: {e}")
        # Report the failure to the caller instead of exiting with status 0.
        raise SystemExit(1)

# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()