#!/usr/bin/env python3
"""
Web scraping script to extract content from smart-const.com
Extracts texts and images from all pages
"""

import requests
from bs4 import BeautifulSoup
import os
import re
import json
import hashlib
from urllib.parse import urljoin, urlparse
import time
from pathlib import Path

class SmartConstScraper:
    def __init__(self, base_url="https://smart-const.com", output_dir="scraped_content"):
        self.base_url = base_url
        self.output_dir = Path(output_dir)
        self.images_dir = self.output_dir / "images"
        self.data_dir = self.output_dir / "data"
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        
        # Create directories
        self.images_dir.mkdir(parents=True, exist_ok=True)
        self.data_dir.mkdir(parents=True, exist_ok=True)
        
        self.scraped_pages = {}
        self.image_urls = set()
        
    def get_page(self, url, retries=3):
        """Fetch a page, retrying a few times before giving up"""
        for attempt in range(1, retries + 1):
            try:
                response = self.session.get(url, timeout=10)
                response.raise_for_status()
                return response
            except requests.RequestException as e:
                print(f"Error fetching {url} (attempt {attempt}/{retries}): {e}")
                time.sleep(attempt)  # simple linear backoff before retrying
        return None
    
    def extract_text_content(self, soup):
        """Extract meaningful text content from a page"""
        # Remove script/style elements; <meta> tags are kept so the
        # description can still be read below
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        
        # Extract main content areas
        content = {
            'title': '',
            'headings': [],
            'paragraphs': [],
            'lists': [],
            'meta_description': ''
        }
        
        # Title
        title_tag = soup.find('title')
        if title_tag:
            content['title'] = title_tag.get_text(strip=True)
        
        # Meta description
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc:
            content['meta_description'] = meta_desc.get('content', '')
        
        # Headings
        for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            headings = soup.find_all(tag)
            for heading in headings:
                text = heading.get_text(strip=True)
                if text:
                    content['headings'].append({
                        'level': tag,
                        'text': text
                    })
        
        # Paragraphs
        paragraphs = soup.find_all('p')
        for p in paragraphs:
            text = p.get_text(strip=True)
            if text and len(text) > 20:  # Filter out very short text
                content['paragraphs'].append(text)
        
        # Lists
        lists = soup.find_all(['ul', 'ol'])
        for lst in lists:
            items = []
            for li in lst.find_all('li', recursive=False):
                text = li.get_text(strip=True)
                if text:
                    items.append(text)
            if items:
                content['lists'].append(items)
        
        return content
    
    def extract_images(self, soup, page_url):
        """Extract all image URLs from page"""
        images = []
        
        # Find all img tags
        img_tags = soup.find_all('img')
        for img in img_tags:
            src = img.get('src') or img.get('data-src') or img.get('data-lazy-src')
            if src:
                # Convert relative URLs to absolute
                full_url = urljoin(page_url, src)
                # Skip data URIs and assets that look like icons
                if not full_url.startswith('data:') and 'icon' not in full_url.lower():
                    images.append({
                        'url': full_url,
                        'alt': img.get('alt', ''),
                        'title': img.get('title', '')
                    })
                    self.image_urls.add(full_url)
        
        # Find background images declared in inline style attributes
        elements_with_bg = soup.find_all(attrs={'style': True})
        for elem in elements_with_bg:
            style = elem.get('style', '')
            if 'url(' in style:
                # Extract every url(...) value from the style declaration
                urls = re.findall(r'url\(["\']?([^"\')]+)["\']?\)', style)
                for url in urls:
                    full_url = urljoin(page_url, url)
                    if not full_url.startswith('data:'):
                        images.append({
                            'url': full_url,
                            'alt': '',
                            'title': 'Background image'
                        })
                        self.image_urls.add(full_url)
        
        return images
    
    def download_image(self, img_url, filename):
        """Download an image"""
        try:
            response = self.session.get(img_url, timeout=10, stream=True)
            response.raise_for_status()
            
            filepath = self.images_dir / filename
            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            return str(filepath)
        except Exception as e:
            print(f"Error downloading {img_url}: {e}")
            return None
    
    def get_filename_from_url(self, url):
        """Derive a stable local filename from an image URL"""
        parsed = urlparse(url)
        filename = os.path.basename(parsed.path)
        if not filename or '.' not in filename:
            # Deterministic fallback name so re-runs reuse the same file
            digest = hashlib.md5(url.encode('utf-8')).hexdigest()[:10]
            filename = f"image_{digest}.jpg"
        # Sanitize filename
        filename = "".join(c for c in filename if c.isalnum() or c in ".-_")
        return filename
    
    def scrape_page(self, url):
        """Scrape a single page"""
        print(f"Scraping: {url}")
        
        response = self.get_page(url)
        if not response:
            return None
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract content
        text_content = self.extract_text_content(soup)
        images = self.extract_images(soup, url)
        
        page_data = {
            'url': url,
            'text': text_content,
            'images': images,
            # Store the HTML as served; extract_text_content() strips
            # scripts/styles from the parsed soup in place
            'html': response.text
        }
        
        self.scraped_pages[url] = page_data
        
        # Download images
        for img in images:
            filename = self.get_filename_from_url(img['url'])
            local_path = self.download_image(img['url'], filename)
            if local_path:
                img['local_path'] = local_path
        
        return page_data
    
    def find_all_pages(self):
        """Find all pages on the site"""
        pages = set()
        
        # Start with homepage
        response = self.get_page(self.base_url)
        if not response:
            return pages
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find all internal links
        for link in soup.find_all('a', href=True):
            href = link['href']
            full_url = urljoin(self.base_url, href)
            
            # Only include pages from the same domain as the base URL
            parsed = urlparse(full_url)
            if parsed.netloc != urlparse(self.base_url).netloc:
                continue
            # Drop fragments and query params for deduplication
            clean_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
            if not parsed.path.lower().endswith(('.pdf', '.jpg', '.png', '.gif', '.css', '.js')):
                pages.add(clean_url)
        
        return pages
    
    def scrape_all(self):
        """Scrape all pages"""
        print("Finding all pages...")
        pages = self.find_all_pages()
        print(f"Found {len(pages)} pages to scrape")
        
        # Scrape each page
        for i, page_url in enumerate(pages, 1):
            print(f"\n[{i}/{len(pages)}] Processing: {page_url}")
            self.scrape_page(page_url)
            time.sleep(1)  # Be polite
        
        # Save all data
        self.save_data()
    
    def save_data(self):
        """Save scraped data to JSON files"""
        # Save pages data (without HTML to reduce size)
        pages_summary = {}
        for url, data in self.scraped_pages.items():
            pages_summary[url] = {
                'url': data['url'],
                'text': data['text'],
                'images': [
                    {
                        'url': img['url'],
                        'alt': img.get('alt', ''),
                        'local_path': img.get('local_path', '')
                    }
                    for img in data['images']
                ]
            }
        
        # Save to JSON
        with open(self.data_dir / 'scraped_pages.json', 'w', encoding='utf-8') as f:
            json.dump(pages_summary, f, indent=2, ensure_ascii=False)
        
        # Save image URLs list
        with open(self.data_dir / 'image_urls.txt', 'w', encoding='utf-8') as f:
            for url in sorted(self.image_urls):
                f.write(f"{url}\n")
        
        print(f"\nScraping complete!")
        print(f"Pages scraped: {len(self.scraped_pages)}")
        print(f"Images found: {len(self.image_urls)}")
        print(f"Data saved to: {self.data_dir}")

if __name__ == "__main__":
    scraper = SmartConstScraper()
    scraper.scrape_all()
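
# A minimal usage sketch (the constructor arguments are the ones defined in
# __init__ above; the "scraped_content_custom" directory name is hypothetical):
#
#     scraper = SmartConstScraper(
#         base_url="https://smart-const.com",
#         output_dir="scraped_content_custom",
#     )
#     scraper.scrape_all()
#
#     # The per-page summary written by save_data() can then be reloaded:
#     with open("scraped_content_custom/data/scraped_pages.json", encoding="utf-8") as f:
#         pages = json.load(f)
#     print(f"{len(pages)} pages in the dump")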
