#!/usr/bin/env python3
"""
Enhanced scraping script to get additional pages and deeper content
"""

import hashlib
import json
import os
import re
import time
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

class EnhancedScraper:
    def __init__(self, base_url="https://smart-const.com", output_dir="scraped_content"):
        self.base_url = base_url
        self.output_dir = Path(output_dir)
        self.images_dir = self.output_dir / "images"
        self.data_dir = self.output_dir / "data"
        self.session = requests.Session()
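        # Present a browser-like User-Agent; some sites block the default requests UA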
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        
        # Create directories
        self.images_dir.mkdir(parents=True, exist_ok=True)
        self.data_dir.mkdir(parents=True, exist_ok=True)
        
        self.scraped_pages = {}
        self.image_urls = set()
        self.visited_urls = set()
        
    def get_page(self, url):
        """Fetch a page, retrying once on failure"""
        for attempt in range(2):
            try:
                response = self.session.get(url, timeout=10)
                response.raise_for_status()
                return response
            except requests.RequestException as e:
                print(f"Error fetching {url} (attempt {attempt + 1}): {e}")
                time.sleep(1)
        return None
    
    def extract_text_content(self, soup):
        """Extract meaningful text content from page"""
        for element in soup(["script", "style", "meta", "link"]):
            element.decompose()
        
        content = {
            'title': '',
            'headings': [],
            'paragraphs': [],
            'lists': [],
            'meta_description': ''
        }
        
        title_tag = soup.find('title')
        if title_tag:
            content['title'] = title_tag.get_text(strip=True)
        
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc:
            content['meta_description'] = meta_desc.get('content', '')
        
        for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            headings = soup.find_all(tag)
            for heading in headings:
                text = heading.get_text(strip=True)
                if text:
                    content['headings'].append({
                        'level': tag,
                        'text': text
                    })
        
        paragraphs = soup.find_all('p')
        for p in paragraphs:
            text = p.get_text(strip=True)
            if text and len(text) > 20:
                content['paragraphs'].append(text)
        
        lists = soup.find_all(['ul', 'ol'])
        for lst in lists:
            items = []
            for li in lst.find_all('li', recursive=False):
                text = li.get_text(strip=True)
                if text:
                    items.append(text)
            if items:
                content['lists'].append(items)
        
        return content
    
    def extract_images(self, soup, page_url):
        """Extract all image URLs from page"""
        images = []
        
        img_tags = soup.find_all('img')
        for img in img_tags:
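            # Prefer src, falling back to common lazy-loading attributes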
            src = img.get('src') or img.get('data-src') or img.get('data-lazy-src')
            if src:
                full_url = urljoin(page_url, src)
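                # Ignore inline data URIs and likely icon assets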
                if not full_url.startswith('data:') and 'icon' not in full_url.lower():
                    images.append({
                        'url': full_url,
                        'alt': img.get('alt', ''),
                        'title': img.get('title', '')
                    })
                    self.image_urls.add(full_url)
        
        elements_with_bg = soup.find_all(attrs={'style': True})
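        # Scan inline style attributes for background-image URLs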
        for elem in elements_with_bg:
            style = elem.get('style', '')
            if 'background-image' in style or 'url(' in style:
                urls = re.findall(r'url\(["\']?([^"\']+)["\']?\)', style)
                for url in urls:
                    full_url = urljoin(page_url, url)
                    if not full_url.startswith('data:'):
                        images.append({
                            'url': full_url,
                            'alt': '',
                            'title': 'Background image'
                        })
                        self.image_urls.add(full_url)
        
        return images
    
    def download_image(self, img_url, filename):
        """Download an image"""
        try:
            response = self.session.get(img_url, timeout=10, stream=True)
            response.raise_for_status()
            
            filepath = self.images_dir / filename
            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            return str(filepath)
        except Exception as e:
            print(f"Error downloading {img_url}: {e}")
            return None
    
    def get_filename_from_url(self, url):
        """Extract filename from URL"""
        parsed = urlparse(url)
        filename = os.path.basename(parsed.path)
        if not filename or '.' not in filename:
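            # No usable filename in the URL path; derive one from a hash of the URL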
            filename = f"image_{hash(url) % 100000}.jpg"
        filename = "".join(c for c in filename if c.isalnum() or c in ".-_")
        return filename
    
    def scrape_page(self, url):
        """Scrape a single page"""
        if url in self.visited_urls:
            return None
            
        print(f"Scraping: {url}")
        self.visited_urls.add(url)
        
        response = self.get_page(url)
        if not response:
            return None
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        text_content = self.extract_text_content(soup)
        images = self.extract_images(soup, url)
        
        page_data = {
            'url': url,
            'text': text_content,
            'images': images,
            'html': str(soup)
        }
        
        self.scraped_pages[url] = page_data
        
        for img in images:
            filename = self.get_filename_from_url(img['url'])
            local_path = self.download_image(img['url'], filename)
            if local_path:
                img['local_path'] = local_path
        
        return page_data
    
    def find_all_pages(self, max_depth=2):
        """Find all pages on the site with depth control"""
        pages = {self.base_url}
        to_visit = {self.base_url}
        visited = set()
        depth = 0
        
        while to_visit and depth < max_depth:
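            # Process this depth level; newly found links are queued in to_visit for the next pass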
            current_level = to_visit.copy()
            to_visit.clear()
            
            for url in current_level:
                if url in visited:
                    continue
                visited.add(url)
                
                response = self.get_page(url)
                if not response:
                    continue
                
                soup = BeautifulSoup(response.content, 'html.parser')
                
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    full_url = urljoin(self.base_url, href)
                    
                    if urlparse(full_url).netloc == urlparse(self.base_url).netloc:
                        parsed = urlparse(full_url)
                        clean_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
                        
                        skip_exts = ('.pdf', '.jpg', '.png', '.gif', '.css', '.js', '.zip', '.doc')
                        skip_parts = ('wp-content', 'wp-admin', 'wp-json', 'feed', 'xmlrpc')
                        if (clean_url not in visited and
                            clean_url not in pages and
                            not any(ext in clean_url for ext in skip_exts) and
                            not any(part in clean_url for part in skip_parts)):
                            pages.add(clean_url)
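                            # Queue the URL for the next crawl level unless max depth is reached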
                            if depth < max_depth - 1:
                                to_visit.add(clean_url)
            
            depth += 1
        
        return pages
    
    def scrape_all(self, additional_urls=None):
        """Scrape all pages"""
        print("Finding all pages...")
        pages = self.find_all_pages(max_depth=2)
        
        # Add additional specific URLs if provided
        if additional_urls:
            for url in additional_urls:
                full_url = urljoin(self.base_url, url)
                pages.add(full_url)
        
        print(f"Found {len(pages)} pages to scrape")
        
        # Load existing scraped data to avoid duplicates
        existing_file = self.data_dir / 'scraped_pages.json'
        if existing_file.exists():
            with open(existing_file, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
                self.scraped_pages.update(existing_data)
                print(f"Loaded {len(existing_data)} existing pages")
        
        # Scrape each page
        new_pages = [p for p in pages if p not in self.scraped_pages]
        print(f"Scraping {len(new_pages)} new pages...")
        
        for i, page_url in enumerate(new_pages, 1):
            print(f"\n[{i}/{len(new_pages)}] Processing: {page_url}")
            self.scrape_page(page_url)
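            # Throttle requests so we don't hammer the server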
            time.sleep(1)
        
        # Save all data
        self.save_data()
    
    def save_data(self):
        """Save scraped data to JSON files"""
        pages_summary = {}
        for url, data in self.scraped_pages.items():
            pages_summary[url] = {
                'url': data['url'],
                'text': data['text'],
                'images': [
                    {
                        'url': img['url'],
                        'alt': img.get('alt', ''),
                        'local_path': img.get('local_path', '')
                    }
                    for img in data['images']
                ]
            }
        
        with open(self.data_dir / 'scraped_pages.json', 'w', encoding='utf-8') as f:
            json.dump(pages_summary, f, indent=2, ensure_ascii=False)
        
        with open(self.data_dir / 'image_urls.txt', 'w', encoding='utf-8') as f:
            for url in sorted(self.image_urls):
                f.write(f"{url}\n")
        
        print(f"\nScraping complete!")
        print(f"Total pages scraped: {len(self.scraped_pages)}")
        print(f"Total images found: {len(self.image_urls)}")
        print(f"Data saved to: {self.data_dir}")

if __name__ == "__main__":
    scraper = EnhancedScraper()
    
    # Additional specific pages to scrape
    additional_pages = [
        '/company-profile/',
        '/magazine/',
        '/careers/',
    ]
    
    scraper.scrape_all(additional_urls=additional_pages)
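
    # Hypothetical usage: point the scraper at a different site or output folder
    # EnhancedScraper(base_url="https://example.com", output_dir="example_content").scrape_all()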
