#!/usr/bin/env python3
"""
Deep scraping script to get all pages including project details and news
"""

import requests
from bs4 import BeautifulSoup
import hashlib
import json
from urllib.parse import urljoin, urlparse
import time
from pathlib import Path

class DeepScraper:
    def __init__(self, base_url="https://smart-const.com", output_dir="scraped_content"):
        self.base_url = base_url
        self.output_dir = Path(output_dir)
        self.images_dir = self.output_dir / "images"
        self.data_dir = self.output_dir / "data"
        self.session = requests.Session()
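        # Some servers reject the default python-requests User-Agent, so present a browser-like one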
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        
        self.images_dir.mkdir(parents=True, exist_ok=True)
        self.data_dir.mkdir(parents=True, exist_ok=True)
        
        self.scraped_pages = {}    # page data keyed by URL
        self.image_urls = set()    # every image URL encountered
        self.visited_urls = set()  # pages already fetched (or loaded from a previous run)
        
    def get_page(self, url):
        """Fetch a page, returning None if the request fails."""
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            print(f"  Failed to fetch {url}: {e}")
            return None
    
    def extract_text_content(self, soup):
        """Extract the visible text content: title, meta description, headings, paragraphs, and lists."""
        # Remove tags that never carry visible copy before pulling text
        for tag in soup(["script", "style", "meta", "link"]):
            tag.decompose()
        
        content = {
            'title': '',
            'headings': [],
            'paragraphs': [],
            'lists': [],
            'meta_description': ''
        }
        
        title_tag = soup.find('title')
        if title_tag:
            content['title'] = title_tag.get_text(strip=True)
        
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc:
            content['meta_description'] = meta_desc.get('content', '')
        
        for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            headings = soup.find_all(tag)
            for heading in headings:
                text = heading.get_text(strip=True)
                if text:
                    content['headings'].append({
                        'level': tag,
                        'text': text
                    })
        
        paragraphs = soup.find_all('p')
        for p in paragraphs:
            text = p.get_text(strip=True)
            if text and len(text) > 20:
                content['paragraphs'].append(text)
        
        lists = soup.find_all(['ul', 'ol'])
        for lst in lists:
            items = []
            for li in lst.find_all('li', recursive=False):
                text = li.get_text(strip=True)
                if text:
                    items.append(text)
            if items:
                content['lists'].append(items)
        
        return content
    
    def extract_images(self, soup, page_url):
        """Extract images"""
        images = []
        
        img_tags = soup.find_all('img')
        for img in img_tags:
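            # Lazy-loading plugins often put the real image URL in data-src or data-lazy-src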
            src = img.get('src') or img.get('data-src') or img.get('data-lazy-src')
            if src:
                full_url = urljoin(page_url, src)
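                # Skip inline data: URIs and obvious icon assets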
                if not full_url.startswith('data:') and 'icon' not in full_url.lower():
                    images.append({
                        'url': full_url,
                        'alt': img.get('alt', ''),
                        'title': img.get('title', '')
                    })
                    self.image_urls.add(full_url)
        
        return images
    
    def download_image(self, img_url, filename):
        """Download image"""
        try:
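            # Stream the response and write it in chunks so large images are not held in memory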
            response = self.session.get(img_url, timeout=10, stream=True)
            response.raise_for_status()
            
            filepath = self.images_dir / filename
            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            return str(filepath)
        except Exception as e:
            print(f"  Failed to download {img_url}: {e}")
            return None
    
    def get_filename_from_url(self, url):
        """Derive a safe local filename from an image URL."""
        parsed = urlparse(url)
        filename = parsed.path.split('/')[-1]
        if not filename or '.' not in filename:
            # Use a stable digest rather than hash(), which is salted per process and
            # would give the same image a different filename on every run
            filename = f"image_{hashlib.md5(url.encode('utf-8')).hexdigest()[:10]}.jpg"
        filename = "".join(c for c in filename if c.isalnum() or c in ".-_")
        return filename
    
    def scrape_page(self, url):
        """Scrape a page"""
        if url in self.visited_urls:
            return None
            
        print(f"Scraping: {url}")
        self.visited_urls.add(url)
        
        response = self.get_page(url)
        if not response:
            return None
        
        soup = BeautifulSoup(response.content, 'html.parser')
        
        text_content = self.extract_text_content(soup)
        images = self.extract_images(soup, url)
        
        page_data = {
            'url': url,
            'text': text_content,
            'images': images
        }
        
        self.scraped_pages[url] = page_data
        
        for img in images:
            filename = self.get_filename_from_url(img['url'])
            local_path = self.download_image(img['url'], filename)
            if local_path:
                img['local_path'] = local_path
        
        return page_data
    
    def find_links_on_page(self, url):
        """Find all links on a page"""
        response = self.get_page(url)
        if not response:
            return set()
        
        soup = BeautifulSoup(response.content, 'html.parser')
        links = set()
        
        for link in soup.find_all('a', href=True):
            href = link['href']
            full_url = urljoin(self.base_url, href)
            
            # Keep only same-site links, and strip query strings/fragments so each page is queued once
            parsed = urlparse(full_url)
            if parsed.netloc == urlparse(self.base_url).netloc:
                clean_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"

                # Skip file downloads and WordPress plumbing URLs
                skip_extensions = ('.pdf', '.jpg', '.png', '.gif', '.css', '.js', '.zip', '.doc', '.xls')
                skip_paths = ('wp-content', 'wp-admin', 'wp-json', 'feed', 'xmlrpc')
                if (not any(ext in clean_url for ext in skip_extensions) and
                        not any(part in clean_url for part in skip_paths)):
                    links.add(clean_url)
        
        return links
    
    def scrape_all_pages(self):
        """Scrape all pages recursively"""
        # Load previously scraped data so an interrupted run can resume where it left off
        existing_file = self.data_dir / 'scraped_pages.json'
        if existing_file.exists():
            with open(existing_file, 'r', encoding='utf-8') as f:
                self.scraped_pages = json.load(f)
                self.visited_urls = set(self.scraped_pages.keys())
                print(f"Loaded {len(self.scraped_pages)} existing pages")
        
        # Start with main pages
        main_pages = [
            self.base_url,
            f"{self.base_url}/",
            f"{self.base_url}/who-we-are/",
            f"{self.base_url}/our-projects/",
            f"{self.base_url}/our-services/construction-services/",
            f"{self.base_url}/contact-us/",
            f"{self.base_url}/certificates/",
            f"{self.base_url}/classification/",
            f"{self.base_url}/completion/",
            f"{self.base_url}/iso-certificates/",
            f"{self.base_url}/osh-certificates/",
            f"{self.base_url}/construction-services/",
            f"{self.base_url}/home-video-header/",
        ]
        
        # Find all links from main pages
        all_links = set()
        for page in main_pages:
            print(f"Finding links on: {page}")
            links = self.find_links_on_page(page)
            all_links.update(links)
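            # Brief pause between requests to stay polite to the server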
            time.sleep(0.5)
        
        print(f"\nFound {len(all_links)} unique pages to scrape")
        
        # Scrape new pages
        new_pages = [url for url in all_links if url not in self.visited_urls]
        print(f"Scraping {len(new_pages)} new pages...")
        
        for i, url in enumerate(new_pages, 1):
            print(f"\n[{i}/{len(new_pages)}]")
            self.scrape_page(url)
            time.sleep(1)
        
        # Save
        self.save_data()
    
    def save_data(self):
        """Save scraped data"""
        pages_summary = {}
        for url, data in self.scraped_pages.items():
            pages_summary[url] = {
                'url': data['url'],
                'text': data['text'],
                'images': [
                    {
                        'url': img['url'],
                        'alt': img.get('alt', ''),
                        'local_path': img.get('local_path', '')
                    }
                    for img in data['images']
                ]
            }
        
        with open(self.data_dir / 'scraped_pages.json', 'w', encoding='utf-8') as f:
            json.dump(pages_summary, f, indent=2, ensure_ascii=False)
        
        print(f"\nScraping complete!")
        print(f"Total pages: {len(self.scraped_pages)}")
        print(f"Total images: {len(self.image_urls)}")

if __name__ == "__main__":
    scraper = DeepScraper()
    scraper.scrape_all_pages()
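
    # A minimal variation (hypothetical site and folder names): point the scraper
    # at a different site and keep its output in a separate directory.
    # scraper = DeepScraper(base_url="https://example.com", output_dir="example_scrape")
    # scraper.scrape_all_pages()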
