#!/usr/bin/env python3
"""
Integration script to update HTML files with scraped content
"""

import json
import shutil
from pathlib import Path
from bs4 import BeautifulSoup

class ContentIntegrator:
    def __init__(self):
        self.base_dir = Path(".")
        self.scraped_data_file = self.base_dir / "scraped_content" / "data" / "scraped_pages.json"
        self.scraped_images_dir = self.base_dir / "scraped_content" / "images"
        self.assets_dir = self.base_dir / "assets"
        self.scraped_data = {}
        
        # Page mapping: scraped URL -> local HTML file
        self.page_mapping = {
            "https://smart-const.com/": "home.html",
            "https://smart-const.com/who-we-are/": "about.html",
            "https://smart-const.com/our-services/construction-services/": "ourService.html",
            "https://smart-const.com/our-projects/": "project.html",
            "https://smart-const.com/contact-us/": "contactUs.html",
        }
        
    def load_scraped_data(self):
        """Load scraped data from JSON"""
        with open(self.scraped_data_file, 'r', encoding='utf-8') as f:
            self.scraped_data = json.load(f)
        print(f"Loaded {len(self.scraped_data)} scraped pages")
    
    def copy_images_to_assets(self):
        """Copy scraped images to assets folder"""
        if not self.scraped_images_dir.exists():
            print("Scraped images directory not found")
            return None
        
        # Create subdirectory for scraped images (creating assets/ if needed)
        target_dir = self.assets_dir / "scraped_images"
        target_dir.mkdir(parents=True, exist_ok=True)
        
        # Copy all images
        image_count = 0
        for img_file in self.scraped_images_dir.iterdir():
            if img_file.is_file() and img_file.suffix.lower() in ['.jpg', '.jpeg', '.png', '.gif', '.webp']:
                target_path = target_dir / img_file.name
                shutil.copy2(img_file, target_path)
                image_count += 1
        
        print(f"Copied {image_count} images to {target_dir}")
        return target_dir
    
    def update_home_page(self, soup, page_data):
        """Update home page content"""
        text_data = page_data.get('text', {})
        
        # Update hero section text if exists
        hero_section = soup.find('div', class_='hero-content') or soup.find('section', id='section1')
        if hero_section and text_data.get('headings'):
            h1 = hero_section.find('h1')
            if h1:
                # Use first h1 from scraped content
                for heading in text_data['headings']:
                    if heading['level'] == 'h1':
                        h1.string = heading['text']
                        break
        
        # Update about section
        about_section = soup.find('div', id='section2') or soup.find('section', class_='slide-2')
        if about_section and text_data.get('paragraphs'):
            paragraphs = about_section.find_all('p')
            for i, p_tag in enumerate(paragraphs[:3]):  # Update first 3 paragraphs
                if i < len(text_data['paragraphs']):
                    p_tag.string = text_data['paragraphs'][i]
        
        return soup
    
    def update_about_page(self, soup, page_data):
        """Update about page content"""
        text_data = page_data.get('text', {})
        
        # Update main heading
        h1 = soup.find('h1')
        if h1 and text_data.get('headings'):
            for heading in text_data['headings']:
                if heading['level'] == 'h1':
                    h1.string = heading['text']
                    break
        
        # Update main content paragraphs
        content_section = soup.find('section', id='aboutUsBoxText') or soup.find('div', class_='box-max-width')
        if content_section and text_data.get('paragraphs'):
            paragraphs = content_section.find_all('p')
            for i, p_tag in enumerate(paragraphs):
                if i < len(text_data['paragraphs']):
                    p_tag.string = text_data['paragraphs'][i]
        
        return soup
    
    def update_services_page(self, soup, page_data):
        """Update services page content"""
        text_data = page_data.get('text', {})
        
        # Update headings
        if text_data.get('headings'):
            h1 = soup.find('h1')
            if h1:
                for heading in text_data['headings']:
                    if heading['level'] == 'h1':
                        h1.string = heading['text']
                        break
        
        return soup
    
    def update_images(self, soup, page_data):
        """Update image references in HTML to point at the copied local files"""
        images = page_data.get('images', [])
        
        # Map original URL -> (local path, alt text) for each downloaded image
        url_to_local = {}
        for img in images:
            if 'local_path' in img:
                filename = Path(img['local_path']).name
                local_path = f"./assets/scraped_images/{filename}"
                url_to_local[img['url']] = (local_path, img.get('alt', ''))
        
        # Rewrite img tags whose src matches a scraped image
        for img_tag in soup.find_all('img'):
            src = img_tag.get('src', '')
            for original_url, (local_path, alt_text) in url_to_local.items():
                # Match on the full URL or, failing that, the bare filename
                if original_url in src or Path(src).name in original_url:
                    img_tag['src'] = local_path
                    if not img_tag.get('alt') and alt_text:
                        img_tag['alt'] = alt_text
                    break
        
        return soup
    
    def integrate_content(self):
        """Main integration function"""
        self.load_scraped_data()
        
        # Copy images first
        self.copy_images_to_assets()
        
        # Process each mapped page
        for scraped_url, html_file in self.page_mapping.items():
            if scraped_url not in self.scraped_data:
                print(f"Skipping {html_file} - no scraped data for {scraped_url}")
                continue
            
            html_path = self.base_dir / html_file
            if not html_path.exists():
                print(f"HTML file not found: {html_file}")
                continue
            
            print(f"\nUpdating {html_file}...")
            
            # Load HTML
            with open(html_path, 'r', encoding='utf-8') as f:
                html_content = f.read()
            
            soup = BeautifulSoup(html_content, 'html.parser')
            page_data = self.scraped_data[scraped_url]
            
            # Update content based on page type
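            # (project.html and contactUs.html have no dedicated text updater;
            #  they still get their image references rewritten below)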
            if 'home' in html_file:
                soup = self.update_home_page(soup, page_data)
            elif 'about' in html_file:
                soup = self.update_about_page(soup, page_data)
            elif 'Service' in html_file:
                soup = self.update_services_page(soup, page_data)
            
            # Update images
            soup = self.update_images(soup, page_data)
            
            # Save updated HTML
            with open(html_path, 'w', encoding='utf-8') as f:
                f.write(str(soup))
            
            print(f"Updated {html_file}")

if __name__ == "__main__":
    integrator = ContentIntegrator()
    integrator.integrate_content()
