import csv
import random
import re
import time

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

class LianjiaScraper:     
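    """Scrape rental listings from lianjia.com list pages and export them to CSV."""
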
    def __init__(self):
        self.session = requests.Session()
        
        self.session.headers.update({
            'Accept': '*/*',
            'Accept-Language': 'zh-CN',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36',
        })
        
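        # Preselect the city via cookie; 110000 is Beijing's administrative
        # division code, so listing pages default to Beijing.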
        self.session.cookies.update({
            'select_city': '110000'
        })
    
    def fetch_page(self, url):
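        """Fetch a URL with the shared session; return the Response on HTTP 200, else None."""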
        try:
            response = self.session.get(url, timeout=10)
            if response.status_code == 200:
                return response
            else:
                print(f"❌ Failed to fetch. Status code: {response.status_code} for {url}")
                return None
        except requests.exceptions.RequestException as e:
            print(f"❌ Request failed: {e} for {url}")
            return None
    
    def parse_rental_data(self, html_content):
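        """Parse one listing page's HTML and return a list of rental-item dicts."""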
        soup = BeautifulSoup(html_content, 'html.parser')
        rental_items = []
        
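        # Each listing card's text block is a div with this class on
        # Lianjia's rental list pages.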
        items = soup.find_all('div', class_='content__list--item--main')
        
        for item in items:
            try:
                # Extract raw title
                title_elem = item.find('p', class_='content__list--item--title')
                raw_title = title_elem.find('a').text.strip() if title_elem and title_elem.find('a') else ""

                if "车库" in raw_title:
                    print(f"Skipping item as it contains '车库' in title: '{raw_title}'") 
                    continue # Skip to the next item in the loop

                # Initialize rentalType and cleaned_title
                rental_type = ""
                cleaned_title = raw_title

                # Split the raw_title by "·" to get type and the rest of the title
                if '·' in raw_title:
                    parts = raw_title.split('·', 1)
                    extracted_type = parts[0].strip()
                    temp_title_part = parts[1].strip()

                    # Always attempt to get cleaned_title by splitting on first space
                    # from the part after '·'
                    first_space_split_after_dot = temp_title_part.split(' ', 1)
                    cleaned_title = first_space_split_after_dot[0].strip()
                    
                    # Set rental_type from extracted_type
                    rental_type = extracted_type
                else:
                    # Case 1: raw_title didn't contain "·"
                    # Extract the community name as cleaned_title by splitting on the first space
                    first_space_split_no_dot = raw_title.split(' ', 1)
                    cleaned_title = first_space_split_no_dot[0].strip()

                # Default: anything that isn't 独栋 (e.g. 合租) is recorded as 整租
                if rental_type not in ["独栋", "整租"]:
                    rental_type = "整租"

                # Initialize new description-based fields
                district = ""
                community = ""
                location = ""
                total_area = ""
                orientation = ""
                specification = ""
                floor_type = ""
                floor = ""
                quantity = 1

                if rental_type == "独栋":
                    district = "-"
                    community = "-"
                    location = "-"
                    orientation = "-"
                    floor_type = "-"
                    floor = "-"

                # Extract and process description
                desc_elem = item.find('p', class_='content__list--item--des')
                raw_description = desc_elem.text.strip() if desc_elem else ""

                if "精选" in raw_description:
                        continue # Skip to the next item in the loop

                # Parse raw_description if it exists
                if raw_description:
                    desc_parts = raw_description.split('/')

                    # Handle District, Community, Location conditionally
                    if rental_type != "独栋" and len(desc_parts) > 0:
                        location_info = desc_parts[0].strip()
                        location_sub_parts = location_info.split('-')
                        if len(location_sub_parts) > 0:
                            district = location_sub_parts[0].strip()
                        if len(location_sub_parts) > 1:
                            community = location_sub_parts[1].strip()
                        if len(location_sub_parts) > 2:
                            location = location_sub_parts[2].strip()

                    # Handle TotalArea (for a range like "30-60㎡" only the first number is captured)
                    area_index = -1
                    for i, part in enumerate(desc_parts):
                        if '㎡' in part:
                            area_index = i
                            break
                    
                    if area_index != -1:
                        area_str = desc_parts[area_index].strip()
                        area_match = re.search(r'(\d+\.?\d*)', area_str)
                        if area_match:
                            total_area = area_match.group(1)

                    # Extract Quantity
                    for part in desc_parts:
                        quantity_match_1 = re.search(r'(\d+)间在租', part)
                        quantity_match_2 = re.search(r'仅剩(\d+)间', part)
                        if quantity_match_1:
                            quantity = int(quantity_match_1.group(1))
                            break # Found quantity, no need to check other parts
                        elif quantity_match_2:
                            quantity = int(quantity_match_2.group(1))
                            break # Found quantity, no need to check other parts

                    # Part 3: Orientation
                    orientation_index = area_index + 1 if area_index != -1 else -1
                    if orientation_index != -1 and len(desc_parts) > orientation_index:
                        orientation_str = desc_parts[orientation_index].strip()
                        # Ensure it's not a quantity string before parsing as orientation
                        if not re.search(r'(\d+)间在租|仅剩(\d+)间', orientation_str):
                            orientation = ','.join([o.strip() for o in orientation_str.split(' ') if o.strip()])

                    # Part 4: Specification - Updated regex for explicit patterns
                    specification_index = -1
                    specification_regex = r'(\d+室\d*厅?\d+卫|\d+房间\d+卫)' 
                    
                    for i in range(len(desc_parts)):
                        stripped_part = desc_parts[i].strip()
                        if re.search(specification_regex, stripped_part):
                            specification_index = i
                            break
                    if specification_index != -1:
                        specification = desc_parts[specification_index].strip()

                    # Part 5: Floor information
                    floor_info_index = specification_index + 1 if specification_index != -1 else -1
                    if floor_info_index != -1 and len(desc_parts) > floor_info_index:
                        floor_info_str = desc_parts[floor_info_index].strip()
                        
                        floor_type_match = re.match(r'(.+?)（', floor_info_str)
                        if floor_type_match:
                            floor_type = floor_type_match.group(1).strip()
                        else: 
                            floor_type = floor_info_str.strip()

                        floor_num_match = re.search(r'（(\d+)层）', floor_info_str)
                        if floor_num_match:
                            floor = floor_num_match.group(1).strip()

                # Extract tags and join with comma (a class_ string containing a
                # space must match the tag's full class attribute exactly)
                bottom_elem = item.find('p', class_='content__list--item--bottom oneline')
                tags = []
                if bottom_elem:
                    tag_elements = bottom_elem.find_all('i')
                    tags = [tag.text.strip() for tag in tag_elements if tag.text.strip()]
                tag_string = ", ".join(tags)
                
                # Extract source
                brand_source = ""
                brand_elem = item.find('p', class_='content__list--item--brand oneline')
                if brand_elem:
                    brand_span = brand_elem.find('span', class_='brand')
                    if brand_span:
                        brand_source = brand_span.text.strip()
                
                # Extract modified_date_days_ago
                modified_date_days_ago = "" 
                if brand_elem: 
                    time_elem = brand_elem.find('span', class_='content__list--item--time') 
                    if time_elem:
                        time_text = time_elem.text.strip()
                        if '今天' in time_text:
                            modified_date_days_ago = 0 
                        else:
                            days_match = re.search(r'(\d+)天前维护', time_text)
                            if days_match:
                                modified_date_days_ago = int(days_match.group(1)) 
                
                # Extract price
                price_elem = item.find('span', class_='content__list--item-price')
                price = ""
                if price_elem:
                    em_elem = price_elem.find('em')
                    price = em_elem.text.strip() if em_elem else ""
                
                # Create rental item dictionary
                rental_item = {
                    'type': rental_type,
                    'title': cleaned_title,
                    'district': district,
                    'community': community,
                    'location': location,
                    'total_area': total_area,
                    'orientation': orientation,
                    'specification': specification,
                    'floor_type': floor_type,
                    'floor': floor,
                    'quantity': quantity,
                    'tag': tag_string,
                    'brand_source': brand_source,
                    'modified_date_days_ago': modified_date_days_ago,
                    'price': price
                }
                
                rental_items.append(rental_item)   
            except Exception as e:
                print(f"Error parsing item: {e}")
                continue
        return rental_items
    
    def get_total_pages(self, html_content):
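        """Read Lianjia's pagination div and return the total page count (1 if absent)."""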

        soup = BeautifulSoup(html_content, 'html.parser')
        
        # Find pagination element
        pg_elem = soup.find('div', class_='content__pg')
        
        if pg_elem and pg_elem.get('data-totalpage'):
            total_pages = int(pg_elem.get('data-totalpage'))
            current_page = int(pg_elem.get('data-curpage', 1))
            
            print(f"Found pagination: Current page {current_page}, Total pages {total_pages}")
            return total_pages
        else:
            print("No pagination found, assuming single page")
            return 1
    
    def build_page_url(self, base_url, page_num):
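        """Append Lianjia's /pgN/ path segment for pages beyond the first."""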
        if page_num == 1:
            return base_url
        else:
            base_url = base_url.rstrip('/')
            return f"{base_url}/pg{page_num}/"
    
    def save_to_csv(self, data, filename='data.csv'):
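        """Write the collected items to CSV (utf-8-sig adds a BOM so Excel renders Chinese correctly)."""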

        if not data:
            print("No data to save")
            return False
        
        try:
            # Ensure UTF-8 encoding when saving CSV
            with open(filename, 'w', newline='', encoding='utf-8-sig') as csvfile:
                fieldnames = ['type', 'title', 'district', 'community', 'location',
                              'total_area', 'orientation', 'specification',
                              'floor_type', 'floor', 'quantity', 'tag', 'brand_source',
                              'modified_date_days_ago', 'price']
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                
                # Write header
                writer.writeheader()
                
                # Write data
                for item in data:
                    writer.writerow(item)
                
                print(f"✅ Successfully saved {len(data)} items to {filename} (UTF-8)")
                return True
                
        except Exception as e:
            print(f"❌ Error saving to CSV: {e}")
            return False
    
    def scrape_all_pages(self, base_url='https://bj.lianjia.com/zufang/', filename='data.csv'):
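        """Fetch page 1, discover the page count, scrape every page, and save to CSV."""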
        print("=== Starting Multi-Page Scraping Process ===")
        
        all_rental_data = []
        
        # Step 1: Get first page to determine total pages
        print("Fetching first page to get pagination info...")
        response = self.fetch_page(base_url)
        if not response:
            return False
        
        # Get total pages
        total_pages = self.get_total_pages(response.text)
        
        # Parse first page data
        print("\n=== Parsing Page 1 ===")
        page_data = self.parse_rental_data(response.text)
        if page_data:
            all_rental_data.extend(page_data)
            print(f"Page 1: Found {len(page_data)} items")
        
        # Step 2: Scrape remaining pages if there are more
        if total_pages > 1:
            print(f"\n=== Scraping Pages 2 to {total_pages} ===")
            
            # Initialize tqdm for the loop over pages
            for page_num in tqdm(range(2, total_pages + 1), desc="Scraping pages"):
                
                # Build page URL
                page_url = self.build_page_url(base_url, page_num)
                
                # Fetch page
                response = self.fetch_page(page_url)
                if not response:
                    tqdm.write(f"❌ Failed to fetch page {page_num}, skipping...") 
                    continue
                
                # Parse page data
                page_data = self.parse_rental_data(response.text)
                if page_data:
                    all_rental_data.extend(page_data)
                else:
                    tqdm.write(f"Page {page_num}: No data found") 
                
                # Random delay between requests to reduce the chance of being rate-limited
                time.sleep(random.uniform(1, 5))
        
        # Step 3: Save all data to CSV
        if all_rental_data:
            print(f"\n=== Summary ===")
            print(f"Total items collected: {len(all_rental_data)}")
            print(f"Pages processed: {min(total_pages, len([p for p in range(1, total_pages + 1)]))}")
            
            # Save to CSV with UTF-8 encoding
            success = self.save_to_csv(all_rental_data, filename)
            return success
        else:
            print("❌ No rental data collected from any pages")
            return False

def main():
    print("Fetching all pages, parsing, and saving rental data to CSV...\n")
    
    # Create scraper instance
    scraper = LianjiaScraper()
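    # Defaults to Beijing (https://bj.lianjia.com/zufang/) and data.csv; other
    # cities can presumably be scraped by passing that city's listing URL,
    # e.g. scraper.scrape_all_pages('https://sh.lianjia.com/zufang/', 'sh.csv')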
    success = scraper.scrape_all_pages()
    
    if success:
        print("Check 'data.csv' file for all extracted rental data.")
    else:
        print("\n⚠️ Scraping process failed. Please check the error messages above.")

if __name__ == "__main__":
    main()