Announcement

Collapse
No announcement yet.
X
  • Filter
  • Time
  • Show
Clear All
new posts

  • Scraping Housing Data for Stata Users

    In case anyone on here both uses Python and would be interested in scraping together some housing price data, here's some Python code to do that.
    Code:
    from selenium import webdriver
    from bs4 import BeautifulSoup  # NOTE(review): unused below; kept from the original post
    import time
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.common.by import By
    import pandas as pd
    from io import StringIO
    from datetime import datetime  # NOTE(review): unused below; kept from the original post
    
    # Scrape normalized housing-price data for the 70 Chinese cities published
    # on the National Bureau of Statistics "easyquery" site, one city at a
    # time, and combine the per-city tables into a single long-format DataFrame.
    
    # Run Chrome headless and ignore the site's certificate problems.
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # Enable headless mode
    chrome_options.add_argument('--ignore-certificate-errors')
    
    driver = webdriver.Chrome(options=chrome_options)
    
    # The URL of the page containing the table.
    url = 'https://data.stats.gov.cn/english/easyquery.htm?cn=E0104'
    driver.get(url)
    
    wait = WebDriverWait(driver, 30)
    
    # Walk the indicator tree down to the housing-price series. The node ids
    # are position-dependent, so this sequence mirrors manual clicks in the UI.
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#treeZhiBiao_3_span"))).click()
    wait.until(EC.element_to_be_clickable((By.ID, "treeZhiBiao_10_span"))).click()
    wait.until(EC.element_to_be_clickable((By.ID, "treeZhiBiao_2_span"))).click()
    wait.until(EC.element_to_be_clickable((By.ID, "treeZhiBiao_11_span"))).click()
    wait.until(EC.element_to_be_clickable((By.ID, "treeZhiBiao_10_span"))).click()
    
    # Open the time-period picker.
    wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, "#mySelect_sj > div.dtHtml > div.dtHead"))).click()
    
    # Locate the free-text period box.
    # BUG FIX: find_element_by_css_selector() was removed in Selenium 4;
    # use the unified find_element(By.CSS_SELECTOR, ...) API instead.
    text_box = driver.find_element(
        By.CSS_SELECTOR,
        "#mySelect_sj > div.dtHtml > div.dtBody > div.dtFoot > input")
    
    # Request every period from 2006 onward ("2006-," is the site's syntax
    # for an open-ended range). The original comment claimed "201001-last10"
    # was sent here, which did not match the code.
    text_box.send_keys("2006-,")
    
    # Confirm the period selection.
    wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, "#mySelect_sj > div.dtHtml > div.dtBody > div.dtFoot > div.dtTextBtn.f10"))).click()
    
    # Click on the region dropdown so its city list is rendered.
    wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, "#mySelect_reg > div.dtHtml > div.dtHead"))).click()
    
    # Find all the city entries once (the original fetched this list twice).
    dropdown_elements = driver.find_elements(
        By.CSS_SELECTOR, "#mySelect_reg > div.dtHtml > div.dtBody > div.dtList > ul > li")
    
    # (city name, per-city DataFrame) pairs collected below.
    dataframes = []
    
    # Index 0 is skipped (the first entry is not an individual city).
    for i in range(1, len(dropdown_elements)):
        # Re-open the dropdown, which closes after each selection.
        wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "#mySelect_reg > div.dtHtml > div.dtHead"))).click()
    
        # Scroll the entry into view so it is clickable.
        driver.execute_script("arguments[0].scrollIntoView();", dropdown_elements[i])
    
        # Read the city name from the list item.
        name = driver.execute_script("return arguments[0].textContent;", dropdown_elements[i]).strip()
    
        # Select the city. NOTE(review): if the page re-renders the list these
        # cached elements can go stale; re-query inside the loop if
        # StaleElementReferenceException shows up.
        dropdown_elements[i].click()
    
        # Wait for the table to load. Adjust the sleep time as needed.
        time.sleep(2)
    
        # Pull the rendered table's HTML and parse it with pandas.
        table = driver.find_element(By.CSS_SELECTOR, ".table-container")
        table_html = table.get_attribute('outerHTML')
        df = pd.read_html(StringIO(table_html))[0]
    
        # Drop the partial current-month column. ROBUSTNESS: errors='ignore'
        # keeps the script working after the label rolls past "Aug 2023"
        # (the original raised KeyError once that column disappeared).
        df = df.drop(columns=['Aug 2023'], errors='ignore')
    
        # Drop rows with any missing observation in the data columns.
        df = df.dropna(subset=df.columns[2:], how='any')
    
        # Reverse the date columns so time runs oldest -> newest,
        # keeping the first (indicator) column in place.
        columns = df.columns.tolist()
        columns = [columns[0]] + columns[1:][::-1]
        df = df[columns]
    
        # Reshape to long format: one row per (indicator, month).
        df = df.melt(id_vars=['Indicators'], var_name='String Date', value_name='Normalized Price')
    
        # Tag every row with its city.
        df['City'] = name
    
        # Store the DataFrame with the name as the identifier.
        dataframes.append((name, df))
    
    
    # Close the WebDriver.
    driver.quit()
    # BUG FIX: removed the IPython "%clear" magic that was here -- it is a
    # SyntaxError in a plain Python script.
    
    # Combine all cities in one pass. PERF: concatenating inside a loop is
    # quadratic (each pd.concat copies everything accumulated so far).
    combined_df = pd.concat(
        [df for _, df in dataframes], ignore_index=True
    ) if dataframes else pd.DataFrame()
    It gives us normalized housing price data across 70 Chinese cities over 17 years. China is doing some pretty interesting things like introducing property taxes at various levels, among other things, so someone may find this resource helpful, assuming you do most of your 'metrics work in Stata.
Working...
X