Announcement

Collapse
No announcement yet.
X
  • Filter
  • Time
  • Show
Clear All
new posts

  • Scraping Housing Data for Stata Users

    In case anyone on here both uses Python and would be interested in scraping together some housing price data, here's some Python code to do that.
    Code:
    from selenium import webdriver
    from bs4 import BeautifulSoup  # NOTE(review): unused below; kept from the original post
    import time
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.common.by import By
    import pandas as pd
    from io import StringIO
    from datetime import datetime  # NOTE(review): unused below; kept from the original post
    
    # Scrape normalized housing-price data for the 70 Chinese cities published
    # on the National Bureau of Statistics "easyquery" site, one city at a
    # time, and combine the per-city tables into a single long-format DataFrame.
    
    # Run Chrome headless and ignore the site's certificate problems.
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # Enable headless mode
    chrome_options.add_argument('--ignore-certificate-errors')
    
    driver = webdriver.Chrome(options=chrome_options)
    
    # The URL of the page containing the table.
    url = 'https://data.stats.gov.cn/english/easyquery.htm?cn=E0104'
    driver.get(url)
    
    wait = WebDriverWait(driver, 30)
    
    # Walk the indicator tree down to the housing-price series. The node ids
    # are position-dependent, so this sequence mirrors manual clicks in the UI.
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#treeZhiBiao_3_span"))).click()
    wait.until(EC.element_to_be_clickable((By.ID, "treeZhiBiao_10_span"))).click()
    wait.until(EC.element_to_be_clickable((By.ID, "treeZhiBiao_2_span"))).click()
    wait.until(EC.element_to_be_clickable((By.ID, "treeZhiBiao_11_span"))).click()
    wait.until(EC.element_to_be_clickable((By.ID, "treeZhiBiao_10_span"))).click()
    
    # Open the time-period picker.
    wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, "#mySelect_sj > div.dtHtml > div.dtHead"))).click()
    
    # Locate the free-text period box.
    # BUG FIX: find_element_by_css_selector() was removed in Selenium 4;
    # use the unified find_element(By.CSS_SELECTOR, ...) API instead.
    text_box = driver.find_element(
        By.CSS_SELECTOR,
        "#mySelect_sj > div.dtHtml > div.dtBody > div.dtFoot > input")
    
    # Request every period from 2006 onward ("2006-," is the site's syntax
    # for an open-ended range). The original comment claimed "201001-last10"
    # was sent here, which did not match the code.
    text_box.send_keys("2006-,")
    
    # Confirm the period selection.
    wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, "#mySelect_sj > div.dtHtml > div.dtBody > div.dtFoot > div.dtTextBtn.f10"))).click()
    
    # Click on the region dropdown so its city list is rendered.
    wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, "#mySelect_reg > div.dtHtml > div.dtHead"))).click()
    
    # Find all the city entries once (the original fetched this list twice).
    dropdown_elements = driver.find_elements(
        By.CSS_SELECTOR, "#mySelect_reg > div.dtHtml > div.dtBody > div.dtList > ul > li")
    
    # (city name, per-city DataFrame) pairs collected below.
    dataframes = []
    
    # Index 0 is skipped (the first entry is not an individual city).
    for i in range(1, len(dropdown_elements)):
        # Re-open the dropdown, which closes after each selection.
        wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "#mySelect_reg > div.dtHtml > div.dtHead"))).click()
    
        # Scroll the entry into view so it is clickable.
        driver.execute_script("arguments[0].scrollIntoView();", dropdown_elements[i])
    
        # Read the city name from the list item.
        name = driver.execute_script("return arguments[0].textContent;", dropdown_elements[i]).strip()
    
        # Select the city. NOTE(review): if the page re-renders the list these
        # cached elements can go stale; re-query inside the loop if
        # StaleElementReferenceException shows up.
        dropdown_elements[i].click()
    
        # Wait for the table to load. Adjust the sleep time as needed.
        time.sleep(2)
    
        # Pull the rendered table's HTML and parse it with pandas.
        table = driver.find_element(By.CSS_SELECTOR, ".table-container")
        table_html = table.get_attribute('outerHTML')
        df = pd.read_html(StringIO(table_html))[0]
    
        # Drop the partial current-month column. ROBUSTNESS: errors='ignore'
        # keeps the script working after the label rolls past "Aug 2023"
        # (the original raised KeyError once that column disappeared).
        df = df.drop(columns=['Aug 2023'], errors='ignore')
    
        # Drop rows with any missing observation in the data columns.
        df = df.dropna(subset=df.columns[2:], how='any')
    
        # Reverse the date columns so time runs oldest -> newest,
        # keeping the first (indicator) column in place.
        columns = df.columns.tolist()
        columns = [columns[0]] + columns[1:][::-1]
        df = df[columns]
    
        # Reshape to long format: one row per (indicator, month).
        df = df.melt(id_vars=['Indicators'], var_name='String Date', value_name='Normalized Price')
    
        # Tag every row with its city.
        df['City'] = name
    
        # Store the DataFrame with the name as the identifier.
        dataframes.append((name, df))
    
    
    # Close the WebDriver.
    driver.quit()
    # BUG FIX: removed the IPython "%clear" magic that was here -- it is a
    # SyntaxError in a plain Python script.
    
    # Combine all cities in one pass. PERF: concatenating inside a loop is
    # quadratic (each pd.concat copies everything accumulated so far).
    combined_df = pd.concat(
        [df for _, df in dataframes], ignore_index=True
    ) if dataframes else pd.DataFrame()
    It gives us normalized housing price data across 70 Chinese cities over 17 years. China is doing some pretty interesting things like introducing property taxes at various levels, among other things, so someone may find this resource helpful, assuming you do most of your 'metrics work in Stata.
Working...
X