In case anyone on here both uses Python and is interested in scraping together some housing price data, here's some Python code to do that.
It gives us normalized housing price data across 70 Chinese cities over 17 years. China is doing some pretty interesting things like introducing property taxes at various levels, among other things, so someone may find this resource helpful, even if they do most of their 'metrics work in Stata.
Code:
"""Scrape normalized housing-price data for ~70 Chinese cities from the
NBS (data.stats.gov.cn) English "easy query" portal and combine it into a
single long-format DataFrame (`combined_df`).

Requires: selenium (with Chrome + chromedriver installed) and pandas.
"""

import time
from io import StringIO

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Headless Chrome; the NBS site sometimes serves a bad certificate chain,
# so certificate errors are ignored.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--ignore-certificate-errors')
driver = webdriver.Chrome(options=chrome_options)

# Landing page for the regional-data query table.
url = 'https://data.stats.gov.cn/english/easyquery.htm?cn=E0104'
driver.get(url)

wait = WebDriverWait(driver, 30)

# Walk the indicator tree down to the housing-price indicator.
# NOTE(review): these tree node ids are position-dependent and will break
# if NBS reorders the tree -- confirm against the live page.
wait.until(EC.element_to_be_clickable(
    (By.CSS_SELECTOR, "#treeZhiBiao_3_span"))).click()
wait.until(EC.element_to_be_clickable((By.ID, "treeZhiBiao_10_span"))).click()
wait.until(EC.element_to_be_clickable((By.ID, "treeZhiBiao_2_span"))).click()
wait.until(EC.element_to_be_clickable((By.ID, "treeZhiBiao_11_span"))).click()
wait.until(EC.element_to_be_clickable((By.ID, "treeZhiBiao_10_span"))).click()

# Open the time-period selector and request everything from 2006 onward.
wait.until(EC.element_to_be_clickable(
    (By.CSS_SELECTOR, "#mySelect_sj > div.dtHtml > div.dtHead"))).click()
# find_element_by_css_selector() was removed in Selenium 4;
# use find_element(By.CSS_SELECTOR, ...) instead.
text_box = driver.find_element(
    By.CSS_SELECTOR,
    "#mySelect_sj > div.dtHtml > div.dtBody > div.dtFoot > input")
text_box.send_keys("2006-,")
wait.until(EC.element_to_be_clickable(
    (By.CSS_SELECTOR,
     "#mySelect_sj > div.dtHtml > div.dtBody > div.dtFoot > "
     "div.dtTextBtn.f10"))).click()

# Open the region dropdown and enumerate its entries (one per city).
wait.until(EC.element_to_be_clickable(
    (By.CSS_SELECTOR, "#mySelect_reg > div.dtHtml > div.dtHead"))).click()
dropdown_elements = driver.find_elements(
    By.CSS_SELECTOR,
    "#mySelect_reg > div.dtHtml > div.dtBody > div.dtList > ul > li")

dataframes = []  # list of (city_name, DataFrame) pairs

# Index 0 is the currently-selected default region, so start at 1.
for i in range(1, len(dropdown_elements)):
    # Re-open the dropdown every iteration; picking a city closes it.
    wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, "#mySelect_reg > div.dtHtml > div.dtHead"))).click()
    driver.execute_script(
        "arguments[0].scrollIntoView();", dropdown_elements[i])
    # textContent via JS works even for off-screen elements.
    name = driver.execute_script(
        "return arguments[0].textContent;", dropdown_elements[i]).strip()
    dropdown_elements[i].click()

    time.sleep(2)  # crude wait for the table refresh; adjust as needed

    # Grab the rendered table HTML and let pandas parse it.
    table = driver.find_element(By.CSS_SELECTOR, ".table-container")
    table_html = table.get_attribute('outerHTML')
    df = pd.read_html(StringIO(table_html))[0]

    # The most recent month ("Aug 2023" at the time of writing) is
    # incomplete; drop it if present instead of crashing once the site
    # advances past that month.
    df = df.drop(columns=['Aug 2023'], errors='ignore')
    # Keep only rows with data in every date column.
    df = df.dropna(subset=df.columns[2:], how='any')

    # Reverse the date columns so time runs oldest -> newest.
    columns = df.columns.tolist()
    df = df[[columns[0]] + columns[1:][::-1]]

    # Long format: one row per (indicator, month), with the city attached.
    df = df.melt(id_vars=['Indicators'], var_name='String Date',
                 value_name='Normalized Price')
    df['City'] = name
    dataframes.append((name, df))

driver.quit()

# Concatenate once at the end: repeated pd.concat onto an accumulator
# inside the loop is quadratic. (The original also contained an IPython
# `%clear` magic here, which is a SyntaxError in plain Python -- removed.)
if dataframes:
    combined_df = pd.concat(
        [df for _, df in dataframes], ignore_index=True)
else:
    combined_df = pd.DataFrame()