How to Scrape Airbnb With Selenium
Airbnb has been in operation since 2008, and after its rise in popularity it completely transformed the rental and hotel industries. Instead of booking a hotel, you can search for short-stay rental properties on Airbnb, which gives us a unique dataset to analyze.
Today, we’ll build a scraper project to collect Airbnb listings along with their reviews.
- TLDR: How to Scrape Airbnb
- How To Architect Our Scraper
- Understanding How To Scrape Airbnb
- Setting Up Our Airbnb Scraper
- Build An Airbnb Search Crawler
- Build An Airbnb Scraper
- Legal and Ethical Considerations
- Conclusion
- More Python Web Scraping Guides
Need help scraping the web?
Then check out ScrapeOps, the complete toolkit for web scraping.
TLDR - How to Scrape Airbnb
If you'd rather not read the full tutorial but still need a scraper, you've come to the right place. Below is a pre-built scraper that's ready for you to use.
- Start by creating a new project folder and include a config.json file with your ScrapeOps API key.
- Next, create a new Python file and insert the following code.
import os
import re
import csv
import json
import logging
import time
from urllib.parse import urlencode
from dataclasses import dataclass, fields, asdict
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import concurrent.futures
API_KEY = ""
with open("config.json", "r") as config_file:
config = json.load(config_file)
API_KEY = config["api_key"]
def get_scrapeops_url(url, location="us"):
    payload = {
        "api_key": API_KEY,
        "url": url,
        "country": location,
        "wait": 5000,
    }
    proxy_url = "https://proxy.scrapeops.io/v1/?" + urlencode(payload)
    return proxy_url
# Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@dataclass
class SearchData:
    name: str = ""
    description: str = ""
    dates: str = ""
    price: str = ""
    url: str = ""

    def __post_init__(self):
        self.check_string_fields()

    def check_string_fields(self):
        for field in fields(self):
            if isinstance(getattr(self, field.name), str) and getattr(self, field.name) == "":
                setattr(self, field.name, f"No {field.name}")
            else:
                value = getattr(self, field.name)
                setattr(self, field.name, value.strip())
@dataclass
class ReviewData:
    name: str = ""
    stars: int = 0
    review: str = ""

    def __post_init__(self):
        self.check_string_fields()

    def check_string_fields(self):
        for field in fields(self):
            # Check string fields
            if isinstance(getattr(self, field.name), str):
                # If empty set default text
                if getattr(self, field.name) == "":
                    setattr(self, field.name, f"No {field.name}")
                    continue
                # Strip any trailing spaces, etc.
                value = getattr(self, field.name)
                setattr(self, field.name, value.strip())
class DataPipeline:
    def __init__(self, csv_filename="", storage_queue_limit=50):
        self.names_seen = []
        self.storage_queue = []
        self.storage_queue_limit = storage_queue_limit
        self.csv_filename = csv_filename
        self.csv_file_open = False

    def save_to_csv(self):
        self.csv_file_open = True
        data_to_save = list(self.storage_queue)
        self.storage_queue.clear()
        if not data_to_save:
            return
        keys = [field.name for field in fields(data_to_save[0])]
        file_exists = os.path.isfile(self.csv_filename) and os.path.getsize(self.csv_filename) > 0
        with open(self.csv_filename, mode="a", newline="", encoding="utf-8") as output_file:
            writer = csv.DictWriter(output_file, fieldnames=keys)
            if not file_exists:
                writer.writeheader()
            for item in data_to_save:
                writer.writerow(asdict(item))
        self.csv_file_open = False

    def is_duplicate(self, input_data):
        if input_data.name in self.names_seen:
            logger.warning(f"Duplicate item found: {input_data.name}. Item dropped.")
            return True
        self.names_seen.append(input_data.name)
        return False

    def add_data(self, scraped_data):
        if not self.is_duplicate(scraped_data):
            self.storage_queue.append(scraped_data)
            if len(self.storage_queue) >= self.storage_queue_limit and not self.csv_file_open:
                self.save_to_csv()

    def close_pipeline(self):
        if self.csv_file_open:
            time.sleep(3)
        if self.storage_queue:
            self.save_to_csv()
def find_pagination_urls(keyword, location, pages=4, retries=3):
    formatted_keyword = keyword.replace(", ", "--").replace(" ", "-")
    url = f"https://www.airbnb.com/s/{formatted_keyword}/homes"
    tries, success = 0, False
    links = [url]
    while tries < retries and not success:
        try:
            chrome_options = Options()
            chrome_options.add_argument("--headless")  # Run headless for speed
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--disable-extensions")
            chrome_options.add_argument("--disable-blink-features=AutomationControlled")
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
            driver.get(url)
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "nav[aria-label='Search results pagination']")))
            pagination_bar = driver.find_element(By.CSS_SELECTOR, "nav[aria-label='Search results pagination']")
            a_tags = pagination_bar.find_elements(By.TAG_NAME, "a")
            acceptable_pages = ["1", "2", "3", "4"]
            for a in a_tags:
                if a.text in acceptable_pages and len(links) < pages:
                    link = a.get_attribute("href")
                    if link:
                        links.append(link)
            success = True
            driver.quit()
        except Exception as e:
            logger.warning(f"Failed to fetch page list for {url} tries left {retries - tries}")
            logger.warning(f"Exception: {e}")
            tries += 1
            driver.quit()
    if not success:
        raise Exception("Failed to find pagination, max retries exceeded!")
    return links
def scrape_search_results(url, location, data_pipeline=None, retries=3):
    tries = 0
    success = False
    scrapeops_proxy_url = get_scrapeops_url(url, location=location)
    while tries <= retries and not success:
        try:
            # Initialize WebDriver inside the function
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
            driver.get(scrapeops_proxy_url)
            logger.info(f"Loaded page: {url}")
            # Wait for listings to load
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-testid='card-container']"))
            )
            div_cards = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='card-container']")
            for div_card in div_cards:
                description = div_card.find_element(By.CSS_SELECTOR, "div[data-testid='listing-card-title']").text
                subtitle_elements = div_card.find_elements(By.CSS_SELECTOR, "div[data-testid='listing-card-subtitle']")
                name = subtitle_elements[0].text if len(subtitle_elements) > 0 else "No Name"
                dates = subtitle_elements[-1].text if len(subtitle_elements) > 1 else "No Dates"
                price = div_card.find_element(By.CSS_SELECTOR, "span div span").text if div_card.find_elements(By.CSS_SELECTOR, "span div span") else "No Price"
                href = div_card.find_element(By.TAG_NAME, "a").get_attribute("href")
                # Remove the proxy URL part and construct the original Airbnb URL
                original_url = href.replace("https://proxy.scrapeops.io/", "https://www.airbnb.com/")
                search_data = SearchData(
                    name=name,
                    description=description,
                    dates=dates,
                    price=price,
                    url=original_url  # Use the cleaned URL
                )
                data_pipeline.add_data(search_data)
            logger.info(f"Successfully parsed data from: {url}")
            success = True
        except Exception as e:
            logger.error(f"An error occurred while processing page {url}: {e}")
            logger.info(f"Retrying request for page: {url}, retries left {retries - tries - 1}")
            tries += 1
        finally:
            driver.quit()  # Ensures driver is closed on each attempt
    if not success:
        raise Exception(f"Max retries exceeded for: {url}")
def start_scrape(url_list, location, data_pipeline=None, max_threads=5, retries=3):
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
        executor.map(
            scrape_search_results,
            url_list,
            [location] * len(url_list),
            [data_pipeline] * len(url_list),
            [retries] * len(url_list)
        )
def process_listing(row, location, retries=3):
    url = row["url"]
    tries = 0
    success = False
    csv_name = re.sub(r'[<>:"/|?*]', "", row["name"].replace(" ", "-"))
    scrapeops_proxy_url = get_scrapeops_url(url, location=location)
    while tries <= retries and not success:
        try:
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
            driver.get(scrapeops_proxy_url)
            logger.info(f"Accessing URL: {url}")
            # Wait for the review cards to load
            WebDriverWait(driver, 20).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[role='listitem']"))
            )
            review_cards = driver.find_elements(By.CSS_SELECTOR, "div[role='listitem']")
            review_pipeline = DataPipeline(csv_filename=f"{csv_name}.csv")
            for review_card in review_cards:
                name = review_card.find_element(By.TAG_NAME, "h3").text
                stars = len(review_card.find_elements(By.TAG_NAME, "svg"))
                spans = review_card.find_elements(By.TAG_NAME, "span")
                review = spans[-1].text if spans else "No review available"
                review_data = ReviewData(
                    name=name,
                    stars=stars,
                    review=review
                )
                review_pipeline.add_data(review_data)
            review_pipeline.close_pipeline()
            success = True
            logger.info(f"Successfully parsed: {url}")
        except Exception as e:
            logger.error(f"Exception thrown: {e}")
            logger.warning(f"Failed to process page: {url}")
            logger.warning(f"Retries left: {retries - tries}")
            tries += 1
        finally:
            driver.quit()
    if not success:
        raise Exception(f"Max Retries exceeded: {retries}")
def process_results(csv_file, location, max_threads=5, retries=3):
    logger.info(f"processing {csv_file}")
    with open(csv_file, newline="") as file:
        reader = list(csv.DictReader(file))
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
            executor.map(
                process_listing,
                reader,
                [location] * len(reader),
                [retries] * len(reader)
            )
if __name__ == "__main__":
    MAX_RETRIES = 3
    MAX_THREADS = 5
    PAGES = 4
    LOCATION = "us"

    logger.info(f"Crawl starting...")

    ## INPUT ---> List of keywords to scrape
    keyword_list = ["Myrtle Beach, South Carolina, United States"]
    aggregate_files = []

    ## Job Processes
    for keyword in keyword_list:
        filename = keyword.replace(", ", "-").replace(" ", "-")
        page_urls = find_pagination_urls(keyword, LOCATION, pages=PAGES, retries=MAX_RETRIES)
        crawl_pipeline = DataPipeline(csv_filename=f"{filename}.csv")
        start_scrape(page_urls, LOCATION, data_pipeline=crawl_pipeline, max_threads=MAX_THREADS, retries=MAX_RETRIES)
        crawl_pipeline.close_pipeline()
        aggregate_files.append(f"{filename}.csv")
    logger.info(f"Crawl complete.")

    for file in aggregate_files:
        process_results(file, LOCATION, max_threads=MAX_THREADS, retries=MAX_RETRIES)
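To run it, save the file under any name you like (airbnb_scraper.py below is just a placeholder) and execute it from the project folder:

python airbnb_scraper.py

The crawl writes one CSV per keyword (for the default keyword above, Myrtle-Beach-South-Carolina-United-States.csv), and the listing scraper then writes a separate review CSV for each listing found in that file.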
If you want to adjust your results, feel free to modify any of the following constants in main:
- MAX_RETRIES: Specifies the maximum number of attempts the script will retry if a request fails.
- MAX_THREADS: Specifies the maximum number of concurrent tasks (or threads) the script will use while scraping data.
- PAGES: Sets the number of search result pages the scraper will try to process.
- LOCATION: Defines the country code of the location for simulating the scraping requests.
- keyword_list: Contains the list of phrases or keywords the script will use to search for listings on the website.
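For example, to crawl only two pages of results while routing requests through the UK, the constants might look like this (the keyword below is purely illustrative):

MAX_RETRIES = 3
MAX_THREADS = 5
PAGES = 2
LOCATION = "uk"
keyword_list = ["London, United Kingdom"]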
How To Architect Our Airbnb Scraper
This project involves three different scrapers. The two primary ones are the results crawler and the listing scraper; a small pagination mini-scraper rounds out the set (the overall flow is sketched after the lists below).
- The results crawler will execute a search and save the results.
- The listing scraper will read the crawler’s report and then scrape reviews for each specific listing.
The steps for building our crawler are as follows:
- Create a parsing function for Airbnb search results.
- Implement pagination to scrape multiple result pages; this involves a mini-scraper that performs a search and gathers links to the additional pages.
- Add data storage to save the data from each listing.
- Use concurrency to scrape multiple pages simultaneously.
- Integrate a proxy to bypass anti-bot measures.
Our listing scraper will be built with the following steps:
- Write a parsing function to gather review data.
- Enable reading of URLs from a CSV file.
- Store each review’s data in a CSV file.
- Scrape review pages concurrently.
- Integrate a proxy once again to bypass anti-bot protection.
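Putting these pieces together, the overall flow mirrors the main block of the script in the TLDR section: the pagination mini-scraper finds the page URLs, the crawler writes search results to a CSV, and the listing scraper reads that CSV and writes one review file per listing. Roughly:

# Rough outline of the full pipeline, using the functions from the script above
page_urls = find_pagination_urls(keyword, LOCATION, pages=PAGES)   # pagination mini-scraper
crawl_pipeline = DataPipeline(csv_filename=f"{filename}.csv")
start_scrape(page_urls, LOCATION, data_pipeline=crawl_pipeline)    # results crawler
crawl_pipeline.close_pipeline()
process_results(f"{filename}.csv", LOCATION)                       # listing scraper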
Understanding How To Scrape Airbnb
Before writing code, we need a high-level view of our data. In the sections ahead, we'll examine Airbnb pages to understand how they're structured, how their URLs are built, and where on each page our data is stored.
Step 1: How To Request Airbnb Pages
We'll fetch our Airbnb search pages with a simple GET request. The initial search gives us the listings whose pages we'll later scrape for reviews.
Each card within the search results links to an individual listing page, which is also where the reviews can be found.
The URL for our result pages begins like this:
https://www.airbnb.com/s/Myrtle-Beach--South-Carolina--United-States/homes
The format follows this structure:
https://www.airbnb.com/s/{NAME-OF-SEARCH-LOCATION}/homes
You can observe this in the image below.
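In code, this URL can be built from the location name by replacing ", " with "--" and spaces with "-", which is exactly what the crawler in the TLDR script does. A minimal sketch (build_search_url is just an illustrative helper):

def build_search_url(keyword):
    # "Myrtle Beach, South Carolina, United States" -> "Myrtle-Beach--South-Carolina--United-States"
    formatted_keyword = keyword.replace(", ", "--").replace(" ", "-")
    return f"https://www.airbnb.com/s/{formatted_keyword}/homes"

print(build_search_url("Myrtle Beach, South Carolina, United States"))
# https://www.airbnb.com/s/Myrtle-Beach--South-Carolina--United-States/homes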
Next is an individual listing page. These are the pages we'll look up using our CSV report. As you can see, the URL includes a series of hashes that we can't reproduce on our own:
https://www.airbnb.com/rooms/34653621?adults=1&children=0&enable_m3_private_room=true&infants=0&pets=0&search_mode=regular_search&check_in=2024-09-02&check_out=2024-09-07&source_impression_id=p3_1723223538_P3jJDPiXFbNNUsdP&previous_page_section_name=1000&federated_search_id=532193a1-1995-4edd-824a-5987dfa778f1
Fortunately, we collect these URLs during the crawl, so we never need to construct them ourselves.
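One wrinkle: because the crawl fetches result pages through the ScrapeOps proxy, the hrefs on the cards point at the proxy domain. The relevant lines from the crawler's parsing loop rewrite them back to airbnb.com before saving:

href = div_card.find_element(By.TAG_NAME, "a").get_attribute("href")
# Links picked up through the proxy point at proxy.scrapeops.io; swap the domain back
original_url = href.replace("https://proxy.scrapeops.io/", "https://www.airbnb.com/")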
Step 2: How To Extract Data From Airbnb Results and Pages
Since we know how to retrieve these pages, we now need to identify where the data resides. All of the data on the results page is found within div cards that have a data-testid set to card-container.
We can locate them through the CSS selector "div[data-testid='card-container']". Within these cards, we can access all the additional information we need to extract. The location of this data is shown in the HTML below.
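With Selenium, these cards can be located using that selector. A minimal sketch, assuming driver is a WebDriver that already has a results page loaded:

from selenium.webdriver.common.by import By

div_cards = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='card-container']")
for div_card in div_cards:
    title = div_card.find_element(By.CSS_SELECTOR, "div[data-testid='listing-card-title']").text
    link = div_card.find_element(By.TAG_NAME, "a").get_attribute("href")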
The process of extracting our reviews is quite similar. This time, we will be targeting div elements that have a listitem role. The CSS selector we'll utilize is "div[role='listitem']".
Check it out in the image below. From this div, we can access all the review data relevant to us.
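The review cards can be handled the same way. A minimal sketch, again assuming driver already has a listing page loaded:

review_cards = driver.find_elements(By.CSS_SELECTOR, "div[role='listitem']")
for review_card in review_cards:
    reviewer = review_card.find_element(By.TAG_NAME, "h3").text
    stars = len(review_card.find_elements(By.TAG_NAME, "svg"))  # the script above counts svg elements as stars
    spans = review_card.find_elements(By.TAG_NAME, "span")
    review_text = spans[-1].text if spans else "No review available"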
Step 3: How To Control Pagination
Handling pagination on Airbnb works quite differently from some of our other scrapers in this series. Like the listing pages, the paginated search result URLs contain a sequence of hashes that we can't recreate ourselves.
Here’s an example URL:
https://www.airbnb.com/s/Myrtle-Beach--South-Carolina--United-States/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&query=Myrtle%20Beach%2C%20South%20Carolina%2C%20United%20States&place_id=ChIJASFVO5VoAIkRGJbQtRWxD7w&flexible_trip_lengths%5B%5D=one_week&monthly_start_date=2024-09-01&monthly_length=3&monthly_end_date=2024-12-01&search_mode=regular_search&price_filter_input_type=0&channel=EXPLORE&federated_search_session_id=dcc6f5af-f1c5-4463-8c02-7e4dcf38a02d&search_type=unknown&pagination_search=true&cursor=eyJzZWN0aW9uX29mZnNldCI6MCwiaXRlbXNfb2Zmc2V0IjoxOCwidmVyc2lvbiI6MX0%3D
To get these URLs, we’ll need to scrape them first. To do this, we’ll perform an extra GET request on the first page before initiating the scrape, then gather URLs from the page buttons, as shown in the image below.
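The page buttons sit inside a nav element labeled "Search results pagination", so the extra URLs can be collected roughly like this (mirroring find_pagination_urls in the TLDR script):

pagination_bar = driver.find_element(By.CSS_SELECTOR, "nav[aria-label='Search results pagination']")
page_links = []
for a in pagination_bar.find_elements(By.TAG_NAME, "a"):
    if a.text in ["1", "2", "3", "4"]:
        href = a.get_attribute("href")
        if href:
            page_links.append(href)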
Step 4: Geolocated Data
To work with geolocated data, we will utilize the ScrapeOps Proxy Aggregator API and specify the country parameter.
When a country is provided to ScrapeOps, it routes us through a server located in that country.
- For instance, "country": "us" directs ScrapeOps to make us appear in the US.
- Similarly, "country": "uk" allows us to appear in the UK.
This setup provides us with an IP address from within the selected country.
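In the TLDR script this is handled by get_scrapeops_url(): the country value is just another field in the proxy payload. For example:

from urllib.parse import urlencode

payload = {
    "api_key": API_KEY,
    "url": "https://www.airbnb.com/s/Myrtle-Beach--South-Carolina--United-States/homes",
    "country": "us",  # route the request through a server in the US
    "wait": 5000,     # give the page time to render before the proxy returns it
}
proxy_url = "https://proxy.scrapeops.io/v1/?" + urlencode(payload)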
Setting Up Our Airbnb Scraper Project
Create a New Project Folder
mkdir airbnb-scraper
cd airbnb-scraper
Create a New Virtual Environment
python -m venv venv
Activate the Environment
source venv/bin/activate
Install Our Dependencies
pip install selenium
pip install webdriver-manager
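Finally, add the config.json file that the script reads at startup. It only needs your ScrapeOps API key under the api_key field:

{
    "api_key": "YOUR-SCRAPEOPS-API-KEY"
}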