How to Scrape Pinterest With NodeJS Puppeteer
Pinterest is a goldmine of creative information. No matter what you're looking for, whether it's art ideas, recipes, or nifty ways to set up a room, Pinterest has something that can inspire you. On top of this, Pinterest is also a social network, so we can scrape account names, follower counts and things like that.
In this guide, we'll go over the following topics:
- TLDR: How to Scrape Pinterest
- How To Architect Our Scraper
- Understanding How To Scrape Pinterest
- Setting Up Our Pinterest Scraper
- Build A Pinterest Search Crawler
- Build A Pinterest Scraper
- Legal and Ethical Considerations
- Conclusion
- More Web Scraping Guides
Need help scraping the web?
Then check out ScrapeOps, the complete toolkit for web scraping.
TLDR - How to Scrape Pinterest
If you don't have time to read, but you need a Pinterest scraper, give the one below a try. Just create a new Puppeteer project and install the dependencies at the top.
Also make sure to create a config.json
file with your API key.
const puppeteer = require("puppeteer");
const createCsvWriter = require("csv-writer").createObjectCsvWriter;
const csvParse = require("csv-parse");
const fs = require("fs");
const API_KEY = JSON.parse(fs.readFileSync("config.json")).api_key;
/**
 * Write an array of flat objects to a CSV file, appending when the file already exists.
 *
 * Column ids and titles are taken from the keys of the first record.
 *
 * @param {Object[]} data - Records to write; must contain at least one element.
 * @param {string} outputFile - Path of the CSV file to create or append to.
 * @throws {Error} "No data to write!" when `data` is missing or empty.
 * @throws {Error} "Failed to write to csv" when the writer rejects; the underlying
 *   error is attached as `cause`.
 */
async function writeToCsv(data, outputFile) {
  if (!data || data.length === 0) {
    throw new Error("No data to write!");
  }
  // Decide between "create" and "append" before the writer touches the file.
  const fileExists = fs.existsSync(outputFile);
  const headers = Object.keys(data[0]).map((key) => ({ id: key, title: key }));
  const csvWriter = createCsvWriter({
    path: outputFile,
    header: headers,
    append: fileExists,
  });
  try {
    await csvWriter.writeRecords(data);
  } catch (e) {
    // Keep the original failure reachable for debugging instead of discarding it.
    throw new Error("Failed to write to csv", { cause: e });
  }
}
/**
 * Read a CSV file and collect every record the parser yields into an array.
 *
 * The file is streamed through `csvParse.parse` configured with `columns: true`,
 * a comma delimiter, value trimming and empty-line skipping.
 *
 * @param {string} inputFile - Path of the CSV file to read.
 * @returns {Promise<Object[]>} The parsed records, in file order.
 */
async function readCsv(inputFile) {
  const parseOptions = {
    columns: true,
    delimiter: ",",
    trim: true,
    skip_empty_lines: true,
  };
  const fileStream = fs.createReadStream(inputFile);
  const recordStream = fileStream.pipe(csvParse.parse(parseOptions));

  const rows = [];
  for await (const row of recordStream) {
    rows.push(row);
  }
  return rows;
}
/**
 * Wrap a target URL in a ScrapeOps proxy request URL.
 *
 * @param {string} url - The page we actually want to fetch.
 * @param {string} [location="us"] - Sent as the `country` query parameter.
 * @returns {string} The proxy endpoint with the encoded query string appended.
 */
function getScrapeOpsUrl(url, location="us") {
  const endpoint = "https://proxy.scrapeops.io/v1/";
  const query = new URLSearchParams([
    ["api_key", API_KEY],
    ["url", url],
    ["country", location],
    // Milliseconds the proxy waits for the page to render before responding.
    ["wait", 3000],
    // NOTE(review): presumably selects residential proxies — confirm against the ScrapeOps docs.
    ["residential", true],
  ]);
  return `${endpoint}?${query}`;
}
/**
 * Crawl the Pinterest search page for `keyword` through the ScrapeOps proxy and
 * append every pin found to `<keyword>.csv`.
 *
 * Each attempt opens a fresh page; a failed attempt is logged and retried until
 * `retries` extra attempts have been used. Rows are collected first and written once
 * per attempt so a failure halfway through a page does not leave duplicate rows behind.
 *
 * @param {Object} browser - Puppeteer browser used to open pages.
 * @param {string} keyword - Search term; spaces are sent as "+".
 * @param {string} [location="us"] - Country passed to getScrapeOpsUrl().
 * @param {number} [retries=3] - Extra attempts allowed after the first failure.
 * @returns {Promise<void>}
 */
async function scrapeSearchResults(browser, keyword, location="us", retries=3) {
  // replaceAll so every space becomes "+"; replace(" ", "+") only converted the first one.
  const formattedKeyword = keyword.replaceAll(" ", "+");
  const url = `https://www.pinterest.com/search/pins/?q=${formattedKeyword}&rs=typed`;
  // Deliberately unchanged: main() derives the CSV name with the same expression.
  const outputFile = `${keyword.replace(" ", "-")}.csv`;

  let tries = 0;
  let success = false;
  while (tries <= retries && !success) {
    const page = await browser.newPage();
    try {
      // Inside the try so the page is still closed if this call fails.
      await page.setJavaScriptEnabled(false);
      const proxyUrl = getScrapeOpsUrl(url, location);
      await page.goto(proxyUrl);
      console.log(`Successfully fetched: ${url}`);

      const divCards = await page.$$("div[data-grid-item='true']");
      const searchResults = [];
      for (const divCard of divCards) {
        const aElement = await divCard.$("a");
        const imgElement = await divCard.$("img");
        // Skip grid items that are not regular pins instead of failing the whole page.
        if (!aElement || !imgElement) {
          continue;
        }
        const name = await page.evaluate(element => element.getAttribute("aria-label"), aElement);
        const href = await page.evaluate(element => element.getAttribute("href"), aElement);
        if (!href) {
          continue;
        }
        const imgLink = await page.evaluate(element => element.getAttribute("src"), imgElement);
        searchResults.push({
          name: name,
          url: `https://www.pinterest.com${href.replace("https://proxy.scrapeops.io", "")}`,
          image: imgLink
        });
      }

      // writeToCsv() rejects empty input, so only call it when something was found.
      if (searchResults.length > 0) {
        await writeToCsv(searchResults, outputFile);
      }
      success = true;
    } catch (err) {
      console.log(`Error: ${err}, tries left ${retries - tries}`);
      tries++;
    } finally {
      await page.close();
    }
  }
}
/**
 * Launch a browser, run the search crawl for one keyword, and shut the browser down.
 *
 * @param {string} keyword - Search term to crawl.
 * @param {string} location - Country forwarded to scrapeSearchResults().
 * @param {number} retries - Retry budget forwarded to scrapeSearchResults().
 * @returns {Promise<void>}
 */
async function startScrape(keyword, location, retries) {
  const browser = await puppeteer.launch();
  try {
    await scrapeSearchResults(browser, keyword, location, retries);
  } finally {
    // Always release the browser process, even when the crawl throws.
    await browser.close();
  }
}
/**
 * Scrape one pin page through the ScrapeOps proxy and append the extracted account
 * details to `<row.name>.csv`.
 *
 * Each attempt opens a fresh page. On failure a best-effort screenshot is saved to
 * ERROR.png, the error is logged, and the pin is retried until `retries` extra
 * attempts have been used.
 *
 * @param {Object} browser - Puppeteer browser used to open pages.
 * @param {Object} row - Crawler CSV row; `url`, `name` and `image` are read.
 * @param {string} location - Country passed to getScrapeOpsUrl().
 * @param {number} [retries=3] - Extra attempts allowed after the first failure.
 * @returns {Promise<void>}
 */
async function processPin(browser, row, location, retries = 3) {
  const url = row.url;
  let tries = 0;
  let success = false;

  while (tries <= retries && !success) {
    const page = await browser.newPage();
    try {
      // Inside the try so the page is still closed if header setup fails.
      await page.setExtraHTTPHeaders({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
      });
      await page.goto(getScrapeOpsUrl(url, location), { timeout: 60000 });

      const mainCard = await page.$("div[data-test-id='CloseupDetails']");
      if (!mainCard) {
        throw new Error("Failed to load the page!");
      }

      // The source website is optional; keep the placeholder when it is absent.
      let website = "n/a";
      const websiteHolder = await page.$("span[style='text-decoration: underline;']");
      if (websiteHolder) {
        website = await page.evaluate(element => element.textContent, websiteHolder);
      }

      // One matching element per full star.
      const starDivs = await page.$$("div[data-test-id='rating-star-full']");
      const stars = starDivs.length;

      const profileInfoDiv = await mainCard.$("div[data-test-id='follower-count']");
      if (profileInfoDiv === null) {
        throw new Error("Page failed to loaded, most likely blocked!");
      }
      const profileText = await page.evaluate(element => element.textContent, profileInfoDiv);

      const accountNameDiv = await profileInfoDiv.$("div[data-test-id='creator-profile-name']");
      const nestedDiv = accountNameDiv ? await accountNameDiv.$("div") : null;
      if (!nestedDiv) {
        throw new Error("Failed to find the account name!");
      }
      const accountName = await page.evaluate(element => element.getAttribute("title"), nestedDiv);

      // The profile text holds the account name followed by "<count> followers".
      const followerCount = profileText.replace(accountName, "").replace(" followers", "");

      const pinData = {
        name: accountName,
        website: website,
        stars: stars,
        follower_count: followerCount,
        image: row.image
      };
      await writeToCsv([pinData], `${row.name.replace(" ", "-")}.csv`);
      success = true;
    } catch (err) {
      // The screenshot is only a debugging aid; if it fails too, it must not
      // escape the handler and cancel the remaining retries.
      try {
        await page.screenshot({path: "ERROR.png"});
      } catch (screenshotErr) {
        console.log(`Could not save error screenshot: ${screenshotErr}`);
      }
      console.log(`Error: ${err}, tries left: ${retries-tries}, url: ${url}`);
      tries++;
    } finally {
      await page.close();
    }
  }
}
/**
 * Read the crawler's CSV and scrape every pin in it, `concurrencyLimit` pins at a time.
 *
 * Every task in a batch is allowed to settle before the next batch starts, so the
 * number of open pages never exceeds the limit and the browser is never closed while
 * a pin is still being processed.
 *
 * @param {string} csvFile - CSV file produced by the crawler.
 * @param {string} location - Country forwarded to processPin().
 * @param {number} concurrencyLimit - Maximum pins processed at once; values below 1
 *   (or missing) are treated as 1.
 * @param {number} retries - Retry budget forwarded to processPin().
 * @returns {Promise<void>}
 */
async function processResults(csvFile, location, concurrencyLimit, retries) {
  const pins = await readCsv(csvFile);
  // A batch size of 0/undefined would never consume any pins and loop forever.
  const batchSize = Math.max(1, Math.floor(Number(concurrencyLimit)) || 1);
  const browser = await puppeteer.launch();

  try {
    for (let start = 0; start < pins.length; start += batchSize) {
      const currentBatch = pins.slice(start, start + batchSize);
      const tasks = currentBatch.map(pin => processPin(browser, pin, location, retries));

      // allSettled (not all): one rejected pin must not release the batch early
      // while its siblings are still using the browser.
      const outcomes = await Promise.allSettled(tasks);
      for (const outcome of outcomes) {
        if (outcome.status === "rejected") {
          console.log(`Failed to process batch: ${outcome.reason}`);
        }
      }
    }
  } finally {
    await browser.close();
  }
}
/**
 * Run the full pipeline: crawl search results for every keyword, then scrape the
 * pins listed in each CSV the crawl produced.
 */
async function main() {
  // Settings meant to be tweaked by the reader.
  const keywords = ["grilling"];
  const concurrencyLimit = 4;
  const location = "us";
  const retries = 3;

  // Name of the CSV the crawler writes for a keyword.
  const csvNameFor = (keyword) => `${keyword.replace(" ", "-")}.csv`;

  // Stage 1: crawl, one keyword at a time.
  const aggregateFiles = [];
  for (let i = 0; i < keywords.length; i += 1) {
    const keyword = keywords[i];
    console.log("Crawl starting");
    await startScrape(keyword, location, retries);
    console.log("Crawl complete");
    aggregateFiles.push(csvNameFor(keyword));
  }

  // Stage 2: scrape the pins recorded in each crawl file.
  console.log("Starting scrape");
  for (const csvFile of aggregateFiles) {
    await processResults(csvFile, location, concurrencyLimit, retries);
  }
  console.log("Scrape complete");
}
main();
Feel free to change any of the `const` variables inside of `main` if you'd like to tweak your results. Try changing the following:
- `keywords`: This list contains the keywords for which you want to scrape Pinterest search results.
- `concurrencyLimit`: This parameter sets the number of concurrent tasks (or browser pages) that the script will process at the same time.
- `location`: This parameter sets the geographical location from which the requests are made. It can affect the content returned by the website due to region-specific restrictions or differences.
- `retries`: This parameter sets the maximum number of attempts the script will make to fetch data from a URL if the initial request fails.
If you are having any issues, try changing your `location` (the country the requests are routed through) and maybe lowering your `concurrencyLimit`.
How To Architect Our Pinterest Scraper
When we scrape Pinterest, we're going to need two separate scrapers.
- The first scraper will be our crawler. The purpose of the crawler is simple: fetch search results and save them to a CSV file.
- Along with the crawler, we need a pin scraper. The pin scraper needs to look up individual pins and save their information.
We're going to design this project to use the following things:
- Parsing: to extract the important data from Pinterest.
- Data Storage: To store our data for later review and also to feed information into our scraper.
- Concurrency: to process multiple pages simultaneously and efficiently.
- Proxy Integration: Pinterest is notoriously difficult to access programmatically, so we'll be using the ScrapeOps Proxy API.
Understanding How To Scrape Pinterest
Step 1: How To Request Pinterest Pages
Pretty much everything we do on the web starts with a GET request. When you perform a Google search, or even type a domain name into your address bar, you're performing a GET.
As you probably guessed, we start with a GET when we access Pinterest too. In this tutorial, we're going to scrape results for the keyword, "grilling".
When you perform a search for it in your normal browser, the URL looks like this:
https://www.pinterest.com/search/pins/?q=grilling&rs=typed
- Our actual URL is `https://www.pinterest.com/search/pins/`.
- `?` tells the server that we'd like to perform a query.
- We can string multiple queries together with `&`.
- Our query string for the grilling page is `?q=grilling&rs=typed`.
- `typed` is a standard query when we perform a Pinterest search on our computer.
- `grilling` is the search we actually want to perform.
Take a look at the address bar in the image below and see for yourself.
The pin pages are even simpler. With pins, we don't even have to deal with queries, we only need the basic URL. All pin URLs are constructed like this:
https://www.pinterest.com/pin/PIN-NUMBER-GOES-HERE/
Take a look at the pin and the url below.
Step 2: How To Extract Data From Pinterest Results and Pages
Pinterest is very difficult to scrape. Not only is all of our content generated dynamically, but they use some pretty active client side JavaScript to authenticate during sessions.
Even though we're already using a headless browser, Puppeteer, our secret weapon here is the ScrapeOps Headless Browser.
Take a look at the results page below and you can see how nasty the HTML is.
And here is our pin page.
Step 3: Geolocated Data
To handle geolocation, we'll make full use of the ScrapeOps Proxy API.
When we use the API, we can set a country
param which will actually route us through a server in that country.
- If we want to be in the US, we'll be routed through a server in the US.
- If we want to show up in the UK, we'll be routed through a server in the UK.
Setting Up Our Pinterest Scraper Project
Let's get started. You can run the following commands to get set up.
Create a New Project Folder
mkdir pinterest-scraper
cd pinterest-scraper
Create a New JavaScript Project
npm init --y
Install Our Dependencies
npm install puppeteer
npm install csv-writer
npm install csv-parse
npm install fs
Build A Pinterest Search Crawler
Time to start on our first scraper, the crawler. Our crawler will make good use of the following:
- Parsing: to extract valuable data from the page.
- Data Storage: to store our data in a safe and efficient manner.
- Proxy Integration: to get past anti-bots and anything else that might block us.
Step 1: Create Simple Search Data Parser
To start our crawler, we'll build a parser. The job of the parser is relatively simple. It performs a search and pulls data from the search results.
You should notice some basic retry logic and error handling. scrapeSearchResults()
is our parsing function.
Take a look at the code below.
const puppeteer = require("puppeteer");
const createCsvWriter = require("csv-writer").createObjectCsvWriter;
const csvParse = require("csv-parse");
const fs = require("fs");
const API_KEY = JSON.parse(fs.readFileSync("config.json")).api_key;
/**
 * Fetch the Pinterest search page for `keyword` and log every pin found on it.
 *
 * Each attempt opens a fresh page; a failed attempt is logged and retried until
 * `retries` extra attempts have been used.
 *
 * @param {Object} browser - Puppeteer browser used to open pages.
 * @param {string} keyword - Search term; spaces are sent as "+".
 * @param {*} pageNumber - Not used by this function; it only occupies the third
 *   positional slot.
 * @param {string} [location="us"] - Not used at this stage of the tutorial.
 * @param {number} [retries=3] - Extra attempts allowed after the first failure.
 * @returns {Promise<void>}
 */
async function scrapeSearchResults(browser, keyword, pageNumber, location="us", retries=3) {
  // replaceAll so every space becomes "+"; replace(" ", "+") only converted the first one.
  const formattedKeyword = keyword.replaceAll(" ", "+");
  const url = `https://www.pinterest.com/search/pins/?q=${formattedKeyword}&rs=typed`;

  let tries = 0;
  let success = false;
  while (tries <= retries && !success) {
    const page = await browser.newPage();
    try {
      // Inside the try so the page is still closed if this call fails.
      await page.setJavaScriptEnabled(false);
      await page.goto(url);
      console.log(`Successfully fetched: ${url}`);

      const divCards = await page.$$("div[data-grid-item='true']");
      for (const divCard of divCards) {
        const aElement = await divCard.$("a");
        const imgElement = await divCard.$("img");
        // Skip grid items that are not regular pins instead of failing the whole page.
        if (!aElement || !imgElement) {
          continue;
        }
        const name = await page.evaluate(element => element.getAttribute("aria-label"), aElement);
        const href = await page.evaluate(element => element.getAttribute("href"), aElement);
        if (!href) {
          continue;
        }
        const imgLink = await page.evaluate(element => element.getAttribute("src"), imgElement);
        const searchData = {
          name: name,
          url: `https://www.pinterest.com${href.replace("https://proxy.scrapeops.io", "")}`,
          image: imgLink
        };
        console.log(searchData);
      }
      success = true;
    } catch (err) {
      console.log(`Error: ${err}, tries left ${retries - tries}`);
      tries++;
    } finally {
      await page.close();
    }
  }
}
/**
 * Launch a browser, run the search crawl for one keyword, and shut the browser down.
 *
 * @param {string} keyword - Search term to crawl.
 * @param {string} location - Country forwarded to scrapeSearchResults().
 * @param {number} concurrencyLimit - Accepted for signature compatibility; not used here.
 * @param {number} retries - Retry budget forwarded to scrapeSearchResults(); when
 *   undefined, that function's own default applies.
 * @returns {Promise<void>}
 */
async function startScrape(keyword, location, concurrencyLimit, retries) {
  const browser = await puppeteer.launch();
  try {
    // scrapeSearchResults() is declared as (browser, keyword, pageNumber, location, retries).
    // Fill the unused pageNumber slot explicitly so location and retries land in
    // their own parameters instead of shifting one position to the left.
    await scrapeSearchResults(browser, keyword, 0, location, retries);
  } finally {
    // Always release the browser process, even when the crawl throws.
    await browser.close();
  }
}
/**
 * Run the crawl for every keyword and record the CSV name associated with each one.
 */
async function main() {
  // Settings meant to be tweaked by the reader.
  const keywords = ["grilling"];
  const concurrencyLimit = 4;
  const location = "uk";
  const retries = 3;

  // Name of the CSV associated with a keyword.
  const csvNameFor = (keyword) => `${keyword.replace(" ", "-")}.csv`;

  const aggregateFiles = [];
  for (let i = 0; i < keywords.length; i += 1) {
    const keyword = keywords[i];
    console.log("Crawl starting");
    await startScrape(keyword, location, retries);
    console.log("Crawl complete");
    aggregateFiles.push(csvNameFor(keyword));
  }
}
main();
Here are some things you should pay attention to in our parsing function:
- `await page.$$("div[data-grid-item='true']")` finds all the result items on the page. On Pinterest, `div[data-grid-item='true']` denotes an individual search result.
- `await divCard.$("a")` pulls the link or `<a>` element from the search result.
- We get the name of the search result from `await page.evaluate(element => element.getAttribute("aria-label"), aElement)`.
- The link to the pin gets extracted using `await page.evaluate(element => element.getAttribute("href"), aElement)`.
- We find the url of the image with `await page.evaluate(element => element.getAttribute("src"), imgElement)`.
Step 2: Storing the Scraped Data
Parsing our data isn't enough, we need to be able to store it for later use. Take a look at the function in the snippet below. This function takes an array of JSON objects and writes them to CSV.
/**
 * Write an array of flat objects to a CSV file, appending when the file already exists.
 *
 * Column ids and titles are taken from the keys of the first record.
 *
 * @param {Object[]} data - Records to write; must contain at least one element.
 * @param {string} outputFile - Path of the CSV file to create or append to.
 * @throws {Error} "No data to write!" when `data` is missing or empty.
 * @throws {Error} "Failed to write to csv" when the writer rejects; the underlying
 *   error is attached as `cause`.
 */
async function writeToCsv(data, outputFile) {
  if (!data || data.length === 0) {
    throw new Error("No data to write!");
  }
  // Decide between "create" and "append" before the writer touches the file.
  const fileExists = fs.existsSync(outputFile);
  const headers = Object.keys(data[0]).map((key) => ({ id: key, title: key }));
  const csvWriter = createCsvWriter({
    path: outputFile,
    header: headers,
    append: fileExists,
  });
  try {
    await csvWriter.writeRecords(data);
  } catch (e) {
    // Keep the original failure reachable for debugging instead of discarding it.
    throw new Error("Failed to write to csv", { cause: e });
  }
}
writeToCsv()
takes an array of JSON objects and a filename. First, it checks if our outputFile
exists. If it doesn't exist, we create it. If the file does exist, we append to it.
This approach allows us to always write the maximum possible data to a file without overwriting existing data.
In our updated code below, we adjust it to write the object to a CSV file instead of printing it to the console.
const puppeteer = require("puppeteer");
const createCsvWriter = require("csv-writer").createObjectCsvWriter;
const csvParse = require("csv-parse");
const fs = require("fs");
const API_KEY = JSON.parse(fs.readFileSync("config.json")).api_key;
/**
 * Write an array of flat objects to a CSV file, appending when the file already exists.
 *
 * Column ids and titles are taken from the keys of the first record.
 *
 * @param {Object[]} data - Records to write; must contain at least one element.
 * @param {string} outputFile - Path of the CSV file to create or append to.
 * @throws {Error} "No data to write!" when `data` is missing or empty.
 * @throws {Error} "Failed to write to csv" when the writer rejects; the underlying
 *   error is attached as `cause`.
 */
async function writeToCsv(data, outputFile) {
  if (!data || data.length === 0) {
    throw new Error("No data to write!");
  }
  // Decide between "create" and "append" before the writer touches the file.
  const fileExists = fs.existsSync(outputFile);
  const headers = Object.keys(data[0]).map((key) => ({ id: key, title: key }));
  const csvWriter = createCsvWriter({
    path: outputFile,
    header: headers,
    append: fileExists,
  });
  try {
    await csvWriter.writeRecords(data);
  } catch (e) {
    // Keep the original failure reachable for debugging instead of discarding it.
    throw new Error("Failed to write to csv", { cause: e });
  }
}
/**
 * Fetch the Pinterest search page for `keyword` and append every pin found to
 * `<keyword>.csv`.
 *
 * Each attempt opens a fresh page; a failed attempt is logged and retried until
 * `retries` extra attempts have been used. Rows are collected first and written once
 * per attempt so a failure halfway through a page does not leave duplicate rows behind.
 *
 * @param {Object} browser - Puppeteer browser used to open pages.
 * @param {string} keyword - Search term; spaces are sent as "+".
 * @param {*} pageNumber - Not used by this function; it only occupies the third
 *   positional slot.
 * @param {string} [location="us"] - Not used at this stage of the tutorial.
 * @param {number} [retries=3] - Extra attempts allowed after the first failure.
 * @returns {Promise<void>}
 */
async function scrapeSearchResults(browser, keyword, pageNumber, location="us", retries=3) {
  // replaceAll so every space becomes "+"; replace(" ", "+") only converted the first one.
  const formattedKeyword = keyword.replaceAll(" ", "+");
  const url = `https://www.pinterest.com/search/pins/?q=${formattedKeyword}&rs=typed`;
  // Deliberately unchanged: main() derives the CSV name with the same expression.
  const outputFile = `${keyword.replace(" ", "-")}.csv`;

  let tries = 0;
  let success = false;
  while (tries <= retries && !success) {
    const page = await browser.newPage();
    try {
      // Inside the try so the page is still closed if this call fails.
      await page.setJavaScriptEnabled(false);
      await page.goto(url);
      console.log(`Successfully fetched: ${url}`);

      const divCards = await page.$$("div[data-grid-item='true']");
      const searchResults = [];
      for (const divCard of divCards) {
        const aElement = await divCard.$("a");
        const imgElement = await divCard.$("img");
        // Skip grid items that are not regular pins instead of failing the whole page.
        if (!aElement || !imgElement) {
          continue;
        }
        const name = await page.evaluate(element => element.getAttribute("aria-label"), aElement);
        const href = await page.evaluate(element => element.getAttribute("href"), aElement);
        if (!href) {
          continue;
        }
        const imgLink = await page.evaluate(element => element.getAttribute("src"), imgElement);
        searchResults.push({
          name: name,
          url: `https://www.pinterest.com${href.replace("https://proxy.scrapeops.io", "")}`,
          image: imgLink
        });
      }

      // writeToCsv() rejects empty input, so only call it when something was found.
      if (searchResults.length > 0) {
        await writeToCsv(searchResults, outputFile);
      }
      success = true;
    } catch (err) {
      console.log(`Error: ${err}, tries left ${retries - tries}`);
      tries++;
    } finally {
      await page.close();
    }
  }
}
/**
 * Launch a browser, run the search crawl for one keyword, and shut the browser down.
 *
 * @param {string} keyword - Search term to crawl.
 * @param {string} location - Country forwarded to scrapeSearchResults().
 * @param {number} concurrencyLimit - Accepted for signature compatibility; not used here.
 * @param {number} retries - Retry budget forwarded to scrapeSearchResults(); when
 *   undefined, that function's own default applies.
 * @returns {Promise<void>}
 */
async function startScrape(keyword, location, concurrencyLimit, retries) {
  const browser = await puppeteer.launch();
  try {
    // scrapeSearchResults() is declared as (browser, keyword, pageNumber, location, retries).
    // Fill the unused pageNumber slot explicitly so location and retries land in
    // their own parameters instead of shifting one position to the left.
    await scrapeSearchResults(browser, keyword, 0, location, retries);
  } finally {
    // Always release the browser process, even when the crawl throws.
    await browser.close();
  }
}
/**
 * Run the crawl for every keyword and record the CSV name associated with each one.
 */
async function main() {
  // Settings meant to be tweaked by the reader.
  const keywords = ["grilling"];
  const concurrencyLimit = 4;
  const location = "uk";
  const retries = 3;

  // Name of the CSV the crawler writes for a keyword.
  const csvNameFor = (keyword) => `${keyword.replace(" ", "-")}.csv`;

  const aggregateFiles = [];
  for (let i = 0; i < keywords.length; i += 1) {
    const keyword = keywords[i];
    console.log("Crawl starting");
    await startScrape(keyword, location, retries);
    console.log("Crawl complete");
    aggregateFiles.push(csvNameFor(keyword));
  }
}
main();
Step 3: Bypassing Anti-Bots
Bypassing anti-bots is a very important part of scraping any site. Most sites use these to block malicious traffic. Even though we're not being malicious, our scraper doesn't really look human at all.
To bypass anti-bots and add support for geolocation, we'll make a getScrapeOpsUrl()
function. While it's only a small amount of code, this function converts any regular URL into a ScrapeOps proxied URL.
Another important point in our case today is the `wait` parameter. If you remember from our earlier examples, we actually disable JavaScript from running inside Puppeteer. `wait: 2000` tells the ScrapeOps server to wait two seconds for our content to render before sending the page back to us.
We're then able to read the static page without getting blocked or redirected by the JavaScript code that Pinterest tries to execute.
/**
 * Wrap a target URL in a ScrapeOps proxy request URL.
 *
 * @param {string} url - The page we actually want to fetch.
 * @param {string} [location="us"] - Sent as the `country` query parameter.
 * @returns {string} The proxy endpoint with the encoded query string appended.
 */
function getScrapeOpsUrl(url, location="us") {
  const endpoint = "https://proxy.scrapeops.io/v1/";
  const query = new URLSearchParams([
    ["api_key", API_KEY],
    ["url", url],
    ["country", location],
    // Milliseconds the proxy waits for the page to render before responding.
    ["wait", 2000],
  ]);
  return `${endpoint}?${query}`;
}
Here is our fully updated code with proxy integration.
const puppeteer = require("puppeteer");
const createCsvWriter = require("csv-writer").createObjectCsvWriter;
const csvParse = require("csv-parse");
const fs = require("fs");
const API_KEY = JSON.parse(fs.readFileSync("config.json")).api_key;
/**
 * Write an array of flat objects to a CSV file, appending when the file already exists.
 *
 * Column ids and titles are taken from the keys of the first record.
 *
 * @param {Object[]} data - Records to write; must contain at least one element.
 * @param {string} outputFile - Path of the CSV file to create or append to.
 * @throws {Error} "No data to write!" when `data` is missing or empty.
 * @throws {Error} "Failed to write to csv" when the writer rejects; the underlying
 *   error is attached as `cause`.
 */
async function writeToCsv(data, outputFile) {
  if (!data || data.length === 0) {
    throw new Error("No data to write!");
  }
  // Decide between "create" and "append" before the writer touches the file.
  const fileExists = fs.existsSync(outputFile);
  const headers = Object.keys(data[0]).map((key) => ({ id: key, title: key }));
  const csvWriter = createCsvWriter({
    path: outputFile,
    header: headers,
    append: fileExists,
  });
  try {
    await csvWriter.writeRecords(data);
  } catch (e) {
    // Keep the original failure reachable for debugging instead of discarding it.
    throw new Error("Failed to write to csv", { cause: e });
  }
}
/**
 * Wrap a target URL in a ScrapeOps proxy request URL.
 *
 * @param {string} url - The page we actually want to fetch.
 * @param {string} [location="us"] - Sent as the `country` query parameter.
 * @returns {string} The proxy endpoint with the encoded query string appended.
 */
function getScrapeOpsUrl(url, location="us") {
  const endpoint = "https://proxy.scrapeops.io/v1/";
  const query = new URLSearchParams([
    ["api_key", API_KEY],
    ["url", url],
    ["country", location],
    // Milliseconds the proxy waits for the page to render before responding.
    ["wait", 2000],
  ]);
  return `${endpoint}?${query}`;
}
/**
 * Crawl the Pinterest search page for `keyword` through the ScrapeOps proxy and
 * append every pin found to `<keyword>.csv`.
 *
 * Each attempt opens a fresh page; a failed attempt is logged and retried until
 * `retries` extra attempts have been used. Rows are collected first and written once
 * per attempt so a failure halfway through a page does not leave duplicate rows behind.
 *
 * @param {Object} browser - Puppeteer browser used to open pages.
 * @param {string} keyword - Search term; spaces are sent as "+".
 * @param {*} pageNumber - Not used by this function; it only occupies the third
 *   positional slot.
 * @param {string} [location="us"] - Country passed to getScrapeOpsUrl().
 * @param {number} [retries=3] - Extra attempts allowed after the first failure.
 * @returns {Promise<void>}
 */
async function scrapeSearchResults(browser, keyword, pageNumber, location="us", retries=3) {
  // replaceAll so every space becomes "+"; replace(" ", "+") only converted the first one.
  const formattedKeyword = keyword.replaceAll(" ", "+");
  const url = `https://www.pinterest.com/search/pins/?q=${formattedKeyword}&rs=typed`;
  // Deliberately unchanged: main() derives the CSV name with the same expression.
  const outputFile = `${keyword.replace(" ", "-")}.csv`;

  let tries = 0;
  let success = false;
  while (tries <= retries && !success) {
    const page = await browser.newPage();
    try {
      // Inside the try so the page is still closed if this call fails.
      await page.setJavaScriptEnabled(false);
      const proxyUrl = getScrapeOpsUrl(url, location);
      await page.goto(proxyUrl);
      console.log(`Successfully fetched: ${url}`);

      const divCards = await page.$$("div[data-grid-item='true']");
      const searchResults = [];
      for (const divCard of divCards) {
        const aElement = await divCard.$("a");
        const imgElement = await divCard.$("img");
        // Skip grid items that are not regular pins instead of failing the whole page.
        if (!aElement || !imgElement) {
          continue;
        }
        const name = await page.evaluate(element => element.getAttribute("aria-label"), aElement);
        const href = await page.evaluate(element => element.getAttribute("href"), aElement);
        if (!href) {
          continue;
        }
        const imgLink = await page.evaluate(element => element.getAttribute("src"), imgElement);
        searchResults.push({
          name: name,
          url: `https://www.pinterest.com${href.replace("https://proxy.scrapeops.io", "")}`,
          image: imgLink
        });
      }

      // writeToCsv() rejects empty input, so only call it when something was found.
      if (searchResults.length > 0) {
        await writeToCsv(searchResults, outputFile);
      }
      success = true;
    } catch (err) {
      console.log(`Error: ${err}, tries left ${retries - tries}`);
      tries++;
    } finally {
      await page.close();
    }
  }
}
/**
 * Launch a browser, run the search crawl for one keyword, and shut the browser down.
 *
 * @param {string} keyword - Search term to crawl.
 * @param {string} location - Country forwarded to scrapeSearchResults().
 * @param {number} concurrencyLimit - Accepted for signature compatibility; not used here.
 * @param {number} retries - Retry budget forwarded to scrapeSearchResults(); when
 *   undefined, that function's own default applies.
 * @returns {Promise<void>}
 */
async function startScrape(keyword, location, concurrencyLimit, retries) {
  const browser = await puppeteer.launch();
  try {
    // scrapeSearchResults() is declared as (browser, keyword, pageNumber, location, retries).
    // Fill the unused pageNumber slot explicitly so location and retries land in
    // their own parameters instead of shifting one position to the left.
    await scrapeSearchResults(browser, keyword, 0, location, retries);
  } finally {
    // Always release the browser process, even when the crawl throws.
    await browser.close();
  }
}
/**
 * Run the crawl for every keyword and record the CSV name associated with each one.
 */
async function main() {
  // Settings meant to be tweaked by the reader.
  const keywords = ["grilling"];
  const concurrencyLimit = 4;
  const location = "uk";
  const retries = 3;

  // Name of the CSV the crawler writes for a keyword.
  const csvNameFor = (keyword) => `${keyword.replace(" ", "-")}.csv`;

  const aggregateFiles = [];
  for (let i = 0; i < keywords.length; i += 1) {
    const keyword = keywords[i];
    console.log("Crawl starting");
    await startScrape(keyword, location, retries);
    console.log("Crawl complete");
    aggregateFiles.push(csvNameFor(keyword));
  }
}
main();
Step 4: Production Run
Alright! Now that we've got a fully functional crawler, let's run it in production. Take a look at the main
below.
/**
 * Production run of the crawler: crawl every keyword and record the CSV name
 * associated with each one.
 */
async function main() {
  // Settings meant to be tweaked by the reader.
  const keywords = ["grilling"];
  const concurrencyLimit = 4;
  const location = "us";
  const retries = 3;

  // Name of the CSV the crawler writes for a keyword.
  const csvNameFor = (keyword) => `${keyword.replace(" ", "-")}.csv`;

  const aggregateFiles = [];
  for (let i = 0; i < keywords.length; i += 1) {
    const keyword = keywords[i];
    console.log("Crawl starting");
    await startScrape(keyword, location, retries);
    console.log("Crawl complete");
    aggregateFiles.push(csvNameFor(keyword));
  }
}
Feel free to change any of the following constants to tweak your results:
keywords
location
retries
Don't change the concurrencyLimit
yet because we're not using it just yet. This will come into play when we build our pin scraper.
Here are our results:
It took 9.49 seconds to fetch and parse the page. Results may vary based on your hardware, internet speed, and the country
you passed into the API.
Build A Pinterest Scraper
Our pin scraper is going to fetch an individual pin and pull information from it. Our design needs to be able to do all of the following things:
- Parse the information from a pin.
- Read the rows from the CSV file.
- Store the data we extracted when parsing.
- Perform all these actions concurrently.
- Integrate with the ScrapeOps Proxy API
Step 1: Create Simple Data Parser
Just like we did earlier with the crawler, we're going to start our scraper out with a parsing function. Take a look at the code below. It has retries and error handling just like before, but this time, the parsing logic is a bit different.
/**
 * Scrape one pin page and log the extracted account details.
 *
 * Each attempt opens a fresh page. On failure a best-effort screenshot is saved to
 * ERROR.png, the error is logged, and the pin is retried until `retries` extra
 * attempts have been used.
 *
 * @param {Object} browser - Puppeteer browser used to open pages.
 * @param {Object} row - Crawler CSV row; `url` and `image` are read.
 * @param {string} location - Not used at this stage of the tutorial.
 * @param {number} [retries=3] - Extra attempts allowed after the first failure.
 * @returns {Promise<void>}
 */
async function processPin(browser, row, location, retries = 3) {
  const url = row.url;
  let tries = 0;
  let success = false;

  while (tries <= retries && !success) {
    const page = await browser.newPage();
    try {
      await page.goto(url, { timeout: 60000 });

      const mainCard = await page.$("div[data-test-id='CloseupDetails']");
      // Fail with a clear message instead of a TypeError on mainCard.$ below.
      if (!mainCard) {
        throw new Error("Failed to load the page!");
      }

      // The source website is optional; keep the placeholder when it is absent.
      let website = "n/a";
      const websiteHolder = await page.$("span[style='text-decoration: underline;']");
      if (websiteHolder) {
        website = await page.evaluate(element => element.textContent, websiteHolder);
      }

      // One matching element per full star.
      const starDivs = await page.$$("div[data-test-id='rating-star-full']");
      const stars = starDivs.length;

      const profileInfoDiv = await mainCard.$("div[data-test-id='follower-count']");
      if (profileInfoDiv === null) {
        throw new Error("Page failed to loaded, most likely blocked!");
      }
      const profileText = await page.evaluate(element => element.textContent, profileInfoDiv);

      const accountNameDiv = await profileInfoDiv.$("div[data-test-id='creator-profile-name']");
      const nestedDiv = accountNameDiv ? await accountNameDiv.$("div") : null;
      if (!nestedDiv) {
        throw new Error("Failed to find the account name!");
      }
      const accountName = await page.evaluate(element => element.getAttribute("title"), nestedDiv);

      // The profile text holds the account name followed by "<count> followers".
      const followerCount = profileText.replace(accountName, "").replace(" followers", "");

      const pinData = {
        name: accountName,
        website: website,
        stars: stars,
        follower_count: followerCount,
        image: row.image
      };
      console.log(pinData);
      success = true;
    } catch (err) {
      // The screenshot is only a debugging aid; if it fails too, it must not
      // escape the handler and cancel the remaining retries.
      try {
        await page.screenshot({path: "ERROR.png"});
      } catch (screenshotErr) {
        console.log(`Could not save error screenshot: ${screenshotErr}`);
      }
      console.log(`Error: ${err}, tries left: ${retries-tries}, url: ${url}`);
      tries++;
    } finally {
      await page.close();
    }
  }
}
Here is some key parsing logic to notice in processPin()
:
- `await page.$("div[data-test-id='CloseupDetails']")` finds the main card on the page.
- We find the `websiteHolder` with `await page.$("span[style='text-decoration: underline;']")`.
- If there is a `websiteHolder` present, we use `await page.evaluate(element => element.textContent, websiteHolder)` to extract the `textContent` from it.
- `await mainCard.$("div[data-test-id='follower-count']")` looks for the profile section on the page. If this item isn't present, we throw an error because the page didn't load correctly.
- `await page.evaluate(element => element.getAttribute("title"), nestedDiv)` pulls the account name from our `nestedDiv`.
- We then use `replace()` to remove unneeded text and retrieve our follower count.
Step 2: Loading URLs To Scrape
Our processPin()
function isn't very useful if it doesn't know what to scrape. We need to read the CSV file created by our crawler and then pass all the rows from the crawler into processPin()
.
The function below takes a CSV file and reads it into an array of JSON objects.
/**
 * Read a CSV file and collect every record the parser yields into an array.
 *
 * The file is streamed through `csvParse.parse` configured with `columns: true`,
 * a comma delimiter, value trimming and empty-line skipping.
 *
 * @param {string} inputFile - Path of the CSV file to read.
 * @returns {Promise<Object[]>} The parsed records, in file order.
 */
async function readCsv(inputFile) {
  const parseOptions = {
    columns: true,
    delimiter: ",",
    trim: true,
    skip_empty_lines: true,
  };
  const fileStream = fs.createReadStream(inputFile);
  const recordStream = fileStream.pipe(csvParse.parse(parseOptions));

  const rows = [];
  for await (const row of recordStream) {
    rows.push(row);
  }
  return rows;
}
Now that we know how to read a CSV, let's add this and our parser into our overall code.
const puppeteer = require("puppeteer");
const createCsvWriter = require("csv-writer").createObjectCsvWriter;
const csvParse = require("csv-parse");
const fs = require("fs");
const API_KEY = JSON.parse(fs.readFileSync("config.json")).api_key;
/**
 * Write an array of flat objects to a CSV file, appending when the file already exists.
 *
 * Column ids and titles are taken from the keys of the first record.
 *
 * @param {Object[]} data - Records to write; must contain at least one element.
 * @param {string} outputFile - Path of the CSV file to create or append to.
 * @throws {Error} "No data to write!" when `data` is missing or empty.
 * @throws {Error} "Failed to write to csv" when the writer rejects; the underlying
 *   error is attached as `cause`.
 */
async function writeToCsv(data, outputFile) {
  if (!data || data.length === 0) {
    throw new Error("No data to write!");
  }
  // Decide between "create" and "append" before the writer touches the file.
  const fileExists = fs.existsSync(outputFile);
  const headers = Object.keys(data[0]).map((key) => ({ id: key, title: key }));
  const csvWriter = createCsvWriter({
    path: outputFile,
    header: headers,
    append: fileExists,
  });
  try {
    await csvWriter.writeRecords(data);
  } catch (e) {
    // Keep the original failure reachable for debugging instead of discarding it.
    throw new Error("Failed to write to csv", { cause: e });
  }
}
/**
 * Read a CSV file and collect every record the parser yields into an array.
 *
 * The file is streamed through `csvParse.parse` configured with `columns: true`,
 * a comma delimiter, value trimming and empty-line skipping.
 *
 * @param {string} inputFile - Path of the CSV file to read.
 * @returns {Promise<Object[]>} The parsed records, in file order.
 */
async function readCsv(inputFile) {
  const parseOptions = {
    columns: true,
    delimiter: ",",
    trim: true,
    skip_empty_lines: true,
  };
  const fileStream = fs.createReadStream(inputFile);
  const recordStream = fileStream.pipe(csvParse.parse(parseOptions));

  const rows = [];
  for await (const row of recordStream) {
    rows.push(row);
  }
  return rows;
}
/**
 * Wrap a target URL in a ScrapeOps proxy request URL.
 *
 * @param {string} url - The page we actually want to fetch.
 * @param {string} [location="us"] - Sent as the `country` query parameter.
 * @returns {string} The proxy endpoint with the encoded query string appended.
 */
function getScrapeOpsUrl(url, location="us") {
  const endpoint = "https://proxy.scrapeops.io/v1/";
  const query = new URLSearchParams([
    ["api_key", API_KEY],
    ["url", url],
    ["country", location],
    // Milliseconds the proxy waits for the page to render before responding.
    ["wait", 2000],
  ]);
  return `${endpoint}?${query}`;
}
/**
 * Crawl the Pinterest search page for `keyword` through the ScrapeOps proxy and
 * append every pin found to `<keyword>.csv`.
 *
 * Each attempt opens a fresh page; a failed attempt is logged and retried until
 * `retries` extra attempts have been used. Rows are collected first and written once
 * per attempt so a failure halfway through a page does not leave duplicate rows behind.
 *
 * @param {Object} browser - Puppeteer browser used to open pages.
 * @param {string} keyword - Search term; spaces are sent as "+".
 * @param {*} pageNumber - Not used by this function; it only occupies the third
 *   positional slot.
 * @param {string} [location="us"] - Country passed to getScrapeOpsUrl().
 * @param {number} [retries=3] - Extra attempts allowed after the first failure.
 * @returns {Promise<void>}
 */
async function scrapeSearchResults(browser, keyword, pageNumber, location="us", retries=3) {
  // replaceAll so every space becomes "+"; replace(" ", "+") only converted the first one.
  const formattedKeyword = keyword.replaceAll(" ", "+");
  const url = `https://www.pinterest.com/search/pins/?q=${formattedKeyword}&rs=typed`;
  // Deliberately unchanged: main() derives the CSV name with the same expression.
  const outputFile = `${keyword.replace(" ", "-")}.csv`;

  let tries = 0;
  let success = false;
  while (tries <= retries && !success) {
    const page = await browser.newPage();
    try {
      // Inside the try so the page is still closed if this call fails.
      await page.setJavaScriptEnabled(false);
      const proxyUrl = getScrapeOpsUrl(url, location);
      await page.goto(proxyUrl);
      console.log(`Successfully fetched: ${url}`);

      const divCards = await page.$$("div[data-grid-item='true']");
      const searchResults = [];
      for (const divCard of divCards) {
        const aElement = await divCard.$("a");
        const imgElement = await divCard.$("img");
        // Skip grid items that are not regular pins instead of failing the whole page.
        if (!aElement || !imgElement) {
          continue;
        }
        const name = await page.evaluate(element => element.getAttribute("aria-label"), aElement);
        const href = await page.evaluate(element => element.getAttribute("href"), aElement);
        if (!href) {
          continue;
        }
        const imgLink = await page.evaluate(element => element.getAttribute("src"), imgElement);
        searchResults.push({
          name: name,
          url: `https://www.pinterest.com${href.replace("https://proxy.scrapeops.io", "")}`,
          image: imgLink
        });
      }

      // writeToCsv() rejects empty input, so only call it when something was found.
      if (searchResults.length > 0) {
        await writeToCsv(searchResults, outputFile);
      }
      success = true;
    } catch (err) {
      console.log(`Error: ${err}, tries left ${retries - tries}`);
      tries++;
    } finally {
      await page.close();
    }
  }
}
/**
 * Launch a browser, run the search crawl for one keyword, and shut the browser down.
 *
 * @param {string} keyword - Search term to crawl.
 * @param {string} location - Country forwarded to scrapeSearchResults().
 * @param {number} concurrencyLimit - Accepted for signature compatibility; not used here.
 * @param {number} retries - Retry budget forwarded to scrapeSearchResults(); when
 *   undefined, that function's own default applies.
 * @returns {Promise<void>}
 */
async function startScrape(keyword, location, concurrencyLimit, retries) {
  const browser = await puppeteer.launch();
  try {
    // scrapeSearchResults() is declared as (browser, keyword, pageNumber, location, retries).
    // Fill the unused pageNumber slot explicitly so location and retries land in
    // their own parameters instead of shifting one position to the left.
    await scrapeSearchResults(browser, keyword, 0, location, retries);
  } finally {
    // Always release the browser process, even when the crawl throws.
    await browser.close();
  }
}
/**
 * Scrape one pin page and log the extracted account details.
 *
 * Each attempt opens a fresh page. On failure a best-effort screenshot is saved to
 * ERROR.png, the error is logged, and the pin is retried until `retries` extra
 * attempts have been used.
 *
 * @param {Object} browser - Puppeteer browser used to open pages.
 * @param {Object} row - Crawler CSV row; `url` and `image` are read.
 * @param {string} location - Not used at this stage of the tutorial.
 * @param {number} [retries=3] - Extra attempts allowed after the first failure.
 * @returns {Promise<void>}
 */
async function processPin(browser, row, location, retries = 3) {
  const url = row.url;
  let tries = 0;
  let success = false;

  while (tries <= retries && !success) {
    const page = await browser.newPage();
    try {
      await page.goto(url, { timeout: 60000 });

      const mainCard = await page.$("div[data-test-id='CloseupDetails']");
      // Fail with a clear message instead of a TypeError on mainCard.$ below.
      if (!mainCard) {
        throw new Error("Failed to load the page!");
      }

      // The source website is optional; keep the placeholder when it is absent.
      let website = "n/a";
      const websiteHolder = await page.$("span[style='text-decoration: underline;']");
      if (websiteHolder) {
        website = await page.evaluate(element => element.textContent, websiteHolder);
      }

      // One matching element per full star.
      const starDivs = await page.$$("div[data-test-id='rating-star-full']");
      const stars = starDivs.length;

      const profileInfoDiv = await mainCard.$("div[data-test-id='follower-count']");
      if (profileInfoDiv === null) {
        throw new Error("Page failed to loaded, most likely blocked!");
      }
      const profileText = await page.evaluate(element => element.textContent, profileInfoDiv);

      const accountNameDiv = await profileInfoDiv.$("div[data-test-id='creator-profile-name']");
      const nestedDiv = accountNameDiv ? await accountNameDiv.$("div") : null;
      if (!nestedDiv) {
        throw new Error("Failed to find the account name!");
      }
      const accountName = await page.evaluate(element => element.getAttribute("title"), nestedDiv);

      // The profile text holds the account name followed by "<count> followers".
      const followerCount = profileText.replace(accountName, "").replace(" followers", "");

      const pinData = {
        name: accountName,
        website: website,
        stars: stars,
        follower_count: followerCount,
        image: row.image
      };
      console.log(pinData);
      success = true;
    } catch (err) {
      // The screenshot is only a debugging aid; if it fails too, it must not
      // escape the handler and cancel the remaining retries.
      try {
        await page.screenshot({path: "ERROR.png"});
      } catch (screenshotErr) {
        console.log(`Could not save error screenshot: ${screenshotErr}`);
      }
      console.log(`Error: ${err}, tries left: ${retries-tries}, url: ${url}`);
      tries++;
    } finally {
      await page.close();
    }
  }
}
/**
 * Read the crawler's CSV and scrape every pin in it, one after another.
 *
 * @param {string} csvFile - CSV file produced by the crawler.
 * @param {string} location - Country forwarded to processPin().
 * @param {number} concurrencyLimit - Not used yet; pins are processed sequentially.
 * @param {number} retries - Retry budget forwarded to processPin().
 * @returns {Promise<void>}
 */
async function processResults(csvFile, location, concurrencyLimit, retries) {
  const pins = await readCsv(csvFile);
  const browser = await puppeteer.launch();
  try {
    for (const pin of pins) {
      // processPin() takes (browser, row, location, retries). Passing location twice
      // put a string in the retries slot, which made its retry loop never run.
      await processPin(browser, pin, location, retries);
    }
  } finally {
    // Always release the browser process, even when a pin throws.
    await browser.close();
  }
}
/**
 * Run the full pipeline: crawl search results for every keyword, then scrape the
 * pins listed in each CSV the crawl produced.
 */
async function main() {
  // Settings meant to be tweaked by the reader.
  const keywords = ["grilling"];
  const concurrencyLimit = 4;
  const location = "uk";
  const retries = 3;

  // Name of the CSV the crawler writes for a keyword.
  const csvNameFor = (keyword) => `${keyword.replace(" ", "-")}.csv`;

  // Stage 1: crawl, one keyword at a time.
  const aggregateFiles = [];
  for (let i = 0; i < keywords.length; i += 1) {
    const keyword = keywords[i];
    console.log("Crawl starting");
    await startScrape(keyword, location, retries);
    console.log("Crawl complete");
    aggregateFiles.push(csvNameFor(keyword));
  }

  // Stage 2: scrape the pins recorded in each crawl file.
  console.log("Starting scrape");
  for (const csvFile of aggregateFiles) {
    await processResults(csvFile, location, concurrencyLimit, retries);
  }
  console.log("Scrape complete");
}
main();
In the next section, we'll do a little more than just print the JSON object to the console.
Step 3: Storing the Scraped Data
We already have our writeToCsv()
function from earlier, we just need to put it in the right place. Instead of logging each pin item to the console, we're going to do this.
await writeToCsv([pinData], `${row.name.replace(" ", "-")}.csv`);
Even though we only changed one line, here is the full code if you need it.
const puppeteer = require("puppeteer");
const createCsvWriter = require("csv-writer").createObjectCsvWriter;
const csvParse = require("csv-parse");
const fs = require("fs");
const API_KEY = JSON.parse(fs.readFileSync("config.json")).api_key;
/**
 * Write an array of flat objects to a CSV file, appending when the file already exists.
 *
 * Column ids and titles are taken from the keys of the first record.
 *
 * @param {Object[]} data - Records to write; must contain at least one element.
 * @param {string} outputFile - Path of the CSV file to create or append to.
 * @throws {Error} "No data to write!" when `data` is missing or empty.
 * @throws {Error} "Failed to write to csv" when the writer rejects; the underlying
 *   error is attached as `cause`.
 */
async function writeToCsv(data, outputFile) {
  if (!data || data.length === 0) {
    throw new Error("No data to write!");
  }
  // Decide between "create" and "append" before the writer touches the file.
  const fileExists = fs.existsSync(outputFile);
  const headers = Object.keys(data[0]).map((key) => ({ id: key, title: key }));
  const csvWriter = createCsvWriter({
    path: outputFile,
    header: headers,
    append: fileExists,
  });
  try {
    await csvWriter.writeRecords(data);
  } catch (e) {
    // Keep the original failure reachable for debugging instead of discarding it.
    throw new Error("Failed to write to csv", { cause: e });
  }
}
/**
 * Read a CSV file and collect every record the parser yields into an array.
 *
 * The file is streamed through `csvParse.parse` configured with `columns: true`,
 * a comma delimiter, value trimming and empty-line skipping.
 *
 * @param {string} inputFile - Path of the CSV file to read.
 * @returns {Promise<Object[]>} The parsed records, in file order.
 */
async function readCsv(inputFile) {
  const parseOptions = {
    columns: true,
    delimiter: ",",
    trim: true,
    skip_empty_lines: true,
  };
  const fileStream = fs.createReadStream(inputFile);
  const recordStream = fileStream.pipe(csvParse.parse(parseOptions));

  const rows = [];
  for await (const row of recordStream) {
    rows.push(row);
  }
  return rows;
}
/**
 * Build the ScrapeOps proxy URL that wraps a target URL.
 *
 * @param {string} url - Target page address.
 * @param {string} [location="us"] - Value sent as the `country` parameter.
 * @returns {string} Proxy URL with api_key, url, country and wait parameters.
 */
function getScrapeOpsUrl(url, location="us") {
  const query = new URLSearchParams();
  query.append("api_key", API_KEY);
  query.append("url", url);
  query.append("country", location);
  query.append("wait", 2000);
  return `https://proxy.scrapeops.io/v1/?${query.toString()}`;
}
/**
 * Crawl a Pinterest search results page and append every pin card found to
 * the keyword's CSV file.
 *
 * Each attempt opens a fresh page with JavaScript disabled and loads the
 * search URL through getScrapeOpsUrl(). Rows are collected first and written
 * in one batch once the whole page parsed, so a retry after a mid-page
 * failure cannot append the same rows twice.
 *
 * @param {object} browser - Puppeteer browser used to open pages.
 * @param {string} keyword - Search term.
 * @param {number} pageNumber - Unused; kept so existing callers keep working.
 * @param {string} [location="us"] - Country code passed to getScrapeOpsUrl().
 * @param {number} [retries=3] - Extra attempts allowed after the first one.
 * @returns {Promise<void>}
 */
async function scrapeSearchResults(browser, keyword, pageNumber, location="us", retries=3) {
  // String.replace with a string pattern only touches the first match, so a
  // global regex is needed to turn every space of the query into "+".
  const formattedKeyword = keyword.replace(/ /g, "+");
  const url = `https://www.pinterest.com/search/pins/?q=${formattedKeyword}&rs=typed`;
  // File name deliberately unchanged: main() derives the same name to find
  // this file afterwards.
  const outputFile = `${keyword.replace(" ", "-")}.csv`;

  let tries = 0;
  let success = false;
  while (tries <= retries && !success) {
    const page = await browser.newPage();
    try {
      // Inside the try so the page is still closed if this call fails.
      await page.setJavaScriptEnabled(false);
      const proxyUrl = getScrapeOpsUrl(url, location);
      await page.goto(proxyUrl);
      console.log(`Successfully fetched: ${url}`);

      const divCards = await page.$$("div[data-grid-item='true']");
      const results = [];
      for (const divCard of divCards) {
        const aElement = await divCard.$("a");
        const name = await page.evaluate(element => element.getAttribute("aria-label"), aElement);
        const href = await page.evaluate(element => element.getAttribute("href"), aElement);
        const imgElement = await divCard.$("img");
        const imgLink = await page.evaluate(element => element.getAttribute("src"), imgElement);
        results.push({
          name: name,
          // Strip the proxy origin in case it was prefixed to the link.
          url: `https://www.pinterest.com${href.replace("https://proxy.scrapeops.io", "")}`,
          image: imgLink
        });
      }
      // writeToCsv rejects empty input, so only write when cards were found.
      if (results.length > 0) {
        await writeToCsv(results, outputFile);
      }
      success = true;
    } catch (err) {
      console.log(`Error: ${err}, tries left ${retries - tries}`);
      tries++;
    } finally {
      await page.close();
    }
  }
}
/**
 * Launch a browser, crawl the search results for one keyword, and close the
 * browser again.
 *
 * @param {string} keyword - Search term to crawl.
 * @param {string} location - Country code forwarded to scrapeSearchResults().
 * @param {number} concurrencyLimit - Unused here; kept for the signature.
 * @param {number} retries - Retry budget forwarded to scrapeSearchResults().
 * @returns {Promise<void>}
 */
async function startScrape(keyword, location, concurrencyLimit, retries) {
  const browser = await puppeteer.launch();
  try {
    // scrapeSearchResults takes (browser, keyword, pageNumber, location,
    // retries). Without the pageNumber argument, location and retries each
    // shifted one slot to the left.
    await scrapeSearchResults(browser, keyword, 0, location, retries);
  } finally {
    // Always release the browser, even when the crawl throws.
    await browser.close();
  }
}
/**
 * Scrape a single pin page and append its details to a CSV named after the pin.
 *
 * @param {object} browser - Puppeteer browser used to open a page per attempt.
 * @param {object} row - Crawl record; `url`, `name` and `image` are read.
 * @param {string} location - Accepted for the signature; this version loads
 *   the pin URL directly and does not use it.
 * @param {number} [retries=3] - Extra attempts allowed after the first one.
 * @returns {Promise<void>}
 */
async function processPin(browser, row, location, retries = 3) {
  const url = row.url;
  let tries = 0;
  let success = false;
  while (tries <= retries && !success) {
    const page = await browser.newPage();
    try {
      await page.goto(url, { timeout: 60000 });

      const mainCard = await page.$("div[data-test-id='CloseupDetails']");
      if (mainCard === null) {
        // Fail with the same clear message instead of a TypeError on null.
        throw new Error("Page failed to load, most likely blocked!");
      }

      let website = "n/a";
      const websiteHolder = await page.$("span[style='text-decoration: underline;']");
      if (websiteHolder) {
        website = await page.evaluate(element => element.textContent, websiteHolder);
      }

      const starDivs = await page.$$("div[data-test-id='rating-star-full']");
      const stars = starDivs.length;

      const profileInfoDiv = await mainCard.$("div[data-test-id='follower-count']");
      if (profileInfoDiv === null) {
        throw new Error("Page failed to load, most likely blocked!");
      }
      const profileText = await page.evaluate(element => element.textContent, profileInfoDiv);
      const accountNameDiv = await profileInfoDiv.$("div[data-test-id='creator-profile-name']");
      const nestedDiv = await accountNameDiv.$("div");
      const accountName = await page.evaluate(element => element.getAttribute("title"), nestedDiv);
      // The profile text holds the account name followed by the follower
      // count; strip the name and the " followers" suffix to keep the count.
      const followerCount = profileText.replace(accountName, "").replace(" followers", "");

      const pinData = {
        name: accountName,
        website: website,
        stars: stars,
        follower_count: followerCount,
        image: row.image
      };
      // Replace every space (not just the first) and any path separator, so
      // a pin title can never point into another directory.
      const outputFile = `${row.name.replace(/[ \\/]/g, "-")}.csv`;
      await writeToCsv([pinData], outputFile);
      success = true;
    } catch (err) {
      try {
        await page.screenshot({path: "ERROR.png"});
      } catch (screenshotErr) {
        // A failed screenshot must not abort the remaining retries.
        console.log(`Could not capture ERROR.png: ${screenshotErr}`);
      }
      console.log(`Error: ${err}, tries left: ${retries-tries}, url: ${url}`);
      tries++;
    } finally {
      await page.close();
    }
  }
}
/**
 * Scrape every pin listed in a crawl CSV, one pin at a time.
 *
 * @param {string} csvFile - Path of the CSV written by the crawl step.
 * @param {string} location - Country code forwarded to processPin().
 * @param {number} concurrencyLimit - Accepted for signature stability; this
 *   sequential version does not use it.
 * @param {number} retries - Retry budget forwarded to processPin().
 * @returns {Promise<void>}
 */
async function processResults(csvFile, location, concurrencyLimit, retries) {
  const pins = await readCsv(csvFile);
  const browser = await puppeteer.launch();
  try {
    for (const pin of pins) {
      // processPin takes (browser, row, location, retries). Passing
      // `location` twice put the country code in the retries slot; its loop
      // condition `tries <= retries` is false for a string like "uk", so no
      // pin was ever scraped.
      await processPin(browser, pin, location, retries);
    }
  } finally {
    // Close the browser even if a pin throws, so no Chromium process leaks.
    await browser.close();
  }
}
/**
 * Entry point: crawl Pinterest search results for each keyword, then scrape
 * every pin recorded in the CSV files the crawl produced.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const keywords = ["grilling"];
  const concurrencyLimit = 4;
  const location = "uk";
  const retries = 3;
  const aggregateFiles = [];
  for (const keyword of keywords) {
    console.log("Crawl starting");
    // NOTE(review): startScrape is declared as
    // (keyword, location, concurrencyLimit, retries), so `retries` lands in
    // the concurrencyLimit slot here. Fix the call together with startScrape.
    await startScrape(keyword, location, retries);
    console.log("Crawl complete");
    // String.replace with a string pattern only replaces the first space.
    // Kept as-is because scrapeSearchResults() names its file the same way.
    aggregateFiles.push(`${keyword.replace(" ", "-")}.csv`);
  }
  console.log("Starting scrape");
  for (const file of aggregateFiles) {
    await processResults(file, location, concurrencyLimit, retries);
  }
  console.log("Scrape complete");
}

// main() returns a promise; handle its rejection explicitly instead of
// leaving it floating, and signal the failure through the exit code.
main().catch((err) => {
  console.error(`Scrape failed: ${err}`);
  process.exitCode = 1;
});
One line changed and we now have proper storage!!!
Step 4: Adding Concurrency
Our scraper is getting really close to production ready, but first, we need to add both concurrency and proxy support. In this section, we'll make some small changes that create a big impact.
We're going to refactor our processResults()
function to look like this.
/**
 * Scrape every pin listed in a crawl CSV, in batches of at most
 * `concurrencyLimit` pins processed concurrently.
 *
 * @param {string} csvFile - Path of the CSV written by the crawl step.
 * @param {string} location - Country code forwarded to processPin().
 * @param {number} concurrencyLimit - Maximum pins per batch; values below 1
 *   (or non-numeric values) fall back to 1.
 * @param {number} retries - Retry budget forwarded to processPin().
 * @returns {Promise<void>}
 */
async function processResults(csvFile, location, concurrencyLimit, retries) {
  const pins = await readCsv(csvFile);
  // splice(0, n) removes nothing when n is 0, negative, undefined or NaN,
  // which would make the loop below spin forever. Clamp to at least 1.
  const batchSize = Math.max(1, Math.floor(Number(concurrencyLimit)) || 1);
  const browser = await puppeteer.launch();
  try {
    while (pins.length > 0) {
      const currentBatch = pins.splice(0, batchSize);
      const tasks = currentBatch.map(pin => processPin(browser, pin, location, retries));
      // allSettled waits for every task. Promise.all would reject on the
      // first failure and let the next batch start (or the browser close)
      // while the rest of this batch is still running.
      const outcomes = await Promise.allSettled(tasks);
      for (const outcome of outcomes) {
        if (outcome.status === "rejected") {
          console.log(`Failed to process batch: ${outcome.reason}`);
        }
      }
    }
  } finally {
    await browser.close();
  }
}
- While `pins` is longer than 0, we splice from index 0 up to our `concurrencyLimit`. This shortens the array (therefore reducing its size in memory) and also runs `processPin()` on each row we spliced from the array.
- After `await Promise.all(tasks)` resolves, we repeat this process, constantly shrinking the array and improving performance as time goes on.
Step 5: Bypassing Anti-Bots
There is one final change we need to make inside of our processPin()
function. We need to replace page.goto(url)
with a call that goes through the proxy (shown below, after the updated proxy function).
For extra redundancy, in getScrapeOpsUrl()
, we'll be setting residential
to true. Adding the residential
argument reduces the likelihood that Pinterest will block the proxy.
During extensive testing, the Pinterest server was able to detect and block the scraper a good portion of the time when not using residential
.
Here is our updated proxy function.
/**
 * Build the ScrapeOps proxy URL that wraps a target URL, requesting a
 * residential proxy.
 *
 * @param {string} url - Target page address.
 * @param {string} [location="us"] - Value sent as the `country` parameter.
 * @returns {string} Proxy URL with api_key, url, country, wait and
 *   residential parameters.
 */
function getScrapeOpsUrl(url, location="us") {
  const query = new URLSearchParams();
  query.append("api_key", API_KEY);
  query.append("url", url);
  query.append("country", location);
  query.append("wait", 3000);
  query.append("residential", true);
  return `https://proxy.scrapeops.io/v1/?${query.toString()}`;
}
And here is the single line we change in the parser.
await page.goto(getScrapeOpsUrl(url, location), { timeout: 60000 });
Make sure to use a long timeout (we used 60 seconds). Sometimes it takes a minute to get a response back from the server.
Here is our production ready code.
const puppeteer = require("puppeteer");
const createCsvWriter = require("csv-writer").createObjectCsvWriter;
const csvParse = require("csv-parse");
const fs = require("fs");
// ScrapeOps API key, read synchronously at load time from the "api_key"
// field of config.json in the current working directory.
const API_KEY = JSON.parse(fs.readFileSync("config.json")).api_key;
/**
 * Write records to a CSV file, appending when the file already exists.
 *
 * Column headers are taken from the keys of the first record.
 *
 * @param {object[]} data - Records to write; must contain at least one item.
 * @param {string} outputFile - Destination CSV path.
 * @returns {Promise<void>}
 * @throws {Error} "No data to write!" when `data` is missing or empty.
 * @throws {Error} "Failed to write to csv" when the writer fails; the
 *   underlying error is available as `error.cause`.
 */
async function writeToCsv(data, outputFile) {
  if (!data || data.length === 0) {
    throw new Error("No data to write!");
  }
  const fileExists = fs.existsSync(outputFile);
  const headers = Object.keys(data[0]).map((key) => ({ id: key, title: key }));
  const csvWriter = createCsvWriter({
    path: outputFile,
    header: headers,
    // Append only when the file is already there.
    append: fileExists,
  });
  try {
    await csvWriter.writeRecords(data);
  } catch (e) {
    // Keep the original failure attached so callers can see why it failed.
    throw new Error("Failed to write to csv", { cause: e });
  }
}
/**
 * Load a CSV file into memory as an array of records.
 *
 * Parsing uses `columns: true`, a comma delimiter, value trimming, and
 * skips empty lines.
 *
 * @param {string} inputFile - Path of the CSV file to read.
 * @returns {Promise<object[]>} All parsed records, in file order.
 */
async function readCsv(inputFile) {
  const csvOptions = {
    columns: true,
    delimiter: ",",
    trim: true,
    skip_empty_lines: true
  };
  const source = fs.createReadStream(inputFile);
  const parsed = source.pipe(csvParse.parse(csvOptions));
  const records = [];
  for await (const entry of parsed) {
    records.push(entry);
  }
  return records;
}
/**
 * Build the ScrapeOps proxy URL that wraps a target URL, requesting a
 * residential proxy.
 *
 * @param {string} url - Target page address.
 * @param {string} [location="us"] - Value sent as the `country` parameter.
 * @returns {string} Proxy URL with api_key, url, country, residential and
 *   wait parameters.
 */
function getScrapeOpsUrl(url, location="us") {
  const query = new URLSearchParams();
  query.append("api_key", API_KEY);
  query.append("url", url);
  query.append("country", location);
  query.append("residential", true);
  query.append("wait", 3000);
  return `https://proxy.scrapeops.io/v1/?${query.toString()}`;
}
/**
 * Crawl a Pinterest search results page and append every pin card found to
 * the keyword's CSV file.
 *
 * Each attempt opens a fresh page with JavaScript disabled and loads the
 * search URL through getScrapeOpsUrl(). Rows are collected first and written
 * in one batch once the whole page parsed, so a retry after a mid-page
 * failure cannot append the same rows twice.
 *
 * @param {object} browser - Puppeteer browser used to open pages.
 * @param {string} keyword - Search term.
 * @param {number} pageNumber - Unused; kept so existing callers keep working.
 * @param {string} [location="us"] - Country code passed to getScrapeOpsUrl().
 * @param {number} [retries=3] - Extra attempts allowed after the first one.
 * @returns {Promise<void>}
 */
async function scrapeSearchResults(browser, keyword, pageNumber, location="us", retries=3) {
  // String.replace with a string pattern only touches the first match, so a
  // global regex is needed to turn every space of the query into "+".
  const formattedKeyword = keyword.replace(/ /g, "+");
  const url = `https://www.pinterest.com/search/pins/?q=${formattedKeyword}&rs=typed`;
  // File name deliberately unchanged: main() derives the same name to find
  // this file afterwards.
  const outputFile = `${keyword.replace(" ", "-")}.csv`;

  let tries = 0;
  let success = false;
  while (tries <= retries && !success) {
    const page = await browser.newPage();
    try {
      // Inside the try so the page is still closed if this call fails.
      await page.setJavaScriptEnabled(false);
      const proxyUrl = getScrapeOpsUrl(url, location);
      await page.goto(proxyUrl);
      console.log(`Successfully fetched: ${url}`);

      const divCards = await page.$$("div[data-grid-item='true']");
      const results = [];
      for (const divCard of divCards) {
        const aElement = await divCard.$("a");
        const name = await page.evaluate(element => element.getAttribute("aria-label"), aElement);
        const href = await page.evaluate(element => element.getAttribute("href"), aElement);
        const imgElement = await divCard.$("img");
        const imgLink = await page.evaluate(element => element.getAttribute("src"), imgElement);
        results.push({
          name: name,
          // Strip the proxy origin in case it was prefixed to the link.
          url: `https://www.pinterest.com${href.replace("https://proxy.scrapeops.io", "")}`,
          image: imgLink
        });
      }
      // writeToCsv rejects empty input, so only write when cards were found.
      if (results.length > 0) {
        await writeToCsv(results, outputFile);
      }
      success = true;
    } catch (err) {
      console.log(`Error: ${err}, tries left ${retries - tries}`);
      tries++;
    } finally {
      await page.close();
    }
  }
}
/**
 * Launch a browser, crawl the search results for one keyword, and close the
 * browser again.
 *
 * @param {string} keyword - Search term to crawl.
 * @param {string} location - Country code forwarded to scrapeSearchResults().
 * @param {number} concurrencyLimit - Unused here; kept for the signature.
 * @param {number} retries - Retry budget forwarded to scrapeSearchResults().
 * @returns {Promise<void>}
 */
async function startScrape(keyword, location, concurrencyLimit, retries) {
  const browser = await puppeteer.launch();
  try {
    // scrapeSearchResults takes (browser, keyword, pageNumber, location,
    // retries). Without the pageNumber argument, location and retries each
    // shifted one slot to the left.
    await scrapeSearchResults(browser, keyword, 0, location, retries);
  } finally {
    // Always release the browser, even when the crawl throws.
    await browser.close();
  }
}
/**
 * Scrape a single pin page through the ScrapeOps proxy and append its
 * details to a CSV named after the pin.
 *
 * @param {object} browser - Puppeteer browser used to open a page per attempt.
 * @param {object} row - Crawl record; `url`, `name` and `image` are read.
 * @param {string} location - Country code passed to getScrapeOpsUrl().
 * @param {number} [retries=3] - Extra attempts allowed after the first one.
 * @returns {Promise<void>}
 */
async function processPin(browser, row, location, retries = 3) {
  const url = row.url;
  let tries = 0;
  let success = false;
  while (tries <= retries && !success) {
    const page = await browser.newPage();
    try {
      await page.goto(getScrapeOpsUrl(url, location), { timeout: 60000 });

      const mainCard = await page.$("div[data-test-id='CloseupDetails']");
      if (mainCard === null) {
        // Fail with a clear message instead of a TypeError on null.
        throw new Error("Page failed to load, most likely blocked!");
      }

      let website = "n/a";
      const websiteHolder = await page.$("span[style='text-decoration: underline;']");
      if (websiteHolder) {
        website = await page.evaluate(element => element.textContent, websiteHolder);
      }

      const starDivs = await page.$$("div[data-test-id='rating-star-full']");
      const stars = starDivs.length;

      const profileInfoDiv = await mainCard.$("div[data-test-id='follower-count']");
      if (profileInfoDiv === null) {
        throw new Error("Page failed to load, most likely blocked!");
      }
      const profileText = await page.evaluate(element => element.textContent, profileInfoDiv);
      const accountNameDiv = await profileInfoDiv.$("div[data-test-id='creator-profile-name']");
      const nestedDiv = await accountNameDiv.$("div");
      const accountName = await page.evaluate(element => element.getAttribute("title"), nestedDiv);
      // The profile text holds the account name followed by the follower
      // count; strip the name and the " followers" suffix to keep the count.
      const followerCount = profileText.replace(accountName, "").replace(" followers", "");

      const pinData = {
        name: accountName,
        website: website,
        stars: stars,
        follower_count: followerCount,
        image: row.image
      };
      // Replace every space (not just the first) and any path separator, so
      // a pin title can never point into another directory.
      const outputFile = `${row.name.replace(/[ \\/]/g, "-")}.csv`;
      await writeToCsv([pinData], outputFile);
      success = true;
    } catch (err) {
      try {
        await page.screenshot({path: "ERROR.png"});
      } catch (screenshotErr) {
        // A failed screenshot must not abort the remaining retries.
        console.log(`Could not capture ERROR.png: ${screenshotErr}`);
      }
      console.log(`Error: ${err}, tries left: ${retries-tries}, url: ${url}`);
      tries++;
    } finally {
      await page.close();
    }
  }
}
/**
 * Scrape every pin listed in a crawl CSV, in batches of at most
 * `concurrencyLimit` pins processed concurrently.
 *
 * @param {string} csvFile - Path of the CSV written by the crawl step.
 * @param {string} location - Country code forwarded to processPin().
 * @param {number} concurrencyLimit - Maximum pins per batch; values below 1
 *   (or non-numeric values) fall back to 1.
 * @param {number} retries - Retry budget forwarded to processPin().
 * @returns {Promise<void>}
 */
async function processResults(csvFile, location, concurrencyLimit, retries) {
  const pins = await readCsv(csvFile);
  // splice(0, n) removes nothing when n is 0, negative, undefined or NaN,
  // which would make the loop below spin forever. Clamp to at least 1.
  const batchSize = Math.max(1, Math.floor(Number(concurrencyLimit)) || 1);
  const browser = await puppeteer.launch();
  try {
    while (pins.length > 0) {
      const currentBatch = pins.splice(0, batchSize);
      const tasks = currentBatch.map(pin => processPin(browser, pin, location, retries));
      // allSettled waits for every task. Promise.all would reject on the
      // first failure and let the next batch start (or the browser close)
      // while the rest of this batch is still running.
      const outcomes = await Promise.allSettled(tasks);
      for (const outcome of outcomes) {
        if (outcome.status === "rejected") {
          console.log(`Failed to process batch: ${outcome.reason}`);
        }
      }
    }
  } finally {
    await browser.close();
  }
}
/**
 * Entry point: crawl Pinterest search results for each keyword, then scrape
 * every pin recorded in the CSV files the crawl produced.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const keywords = ["grilling"];
  const concurrencyLimit = 4;
  const location = "uk";
  const retries = 3;
  const aggregateFiles = [];
  for (const keyword of keywords) {
    console.log("Crawl starting");
    // NOTE(review): startScrape is declared as
    // (keyword, location, concurrencyLimit, retries), so `retries` lands in
    // the concurrencyLimit slot here. Fix the call together with startScrape.
    await startScrape(keyword, location, retries);
    console.log("Crawl complete");
    // String.replace with a string pattern only replaces the first space.
    // Kept as-is because scrapeSearchResults() names its file the same way.
    aggregateFiles.push(`${keyword.replace(" ", "-")}.csv`);
  }
  console.log("Starting scrape");
  for (const file of aggregateFiles) {
    await processResults(file, location, concurrencyLimit, retries);
  }
  console.log("Scrape complete");
}

// main() returns a promise; handle its rejection explicitly instead of
// leaving it floating, and signal the failure through the exit code.
main().catch((err) => {
  console.error(`Scrape failed: ${err}`);
  process.exitCode = 1;
});
Step 6: Production Run
Now that we've put everything together in working order, it's time to run it all in production. Here is the main
function that we'll be running.
/**
 * Production run: crawl each search term, remember the CSV each crawl
 * writes, then scrape the pins listed in every one of those files.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const searchTerms = ["grilling"];
  const maxConcurrent = 4;
  const country = "us";
  const retryBudget = 3;

  // Crawl phase: one CSV per search term.
  const crawlOutputs = [];
  for (const term of searchTerms) {
    console.log("Crawl starting");
    await startScrape(term, country, retryBudget);
    console.log("Crawl complete");
    const csvName = term.replace(" ", "-");
    crawlOutputs.push(`${csvName}.csv`);
  }

  // Scrape phase: process every crawl output in order.
  console.log("Starting scrape");
  for (const csvPath of crawlOutputs) {
    await processResults(csvPath, country, maxConcurrent, retryBudget);
  }
  console.log("Scrape complete");
}
As always, feel free to change any of the constants above to tweak your own results.
Here was our result. There was one page that simply wouldn't load.
And here is the page that didn't load.
All in all, we scraped 8 pages in 74.428 seconds. While this brings our average result to 9.30 seconds per page, about half the run was spent trying to reload the failed page above.
Legal and Ethical Considerations
When scraping any site, you always need to be mindful of their Terms of Service
and robots.txt.
Pinterest's terms are available here.
If you violate these terms, you can even lose your Pinterest account! Their robots.txt
is available here.
Also, keep in mind whether the data you're scraping is public. Private data (data behind a login) can often be illegal to scrape. Generally, public data (data not behind a login) is public information and therefore fair game when scraping.
If you are unsure of the legality of your scraper, it is best to consult an attorney based in your jurisdiction.
Conclusion
You've finished our tutorial!!! You now know how to use Puppeteer and CSS selectors. You should also have a solid grasp of parsing, data storage, concurrency, and proxy integration. You might have even gotten a taste of the brand new ScrapeOps Residential Proxy.
If you'd like to learn more about the stack used in this article, take a look at the links below!
More Web Scraping Guides
If you're looking for more ways to level up your scraping, take a look at our Puppeteer Web Scraping Playbook or one of the links below: