Then check out ScrapeOps, the complete toolkit for web scraping.
Unencoded --> 'https://www.indeed.com/jobs?q=software engineer&l=San Francisco&start=0&filter=0'
Encoded --> 'https%3A%2F%2Fwww.indeed.com%2Fjobs%3Fq%3Dsoftware%20engineer%26l%3DSan%20Francisco%26start%3D0%26filter%3D0'
q
stands for the search query. In our case, q=software engineer
. Note: If you want to search for a keyword that contains spaces or special characters then remember you need to encode this value. (Encoded: q=software%20engineer
)l
stands for the location you want to search for jobs. In our case, we used l=California
.start
stands for the starting point for the pagination. We use the start
parameter to paginate through results. The job data is embedded in the page's <script id="mosaic-data" type="text/javascript">
tag, under window.mosaic.providerData["mosaic-provider-jobcards"]
.<script id="mosaic-data" type="text/javascript"> ... window.mosaic.providerData["mosaic-provider-jobcards"]={"metaData":{"mosaicProviderJobCardsModel":{"adSignature":"3573","appName":"jasx","applyHolisticStyle":true,"bot":false,"brandedAds":[],"csrfToken":"VfkW0LdLxAXrjIkxOeasTnBCW8vSv9TE","encryptedQueryData":"RnZhMybXSk4M3QtTVGXWoe9dbTL46KyFjV9_vwSAcQxuziQ2QCDK8B6B0pUnV6xlgzK1HVOkc0tMGyUpMO9yEdnbun4jJaS6CbMzioz2PqM","experienceLevelFilterRefineBy":"","fccId":-1,"hasResume":false,"indeedApplyOnlyFilterUsed":false,"ipCountry":"IE","isDesktop":true,"isHighContrastIconShown":true,"isIpadApp":false,"isJobCardShelfApplied":true,"isTablet":false,"jobSeenLogParameters":{},"linkTargetAttribute":"_blank","loggedIn":false,"mobtk":"1ge736cml2gra002","mosaicNonJobContent":[],"mustShowSponsoredLabel":false,"myIndeedEnabled":true,"myIndeedRegisterLink":"https://www.indeed.com/account/register?dest=%2Fjobs%3Fjson%3D1%26q%3Dpython%26vjk%3D532734731891698b%26l%3DTexas","noJsUrlOnly":false,"overrideShelf":true,"pageNumber":1,"prforceGroups":"","proctorContext":{"accountId":-1,"app":false,"country":"US","ctkAge":72611863,"ctkDate":"20220929","hasRez":false,"lang":"en","loggedIn":false,"mtkAge":72611863,"platform":"","privileged":false,"smartphone":false,"stealthGroups":[],"tablet":false,"uaData":"{\"android\":false,\"androidApp\":false,\"androidEmployerApp\":false,\"androidJobSearchApp\":false,\"app\":false,\"bot\":false,\"browser\":\"CHROME\",\"browserFamily\":\"CHROME\",\"browserReleaseVersion\":{\"matchPrecision\":\"BUILD\",\"version\":29554872554618880},\"browserVersion\":{\"majorVersion\":\"105\",\"minorVersion\":\"-1\",\"version\":\"105\"},\"chrome\":true,\"chromeForIOS\":false,\"currentJobseekerDeprecatedBrowser\":false,\"deviceType\":\"COMPUTER\",\"droidRezUploadDialog\":false,\"dumbPhone\":false,\"employerApp\":false,\"fileUploadCapable\":true,\"futureJobseekerDeprecatedBrowser\":false,\"geolocationCapable\":false,\"googleWebLight\":false,\"ios\":false,\"iosemployerApp\":fa
lse,\"iosjobSearchApp\":false,\"ipad\":false,\"ipadApp\":false,\"ipadJobSearchApp\":false,\"jobSearchApp\":false,\"mobileDevice\":false,\"operatingSystem\":\"WINDOWS\",\"os\":{\"family\":\"windows\",\"majorVersion\":-1,\"minorVersion\":-1,\"osFamily\":\"windows\",\"osVersion\":{\"matchPrecision\":\"BUILD\",\"version\":0},\"patchVersion\":-1,\"releaseVersion\":{\"matchPrecision\":\"BUILD\",\"version\":0},\"version\":\"\"},\"phone\":false,\"releaseVersion\":{\"matchPrecision\":\"BUILD\",\"version\":29554872554618880},\"safari\":false,\"safariForIOS\":false,\"smartPhone\":false,\"tablet\":false,\"uaVersion\":{\"matchPrecision\":\"BUILD\",\"version\":29554872554618880},\"userAgentDelegate\":{\"android\":false,\"bot\":false,\"browser\":\"CHROME\",\"browserName\":\"Chrome\",\"browserReleaseVersion\":{\"matchPrecision\":\"BUILD\",\"version\":29554872554618880},\"browserVersion\":{\"majorVersion\":\"105\",\"minorVersion\":\"-1\",\"version\":\"105\"},\"browserVersionString\":\"105\",\"chrome\":true,\"delegate\":{\"allFields\":{\"DeviceClass\":{\"confidence\":500,\"defaultValue\":\"Unknown\",\"isDefaultValue\":false,\"value\":\"Desktop\"},\"DeviceName\":{\"confidence\":400001,\"defaultValue\":\"Unknown\",\"isDefaultValue\":false,\"value\":\"Desktop\"},\"DeviceBrand\":{\"confidence\":0,\"defaultValue\":\"Unknown\",\"isDefaultValue\":true,\"value\":\"Unknown\"},\"OperatingSystemClass\":{\"confidence\":400001,\"defaultValue\":\"Unknown\",\"isDefaultValue\":false,\"value\":\"Desktop\"},\"OperatingSystemName\":{\"confidence\":400001,\"defaultValue\":\"Unknown\",\"isDefaultValue\":false,\"value\":\"Windows 
NT\"},\"OperatingSystemVersion\":{\"confidence\":400001,\"defaultValue\":\"??\",\"isDefaultValue\":true,\"value\":\"??\"},\"OperatingSystemVersionMajor\":{\"confidence\":400001,\"defaultValue\":\"??\",\"isDefaultValue\":true,\"value\":\"??\"},\"AgentClass\":{\"confidence\":2014,\"defaultValue\":\"Unknown\",\"isDefaultValue\":false,\"value\":\"Browser\"},\"AgentName\":{\"confidence\":2014,\"defaultValue\":\"Unknown\",\"isDefaultValue\":false,\"value\":\"Chrome\"},\"AgentVersion\":{\"confidence\":3000,\"defaultValue\":\"??\",\"isDefaultValue\":false,\"value\":\"105\"},\"AgentInformationEmail\":{\"confidence\":-1,\"defaultValue\":\"Unknown\",\"isDefaultValue\":true,\"value\":\"Unknown\"},\"AgentInformationUrl\":{\"confidence\":-1,\"defaultValue\":\"Unknown\",\"isDefaultValue\":true,\"value\":\"Unknown\"},\"WebviewAppName\":{\"confidence\":-1,\"defaultValue\":\"Unknown\",\"isDefaultValue\":true,\"value\":\"Unknown\"},\"WebviewAppVersion\":{\"confidence\":-1,\"defaultValue\":\"??\",\"isDefaultValue\":true,\"value\":\"??\"},\"__SyntaxError__\":{\"confidence\":-1,\"defaultValue\":\"false\",\"isDefaultValue\":true,\"value\":\"false\"}},\"ambiguityCount\":0,\"availableFieldNamesSorted\":[\"DeviceClass\",\"DeviceName\",\"DeviceBrand\",\"OperatingSystemClass\",\"OperatingSystemName\",\"OperatingSystemVersion\",\"OperatingSystemVersionMajor\",\"AgentClass\",\"AgentName\",\"AgentVersion\",\"AgentInformationEmail\",\"AgentInformationUrl\",\"WebviewAppName\",\"WebviewAppVersion\",\"__SyntaxError__\"],\"cleanedAvailableFieldNamesSorted\":[\"DeviceClass\",\"DeviceName\",\"DeviceBrand\",\"OperatingSystemClass\",\"OperatingSystemName\",\"OperatingSystemVersion\",\"OperatingSystemVersionMajor\",\"AgentClass\",\"AgentName\",\"AgentVersion\"],\"hasAmbiguity\":false,\"hasSyntaxError\":false,\"headers\":{\"User-Agent\":\"Mozilla\\u002F5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\\u002F537.36 (KHTML, like Gecko) Chrome\\u002F105.0.0.0 
Safari\\u002F537.36\"},\"userAgentString\":\"Mozilla\\u002F5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\\u002F537.36 (KHTML, like Gecko) Chrome\\u002F105.0.0.0 Safari\\u002F537.36\",\"userAgentStringField\":{\"confidence\":0,\"defaultValue\":\"Mozilla\\u002F5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\\u002F537.36 (KHTML, like Gecko) Chrome\\u002F105.0.0.0 Safari\\u002F537.36\",\"isDefaultValue\":false,\"value\":\"Mozilla\\u002F5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\\u002F537.36 (KHTML, like Gecko) Chrome\\u002F105.0.0.0 Safari\\u002F537.36\"}},\"deviceName\":\"Desktop\",\"deviceType\":\"COMPUTER\",\"deviceTypeString\":\"Desktop\",\"dumbPhone\":false,\"ios\":false,\"ipad\":false,\"mobileDevice\":false,\"operatingSystem\":\"WINDOWS\",\"operatingSystemFamily\":\"Windows NT\",\"operatingSystemVersion\":\"??\",\"os\":{\"family\":\"windows\",\"majorVersion\":-1,\"minorVersion\":-1,\"osFamily\":\"windows\",\"osVersion\":{\"matchPrecision\":\"BUILD\",\"version\":0},\"patchVersion\":-1,\"releaseVersion\":{\"matchPrecision\":\"BUILD\",\"version\":0},\"version\":\"\"},\"phone\":false,\"safari\":false,\"smartPhone\":false,\"tablet\":false,\"userAgentString\":\"Mozilla\\u002F5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\\u002F537.36 (KHTML, like Gecko) Chrome\\u002F105.0.0.0 Safari\\u002F537.36\",\"webviewName\":\"Unknown\",\"webviewVersion\":{\"matchPrecision\":\"BUILD\",\"version\":0},\"windowsPhone\":false},\"userAgentString\":\"Mozilla\\u002F5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\\u002F537.36 (KHTML, like Gecko) Chrome\\u002F105.0.0.0 Safari\\u002F537.36\",\"version\":{\"major\":105,\"minor\":-1,\"version\":\"105\"},\"windowsPhone\":false}"},"proctorIdentifiers":{"ACCOUNT":"-1","USER":"1ge4tueuklhdh800"},"queryModifierResult":{"originalQuery":"python","queryModifiers":[{"clickUrl":"http://www.indeed.com/jobs?q=python%2Bintern&l=Texas&from=querymodifiers&qm=1&oq=python","newQuery":"python 
intern"},{"clickUrl":"http://www.indeed.com/jobs?q=panda&l=Texas&from=querymodifiers&qm=1&oq=python","newQuery":"panda"},{"clickUrl":"http://www.indeed.com/jobs?q=bobcat&l=Texas&from=querymodifiers&qm=1&oq=python","newQuery":"bobcat"},{"clickUrl":"http://www.indeed.com/jobs?q=rhino&l=Texas&from=querymodifiers&qm=1&oq=python","newQuery":"rhino"},{"clickUrl":"http://www.indeed.com/jobs?q=reptile&l=Texas&from=querymodifiers&qm=1&oq=python","newQuery":"reptile"},{"clickUrl":"http://www.indeed.com/jobs?q=boba&l=Texas&from=querymodifiers&qm=1&oq=python","newQuery":"boba"},{"clickUrl":"http://www.indeed.com/jobs?q=drupal&l=Texas&from=querymodifiers&qm=1&oq=python","newQuery":"drupal"},{"clickUrl":"http://www.indeed.com/jobs?q=caterpillar&l=Texas&from=querymodifiers&qm=1&oq=python","newQuery":"caterpillar"},{"clickUrl":"http://www.indeed.com/jobs?q=abacus&l=Texas&from=querymodifiers&qm=1&oq=python","newQuery":"abacus"},{"clickUrl":"http://www.indeed.com/jobs?q=food%2Blion&l=Texas&from=querymodifiers&qm=1&oq=python","newQuery":"food lion"}]},"radius":25,"refineByTypes":[],"results":[{"appliedOrGreater":false,"company":"John Deere","companyBrandingAttributes":{"headerImageUrl":"https://d2q79iu7y748jz.cloudfront.net/s/_headerimage/1960x400/5e8d35d0dcbc8a32f12d61e4541c55ae","logoUrl":"https://d2q79iu7y748jz.cloudfront.net/s/_squarelogo/256x256/b46cb1797d2ea21811908aaa0ab2bdad"},"companyIdEncrypted":"38eb72d608d80c79","companyOverviewLink":"/cmp/John-Deere","companyOverviewLinkCampaignId":"serp-linkcompanyname","companyRating":4,"companyReviewCount":3767,"companyReviewLink":"/cmp/John-Deere/reviews","companyReviewLinkCampaignId":"cmplinktst2","d2iEnabled":false,"displayTitle":"Part-Time Student-MLOps Software 
Engineer-Remote","dradisJob":false,"employerAssistEnabled":false,"employerResponsive":false,"encryptedFccompanyId":"eade00c6021a5947","encryptedResultData":"VwIPTVJ1cTn5AN7Q-tSqGRXGNe2wB2UYx73qSczFnGU","expired":false,"extractTrackingUrls":"","extractedEntities":[],"fccompanyId":-1,"featuredCompanyAttributes":{},"featuredEmployer":false,"featuredEmployerCandidate":false,"feedId":2701,"formattedLocation":"Austin, TX 78704","formattedRelativeTime":"Today","hideMetaData":false,"hideSave":false,"highVolumeHiringModel":{"highVolumeHiring":false},"highlyRatedEmployer":false,"hiringEventJob":false,"indeedApplyEnabled":false,"indeedApplyable":false,"isJobSpotterJob":false,"isJobVisited":false,"isMobileThirdPartyApplyable":false,"isNoResumeJob":false,"isSubsidiaryJob":false,"jobCardRequirementsModel":{"additionalRequirementsCount":0,"requirementsHeaderShown":false},"jobLocationCity":"Austin","jobLocationExtras":"South Lamar-South Congress","jobLocationPostal":"78704","jobLocationState":"TX","jobTypes":["Full-time","Part-time"],"jobkey":"a22fa26470cfb9ad","jsiEnabled":false,"link":"/rc/clk?jk=a22fa26470cfb9ad&fccid=38eb72d608d80c79&vjs=3","locationCount":1,"loceJobTagModel":{},"mobtk":"1ge736cml2gra002","moreLinks":{"companyName":"John Deere","companyText":"John Deere jobs in Austin, TX","locationName":"Austin","qnaUrl":"/cmp/John-Deere/faq","qnaUrlParams":"?from=serp-more&campaignid=serp-more&fromjk=a22fa26470cfb9ad&jcid=38eb72d608d80c79","resultNumber":0,"salaryLocationName":"Austin, TX","salaryNoFollowLink":false,"salaryUrl":"/career/software-engineer/salaries/78704--TX","salaryUrlParams":"?campaignid=serp-more&fromjk=a22fa26470cfb9ad&from=serp-more","shortLocationName":"Austin, TX","showAcmeLink":true,"showAcmeQnaLink":true,"showViewAllCompanyAndLocationLinks":true,"showViewAllCompanyLink":true,"showViewAllLocationLink":true,"showViewAllNormalizedTitleLink":false,"viewAllCompanyLinkText":"John Deere jobs in Austin, 
TX","viewAllCompanyUrl":"/q-John-Deere-l-Austin,-TX-jobs.html","viewAllLocationUrl":"/l-Austin,-TX-jobs.html","visible":false},"moreLocUrl":"/jobs?q=python&l=Texas&jtid=b3a825820658bf92&jcid=38eb72d608d80c79&grp=tcl","mouseDownHandlerOption":{"adId":"","advn":"","extractTrackingUrls":[],"from":"vjs","jobKey":"a22fa26470cfb9ad","link":"/rc/clk?jk=a22fa26470cfb9ad&fccid=38eb72d608d80c79&vjs=3","tk":"1ge736cml2gra002"},"newJob":true,"normTitle":"Part Time Student Mlop Software Engineer Remote","openInterviewsInterviewsOnTheSpot":false,"openInterviewsJob":false,"openInterviewsOffersOnTheSpot":false,"openInterviewsPhoneJob":false,"overrideIndeedApplyText":true,"preciseLocationModel":{"obfuscateLocation":false,"overrideJCMPreciseLocationModel":true},"pubDate":1664427600000,"redirectToThirdPartySite":false,"remoteLocation":false,"remoteWorkModel":{"inlineText":true,"type":"REMOTE_ALWAYS"},"resumeMatch":false,"salarySnippet":{"salaryTextFormatted":false},"saved":false,"savedApplication":false,"screenerQuestionsURL":"","showCommutePromo":false,"showEarlyApply":false,"showJobType":false,"showRelativeDate":true,"showSponsoredLabel":false,"showStrongerAppliedLabel":false,"smartFillEnabled":false,"smbD2iEnabled":false,"snippet":"\u003Cul style=\"list-style-type:circle;margin-top: 0px;margin-bottom: 0px;padding-left:20px;\"\u003E \n \u003Cli style=\"margin-bottom:0px;\"\u003ETitle: Part-Time Student-MLOps Software Engineer-Remote - 91235.\u003C/li\u003E\n \u003Cli\u003EThe Part-Time Student Program is primarily designed to augment the Company’s regular full-time…\u003C/li\u003E\n\u003C/ul\u003E","sourceId":2775,"sponsored":false,"taxoAttributes":[],"taxoAttributesDisplayLimit":5,"taxoLogAttributes":[],"taxonomyAttributes":[{"attributes":[{"label":"Part-time","suid":"75GKK"}};" ...</script>
window.mosaic.providerData["mosaic-provider-jobcards"]
JSON on the page and parse its contents. We can do this with a regex command like: 'window.mosaic.providerData\["mosaic-provider-jobcards"\]=(\{.+?\});'
# Pull the JSON object assigned to the "mosaic-provider-jobcards" provider out of the page HTML.
script_tag = re.findall(r'window.mosaic.providerData\["mosaic-provider-jobcards"\]=(\{.+?\});', html)

# re.findall returns a list (empty when nothing matches), so check it before indexing.
if script_tag:
    json_blob = json.loads(script_tag[0])
    jobs_list = json_blob['metaData']['mosaicProviderJobCardsModel']['results']
else:
    jobs_list = []
import re
import json
import requests
from urllib.parse import urlencode


def get_indeed_search_url(keyword, location, offset=0):
    """Build the Indeed search URL for one page of results.

    Args:
        keyword: Search query, sent as the ``q`` parameter.
        location: Location to search in, sent as the ``l`` parameter.
        offset: Pagination offset, sent as the ``start`` parameter.

    Returns:
        The search URL with every parameter URL-encoded by ``urlencode``.
    """
    parameters = {"q": keyword, "l": location, "filter": 0, "start": offset}
    return "https://www.indeed.com/jobs?" + urlencode(parameters)


headers = {"User-Agent": "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"}

job_id_list = []

## Job Search Parameters
keyword_list = ['software engineer']
location_list = ['California']

## Loop Through Indeed Pages Until No More Jobs
for keyword in keyword_list:
    for location in location_list:
        # Offsets 0, 10, ..., 1000: one request per results page.
        for offset in range(0, 1010, 10):
            try:
                indeed_jobs_url = get_indeed_search_url(keyword, location, offset)
                response = requests.get(indeed_jobs_url, headers=headers)

                if response.status_code == 200:
                    script_tag = re.findall(r'window.mosaic.providerData\["mosaic-provider-jobcards"\]=(\{.+?\});', response.text)

                    # re.findall returns a list and never None; an empty list
                    # means the job cards JSON was not present in this response.
                    if not script_tag:
                        print('Error', 'job cards JSON not found in', indeed_jobs_url)
                        continue

                    json_blob = json.loads(script_tag[0])
                    jobs_list = json_blob['metaData']['mosaicProviderJobCardsModel']['results']
                    for job in jobs_list:
                        job_key = job.get('jobkey')
                        if job_key is not None:
                            job_id_list.append(job_key)

                    ## If response contains less than 10 jobs then stop pagination
                    if len(jobs_list) < 10:
                        break

            except Exception as e:
                # Best effort: report the failure and move on to the next page.
                print('Error', e)

print(job_id_list)
job_id_list
will look like this:[ "f6288f8af00406b1", "56ab4e4fe59ae782", "29bd7638828fab65", "697a7a3f18590465", "08e92505e27442d3", "105529f69e3fdae2", "cb6051b3810c1a1e", "27b6100b1824c062", "5ed01f909421e1dc", "51e6c0cb491f1eb8", "9c00bbf58c08a82b", "1744f664384cf3f9", "d07ab6e42b00c36a", "4afb447f36e4ca83", "a4c8a5cac15f5a54", "2c05f3937cde360a", "37bd28548878ed8f" ]
jobKey
from the jobs_list
# Collect the job key of every result that has one.
for job in jobs_list:
    job_key = job.get('jobkey')
    if job_key is not None:
        job_id_list.append(job_key)
# Build one flat record per job card that carries a job key.
for index, job in enumerate(jobs_list):
    if job.get('jobkey') is not None:
        # 'estimatedSalary' may be absent; look it up once and fall back to defaults below.
        estimated_salary = job.get('estimatedSalary')
        has_salary = estimated_salary is not None

        jobs_data_list.append({
            'keyword': keyword,
            'location': location,
            # Page number derived from the pagination offset (offset 0 is page 1).
            'page': round(offset / 10) + 1 if offset > 0 else 1,
            # Zero-based position of the job within this results page.
            'position': index,
            'company': job.get('company'),
            'companyRating': job.get('companyRating'),
            'companyReviewCount': job.get('companyReviewCount'),
            'highlyRatedEmployer': job.get('highlyRatedEmployer'),
            'jobkey': job.get('jobkey'),
            'jobTitle': job.get('title'),
            'jobLocationCity': job.get('jobLocationCity'),
            'jobLocationPostal': job.get('jobLocationPostal'),
            'jobLocationState': job.get('jobLocationState'),
            'maxSalary': estimated_salary.get('max') if has_salary else 0,
            'minSalary': estimated_salary.get('min') if has_salary else 0,
            # Read the salary period from 'type' (presumably values such as 'YEARLY'), not from 'max'.
            'salaryType': estimated_salary.get('type') if has_salary else 'none',
            'pubDate': job.get('pubDate'),
        })
window._initialData={}
inside a <script>
tag in the HTML response it is pretty easy to extract the data.<script> ... window._initialData={"accountKey":null,"apiPaths":{},"appCommonData":null,"averageRatingsModel":null,"base64EncodedJson":"eyJjIjpmYWxzZSwiZSI6ZmFsc2UsImciOiJodHRwOi8vd3d3LmluZGVlZC5jb20vbS9iYXNlY2FtcC92aWV3am9iP3ZpZXd0eXBlPWVtYmVkZGVkJmprPWY2Mjg4ZjhhZjAwNDA2YjEifQ","baseInboxUrl":"https:\u002F\u002Finbox.indeed.com","baseUrl":"https:\u002F\u002Fwww.indeed.com","benefitsModel":{"benefits":[{"key":"FVKX2","label":"401(k)"},{"key":"SENX8","label":"401(k) matching"},{"key":"6XHWW","label":"Commuter assistance"},{"key":"YJ8XR","label":"Food provided"},{"key":"3K96F","label":"Free parking"},{"key":"TZV2T","label":"Gym membership"},{"key":"EY33Q","label":"Health insurance"},{"key":"Y2WS5","label":"Life insurance"},{"key":"HW4J4","label":"Paid time off"},{"key":"NPHPU","label":"Parental leave"},{"key":"6XT6J","label":"Stock options"}]},"callToInterviewButtonModel":null,"categorizedAttributesModel":null,"chatbotApplyButtonLinkModel":null,"clientsideProctorGroups":{"callToApplyStickySideBySide":true,"mobmapandcommutetimetst9":false,"showSalaryGuide":true,"desktopvj_stickyheader_tst":false,"callToApplyStickyBelowApplyNow":false,"mob_desktop_serp_tst":true,"callButtonPrimaryApplySecondary":false,"showInterviewCardBelowJobDesc":false},"cmiJobCategoryModel":null,"commuteInfoModel":null,"companyAvatarModel":null,"companyFollowFormModel":null,"companyTabModel":null,"contactPersonModel":null,"country":"US","cssResetProviders":{"mosaic-provider-reportcontent":false,"mosaic-provider-salary-feedback":true,"mosaic-provider-company-info-salary":true,"mosaic-provider-rich-media":false,"js-match-insights-provider":false,"MosaicProviderCallToApplyFeedback":true,"mosaic-provider-dislike-feedback":false},"ctk":"1ge4tueuklhdh800","dcmModel":{"category":"jobse0","source":"6927552","type":"organic"},"desktop":true,"desktopSponsoredJobSeenData":"tk=1ge7cg6jhkke6800","dgToken":"6987EEB4A2C6A193E5C44936188510EB","dislikeFro
m2paneEnabled":false,"downloadAppButtonModel":null,"employerResponsiveCardModel":null,"from":null,"globalnavFooterHTML":"","globalnavHeaderHTML":"","highQualityMarketplace":null,"hiringInsightsModel":{"age":"30+ days ago","employerLastReviewed":null,"employerResponsiveCardModel":null,"numOfCandidates":null,"postedToday":false,"recurringHireText":null,"urgentlyHiringModel":null},"indeedApplyButtonContainer":{"brandingText":null,"buttonClickUrl":null,"disabled":false,"employerResponsiveCard":null,"enableStickyInquiry":false,"enableStickyInquiryTooltip":false,"hasMessage":false,"indeedApplyAttributes":{"content":"data-indeed-apply-apiToken='f09f8f8add995328354a7a9e7a7fefdedfe230dee1f12eeb30c6e4b184f2dd9e' data-indeed-apply-jobTitle='Senior Full Stack Engineer (python)' data-indeed-apply-jobId='4931506003' data-indeed-apply-jobLocation='Austin, TX' data-indeed-apply-jobCompanyName='Bluevine' data-indeed-apply-jobUrl='https:\u002F\u002Fwww.indeed.com\u002Fviewjob?jk=f6288f8af00406b1' data-indeed-apply-questions='https:\u002F\u002Fapi.greenhouse.io\u002Fv1\u002Fboards\u002Fbluevine\u002Fjobs\u002F4931506003\u002Findeed' data-indeed-apply-postUrl='https:\u002F\u002Fapi.greenhouse.io\u002Fv1\u002Fboards\u002Fbluevine\u002Fjobs\u002F4931506003\u002Findeed' data-indeed-apply-name='firstlastname' data-indeed-apply-coverletter='optional' data-indeed-apply-phone='required' data-indeed-apply-resume='required' data-indeed-apply-noButtonUI='true' data-indeed-apply-pingbackUrl='https:\u002F\u002Fgdc.indeed.com\u002Fconv\u002ForgIndApp?co=US&vjtk=1ge7cg6jhkke6800&jk=f6288f8af00406b1&mvj=0&astse=b78e1ac815f57228&assa=2193' data-indeed-apply-onappliedstatus='_updateIndeedApplyStatus' data-indeed-apply-onready='_onButtonReady' data-indeed-apply-jk='f6288f8af00406b1' data-indeed-apply-onclose=\"indeedApplyHandleModalClose\" data-indeed-apply-onapplied=\"indeedApplyHandleApply\" data-indeed-apply-oncontinueclick=\"indeedApplyHandleModalClose\" 
data-indeed-apply-onClick=\"indeedApplyHandleButtonClick\" data-indeed-apply-returnToJobSearchUrl='' data-acc-payload=\"1,2,22,1,144,1,552,1,3648,1,4392,1\" data-indeed-apply-recentsearchquery='{\"what\":\"software engineer\",\"where\":\"California\"}'","contentKind":"ATTRIBUTES"},"indeedApplyBaseUrl":"https:\u002F\u002Fapply.indeed.com","indeedApplyBootStrapAttributes":{"hl":"en","source":"idd","co":"US","vjtk":"1ge7cg6jhkke6800"},"indeedApplyButtonAttributes":{"postUrl":"https:\u002F\u002Fapi.greenhouse.io\u002Fv1\u002Fboards\u002Fbluevine\u002Fjobs\u002F4931506003\u002Findeed","jk":"f6288f8af00406b1","onClick":"indeedApplyHandleButtonClick","jobTitle":"Senior Full Stack Engineer (python)","questions":"https:\u002F\u002Fapi.greenhouse.io\u002Fv1\u002Fboards\u002Fbluevine\u002Fjobs\u002F4931506003\u002Findeed","onappliedstatus":"_updateIndeedApplyStatus","jobCompanyName":"Bluevine","recentsearchquery":"{\"what\":\"software engineer\",\"where\":\"California\"}","onclose":"indeedApplyHandleModalClose","jobUrl":"https:\u002F\u002Fwww.indeed.com\u002Fviewjob?jk=f6288f8af00406b1","onready":"_onButtonReady","onapplied":"indeedApplyHandleApply","coverletter":"optional","resume":"required","pingbackUrl":"https:\u002F\u002Fgdc.indeed.com\u002Fconv\u002ForgIndApp?co=US&vjtk=1ge7cg6jhkke6800&jk=f6288f8af00406b1&mvj=0&astse=b78e1ac815f57228&assa=2193","noButtonUI":"true","jobId":"4931506003","apiToken":"f09f8f8add995328354a7a9e7a7fefdedfe230dee1f12eeb30c6e4b184f2dd9e","jobLocation":"Austin, TX","phone":"required","name":"firstlastname","oncontinueclick":"indeedApplyHandleModalClose","returnToJobSearchUrl":""},"indeedApplyButtonModel":{"applyBtnNewStyle":true,"buttonSize":"block","buttonType":"branded","contentHtml":"Apply 
now","dataHref":null,"disclaimer":null,"href":"\u002F","icon":null,"isBlock":false,"largeScreenSizeText":null,"openInNewTab":false,"referrerpolicy":null,"rel":null,"sanitizedHref":null,"sanitizedHtml":null,"sticky":false,"target":null,"title":null,"viewJobDisplay":null},"indeedApplyLoginModalModel":null,"indeedApplyScriptAttributes":{"data-indeed-apply-qs":"vjtk=1ge7cg6jhkke6800"},"indeedApplyScriptLocation":"https:\u002F\u002Fapply.indeed.com\u002Findeedapply\u002Fstatic\u002Fscripts\u002Fapp\u002Fbootstrap.js?hl=en&co=US&source=idd","shouldUseButtonPlaceholder":true,"stagingLevel":"prod","viewFormUrl":null,"viewFormUrlAttribute":{"content":"","contentKind":"ATTRIBUTES"}},"indeedLogoModel":null,"inlineJsErrEnabled":false,"isApp":false,"isApplyTextColorChanges":true,"isApplyTextSizeChanges":true,"isCriOS":false,"isDislikeFormV2Enabled":false,"isSafariForIOS":false,"isSalaryNewDesign":false,"isSyncJobs":false,"jasJobViewPingModel":null,"jasxInputWhatWhereActive":true,"jobAlertSignInModalModel":null,"jobAlertSignUp":null,"jobCardStyleModel":{"elementSpacingIncreased":false,"fontSizeEnlarged":false,"highContrastIconShown":false,"jobCardShelfApplied":false,"salaryBlack":false,"shouldMarkClickedJobAsVisited":false},"jobInfoWrapperModel":{"jobInfoModel":{"appliedStateBannerModel":null,"commuteInfoModel":null,"expiredJobMetadataModel":null,"hideCmpHeader":false,"isD2iEnabled":false,"isJsiEnabled":false,"jobAttributesTestValue":-1,"jobDebugInfoModel":null,"jobDescriptionSectionModel":null,"jobInfoHeaderModel":{"a11yNewtabIconActive":false,"averageRatingsModel":null,"companyImagesModel":{"ejiBannerAsBackground":false,"enhancedJobDescription":false,"featuredEmployer":false,"headerImageUrl":"https:\u002F\u002Fd2q79iu7y748jz.cloudfront.net\u002Fs\u002F_headerimage\u002F1960x400\u002Fae55ead6c2c0702692b9e43ac06f3277","logoAltText":"Bluevine 
logo","logoImageOverlayLower":false,"logoUrl":"https:\u002F\u002Fd2q79iu7y748jz.cloudfront.net\u002Fs\u002F_squarelogo\u002F256x256\u002Fc338b6786b5eadab2a1f404e10259004","showBannerTop":false,"showEnhancedJobImp":false,"showIconInTitle":false},"companyName":"Bluevine","companyOverviewLink":"https:\u002F\u002Fwww.indeed.com\u002Fcmp\u002FBluevine?campaignid=mobvjcmp&from=mobviewjob&tk=1ge7cg6jhkke6800&fromjk=f6288f8af00406b1","companyReviewLink":"https:\u002F\u002Fwww.indeed.com\u002Fcmp\u002FBluevine\u002Freviews?campaignid=mobvjcmp&cmpratingc=mobviewjob&from=mobviewjob&tk=1ge7cg6jhkke6800&fromjk=f6288f8af00406b1&jt=Senior+Full+Stack+Engineer+%28python%29","companyReviewModel":null,"disableAcmeLink":false,"employerActivity":null,"employerResponsiveCardModel":null,"encryptedFccCompanyId":null,"formattedLocation":"Austin, TX","hideRating":false,"isDesktopApplyButtonSticky":false,"isSimplifiedHeader":false,"jobNormTitle":null,"jobTitle":"Senior Full Stack Engineer (python)","jobTypes":null,"location":null,"openCompanyLinksInNewTab":false,"parentCompanyName":null,"preciseLocationModel":null,"ratingsModel":null,"recentSearch":null,"remoteWorkModel":{"inlineText":true,"text":"Hybrid remote","type":"REMOTE_HYBRID"},"salaryMax":null,"salaryMin":null,"salaryType":null,"subtitle":"Bluevine - Austin, TX","tagModels":null,"taxonomyAttributes":null,"viewJobDisplay":"DESKTOP_EMBEDDED"},"jobMetadataHeaderModel":{"jobType":null},"jobTagModel":null,"resumeEvaluationResult":null,"sanitizedJobDescription":{"content":"<div>\n <div>\n <p><b>About Bluevine<\u002Fb><\u002Fp> \n <p> Bluevine is on a mission to enable a better financial future for small business owners through innovative banking solutions designed just for them. 
By combining best-in-class technology with advanced security and a deep understanding of the small business community, we deliver end-to-end banking and lending products that empower always-on entrepreneurs to grow their businesses with confidence.<\u002Fp> \n <p> As a dynamic company with massive potential, we're backed by leading investors such as Lightspeed Venture Partners, Menlo Ventures, 83North, Citi Ventures, and nearly 9 years of proven success. Since launching in 2013, we have grown exponentially, amassing over 400,000 customers across all 50 states and a global team of more than 500 people. Our passion is driven by purpose: to give small businesses the tools they need to succeed and we're just getting started.<\u002Fp> \n <p> All of this begins with our team who are driven by collaboration, problem-solving, and learning and growing together. With a commitment to innovation and community impact, our mission is to help every small business—and every team member—thrive. Join us!<\u002Fp>\n <\u002Fdiv>\n <p><b><i> This is a hybrid role<\u002Fi><\u002Fb><i>. <\u002Fi>At Bluevine, we pride ourselves on our collaborative culture, which we believe is best maintained through in-person interactions and a vibrant office environment. All of our offices have reopened in accordance with local guidelines, and are following a hybrid model. In-office days will be determined by location and discipline.<\u002Fp> \n <p><b> ABOUT THE ROLE:<\u002Fb><\u002Fp> \n <p> We're looking for a Senior Full Stack Engineer flexible enough to develop features from the front (beautiful UX) to the back (scalable and robust components and integrations). 
If you're drawn to engineering challenges and have a strong desire to make a big impact as part of a small, agile team, in an exciting space, we'd love to talk to you.<\u002Fp> \n <p> The team serves a variety of stakeholders across all the business and the platform.<\u002Fp> \n <p><b> WHAT YOU'LL DO:<\u002Fb><\u002Fp> \n <ul> \n <li>Independently drive the engineering development of complex features<\u002Fli> \n <li>Design and build state-of-the-art responsive banking applications<\u002Fli> \n <li>Work closely with, and incorporate feedback from product managers and other stakeholders in the company<\u002Fli> \n <li>Be part of a fast-paced and highly-flexible team with the comfort of making decisions using your best judgement<\u002Fli> \n <li>Develop projects through their entire life cycle<\u002Fli> \n <\u002Ful> \n <p><b>WHAT WE LOOK FOR:<\u002Fb><\u002Fp> \n <ul> \n <li>5+ years of combined full stack experience experience building fast, reliable, web and\u002For mobile applications on applications with Python backends<\u002Fli> \n <li>Experience with Web frameworks (e.g., Angular, React, or Vue)<\u002Fli> \n <li>Experience with source control management systems, preferably Git<\u002Fli> \n <li>B.S. 
in Computer Science or a related field preferred<\u002Fli> \n <\u002Ful> \n <p><b>Nice to Haves<\u002Fb><\u002Fp> \n <ul> \n <li>Experience with AWS<\u002Fli> \n <li>Experience with mobile development (e.g., Native, Native Script, or React)<\u002Fli> \n <\u002Ful>\n <div>\n <div> \n <p><b>BENEFITS AND PERKS - for employees located in the US<\u002Fb><\u002Fp> \n <ul> \n <li>Excellent health coverage and life insurance benefits<\u002Fli> \n <li>401K with an immediate 3% company match<\u002Fli> \n <li>PTO, Company Holidays, and Flexible Holidays<\u002Fli> \n <li>Company-sponsored Mental Health Benefits, including 1:1 therapy<\u002Fli> \n <li>Over $1,000 annually for a wellness benefit of your choice<\u002Fli> \n <li>Monthly WFH stipend<\u002Fli> \n <li>Generous, paid parental leave covering up to 16 weeks<\u002Fli> \n <li>Access to financial coaches and education sessions<\u002Fli> \n <li>Free commuter benefits - Caltrain passes for San Francisco employees and a monthly parking allowance<\u002Fli> \n <li>Monthly DoorDash credit<\u002Fli> \n <li>Weekly catered lunches and fully stocked kitchen pantries<\u002Fli> \n <li>Dog-friendly Redwood City, CA office<\u002Fli> \n <li>Community-based volunteering opportunities<\u002Fli> \n <\u002Ful> \n <p><b>BENEFITS & PERKS - for employees located in Israel<\u002Fb><\u002Fp> \n <ul> \n <li>Excellent group health coverage and life insurance benefits<\u002Fli> \n <li>Stock options<\u002Fli> \n <li>Flexible hybrid work model<\u002Fli> \n <li>Large Study Fund contribution<\u002Fli> \n <li>Salary Benchmarks and Checkpoints<\u002Fli> \n <li>Monthly meal card of TenBis or CiBus (your choice) with generous balance<\u002Fli> \n <li>Free parking for cars, scooters, and bikes<\u002Fli> \n <li>Free gym membership<\u002Fli> \n <li>Company-sponsored Mental Health Benefits<\u002Fli> \n <li>PTO, Company Holidays, and Flexible Holidays<\u002Fli> \n <li>Community-based volunteering opportunities<\u002Fli>\n <\u002Ful>\n <\u002Fdiv>\n 
<\u002Fdiv>\n<\u002Fdiv>\n<div><\u002Fdiv>","contentKind":"HTML"},"screenerRequirementsModel":null,"showExpiredHeader":false,"tagModels":null,"viewJobDisplay":"DESKTOP_EMBEDDED"},"sectionedJobInfoModel":null},"jobKey":"f6288f8af00406b1","jobLocation":"Austin, TX","jobMetadataFooterModel":{"age":"30+ days ago","indeedApplyAdaNotice":"If you require alternative methods of application or screening, you must approach the employer directly to request this as Indeed is not responsible for the employer's application process.","originalJobLink":null,"phoneNumber":null,"saveJobLink":null,"showReportJobAsButton":true,"source":"BlueVine"},"jobSeenData":"tk=1ge7cg6jhkke6800&context=viewjobrecs","jobTitle":"Senior Full Stack Engineer (python)","language":"en","lastVisitTime":1664538063,"lazyProviders":{"mosaic-provider-reportcontent":"<div class=\"mosaic-reportcontent-wrapper button\"><style data-emotion=\"css 1686x4\">.css-1686x4{box-sizing:border-box;background:none;-webkit-appearance:none;-moz-appearance:none;-ms-appearance:none;appearance:none;text-align:left;-webkit-text-decoration:none;text-decoration:none;border:none;cursor:pointer;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;-webkit-box-pack:center;-ms-flex-pack:center;-webkit-justify-content:center;justify-content:center;position:relative;margin:0;padding-left:1rem;padding-right:1rem;line-height:1.5;font-family:\"Noto Sans\",\"Helvetica Neue\",\"Helvetica\",\"Arial\",\"Liberation Sans\",\"Roboto\",\"Noto\",sans-serif;font-size:1rem;font-weight:700;border-radius:0.5rem;border-width:1px;border-style:solid;-webkit-transition:border-color 200ms cubic-bezier(0.645, 0.045, 0.355, 1),background-color 200ms cubic-bezier(0.645, 0.045, 0.355, 1),opacity 200ms cubic-bezier(0.645, 0.045, 0.355, 1),box-shadow 200ms cubic-bezier(0.645, 0.045, 0.355, 1),color 200ms cubic-bezier(0.645, 0.045, 0.355, 
1);transition:border-color 200ms cubic-bezier(0.645, 0.045, 0.355, 1),background-color 200ms cubic-bezier(0.645, 0.045, 0.355, 1),opacity 200ms cubic-bezier(0.645, 0.045, 0.355, 1),box-shadow 200ms cubic-bezier(0.645, 0.045, 0.355, 1),color 200ms cubic-bezier(0.645, 0.045, 0.355, 1);display:-webkit-inline-box;display:-webkit-inline-flex;display:-ms-inline-flexbox;display:inline-flex;width:auto;padding-top:0.5625rem;padding-bottom:0.5625rem;color:#2d2d2d;border-color:#e4e2e0;background-color:#e4e2e0;}.css-1686x4::-moz-focus-inner{border:0;}@media (prefers-reduced-motion: reduce){.css-1686x4{-webkit-transition:none;transition:none;}}.css-1686x4:disabled{opacity:0.4;pointer-events:none;}.css-1686x4:focus{outline:none;box-shadow:0 0 0 0.125rem #ffffff,0 0 0 0.1875rem #2557a7;}.css-1686x4:focus:not([data-focus-visible-added]){box-shadow:none;}.css-1686x4:visited{color:#2d2d2d;}.css-1686x4:hover{border-color:#d4d2d0;background-color:#d4d2d0;}.css-1686x4:active{box-shadow:inset 0 0.125rem 0.25rem rgba(45, 45, 45, 0.2),inset 0 0.0625rem 0.1875rem rgba(45, 45, 45, 0.12),inset 0 0 0.125rem rgba(45, 45, 45, 0.2);border-color:#b4b2b1;background-color:#b4b2b1;}<\u002Fstyle><button class=\"mosaic-reportcontent-button desktop css-1686x4 e8ju0x51\"><span class=\"mosaic-reportcontent-button-icon\"><\u002Fspan>Report job<\u002Fbutton><div class=\"mosaic-reportcontent-content\"><\u002Fdiv><\u002Fdiv>","mosaic-provider-salary-feedback":"","mosaic-provider-company-info-salary":"","mosaic-provider-rich-media":"","js-match-insights-provider":"","MosaicProviderCallToApplyFeedback":"","mosaic-provider-dislike-feedback":"<div class=\"animatedToast i-unmask\"><div class=\"\"><\u002Fdiv><\u002Fdiv>"},"locale":"en_US","localeData":{"":[null,"Project-Id-Version: \nReport-Msgid-Bugs-To: \nPOT-Creation-Date: 2022-09-27 23:51-0500\nPO-Revision-Date: 2021-08-06 19:00+0000\nLast-Translator: Auto Generated <noreply@indeed.com>\nLanguage-Team: English (United States) 
<https:\u002F\u002Fweblate.corp.indeed.com\u002Fprojects\u002Findeed\u002Findeedmobile-i18n-content\u002Fen_US\u002F>\nLanguage: en_US\nMIME-Version: 1.0\nContent-Type: text\u002Fplain; charset=UTF-8\nContent-Transfer-Encoding: 8bit\nPlural-Forms: nplurals=2; plural=n != 1;\nX-Generator: Weblate 3.9.1\n"],"\"Interview times available\" card content\u0004If your application meets the employer's criteria, you may be able to book a call or request an interview that suits your schedule.":[null,"If your application meets the employer's criteria, you can directly provide your availability for a video interview."]},"loggedIn":false,"mobResourceTimingEnabled":false,"mobileGlobalHeader":null,"mobtk":"1ge7cg6jhkke6800","mosaicData":null,"originalJobLinkModel":null,"pageId":"viewjob","parenttk":null,"phoneLinkType":null,"phoneNumberButtonLinkModel":null,"preciseLocationModel":null,"profileBaseUrl":"https:\u002F\u002Fprofile.indeed.com","queryString":null,"recentQueryString":"q1=software engineer&l1=california&r1=-1&q2=python&l2=texas&r2=-1","relatedLinks":null,"resumeFooterModel":{"buttonLink":{"applyBtnNewStyle":false,"buttonSize":"md","buttonType":"primary","contentHtml":"Upload Your Resume","dataHref":"\u002Fpromo\u002Fresume?from=bottomResumeCTAviewjob&trk.origin=viewjob","disclaimer":null,"href":"\u002Fpromo\u002Fresume","icon":null,"isBlock":false,"largeScreenSizeText":null,"openInNewTab":false,"referrerpolicy":null,"rel":null,"sanitizedHref":null,"sanitizedHtml":null,"sticky":false,"target":null,"title":null,"viewJobDisplay":null},"isJanusActive":true,"letEmployersFindText":"Let Employers Find You"},"resumePromoCardModel":null,"rtl":false,"salaryGuideModel":{"acmeMicrocontentEndpoint":"https:\u002F\u002Fcocos-api.indeed.com","country":"US","estimatedSalaryModel":{"formattedRange":"$120K - $152K a year","max":151976.5,"min":120023.5,"type":"YEARLY"},"formattedLocation":"Austin, 
TX","jobKey":"f6288f8af00406b1","language":"en"},"salaryInfoModel":null,"saveJobButtonContainerModel":{"alreadySavedButtonModel":{"actions":["Saved","Applied","Interviewing","Offered","Hired"],"buttonSize":"block","buttonType":"secondary","contentHtml":"Saved","href":"\u002F","iconSize":null},"applyFromComputerLogUrl":"\u002Fm\u002Frpc\u002Flog\u002Femailmyself?jk=f6288f8af00406b1&mobvjtk=1ge7cg6jhkke6800&sbt=121f10e71cf3df2d415dae11933eb9ce&ctk=1ge4tueuklhdh800&acctKey=","currentJobState":"VISITED","didYouApplyPromptModel":{"calloutModel":{"actionsList":null,"actionsMap":{"NO":{"children":"Not interested","className":null,"href":null,"target":null},"LATER":{"children":"Maybe later","className":null,"href":null,"target":null},"YES":{"children":"Yes","className":null,"href":null,"target":null}},"caretPosition":null,"children":null,"dismissAriaLabel":"Close","dismissAttributes":null,"dismissHref":null,"heading":"Did you apply?"},"jobKey":"f6288f8af00406b1","possibleResponses":{"NO":"NO","LATER":"LATER","YES":"YES"},"userCanView":false},"didYouApplyResponseUrl":"\u002Fm\u002Frpc\u002Fdidyouapply?tk=1ge7cg6jhkke6800&jobKey=f6288f8af00406b1&originPage=viewjob&from=viewjob","hashedCSRFToken":"121f10e71cf3df2d415dae11933eb9ce","isAlreadySavedButtonVisible":false,"isDisableJobStatusChange":false,"isLoggedIn":false,"isSaveWithoutLoginEnabled":false,"isSticky":false,"isSyncJobs":false,"mobtk":"1ge7cg6jhkke6800","myIndeedLoginLink":"https:\u002F\u002Fwww.indeed.com\u002Faccount\u002Flogin?dest=%2Fm%2Fbasecamp%2Fviewjob%3Fviewtype%3Dembedded%26jk%3Df6288f8af00406b1&from=jsfe-desktopembedded-save-indeedmobile","myJobsAPIHref":"\u002Frpc\u002Flog\u002Fmyjobs\u002Ftransition_job_state?client=mobile&cause=statepicker&preserveTimestamp=false&tk=1ge7cg6jhkke6800&jobKey=f6288f8af00406b1&originPage=viewjob","myJobsURL":"https:\u002F\u002Fmyjobs.indeed.com?co=US&hl=en_US&from=viewjob","pageId":"viewjob","possibleJobActions":{"SAVED":"save","APPLIED":"apply","INTERVIEWING":"interview","O
FFERED":"offer","HIRED":"hire","VISITED":"visit","ARCHIVED":"archive"},"possibleJobStates":{"SAVED":"Saved","APPLIED":"Applied","INTERVIEWING":"Interviewing","OFFERED":"Offered","HIRED":"Hired","VISITED":"Visited","ARCHIVED":"Archived"},"saveButtonModel":{"applyBtnNewStyle":false,"buttonSize":"block","buttonType":"secondary","contentHtml":"","dataHref":null,"disclaimer":null,"href":"\u002F","icon":{"iconTitle":"save-icon","iconType":"favorite-border"},"isBlock":false,"largeScreenSizeText":null,"openInNewTab":false,"referrerpolicy":null,"rel":null,"sanitizedHref":null,"sanitizedHtml":null,"sticky":false,"target":null,"title":null,"viewJobDisplay":"DESKTOP_EMBEDDED"},"showSaveJobInlineCallout":true,"uistates":{"INTERVIEWING":"INTERVIEWING","OFFERED":"OFFERED","SAVED":"SAVED","VISITED":"VISITED","HIRED":"HIRED","ARCHIVED":"ARCHIVED","APPLIED":"APPLIED"},"viewJobDisplay":"DESKTOP_EMBEDDED"},"saveJobCalloutModel":{"actionsList":null,"actionsMap":{"createaccount":{"children":"Create account (it's free)","className":null,"href":"https:\u002F\u002Fwww.indeed.com\u002Faccount\u002Fregister?dest=%2Fm%2Fbasecamp%2Fviewjob%3Fviewtype%3Dembedded%26jk%3Df6288f8af00406b1","target":"_PARENT"},"signin":{"children":"Sign in","className":null,"href":"https:\u002F\u002Fwww.indeed.com\u002Faccount\u002Flogin?dest=%2Fm%2Fbasecamp%2Fviewjob%3Fviewtype%3Dembedded%26jk%3Df6288f8af00406b1","target":"_PARENT"}},"caretPosition":null,"children":"You must sign in to save jobs:","dismissAriaLabel":"Close","dismissAttributes":null,"dismissHref":null,"heading":"Save jobs and view them from any computer."},"saveJobFailureModalModel":{"closeAriaLabel":"Close","closeButtonText":"Close","message":"Please retry","signInButtonText":null,"signInHref":null,"title":"Failed to Save Job"},"saveJobLimitExceededModalModel":{"closeAriaLabel":"Close","closeButtonText":null,"message":"You reached the limit. 
Please log in to save additional jobs.","signInButtonText":"Sign in","signInHref":"https:\u002F\u002Fwww.indeed.com\u002Faccount\u002Flogin?dest=%2Fm%2Fbasecamp%2Fviewjob%3Fviewtype%3Dembedded%26jk%3Df6288f8af00406b1&from=viewjob_savejoblimitmodal","title":"You've already saved 20 jobs"},"segmentId":"software_dev_seo","segmentPhoneNumberButtonLinkModel":null,"shareJobButtonContainerModel":{"buttonIconModel":{"color":"blue","position":null,"size":"md","title":"Share this job","type":"\u002Fm\u002Fimages\u002Fnativeshare.svg"},"buttonModel":{"buttonSize":null,"buttonType":"secondary","children":"Share this job","disabled":false,"href":null,"isActive":false,"isBlock":false,"isResponsive":false,"size":"md"},"fallbackButtonIconModel":{"color":"blue","position":null,"size":"md","title":"Copy link","type":"\u002Fm\u002Fimages\u002Ficon-copy.svg"},"shareText":"Check out this job on Indeed:\nBluevine\nSenior Full Stack Engineer (python)\nAustin, TX\nhttps:\u002F\u002Fwww.indeed.com\u002Fm\u002Fviewjob?jk=f6288f8af00406b1&from=native","shareType":"native","shareUrl":"https:\u002F\u002Fwww.indeed.com\u002Fm\u002Fviewjob?jk=f6288f8af00406b1&from=native","showUnderSaveButton":true},"shouldLogResolution":true,"showEmployerResponsiveCard":false,"showGlobalNavContent":false,"showReportInJobButtons":false,"sponsored":false,"sponsoredAdsContainerModel":null,"sponsoredJobs":null,"staticPrefix":"\u002F\u002Fd3fw5vlhllyvee.cloudfront.net\u002Fm\u002Fs\u002F","stickyType":"ALWAYS","successfullySignedInModel":null,"viewJobButtonLinkContainerModel":null,"viewJobDisplay":"DESKTOP_EMBEDDED","viewJobDisplayParam":"dtembd","viewjobDislikes":false,"whatWhereFormModel":null,"zoneProviders":{"aboveViewjobButtons":[],"viewjobModals":["MosaicProviderCallToApplyFeedback"],"aboveExtractedJobDescription":[],"aboveFullJobDescription":["mosaic-provider-company-info-salary"],"rightRail":[],"legacyProvidersViewJob":["mosaic-provider-reportcontent"],"betweenJobDescriptionAndButtons":[],"ssrVJModals":[],"be
lowJobDescription":[],"belowFullJobDescription":[],"belowViewjobButtons":["mosaic-provider-dislike-feedback","mosaic-provider-salary-feedback"],"belowViewjobNav":[]}};</script>
'window._initialData=(\{.+?\});'
# Pull the JSON object assigned to `_initialData` out of the page's HTML.
script_tag = re.findall(r"_initialData=(\{.+?\});", html)

# re.findall returns an empty list (never None) when nothing matches,
# so check it before indexing.
if script_tag:
    json_blob = json.loads(script_tag[0])
    job = json_blob["jobInfoWrapperModel"]["jobInfoModel"]
import re
import json
import requests
from urllib.parse import urlencode

headers = {"User-Agent": "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"}

job_id_list = [
    'f6288f8af00406b1',
    '56ab4e4fe59ae782',
    '29bd7638828fab65',
    '697a7a3f18590465',
    '08e92505e27442d3',
    '105529f69e3fdae2',
]

full_job_data_list = []

## Request each job page and extract the job details from its embedded JSON
for job_id in job_id_list:
    try:
        indeed_job_url = 'https://www.indeed.com/m/basecamp/viewjob?viewtype=embedded&jk=' + job_id
        response = requests.get(indeed_job_url, headers=headers)

        if response.status_code == 200:
            script_tag = re.findall(r"_initialData=(\{.+?\});", response.text)

            # re.findall returns a list (possibly empty), never None, so test
            # for emptiness before indexing into it.
            if script_tag:
                json_blob = json.loads(script_tag[0])
                job = json_blob["jobInfoWrapperModel"]["jobInfoModel"]

                # Either sub-model may be absent; fall back to '' in that case.
                header = job.get('jobInfoHeaderModel')
                description = job.get('sanitizedJobDescription')

                full_job_data_list.append({
                    'company': header.get('companyName') if header is not None else '',
                    'jobkey': job_id,
                    'jobTitle': header.get('jobTitle') if header is not None else '',
                    'jobDescription': description.get('content') if description is not None else '',
                })

    except Exception as e:
        # Best effort: report the failure and move on to the next job id.
        print('Error', e)

print(full_job_data_list)
[{ "company": "Bluevine", "jobkey": "f6288f8af00406b1", "jobTitle": "Senior Full Stack Engineer (python)", "jobDescription": "<div>\n <div>\n <p><b>About Bluevine</b></p> \n <p> Bluevine is on a mission to enable a better financial future for small business owners through innovative banking solutions designed just for them. By combining best-in-class technology with advanced security and a deep understanding of the small business community, we deliver end-to-end banking and lending products that empower always-on entrepreneurs to grow their businesses with confidence.</p> \n <p> As a dynamic company with massive potential, we're backed by leading investors such as Lightspeed Venture Partners, Menlo Ventures, 83North, Citi Ventures, and nearly 9 years of proven success. Since launching in 2013, we have grown exponentially, amassing over 400,000 customers across all 50 states and a global team of more than 500 people. Our passion is driven by purpose: to give small businesses the tools they need to succeed and we're just getting started.</p> \n <p> All of this begins with our team who are driven by collaboration, problem-solving, and learning and growing together. With a commitment to innovation and community impact, our mission is to help every small business—and every team member—thrive. Join us!</p>\n </div>\n <p><b><i> This is a hybrid role</i></b><i>. </i>At Bluevine, we pride ourselves on our collaborative culture, which we believe is best maintained through in-person interactions and a vibrant office environment. All of our offices have reopened in accordance with local guidelines, and are following a hybrid model. In-office days will be determined by location and discipline.</p> \n <p><b> ABOUT THE ROLE:</b></p> \n <p> We're looking for a Senior Full Stack Engineer flexible enough to develop features from the front (beautiful UX) to the back (scalable and robust components and integrations). 
If you're drawn to engineering challenges and have a strong desire to make a big impact as part of a small, agile team, in an exciting space, we'd love to talk to you.</p> \n <p> The team serves a variety of stakeholders across all the business and the platform.</p> \n <p><b> WHAT YOU'LL DO:</b></p> \n <ul> \n <li>Independently drive the engineering development of complex features</li> \n <li>Design and build state-of-the-art responsive banking applications</li> \n <li>Work closely with, and incorporate feedback from product managers and other stakeholders in the company</li> \n <li>Be part of a fast-paced and highly-flexible team with the comfort of making decisions using your best judgement</li> \n <li>Develop projects through their entire life cycle</li> \n </ul> \n <p><b>WHAT WE LOOK FOR:</b></p> \n <ul> \n <li>5+ years of combined full stack experience experience building fast, reliable, web and/or mobile applications on applications with Python backends</li> \n <li>Experience with Web frameworks (e.g., Angular, React, or Vue)</li> \n <li>Experience with source control management systems, preferably Git</li> \n <li>B.S. 
in Computer Science or a related field preferred</li> \n </ul> \n <p><b>Nice to Haves</b></p> \n <ul> \n <li>Experience with AWS</li> \n <li>Experience with mobile development (e.g., Native, Native Script, or React)</li> \n </ul>\n <div>\n <div> \n <p><b>BENEFITS AND PERKS - for employees located in the US</b></p> \n <ul> \n <li>Excellent health coverage and life insurance benefits</li> \n <li>401K with an immediate 3% company match</li> \n <li>PTO, Company Holidays, and Flexible Holidays</li> \n <li>Company-sponsored Mental Health Benefits, including 1:1 therapy</li> \n <li>Over $1,000 annually for a wellness benefit of your choice</li> \n <li>Monthly WFH stipend</li> \n <li>Generous, paid parental leave covering up to 16 weeks</li> \n <li>Access to financial coaches and education sessions</li> \n <li>Free commuter benefits - Caltrain passes for San Francisco employees and a monthly parking allowance</li> \n <li>Monthly DoorDash credit</li> \n <li>Weekly catered lunches and fully stocked kitchen pantries</li> \n <li>Dog-friendly Redwood City, CA office</li> \n <li>Community-based volunteering opportunities</li> \n </ul> \n <p><b>BENEFITS & PERKS - for employees located in Israel</b></p> \n <ul> \n <li>Excellent group health coverage and life insurance benefits</li> \n <li>Stock options</li> \n <li>Flexible hybrid work model</li> \n <li>Large Study Fund contribution</li> \n <li>Salary Benchmarks and Checkpoints</li> \n <li>Monthly meal card of TenBis or CiBus (your choice) with generous balance</li> \n <li>Free parking for cars, scooters, and bikes</li> \n <li>Free gym membership</li> \n <li>Company-sponsored Mental Health Benefits</li> \n <li>PTO, Company Holidays, and Flexible Holidays</li> \n <li>Community-based volunteering opportunities</li>\n </ul>\n </div>\n </div>\n</div>\n<div></div>" }, ]
SCRAPEOPS_API_KEY = 'YOUR_API_KEY'


def scrapeops_url(url):
    """Return the ScrapeOps proxy URL that fetches `url` on our behalf.

    `url` must be the plain target URL, NOT an already percent-encoded one:
    urlencode() percent-encodes it when it is embedded as the `url` query
    parameter, so a pre-encoded value would be encoded twice
    (e.g. '%3A' would become '%253A').
    """
    payload = {'api_key': SCRAPEOPS_API_KEY, 'url': url, 'country': 'us'}
    proxy_url = 'https://proxy.scrapeops.io/v1/?' + urlencode(payload)
    return proxy_url


# Build a valid Indeed search URL; its query values are encoded here, and the
# URL as a whole is encoded once more (and only once) inside scrapeops_url().
indeed_search_url = 'https://www.indeed.com/jobs?' + urlencode(
    {'q': 'software engineer', 'l': 'San Francisco', 'start': 0, 'filter': 0}
)

## Send URL To ScrapeOps Instead of Indeed
response = requests.get(scrapeops_url(indeed_search_url))
import re
import json
import requests
from urllib.parse import urlencode

job_id_list = [
    'f6288f8af00406b1',
    '56ab4e4fe59ae782',
    '29bd7638828fab65',
    '697a7a3f18590465',
    '08e92505e27442d3',
    '105529f69e3fdae2',
]

full_job_data_list = []

## Request each job page through the ScrapeOps proxy and extract the job details
for job_id in job_id_list:
    try:
        indeed_job_url = 'https://www.indeed.com/m/basecamp/viewjob?viewtype=embedded&jk=' + job_id
        response = requests.get(scrapeops_url(indeed_job_url))

        if response.status_code == 200:
            script_tag = re.findall(r"_initialData=(\{.+?\});", response.text)

            # re.findall returns a list (possibly empty), never None, so test
            # for emptiness before indexing into it.
            if script_tag:
                json_blob = json.loads(script_tag[0])
                job = json_blob["jobInfoWrapperModel"]["jobInfoModel"]

                # Either sub-model may be absent; fall back to '' in that case.
                header = job.get('jobInfoHeaderModel')
                description = job.get('sanitizedJobDescription')

                full_job_data_list.append({
                    'company': header.get('companyName') if header is not None else '',
                    'jobkey': job_id,
                    'jobTitle': header.get('jobTitle') if header is not None else '',
                    'jobDescription': description.get('content') if description is not None else '',
                })

    except Exception as e:
        # Best effort: report the failure and move on to the next job id.
        print('Error', e)

# A bare expression only displays its value in a REPL/notebook; print it so the
# result is also shown when this runs as a script.
print(full_job_data_list)
Then check out ScrapeOps, the complete toolkit for web scraping.
Unencoded --> 'https://www.indeed.com/jobs?q=software engineer&l=San Francisco&start=0&filter=0'
Encoded --> 'https%3A%2F%2Fwww.indeed.com%2Fjobs%3Fq%3Dsoftware%20engineer%26l%3DSan%20Francisco%26start%3D0%26filter%3D0'
q
stands for the search query. In our case, q=software engineer
. Note: If you want to search for a keyword that contains spaces or special characters then remember you need to encode this value. (Encoded: q%3Dsoftware%20engineer
)l
stands for the location you want to search for jobs. In our case, we used l=California
.start
stands for the starting point for the pagination. We use the start
parameter to paginate through results.<script id="mosaic-data" type="text/javascript">
tag, under window.mosaic.providerData["mosaic-provider-jobcards"]
.<script id="mosaic-data" type="text/javascript"> ... window.mosaic.providerData["mosaic-provider-jobcards"]={"metaData":{"mosaicProviderJobCardsModel":{"adSignature":"3573","appName":"jasx","applyHolisticStyle":true,"bot":false,"brandedAds":[],"csrfToken":"VfkW0LdLxAXrjIkxOeasTnBCW8vSv9TE","encryptedQueryData":"RnZhMybXSk4M3QtTVGXWoe9dbTL46KyFjV9_vwSAcQxuziQ2QCDK8B6B0pUnV6xlgzK1HVOkc0tMGyUpMO9yEdnbun4jJaS6CbMzioz2PqM","experienceLevelFilterRefineBy":"","fccId":-1,"hasResume":false,"indeedApplyOnlyFilterUsed":false,"ipCountry":"IE","isDesktop":true,"isHighContrastIconShown":true,"isIpadApp":false,"isJobCardShelfApplied":true,"isTablet":false,"jobSeenLogParameters":{},"linkTargetAttribute":"_blank","loggedIn":false,"mobtk":"1ge736cml2gra002","mosaicNonJobContent":[],"mustShowSponsoredLabel":false,"myIndeedEnabled":true,"myIndeedRegisterLink":"https://www.indeed.com/account/register?dest=%2Fjobs%3Fjson%3D1%26q%3Dpython%26vjk%3D532734731891698b%26l%3DTexas","noJsUrlOnly":false,"overrideShelf":true,"pageNumber":1,"prforceGroups":"","proctorContext":{"accountId":-1,"app":false,"country":"US","ctkAge":72611863,"ctkDate":"20220929","hasRez":false,"lang":"en","loggedIn":false,"mtkAge":72611863,"platform":"","privileged":false,"smartphone":false,"stealthGroups":[],"tablet":false,"uaData":"{\"android\":false,\"androidApp\":false,\"androidEmployerApp\":false,\"androidJobSearchApp\":false,\"app\":false,\"bot\":false,\"browser\":\"CHROME\",\"browserFamily\":\"CHROME\",\"browserReleaseVersion\":{\"matchPrecision\":\"BUILD\",\"version\":29554872554618880},\"browserVersion\":{\"majorVersion\":\"105\",\"minorVersion\":\"-1\",\"version\":\"105\"},\"chrome\":true,\"chromeForIOS\":false,\"currentJobseekerDeprecatedBrowser\":false,\"deviceType\":\"COMPUTER\",\"droidRezUploadDialog\":false,\"dumbPhone\":false,\"employerApp\":false,\"fileUploadCapable\":true,\"futureJobseekerDeprecatedBrowser\":false,\"geolocationCapable\":false,\"googleWebLight\":false,\"ios\":false,\"iosemployerApp\":fa
lse,\"iosjobSearchApp\":false,\"ipad\":false,\"ipadApp\":false,\"ipadJobSearchApp\":false,\"jobSearchApp\":false,\"mobileDevice\":false,\"operatingSystem\":\"WINDOWS\",\"os\":{\"family\":\"windows\",\"majorVersion\":-1,\"minorVersion\":-1,\"osFamily\":\"windows\",\"osVersion\":{\"matchPrecision\":\"BUILD\",\"version\":0},\"patchVersion\":-1,\"releaseVersion\":{\"matchPrecision\":\"BUILD\",\"version\":0},\"version\":\"\"},\"phone\":false,\"releaseVersion\":{\"matchPrecision\":\"BUILD\",\"version\":29554872554618880},\"safari\":false,\"safariForIOS\":false,\"smartPhone\":false,\"tablet\":false,\"uaVersion\":{\"matchPrecision\":\"BUILD\",\"version\":29554872554618880},\"userAgentDelegate\":{\"android\":false,\"bot\":false,\"browser\":\"CHROME\",\"browserName\":\"Chrome\",\"browserReleaseVersion\":{\"matchPrecision\":\"BUILD\",\"version\":29554872554618880},\"browserVersion\":{\"majorVersion\":\"105\",\"minorVersion\":\"-1\",\"version\":\"105\"},\"browserVersionString\":\"105\",\"chrome\":true,\"delegate\":{\"allFields\":{\"DeviceClass\":{\"confidence\":500,\"defaultValue\":\"Unknown\",\"isDefaultValue\":false,\"value\":\"Desktop\"},\"DeviceName\":{\"confidence\":400001,\"defaultValue\":\"Unknown\",\"isDefaultValue\":false,\"value\":\"Desktop\"},\"DeviceBrand\":{\"confidence\":0,\"defaultValue\":\"Unknown\",\"isDefaultValue\":true,\"value\":\"Unknown\"},\"OperatingSystemClass\":{\"confidence\":400001,\"defaultValue\":\"Unknown\",\"isDefaultValue\":false,\"value\":\"Desktop\"},\"OperatingSystemName\":{\"confidence\":400001,\"defaultValue\":\"Unknown\",\"isDefaultValue\":false,\"value\":\"Windows 
NT\"},\"OperatingSystemVersion\":{\"confidence\":400001,\"defaultValue\":\"??\",\"isDefaultValue\":true,\"value\":\"??\"},\"OperatingSystemVersionMajor\":{\"confidence\":400001,\"defaultValue\":\"??\",\"isDefaultValue\":true,\"value\":\"??\"},\"AgentClass\":{\"confidence\":2014,\"defaultValue\":\"Unknown\",\"isDefaultValue\":false,\"value\":\"Browser\"},\"AgentName\":{\"confidence\":2014,\"defaultValue\":\"Unknown\",\"isDefaultValue\":false,\"value\":\"Chrome\"},\"AgentVersion\":{\"confidence\":3000,\"defaultValue\":\"??\",\"isDefaultValue\":false,\"value\":\"105\"},\"AgentInformationEmail\":{\"confidence\":-1,\"defaultValue\":\"Unknown\",\"isDefaultValue\":true,\"value\":\"Unknown\"},\"AgentInformationUrl\":{\"confidence\":-1,\"defaultValue\":\"Unknown\",\"isDefaultValue\":true,\"value\":\"Unknown\"},\"WebviewAppName\":{\"confidence\":-1,\"defaultValue\":\"Unknown\",\"isDefaultValue\":true,\"value\":\"Unknown\"},\"WebviewAppVersion\":{\"confidence\":-1,\"defaultValue\":\"??\",\"isDefaultValue\":true,\"value\":\"??\"},\"__SyntaxError__\":{\"confidence\":-1,\"defaultValue\":\"false\",\"isDefaultValue\":true,\"value\":\"false\"}},\"ambiguityCount\":0,\"availableFieldNamesSorted\":[\"DeviceClass\",\"DeviceName\",\"DeviceBrand\",\"OperatingSystemClass\",\"OperatingSystemName\",\"OperatingSystemVersion\",\"OperatingSystemVersionMajor\",\"AgentClass\",\"AgentName\",\"AgentVersion\",\"AgentInformationEmail\",\"AgentInformationUrl\",\"WebviewAppName\",\"WebviewAppVersion\",\"__SyntaxError__\"],\"cleanedAvailableFieldNamesSorted\":[\"DeviceClass\",\"DeviceName\",\"DeviceBrand\",\"OperatingSystemClass\",\"OperatingSystemName\",\"OperatingSystemVersion\",\"OperatingSystemVersionMajor\",\"AgentClass\",\"AgentName\",\"AgentVersion\"],\"hasAmbiguity\":false,\"hasSyntaxError\":false,\"headers\":{\"User-Agent\":\"Mozilla\\u002F5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\\u002F537.36 (KHTML, like Gecko) Chrome\\u002F105.0.0.0 
Safari\\u002F537.36\"},\"userAgentString\":\"Mozilla\\u002F5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\\u002F537.36 (KHTML, like Gecko) Chrome\\u002F105.0.0.0 Safari\\u002F537.36\",\"userAgentStringField\":{\"confidence\":0,\"defaultValue\":\"Mozilla\\u002F5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\\u002F537.36 (KHTML, like Gecko) Chrome\\u002F105.0.0.0 Safari\\u002F537.36\",\"isDefaultValue\":false,\"value\":\"Mozilla\\u002F5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\\u002F537.36 (KHTML, like Gecko) Chrome\\u002F105.0.0.0 Safari\\u002F537.36\"}},\"deviceName\":\"Desktop\",\"deviceType\":\"COMPUTER\",\"deviceTypeString\":\"Desktop\",\"dumbPhone\":false,\"ios\":false,\"ipad\":false,\"mobileDevice\":false,\"operatingSystem\":\"WINDOWS\",\"operatingSystemFamily\":\"Windows NT\",\"operatingSystemVersion\":\"??\",\"os\":{\"family\":\"windows\",\"majorVersion\":-1,\"minorVersion\":-1,\"osFamily\":\"windows\",\"osVersion\":{\"matchPrecision\":\"BUILD\",\"version\":0},\"patchVersion\":-1,\"releaseVersion\":{\"matchPrecision\":\"BUILD\",\"version\":0},\"version\":\"\"},\"phone\":false,\"safari\":false,\"smartPhone\":false,\"tablet\":false,\"userAgentString\":\"Mozilla\\u002F5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\\u002F537.36 (KHTML, like Gecko) Chrome\\u002F105.0.0.0 Safari\\u002F537.36\",\"webviewName\":\"Unknown\",\"webviewVersion\":{\"matchPrecision\":\"BUILD\",\"version\":0},\"windowsPhone\":false},\"userAgentString\":\"Mozilla\\u002F5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\\u002F537.36 (KHTML, like Gecko) Chrome\\u002F105.0.0.0 Safari\\u002F537.36\",\"version\":{\"major\":105,\"minor\":-1,\"version\":\"105\"},\"windowsPhone\":false}"},"proctorIdentifiers":{"ACCOUNT":"-1","USER":"1ge4tueuklhdh800"},"queryModifierResult":{"originalQuery":"python","queryModifiers":[{"clickUrl":"http://www.indeed.com/jobs?q=python%2Bintern&l=Texas&from=querymodifiers&qm=1&oq=python","newQuery":"python 
intern"},{"clickUrl":"http://www.indeed.com/jobs?q=panda&l=Texas&from=querymodifiers&qm=1&oq=python","newQuery":"panda"},{"clickUrl":"http://www.indeed.com/jobs?q=bobcat&l=Texas&from=querymodifiers&qm=1&oq=python","newQuery":"bobcat"},{"clickUrl":"http://www.indeed.com/jobs?q=rhino&l=Texas&from=querymodifiers&qm=1&oq=python","newQuery":"rhino"},{"clickUrl":"http://www.indeed.com/jobs?q=reptile&l=Texas&from=querymodifiers&qm=1&oq=python","newQuery":"reptile"},{"clickUrl":"http://www.indeed.com/jobs?q=boba&l=Texas&from=querymodifiers&qm=1&oq=python","newQuery":"boba"},{"clickUrl":"http://www.indeed.com/jobs?q=drupal&l=Texas&from=querymodifiers&qm=1&oq=python","newQuery":"drupal"},{"clickUrl":"http://www.indeed.com/jobs?q=caterpillar&l=Texas&from=querymodifiers&qm=1&oq=python","newQuery":"caterpillar"},{"clickUrl":"http://www.indeed.com/jobs?q=abacus&l=Texas&from=querymodifiers&qm=1&oq=python","newQuery":"abacus"},{"clickUrl":"http://www.indeed.com/jobs?q=food%2Blion&l=Texas&from=querymodifiers&qm=1&oq=python","newQuery":"food lion"}]},"radius":25,"refineByTypes":[],"results":[{"appliedOrGreater":false,"company":"John Deere","companyBrandingAttributes":{"headerImageUrl":"https://d2q79iu7y748jz.cloudfront.net/s/_headerimage/1960x400/5e8d35d0dcbc8a32f12d61e4541c55ae","logoUrl":"https://d2q79iu7y748jz.cloudfront.net/s/_squarelogo/256x256/b46cb1797d2ea21811908aaa0ab2bdad"},"companyIdEncrypted":"38eb72d608d80c79","companyOverviewLink":"/cmp/John-Deere","companyOverviewLinkCampaignId":"serp-linkcompanyname","companyRating":4,"companyReviewCount":3767,"companyReviewLink":"/cmp/John-Deere/reviews","companyReviewLinkCampaignId":"cmplinktst2","d2iEnabled":false,"displayTitle":"Part-Time Student-MLOps Software 
Engineer-Remote","dradisJob":false,"employerAssistEnabled":false,"employerResponsive":false,"encryptedFccompanyId":"eade00c6021a5947","encryptedResultData":"VwIPTVJ1cTn5AN7Q-tSqGRXGNe2wB2UYx73qSczFnGU","expired":false,"extractTrackingUrls":"","extractedEntities":[],"fccompanyId":-1,"featuredCompanyAttributes":{},"featuredEmployer":false,"featuredEmployerCandidate":false,"feedId":2701,"formattedLocation":"Austin, TX 78704","formattedRelativeTime":"Today","hideMetaData":false,"hideSave":false,"highVolumeHiringModel":{"highVolumeHiring":false},"highlyRatedEmployer":false,"hiringEventJob":false,"indeedApplyEnabled":false,"indeedApplyable":false,"isJobSpotterJob":false,"isJobVisited":false,"isMobileThirdPartyApplyable":false,"isNoResumeJob":false,"isSubsidiaryJob":false,"jobCardRequirementsModel":{"additionalRequirementsCount":0,"requirementsHeaderShown":false},"jobLocationCity":"Austin","jobLocationExtras":"South Lamar-South Congress","jobLocationPostal":"78704","jobLocationState":"TX","jobTypes":["Full-time","Part-time"],"jobkey":"a22fa26470cfb9ad","jsiEnabled":false,"link":"/rc/clk?jk=a22fa26470cfb9ad&fccid=38eb72d608d80c79&vjs=3","locationCount":1,"loceJobTagModel":{},"mobtk":"1ge736cml2gra002","moreLinks":{"companyName":"John Deere","companyText":"John Deere jobs in Austin, TX","locationName":"Austin","qnaUrl":"/cmp/John-Deere/faq","qnaUrlParams":"?from=serp-more&campaignid=serp-more&fromjk=a22fa26470cfb9ad&jcid=38eb72d608d80c79","resultNumber":0,"salaryLocationName":"Austin, TX","salaryNoFollowLink":false,"salaryUrl":"/career/software-engineer/salaries/78704--TX","salaryUrlParams":"?campaignid=serp-more&fromjk=a22fa26470cfb9ad&from=serp-more","shortLocationName":"Austin, TX","showAcmeLink":true,"showAcmeQnaLink":true,"showViewAllCompanyAndLocationLinks":true,"showViewAllCompanyLink":true,"showViewAllLocationLink":true,"showViewAllNormalizedTitleLink":false,"viewAllCompanyLinkText":"John Deere jobs in Austin, 
TX","viewAllCompanyUrl":"/q-John-Deere-l-Austin,-TX-jobs.html","viewAllLocationUrl":"/l-Austin,-TX-jobs.html","visible":false},"moreLocUrl":"/jobs?q=python&l=Texas&jtid=b3a825820658bf92&jcid=38eb72d608d80c79&grp=tcl","mouseDownHandlerOption":{"adId":"","advn":"","extractTrackingUrls":[],"from":"vjs","jobKey":"a22fa26470cfb9ad","link":"/rc/clk?jk=a22fa26470cfb9ad&fccid=38eb72d608d80c79&vjs=3","tk":"1ge736cml2gra002"},"newJob":true,"normTitle":"Part Time Student Mlop Software Engineer Remote","openInterviewsInterviewsOnTheSpot":false,"openInterviewsJob":false,"openInterviewsOffersOnTheSpot":false,"openInterviewsPhoneJob":false,"overrideIndeedApplyText":true,"preciseLocationModel":{"obfuscateLocation":false,"overrideJCMPreciseLocationModel":true},"pubDate":1664427600000,"redirectToThirdPartySite":false,"remoteLocation":false,"remoteWorkModel":{"inlineText":true,"type":"REMOTE_ALWAYS"},"resumeMatch":false,"salarySnippet":{"salaryTextFormatted":false},"saved":false,"savedApplication":false,"screenerQuestionsURL":"","showCommutePromo":false,"showEarlyApply":false,"showJobType":false,"showRelativeDate":true,"showSponsoredLabel":false,"showStrongerAppliedLabel":false,"smartFillEnabled":false,"smbD2iEnabled":false,"snippet":"\u003Cul style=\"list-style-type:circle;margin-top: 0px;margin-bottom: 0px;padding-left:20px;\"\u003E \n \u003Cli style=\"margin-bottom:0px;\"\u003ETitle: Part-Time Student-MLOps Software Engineer-Remote - 91235.\u003C/li\u003E\n \u003Cli\u003EThe Part-Time Student Program is primarily designed to augment the Company’s regular full-time…\u003C/li\u003E\n\u003C/ul\u003E","sourceId":2775,"sponsored":false,"taxoAttributes":[],"taxoAttributesDisplayLimit":5,"taxoLogAttributes":[],"taxonomyAttributes":[{"attributes":[{"label":"Part-time","suid":"75GKK"}};" ...</script>
window.mosaic.providerData["mosaic-provider-jobcards"]
JSON on the page and parse its contents. We can do this with a regex command like: 'window.mosaic.providerData\["mosaic-provider-jobcards"\]=(\{.+?\});'
# Pull the JSON object assigned to the "mosaic-provider-jobcards" provider out of the HTML.
script_tag = re.findall(r'window.mosaic.providerData\["mosaic-provider-jobcards"\]=(\{.+?\});', html)

# re.findall returns an empty list (never None) when nothing matches,
# so check it before indexing.
if script_tag:
    json_blob = json.loads(script_tag[0])
    jobs_list = json_blob['metaData']['mosaicProviderJobCardsModel']['results']
import re
import json
import requests
from urllib.parse import urlencode


def get_indeed_search_url(keyword, location, offset=0):
    """Build the Indeed search URL for a keyword, a location and a pagination offset.

    The parameters are URL-encoded, so keywords/locations containing spaces
    or special characters are safe to pass in as-is.
    """
    parameters = {"q": keyword, "l": location, "filter": 0, "start": offset}
    return "https://www.indeed.com/jobs?" + urlencode(parameters)


headers = {"User-Agent": "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"}

job_id_list = []

## Job Search Parameters
keyword_list = ['software engineer']
location_list = ['California']

## Loop Through Indeed Pages Until No More Jobs
for keyword in keyword_list:
    for location in location_list:
        for offset in range(0, 1010, 10):
            try:
                indeed_jobs_url = get_indeed_search_url(keyword, location, offset)
                response = requests.get(indeed_jobs_url, headers=headers)

                if response.status_code == 200:
                    script_tag = re.findall(r'window.mosaic.providerData\["mosaic-provider-jobcards"\]=(\{.+?\});', response.text)

                    ## re.findall always returns a list (never None), so check that it is non-empty
                    if script_tag:
                        json_blob = json.loads(script_tag[0])
                        jobs_list = json_blob['metaData']['mosaicProviderJobCardsModel']['results']
                        for index, job in enumerate(jobs_list):
                            if job.get('jobkey') is not None:
                                job_id_list.append(job.get('jobkey'))

                        ## If response contains less than 10 jobs then stop pagination
                        if len(jobs_list) < 10:
                            break
                    else:
                        print('Error', 'job cards data not found in response')

            except Exception as e:
                print('Error', e)

print(job_id_list)
job_id_list
will look like this:[ "f6288f8af00406b1", "56ab4e4fe59ae782", "29bd7638828fab65", "697a7a3f18590465", "08e92505e27442d3", "105529f69e3fdae2", "cb6051b3810c1a1e", "27b6100b1824c062", "5ed01f909421e1dc", "51e6c0cb491f1eb8", "9c00bbf58c08a82b", "1744f664384cf3f9", "d07ab6e42b00c36a", "4afb447f36e4ca83", "a4c8a5cac15f5a54", "2c05f3937cde360a", "37bd28548878ed8f" ]
jobKey
from the jobs_list
:for index, job in enumerate(jobs_list): if job.get('jobkey') is not None: job_id_list.append(job.get('jobkey'))
## Build one summary record per job card that carries a jobkey
for index, job in enumerate(jobs_list):
    if job.get('jobkey') is not None:
        ## Look the salary estimate up once; it is missing on some job cards
        estimated_salary = job.get('estimatedSalary')
        jobs_data_list.append({
            'keyword': keyword,
            'location': location,
            ## offset advances in steps of 10, so offset 0 is page 1
            'page': offset // 10 + 1,
            'position': index,
            'company': job.get('company'),
            'companyRating': job.get('companyRating'),
            'companyReviewCount': job.get('companyReviewCount'),
            'highlyRatedEmployer': job.get('highlyRatedEmployer'),
            'jobkey': job.get('jobkey'),
            'jobTitle': job.get('title'),
            'jobLocationCity': job.get('jobLocationCity'),
            'jobLocationPostal': job.get('jobLocationPostal'),
            'jobLocationState': job.get('jobLocationState'),
            'maxSalary': estimated_salary.get('max') if estimated_salary is not None else 0,
            'minSalary': estimated_salary.get('min') if estimated_salary is not None else 0,
            ## NOTE(review): assumes estimatedSalary exposes a 'type' key (e.g. "YEARLY") -- confirm against a live response
            'salaryType': estimated_salary.get('type') if estimated_salary is not None else 'none',
            'pubDate': job.get('pubDate'),
        })
window._initialData={}
inside a <script>
tag in the HTML response it is pretty easy to extract the data.<script> ... window._initialData={"accountKey":null,"apiPaths":{},"appCommonData":null,"averageRatingsModel":null,"base64EncodedJson":"eyJjIjpmYWxzZSwiZSI6ZmFsc2UsImciOiJodHRwOi8vd3d3LmluZGVlZC5jb20vbS9iYXNlY2FtcC92aWV3am9iP3ZpZXd0eXBlPWVtYmVkZGVkJmprPWY2Mjg4ZjhhZjAwNDA2YjEifQ","baseInboxUrl":"https:\u002F\u002Finbox.indeed.com","baseUrl":"https:\u002F\u002Fwww.indeed.com","benefitsModel":{"benefits":[{"key":"FVKX2","label":"401(k)"},{"key":"SENX8","label":"401(k) matching"},{"key":"6XHWW","label":"Commuter assistance"},{"key":"YJ8XR","label":"Food provided"},{"key":"3K96F","label":"Free parking"},{"key":"TZV2T","label":"Gym membership"},{"key":"EY33Q","label":"Health insurance"},{"key":"Y2WS5","label":"Life insurance"},{"key":"HW4J4","label":"Paid time off"},{"key":"NPHPU","label":"Parental leave"},{"key":"6XT6J","label":"Stock options"}]},"callToInterviewButtonModel":null,"categorizedAttributesModel":null,"chatbotApplyButtonLinkModel":null,"clientsideProctorGroups":{"callToApplyStickySideBySide":true,"mobmapandcommutetimetst9":false,"showSalaryGuide":true,"desktopvj_stickyheader_tst":false,"callToApplyStickyBelowApplyNow":false,"mob_desktop_serp_tst":true,"callButtonPrimaryApplySecondary":false,"showInterviewCardBelowJobDesc":false},"cmiJobCategoryModel":null,"commuteInfoModel":null,"companyAvatarModel":null,"companyFollowFormModel":null,"companyTabModel":null,"contactPersonModel":null,"country":"US","cssResetProviders":{"mosaic-provider-reportcontent":false,"mosaic-provider-salary-feedback":true,"mosaic-provider-company-info-salary":true,"mosaic-provider-rich-media":false,"js-match-insights-provider":false,"MosaicProviderCallToApplyFeedback":true,"mosaic-provider-dislike-feedback":false},"ctk":"1ge4tueuklhdh800","dcmModel":{"category":"jobse0","source":"6927552","type":"organic"},"desktop":true,"desktopSponsoredJobSeenData":"tk=1ge7cg6jhkke6800","dgToken":"6987EEB4A2C6A193E5C44936188510EB","dislikeFro
m2paneEnabled":false,"downloadAppButtonModel":null,"employerResponsiveCardModel":null,"from":null,"globalnavFooterHTML":"","globalnavHeaderHTML":"","highQualityMarketplace":null,"hiringInsightsModel":{"age":"30+ days ago","employerLastReviewed":null,"employerResponsiveCardModel":null,"numOfCandidates":null,"postedToday":false,"recurringHireText":null,"urgentlyHiringModel":null},"indeedApplyButtonContainer":{"brandingText":null,"buttonClickUrl":null,"disabled":false,"employerResponsiveCard":null,"enableStickyInquiry":false,"enableStickyInquiryTooltip":false,"hasMessage":false,"indeedApplyAttributes":{"content":"data-indeed-apply-apiToken='f09f8f8add995328354a7a9e7a7fefdedfe230dee1f12eeb30c6e4b184f2dd9e' data-indeed-apply-jobTitle='Senior Full Stack Engineer (python)' data-indeed-apply-jobId='4931506003' data-indeed-apply-jobLocation='Austin, TX' data-indeed-apply-jobCompanyName='Bluevine' data-indeed-apply-jobUrl='https:\u002F\u002Fwww.indeed.com\u002Fviewjob?jk=f6288f8af00406b1' data-indeed-apply-questions='https:\u002F\u002Fapi.greenhouse.io\u002Fv1\u002Fboards\u002Fbluevine\u002Fjobs\u002F4931506003\u002Findeed' data-indeed-apply-postUrl='https:\u002F\u002Fapi.greenhouse.io\u002Fv1\u002Fboards\u002Fbluevine\u002Fjobs\u002F4931506003\u002Findeed' data-indeed-apply-name='firstlastname' data-indeed-apply-coverletter='optional' data-indeed-apply-phone='required' data-indeed-apply-resume='required' data-indeed-apply-noButtonUI='true' data-indeed-apply-pingbackUrl='https:\u002F\u002Fgdc.indeed.com\u002Fconv\u002ForgIndApp?co=US&vjtk=1ge7cg6jhkke6800&jk=f6288f8af00406b1&mvj=0&astse=b78e1ac815f57228&assa=2193' data-indeed-apply-onappliedstatus='_updateIndeedApplyStatus' data-indeed-apply-onready='_onButtonReady' data-indeed-apply-jk='f6288f8af00406b1' data-indeed-apply-onclose=\"indeedApplyHandleModalClose\" data-indeed-apply-onapplied=\"indeedApplyHandleApply\" data-indeed-apply-oncontinueclick=\"indeedApplyHandleModalClose\" 
data-indeed-apply-onClick=\"indeedApplyHandleButtonClick\" data-indeed-apply-returnToJobSearchUrl='' data-acc-payload=\"1,2,22,1,144,1,552,1,3648,1,4392,1\" data-indeed-apply-recentsearchquery='{\"what\":\"software engineer\",\"where\":\"California\"}'","contentKind":"ATTRIBUTES"},"indeedApplyBaseUrl":"https:\u002F\u002Fapply.indeed.com","indeedApplyBootStrapAttributes":{"hl":"en","source":"idd","co":"US","vjtk":"1ge7cg6jhkke6800"},"indeedApplyButtonAttributes":{"postUrl":"https:\u002F\u002Fapi.greenhouse.io\u002Fv1\u002Fboards\u002Fbluevine\u002Fjobs\u002F4931506003\u002Findeed","jk":"f6288f8af00406b1","onClick":"indeedApplyHandleButtonClick","jobTitle":"Senior Full Stack Engineer (python)","questions":"https:\u002F\u002Fapi.greenhouse.io\u002Fv1\u002Fboards\u002Fbluevine\u002Fjobs\u002F4931506003\u002Findeed","onappliedstatus":"_updateIndeedApplyStatus","jobCompanyName":"Bluevine","recentsearchquery":"{\"what\":\"software engineer\",\"where\":\"California\"}","onclose":"indeedApplyHandleModalClose","jobUrl":"https:\u002F\u002Fwww.indeed.com\u002Fviewjob?jk=f6288f8af00406b1","onready":"_onButtonReady","onapplied":"indeedApplyHandleApply","coverletter":"optional","resume":"required","pingbackUrl":"https:\u002F\u002Fgdc.indeed.com\u002Fconv\u002ForgIndApp?co=US&vjtk=1ge7cg6jhkke6800&jk=f6288f8af00406b1&mvj=0&astse=b78e1ac815f57228&assa=2193","noButtonUI":"true","jobId":"4931506003","apiToken":"f09f8f8add995328354a7a9e7a7fefdedfe230dee1f12eeb30c6e4b184f2dd9e","jobLocation":"Austin, TX","phone":"required","name":"firstlastname","oncontinueclick":"indeedApplyHandleModalClose","returnToJobSearchUrl":""},"indeedApplyButtonModel":{"applyBtnNewStyle":true,"buttonSize":"block","buttonType":"branded","contentHtml":"Apply 
now","dataHref":null,"disclaimer":null,"href":"\u002F","icon":null,"isBlock":false,"largeScreenSizeText":null,"openInNewTab":false,"referrerpolicy":null,"rel":null,"sanitizedHref":null,"sanitizedHtml":null,"sticky":false,"target":null,"title":null,"viewJobDisplay":null},"indeedApplyLoginModalModel":null,"indeedApplyScriptAttributes":{"data-indeed-apply-qs":"vjtk=1ge7cg6jhkke6800"},"indeedApplyScriptLocation":"https:\u002F\u002Fapply.indeed.com\u002Findeedapply\u002Fstatic\u002Fscripts\u002Fapp\u002Fbootstrap.js?hl=en&co=US&source=idd","shouldUseButtonPlaceholder":true,"stagingLevel":"prod","viewFormUrl":null,"viewFormUrlAttribute":{"content":"","contentKind":"ATTRIBUTES"}},"indeedLogoModel":null,"inlineJsErrEnabled":false,"isApp":false,"isApplyTextColorChanges":true,"isApplyTextSizeChanges":true,"isCriOS":false,"isDislikeFormV2Enabled":false,"isSafariForIOS":false,"isSalaryNewDesign":false,"isSyncJobs":false,"jasJobViewPingModel":null,"jasxInputWhatWhereActive":true,"jobAlertSignInModalModel":null,"jobAlertSignUp":null,"jobCardStyleModel":{"elementSpacingIncreased":false,"fontSizeEnlarged":false,"highContrastIconShown":false,"jobCardShelfApplied":false,"salaryBlack":false,"shouldMarkClickedJobAsVisited":false},"jobInfoWrapperModel":{"jobInfoModel":{"appliedStateBannerModel":null,"commuteInfoModel":null,"expiredJobMetadataModel":null,"hideCmpHeader":false,"isD2iEnabled":false,"isJsiEnabled":false,"jobAttributesTestValue":-1,"jobDebugInfoModel":null,"jobDescriptionSectionModel":null,"jobInfoHeaderModel":{"a11yNewtabIconActive":false,"averageRatingsModel":null,"companyImagesModel":{"ejiBannerAsBackground":false,"enhancedJobDescription":false,"featuredEmployer":false,"headerImageUrl":"https:\u002F\u002Fd2q79iu7y748jz.cloudfront.net\u002Fs\u002F_headerimage\u002F1960x400\u002Fae55ead6c2c0702692b9e43ac06f3277","logoAltText":"Bluevine 
logo","logoImageOverlayLower":false,"logoUrl":"https:\u002F\u002Fd2q79iu7y748jz.cloudfront.net\u002Fs\u002F_squarelogo\u002F256x256\u002Fc338b6786b5eadab2a1f404e10259004","showBannerTop":false,"showEnhancedJobImp":false,"showIconInTitle":false},"companyName":"Bluevine","companyOverviewLink":"https:\u002F\u002Fwww.indeed.com\u002Fcmp\u002FBluevine?campaignid=mobvjcmp&from=mobviewjob&tk=1ge7cg6jhkke6800&fromjk=f6288f8af00406b1","companyReviewLink":"https:\u002F\u002Fwww.indeed.com\u002Fcmp\u002FBluevine\u002Freviews?campaignid=mobvjcmp&cmpratingc=mobviewjob&from=mobviewjob&tk=1ge7cg6jhkke6800&fromjk=f6288f8af00406b1&jt=Senior+Full+Stack+Engineer+%28python%29","companyReviewModel":null,"disableAcmeLink":false,"employerActivity":null,"employerResponsiveCardModel":null,"encryptedFccCompanyId":null,"formattedLocation":"Austin, TX","hideRating":false,"isDesktopApplyButtonSticky":false,"isSimplifiedHeader":false,"jobNormTitle":null,"jobTitle":"Senior Full Stack Engineer (python)","jobTypes":null,"location":null,"openCompanyLinksInNewTab":false,"parentCompanyName":null,"preciseLocationModel":null,"ratingsModel":null,"recentSearch":null,"remoteWorkModel":{"inlineText":true,"text":"Hybrid remote","type":"REMOTE_HYBRID"},"salaryMax":null,"salaryMin":null,"salaryType":null,"subtitle":"Bluevine - Austin, TX","tagModels":null,"taxonomyAttributes":null,"viewJobDisplay":"DESKTOP_EMBEDDED"},"jobMetadataHeaderModel":{"jobType":null},"jobTagModel":null,"resumeEvaluationResult":null,"sanitizedJobDescription":{"content":"<div>\n <div>\n <p><b>About Bluevine<\u002Fb><\u002Fp> \n <p> Bluevine is on a mission to enable a better financial future for small business owners through innovative banking solutions designed just for them. 
By combining best-in-class technology with advanced security and a deep understanding of the small business community, we deliver end-to-end banking and lending products that empower always-on entrepreneurs to grow their businesses with confidence.<\u002Fp> \n <p> As a dynamic company with massive potential, we're backed by leading investors such as Lightspeed Venture Partners, Menlo Ventures, 83North, Citi Ventures, and nearly 9 years of proven success. Since launching in 2013, we have grown exponentially, amassing over 400,000 customers across all 50 states and a global team of more than 500 people. Our passion is driven by purpose: to give small businesses the tools they need to succeed and we're just getting started.<\u002Fp> \n <p> All of this begins with our team who are driven by collaboration, problem-solving, and learning and growing together. With a commitment to innovation and community impact, our mission is to help every small business—and every team member—thrive. Join us!<\u002Fp>\n <\u002Fdiv>\n <p><b><i> This is a hybrid role<\u002Fi><\u002Fb><i>. <\u002Fi>At Bluevine, we pride ourselves on our collaborative culture, which we believe is best maintained through in-person interactions and a vibrant office environment. All of our offices have reopened in accordance with local guidelines, and are following a hybrid model. In-office days will be determined by location and discipline.<\u002Fp> \n <p><b> ABOUT THE ROLE:<\u002Fb><\u002Fp> \n <p> We're looking for a Senior Full Stack Engineer flexible enough to develop features from the front (beautiful UX) to the back (scalable and robust components and integrations). 
If you're drawn to engineering challenges and have a strong desire to make a big impact as part of a small, agile team, in an exciting space, we'd love to talk to you.<\u002Fp> \n <p> The team serves a variety of stakeholders across all the business and the platform.<\u002Fp> \n <p><b> WHAT YOU'LL DO:<\u002Fb><\u002Fp> \n <ul> \n <li>Independently drive the engineering development of complex features<\u002Fli> \n <li>Design and build state-of-the-art responsive banking applications<\u002Fli> \n <li>Work closely with, and incorporate feedback from product managers and other stakeholders in the company<\u002Fli> \n <li>Be part of a fast-paced and highly-flexible team with the comfort of making decisions using your best judgement<\u002Fli> \n <li>Develop projects through their entire life cycle<\u002Fli> \n <\u002Ful> \n <p><b>WHAT WE LOOK FOR:<\u002Fb><\u002Fp> \n <ul> \n <li>5+ years of combined full stack experience experience building fast, reliable, web and\u002For mobile applications on applications with Python backends<\u002Fli> \n <li>Experience with Web frameworks (e.g., Angular, React, or Vue)<\u002Fli> \n <li>Experience with source control management systems, preferably Git<\u002Fli> \n <li>B.S. 
in Computer Science or a related field preferred<\u002Fli> \n <\u002Ful> \n <p><b>Nice to Haves<\u002Fb><\u002Fp> \n <ul> \n <li>Experience with AWS<\u002Fli> \n <li>Experience with mobile development (e.g., Native, Native Script, or React)<\u002Fli> \n <\u002Ful>\n <div>\n <div> \n <p><b>BENEFITS AND PERKS - for employees located in the US<\u002Fb><\u002Fp> \n <ul> \n <li>Excellent health coverage and life insurance benefits<\u002Fli> \n <li>401K with an immediate 3% company match<\u002Fli> \n <li>PTO, Company Holidays, and Flexible Holidays<\u002Fli> \n <li>Company-sponsored Mental Health Benefits, including 1:1 therapy<\u002Fli> \n <li>Over $1,000 annually for a wellness benefit of your choice<\u002Fli> \n <li>Monthly WFH stipend<\u002Fli> \n <li>Generous, paid parental leave covering up to 16 weeks<\u002Fli> \n <li>Access to financial coaches and education sessions<\u002Fli> \n <li>Free commuter benefits - Caltrain passes for San Francisco employees and a monthly parking allowance<\u002Fli> \n <li>Monthly DoorDash credit<\u002Fli> \n <li>Weekly catered lunches and fully stocked kitchen pantries<\u002Fli> \n <li>Dog-friendly Redwood City, CA office<\u002Fli> \n <li>Community-based volunteering opportunities<\u002Fli> \n <\u002Ful> \n <p><b>BENEFITS & PERKS - for employees located in Israel<\u002Fb><\u002Fp> \n <ul> \n <li>Excellent group health coverage and life insurance benefits<\u002Fli> \n <li>Stock options<\u002Fli> \n <li>Flexible hybrid work model<\u002Fli> \n <li>Large Study Fund contribution<\u002Fli> \n <li>Salary Benchmarks and Checkpoints<\u002Fli> \n <li>Monthly meal card of TenBis or CiBus (your choice) with generous balance<\u002Fli> \n <li>Free parking for cars, scooters, and bikes<\u002Fli> \n <li>Free gym membership<\u002Fli> \n <li>Company-sponsored Mental Health Benefits<\u002Fli> \n <li>PTO, Company Holidays, and Flexible Holidays<\u002Fli> \n <li>Community-based volunteering opportunities<\u002Fli>\n <\u002Ful>\n <\u002Fdiv>\n 
<\u002Fdiv>\n<\u002Fdiv>\n<div><\u002Fdiv>","contentKind":"HTML"},"screenerRequirementsModel":null,"showExpiredHeader":false,"tagModels":null,"viewJobDisplay":"DESKTOP_EMBEDDED"},"sectionedJobInfoModel":null},"jobKey":"f6288f8af00406b1","jobLocation":"Austin, TX","jobMetadataFooterModel":{"age":"30+ days ago","indeedApplyAdaNotice":"If you require alternative methods of application or screening, you must approach the employer directly to request this as Indeed is not responsible for the employer's application process.","originalJobLink":null,"phoneNumber":null,"saveJobLink":null,"showReportJobAsButton":true,"source":"BlueVine"},"jobSeenData":"tk=1ge7cg6jhkke6800&context=viewjobrecs","jobTitle":"Senior Full Stack Engineer (python)","language":"en","lastVisitTime":1664538063,"lazyProviders":{"mosaic-provider-reportcontent":"<div class=\"mosaic-reportcontent-wrapper button\"><style data-emotion=\"css 1686x4\">.css-1686x4{box-sizing:border-box;background:none;-webkit-appearance:none;-moz-appearance:none;-ms-appearance:none;appearance:none;text-align:left;-webkit-text-decoration:none;text-decoration:none;border:none;cursor:pointer;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;-webkit-align-items:center;-webkit-box-align:center;-ms-flex-align:center;align-items:center;-webkit-box-pack:center;-ms-flex-pack:center;-webkit-justify-content:center;justify-content:center;position:relative;margin:0;padding-left:1rem;padding-right:1rem;line-height:1.5;font-family:\"Noto Sans\",\"Helvetica Neue\",\"Helvetica\",\"Arial\",\"Liberation Sans\",\"Roboto\",\"Noto\",sans-serif;font-size:1rem;font-weight:700;border-radius:0.5rem;border-width:1px;border-style:solid;-webkit-transition:border-color 200ms cubic-bezier(0.645, 0.045, 0.355, 1),background-color 200ms cubic-bezier(0.645, 0.045, 0.355, 1),opacity 200ms cubic-bezier(0.645, 0.045, 0.355, 1),box-shadow 200ms cubic-bezier(0.645, 0.045, 0.355, 1),color 200ms cubic-bezier(0.645, 0.045, 0.355, 
1);transition:border-color 200ms cubic-bezier(0.645, 0.045, 0.355, 1),background-color 200ms cubic-bezier(0.645, 0.045, 0.355, 1),opacity 200ms cubic-bezier(0.645, 0.045, 0.355, 1),box-shadow 200ms cubic-bezier(0.645, 0.045, 0.355, 1),color 200ms cubic-bezier(0.645, 0.045, 0.355, 1);display:-webkit-inline-box;display:-webkit-inline-flex;display:-ms-inline-flexbox;display:inline-flex;width:auto;padding-top:0.5625rem;padding-bottom:0.5625rem;color:#2d2d2d;border-color:#e4e2e0;background-color:#e4e2e0;}.css-1686x4::-moz-focus-inner{border:0;}@media (prefers-reduced-motion: reduce){.css-1686x4{-webkit-transition:none;transition:none;}}.css-1686x4:disabled{opacity:0.4;pointer-events:none;}.css-1686x4:focus{outline:none;box-shadow:0 0 0 0.125rem #ffffff,0 0 0 0.1875rem #2557a7;}.css-1686x4:focus:not([data-focus-visible-added]){box-shadow:none;}.css-1686x4:visited{color:#2d2d2d;}.css-1686x4:hover{border-color:#d4d2d0;background-color:#d4d2d0;}.css-1686x4:active{box-shadow:inset 0 0.125rem 0.25rem rgba(45, 45, 45, 0.2),inset 0 0.0625rem 0.1875rem rgba(45, 45, 45, 0.12),inset 0 0 0.125rem rgba(45, 45, 45, 0.2);border-color:#b4b2b1;background-color:#b4b2b1;}<\u002Fstyle><button class=\"mosaic-reportcontent-button desktop css-1686x4 e8ju0x51\"><span class=\"mosaic-reportcontent-button-icon\"><\u002Fspan>Report job<\u002Fbutton><div class=\"mosaic-reportcontent-content\"><\u002Fdiv><\u002Fdiv>","mosaic-provider-salary-feedback":"","mosaic-provider-company-info-salary":"","mosaic-provider-rich-media":"","js-match-insights-provider":"","MosaicProviderCallToApplyFeedback":"","mosaic-provider-dislike-feedback":"<div class=\"animatedToast i-unmask\"><div class=\"\"><\u002Fdiv><\u002Fdiv>"},"locale":"en_US","localeData":{"":[null,"Project-Id-Version: \nReport-Msgid-Bugs-To: \nPOT-Creation-Date: 2022-09-27 23:51-0500\nPO-Revision-Date: 2021-08-06 19:00+0000\nLast-Translator: Auto Generated <noreply@indeed.com>\nLanguage-Team: English (United States) 
<https:\u002F\u002Fweblate.corp.indeed.com\u002Fprojects\u002Findeed\u002Findeedmobile-i18n-content\u002Fen_US\u002F>\nLanguage: en_US\nMIME-Version: 1.0\nContent-Type: text\u002Fplain; charset=UTF-8\nContent-Transfer-Encoding: 8bit\nPlural-Forms: nplurals=2; plural=n != 1;\nX-Generator: Weblate 3.9.1\n"],"\"Interview times available\" card content\u0004If your application meets the employer's criteria, you may be able to book a call or request an interview that suits your schedule.":[null,"If your application meets the employer's criteria, you can directly provide your availability for a video interview."]},"loggedIn":false,"mobResourceTimingEnabled":false,"mobileGlobalHeader":null,"mobtk":"1ge7cg6jhkke6800","mosaicData":null,"originalJobLinkModel":null,"pageId":"viewjob","parenttk":null,"phoneLinkType":null,"phoneNumberButtonLinkModel":null,"preciseLocationModel":null,"profileBaseUrl":"https:\u002F\u002Fprofile.indeed.com","queryString":null,"recentQueryString":"q1=software engineer&l1=california&r1=-1&q2=python&l2=texas&r2=-1","relatedLinks":null,"resumeFooterModel":{"buttonLink":{"applyBtnNewStyle":false,"buttonSize":"md","buttonType":"primary","contentHtml":"Upload Your Resume","dataHref":"\u002Fpromo\u002Fresume?from=bottomResumeCTAviewjob&trk.origin=viewjob","disclaimer":null,"href":"\u002Fpromo\u002Fresume","icon":null,"isBlock":false,"largeScreenSizeText":null,"openInNewTab":false,"referrerpolicy":null,"rel":null,"sanitizedHref":null,"sanitizedHtml":null,"sticky":false,"target":null,"title":null,"viewJobDisplay":null},"isJanusActive":true,"letEmployersFindText":"Let Employers Find You"},"resumePromoCardModel":null,"rtl":false,"salaryGuideModel":{"acmeMicrocontentEndpoint":"https:\u002F\u002Fcocos-api.indeed.com","country":"US","estimatedSalaryModel":{"formattedRange":"$120K - $152K a year","max":151976.5,"min":120023.5,"type":"YEARLY"},"formattedLocation":"Austin, 
TX","jobKey":"f6288f8af00406b1","language":"en"},"salaryInfoModel":null,"saveJobButtonContainerModel":{"alreadySavedButtonModel":{"actions":["Saved","Applied","Interviewing","Offered","Hired"],"buttonSize":"block","buttonType":"secondary","contentHtml":"Saved","href":"\u002F","iconSize":null},"applyFromComputerLogUrl":"\u002Fm\u002Frpc\u002Flog\u002Femailmyself?jk=f6288f8af00406b1&mobvjtk=1ge7cg6jhkke6800&sbt=121f10e71cf3df2d415dae11933eb9ce&ctk=1ge4tueuklhdh800&acctKey=","currentJobState":"VISITED","didYouApplyPromptModel":{"calloutModel":{"actionsList":null,"actionsMap":{"NO":{"children":"Not interested","className":null,"href":null,"target":null},"LATER":{"children":"Maybe later","className":null,"href":null,"target":null},"YES":{"children":"Yes","className":null,"href":null,"target":null}},"caretPosition":null,"children":null,"dismissAriaLabel":"Close","dismissAttributes":null,"dismissHref":null,"heading":"Did you apply?"},"jobKey":"f6288f8af00406b1","possibleResponses":{"NO":"NO","LATER":"LATER","YES":"YES"},"userCanView":false},"didYouApplyResponseUrl":"\u002Fm\u002Frpc\u002Fdidyouapply?tk=1ge7cg6jhkke6800&jobKey=f6288f8af00406b1&originPage=viewjob&from=viewjob","hashedCSRFToken":"121f10e71cf3df2d415dae11933eb9ce","isAlreadySavedButtonVisible":false,"isDisableJobStatusChange":false,"isLoggedIn":false,"isSaveWithoutLoginEnabled":false,"isSticky":false,"isSyncJobs":false,"mobtk":"1ge7cg6jhkke6800","myIndeedLoginLink":"https:\u002F\u002Fwww.indeed.com\u002Faccount\u002Flogin?dest=%2Fm%2Fbasecamp%2Fviewjob%3Fviewtype%3Dembedded%26jk%3Df6288f8af00406b1&from=jsfe-desktopembedded-save-indeedmobile","myJobsAPIHref":"\u002Frpc\u002Flog\u002Fmyjobs\u002Ftransition_job_state?client=mobile&cause=statepicker&preserveTimestamp=false&tk=1ge7cg6jhkke6800&jobKey=f6288f8af00406b1&originPage=viewjob","myJobsURL":"https:\u002F\u002Fmyjobs.indeed.com?co=US&hl=en_US&from=viewjob","pageId":"viewjob","possibleJobActions":{"SAVED":"save","APPLIED":"apply","INTERVIEWING":"interview","O
FFERED":"offer","HIRED":"hire","VISITED":"visit","ARCHIVED":"archive"},"possibleJobStates":{"SAVED":"Saved","APPLIED":"Applied","INTERVIEWING":"Interviewing","OFFERED":"Offered","HIRED":"Hired","VISITED":"Visited","ARCHIVED":"Archived"},"saveButtonModel":{"applyBtnNewStyle":false,"buttonSize":"block","buttonType":"secondary","contentHtml":"","dataHref":null,"disclaimer":null,"href":"\u002F","icon":{"iconTitle":"save-icon","iconType":"favorite-border"},"isBlock":false,"largeScreenSizeText":null,"openInNewTab":false,"referrerpolicy":null,"rel":null,"sanitizedHref":null,"sanitizedHtml":null,"sticky":false,"target":null,"title":null,"viewJobDisplay":"DESKTOP_EMBEDDED"},"showSaveJobInlineCallout":true,"uistates":{"INTERVIEWING":"INTERVIEWING","OFFERED":"OFFERED","SAVED":"SAVED","VISITED":"VISITED","HIRED":"HIRED","ARCHIVED":"ARCHIVED","APPLIED":"APPLIED"},"viewJobDisplay":"DESKTOP_EMBEDDED"},"saveJobCalloutModel":{"actionsList":null,"actionsMap":{"createaccount":{"children":"Create account (it's free)","className":null,"href":"https:\u002F\u002Fwww.indeed.com\u002Faccount\u002Fregister?dest=%2Fm%2Fbasecamp%2Fviewjob%3Fviewtype%3Dembedded%26jk%3Df6288f8af00406b1","target":"_PARENT"},"signin":{"children":"Sign in","className":null,"href":"https:\u002F\u002Fwww.indeed.com\u002Faccount\u002Flogin?dest=%2Fm%2Fbasecamp%2Fviewjob%3Fviewtype%3Dembedded%26jk%3Df6288f8af00406b1","target":"_PARENT"}},"caretPosition":null,"children":"You must sign in to save jobs:","dismissAriaLabel":"Close","dismissAttributes":null,"dismissHref":null,"heading":"Save jobs and view them from any computer."},"saveJobFailureModalModel":{"closeAriaLabel":"Close","closeButtonText":"Close","message":"Please retry","signInButtonText":null,"signInHref":null,"title":"Failed to Save Job"},"saveJobLimitExceededModalModel":{"closeAriaLabel":"Close","closeButtonText":null,"message":"You reached the limit. 
Please log in to save additional jobs.","signInButtonText":"Sign in","signInHref":"https:\u002F\u002Fwww.indeed.com\u002Faccount\u002Flogin?dest=%2Fm%2Fbasecamp%2Fviewjob%3Fviewtype%3Dembedded%26jk%3Df6288f8af00406b1&from=viewjob_savejoblimitmodal","title":"You've already saved 20 jobs"},"segmentId":"software_dev_seo","segmentPhoneNumberButtonLinkModel":null,"shareJobButtonContainerModel":{"buttonIconModel":{"color":"blue","position":null,"size":"md","title":"Share this job","type":"\u002Fm\u002Fimages\u002Fnativeshare.svg"},"buttonModel":{"buttonSize":null,"buttonType":"secondary","children":"Share this job","disabled":false,"href":null,"isActive":false,"isBlock":false,"isResponsive":false,"size":"md"},"fallbackButtonIconModel":{"color":"blue","position":null,"size":"md","title":"Copy link","type":"\u002Fm\u002Fimages\u002Ficon-copy.svg"},"shareText":"Check out this job on Indeed:\nBluevine\nSenior Full Stack Engineer (python)\nAustin, TX\nhttps:\u002F\u002Fwww.indeed.com\u002Fm\u002Fviewjob?jk=f6288f8af00406b1&from=native","shareType":"native","shareUrl":"https:\u002F\u002Fwww.indeed.com\u002Fm\u002Fviewjob?jk=f6288f8af00406b1&from=native","showUnderSaveButton":true},"shouldLogResolution":true,"showEmployerResponsiveCard":false,"showGlobalNavContent":false,"showReportInJobButtons":false,"sponsored":false,"sponsoredAdsContainerModel":null,"sponsoredJobs":null,"staticPrefix":"\u002F\u002Fd3fw5vlhllyvee.cloudfront.net\u002Fm\u002Fs\u002F","stickyType":"ALWAYS","successfullySignedInModel":null,"viewJobButtonLinkContainerModel":null,"viewJobDisplay":"DESKTOP_EMBEDDED","viewJobDisplayParam":"dtembd","viewjobDislikes":false,"whatWhereFormModel":null,"zoneProviders":{"aboveViewjobButtons":[],"viewjobModals":["MosaicProviderCallToApplyFeedback"],"aboveExtractedJobDescription":[],"aboveFullJobDescription":["mosaic-provider-company-info-salary"],"rightRail":[],"legacyProvidersViewJob":["mosaic-provider-reportcontent"],"betweenJobDescriptionAndButtons":[],"ssrVJModals":[],"be
lowJobDescription":[],"belowFullJobDescription":[],"belowViewjobButtons":["mosaic-provider-dislike-feedback","mosaic-provider-salary-feedback"],"belowViewjobNav":[]}};</script>
'window._initialData=(\{.+?\});'
## Capture the JSON object assigned to window._initialData in the page HTML
script_tag = re.findall(r"_initialData=(\{.+?\});", html)

## Parse the first captured blob and drill down to the job details
json_blob = json.loads(script_tag[0])
job = json_blob["jobInfoWrapperModel"]["jobInfoModel"]
import re
import json
import requests
from urllib.parse import urlencode

headers = {"User-Agent": "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"}

# Job keys collected from the search-results scraper.
job_id_list = [
    'f6288f8af00406b1',
    '56ab4e4fe59ae782',
    '29bd7638828fab65',
    '697a7a3f18590465',
    '08e92505e27442d3',
    '105529f69e3fdae2',
]

full_job_data_list = []

for job_id in job_id_list:
    try:
        indeed_job_url = 'https://www.indeed.com/m/basecamp/viewjob?viewtype=embedded&jk=' + job_id
        response = requests.get(indeed_job_url, headers=headers)

        if response.status_code == 200:
            script_tag = re.findall(r"_initialData=(\{.+?\});", response.text)
            # re.findall returns a list and never None, so test for emptiness;
            # an empty list means the page carried no _initialData blob.
            if script_tag:
                json_blob = json.loads(script_tag[0])
                job = json_blob["jobInfoWrapperModel"]["jobInfoModel"]
                header_model = job.get('jobInfoHeaderModel')
                description_model = job.get('sanitizedJobDescription')
                full_job_data_list.append({
                    'company': header_model.get('companyName') if header_model is not None else '',
                    'jobkey': job_id,
                    'jobTitle': header_model.get('jobTitle') if header_model is not None else '',
                    'jobDescription': description_model.get('content') if description_model is not None else '',
                })

    except Exception as e:
        # Best effort: report the failure and move on to the next job key.
        print('Error', e)

print(full_job_data_list)
[{ "company": "Bluevine", "jobkey": "f6288f8af00406b1", "jobTitle": "Senior Full Stack Engineer (python)", "jobDescription": "<div>\n <div>\n <p><b>About Bluevine</b></p> \n <p> Bluevine is on a mission to enable a better financial future for small business owners through innovative banking solutions designed just for them. By combining best-in-class technology with advanced security and a deep understanding of the small business community, we deliver end-to-end banking and lending products that empower always-on entrepreneurs to grow their businesses with confidence.</p> \n <p> As a dynamic company with massive potential, we're backed by leading investors such as Lightspeed Venture Partners, Menlo Ventures, 83North, Citi Ventures, and nearly 9 years of proven success. Since launching in 2013, we have grown exponentially, amassing over 400,000 customers across all 50 states and a global team of more than 500 people. Our passion is driven by purpose: to give small businesses the tools they need to succeed and we're just getting started.</p> \n <p> All of this begins with our team who are driven by collaboration, problem-solving, and learning and growing together. With a commitment to innovation and community impact, our mission is to help every small business—and every team member—thrive. Join us!</p>\n </div>\n <p><b><i> This is a hybrid role</i></b><i>. </i>At Bluevine, we pride ourselves on our collaborative culture, which we believe is best maintained through in-person interactions and a vibrant office environment. All of our offices have reopened in accordance with local guidelines, and are following a hybrid model. In-office days will be determined by location and discipline.</p> \n <p><b> ABOUT THE ROLE:</b></p> \n <p> We're looking for a Senior Full Stack Engineer flexible enough to develop features from the front (beautiful UX) to the back (scalable and robust components and integrations). 
If you're drawn to engineering challenges and have a strong desire to make a big impact as part of a small, agile team, in an exciting space, we'd love to talk to you.</p> \n <p> The team serves a variety of stakeholders across all the business and the platform.</p> \n <p><b> WHAT YOU'LL DO:</b></p> \n <ul> \n <li>Independently drive the engineering development of complex features</li> \n <li>Design and build state-of-the-art responsive banking applications</li> \n <li>Work closely with, and incorporate feedback from product managers and other stakeholders in the company</li> \n <li>Be part of a fast-paced and highly-flexible team with the comfort of making decisions using your best judgement</li> \n <li>Develop projects through their entire life cycle</li> \n </ul> \n <p><b>WHAT WE LOOK FOR:</b></p> \n <ul> \n <li>5+ years of combined full stack experience experience building fast, reliable, web and/or mobile applications on applications with Python backends</li> \n <li>Experience with Web frameworks (e.g., Angular, React, or Vue)</li> \n <li>Experience with source control management systems, preferably Git</li> \n <li>B.S. 
in Computer Science or a related field preferred</li> \n </ul> \n <p><b>Nice to Haves</b></p> \n <ul> \n <li>Experience with AWS</li> \n <li>Experience with mobile development (e.g., Native, Native Script, or React)</li> \n </ul>\n <div>\n <div> \n <p><b>BENEFITS AND PERKS - for employees located in the US</b></p> \n <ul> \n <li>Excellent health coverage and life insurance benefits</li> \n <li>401K with an immediate 3% company match</li> \n <li>PTO, Company Holidays, and Flexible Holidays</li> \n <li>Company-sponsored Mental Health Benefits, including 1:1 therapy</li> \n <li>Over $1,000 annually for a wellness benefit of your choice</li> \n <li>Monthly WFH stipend</li> \n <li>Generous, paid parental leave covering up to 16 weeks</li> \n <li>Access to financial coaches and education sessions</li> \n <li>Free commuter benefits - Caltrain passes for San Francisco employees and a monthly parking allowance</li> \n <li>Monthly DoorDash credit</li> \n <li>Weekly catered lunches and fully stocked kitchen pantries</li> \n <li>Dog-friendly Redwood City, CA office</li> \n <li>Community-based volunteering opportunities</li> \n </ul> \n <p><b>BENEFITS & PERKS - for employees located in Israel</b></p> \n <ul> \n <li>Excellent group health coverage and life insurance benefits</li> \n <li>Stock options</li> \n <li>Flexible hybrid work model</li> \n <li>Large Study Fund contribution</li> \n <li>Salary Benchmarks and Checkpoints</li> \n <li>Monthly meal card of TenBis or CiBus (your choice) with generous balance</li> \n <li>Free parking for cars, scooters, and bikes</li> \n <li>Free gym membership</li> \n <li>Company-sponsored Mental Health Benefits</li> \n <li>PTO, Company Holidays, and Flexible Holidays</li> \n <li>Community-based volunteering opportunities</li>\n </ul>\n </div>\n </div>\n</div>\n<div></div>" }, ]
import re
import json
import requests
from urllib.parse import urlencode


def get_indeed_search_url(keyword, location, offset=0):
    """Build the Indeed search URL for a keyword, location and result offset."""
    parameters = {"q": keyword, "l": location, "filter": 0, "start": offset}
    return "https://www.indeed.com/jobs?" + urlencode(parameters)


headers = {"User-Agent": "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"}

jobs_data_list = []

## Job Search Parameters
keyword_list = ['software engineer']
location_list = ['California']

## Loop Through Indeed Pages Until No More Jobs
for keyword in keyword_list:
    for location in location_list:
        for offset in range(0, 1010, 10):
            try:
                indeed_jobs_url = get_indeed_search_url(keyword, location, offset)
                response = requests.get(indeed_jobs_url, headers=headers)

                if response.status_code == 200:
                    script_tag = re.findall(r'window.mosaic.providerData\["mosaic-provider-jobcards"\]=(\{.+?\});', response.text)
                    # re.findall returns a list (never None); empty means no job-card blob.
                    if script_tag:
                        json_blob = json.loads(script_tag[0])
                        jobs_list = json_blob['metaData']['mosaicProviderJobCardsModel']['results']
                        for index, job in enumerate(jobs_list):
                            if job.get('jobkey') is not None:
                                estimated_salary = job.get('estimatedSalary')
                                jobs_data_list.append({
                                    'keyword': keyword,
                                    'location': location,
                                    'page': offset // 10 + 1,
                                    'position': index,
                                    'company': job.get('company'),
                                    'companyRating': job.get('companyRating'),
                                    'companyReviewCount': job.get('companyReviewCount'),
                                    'highlyRatedEmployer': job.get('highlyRatedEmployer'),
                                    'jobkey': job.get('jobkey'),
                                    'jobTitle': job.get('title'),
                                    'jobLocationCity': job.get('jobLocationCity'),
                                    'jobLocationPostal': job.get('jobLocationPostal'),
                                    'jobLocationState': job.get('jobLocationState'),
                                    'maxSalary': estimated_salary.get('max') if estimated_salary is not None else 0,
                                    'minSalary': estimated_salary.get('min') if estimated_salary is not None else 0,
                                    # The salary period comes from 'type', not 'max'.
                                    'salaryType': estimated_salary.get('type') if estimated_salary is not None else 'none',
                                    'pubDate': job.get('pubDate'),
                                })

                        ## If response contains less than 10 jobs then stop pagination
                        # Compare this page's results, not the cumulative list,
                        # otherwise the loop never stops after the first full page.
                        if len(jobs_list) < 10:
                            break

            except Exception as e:
                print('Error', e)

print(jobs_data_list)
ScrapingClient
# works and can be integrated. However, here is the final code:
import time
import requests
import concurrent.futures
from urllib.parse import urlencode
from scrapeops_python_requests.scrapeops_requests import ScrapeOpsRequests


class ScrapingClient:
    """HTTP client that can route requests through the ScrapeOps proxy,
    ship logs to the ScrapeOps monitor, retry failed responses and fan
    work out over a thread pool."""

    def __init__(self,
                 scrapeops_api_key=None,
                 scrapeops_proxy_enabled=True,
                 scrapeops_monitoring_enabled=True,
                 scrapeops_proxy_settings=None,
                 spider_name=None,
                 job_name=None,
                 num_concurrent_threads=1,
                 num_retries=5,
                 http_allow_list=None,
                 ):
        """Store the configuration and start the monitor.

        scrapeops_proxy_settings defaults to an empty dict and
        http_allow_list to [200, 404]; fresh objects are created per
        instance so instances never share a mutable default.
        """
        if scrapeops_proxy_settings is None:
            scrapeops_proxy_settings = {}
        if http_allow_list is None:
            http_allow_list = [200, 404]
        self.scrapeops_api_key = scrapeops_api_key
        self.scrapeops_proxy_settings = scrapeops_proxy_settings
        self.scrapeops_proxy_enabled = scrapeops_proxy_enabled
        self.scrapeops_monitoring_enabled = scrapeops_monitoring_enabled
        self.num_concurrent_threads = num_concurrent_threads
        self.num_retries = num_retries
        self.http_allow_list = http_allow_list
        self.spider_name = spider_name
        self.job_name = job_name
        self.sops_request_wrapper = None
        self.start_scrapeops_monitor()

    def start_scrapeops_monitor(self):
        """
        Starts the ScrapeOps monitor, which ships logs to dashboard.
        Falls back to plain `requests` when monitoring is disabled, no API
        key is set, or the monitor cannot be created.
        """
        if self.scrapeops_monitoring_enabled and self.scrapeops_api_key is not None:
            try:
                self.scrapeops_logger = ScrapeOpsRequests(
                    scrapeops_api_key=self.scrapeops_api_key,
                    spider_name=self.spider_name,
                    job_name=self.job_name,
                )
                self.sops_request_wrapper = self.scrapeops_logger.RequestsWrapper()
            except Exception as e:
                print('monitioring error', e)
                # Without this fallback the wrapper stays None and every
                # later request fails with an AttributeError.
                self.sops_request_wrapper = requests
        else:
            self.sops_request_wrapper = requests

    def scrapeops_proxy_url(self, url, scrapeops_proxy_settings=None):
        """
        Converts URL into ScrapeOps Proxy API Aggregator URL.
        Per-request settings override the client-wide settings.
        """
        payload = {'api_key': self.scrapeops_api_key, 'url': url}

        ## Global Proxy Settings
        if isinstance(self.scrapeops_proxy_settings, dict):
            payload.update(self.scrapeops_proxy_settings)

        ## Per Request Proxy Settings
        # Apply the argument itself, not the global settings a second time.
        if isinstance(scrapeops_proxy_settings, dict):
            payload.update(scrapeops_proxy_settings)

        proxy_url = 'https://proxy.scrapeops.io/v1/?' + urlencode(payload)
        return proxy_url

    def send_request(self, url, method='GET', scrapeops_proxy_settings=None, **kwargs):
        """
        Sends HTTP request and retries failed responses.
        Returns the response once its status code is in http_allow_list,
        or None when every attempt fails.
        """
        final_url = url
        try:
            if self.scrapeops_proxy_enabled and self.scrapeops_api_key is not None:
                final_url = self.scrapeops_proxy_url(url, scrapeops_proxy_settings)
            # Resolve the verb on the wrapper (get/post/...), so `method` is
            # honoured; the default 'GET' resolves to .get as before.
            send = getattr(self.sops_request_wrapper, method.lower())
            for _ in range(self.num_retries):
                try:
                    response = send(final_url, **kwargs)
                    if response.status_code in self.http_allow_list:
                        return response
                except Exception as e:
                    print('Request error:', e)
            return None
        except Exception as e:
            print('Overall error', e)
            return None

    def concurrent_requests(self, function, input_list):
        """
        Enables requests to be sent in parallel, calling `function` once
        per item of `input_list` on num_concurrent_threads workers.
        """
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.num_concurrent_threads) as executor:
            executor.map(function, input_list)
ScrapingClient
# into our Indeed Jobs Scraper.
import re
import json
import requests
from urllib.parse import urlencode


def get_indeed_search_url(keyword, location, offset=0):
    """Build the Indeed search URL for a keyword, location and result offset."""
    parameters = {"q": keyword, "l": location, "filter": 0, "start": offset}
    return "https://www.indeed.com/jobs?" + urlencode(parameters)


headers = {"User-Agent": "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"}

jobs_data_list = []

## Job Search Parameters
keyword_list = ['software engineer']
location_list = ['California']

## Loop Through Indeed Pages Until No More Jobs
for keyword in keyword_list:
    for location in location_list:
        for offset in range(0, 1010, 10):
            try:
                indeed_jobs_url = get_indeed_search_url(keyword, location, offset)
                response = requests.get(indeed_jobs_url, headers=headers)

                if response.status_code == 200:
                    script_tag = re.findall(r'window.mosaic.providerData\["mosaic-provider-jobcards"\]=(\{.+?\});', response.text)
                    # re.findall returns a list (never None); empty means no job-card blob.
                    if script_tag:
                        json_blob = json.loads(script_tag[0])
                        jobs_list = json_blob['metaData']['mosaicProviderJobCardsModel']['results']
                        for index, job in enumerate(jobs_list):
                            if job.get('jobkey') is not None:
                                estimated_salary = job.get('estimatedSalary')
                                jobs_data_list.append({
                                    'keyword': keyword,
                                    'location': location,
                                    'page': offset // 10 + 1,
                                    'position': index,
                                    'company': job.get('company'),
                                    'companyRating': job.get('companyRating'),
                                    'companyReviewCount': job.get('companyReviewCount'),
                                    'highlyRatedEmployer': job.get('highlyRatedEmployer'),
                                    'jobkey': job.get('jobkey'),
                                    'jobTitle': job.get('title'),
                                    'jobLocationCity': job.get('jobLocationCity'),
                                    'jobLocationPostal': job.get('jobLocationPostal'),
                                    'jobLocationState': job.get('jobLocationState'),
                                    'maxSalary': estimated_salary.get('max') if estimated_salary is not None else 0,
                                    'minSalary': estimated_salary.get('min') if estimated_salary is not None else 0,
                                    # The salary period comes from 'type', not 'max'.
                                    'salaryType': estimated_salary.get('type') if estimated_salary is not None else 'none',
                                    'pubDate': job.get('pubDate'),
                                })

                        ## If response contains less than 10 jobs then stop pagination
                        # Compare this page's results, not the cumulative list,
                        # otherwise the loop never stops after the first full page.
                        if len(jobs_list) < 10:
                            break

            except Exception as e:
                print('Error', e)

print(jobs_data_list)
import requests
from urllib.parse import urlencode

SCRAPEOPS_API_KEY = 'YOUR_API_KEY'


def scrapeops_url(url):
    """Wrap a target URL in a ScrapeOps proxy URL.

    urlencode() percent-encodes the target URL, so `url` must be passed in
    its plain (not pre-encoded) form.
    """
    payload = {'api_key': SCRAPEOPS_API_KEY, 'url': url, 'country': 'us'}
    proxy_url = 'https://proxy.scrapeops.io/v1/?' + urlencode(payload)
    return proxy_url


# Pass the plain URL: a pre-encoded one would be encoded a second time by
# urlencode() and the proxy would receive an unusable target.
indeed_search_url = 'https://www.indeed.com/jobs?q=software+engineer&l=San+Francisco&start=0&filter=0'

## Send URL To ScrapeOps Instead of Indeed
response = requests.get(scrapeops_url(indeed_search_url))
ScrapingClient
found above. Here is our updated Indeed Job Scraper using the ScrapeOps Proxy via the ScrapingClient
import re
import json
import requests
from urllib.parse import urlencode

SCRAPEOPS_API_KEY = 'YOUR_API_KEY'


def scrapeops_url(url):
    """Wrap a plain (not pre-encoded) target URL in a ScrapeOps proxy URL."""
    payload = {'api_key': SCRAPEOPS_API_KEY, 'url': url, 'country': 'us'}
    return 'https://proxy.scrapeops.io/v1/?' + urlencode(payload)


job_id_list = [
    'f6288f8af00406b1',
    '56ab4e4fe59ae782',
    '29bd7638828fab65',
    '697a7a3f18590465',
    '08e92505e27442d3',
    '105529f69e3fdae2',
]

full_job_data_list = []

for job_id in job_id_list:
    try:
        indeed_job_url = 'https://www.indeed.com/m/basecamp/viewjob?viewtype=embedded&jk=' + job_id
        # Route the request through the proxy instead of hitting Indeed directly.
        response = requests.get(scrapeops_url(indeed_job_url))

        if response.status_code == 200:
            script_tag = re.findall(r"_initialData=(\{.+?\});", response.text)
            # re.findall returns a list and never None, so test for emptiness.
            if script_tag:
                json_blob = json.loads(script_tag[0])
                job = json_blob["jobInfoWrapperModel"]["jobInfoModel"]
                header_model = job.get('jobInfoHeaderModel')
                description_model = job.get('sanitizedJobDescription')
                full_job_data_list.append({
                    'company': header_model.get('companyName') if header_model is not None else '',
                    'jobkey': job_id,
                    'jobTitle': header_model.get('jobTitle') if header_model is not None else '',
                    'jobDescription': description_model.get('content') if description_model is not None else '',
                })

    except Exception as e:
        print('Error', e)

# A bare expression only displays in a notebook; print so scripts show it too.
print(full_job_data_list)
Then check out ScrapeOps, the complete toolkit for web scraping.
config.json
file with your ScrapeOps API key.python name_of_your_script.py
.import osimport csvimport jsonimport loggingfrom urllib.parse import urlencodefrom dataclasses import dataclass, field, fields, asdictfrom selenium import webdriverfrom selenium.webdriver.chrome.service import Service as ChromeServicefrom selenium.webdriver.chrome.options import Optionsfrom webdriver_manager.chrome import ChromeDriverManagerfrom selenium.webdriver.common.by import Byfrom selenium.common.exceptions import NoSuchElementExceptionimport concurrent.futuresimport time API_KEY = "Add-Your-Api-key-here" with open("config.json", "r") as config_file: config = json.load(config_file) API_KEY = config["api_key"] def get_scrapeops_url(url, location="us"): payload = { "api_key": API_KEY, "url": url, "country": location, "residential": True } proxy_url = "https://proxy.scrapeops.io/v1/?" + urlencode(payload) return proxy_url # Logginglogging.basicConfig(level=logging.INFO)logger = logging.getLogger(__name__) @dataclassclass SearchData: name: str = "" url: str = "" stars: float = None company_name: str = "" location: str = "" def __post_init__(self): self.check_string_fields() def check_string_fields(self): for field in fields(self): if isinstance(getattr(self, field.name), str): if getattr(self, field.name) == "": setattr(self, field.name, f"No {field.name}") continue value = getattr(self, field.name) setattr(self, field.name, value.strip()) @dataclassclass JobData: name: str salary: str description: str benefits: str class DataPipeline: def __init__(self, csv_filename="", storage_queue_limit=50): self.names_seen = [] self.storage_queue = [] self.storage_queue_limit = storage_queue_limit self.csv_filename = csv_filename self.csv_file_open = False def save_to_csv(self): self.csv_file_open = True data_to_save = [] data_to_save.extend(self.storage_queue) self.storage_queue.clear() if not data_to_save: return keys = [field.name for field in fields(data_to_save[0])] file_exists = os.path.isfile(self.csv_filename) and os.path.getsize(self.csv_filename) > 0 with 
open(self.csv_filename, mode="a", newline="", encoding="utf-8") as output_file: writer = csv.DictWriter(output_file, fieldnames=keys) if not file_exists: writer.writeheader() for item in data_to_save: writer.writerow(asdict(item)) self.csv_file_open = False def is_duplicate(self, input_data): if input_data.name in self.names_seen: logger.warning(f"Duplicate item found: {input_data.name}. Item dropped.") return True self.names_seen.append(input_data.name) return False def add_data(self, scraped_data): if self.is_duplicate(scraped_data) == False: self.storage_queue.append(scraped_data) if len(self.storage_queue) >= self.storage_queue_limit and self.csv_file_open == False: self.save_to_csv() def close_pipeline(self): if self.csv_file_open: time.sleep(3) if len(self.storage_queue) > 0: self.save_to_csv() def scrape_search_results(keyword, location, locality, page_number, data_pipeline=None, retries=3): formatted_keyword = keyword.replace(" ", "+") formatted_location = locality.replace(" ", "+") url = f"https://www.indeed.com/jobs?q={formatted_keyword}&l={formatted_location}&start={page_number * 10}" tries = 0 success = False while tries <= retries and not success: try: scrapeops_proxy_url = get_scrapeops_url(url, location=location) # Set up Selenium WebDriver chrome_options = Options() chrome_options.add_argument("--headless") driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options) driver.get(scrapeops_proxy_url) logger.info(f"Received page from: {url}") div_cards = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='slider_item']") for div_card in div_cards: name = div_card.find_element(By.TAG_NAME, "h2").text job_link = div_card.find_element(By.TAG_NAME, "a").get_attribute("href") job_key = job_link.split("jk=")[-1] if "jk=" in job_link else None if not job_key: continue url = f"https://www.indeed.com/viewjob?jk={job_key}" company_name = div_card.find_element(By.CSS_SELECTOR, 
"span[data-testid='company-name']").text rating = None rating_holder = div_card.find_elements(By.CSS_SELECTOR, "span[data-testid='holistic-rating']") if rating_holder: rating = rating_holder[0].text location = div_card.find_element(By.CSS_SELECTOR, "div[data-testid='text-location']").text search_data = SearchData( name=name, url=url, stars=rating, company_name=company_name, location=location ) data_pipeline.add_data(search_data) logger.info(f"Successfully parsed data from: {url}") success = True except Exception as e: logger.error(f"An error occurred while processing page {url}: {e}") logger.info(f"Retrying request for page: {url}, retries left {retries-tries}") tries += 1 finally: driver.quit() if not success: raise Exception(f"Max Retries exceeded: {retries}") def process_job(row, retries=3): url = row["url"] tries = 0 success = False chrome_options = Options() driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options) while tries <= retries and not success: try: driver.get(url) logger.info(f"Status: {driver.title}") salary = "n/a" try: salary_holder = driver.find_elements(By.XPATH, "//div[@id='salaryInfoAndJobContainer']") or \ driver.find_elements(By.CSS_SELECTOR, "span[class*='salary']") or \ driver.find_elements(By.XPATH, "//div[contains(text(), 'Salary')]") if salary_holder: salary = salary_holder[0].text except NoSuchElementException: logger.warning("Salary information not found.") description = "n/a" try: description_holder = driver.find_element(By.ID, "jobDescriptionText") if description_holder: description = description_holder.text except NoSuchElementException: logger.warning("Description not found.") benefits = "n/a" try: benefits_holder = driver.find_elements(By.ID, "benefits") or \ driver.find_elements(By.CSS_SELECTOR, "div[data-testid='benefits-test']") or \ driver.find_elements(By.CSS_SELECTOR, "div.css-eynugf.eu4oa1w0") if benefits_holder: benefits = benefits_holder[0].text except 
NoSuchElementException: logger.warning("Benefits information not found.") job_data = JobData( name=row["name"], salary=salary, description=description, benefits=benefits ) # Save job data to a CSV file job_filename = f"{row['name'].replace(' ', '_')}.csv" keys = [field.name for field in fields(job_data)] file_exists = os.path.isfile(job_filename) and os.path.getsize(job_filename) > 0 with open(job_filename, mode="a", newline="", encoding="utf-8") as output_file: writer = csv.DictWriter(output_file, fieldnames=keys) if not file_exists: writer.writeheader() writer.writerow(asdict(job_data)) success = True except Exception as e: logger.error(f"Exception thrown: {e}") logger.warning(f"Failed to process page: {row['url']}") logger.warning(f"Retries left: {retries-tries}") tries += 1 driver.quit() if not success: raise Exception(f"Max Retries exceeded: {retries}") else: logger.info(f"Successfully parsed: {row['url']}") def process_results(csv_file, retries=3): logger.info(f"Processing {csv_file}") with open(csv_file, newline="") as file: reader = list(csv.DictReader(file)) # Use threading for processing job details with concurrent.futures.ThreadPoolExecutor() as executor: futures = { executor.submit(process_job, row, retries): row for row in reader } for future in concurrent.futures.as_completed(futures): row = futures[future] try: future.result() except Except
MAX_RETRIES
: Sets how many times the scraper will retry if a request fails. This can happen due to network errors or non-200 HTTP responses.MAX_THREADS
: Limits the number of threads running the scraper at the same time.PAGES
: Controls how many search result pages to scrape for each keyword.LOCATION
: Sets the country or region code for the scraping.LOCALITY
: Specifies the city or area to narrow down the search.keyword_list
: A list of keywords the scraper will use to search for jobs on Indeed.https://www.indeed.com/jobs?q=writer&l=Westland%2C+MI&start=10&vjk=a88c42edb7b19c5d
https://www.indeed.com/jobs
is the endpoint we’re accessing on the server. ?q=writer&l=Westland%2C+MI&start=10&vjk=a88c42edb7b19c5d
is the query string.q=writer
tells Indeed that we want to search for writing jobs. If we want to look for writing jobs without any other criteria, the URL would be: https://www.indeed.com/jobs?q=writer
slider_item
.On each individual job page, we can locate the job description. Similar to the example above, our data is contained within a div
card.This div is identified by the id jobDescriptionText
and holds the complete job description.https://www.indeed.com/jobs?q=writer&l=Westland%2C+MI&start=10&vjk=a88c42edb7b19c5d
start=10
.On Indeed, each page number corresponds to a multiple of 10:start=0
.start=10
.start=20
.l=Westland%2C+MI
.l
parameter indicates the location we want to search. For example, to search in London, we would use l=London%2CUK
."country": "us"
, we’ll be routed through a server in the US."country": "uk"
.mkdir indeed-scraper cd indeed-scraper
python -m venv venv
source venv/bin/activate
pip install webdriver-manager pip install selenium
import os
import csv
import json
import logging
from urllib.parse import urlencode
from dataclasses import dataclass, field, fields, asdict
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
import concurrent.futures
import time

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load API key from config
with open("config.json", "r") as config_file:
    config = json.load(config_file)
    API_KEY = config["api_key"]


# Function to scrape search results
def scrape_search_results(keyword, location, locality, retries=3):
    """Load one Indeed search page in headless Chrome and print each job card.

    keyword  -- search term; spaces are replaced with '+'.
    location -- country code; accepted but not used by this version.
    locality -- city/area for the `l` parameter; spaces are replaced with '+'.
    retries  -- maximum number of attempts to load and parse the page.
    """
    # Set up the Chrome driver
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run in headless mode for background execution
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)

    formatted_keyword = keyword.replace(" ", "+")
    formatted_location = locality.replace(" ", "+")
    url = f"https://www.indeed.com/jobs?q={formatted_keyword}&l={formatted_location}"
    tries = 0
    success = False

    try:
        while tries < retries and not success:
            try:
                logger.info(f"Fetching URL: {url}")
                driver.get(url)
                time.sleep(3)  # Wait for the page to load

                # Extract job cards
                div_cards = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='slider_item']")
                search_data_list = []
                for div_card in div_cards:
                    try:
                        name = div_card.find_element(By.TAG_NAME, "h2").text
                        job_link = div_card.find_element(By.TAG_NAME, "a").get_attribute("href") or ""
                        # Keep only the key itself; the href usually carries
                        # more query parameters after jk=.
                        job_key = job_link.split("jk=")[-1].split("&")[0] if "jk=" in job_link else None
                        # Use a separate name so the page URL used for retries
                        # and logging is not overwritten.
                        job_url = f"https://www.indeed.com/viewjob?jk={job_key}" if job_key else None

                        company_name = div_card.find_element(By.CSS_SELECTOR, "span[data-testid='company-name']").text

                        # The rating is optional: find_elements returns an empty
                        # list instead of raising when it is absent.
                        rating = None
                        rating_holder = div_card.find_elements(By.CSS_SELECTOR, "span[data-testid='holistic-rating']")
                        if rating_holder:
                            rating = rating_holder[0].text

                        job_location = div_card.find_element(By.CSS_SELECTOR, "div[data-testid='text-location']").text

                        search_data = {
                            "name": name,
                            "url": job_url,
                            "stars": rating,
                            "company_name": company_name,
                            "location": job_location,
                        }
                        search_data_list.append(search_data)
                        print(search_data)

                    except Exception as inner_e:
                        logger.error(f"Error extracting data from div card: {inner_e}")

                logger.info(f"Successfully parsed data from: {url}")
                success = True

            except Exception as e:
                logger.error(f"An error occurred while processing page {url}: {e}")
                tries += 1
                logger.info(f"Retrying request for page: {url}, retries left {retries - tries}")

        if not success:
            logger.error(f"Max retries exceeded for URL: {url}")
    finally:
        driver.quit()  # Close the browser after scraping, even on unexpected errors


if __name__ == "__main__":
    MAX_RETRIES = 3
    LOCATION = "us"
    LOCALITY = "Westland MI"

    logger.info(f"Crawl starting...")

    # Input: List of keywords to scrape
    keyword_list = ["writer"]

    # Job Processes
    for keyword in keyword_list:
        scrape_search_results(keyword, LOCATION, LOCALITY, retries=MAX_RETRIES)

    logger.info(f"Crawl complete.")
scrape_search_results()
performs the following tasks:div_cards = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='slider_item']")
div_card
, we:name = div_card.find_element(By.TAG_NAME, "h2").text
location = div_card.find_element(By.CSS_SELECTOR, "div[data-testid='text-location']").text
scrape_search_results()
for multiple pages.Our updated URL now looks like this:url = f"https://www.indeed.com/jobs?q={formatted_keyword}&l={formatted_location}&start={page_number * 10}"
start_scrape()
function. This function is very simple; it iterates through the pages and calls scrape_search_results()
# for each one:
def start_scrape(keyword, pages, location, locality, retries=3):
    """Scrape `pages` consecutive search-result pages for one keyword.

    Calls scrape_search_results() once per zero-based page index, in order.
    """
    page = 0
    while page < pages:
        scrape_search_results(keyword, location, locality, page, retries=retries)
        page += 1
import json
import logging
import time
from urllib.parse import urlencode
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load API key from config
with open("config.json", "r") as config_file:
    config = json.load(config_file)
    API_KEY = config["api_key"]


# Function to scrape search results
def scrape_search_results(keyword, location, locality, page_number, retries=3):
    """Load one page of Indeed search results and print each job card.

    keyword     -- search term; spaces are replaced with '+'.
    location    -- country code; accepted but not used by this version.
    locality    -- city/area for the `l` parameter; spaces are replaced with '+'.
    page_number -- zero-based page index; sent as start=page_number * 10.
    retries     -- maximum number of attempts to load and parse the page.
    """
    # Set up the Chrome driver
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run in headless mode for background execution
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)

    formatted_keyword = keyword.replace(" ", "+")
    formatted_location = locality.replace(" ", "+")
    url = f"https://www.indeed.com/jobs?q={formatted_keyword}&l={formatted_location}&start={page_number * 10}"
    tries = 0
    success = False

    try:
        while tries < retries and not success:
            try:
                logger.info(f"Fetching URL: {url}")
                driver.get(url)
                time.sleep(3)  # Wait for the page to load

                # Extract job cards
                div_cards = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='slider_item']")
                search_data_list = []
                for div_card in div_cards:
                    try:
                        name = div_card.find_element(By.TAG_NAME, "h2").text
                        job_link = div_card.find_element(By.TAG_NAME, "a").get_attribute("href") or ""
                        # Keep only the key itself; the href usually carries
                        # more query parameters after jk=.
                        job_key = job_link.split("jk=")[-1].split("&")[0] if "jk=" in job_link else None
                        # Use a separate name so the page URL used for retries
                        # and logging is not overwritten.
                        job_url = f"https://www.indeed.com/viewjob?jk={job_key}" if job_key else None

                        company_name = div_card.find_element(By.CSS_SELECTOR, "span[data-testid='company-name']").text

                        # The rating is optional: find_elements returns an empty
                        # list instead of raising when it is absent.
                        rating = None
                        rating_holder = div_card.find_elements(By.CSS_SELECTOR, "span[data-testid='holistic-rating']")
                        if rating_holder:
                            rating = rating_holder[0].text

                        job_location = div_card.find_element(By.CSS_SELECTOR, "div[data-testid='text-location']").text

                        search_data = {
                            "name": name,
                            "url": job_url,
                            "stars": rating,
                            "company_name": company_name,
                            "location": job_location
                        }
                        search_data_list.append(search_data)
                        print(search_data)

                    except Exception as inner_e:
                        logger.error(f"Error extracting data from div card: {inner_e}")

                logger.info(f"Successfully parsed data from: {url}")
                success = True

            except Exception as e:
                logger.error(f"An error occurred while processing page {url}: {e}")
                tries += 1
                logger.info(f"Retrying request for page: {url}, retries left {retries - tries}")

        if not success:
            logger.error(f"Max retries exceeded for URL: {url}")
    finally:
        driver.quit()  # Close the browser after scraping, even on unexpected errors


def start_scrape(keyword, pages, location, locality, retries=3):
    """Scrape `pages` consecutive result pages (zero-based) for one keyword."""
    for page in range(pages):
        scrape_search_results(keyword, location, locality, page, retries=retries)


if __name__ == "__main__":
    MAX_RETRIES = 3
    LOCATION = "us"
    LOCALITY = "Westland MI"

    logger.info(f"Crawl starting...")

    # Input: List of keywords to scrape
    keyword_list = ["writer"]
    PAGES = 2  # Specify the number of pages you want to scrape

    # Job Processes
    for keyword in keyword_list:
        start_scrape(keyword, PAGES, LOCATION, LOCALITY, retries=MAX_RETRIES)

    logger.info(f"Crawl complete.")
SearchData
.div_card
object from our parsing function.DataPipeline
.SearchData
# class.
from dataclasses import dataclass, fields
from typing import Optional


@dataclass
class SearchData:
    """One job card from a search-results page.

    After construction every string field is stripped of surrounding
    whitespace; a field that ends up empty is replaced by the placeholder
    "No <field name>". Non-string fields are left unchanged.
    """
    name: str = ""
    url: str = ""
    stars: Optional[float] = None  # None when the card shows no rating
    company_name: str = ""
    location: str = ""

    def __post_init__(self):
        self.check_string_fields()

    def check_string_fields(self):
        """Normalise every string field in place (strip, then placeholder if empty)."""
        for spec in fields(self):
            value = getattr(self, spec.name)
            if not isinstance(value, str):
                continue
            # Strip before the emptiness test so whitespace-only values also
            # get the placeholder instead of becoming "".
            cleaned = value.strip()
            setattr(self, spec.name, cleaned if cleaned else f"No {spec.name}")
DataPipeline
import os
import csv
import time
import logging
from dataclasses import asdict, fields

# is_duplicate() logs through this logger; define it so the snippet runs on its own.
logger = logging.getLogger(__name__)


class DataPipeline:
    """Queue dataclass items, drop duplicates by `name`, and append them to a CSV file."""

    def __init__(self, csv_filename="", storage_queue_limit=50):
        self.names_seen = []            # names already accepted
        self.storage_queue = []         # items waiting to be written
        self.storage_queue_limit = storage_queue_limit
        self.csv_filename = csv_filename
        self.csv_file_open = False      # True only while a write is in progress

    def save_to_csv(self):
        """Append all queued items to the CSV file and empty the queue.

        The header row is written only when the file is missing or empty.
        Does nothing when the queue is empty.
        """
        data_to_save = list(self.storage_queue)
        self.storage_queue.clear()
        if not data_to_save:
            return

        self.csv_file_open = True
        try:
            keys = [field.name for field in fields(data_to_save[0])]
            file_exists = os.path.isfile(self.csv_filename) and os.path.getsize(self.csv_filename) > 0
            with open(self.csv_filename, mode="a", newline="", encoding="utf-8") as output_file:
                writer = csv.DictWriter(output_file, fieldnames=keys)
                if not file_exists:
                    writer.writeheader()
                for item in data_to_save:
                    writer.writerow(asdict(item))
        finally:
            # Always clear the flag, otherwise a failed write would block
            # every later flush in add_data().
            self.csv_file_open = False

    def is_duplicate(self, input_data):
        """Return True if an item with the same `name` was already seen; else record it."""
        if input_data.name in self.names_seen:
            logger.warning(f"Duplicate item found: {input_data.name}. Item dropped.")
            return True
        self.names_seen.append(input_data.name)
        return False

    def add_data(self, scraped_data):
        """Queue a non-duplicate item and flush once the queue reaches its limit."""
        if not self.is_duplicate(scraped_data):
            self.storage_queue.append(scraped_data)
            if len(self.storage_queue) >= self.storage_queue_limit and not self.csv_file_open:
                self.save_to_csv()

    def close_pipeline(self):
        """Write whatever is still queued; waits briefly if a write is in progress."""
        if self.csv_file_open:
            time.sleep(3)
        if len(self.storage_queue) > 0:
            self.save_to_csv()
import os
import csv
import json
import logging
from dataclasses import dataclass, field, fields, asdict
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

# Load configuration
with open("config.json", "r") as config_file:
    config = json.load(config_file)
    API_KEY = config["api_key"]

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@dataclass
class SearchData:
    """One job card scraped from a search results page."""

    name: str = ""
    url: str = ""
    # None when the job card shows no rating; otherwise the scraped rating text.
    stars: float = None
    company_name: str = ""
    location: str = ""

    def __post_init__(self):
        self.check_string_fields()

    def check_string_fields(self):
        """Strip every string field; replace blank ones with "No <field name>"."""
        for data_field in fields(self):
            value = getattr(self, data_field.name)
            if not isinstance(value, str):
                continue
            # Strip first so whitespace-only values also get the placeholder.
            stripped = value.strip()
            setattr(self, data_field.name, stripped or f"No {data_field.name}")


class DataPipeline:
    """Buffers scraped items, drops duplicates by name and appends to a CSV."""

    def __init__(self, csv_filename="", storage_queue_limit=50):
        self.names_seen = set()
        self.storage_queue = []
        self.storage_queue_limit = storage_queue_limit
        self.csv_filename = csv_filename
        self.csv_file_open = False

    def save_to_csv(self):
        """Append every queued item to the CSV file and empty the queue."""
        logger.info("Saving data to CSV.")
        self.csv_file_open = True
        try:
            data_to_save = list(self.storage_queue)
            self.storage_queue.clear()
            if not data_to_save:
                logger.info("No data to save.")
                return

            keys = [data_field.name for data_field in fields(data_to_save[0])]
            # Only write the header when starting a new (or empty) file.
            file_exists = os.path.isfile(self.csv_filename) and os.path.getsize(self.csv_filename) > 0

            with open(self.csv_filename, mode="a", newline="", encoding="utf-8") as output_file:
                writer = csv.DictWriter(output_file, fieldnames=keys)
                if not file_exists:
                    writer.writeheader()
                for item in data_to_save:
                    writer.writerow(asdict(item))
                    logger.info(f"Saved: {item}")
        finally:
            # Reset even on the early return / a write error, otherwise the
            # pipeline would never flush again.
            self.csv_file_open = False

    def is_duplicate(self, input_data):
        """Return True if an item with the same name was already seen."""
        if input_data.name in self.names_seen:
            logger.warning(f"Duplicate item found: {input_data.name}. Item dropped.")
            return True
        self.names_seen.add(input_data.name)
        return False

    def add_data(self, scraped_data):
        """Queue ``scraped_data`` unless it is a duplicate; flush when full."""
        logger.info(f"Adding data: {scraped_data.name}")
        if not self.is_duplicate(scraped_data):
            self.storage_queue.append(scraped_data)
            logger.info(f"Data added to storage queue. Queue size: {len(self.storage_queue)}")
            if len(self.storage_queue) >= self.storage_queue_limit and not self.csv_file_open:
                self.save_to_csv()

    def close_pipeline(self):
        """Flush whatever is still queued."""
        if self.csv_file_open:
            time.sleep(3)
        if len(self.storage_queue) > 0:
            self.save_to_csv()


def scrape_search_results(keyword, location, locality, page_number, data_pipeline=None, retries=3):
    """Scrape one Indeed results page into ``data_pipeline``.

    Makes up to ``retries + 1`` attempts, each in a fresh Chrome instance.
    ``location`` is accepted for signature compatibility but is not used by
    this (proxy-less) version.

    Raises:
        Exception: when every attempt failed.
    """
    formatted_keyword = keyword.replace(" ", "+")
    formatted_location = locality.replace(" ", "+")
    url = f"https://www.indeed.com/jobs?q={formatted_keyword}&l={formatted_location}&start={page_number * 10}"
    tries = 0
    success = False

    while tries <= retries and not success:
        # Pre-bind so the finally clause is safe when Chrome fails to start.
        driver = None
        try:
            driver = webdriver.Chrome()  # Ensure chromedriver is installed and in PATH
            driver.get(url)
            time.sleep(3)  # Wait for page to load
            logger.info(f"Fetching URL: {url}")

            # Extract Data
            div_cards = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='slider_item']")
            for div_card in div_cards:
                name = div_card.find_element(By.TAG_NAME, "h2").text
                job_url = div_card.find_element(By.TAG_NAME, "a").get_attribute("href")
                company_name = div_card.find_element(By.CSS_SELECTOR, "span[data-testid='company-name']").text

                # Rating may not be present; find_elements returns [] in that
                # case, so no exception has to be swallowed.
                rating = None
                rating_holders = div_card.find_elements(By.CSS_SELECTOR, "span[data-testid='holistic-rating']")
                if rating_holders:
                    rating = rating_holders[0].text

                job_location = div_card.find_element(By.CSS_SELECTOR, "div[data-testid='text-location']").text

                search_data = SearchData(
                    name=name,
                    url=job_url,
                    stars=rating,
                    company_name=company_name,
                    location=job_location
                )
                data_pipeline.add_data(search_data)

            logger.info(f"Successfully parsed data from: {url}")
            success = True
        except Exception as e:
            logger.error(f"An error occurred while processing page {url}: {e}")
            logger.info(f"Retrying request for page: {url}, retries left {retries - tries}")
            tries += 1
            time.sleep(2)
        finally:
            if driver is not None:
                driver.quit()

    if not success:
        raise Exception(f"Max Retries exceeded: {retries}")


def start_scrape(keyword, pages, location, locality, data_pipeline=None, retries=3):
    """Scrape result pages 0..pages-1 one after another."""
    for page in range(pages):
        scrape_search_results(keyword, location, locality, page, data_pipeline=data_pipeline, retries=retries)


if __name__ == "__main__":
    MAX_RETRIES = 3
    PAGES = 2
    LOCATION = "us"
    LOCALITY = "mi"

    logger.info("Crawl starting...")

    keyword_list = ["writer"]

    for keyword in keyword_list:
        filename = keyword.replace(" ", "-")
        crawl_pipeline = DataPipeline(csv_filename=f"{filename}.csv")
        try:
            start_scrape(keyword, PAGES, LOCATION, LOCALITY, data_pipeline=crawl_pipeline, retries=MAX_RETRIES)
        finally:
            # Flush rows already queued even when a page ran out of retries.
            crawl_pipeline.close_pipeline()

    logger.info("Crawl complete.")
start_scrape()
function by removing the for loop and replacing it with ThreadPoolExecutor
# Here is the updated function:
import concurrent.futures


def start_scrape(keyword, pages, location, locality, data_pipeline=None, max_threads=5, retries=3):
    """Scrape result pages 0..pages-1 concurrently on up to ``max_threads`` threads.

    Every page is handed to scrape_search_results() with the same keyword,
    location, locality, data pipeline instance and retry count. A page that
    fails is logged and does not stop the other pages.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
        # submit() + as_completed() instead of a discarded executor.map():
        # map() only re-raises a worker's exception when its result iterator
        # is consumed, so failures used to disappear silently.
        futures = {
            executor.submit(
                scrape_search_results,
                keyword,        # Same keyword for all pages
                location,       # Same location for all pages
                locality,       # Same locality for all pages
                page,           # Page number for this task
                data_pipeline,  # Same data pipeline instance
                retries         # Retry count for all tasks
            ): page
            for page in range(pages)
        }
        for future in concurrent.futures.as_completed(futures):
            error = future.exception()
            if error is not None:
                logger.error(f"Page {futures[future]} failed: {error}")
executor.map()
:scrape_search_results()
is the function we want to execute on each thread.scrape_search_results
.scrape_search_results()
def get_scrapeops_url(url, location="us"):
    """Wrap ``url`` in a ScrapeOps proxy URL.

    Args:
        url: The URL you want to scrape.
        location: Country code sent as the ``country`` parameter.

    Returns:
        The proxy endpoint with the payload URL-encoded as its query string.
    """
    payload = {
        "api_key": API_KEY,
        "url": url,
        "country": location,
        "residential": True
    }
    return "https://proxy.scrapeops.io/v1/?" + urlencode(payload)
api_key
: Your ScrapeOps API key.url
: The URL you want to scrape.country
: The country through which we want to be routed.residential
# residential: A boolean value. If set to True, ScrapeOps provides us with a
# residential IP address instead of a datacenter IP.
import os
import csv
import requests
import json
import logging
import threading
from urllib.parse import urlencode, urlparse, parse_qs
from bs4 import BeautifulSoup
import concurrent.futures
from dataclasses import dataclass, field, fields, asdict
import time

# Load API key from configuration file
API_KEY = ""

with open("config.json", "r") as config_file:
    config = json.load(config_file)
    API_KEY = config["api_key"]


def get_scrapeops_url(url, location="us"):
    """Wrap ``url`` in a ScrapeOps proxy URL routed through ``location``."""
    payload = {
        "api_key": API_KEY,
        "url": url,
        "country": location,
        "residential": True
    }
    return "https://proxy.scrapeops.io/v1/?" + urlencode(payload)


# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@dataclass
class SearchData:
    """One job card scraped from a search results page."""

    name: str = ""
    url: str = ""
    # None when the job card shows no rating; otherwise the scraped rating text.
    stars: float = None
    company_name: str = ""
    location: str = ""

    def __post_init__(self):
        self.check_string_fields()

    def check_string_fields(self):
        """Strip every string field; replace blank ones with "No <field name>"."""
        for data_field in fields(self):
            value = getattr(self, data_field.name)
            if not isinstance(value, str):
                continue
            # Strip first so whitespace-only values also get the placeholder.
            stripped = value.strip()
            setattr(self, data_field.name, stripped or f"No {data_field.name}")


class DataPipeline:
    """Buffers scraped items, drops duplicates by name and appends to a CSV.

    One instance is shared by all worker threads of start_scrape(), so every
    method that touches the queue or the seen-set runs under a lock.
    """

    def __init__(self, csv_filename="", storage_queue_limit=50):
        self.names_seen = set()
        self.storage_queue = []
        self.storage_queue_limit = storage_queue_limit
        self.csv_filename = csv_filename
        self.csv_file_open = False
        # Re-entrant: add_data() calls save_to_csv() while holding the lock.
        self._lock = threading.RLock()

    def save_to_csv(self):
        """Append every queued item to the CSV file and empty the queue."""
        with self._lock:
            self.csv_file_open = True
            try:
                data_to_save = list(self.storage_queue)
                self.storage_queue.clear()
                if not data_to_save:
                    return

                keys = [data_field.name for data_field in fields(data_to_save[0])]
                # Only write the header when starting a new (or empty) file.
                file_exists = os.path.isfile(self.csv_filename) and os.path.getsize(self.csv_filename) > 0

                with open(self.csv_filename, mode="a", newline="", encoding="utf-8") as output_file:
                    writer = csv.DictWriter(output_file, fieldnames=keys)
                    if not file_exists:
                        writer.writeheader()
                    writer.writerows(asdict(item) for item in data_to_save)
            finally:
                # Reset even on the early return / a write error.
                self.csv_file_open = False

    def is_duplicate(self, input_data):
        """Return True if an item with the same name was already seen."""
        with self._lock:
            if input_data.name in self.names_seen:
                logger.warning(f"Duplicate item found: {input_data.name}. Item dropped.")
                return True
            self.names_seen.add(input_data.name)
            return False

    def add_data(self, scraped_data):
        """Queue ``scraped_data`` unless it is a duplicate; flush when full."""
        with self._lock:
            if not self.is_duplicate(scraped_data):
                self.storage_queue.append(scraped_data)
                if len(self.storage_queue) >= self.storage_queue_limit and not self.csv_file_open:
                    self.save_to_csv()

    def close_pipeline(self):
        """Flush whatever is still queued."""
        if self.csv_file_open:
            time.sleep(3)
        if len(self.storage_queue) > 0:
            self.save_to_csv()


def scrape_search_results(keyword, location, locality, page_number, data_pipeline=None, retries=3):
    """Fetch one Indeed results page through the proxy and queue its job cards.

    Makes up to ``retries + 1`` attempts.

    Raises:
        Exception: when every attempt failed.
    """
    formatted_keyword = keyword.replace(" ", "+")
    formatted_location = locality.replace(" ", "+")
    url = f"https://www.indeed.com/jobs?q={formatted_keyword}&l={formatted_location}&start={page_number * 10}"
    tries = 0
    success = False

    while tries <= retries and not success:
        try:
            scrapeops_proxy_url = get_scrapeops_url(url, location=location)
            # requests has no default timeout; without one a stalled proxy
            # connection would block this worker thread forever.
            response = requests.get(scrapeops_proxy_url, timeout=120)
            logger.info(f"Received [{response.status_code}] from: {url}")
            if response.status_code != 200:
                raise Exception(f"Failed request, Status Code {response.status_code}")

            soup = BeautifulSoup(response.text, "html.parser")
            div_cards = soup.select("div[data-testid='slider_item']")

            for div_card in div_cards:
                name = div_card.select_one("h2").text

                parsed_url = urlparse(div_card.find("a").get("href"))
                query_params = parse_qs(parsed_url.query)
                if "jk" not in query_params:
                    continue
                job_key = query_params["jk"][0]
                # Separate names: reusing ``url`` / ``location`` here would make
                # a retry request the last job page with a bogus country.
                job_url = f"https://www.indeed.com/viewjob?jk={job_key}"

                company_name = div_card.select_one("span[data-testid='company-name']").text

                rating = None
                rating_holder = div_card.select_one("span[data-testid='holistic-rating']")
                if rating_holder:
                    rating = rating_holder.text

                job_location = div_card.select_one("div[data-testid='text-location']").text

                search_data = SearchData(
                    name=name,
                    url=job_url,
                    stars=rating,
                    company_name=company_name,
                    location=job_location
                )
                data_pipeline.add_data(search_data)

            logger.info(f"Successfully parsed data from: {url}")
            # Only mark success once the page was parsed, so a parsing error
            # is retried instead of being reported as a finished page.
            success = True
        except Exception as e:
            logger.error(f"An error occurred while processing page {url}: {e}")
            # Log before incrementing: ``retries - tries`` is the number of
            # attempts that remain.
            logger.info(f"Retrying request for page: {url}, retries left {retries - tries}")
            tries += 1

    if not success:
        raise Exception(f"Max Retries exceeded: {retries}")


def start_scrape(keyword, pages, location, locality, data_pipeline=None, max_threads=5, retries=3):
    """Scrape result pages 0..pages-1 concurrently; log pages that fail."""
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
        # A discarded executor.map() never surfaces worker exceptions.
        futures = {
            executor.submit(
                scrape_search_results, keyword, location, locality, page, data_pipeline, retries
            ): page
            for page in range(pages)
        }
        for future in concurrent.futures.as_completed(futures):
            error = future.exception()
            if error is not None:
                logger.error(f"Page {futures[future]} failed: {error}")


if __name__ == "__main__":
    MAX_RETRIES = 3
    MAX_THREADS = 5
    PAGES = 1
    LOCATION = "us"
    LOCALITY = "california"

    logger.info("Crawl starting...")

    keyword_list = ["writer"]
    aggregate_files = []

    for keyword in keyword_list:
        filename = keyword.replace(" ", "-")

        crawl_pipeline = DataPipeline(csv_filename=f"{filename}.csv")
        try:
            start_scrape(keyword, PAGES, LOCATION, LOCALITY, data_pipeline=crawl_pipeline,
                         max_threads=MAX_THREADS, retries=MAX_RETRIES)
        finally:
            crawl_pipeline.close_pipeline()
        aggregate_files.append(f"{filename}.csv")

    logger.info("Crawl complete.")
PAGES
# With PAGES set to 3, here is the updated main:
if __name__ == "__main__":
    MAX_RETRIES = 3
    MAX_THREADS = 5
    PAGES = 3
    LOCATION = "us"
    LOCALITY = "Westland MI"

    logger.info("Crawl starting...")

    keyword_list = ["writer"]
    aggregate_files = []

    for keyword in keyword_list:
        filename = keyword.replace(" ", "-")

        crawl_pipeline = DataPipeline(csv_filename=f"{filename}.csv")
        try:
            start_scrape(keyword, PAGES, LOCATION, LOCALITY, data_pipeline=crawl_pipeline,
                         max_threads=MAX_THREADS, retries=MAX_RETRIES)
        finally:
            # Flush rows that are still queued even if the crawl raised.
            crawl_pipeline.close_pipeline()
        aggregate_files.append(f"{filename}.csv")

    logger.info("Crawl complete.")
MAX_RETRIES
MAX_THREADS
PAGES
LOCATION
LOCALITY
process_job()
def _first_text_by_id(driver, element_id, missing_message, default="n/a"):
    """Return the text of the first element with ``element_id``, else ``default``."""
    # find_elements returns an empty list when nothing matches (it never
    # raises NoSuchElementException), so absence is detected by emptiness.
    holders = driver.find_elements(By.ID, element_id)
    if holders:
        return holders[0].text
    logger.warning(missing_message)
    return default


def process_job(row, location, retries=3):
    """Scrape salary, description and benefits for one job and save them.

    Args:
        row: Mapping with at least the keys "name" and "url".
        location: Country code passed to get_scrapeops_url().
        retries: Extra attempts after the first one.

    Raises:
        Exception: when every attempt failed.
    """
    url = row["url"]
    tries = 0
    success = False

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)

    # try/finally so the browser is closed on every exit path.
    try:
        while tries <= retries and not success:
            try:
                # Inside the try so a failed page load is retried like any
                # other error instead of escaping the loop.
                driver.get(get_scrapeops_url(url, location))
                if not driver.title:
                    logger.warning("Failed Response: Not Found")
                    raise Exception("Failed Request, page not loaded.")
                logger.info(f"Status: 200")

                # A "/" or "\" in the job title would be read as a directory
                # separator when opening the CSV file.
                safe_name = row["name"].replace("/", "_").replace("\\", "_")
                job_pipeline = DataPipeline(csv_filename=f"{safe_name}.csv")

                job_data = JobData(
                    name=row["name"],
                    salary=_first_text_by_id(driver, "salaryInfoAndJobContainer", "Salary info not found."),
                    description=_first_text_by_id(driver, "jobDescriptionText", "Job description not found."),
                    benefits=_first_text_by_id(driver, "benefits", "Benefits info not found.")
                )
                job_pipeline.add_data(job_data)
                job_pipeline.close_pipeline()
                success = True
            except Exception as e:
                logger.error(f"Exception thrown: {e}")
                logger.warning(f"Failed to process page: {row['url']}")
                logger.warning(f"Retries left: {retries - tries}")
                tries += 1
    finally:
        driver.quit()

    if not success:
        # f-prefix was missing: the message used to contain the literal
        # text "{retries}".
        raise Exception(f"Max Retries exceeded: {retries}")
driver.find_elements(By.ID, "salaryInfoAndJobContainer")
. driver.find_elements(By.ID, "jobDescriptionText")
.driver.find_elements(By.ID, "benefits")
.process_results()
, which will be quite similar to start_scrape()
.Here’s how process_results()
works:First, we read the CSV file into an array. Then, we loop through that array and call process_job()
def process_results(csv_file, location, retries=3):
    """Read ``csv_file`` into a list and call process_job() for each row."""
    logger.info(f"Processing {csv_file}")

    # The pipeline writes its CSV as UTF-8; read it back with the same
    # encoding instead of the platform default.
    with open(csv_file, newline="", encoding="utf-8") as file:
        rows = list(csv.DictReader(file))

    for row in rows:
        process_job(row, location, retries=retries)
import os
import csv
import json
import logging
import threading
import time
from urllib.parse import urlencode, urlparse, parse_qs
from dataclasses import dataclass, field, fields, asdict
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import concurrent.futures

API_KEY = ""

with open("config.json", "r") as config_file:
    config = json.load(config_file)
    API_KEY = config["api_key"]


def get_scrapeops_url(url, location="us"):
    """Wrap ``url`` in a ScrapeOps proxy URL routed through ``location``."""
    payload = {
        "api_key": API_KEY,
        "url": url,
        "country": location,
        "residential": True
    }
    return "https://proxy.scrapeops.io/v1/?" + urlencode(payload)


# Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@dataclass
class SearchData:
    """One job card scraped from a search results page."""

    name: str = ""
    url: str = ""
    # None when the job card shows no rating; otherwise the scraped rating text.
    stars: float = None
    company_name: str = ""
    location: str = ""

    def __post_init__(self):
        self.check_string_fields()

    def check_string_fields(self):
        """Strip every string field; replace blank ones with "No <field name>"."""
        for data_field in fields(self):
            value = getattr(self, data_field.name)
            if not isinstance(value, str):
                continue
            # Strip first so whitespace-only values also get the placeholder.
            stripped = value.strip()
            setattr(self, data_field.name, stripped or f"No {data_field.name}")


class DataPipeline:
    """Buffers scraped items, drops duplicates by name and appends to a CSV.

    One instance is shared by all crawl worker threads, so every method that
    touches the queue or the seen-set runs under a lock.
    """

    def __init__(self, csv_filename="", storage_queue_limit=50):
        self.names_seen = set()
        self.storage_queue = []
        self.storage_queue_limit = storage_queue_limit
        self.csv_filename = csv_filename
        self.csv_file_open = False
        # Re-entrant: add_data() calls save_to_csv() while holding the lock.
        self._lock = threading.RLock()

    def save_to_csv(self):
        """Append every queued item to the CSV file and empty the queue."""
        with self._lock:
            self.csv_file_open = True
            try:
                data_to_save = list(self.storage_queue)
                self.storage_queue.clear()
                if not data_to_save:
                    return

                keys = [data_field.name for data_field in fields(data_to_save[0])]
                # Only write the header when starting a new (or empty) file.
                file_exists = os.path.isfile(self.csv_filename) and os.path.getsize(self.csv_filename) > 0

                with open(self.csv_filename, mode="a", newline="", encoding="utf-8") as output_file:
                    writer = csv.DictWriter(output_file, fieldnames=keys)
                    if not file_exists:
                        writer.writeheader()
                    writer.writerows(asdict(item) for item in data_to_save)
            finally:
                # Reset even on the early return / a write error.
                self.csv_file_open = False

    def is_duplicate(self, input_data):
        """Return True if an item with the same name was already seen."""
        with self._lock:
            if input_data.name in self.names_seen:
                logger.warning(f"Duplicate item found: {input_data.name}. Item dropped.")
                return True
            self.names_seen.add(input_data.name)
            return False

    def add_data(self, scraped_data):
        """Queue ``scraped_data`` unless it is a duplicate; flush when full."""
        with self._lock:
            if not self.is_duplicate(scraped_data):
                self.storage_queue.append(scraped_data)
                if len(self.storage_queue) >= self.storage_queue_limit and not self.csv_file_open:
                    self.save_to_csv()

    def close_pipeline(self):
        """Flush whatever is still queued."""
        # ``time`` was not imported by the original listing, so this branch
        # would have raised NameError.
        if self.csv_file_open:
            time.sleep(3)
        if len(self.storage_queue) > 0:
            self.save_to_csv()


def _extract_job_key(job_link):
    """Return the ``jk`` query parameter of a job link, or None if absent."""
    if not job_link:
        return None
    # Parse the query string instead of splitting on "jk=", which kept every
    # parameter that followed the key.
    values = parse_qs(urlparse(job_link).query).get("jk")
    return values[0] if values else None


def _first_text_by_id(driver, element_id, default="n/a"):
    """Return the text of the first element with ``element_id``, else ``default``."""
    holders = driver.find_elements(By.ID, element_id)
    return holders[0].text if holders else default


def scrape_search_results(keyword, location, locality, page_number, data_pipeline=None, retries=3):
    """Load one Indeed results page through the proxy and queue its job cards.

    Makes up to ``retries + 1`` attempts, each in a fresh headless Chrome.

    Raises:
        Exception: when every attempt failed.
    """
    formatted_keyword = keyword.replace(" ", "+")
    formatted_location = locality.replace(" ", "+")
    url = f"https://www.indeed.com/jobs?q={formatted_keyword}&l={formatted_location}&start={page_number * 10}"
    tries = 0
    success = False

    while tries <= retries and not success:
        # Pre-bind so the finally clause is safe when Chrome fails to start.
        driver = None
        try:
            scrapeops_proxy_url = get_scrapeops_url(url, location=location)

            # Set up Selenium WebDriver
            chrome_options = Options()
            chrome_options.add_argument("--headless")  # Optional: Run in headless mode
            driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)

            driver.get(scrapeops_proxy_url)
            logger.info(f"Received page from: {url}")

            div_cards = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='slider_item']")
            for div_card in div_cards:
                name = div_card.find_element(By.TAG_NAME, "h2").text
                job_link = div_card.find_element(By.TAG_NAME, "a").get_attribute("href")
                job_key = _extract_job_key(job_link)
                if not job_key:
                    continue
                # Separate names: reusing ``url`` / ``location`` here would make
                # a retry request the last job page with a bogus country.
                job_url = f"https://www.indeed.com/viewjob?jk={job_key}"

                company_name = div_card.find_element(By.CSS_SELECTOR, "span[data-testid='company-name']").text

                rating = None
                rating_holder = div_card.find_elements(By.CSS_SELECTOR, "span[data-testid='holistic-rating']")
                if rating_holder:
                    rating = rating_holder[0].text

                job_location = div_card.find_element(By.CSS_SELECTOR, "div[data-testid='text-location']").text

                search_data = SearchData(
                    name=name,
                    url=job_url,
                    stars=rating,
                    company_name=company_name,
                    location=job_location
                )
                data_pipeline.add_data(search_data)

            logger.info(f"Successfully parsed data from: {url}")
            success = True
        except Exception as e:
            logger.error(f"An error occurred while processing page {url}: {e}")
            logger.info(f"Retrying request for page: {url}, retries left {retries-tries}")
            tries += 1
        finally:
            if driver is not None:
                driver.quit()  # Ensure the driver closes after processing

    if not success:
        raise Exception(f"Max Retries exceeded: {retries}")


def process_job(row, location, retries=3):
    """Open one job page and print its salary, description and benefits.

    ``location`` is accepted but not used by this version, which loads
    ``row["url"]`` directly.

    Raises:
        Exception: when every attempt failed.
    """
    url = row["url"]
    tries = 0
    success = False

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Optional: Run in headless mode
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)

    # try/finally so the browser is closed on every exit path.
    try:
        while tries <= retries and not success:
            try:
                driver.get(url)
                logger.info(f"Status: {driver.title}")

                job_data = {
                    "name": row["name"],
                    "salary": _first_text_by_id(driver, "salaryInfoAndJobContainer"),
                    "description": _first_text_by_id(driver, "jobDescriptionText"),
                    "benefits": _first_text_by_id(driver, "benefits")
                }
                print(job_data)
                success = True
            except Exception as e:
                logger.error(f"Exception thrown: {e}")
                logger.warning(f"Failed to process page: {row['url']}")
                logger.warning(f"Retries left: {retries-tries}")
                tries += 1
    finally:
        driver.quit()  # Close the browser after processing

    if not success:
        raise Exception(f"Max Retries exceeded: {retries}")
    logger.info(f"Successfully parsed: {row['url']}")


def process_results(csv_file, location, retries=3):
    """Read ``csv_file`` and call process_job() for each row."""
    logger.info(f"Processing {csv_file}")
    # Written as UTF-8 by DataPipeline; read back with the same encoding.
    with open(csv_file, newline="", encoding="utf-8") as file:
        rows = list(csv.DictReader(file))
    for row in rows:
        process_job(row, location, retries=retries)


if __name__ == "__main__":
    MAX_RETRIES = 3
    MAX_THREADS = 5
    PAGES = 1
    LOCATION = "us"
    LOCALITY = "Westland MI"

    logger.info("Crawl starting...")

    # INPUT ---> List of keywords to scrape
    keyword_list = ["writer"]
    aggregate_files = []

    # Job Processes
    for keyword in keyword_list:
        filename = keyword.replace(" ", "-")

        crawl_pipeline = DataPipeline(csv_filename=f"{filename}.csv")
        try:
            with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
                # A discarded executor.map() never surfaces worker exceptions;
                # collect the futures and log each failure instead.
                futures = {
                    executor.submit(
                        scrape_search_results, keyword, LOCATION, LOCALITY, page, crawl_pipeline, MAX_RETRIES
                    ): page
                    for page in range(PAGES)
                }
                for future in concurrent.futures.as_completed(futures):
                    error = future.exception()
                    if error is not None:
                        logger.error(f"Page {futures[future]} failed: {error}")
        finally:
            crawl_pipeline.close_pipeline()
        aggregate_files.append(f"{filename}.csv")

    logger.info("Crawl complete.")

    for file in aggregate_files:
        process_results(file, LOCATION, retries=MAX_RETRIES)
JobData
@dataclass
class JobData:
    """JobData class: the details scraped from a single job page.

    String fields are normalised on construction: surrounding whitespace is
    stripped and blank values are replaced with the placeholder
    "No <field name>".
    """

    name: str = ""
    salary: str = ""
    description: str = ""
    benefits: str = ""

    def __post_init__(self):
        self.check_string_fields()

    def check_string_fields(self):
        """Strip every string field; replace blank ones with "No <field name>"."""
        for data_field in fields(self):
            value = getattr(self, data_field.name)
            if not isinstance(value, str):
                continue
            # Strip before the emptiness check so whitespace-only values get
            # the placeholder too instead of being stored as "".
            stripped = value.strip()
            setattr(self, data_field.name, stripped or f"No {data_field.name}")
SearchData
. Moreover, it contains very few fields. Now we need to add a DataPipeline
# ...into our parsing function. It will help us save this information to a file.
# Here is the complete code:
import os
import csv
import json
import logging
import threading
from urllib.parse import urlencode, urlparse, parse_qs
from dataclasses import dataclass, field, fields, asdict
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import concurrent.futures
import time

API_KEY = ""

with open("config.json", "r") as config_file:
    config = json.load(config_file)
    API_KEY = config["api_key"]


def get_scrapeops_url(url, location="us"):
    """Wrap ``url`` in a ScrapeOps proxy URL routed through ``location``."""
    payload = {
        "api_key": API_KEY,
        "url": url,
        "country": location,
        "residential": True
    }
    return "https://proxy.scrapeops.io/v1/?" + urlencode(payload)


# Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def _normalise_string_fields(instance):
    """Strip every string field of a dataclass; blank ones become "No <field>"."""
    for data_field in fields(instance):
        value = getattr(instance, data_field.name)
        if not isinstance(value, str):
            continue
        # Strip first so whitespace-only values also get the placeholder.
        stripped = value.strip()
        setattr(instance, data_field.name, stripped or f"No {data_field.name}")


@dataclass
class SearchData:
    """One job card scraped from a search results page."""

    name: str = ""
    url: str = ""
    # None when the job card shows no rating; otherwise the scraped rating text.
    stars: float = None
    company_name: str = ""
    location: str = ""

    def __post_init__(self):
        self.check_string_fields()

    def check_string_fields(self):
        """Strip every string field; replace blank ones with "No <field name>"."""
        _normalise_string_fields(self)


@dataclass
class JobData:
    """The details scraped from a single job page."""

    # Defaults and normalisation mirror SearchData, so both record types are
    # cleaned the same way before they reach the CSV.
    name: str = ""
    salary: str = ""
    description: str = ""
    benefits: str = ""

    def __post_init__(self):
        self.check_string_fields()

    def check_string_fields(self):
        """Strip every string field; replace blank ones with "No <field name>"."""
        _normalise_string_fields(self)


class DataPipeline:
    """Buffers scraped items, drops duplicates by name and appends to a CSV.

    One instance is shared by all crawl worker threads, so every method that
    touches the queue or the seen-set runs under a lock.
    """

    def __init__(self, csv_filename="", storage_queue_limit=50):
        self.names_seen = set()
        self.storage_queue = []
        self.storage_queue_limit = storage_queue_limit
        self.csv_filename = csv_filename
        self.csv_file_open = False
        # Re-entrant: add_data() calls save_to_csv() while holding the lock.
        self._lock = threading.RLock()

    def save_to_csv(self):
        """Append every queued item to the CSV file and empty the queue."""
        with self._lock:
            self.csv_file_open = True
            try:
                data_to_save = list(self.storage_queue)
                self.storage_queue.clear()
                if not data_to_save:
                    return

                keys = [data_field.name for data_field in fields(data_to_save[0])]
                # Only write the header when starting a new (or empty) file.
                file_exists = os.path.isfile(self.csv_filename) and os.path.getsize(self.csv_filename) > 0

                with open(self.csv_filename, mode="a", newline="", encoding="utf-8") as output_file:
                    writer = csv.DictWriter(output_file, fieldnames=keys)
                    if not file_exists:
                        writer.writeheader()
                    writer.writerows(asdict(item) for item in data_to_save)
            finally:
                # Reset even on the early return / a write error.
                self.csv_file_open = False

    def is_duplicate(self, input_data):
        """Return True if an item with the same name was already seen."""
        with self._lock:
            if input_data.name in self.names_seen:
                logger.warning(f"Duplicate item found: {input_data.name}. Item dropped.")
                return True
            self.names_seen.add(input_data.name)
            return False

    def add_data(self, scraped_data):
        """Queue ``scraped_data`` unless it is a duplicate; flush when full."""
        with self._lock:
            if not self.is_duplicate(scraped_data):
                self.storage_queue.append(scraped_data)
                if len(self.storage_queue) >= self.storage_queue_limit and not self.csv_file_open:
                    self.save_to_csv()

    def close_pipeline(self):
        """Flush whatever is still queued."""
        if self.csv_file_open:
            time.sleep(3)
        if len(self.storage_queue) > 0:
            self.save_to_csv()


def _extract_job_key(job_link):
    """Return the ``jk`` query parameter of a job link, or None if absent."""
    if not job_link:
        return None
    # Parse the query string instead of splitting on "jk=", which kept every
    # parameter that followed the key.
    values = parse_qs(urlparse(job_link).query).get("jk")
    return values[0] if values else None


def _first_text_by_id(driver, element_id, default="n/a"):
    """Return the text of the first element with ``element_id``, else ``default``."""
    holders = driver.find_elements(By.ID, element_id)
    return holders[0].text if holders else default


def _append_job_csv(job_name, job_data):
    """Append ``job_data`` to the per-job CSV file named after ``job_name``."""
    # "/" and "\" would be read as directory separators by open().
    safe_name = job_name.replace(" ", "_").replace("/", "_").replace("\\", "_")
    job_filename = f"{safe_name}.csv"
    keys = [data_field.name for data_field in fields(job_data)]
    file_exists = os.path.isfile(job_filename) and os.path.getsize(job_filename) > 0

    with open(job_filename, mode="a", newline="", encoding="utf-8") as output_file:
        writer = csv.DictWriter(output_file, fieldnames=keys)
        if not file_exists:
            writer.writeheader()
        writer.writerow(asdict(job_data))


def scrape_search_results(keyword, location, locality, page_number, data_pipeline=None, retries=3):
    """Load one Indeed results page through the proxy and queue its job cards.

    Makes up to ``retries + 1`` attempts, each in a fresh headless Chrome.

    Raises:
        Exception: when every attempt failed.
    """
    formatted_keyword = keyword.replace(" ", "+")
    formatted_location = locality.replace(" ", "+")
    url = f"https://www.indeed.com/jobs?q={formatted_keyword}&l={formatted_location}&start={page_number * 10}"
    tries = 0
    success = False

    while tries <= retries and not success:
        # Pre-bind so the finally clause is safe when Chrome fails to start.
        driver = None
        try:
            scrapeops_proxy_url = get_scrapeops_url(url, location=location)

            chrome_options = Options()
            chrome_options.add_argument("--headless")
            driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)

            driver.get(scrapeops_proxy_url)
            logger.info(f"Received page from: {url}")

            div_cards = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='slider_item']")
            for div_card in div_cards:
                name = div_card.find_element(By.TAG_NAME, "h2").text
                job_link = div_card.find_element(By.TAG_NAME, "a").get_attribute("href")
                job_key = _extract_job_key(job_link)
                if not job_key:
                    continue
                # Separate names: reusing ``url`` / ``location`` here would make
                # a retry request the last job page with a bogus country.
                job_url = f"https://www.indeed.com/viewjob?jk={job_key}"

                company_name = div_card.find_element(By.CSS_SELECTOR, "span[data-testid='company-name']").text

                rating = None
                rating_holder = div_card.find_elements(By.CSS_SELECTOR, "span[data-testid='holistic-rating']")
                if rating_holder:
                    rating = rating_holder[0].text

                job_location = div_card.find_element(By.CSS_SELECTOR, "div[data-testid='text-location']").text

                search_data = SearchData(
                    name=name,
                    url=job_url,
                    stars=rating,
                    company_name=company_name,
                    location=job_location
                )
                data_pipeline.add_data(search_data)

            logger.info(f"Successfully parsed data from: {url}")
            success = True
        except Exception as e:
            logger.error(f"An error occurred while processing page {url}: {e}")
            logger.info(f"Retrying request for page: {url}, retries left {retries-tries}")
            tries += 1
        finally:
            if driver is not None:
                driver.quit()

    if not success:
        raise Exception(f"Max Retries exceeded: {retries}")


def process_job(row, data_pipeline, retries=3):
    """Scrape one job page, write its per-job CSV and queue it in ``data_pipeline``.

    Args:
        row: Mapping with at least the keys "name" and "url".
        data_pipeline: Aggregate pipeline for job records; may be None.
        retries: Extra attempts after the first one.

    Raises:
        Exception: when every attempt failed.
    """
    url = row["url"]
    tries = 0
    success = False

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)

    # try/finally so the browser is closed on every exit path.
    try:
        while tries <= retries and not success:
            try:
                driver.get(url)
                logger.info(f"Status: {driver.title}")

                job_data = JobData(
                    name=row["name"],
                    salary=_first_text_by_id(driver, "salaryInfoAndJobContainer"),
                    description=_first_text_by_id(driver, "jobDescriptionText"),
                    benefits=_first_text_by_id(driver, "benefits")
                )

                _append_job_csv(row["name"], job_data)
                # The pipeline argument used to be ignored, so the aggregate
                # "<keyword>_jobs.csv" reported by main was never written.
                if data_pipeline is not None:
                    data_pipeline.add_data(job_data)

                success = True
            except Exception as e:
                logger.error(f"Exception thrown: {e}")
                logger.warning(f"Failed to process page: {row['url']}")
                logger.warning(f"Retries left: {retries-tries}")
                tries += 1
    finally:
        driver.quit()

    if not success:
        raise Exception(f"Max Retries exceeded: {retries}")
    logger.info(f"Successfully parsed: {row['url']}")


def process_results(csv_file, data_pipeline, retries=3):
    """Read ``csv_file`` and call process_job() for each row."""
    logger.info(f"Processing {csv_file}")
    # Written as UTF-8 by DataPipeline; read back with the same encoding.
    with open(csv_file, newline="", encoding="utf-8") as file:
        rows = list(csv.DictReader(file))
    for row in rows:
        process_job(row, data_pipeline, retries=retries)


if __name__ == "__main__":
    MAX_RETRIES = 3
    MAX_THREADS = 5
    PAGES = 1
    LOCATION = "us"
    LOCALITY = "Westland MI"

    logger.info("Crawl starting...")

    keyword_list = ["writer"]
    aggregate_files = []

    for keyword in keyword_list:
        filename = keyword.replace(" ", "-")

        crawl_pipeline = DataPipeline(csv_filename=f"{filename}.csv")
        # Own file for job details: sharing the crawl CSV would mix two
        # different column sets in one file.
        job_pipeline = DataPipeline(csv_filename=f"{filename}_jobs.csv")

        try:
            with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
                # A discarded executor.map() never surfaces worker exceptions;
                # collect the futures and log each failure instead.
                futures = {
                    executor.submit(
                        scrape_search_results, keyword, LOCATION, LOCALITY, page, crawl_pipeline, MAX_RETRIES
                    ): page
                    for page in range(PAGES)
                }
                for future in concurrent.futures.as_completed(futures):
                    error = future.exception()
                    if error is not None:
                        logger.error(f"Page {futures[future]} failed: {error}")
        finally:
            crawl_pipeline.close_pipeline()
        aggregate_files.append(f"{filename}.csv")

        try:
            process_results(f"{filename}.csv", job_pipeline, retries=MAX_RETRIES)
        finally:
            job_pipeline.close_pipeline()
        aggregate_files.append(f"{filename}_jobs.csv")

    logger.info(f"Done. Saved {len(aggregate_files)} files.")
ThreadPoolExecutor
. We'll do the same for process_results()
.Here is the final version of the process_results()
def process_results(csv_file, location, max_threads=5, retries=3):
    """Final version of the process_results() function.

    Reads ``csv_file`` and runs process_job() for every row on up to
    ``max_threads`` threads. A job that fails is logged and does not stop
    the remaining jobs.
    """
    logger.info(f"Processing {csv_file}")

    # The pipeline writes its CSV as UTF-8; read it back with the same
    # encoding instead of the platform default.
    with open(csv_file, newline="", encoding="utf-8") as file:
        rows = list(csv.DictReader(file))

    # Use ThreadPoolExecutor for concurrent processing of job data
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
        # submit() + as_completed() instead of a discarded executor.map():
        # map() only re-raises a worker's exception when its result iterator
        # is consumed, so failures used to disappear silently.
        futures = {
            executor.submit(process_job, row, location, retries): row
            for row in rows
        }
        for future in concurrent.futures.as_completed(futures):
            error = future.exception()
            if error is not None:
                logger.error(f"Failed to process job {futures[future].get('url')}: {error}")

    logger.info(f"Finished processing {csv_file}")
executor.map()
:process_job
can help us call on every available thread.get_scrapeops_url()
function; we just need to incorporate it into a single line to harness the power of the proxy:
scrapeops_proxy_url = get_scrapeops_url(url, location=location)
import os
import csv
import json
import logging
from urllib.parse import urlencode
from dataclasses import dataclass, field, fields, asdict
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import concurrent.futures
import time

API_KEY = ""

# The API key is read from a local config.json of the form {"api_key": "..."}.
with open("config.json", "r") as config_file:
    config = json.load(config_file)
    API_KEY = config["api_key"]


def get_scrapeops_url(url, location="us"):
    """Return the ScrapeOps proxy URL that fetches ``url`` from ``location``."""
    payload = {
        "api_key": API_KEY,
        "url": url,
        "country": location,
        "residential": True
    }
    proxy_url = "https://proxy.scrapeops.io/v1/?" + urlencode(payload)
    return proxy_url


# Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@dataclass
class SearchData:
    """One job card taken from a search-results page."""
    name: str = ""
    url: str = ""
    stars: float = None
    company_name: str = ""
    location: str = ""

    def __post_init__(self):
        self.check_string_fields()

    def check_string_fields(self):
        """Replace empty string fields with ``"No <field>"`` and strip the rest."""
        # Loop variable deliberately not called `field`, which would shadow
        # the name imported from dataclasses.
        for data_field in fields(self):
            value = getattr(self, data_field.name)
            if not isinstance(value, str):
                continue
            if value == "":
                setattr(self, data_field.name, f"No {data_field.name}")
                continue
            setattr(self, data_field.name, value.strip())


@dataclass
class JobData:
    """Details taken from a single job page."""
    name: str
    salary: str
    description: str
    benefits: str


class DataPipeline:
    """Buffers scraped items, drops duplicates by name and appends them to a CSV file."""

    def __init__(self, csv_filename="", storage_queue_limit=50):
        self.names_seen = []
        self.storage_queue = []
        self.storage_queue_limit = storage_queue_limit
        self.csv_filename = csv_filename
        self.csv_file_open = False

    def save_to_csv(self):
        """Flush the queued items to ``csv_filename``, writing the header once."""
        self.csv_file_open = True
        try:
            data_to_save = list(self.storage_queue)
            self.storage_queue.clear()
            if not data_to_save:
                return

            keys = [data_field.name for data_field in fields(data_to_save[0])]
            file_exists = os.path.isfile(self.csv_filename) and os.path.getsize(self.csv_filename) > 0
            with open(self.csv_filename, mode="a", newline="", encoding="utf-8") as output_file:
                writer = csv.DictWriter(output_file, fieldnames=keys)
                if not file_exists:
                    writer.writeheader()
                for item in data_to_save:
                    writer.writerow(asdict(item))
        finally:
            # Always clear the flag: add_data() refuses to flush while it is
            # set, so leaving it True after an early return or a failed write
            # would block every later save.
            self.csv_file_open = False

    def is_duplicate(self, input_data):
        """Return True (and log) when an item with the same name was already seen."""
        if input_data.name in self.names_seen:
            logger.warning(f"Duplicate item found: {input_data.name}. Item dropped.")
            return True
        self.names_seen.append(input_data.name)
        return False

    def add_data(self, scraped_data):
        """Queue a non-duplicate item and flush once the queue limit is reached."""
        if self.is_duplicate(scraped_data) == False:
            self.storage_queue.append(scraped_data)
            if len(self.storage_queue) >= self.storage_queue_limit and self.csv_file_open == False:
                self.save_to_csv()

    def close_pipeline(self):
        """Flush whatever is still queued; waits briefly if a save is in progress."""
        if self.csv_file_open:
            time.sleep(3)
        if len(self.storage_queue) > 0:
            self.save_to_csv()


def scrape_search_results(keyword, location, locality, page_number, data_pipeline=None, retries=3):
    """Scrape one page of search results and push each job card into ``data_pipeline``.

    Args:
        keyword: Search term.
        location: Country code passed to the proxy.
        locality: Place to search in (the ``l`` query parameter).
        page_number: Zero-based results page; multiplied by 10 for ``start``.
        data_pipeline: Object whose ``add_data`` receives each ``SearchData``.
        retries: Number of extra attempts after the first failure.

    Raises:
        Exception: when every attempt fails.
    """
    formatted_keyword = keyword.replace(" ", "+")
    formatted_location = locality.replace(" ", "+")
    url = f"https://www.indeed.com/jobs?q={formatted_keyword}&l={formatted_location}&start={page_number * 10}"
    tries = 0
    success = False

    while tries <= retries and not success:
        # Bound before the try so the finally clause cannot hit an unbound
        # name when the browser itself fails to start.
        driver = None
        try:
            scrapeops_proxy_url = get_scrapeops_url(url, location=location)

            # Set up Selenium WebDriver
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)
            driver.get(scrapeops_proxy_url)
            logger.info(f"Received page from: {url}")

            div_cards = driver.find_elements(By.CSS_SELECTOR, "div[data-testid='slider_item']")
            for div_card in div_cards:
                name = div_card.find_element(By.TAG_NAME, "h2").text
                job_link = div_card.find_element(By.TAG_NAME, "a").get_attribute("href")
                # get_attribute() returns None when the anchor has no href.
                if not job_link or "jk=" not in job_link:
                    continue
                job_key = job_link.split("jk=")[-1]
                if not job_key:
                    continue

                # Per-card values get their own names: reusing `url` and
                # `location` would make a retry request a job page through
                # the wrong proxy country instead of the search page.
                job_url = f"https://www.indeed.com/viewjob?jk={job_key}"
                company_name = div_card.find_element(By.CSS_SELECTOR, "span[data-testid='company-name']").text

                rating = None
                rating_holder = div_card.find_elements(By.CSS_SELECTOR, "span[data-testid='holistic-rating']")
                if rating_holder:
                    rating = rating_holder[0].text

                job_location = div_card.find_element(By.CSS_SELECTOR, "div[data-testid='text-location']").text

                search_data = SearchData(
                    name=name,
                    url=job_url,
                    stars=rating,
                    company_name=company_name,
                    location=job_location
                )
                data_pipeline.add_data(search_data)

            logger.info(f"Successfully parsed data from: {url}")
            success = True

        except Exception as e:
            logger.error(f"An error occurred while processing page {url}: {e}")
            logger.info(f"Retrying request for page: {url}, retries left {retries-tries}")
            tries += 1

        finally:
            if driver is not None:
                driver.quit()

    if not success:
        raise Exception(f"Max Retries exceeded: {retries}")


def _job_csv_filename(job_name):
    """Build the per-job CSV filename, neutralising path separators in the title."""
    safe_name = job_name.replace(" ", "_")
    # A title such as "Writer/Editor" would otherwise point into a directory
    # that does not exist and make every write attempt fail.
    for separator in filter(None, (os.sep, os.altsep)):
        safe_name = safe_name.replace(separator, "_")
    return f"{safe_name}.csv"


def process_job(row, retries=3):
    """Open the job page in ``row["url"]`` and append its details to a per-job CSV.

    Args:
        row: Mapping with at least ``"url"`` and ``"name"`` keys.
        retries: Number of extra attempts after the first failure.

    Raises:
        Exception: when every attempt fails.
    """
    url = row["url"]
    tries = 0
    success = False

    chrome_options = Options()
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=chrome_options)

    try:
        while tries <= retries and not success:
            try:
                driver.get(url)
                logger.info(f"Status: {driver.title}")

                salary = "n/a"
                try:
                    salary_holder = driver.find_elements(By.XPATH, "//div[@id='salaryInfoAndJobContainer']") or \
                        driver.find_elements(By.CSS_SELECTOR, "span[class*='salary']") or \
                        driver.find_elements(By.XPATH, "//div[contains(text(), 'Salary')]")
                    if salary_holder:
                        salary = salary_holder[0].text
                except NoSuchElementException:
                    logger.warning("Salary information not found.")

                description = "n/a"
                try:
                    description_holder = driver.find_element(By.ID, "jobDescriptionText")
                    if description_holder:
                        description = description_holder.text
                except NoSuchElementException:
                    logger.warning("Description not found.")

                benefits = "n/a"
                try:
                    benefits_holder = driver.find_elements(By.ID, "benefits") or \
                        driver.find_elements(By.CSS_SELECTOR, "div[data-testid='benefits-test']") or \
                        driver.find_elements(By.CSS_SELECTOR, "div.css-eynugf.eu4oa1w0")
                    if benefits_holder:
                        benefits = benefits_holder[0].text
                except NoSuchElementException:
                    logger.warning("Benefits information not found.")

                job_data = JobData(
                    name=row["name"],
                    salary=salary,
                    description=description,
                    benefits=benefits
                )

                # Save job data to a CSV file
                job_filename = _job_csv_filename(row["name"])
                keys = [data_field.name for data_field in fields(job_data)]
                file_exists = os.path.isfile(job_filename) and os.path.getsize(job_filename) > 0

                with open(job_filename, mode="a", newline="", encoding="utf-8") as output_file:
                    writer = csv.DictWriter(output_file, fieldnames=keys)
                    if not file_exists:
                        writer.writeheader()
                    writer.writerow(asdict(job_data))

                success = True

            except Exception as e:
                logger.error(f"Exception thrown: {e}")
                logger.warning(f"Failed to process page: {row['url']}")
                logger.warning(f"Retries left: {retries-tries}")
                tries += 1
    finally:
        # Release the browser even if something outside the inner handler
        # (e.g. a KeyboardInterrupt) unwinds the loop.
        driver.quit()

    if not success:
        raise Exception(f"Max Retries exceeded: {retries}")
    else:
        logger.info(f"Successfully parsed: {row['url']}")


def process_results(csv_file, retries=3, max_threads=None):
    """Run ``process_job`` on every row of ``csv_file`` using a thread pool.

    Args:
        csv_file: CSV written by ``DataPipeline``.
        retries: Retry budget forwarded to ``process_job``.
        max_threads: Maximum number of worker threads; ``None`` keeps the
            executor's default.

    Failures of individual jobs are logged and do not stop the others.
    """
    logger.info(f"Processing {csv_file}")

    # Read with the same encoding DataPipeline.save_to_csv() writes with.
    with open(csv_file, newline="", encoding="utf-8") as file:
        reader = list(csv.DictReader(file))

    # Use threading for processing job details
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
        futures = {
            executor.submit(process_job, row, retries): row
            for row in reader
        }
        for future in concurrent.futures.as_completed(futures):
            row = futures[future]
            try:
                future.result()
            except Exception as e:
                logger.error(f"Error processing job {row['name']}: {e}")


if __name__ == "__main__":
    MAX_RETRIES = 3
    MAX_THREADS = 5
    PAGES = 1
    LOCATION = "us"
    LOCALITY = "Westland MI"

    logger.info(f"Crawl starting...")

    # INPUT ---> List of keywords to scrape
    keyword_list = ["writer"]
    aggregate_files = []

    # Job Processes
    for keyword in keyword_list:
        filename = keyword.replace(" ", "_") + ".csv"
        pipeline = DataPipeline(csv_filename=filename)

        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
            future_threads = {
                executor.submit(scrape_search_results, keyword, LOCATION, LOCALITY, i, pipeline, MAX_RETRIES): i
                for i in range(PAGES)
            }
            # Collect the results so a page that exhausted its retries is
            # reported instead of being silently discarded.
            for future in concurrent.futures.as_completed(future_threads):
                page = future_threads[future]
                try:
                    future.result()
                except Exception as e:
                    logger.error(f"Failed to scrape page {page} for keyword '{keyword}': {e}")

        pipeline.close_pipeline()
        logger.info(f"Pipeline Closed...")
        aggregate_files.append(filename)

    for csv_file in aggregate_files:
        # Nothing is written when a crawl yields no results.
        if not os.path.isfile(csv_file):
            logger.warning(f"No search results saved for {csv_file}; skipping.")
            continue
        process_results(csv_file, retries=MAX_RETRIES, max_threads=MAX_THREADS)
if __name__ == "__main__":
    # Run configuration
    MAX_RETRIES = 3
    MAX_THREADS = 2
    PAGES = 1
    LOCATION = "us"
    LOCALITY = "Westland MI"

    logger.info(f"Crawl starting...")

    # INPUT ---> List of keywords to scrape
    keyword_list = ["writer"]
    aggregate_files = []

    # Crawl phase: one CSV per keyword, pages scraped one after another.
    for keyword in keyword_list:
        csv_name = "_".join(keyword.split(" ")) + ".csv"
        pipeline = DataPipeline(csv_filename=csv_name)

        page_number = 0
        while page_number < PAGES:
            scrape_search_results(
                keyword,
                LOCATION,
                LOCALITY,
                page_number,
                data_pipeline=pipeline,
                retries=MAX_RETRIES,
            )
            page_number += 1

        # Flush anything still queued before the file is read back.
        pipeline.close_pipeline()
        aggregate_files.append(csv_name)

    # Detail phase: process_results handles the rows of each crawl CSV.
    for crawl_csv in aggregate_files:
        process_results(crawl_csv, retries=MAX_RETRIES)
robots.txt
. It's important to note that most sites can suspend or even permanently ban you for violating their terms. On another note, when scraping the web, public data is generally fair game. If you don't have to log in to a site to view the data, it is public data. If your data is gated behind a login, it is generally considered private data. When working with private data, you often need to get permission from the site you're scraping, and you can be sued for accessing or disseminating private data. If you're unsure whether your scraper is legal, consult an attorney.