Have you ever needed to analyze a competitor to sharpen your own competitive edge? Where do you even start? One good starting point is web scraping! Below is a hypothetical code example that uses our [web scraping API](https://proxyscrape.com/products/web-scraping-api/) to analyze a competitor's landing pages.

```python
import os
import time
import json
import aiohttp
import asyncio
from typing import List, Optional
import base64
import requests
from bs4 import BeautifulSoup

# Replace with your ProxyScrape API key
api_key = "Your API key"


# Function to handle relative URLs by adding the base URL
def handle_relative_url(base_url, url):
    if url.startswith("http"):
        return url
    else:
        return base_url + url


async def send_request_ASYNC(data) -> Optional[dict]:
    webscraping_api_endpoint = "https://api.proxyscrape.com/v3/accounts/freebies/scraperapi/request"
    headers = {
        "Content-Type": "application/json",
        "X-Api-Key": api_key
    }
    retries = 3
    for i in range(retries):
        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(webscraping_api_endpoint, headers=headers, json=data[0]) as response:
                    if response.status == 200:
                        print(
                            'URL:', data[0]['url'],
                            'STATUS:', response.status,
                        )
                        response_output = {
                            'json': await response.json(),
                            'status_code': response.status,
                            'index': data[1],
                            'url': data[0]['url']
                        }
                        return response_output
                    else:
                        print(f"Failed with status code {response.status}, retrying...")
        except aiohttp.ClientError as e:
            print(f"Request failed: {e}, attempt {i + 1}/{retries}")
            if i < retries - 1:
                await asyncio.sleep(2 ** i)  # exponential back-off
            else:
                raise
    return None


def send_request(url, data, headers):
    retries = 3
    for i in range(retries):
        try:
            response = requests.post(url, headers=headers, json=data)
            if response.status_code == 200:
                return response
            else:
                print(f"Failed with status code {response.status_code}, retrying...")
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}, attempt {i + 1}/{retries}")
            if i < retries - 1:
                time.sleep(2 ** i)  # exponential back-off
            else:
                raise
    return None


# Function to scrape a single page using the ProxyScrape API
def scrape_page(url):
    print("Scraping and collecting urls from:", url)
    data = {
        "url": url,
        "httpResponseBody": True  # raw HTML is enough here; use browserHtml instead if JavaScript rendering is needed
    }
    headers = {
        "Content-Type": "application/json",
        "X-Api-Key": api_key
    }
    response = send_request(
        "https://api.proxyscrape.com/v3/accounts/freebies/scraperapi/request",
        data,
        headers
    )
    if response is not None and response.status_code == 200:
        json_response = response.json()
        # Base64-decode json_response["data"]["httpResponseBody"]
        html = base64.b64decode(json_response["data"]["httpResponseBody"]).decode("utf-8")

        # Parse HTML content using BeautifulSoup
        soup = BeautifulSoup(html, "html.parser")

        # Extract URLs from the page (handling relative URLs)
        base_url = "https://iproyal.com"  # Base URL for iproyal
        urls_elements = soup.select(
            "a[class=\"group flex items-center gap-4 mt-auto astro-zgb246lv outlined-button\"]"
        )
        for url_element in urls_elements:
            yield handle_relative_url(base_url, url_element["href"])

        # Check for "Next" page button (handling relative URL)
        try:
            next_page_button = soup.find(
                "a",
                class_="outlined-button w-full text-size-[14px] px-8 sm:px-16 pagination-link",
                string="Next",
            )
            if next_page_button:
                yield handle_relative_url(base_url, next_page_button["href"])
        except AttributeError:
            pass
    else:
        status = response.status_code if response is not None else "no response"
        print("Error:", status)


# Function to extract data from a specific landing page and save it to JSON
def extract_and_save_data(response: Optional[dict]):
    if response is None:
        print("Error: no response received")
        return
    if response['status_code'] == 200:
        html = response['json']["data"]["browserHtml"]

        # Parse HTML content using BeautifulSoup
        soup = BeautifulSoup(html, "html.parser")

        meta_title = soup.find("meta", property="og:title")
        if meta_title:
            meta_title = meta_title["content"]
        else:
            meta_title = ""
        print(f"Meta title: {meta_title}")

        meta_description = soup.find("meta", property="og:description")
        if meta_description:
            meta_description = meta_description["content"]
        else:
            meta_description = ""
        print(f"Meta description: {meta_description}")

        brand = soup.find("a", class_="!text-brand-400")
        if brand:
            brand = brand.text
        else:
            brand = ""
        print(f"Brand: {brand}")

        h1 = soup.find("h1", class_="tp-headline-m lg:tp-headline-l lg:max-w-[600px]")
        if h1:
            h1 = h1.text
        else:
            h1 = ""
        print(f"H1: {h1}")

        jumbotron_text = soup.find("div", class_="lg:tp-subheadline")
        if jumbotron_text:
            jumbotron_text = jumbotron_text.text
        else:
            jumbotron_text = ""
        print(f"Jumbotron text: {jumbotron_text}")

        jumbotron_image = soup.find("img", class_="w-full h-full hidden lg:block")
        if not jumbotron_image:
            jumbotron_image = soup.find("img", class_="w-full h-full object-contain")
        if jumbotron_image:
            jumbotron_image = jumbotron_image['src']
        else:
            jumbotron_image = ""
        print(f"Jumbotron image: {jumbotron_image}")

        most_imports_question_element = soup.find(
            "fieldset",
            class_="flex flex-col gap-16 w-full lg:max-w-[522px] flex-shrink-0 astro-jipml36k"
        )
        if most_imports_question_element:
            most_imports_questions = most_imports_question_element.find_all("label")
            most_import_ques = [question.text for question in most_imports_questions]
        else:
            most_import_ques = []
        print(most_import_ques)

        most_imports_answer_element = soup.find("div", class_="contents astro-jipml36k")
        if most_imports_answer_element:
            most_imports_answers = most_imports_answer_element.find_all("section")
            most_import_ans = [answer.text.strip() for answer in most_imports_answers]
        else:
            most_import_ans = []
        print(most_import_ans)

        faq_question_elements = soup.find_all(
            "summary",
            class_="group-[&:not(:first-child)]:pt-8 pb-22 tp-headline-s flex justify-between appearance-none cursor-pointer astro-vqbstbga"
        )
        faq_ques = [question.text for question in faq_question_elements]
        print(faq_ques)

        faq_answer_elements = soup.find_all("div", class_="pb-16 astro-vqbstbga")
        faq_ans = [answer.text.strip() for answer in faq_answer_elements]
        print(faq_ans)

        json_data = {}
        json_data['meta_title'] = meta_title
        json_data['meta_description'] = meta_description
        json_data['url'] = response['url']
        json_data['h1'] = h1
        json_data['jumbotron_text'] = jumbotron_text
        json_data['jumbotron_image'] = jumbotron_image

        most_import_json = []
        for title, description in zip(most_import_ques, most_import_ans):
            most_import_json.append({"title": title, "description": description})

        faq_json = []
        for question, answer in zip(faq_ques, faq_ans):
            faq_json.append({"question": question, "answer": answer})

        json_data['most_import'] = most_import_json
        json_data['faq'] = faq_json

        data = {}
        data[f'page{response["index"]}'] = json_data

        # Create a directory for JSON files if it doesn't exist
        os.makedirs("data", exist_ok=True)

        # Save data to a JSON file
        with open(f"data/page_{response['index']}.json", "w") as f:
            json.dump(data, f, indent=4)
    else:
        print("Error:", response['status_code'])


async def get_page_contents(properties) -> List[Optional[dict]]:
    tasks = [send_request_ASYNC(p) for p in properties]
    responses = await asyncio.gather(*tasks)
    return responses


if __name__ == '__main__':
    # Starting URL
    start_url = "https://iproyal.com/other-proxies/"
    scraped_urls = [start_url]
    pagination_url = start_url
    index = 1

    while pagination_url != '':
        urls_to_scrape_inside_page = []
        for extracted_url in scrape_page(pagination_url):
            if extracted_url not in scraped_urls:
                scraped_urls.append(extracted_url)  # WE USE THIS LIST TO MAKE SURE WE DON'T SCRAPE DUPLICATES
                # Check if it's a landing page and extract data
                if "/other-proxies/" in extracted_url:
                    if 'iproyal.com/other-proxies/?page=' in extracted_url:
                        pagination_url = extracted_url
                    else:
                        pagination_url = ''
                        PROXYSCRAPE_webscraping_api_config: dict = {
                            "url": extracted_url,
                            "browserHtml": True
                        }
                        urls_to_scrape_inside_page.append([PROXYSCRAPE_webscraping_api_config, index])
                        index += 1

        print('Extracting content of each link')
        content = asyncio.run(get_page_contents(urls_to_scrape_inside_page))

        print("Writing output files")
        for c in content:
            extract_and_save_data(c)

    print("Scraping and data extraction completed!")
```
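
The script above only collects the data; the actual competitor analysis happens once you read the saved files back. Below is a minimal, hypothetical follow-up sketch that loads the `data/page_<index>.json` files written by `extract_and_save_data` and prints a quick side-by-side summary. The function name `summarize_scraped_pages` and the summary format are our own illustration, not part of the API; the JSON keys match what the scraper writes above.

```python
import glob
import json
import os


def summarize_scraped_pages(data_dir: str = "data") -> None:
    """Load every page_*.json file written by the scraper and print a short summary."""
    for path in sorted(glob.glob(os.path.join(data_dir, "page_*.json"))):
        with open(path) as f:
            page = json.load(f)
        # Each file holds a single "pageN" key, as written by extract_and_save_data
        for page_key, details in page.items():
            print(f"{page_key}: {details['url']}")
            print(f"  meta title: {details['meta_title']}")
            print(f"  H1:         {details['h1']}")
            print(f"  FAQ items:  {len(details['faq'])}")


if __name__ == "__main__":
    summarize_scraped_pages()
```

Both scripts assume `aiohttp`, `requests`, and `beautifulsoup4` are installed and that `api_key` is set to your own ProxyScrape API key; from there you can extend the summary into whatever comparison (meta titles, H1s, FAQ coverage) matters for your analysis.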