Have you ever needed to analyze a competitor to sharpen your own competitive edge? Where do you even start? One good starting point is web scraping! Below is a hypothetical code example that uses our [web scraping API](https://proxyscrape.com/products/web-scraping-api/) to analyze a competitor's landing pages.

```python
import os
import time
import json
import aiohttp
import asyncio
from typing import List, Optional
import base64
import requests
from bs4 import BeautifulSoup

# Replace with your ProxyScrape API key
api_key = "Your API key"


# Function to handle relative URLs by adding the base URL
def handle_relative_url(base_url, url):
    if url.startswith("http"):
        return url
    else:
        return base_url + url


async def send_request_ASYNC(data) -> Optional[dict]:
    webscraping_api_endpoint = "https://api.proxyscrape.com/v3/accounts/freebies/scraperapi/request"
    headers = {
        "Content-Type": "application/json",
        "X-Api-Key": api_key
    }
    retries = 3
    for i in range(retries):
        try:
            async with aiohttp.ClientSession() as session:
                async with session.post(webscraping_api_endpoint, headers=headers, json=data[0]) as response:
                    if response.status == 200:
                        print(
                            'URL:', data[0]['url'],
                            'STATUS:', response.status,
                        )
                        response_output = {
                            'json': await response.json(),
                            'status_code': response.status,
                            'index': data[1],
                            'url': data[0]['url']
                        }
                        return response_output
                    else:
                        print(f"Failed with status code {response.status}, retrying...")
        except aiohttp.ClientError as e:
            print(f"Request failed: {e}, attempt {i + 1}/{retries}")
            if i < retries - 1:
                await asyncio.sleep(2 ** i)  # exponential back-off
            else:
                raise
    return None


def send_request(url, data, headers):
    retries = 3
    for i in range(retries):
        try:
            response = requests.post(url, headers=headers, json=data)
            if response.status_code == 200:
                return response
            else:
                print(f"Failed with status code {response.status_code}, retrying...")
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}, attempt {i + 1}/{retries}")
            if i < retries - 1:
                time.sleep(2 ** i)  # exponential back-off
            else:
                raise
    return None


# Function to scrape a single page using the ProxyScrape API
def scrape_page(url):
    print("Scraping and collecting urls from:", url)
    data = {
        "url": url,
        "httpResponseBody": True  # raw HTML is enough here; use browserHtml instead if JavaScript rendering is needed
    }
    headers = {
        "Content-Type": "application/json",
        "X-Api-Key": api_key
    }
    response = send_request(
        "https://api.proxyscrape.com/v3/accounts/freebies/scraperapi/request",
        data,
        headers
    )
    if response is not None and response.status_code == 200:
        json_response = response.json()
        # Base64-decode json_response["data"]["httpResponseBody"]
        html = base64.b64decode(json_response["data"]["httpResponseBody"]).decode("utf-8")

        # Parse HTML content using BeautifulSoup
        soup = BeautifulSoup(html, "html.parser")

        # Extract URLs from the page (handling relative URLs)
        base_url = "https://iproyal.com"  # Base URL for iproyal
        urls_elements = soup.select(
            "a[class=\"group flex items-center gap-4 mt-auto astro-zgb246lv outlined-button\"]"
        )
        for url_element in urls_elements:
            yield handle_relative_url(base_url, url_element["href"])

        # Check for "Next" page button (handling relative URL)
        try:
            next_page_button = soup.find(
                "a",
                class_="outlined-button w-full text-size-[14px] px-8 sm:px-16 pagination-link",
                string="Next",
            )
            if next_page_button:
                yield handle_relative_url(base_url, next_page_button["href"])
        except AttributeError:
            pass
    else:
        status = response.status_code if response is not None else "no response"
        print("Error:", status)


# Function to extract data from a specific landing page and save it to JSON
def extract_and_save_data(response: Optional[dict]):
    if response is None:
        print("Error: no response received")
        return
    if response['status_code'] == 200:
        html = response['json']["data"]["browserHtml"]

        # Parse HTML content using BeautifulSoup
        soup = BeautifulSoup(html, "html.parser")

        meta_title = soup.find("meta", property="og:title")
        if meta_title:
            meta_title = meta_title["content"]
        else:
            meta_title = ""
        print(f"Meta title: {meta_title}")

        meta_description = soup.find("meta", property="og:description")
        if meta_description:
            meta_description = meta_description["content"]
        else:
            meta_description = ""
        print(f"Meta description: {meta_description}")

        brand = soup.find("a", class_="!text-brand-400")
        if brand:
            brand = brand.text
        else:
            brand = ""
        print(f"Brand: {brand}")

        h1 = soup.find("h1", class_="tp-headline-m lg:tp-headline-l lg:max-w-[600px]")
        if h1:
            h1 = h1.text
        else:
            h1 = ""
        print(f"H1: {h1}")

        jumbotron_text = soup.find("div", class_="lg:tp-subheadline")
        if jumbotron_text:
            jumbotron_text = jumbotron_text.text
        else:
            jumbotron_text = ""
        print(f"Jumbotron text: {jumbotron_text}")

        jumbotron_image = soup.find("img", class_="w-full h-full hidden lg:block")
        if not jumbotron_image:
            jumbotron_image = soup.find("img", class_="w-full h-full object-contain")
        if jumbotron_image:
            jumbotron_image = jumbotron_image['src']
        else:
            jumbotron_image = ""
        print(f"Jumbotron image: {jumbotron_image}")

        most_imports_question_element = soup.find(
            "fieldset",
            class_="flex flex-col gap-16 w-full lg:max-w-[522px] flex-shrink-0 astro-jipml36k"
        )
        if most_imports_question_element:
            most_imports_questions = most_imports_question_element.find_all("label")
            most_import_ques = [question.text for question in most_imports_questions]
        else:
            most_import_ques = []
        print(most_import_ques)

        most_imports_answer_element = soup.find("div", class_="contents astro-jipml36k")
        if most_imports_answer_element:
            most_imports_answers = most_imports_answer_element.find_all("section")
            most_import_ans = [answer.text.strip() for answer in most_imports_answers]
        else:
            most_import_ans = []
        print(most_import_ans)

        faq_question_elements = soup.find_all(
            "summary",
            class_="group-[&:not(:first-child)]:pt-8 pb-22 tp-headline-s flex justify-between appearance-none cursor-pointer astro-vqbstbga"
        )
        faq_ques = [question.text for question in faq_question_elements]
        print(faq_ques)

        faq_answer_elements = soup.find_all("div", class_="pb-16 astro-vqbstbga")
        faq_ans = [answer.text.strip() for answer in faq_answer_elements]
        print(faq_ans)

        json_data = {}
        json_data['meta_title'] = meta_title
        json_data['meta_description'] = meta_description
        json_data['url'] = response['url']
        json_data['h1'] = h1
        json_data['jumbotron_text'] = jumbotron_text
        json_data['jumbotron_image'] = jumbotron_image

        most_import_json = []
        for title, description in zip(most_import_ques, most_import_ans):
            most_import_json.append({"title": title, "description": description})

        faq_json = []
        for question, answer in zip(faq_ques, faq_ans):
            faq_json.append({"question": question, "answer": answer})

        json_data['most_import'] = most_import_json
        json_data['faq'] = faq_json

        data = {}
        data[f'page{response["index"]}'] = json_data

        # Create a directory for JSON files if it doesn't exist
        os.makedirs("data", exist_ok=True)

        # Save data to a JSON file
        with open(f"data/page_{response['index']}.json", "w") as f:
            json.dump(data, f, indent=4)
    else:
        print("Error:", response['status_code'])


async def get_page_contents(properties) -> List[Optional[dict]]:
    tasks = [send_request_ASYNC(p) for p in properties]
    responses = await asyncio.gather(*tasks)
    return responses


if __name__ == '__main__':
    # Starting URL
    start_url = "https://iproyal.com/other-proxies/"
    scraped_urls = [start_url]
    pagination_url = start_url
    index = 1

    while pagination_url != '':
        urls_to_scrape_inside_page = []
        for extracted_url in scrape_page(pagination_url):
            if extracted_url not in scraped_urls:
                scraped_urls.append(extracted_url)  # WE USE THIS LIST TO MAKE SURE WE DON'T SCRAPE DUPLICATES
                # Check if it's a landing page and extract data
                if "/other-proxies/" in extracted_url:
                    if 'iproyal.com/other-proxies/?page=' in extracted_url:
                        pagination_url = extracted_url
                    else:
                        pagination_url = ''
                        PROXYSCRAPE_webscraping_api_config: dict = {
                            "url": extracted_url,
                            "browserHtml": True
                        }
                        urls_to_scrape_inside_page.append([PROXYSCRAPE_webscraping_api_config, index])
                        index += 1

        print('Extracting content of each link')
        content = asyncio.run(get_page_contents(urls_to_scrape_inside_page))

        print("Writing output files")
        for c in content:
            extract_and_save_data(c)

    print("Scraping and data extraction completed!")
```
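
The script above only collects the data; the actual competitor analysis happens once you read the saved files back. Below is a minimal, hypothetical follow-up sketch that loads the `data/page_<index>.json` files written by `extract_and_save_data` and prints a quick side-by-side summary. The function name `summarize_scraped_pages` and the summary format are our own illustration, not part of the API; the JSON keys match what the scraper writes above.

```python
import glob
import json
import os


def summarize_scraped_pages(data_dir: str = "data") -> None:
    """Load every page_*.json file written by the scraper and print a short summary."""
    for path in sorted(glob.glob(os.path.join(data_dir, "page_*.json"))):
        with open(path) as f:
            page = json.load(f)
        # Each file holds a single "pageN" key, as written by extract_and_save_data
        for page_key, details in page.items():
            print(f"{page_key}: {details['url']}")
            print(f"  meta title: {details['meta_title']}")
            print(f"  H1:         {details['h1']}")
            print(f"  FAQ items:  {len(details['faq'])}")


if __name__ == "__main__":
    summarize_scraped_pages()
```

Both scripts assume `aiohttp`, `requests`, and `beautifulsoup4` are installed and that `api_key` is set to your own ProxyScrape API key; from there you can extend the summary into whatever comparison (meta titles, H1s, FAQ coverage) matters for your analysis.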