Python script to scrape Google search result pages (or just links)

15.02.2025 / 12.02.2026
Type: Scraper


Goal:
This parser lets you retrieve Google search results legally and without hassle, through the official API.
Characteristics:
  • Threading: Single-threaded
  • Type: Through the official API
  • Language: Python
  • Proxy: No proxy
  • Rotation: No user-agent rotation

Description

This parser is written in Python and collects Google search result pages and links using the official Custom Search JSON API.
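
For orientation, here is a minimal sketch of the underlying API request that the script wraps. The endpoint, the num/start paging parameters, and the items[].title/snippet/link fields are the ones used in main.py below; the KEY and CX values are placeholders you must replace with your own credentials.

import requests

KEY = 'YOUR_API_KEY'  # placeholder: your Google API key
CX = 'YOUR_CX_ID'     # placeholder: your Programmable Search Engine ID

# One page of 10 results; 'start' selects the page (1, 11, 21, ...).
response = requests.get(
    'https://www.googleapis.com/customsearch/v1',
    params={'key': KEY, 'cx': CX, 'q': 'python scraping', 'num': 10, 'start': 1},
)
for item in response.json().get('items', []):
    print(item['title'], item['link'], item['snippet'], sep='\n')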

Configuration file, config.json

{ "key": "AIzaSyDmt2BUl9gwkrw4iOaObCdosnQcjH4M9B4", "cx": "43ed4817eb4d8481a", "save_to": "exel", "title": true, "description": false, "url": true, "depth": 1 }

Script file, main.py

import json
import argparse
import os

import requests
import pandas
from urllib.parse import quote, unquote


def save_to_json(path, data):
    with open(path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=2, ensure_ascii=False)


def save_to_excel(path, data):
    # Writing .xlsx via pandas requires the openpyxl package.
    frame = pandas.DataFrame({'title': [], 'link': [], 'description': []})
    for indx, entry in enumerate(data):
        frame.at[indx, 'title'] = entry['title']
        frame.at[indx, 'link'] = entry['url']
        frame.at[indx, 'description'] = entry['description']
    frame.to_excel(path, index=False)


def serp_page_scrape(query: str, options: dict) -> list:
    # Read the raw API responses cached by serp_scrape_init() and keep
    # only the fields enabled in the config.
    data = []
    for i in range(options['depth']):
        try:
            with open(f'./data/temp/{query}_{i*10 + 1}-{i*10 + 10}.json', 'r', encoding='utf-8') as file:
                data_temp = json.load(file)
            for item in data_temp['items']:
                data.append({
                    'title': item['title'] if options['title'] else None,
                    'description': item['snippet'] if options['description'] else None,
                    'url': item['link'] if options['url'] else None,
                })
        except (OSError, KeyError, json.JSONDecodeError):
            # A page may be missing, e.g. the API returned fewer results.
            pass
    if options['save_to'] == 'json':
        save_to_json(f'./data/serp/{query}.json', data)
    else:
        save_to_excel(f'./data/serp/{query}.xlsx', data)
    return data


def serp_scrape_init(query: str, options: dict) -> None:
    # Fetch pages of 10 results and dump each raw response into ./data/temp/.
    print(f"Query: {unquote(query)},\n"
          f"Options: title={options['title']} | description={options['description']} | "
          f"urls={options['url']} | depth={options['depth']} | save to={options['save_to']}")
    for i in range(options['depth']):
        # `query` is already percent-encoded, so the URL is built by hand.
        response = requests.get(
            f'https://www.googleapis.com/customsearch/v1'
            f'?key={options["key"]}&cx={options["cx"]}&q={query}&num=10&start={i * 10 + 1}'
        )
        save_to_json(f'./data/temp/{query}_{i*10 + 1}-{i*10 + 10}.json', response.json())


def run():
    # This is only used when running as a standalone script.
    # Get the options and query from the CLI.
    parser = argparse.ArgumentParser(add_help=True)
    parser.add_argument('-q', type=str, help='Query to parse', metavar='QUERY', required=True, nargs='*')
    parser.add_argument('-C', type=str, help='Path to config, in json format', metavar='CONFIG_FILE', required=True, nargs=1)
    args = parser.parse_args()

    # Build the percent-encoded query string.
    raw_query = ' '.join(args.q)
    if not raw_query:
        return
    query = quote(raw_query)

    # Load the config and reject unknown keys.
    options = {'key': '', 'cx': '', 'save_to': '', 'title': '', 'description': '', 'url': '', 'depth': ''}
    with open(args.C[0], 'r') as file:
        data = json.load(file)
    for key in data:
        if key in options:
            options[key] = data[key]
        else:
            print(f'ERROR: Something went wrong in your config file, {key}')
            return False

    # The API only exposes the first 100 results, i.e. 10 pages of 10.
    if options['depth'] > 10:
        print('WARNING: Google Search API only makes 100 search results available')
        options['depth'] = 10

    # Make sure the cache and output directories exist.
    os.makedirs('./data/temp', exist_ok=True)
    os.makedirs('./data/serp', exist_ok=True)

    serp_scrape_init(query, options)
    serp_page_scrape(query, options)


if __name__ == "__main__":
    run()
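
Assuming the config above is saved as config.json next to the script, a run looks like this (query words go to -q, the config path to -C):

python main.py -q best python books -C ./config.json

The raw API responses are cached in ./data/temp/, and the final output lands in ./data/serp/ as <query>.json or <query>.xlsx, depending on save_to.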

Do not forget to share, like and leave a comment :)
