Skip to content

Download images from google query #4853

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Oct 16, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 99 additions & 0 deletions web_programming/download_images_from_google_query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import json
import os
import re
import sys
import urllib.request

import requests
from bs4 import BeautifulSoup

headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
" (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}


def download_images_from_google_query(query: str = "dhaka", max_images: int = 5) -> int:
"""Searches google using the provided query term and downloads the images in a folder.

Args:
query : The image search term to be provided by the user. Defaults to
"dhaka".
image_numbers : [description]. Defaults to 5.

Returns:
The number of images successfully downloaded.

>>> download_images_from_google_query()
5
>>> download_images_from_google_query("potato")
5
"""
max_images = min(max_images, 50) # Prevent abuse!
params = {
"q": query,
"tbm": "isch",
"hl": "en",
"ijn": "0",
}

html = requests.get("https://www.google.com/search", params=params, headers=headers)
soup = BeautifulSoup(html.text, "html.parser")
matched_images_data = "".join(
re.findall(r"AF_initDataCallback\(([^<]+)\);", str(soup.select("script")))
)

matched_images_data_fix = json.dumps(matched_images_data)
matched_images_data_json = json.loads(matched_images_data_fix)

matched_google_image_data = re.findall(
r"\[\"GRID_STATE0\",null,\[\[1,\[0,\".*?\",(.*),\"All\",",
matched_images_data_json,
)
if not matched_google_image_data:
return 0

removed_matched_google_images_thumbnails = re.sub(
r"\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]",
"",
str(matched_google_image_data),
)

matched_google_full_resolution_images = re.findall(
r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
removed_matched_google_images_thumbnails,
)
for index, fixed_full_res_image in enumerate(matched_google_full_resolution_images):
if index >= max_images:
return index
original_size_img_not_fixed = bytes(fixed_full_res_image, "ascii").decode(
"unicode-escape"
)
original_size_img = bytes(original_size_img_not_fixed, "ascii").decode(
"unicode-escape"
)
opener = urllib.request.build_opener()
opener.addheaders = [
(
"User-Agent",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
" (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582",
)
]
urllib.request.install_opener(opener)
path_name = f"query_{query.replace(' ', '_')}"
if not os.path.exists(path_name):
os.makedirs(path_name)
urllib.request.urlretrieve(
original_size_img, f"{path_name}/original_size_img_{index}.jpg"
)
return index


if __name__ == "__main__":
try:
image_count = download_images_from_google_query(sys.argv[1])
print(f"{image_count} images were downloaded to disk.")
except IndexError:
print("Please provide a search term.")
raise