Skip to content

Commit 1b83024

Browse files
committed
Images From Webpage Script Added
1 parent fba0b5c commit 1b83024

File tree

3 files changed

+174
-0
lines changed

3 files changed

+174
-0
lines changed

Images From Webpage/README.md

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Download Images From Webpage
2+
[![forthebadge](https://forthebadge.com/images/badges/built-with-grammas-recipe.svg)](https://forthebadge.com)
3+
[![forthebadge](https://forthebadge.com/images/badges/built-with-love.svg)](https://forthebadge.com)
4+
[![forthebadge](https://forthebadge.com/images/badges/made-with-python.svg)](https://forthebadge.com)
5+
[![forthebadge](https://forthebadge.com/images/badges/powered-by-water.svg)](https://forthebadge.com)
6+
7+
This Python script downloads all of the images found on a webpage.
8+
9+
What does the program do?
10+
- Takes a single URL as input at a time
11+
- Gets all the images
12+
- Downloads the file(s) in the directory the script runs
13+
14+
### Requirements
15+
>Python3.6+
16+
>
17+
> pip install -r requirements.txt
18+
19+
20+
### Usage
21+
```
22+
get_links.py [-h] -u URL
23+
24+
Download Images From Webpage
25+
26+
optional arguments:
27+
-h, --help show this help message and exit
28+
-u URL URL to extract links from
29+
```
30+
31+
### Contribution
32+
Any kind of contribution is welcome
33+
1. Fork the project
34+
2. Commit your changes
35+
3. Open a pull request
36+
37+

Images From Webpage/get_img.py

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
import argparse
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
8+
class ImageDownloader:
    """
    Download every image found on a webpage.

    Collecting the image URLs and downloading them both happen during
    construction.

    :param scrap_url: Valid URL of the webpage to scrape.
    """

    def __init__(self, scrap_url):
        # Page to scrape and the absolute image URLs collected from it.
        self.url = scrap_url
        self.image_urls = []
        self.get_images()
        self.download()

    def download(self):
        """Download each collected image into the current directory."""
        for url in self.image_urls:

            try:
                # Stream the body so a large image is written chunk by
                # chunk instead of being held in memory at once.
                response = requests.get(url, stream=True, timeout=5)
            except requests.exceptions.RequestException as err:
                # One broken image should not abort the remaining downloads.
                print(f"Could not download {url}: {err}")
                continue

            # Total size for the progress bar (0 when the header is absent).
            file_size = int(response.headers.get("Content-Length", 0))

            # Derive the file name from the URL path, ignoring any query string.
            filename = urlparse(url).path.split("/")[-1] or "image"

            # Manual-update progress bar: the original wrapped iter_content()
            # with tqdm AND called update(len(data)), double-counting progress.
            # It also never showed the real file name in the description.
            progress = tqdm(desc=f"Downloading {filename}", total=file_size,
                            unit="B", unit_scale=True, unit_divisor=1024)

            # Write the response to disk in 1 KiB chunks.
            with open(filename, "wb") as f:
                for data in response.iter_content(1024):
                    f.write(data)
                    progress.update(len(data))
            progress.close()

    def get_images(self):
        """Collect the absolute URLs of all <img> tags on the page."""
        try:
            # Fetch the page with a 5 second timeout.
            resp = requests.get(self.url, timeout=5)

            # Only parse a successful response.
            if resp.status_code == 200:
                soup = BeautifulSoup(resp.content, "html.parser")

                for img in soup.find_all('img'):
                    # Skip <img> tags with no src attribute — the original
                    # indexed attrs['src'] directly and raised KeyError.
                    src = img.attrs.get('src')
                    if not src:
                        continue

                    # urljoin resolves relative paths, protocol-relative
                    # URLs and absolute URLs against the page URL, keeping
                    # the original scheme and any query string — the
                    # original hand-built "https://{domain}{path}" URLs,
                    # which broke http-only sites and dropped query strings.
                    self.image_urls.append(urljoin(self.url, src))

            # A non-200 status code: report it and collect nothing.
            else:
                print(f"Status Code: {resp.status_code}, For URL: {self.url}")

        # Site could not be reached at all.
        except requests.exceptions.ConnectionError:
            print(f"No Response From URL: {self.url}")

        # Any other failure (bad URL, parse error, ...): report and continue.
        except Exception as err:
            print(f"Exception Raised: {err}")
110+
111+
112+
def main():
    """Parse the command line and launch the image download."""
    parser = argparse.ArgumentParser(description='Download Images From Webpage')

    # URL of the webpage whose images should be downloaded.
    parser.add_argument('-u', dest='url', type=str, help='URL to extract links from', required=True)

    args = parser.parse_args()

    url = args.url

    # Prepend a scheme when the URL lacks one.  The original tested
    # startswith("http") and (redundantly) startswith("https"), so a
    # host such as "httpbin.org" slipped through with no scheme at all
    # and requests rejected it; checking the full prefixes fixes that.
    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    # Scrape and download the images.
    ImageDownloader(url)


if __name__ == '__main__':
    main()

Images From Webpage/requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
requests==2.25.1
2+
tqdm==4.55.1
3+
beautifulsoup4==4.9.3

0 commit comments

Comments
 (0)