// FirstImage.java - 121 lines (109 loc), 3.75 KB
package test;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileAlreadyExistsException;
import java.nio.file.FileSystemException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
/**
 * Scrapes a small, fixed range of Apple support pages and saves the first
 * image found on each page that exists.
 *
 * <p>Pages are addressed as {@code https://support.apple.com/kb/SP<n>}; see
 * {@link #main(String[])} for the range of {@code n} that is tried.
 *
 * @author nilsstreedain
 *
 */
public class SmallAppleProductScraper {
    /** Address prefix of a product page; the product number is appended to it. */
    private static final String PRODUCT_PAGE_PREFIX = "https://support.apple.com/kb/SP";

    /** First product number that is tried (inclusive). */
    private static final int FIRST_PRODUCT_NUMBER = 500;

    /** Last product number that is tried (exclusive). */
    private static final int END_PRODUCT_NUMBER = 1000;

    /** Directory used by {@link #saveImageFromURL(URL, String)}. */
    private static final String DEFAULT_OUTPUT_DIRECTORY = "/Users/nilsstreedain/products2/";

    /** Longest prefix of a file name that is echoed when a save fails. */
    private static final int MAX_REPORTED_NAME_LENGTH = 100;

    /**
     * Loops over the possible product pages and, for every page that exists,
     * saves the first image on it. Exactly one status line is printed per page
     * (saved / already exists / invalid / no image / not found); other I/O
     * failures print a stack trace and the loop moves on to the next page.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        for (int i = FIRST_PRODUCT_NUMBER; i < END_PRODUCT_NUMBER; i++) {
            String pageAddress = PRODUCT_PAGE_PREFIX + i;
            try {
                // Calling the throwing helper directly lets a missing page be told
                // apart from a page without an image, which a null result alone cannot.
                URL imgURL = findFirstImage(new URL(pageAddress));
                if (imgURL != null) {
                    saveImageFromURL(imgURL, filenameFromURL(imgURL, i));
                } else {
                    System.out.println(i + " exists but there is no image.");
                }
            } catch (FileNotFoundException e) {
                System.out.println(pageAddress + " was not found.");
            } catch (IOException e) {
                // Also covers MalformedURLException from building the page URL.
                e.printStackTrace();
            }
        }
    }

    /**
     * Gets the URL of the first image on a webpage. The page is read line by
     * line as UTF-8 text; the first {@code <img} tag with a non-empty,
     * double-quoted {@code src} attribute on the same line wins. Relative
     * sources are resolved against {@code url}.
     *
     * <p>Failures are reported on the console rather than thrown: a missing page
     * prints "{@code <url> was not found.}", any other I/O error prints its stack
     * trace.
     *
     * @param url - URL of webpage to scrape
     * @return the image URL, or {@code null} if the page has no usable image or
     *         could not be read
     */
    public static URL getImageURL(URL url) {
        try {
            return findFirstImage(url);
        } catch (FileNotFoundException e) {
            System.out.println(url + " was not found.");
        } catch (IOException e) {
            e.printStackTrace();
        }
        // If no image is found, null is returned.
        return null;
    }

    /**
     * Saves image found at given URL under the default output directory and
     * names it filename. Equivalent to calling
     * {@link #saveImageFromURL(URL, String, Path)} with that directory.
     *
     * @param url      - URL to find image at
     * @param filename - Name to save image under
     */
    public static void saveImageFromURL(URL url, String filename) {
        saveImageFromURL(url, filename, Paths.get(DEFAULT_OUTPUT_DIRECTORY));
    }

    /**
     * Saves image found at given URL into {@code directory} and names it
     * filename. The directory is created if it does not exist yet. An existing
     * file is never overwritten. The outcome is printed to the console; no
     * I/O exception is propagated to the caller.
     *
     * @param url       - URL to find image at
     * @param filename  - Name to save image under
     * @param directory - Directory the image is written to
     */
    public static void saveImageFromURL(URL url, String filename, Path directory) {
        try {
            Files.createDirectories(directory);
            // try-with-resources closes the download stream even when the copy fails.
            try (InputStream stream = url.openStream()) {
                Files.copy(stream, directory.resolve(filename));
            }
            System.out.println(filename + " successfully saved.");
        } catch (FileAlreadyExistsException e) {
            System.out.println(filename + " already exists.");
        } catch (FileSystemException e) {
            // Only a prefix is echoed so an over-long name does not flood the console;
            // the bound check keeps short names from throwing here.
            int shownLength = Math.min(filename.length(), MAX_REPORTED_NAME_LENGTH);
            System.out.println(filename.substring(0, shownLength) + " is an invalid image.");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Builds the local filename for an image from the image's URL: the product
     * number, a space, and the last segment of the URL's path. Query string and
     * fragment are not part of the path and therefore not part of the name.
     *
     * @param url           - URL to parse for a filename
     * @param productNumber - Product number used as filename prefix
     * @return filename - The parsed filename
     */
    public static String filenameFromURL(URL url, int productNumber) {
        String path = url.getPath();
        // lastIndexOf yields -1 when there is no slash, so the whole path is kept.
        String lastSegment = path.substring(path.lastIndexOf('/') + 1);
        return productNumber + " " + lastSegment;
    }

    /**
     * Reads the page line by line and returns the first resolvable image URL,
     * or {@code null} when the page contains none.
     */
    private static URL findFirstImage(URL page) throws IOException {
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(page.openStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String source = extractImageSource(line);
                if (source == null) {
                    continue;
                }
                try {
                    // Absolute sources are used as-is; relative ones resolve against the page.
                    return new URL(page, source);
                } catch (MalformedURLException ignored) {
                    // Source uses a scheme java.net.URL cannot represent; try later images.
                }
            }
        }
        return null;
    }

    /**
     * Returns the double-quoted {@code src} value that follows the first
     * {@code <img} on the line, or {@code null} if there is none or it is empty.
     */
    private static String extractImageSource(String line) {
        int tagStart = line.indexOf("<img");
        if (tagStart < 0) {
            return null;
        }
        // Search from the tag onwards so a src of an earlier element is not picked up.
        int attributeStart = line.indexOf("src=", tagStart);
        if (attributeStart < 0) {
            return null;
        }
        int openQuote = line.indexOf('"', attributeStart);
        if (openQuote < 0) {
            return null;
        }
        int closeQuote = line.indexOf('"', openQuote + 1);
        if (closeQuote < 0) {
            return null;
        }
        String source = line.substring(openQuote + 1, closeQuote);
        return source.isEmpty() ? null : source;
    }
}