Professional Profile
Picture
Joel P. Schroeder

Course Description:

CS  145 - Introduction to Object-Oriented Programming Using Java

A general introduction to computer data representation, programming, and the design of computer software. Object-oriented design and implementation techniques and concepts are introduced.

Program 3:  Webcrawler

A web crawler (a.k.a. robot or spider) is a program that automatically traverses the Web's hypertext structure by retrieving a document, recursively retrieving all documents that are referenced. Web crawlers can be used for a number of purposes, including indexing (i.e. google, yahoo, etc), HTML validation, link validation, “what’s new” monitoring, and mirroring. For this assignment, you will be building a basic web crawler that performs link validation (i.e. looking for broken links).

Program Structure

All your code must go in a class called WebCrawler. This class must contain the following functions. Note that you may not change these functions in any way. 

public static void main(String[ ] args)
public static URL getStartingURLFromUser( )
public static String htmlReader(URL webURL)
public static ArrayList<String> linkParser(String htmlContents)
public static boolean isBrokenLink(URL baseURL, String theHREF)
public static void displayBrokenLinkReport(ArrayList<ArrayList<String>>brokenLinks)


//This web crawler was written by Joel Schroeder.  11/18/2010

Illustrated below are a few screen shots of this program.



package edu.uwec.cs.schroejp.webcrawler;
//http://www.cs.uwec.edu/~stevende/cs145testpages/default.htm
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import javax.swing.JOptionPane;

public class WebCrawler {
            ArrayList<String> pagesVisited;
            ArrayList<String> pagesToVisit;

            public static URL getStartingURLFromUser() {
                        // This method gets the URL from user.
                        // Initializes the URL string.
                        URL startingURL = null;
//                     String startingString = "http://www.cs.uwec.edu/~stevende/cs145testpages/default.htm";
                         String startingString = JOptionPane.showInputDialog(null, "Enter a valid URL:");
                        startingString.toLowerCase();
                        // catches errors in URL.
                        try {
                                    startingURL = new URL(startingString);
                        } catch (MalformedURLException e) {
                                    System.out.println("Bad URL");
                                    System.exit(0);
                        }
                        return startingURL;
            }

            public static String htmlReader(URL webURL) {
                        // Initializes the strings.
                        String htmlContent = null;
                        // Reading the file.
                        // This try/catch block reads in a file with a reader and creates a
                        // continuous string.
                        try {
                                    URLConnection con = webURL.openConnection();
                                    InputStream is = con.getInputStream();
                                    BufferedReader br = new BufferedReader(new InputStreamReader(is));
                                    String line = br.readLine();
                                    while (line != null) {
                                                htmlContent = htmlContent + line + " ";
                                                line = br.readLine();
                                    }
                                    br.close();
                        } catch (MalformedURLException e) {
                                    System.out.println("Bad URL");
                                    System.exit(0);
                        } catch (FileNotFoundException e) {
                                    e.printStackTrace();
                                    System.out.println("File not found.");
                        } catch (IOException e) {
                                    e.printStackTrace();
                                    System.out.println("after try/catch");
                        }
                        return htmlContent;
            }

            public static ArrayList<String> linkParser(String htmlContents) {
                        // Initializes the strings, boolean, variables and arraylist.
                        ArrayList<String> listOfLinks = new ArrayList<String>();
                        int i = 0;
                        int k = 0;
                        boolean search = true;
                        String URL = "";
                        // This loop creates an array list of URL's.
                        while (search == true) {
                                    i = htmlContents.indexOf("HREF=", k) + 6;
                                    k = htmlContents.indexOf("\">", i);
                                    URL = htmlContents.substring(i, k);
                                    listOfLinks.add(URL);
                                    if (i == htmlContents.lastIndexOf("HREF=") + 6) {
                                                search = false;
                                    }
                        }
                        return listOfLinks;
                        // System.out.println(htmlContents);
            }
            public static boolean isBrokenLink(URL currentURL, String theHREF) {
                        // Initializes the strings, boolean, variables and arraylist.
                        Boolean isbroken = false;
                        try {
                                    URL baseURL = new URL(currentURL, theHREF);
                                    URLConnection con = currentURL.openConnection();
                                    HttpURLConnection httpProtocol = (HttpURLConnection) con;
                                    httpProtocol.getResponseCode();
                                    int httpPro = httpProtocol.getResponseCode();
                                    if (httpPro != 200) {
                                                isbroken = true;
                                    }
                        } catch (MalformedURLException e) {
                                    isbroken = true;
                                    System.out.println("Bad URL1");
                        } catch (FileNotFoundException e) {
                                    isbroken = true;
                                    e.printStackTrace();
                                    System.out.println("File not found.");
                        } catch (IOException e) {
                                    isbroken = true;
                                    e.printStackTrace();
                                    System.out.println("after try/catch");
                        }
                        return isbroken;
            }

            public static void displayBrokenLinkReport(
                                    ArrayList<ArrayList<String>> brokenLinks) {
                        System.out.println("Broken Link Report: \n");
                        for (int j = 0; j < brokenLinks.size(); j++) {
                                    for (int i = 0; i < brokenLinks.get(j).size(); i++) {
                                                if (j == 0) {
                                                            System.out.println("Page " + brokenLinks.get(j).get(i));
                                                } else {
                                                            System.out.println("Broken link "
                                                                                    + brokenLinks.get(j).get(i));
                                                }
                                    }
                        }
            }

            public static void main(String[] args) {
                        // Initializes the arraylists and the array<arraylist
                        ArrayList<URL> pagesVisited = new ArrayList<URL>();
                        ArrayList<URL> pagesToVisit = new ArrayList<URL>();
                        ArrayList<String> listOfLinks = null;
                        ArrayList<String> badLinks = null;
                        ArrayList<ArrayList<String>> brokenLinks = new ArrayList<ArrayList<String>>();
                        int i = 0;
                        // calls the getStartingURLFromUser method which returns the string.
                        pagesToVisit.add(getStartingURLFromUser());
                        while (!pagesToVisit.isEmpty()) {
                                    URL baseURL = pagesToVisit.get(0);
                                    pagesVisited.add(baseURL);
                                    pagesToVisit.remove(0);
                                    // calls the htmlReader method which returns the string.
                                    String htmlContent = htmlReader(baseURL);
                                    // calls the linkParser method which returns the array list.
                                    listOfLinks = linkParser(htmlContent);
                                    // calls the isBrokenLink method which returns the string.
                                    for (i = 0; i <= (listOfLinks.size()); i++) {
                                                boolean isBL = isBrokenLink(baseURL, listOfLinks.get(i));
                                                System.out.println(listOfLinks.get(i) + "a");
                                                if (isBL) {
                                                            System.out.println(listOfLinks.get(i));
                                                            badLinks.add(listOfLinks.get(i));
                                                } else {
                                                            // catches errors in URL.
                                                            URL nextURL = null;
                                                            try {
                                                                        nextURL = new URL(baseURL, istOfLinks.get(i));
                                                                        if (!pagesVisited.contains(nextURL)) {
                                                                                    pagesToVisit.add(nextURL);
                                                                        }
                                                            } catch (MalformedURLException e) {
                                                                        e.printStackTrace();
                                                                        System.out.println("Bad URL2");
                                                            }
                                                }
                                    }
                                    brokenLinks.add(badLinks);
                        }
                        displayBrokenLinkReport(brokenLinks);
            }
}


This input box allows a user to enter the website to be parsed.
Picture

This screen shot illustrates the web addresses that have been parsed.
Picture