import java.net.* ; import java.io.* ; public class CheckLink { // there is a URL data type, but since this is intentionally low-level, // use a string for the page name String pagename = "/" ; // vars used while reading the page and to store the entire page String readline = "" ; String firstline = "" ; String readpage = "" ; // Java Socket class is predefined as client using TCP // and also automatically "connects" Socket sock ; // InetAddress class includes getByName method (equivalent // to C gethostbyname) InetAddress addr ; // vars used to write / read socket PrintWriter out = null ; BufferedReader in = null ; // reasonable defaults for host and port, save default port number for later use String hostname = "localhost" ; int defaultport = 80 ; // working port number is the default (80) until a different port number is read // even if a different port number is read from the command line, we still need // to remember the default for checking external links // In other words, portnum is the working port number, defaultport is the fixed // default HTTP port number, and readport is whatever is specified on the command // line, which will be used for checking the top page and it's internal links int portnum = defaultport ; int readport = -1 ; // main method instantiates class, reads command line arguments, and // calls methods from this class (CheckLink) and class PageParser public static void main (String [] args) { CheckLink connection = new CheckLink(); PageParser parseUrl = new PageParser() ; String topPage = "" ; String topPath = "/" ; // read one argument (hostname) if available if (args.length > 0) { connection.hostname = args[0] ; } // read second argument (page / URL) if available if (args.length > 1) { connection.pagename = args[1] ; // if page name does not start with slash, add one String checkslash = connection.pagename.substring(0,1) ; if (! checkslash.equals("/")) { connection.pagename = "/" + connection.pagename ; } // save path part of initial page to use with any relative links // Because the host is a separate parameter on the command line, // use __NOHOST__ keyword topPage = parseUrl.getPagePart(connection.pagename,"__NOHOST__") ; topPath = parseUrl.getPathPart(connection.pagename, "__NOHOST__", topPage) ; // for debugging only //System.out.println("DEBUG top page = " + topPage); //System.out.println("DEBUG top path = " + topPath); } // read third argument (port number) if available if (args.length > 2) { try { connection.readport = Integer.parseInt( args[2] ); connection.portnum = connection.readport ; } catch (NumberFormatException e) { System.err.println("Invalid port number: " + args[2]) ; System.exit(1); } } System.out.println("Using host name: " + connection.hostname); System.out.println("Using page name: " + connection.pagename); System.out.println("Using port number: " + connection.portnum ); // initialize socket using CheckLink connect method connection.sock = connection.connect(connection.hostname, connection.portnum) ; if (connection.sock == null) { System.out.println("Initial socket connection failed, cannot continue"); System.exit(1) ; } System.out.println("\nConnected, starting page status check..."); // check page status using CheckLink goodPage method boolean goodStatus = connection.goodPage(connection.hostname, connection.pagename); if (!goodStatus) { System.out.println("\nError reading initial page, cannot continue.\n"); System.exit(1) ; } // read entire page, then close initial socket connection System.out.println("Reading page content..."); connection.readpage = connection.wholePage(); connection.closeSocket(); System.out.println("Done reading top-level page\n"); // for debugging only, dump page to a file connection.printPage(connection.readpage) ; // initialize parser object PageParser parser = new PageParser(); System.out.println("Starting page parsing...\n"); // parseFile method returns a two-dimensional array of strings consisting of: // 0 : hostname // 1 : page // 2 : path // 3 : port // 4 : entire href (for display only) // for each href found on the page // String [][] listToCheck = parser.parseFile(connection.readpage) ; System.out.println("Starting href checks...\n"); int i = 0 ; while ( (i < listToCheck.length) && (listToCheck[i][0].equals("__END__") == false) ) { // first read host name String checkHost = listToCheck[i][0] ; // check if this is an internal link, and also set the // port number appropriately. The port number is always // reset in case it was changed during the last loop if (checkHost.equals("__SAMEHOST__")) { checkHost = connection.hostname; if (connection.readport != -1 ) { connection.portnum = connection.readport ; } else { connection.portnum = connection.defaultport ; } } else { if (listToCheck[i][3].equals("-1")) { connection.portnum = connection.defaultport ; } else { try { connection.portnum = Integer.parseInt(listToCheck[i][3]); } catch (NumberFormatException e) { System.err.println("Invalid port number detected: " + listToCheck[i][3]) ; System.err.println("Using default port 80"); connection.portnum = connection.defaultport ; } } } // next items in array are pagename and path String checkPage ; String pagePart = listToCheck[i][1] ; String pathPart = listToCheck[i][2] ; // for debugging only //System.out.println(">>> path = " + pathPart) ; //System.out.println(">>> page = " + pagePart) ; if (pagePart.equals("__DIRONLY__") ) { pagePart = "" ; } if (pathPart.equals("__NOPATH__") ) { pathPart = "" ; } checkPage = pathPart + pagePart ; // for debugging only //System.out.println("DEBUG page before appending toppath: " + checkPage) ; // make sure page starts with slash if (checkPage.equals("")) { checkPage = "/"; } else { boolean startslash = checkPage.startsWith("/"); if (startslash == false ) { // append top-level path, putting in delimiter if needed if (topPath.endsWith("/")) { checkPage = topPath + checkPage ; } else { checkPage = topPath + "/" + checkPage ; } } } System.out.println("HREF: " + listToCheck[i][4] ) ; i++ ; System.out.println("Checking host: " + checkHost + ", page: " + checkPage + " on port " + connection.portnum) ; // check for valid port number if (connection.portnum < 0) { System.out.println("Invalid port number, cannot check"); System.out.println("========================================================") ; System.out.println("===> Automatic link check failed; check link manually ") ; System.out.println("========================================================\n") ; continue ; } // make a connection, check the page status, close the socket connection.sock = connection.connect(checkHost,connection.portnum) ; if (connection.sock == null) { System.out.println("Socket connection failed, cannot check " + checkHost + checkPage ); System.out.println("========================================================") ; System.out.println("===> Automatic link check failed; check link manually ") ; System.out.println("========================================================\n") ; continue ; } goodStatus = connection.goodPage(checkHost, checkPage) ; connection.closeSocket(); } } // end main public Socket connect(String hostname, int portnum) { // InetAddress is a weird class with no constructor, // instead use getByName method to initialize / get IP address // to be used with Socket constructor try { // get IP address from hostname addr = InetAddress.getByName(hostname); // Socket constructor creates and connects sock = new Socket(addr, portnum) ; // establish read/write to socket out = new PrintWriter( sock.getOutputStream(), true ) ; in = new BufferedReader( new InputStreamReader( sock.getInputStream() ) ) ; } catch (UnknownHostException e) { System.out.println("Entered host name (" + hostname + ") is not a valid hostname"); // System.exit(1) ; return null ; } catch (IOException e) { System.out.println("Socket I/O error: " + e.toString() ) ; // System.exit(1) ; return null ; } return sock ; } // end connect public void closeSocket() { // be tidy! try { out.close() ; in.close() ; sock.close() ; } catch (IOException e) { System.out.println("Socket I/O error while closing" + e.toString() ) ; } } public boolean goodPage(String hostname, String pagename) { // construct http request message // // most sites work with just newline chars \n // but a few need both return and newline \r\n // String httpMsg = "GET " + pagename + " HTTP/1.1" + "\r\n" + "Host: " + hostname + "\r\n\r\n" ; out.println(httpMsg) ; // for debugging only //System.out.println("DEBUG http message: \n" + httpMsg); try { // first line of response includes status firstline = in.readLine() ; } catch (IOException e) { System.out.println("Socket I/O error while checking page status: " + e.toString() ) ; return false; } // for debugging only // System.out.println("First line = " + firstline ) ; if (firstline.startsWith("HTTP") != true) { System.out.println("========================================================") ; System.out.println("Invalid HTTP message returned") ; System.out.println("===> Automatic link check failed; check link manually ") ; System.out.println("========================================================\n") ; return false ; } String errorcode = firstline.substring(9,12) ; System.out.println("Status code: -" + errorcode + "-") ; if (errorcode.equals("200") ) { // System.out.println("Page " + hostname + pagename + " checked successfully\n") ; System.out.println("Page checked successfully\n") ; return true; } else { if (errorcode.equals("302") || errorcode.equals("301") ) { System.out.println("========================================================") ; System.out.println("===> This link redirects; check link manually ") ; System.out.println("========================================================\n") ; return false ; } else { System.out.println("========================================================") ; System.out.println("===> Automatic link check failed; check link manually ") ; System.out.println("========================================================\n") ; return false; } } } // end goodPage public String wholePage() { String readline = "", readpage = ""; int linecount = 0 ; try { while ( (readline = in.readLine()) != null ) { readpage = readpage + readline + "\n" ; linecount++ ; if (linecount % 100 == 0) { System.out.println("Read line" + linecount) ; } // for debugging only, limit number of lines processed //if (linecount > 100 ) //{ // break ; //} // in some cases, end of file is not recognized // this is a work-around; generally, nothing after the // html end tag will be of interest // if ( (readline.indexOf("")>=0) || (readline.indexOf("")>=0) ) { break ; } } } catch (IOException e) { System.out.println("Socket I/O error" + e.toString() ) ; System.exit(1) ; } // for debugging only, dump page to a file // printPage(readpage) ; return readpage ; } public void printPage(String readpage) { // this method is for debugging only. it just takes whatever string // you pass it and dumps it into a temporary file PrintWriter tfile = null ; try { tfile = new PrintWriter( new FileWriter("tempfile.txt"), true ) ; tfile.println(readpage) ; tfile.close(); } catch (IOException e) { System.out.println("File I/O error" + e.toString() ) ; System.exit(1) ; } } } // end class CheckLink