import java.io.*;

/**
 * Extracts {@code href} attribute values from HTML text and splits each one
 * into host, page, path and port components.
 */
public class PageParser {

    /** Marker used as the page part when the URL has no page (directory only). */
    private static final String DIR_ONLY = "__DIRONLY__";

    /** Marker used as the path part when nothing sits between host and page. */
    private static final String NO_PATH = "__NOPATH__";

    /** Attribute prefix searched for (matched case-insensitively). */
    private static final String HREF_ATTR = "href=";

    /**
     * Number of values stored per link:
     * 0 host, 1 page, 2 path, 3 port, 4 complete href string (for display).
     */
    private static final int NUM_PARMS = 5;

    /** Initial capacity of the link table and the amount it grows by when full. */
    private static final int GROW_BY = 100;

    /**
     * Reads the text of {@code tempfile.txt} into a single string, one
     * {@code "\n"} appended after every line. Used for debugging only (reads
     * text from a file instead of having it passed in from the main program).
     *
     * <p>On an I/O error the problem is reported on {@code System.err} and the
     * JVM exits with status 1.
     *
     * @return the file content with normalised line endings
     */
    public String readInFile() {
        String fileName = "tempfile.txt";
        StringBuilder page = new StringBuilder();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(fileName));
            String line;
            while ((line = reader.readLine()) != null) {
                page.append(line).append("\n");
            }
        } catch (IOException ex) {
            System.err.println("Error in file " + fileName + ": " + ex.toString());
            System.exit(1);
        } finally {
            // Close on every path so a failed read does not leak the file handle.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // The content has already been read (or the error reported).
                }
            }
        }
        return page.toString();
    }

    /**
     * Scans {@code page} for {@code href=} attributes and breaks every accepted
     * link into its parts. Each accepted link and its parts are also echoed to
     * {@code System.out}.
     *
     * <p>Values may be delimited by double quotes, single quotes, or be unquoted
     * (ending at the first whitespace or {@code >}). Empty or unterminated
     * values are skipped. Links rejected by {@link #checkHrefType(String)} are
     * not stored.
     *
     * <p>NOTE(review): the tail of this method was unreadable in the reviewed
     * copy of the file. Returning the padded table (rather than a trimmed copy)
     * is an assumption - confirm against callers.
     *
     * @param page HTML text to scan; {@code null} is treated as empty
     * @return a table with one row per link, columns as described for the five
     *         link parameters (0 host, 1 page, 2 path, 3 port, 4 full href);
     *         rows after the last stored link contain only {@code null}s, and
     *         at least one such row is always present
     */
    public String[][] parseFile(String page) {
        String[][] links = new String[GROW_BY][NUM_PARMS];
        if (page == null) {
            return links;
        }

        int count = 0;
        int cursor = 0;
        while (true) {
            int attr = indexOfIgnoreCase(page, HREF_ATTR, cursor);
            if (attr == -1) {
                break;
            }
            int start = attr + HREF_ATTR.length();
            if (start >= page.length()) {
                // "href=" is the very last text in the page: no value to read.
                break;
            }

            // Most href values are delimited with quotes, but not all.
            int end;
            char first = page.charAt(start);
            if (first == '"' || first == '\'') {
                start++;
                end = page.indexOf(first, start);
            } else {
                end = indexOfUnquotedEnd(page, start);
            }

            if (end <= start) {
                // Empty or unterminated value: skip it, keep scanning the rest.
                cursor = start;
                continue;
            }

            String url = page.substring(start, end);
            cursor = end;

            // checkHrefType screens out mailto refs, etc.
            if (!checkHrefType(url)) {
                continue;
            }

            storeLink(links[count], url);
            count++;

            // Grow the table as soon as it is full so a spare row always exists.
            if (count == links.length) {
                String[][] bigger = new String[links.length + GROW_BY][NUM_PARMS];
                System.arraycopy(links, 0, bigger, 0, links.length);
                links = bigger;
            }
        }
        return links;
    }

    /** Splits {@code url} into its parts, stores them in {@code row} and echoes them. */
    private void storeLink(String[] row, String url) {
        System.out.println("href = " + url);
        row[4] = url;

        // The host is not stored right away: the port number (if any) is
        // still attached and is needed to locate the page and path parts.
        String host = getHostPart(url);
        String pageOnly = getPagePart(url, host);
        row[1] = pageOnly;
        row[2] = getPathPart(url, host, pageOnly);
        System.out.println("path = " + row[2] + "");
        System.out.println("page = " + row[1] + "");

        int port = getPort(host);
        row[3] = Integer.toString(port);
        row[0] = removePort(host);
        System.out.println("host = " + row[0] + "\n");
    }

    /** Returns the index of the first case-insensitive match of {@code needle} at or after {@code from}, or -1. */
    private static int indexOfIgnoreCase(String text, String needle, int from) {
        int last = text.length() - needle.length();
        for (int k = Math.max(from, 0); k <= last; k++) {
            if (text.regionMatches(true, k, needle, 0, needle.length())) {
                return k;
            }
        }
        return -1;
    }

    /** Returns the index of the first whitespace or {@code >} at or after {@code from}, or -1. */
    private static int indexOfUnquotedEnd(String text, int from) {
        for (int k = from; k < text.length(); k++) {
            char c = text.charAt(k);
            if (c == '>' || Character.isWhitespace(c)) {
                return k;
            }
        }
        return -1;
    }

    /**
     * Returns the host part of {@code url}: the text between {@code "//"} and
     * the next {@code "/"} (or the end of the string). A port suffix, if
     * present, is left attached.
     *
     * <p>NOTE(review): the signature and opening lines of this method were
     * unreadable in the reviewed copy; they are reconstructed from the call in
     * {@code parseFile} and from the surviving tail - confirm.
     *
     * @param url the href value
     * @return the host (possibly with {@code :port}), or {@code ""} when the
     *         url contains no {@code "//"} or the authority is empty
     */
    public String getHostPart(String url) {
        String host = "";
        int startindex = url.indexOf("//");
        if (startindex != -1) {
            startindex = startindex + 2;
            int endindex = url.indexOf("/", startindex);
            if (endindex == -1) {
                host = url.substring(startindex);
            } else {
                // endindex == startindex (e.g. "file:///x") yields an empty host.
                host = url.substring(startindex, endindex);
            }
        }
        return host;
    }

    /**
     * Extracts the port number from a host string of the form
     * {@code name:port}.
     *
     * @param host host name, optionally followed by {@code :port}
     * @return the port; -1 when there is no colon; -2 when the text after the
     *         colon is not an integer (also reported on {@code System.err})
     */
    public int getPort(String host) {
        int hostcolon = host.indexOf(":");
        if (hostcolon == -1) {
            return -1;
        }
        try {
            return Integer.parseInt(host.substring(hostcolon + 1));
        } catch (NumberFormatException e) {
            System.err.println("Invalid port number detected: " + host.substring(hostcolon));
            return -2;
        }
    } // end getPort

    /**
     * Removes the port suffix from a host string.
     *
     * @param host host name, optionally followed by {@code :port}
     * @return everything before the first colon, or {@code host} unchanged
     *         when it contains no colon
     */
    public String removePort(String host) {
        int hostcolon = host.indexOf(":");
        if (hostcolon != -1) {
            return host.substring(0, hostcolon);
        }
        return host;
    }

    /**
     * Returns the path part of {@code url}: whatever lies between the host and
     * the page.
     *
     * @param url  the href value
     * @param host the host as returned by {@link #getHostPart(String)}
     * @param page the page as returned by {@link #getPagePart(String, String)}
     * @return everything after the host when {@code page} is the
     *         {@code __DIRONLY__} marker; {@code __NOPATH__} when the page
     *         follows the host directly (or is not found after it); otherwise
     *         the text between host and page
     */
    public String getPathPart(String url, String host, String page) {
        int startindex = 0;

        // If the url contains the previously determined host, skip over it.
        int hostindex = url.indexOf(host);
        if (hostindex != -1) {
            startindex = hostindex + host.length();
        }

        // No page (e.g. the url ends in a slash): everything after the host.
        if (page.equals(DIR_ONLY)) {
            return url.substring(startindex);
        }

        // The page sits at the end of the url, so search from the end; a
        // forward search could match the same text inside the host or an
        // earlier path segment and produce an invalid range.
        int endindex = url.lastIndexOf(page);
        if (endindex <= startindex) {
            return NO_PATH;
        }

        // Otherwise the path is everything between the host and the page.
        return url.substring(startindex, endindex);
    }

    /**
     * Returns the page part of {@code url}: the text after the last slash that
     * is not the final character, provided it contains a dot.
     *
     * <p>The dot test is a heuristic for telling a page from a directory; it
     * is not foolproof, but only matters for relative links under a path.
     *
     * @param url  the href value
     * @param host the host as returned by {@link #getHostPart(String)}
     * @return the page name, or the {@code __DIRONLY__} marker when nothing
     *         (or only a slash) follows the host, or when the last segment
     *         contains no dot
     */
    public String getPagePart(String url, String host) {
        int startindex = 0;

        // If the url contains the previously determined host, skip over it.
        int hostindex = url.indexOf(host);
        if (hostindex != -1) {
            startindex = hostindex + host.length();
        }

        String rest = url.substring(startindex);
        if (rest.equals("/") || rest.equals("")) {
            return DIR_ONLY;
        }

        // Cut after the last slash, ignoring a slash in the final position.
        int slashloc = rest.lastIndexOf('/', rest.length() - 2);
        if (slashloc != -1) {
            rest = rest.substring(slashloc + 1);
        }

        // For future reference: an href can contain an embedded newline; this
        // does happen, but not very often, and is not handled here.

        if (rest.indexOf(".") != -1) {
            return rest;
        }
        return DIR_ONLY;
    }

    /**
     * Quick check that screens out href values that are not links to HTML or
     * similar pages: {@code mailto} links and anything ending in {@code .css}
     * (both compared case-insensitively).
     *
     * <p>Internal anchors (starting with {@code #}) are deliberately accepted.
     *
     * @param url the href value
     * @return {@code false} for rejected links, {@code true} otherwise
     */
    public boolean checkHrefType(String url) {
        if (url.regionMatches(true, 0, "mailto", 0, 6)) {
            return false;
        }
        // regionMatches returns false for a negative offset (url shorter than 4).
        if (url.regionMatches(true, url.length() - 4, ".css", 0, 4)) {
            return false;
        }
        return true;
    }
} // end class PageParser