| 
68
 | 
     1 /*
 | 
| 
 | 
     2 Copyright (c) 2008  Franklin Schmidt <fschmidt@gmail.com>
 | 
| 
 | 
     3 
 | 
| 
 | 
     4 Permission is hereby granted, free of charge, to any person obtaining a copy
 | 
| 
 | 
     5 of this software and associated documentation files (the "Software"), to deal
 | 
| 
 | 
     6 in the Software without restriction, including without limitation the rights
 | 
| 
 | 
     7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 | 
| 
 | 
     8 copies of the Software, and to permit persons to whom the Software is
 | 
| 
 | 
     9 furnished to do so, subject to the following conditions:
 | 
| 
 | 
    10 
 | 
| 
 | 
    11 The above copyright notice and this permission notice shall be included in
 | 
| 
 | 
    12 all copies or substantial portions of the Software.
 | 
| 
 | 
    13 
 | 
| 
 | 
    14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 | 
| 
 | 
    15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 | 
| 
 | 
    16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 | 
| 
 | 
    17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 | 
| 
 | 
    18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 | 
| 
 | 
    19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 | 
| 
 | 
    20 THE SOFTWARE.
 | 
| 
 | 
    21 */
 | 
| 
 | 
    22 
 | 
| 
 | 
    23 package fschmidt.tools;
 | 
| 
 | 
    24 
 | 
| 
 | 
    25 import java.io.IOException;
 | 
| 
 | 
    26 import java.net.URL;
 | 
| 
 | 
    27 import java.util.Set;
 | 
| 
 | 
    28 import java.util.HashSet;
 | 
| 
 | 
    29 import java.util.Map;
 | 
| 
 | 
    30 import java.util.HashMap;
 | 
| 
 | 
    31 import java.util.Arrays;
 | 
| 
 | 
    32 import java.util.Iterator;
 | 
| 
 | 
    33 import fschmidt.util.java.IoUtils;
 | 
| 
 | 
    34 import fschmidt.html.Html;
 | 
| 
 | 
    35 import fschmidt.html.HtmlTag;
 | 
| 
 | 
    36 
 | 
| 
 | 
    37 
 | 
| 
 | 
    38 public class CrawlSite {
 | 
| 
 | 
    39 	private static final Set<String> endings = new HashSet<String>( Arrays.asList( new String[]{
 | 
| 
 | 
    40 		"html", "htm", "ssp", "jsp", "jtp",
 | 
| 
 | 
    41 	} ) );
 | 
| 
 | 
    42 	private static final Map<String,String> tagMap = new HashMap<String,String>();
 | 
| 
 | 
    43 	static {
 | 
| 
 | 
    44 		tagMap.put("a","href");
 | 
| 
 | 
    45 		tagMap.put("area","href");
 | 
| 
 | 
    46 		tagMap.put("base","href");
 | 
| 
 | 
    47 		tagMap.put("form","action");
 | 
| 
 | 
    48 		tagMap.put("frame","src");
 | 
| 
 | 
    49 		tagMap.put("img","src");
 | 
| 
 | 
    50 		tagMap.put("link","href");
 | 
| 
 | 
    51 		tagMap.put("input","src");
 | 
| 
 | 
    52 	}
 | 
| 
 | 
    53 
 | 
| 
 | 
    54 	private String baseUrl;
 | 
| 
 | 
    55 	private Set<String> done = new HashSet<String>();
 | 
| 
 | 
    56 
 | 
| 
 | 
    57 	public CrawlSite(String baseUrl) {
 | 
| 
 | 
    58 		this.baseUrl = baseUrl;
 | 
| 
 | 
    59 	}
 | 
| 
 | 
    60 
 | 
| 
 | 
    61 	protected boolean isHtml(String url) {
 | 
| 
 | 
    62 		url = url.substring( url.lastIndexOf('/') + 1 );
 | 
| 
 | 
    63 		int i = url.indexOf('.');
 | 
| 
 | 
    64 		if( i == -1 )
 | 
| 
 | 
    65 			return true;
 | 
| 
 | 
    66 		String ending = url.substring(i+1);
 | 
| 
 | 
    67 		return endings.contains(ending);
 | 
| 
 | 
    68 	}
 | 
| 
 | 
    69 
 | 
| 
 | 
    70 	protected void process(String url) {
 | 
| 
 | 
    71 		System.out.println(url);
 | 
| 
 | 
    72 	}
 | 
| 
 | 
    73 
 | 
| 
 | 
    74 	public void crawl()
 | 
| 
 | 
    75 		throws IOException
 | 
| 
 | 
    76 	{
 | 
| 
 | 
    77 		crawl(baseUrl);
 | 
| 
 | 
    78 	}
 | 
| 
 | 
    79 
 | 
| 
 | 
    80 	private void crawl(String url)
 | 
| 
 | 
    81 		throws IOException
 | 
| 
 | 
    82 	{
 | 
| 
 | 
    83 		if( !done.add(url) )
 | 
| 
 | 
    84 			return;
 | 
| 
 | 
    85 
 | 
| 
 | 
    86 		process(url);
 | 
| 
 | 
    87 
 | 
| 
 | 
    88 		if( !isHtml(url) )
 | 
| 
 | 
    89 			return;
 | 
| 
 | 
    90 
 | 
| 
 | 
    91 		String page = IoUtils.readPage(url);
 | 
| 
 | 
    92 		Html html = new Html(page);
 | 
| 
 | 
    93 		Iterator iter = html.iterator();
 | 
| 
 | 
    94 		while( iter.hasNext() ) {
 | 
| 
 | 
    95 			Object o = iter.next();
 | 
| 
 | 
    96 			if( !(o instanceof HtmlTag) )
 | 
| 
 | 
    97 				continue;
 | 
| 
 | 
    98 			HtmlTag tag = (HtmlTag)o;
 | 
| 
 | 
    99 			String tagName = tag.getName().toLowerCase();
 | 
| 
 | 
   100 			String attrName = (String)tagMap.get(tagName);
 | 
| 
 | 
   101 			if( attrName==null )
 | 
| 
 | 
   102 				continue;
 | 
| 
 | 
   103 			String val = tag.getAttributeValue(attrName);
 | 
| 
 | 
   104 			if( val==null )
 | 
| 
 | 
   105 				continue;
 | 
| 
 | 
   106 			String url2 = HtmlTag.unquote(val);
 | 
| 
 | 
   107 			url2 = new URL(new URL(url),url2).toString();
 | 
| 
 | 
   108 			if( !url2.startsWith(baseUrl) )
 | 
| 
 | 
   109 				continue;
 | 
| 
 | 
   110 			int i = url2.indexOf('#');
 | 
| 
 | 
   111 			if( i != -1 )
 | 
| 
 | 
   112 				url2 = url2.substring(0,i);
 | 
| 
 | 
   113 			String file = url2.substring( url2.lastIndexOf('/') + 1 );
 | 
| 
 | 
   114 			if( file.length() > 0 && file.indexOf('.') == -1 )
 | 
| 
 | 
   115 				url2 += '/';
 | 
| 
 | 
   116 			try {
 | 
| 
 | 
   117 				crawl(url2);
 | 
| 
 | 
   118 			} catch(IOException e) {
 | 
| 
 | 
   119 //				System.err.println(e);
 | 
| 
 | 
   120 				System.err.println(e+"  referred to from "+url);
 | 
| 
 | 
   121 //				e.printStackTrace();
 | 
| 
 | 
   122 			}
 | 
| 
 | 
   123 		}
 | 
| 
 | 
   124 	}
 | 
| 
 | 
   125 
 | 
| 
 | 
   126 	public static void main(String[] args) throws Exception {
 | 
| 
 | 
   127 		new CrawlSite(args[0]).crawl();
 | 
| 
 | 
   128 	}
 | 
| 
 | 
   129 }
 |