| 68 | 1 /* | 
|  | 2 Copyright (c) 2008  Franklin Schmidt <fschmidt@gmail.com> | 
|  | 3 | 
|  | 4 Permission is hereby granted, free of charge, to any person obtaining a copy | 
|  | 5 of this software and associated documentation files (the "Software"), to deal | 
|  | 6 in the Software without restriction, including without limitation the rights | 
|  | 7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | 
|  | 8 copies of the Software, and to permit persons to whom the Software is | 
|  | 9 furnished to do so, subject to the following conditions: | 
|  | 10 | 
|  | 11 The above copyright notice and this permission notice shall be included in | 
|  | 12 all copies or substantial portions of the Software. | 
|  | 13 | 
|  | 14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | 
|  | 15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | 
|  | 16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | 
|  | 17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | 
|  | 18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | 
|  | 19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | 
|  | 20 THE SOFTWARE. | 
|  | 21 */ | 
|  | 22 | 
|  | 23 package fschmidt.tools; | 
|  | 24 | 
|  | 25 import java.io.IOException; | 
|  | 26 import java.net.URL; | 
|  | 27 import java.util.Set; | 
|  | 28 import java.util.HashSet; | 
|  | 29 import java.util.Map; | 
|  | 30 import java.util.HashMap; | 
|  | 31 import java.util.Arrays; | 
|  | 32 import java.util.Iterator; | 
|  | 33 import fschmidt.util.java.IoUtils; | 
|  | 34 import fschmidt.html.Html; | 
|  | 35 import fschmidt.html.HtmlTag; | 
|  | 36 | 
|  | 37 | 
|  | 38 public class CrawlSite { | 
|  | 39 	private static final Set<String> endings = new HashSet<String>( Arrays.asList( new String[]{ | 
|  | 40 		"html", "htm", "ssp", "jsp", "jtp", | 
|  | 41 	} ) ); | 
|  | 42 	private static final Map<String,String> tagMap = new HashMap<String,String>(); | 
|  | 43 	static { | 
|  | 44 		tagMap.put("a","href"); | 
|  | 45 		tagMap.put("area","href"); | 
|  | 46 		tagMap.put("base","href"); | 
|  | 47 		tagMap.put("form","action"); | 
|  | 48 		tagMap.put("frame","src"); | 
|  | 49 		tagMap.put("img","src"); | 
|  | 50 		tagMap.put("link","href"); | 
|  | 51 		tagMap.put("input","src"); | 
|  | 52 	} | 
|  | 53 | 
|  | 54 	private String baseUrl; | 
|  | 55 	private Set<String> done = new HashSet<String>(); | 
|  | 56 | 
|  | 57 	public CrawlSite(String baseUrl) { | 
|  | 58 		this.baseUrl = baseUrl; | 
|  | 59 	} | 
|  | 60 | 
|  | 61 	protected boolean isHtml(String url) { | 
|  | 62 		url = url.substring( url.lastIndexOf('/') + 1 ); | 
|  | 63 		int i = url.indexOf('.'); | 
|  | 64 		if( i == -1 ) | 
|  | 65 			return true; | 
|  | 66 		String ending = url.substring(i+1); | 
|  | 67 		return endings.contains(ending); | 
|  | 68 	} | 
|  | 69 | 
|  | 70 	protected void process(String url) { | 
|  | 71 		System.out.println(url); | 
|  | 72 	} | 
|  | 73 | 
|  | 74 	public void crawl() | 
|  | 75 		throws IOException | 
|  | 76 	{ | 
|  | 77 		crawl(baseUrl); | 
|  | 78 	} | 
|  | 79 | 
|  | 80 	private void crawl(String url) | 
|  | 81 		throws IOException | 
|  | 82 	{ | 
|  | 83 		if( !done.add(url) ) | 
|  | 84 			return; | 
|  | 85 | 
|  | 86 		process(url); | 
|  | 87 | 
|  | 88 		if( !isHtml(url) ) | 
|  | 89 			return; | 
|  | 90 | 
|  | 91 		String page = IoUtils.readPage(url); | 
|  | 92 		Html html = new Html(page); | 
|  | 93 		Iterator iter = html.iterator(); | 
|  | 94 		while( iter.hasNext() ) { | 
|  | 95 			Object o = iter.next(); | 
|  | 96 			if( !(o instanceof HtmlTag) ) | 
|  | 97 				continue; | 
|  | 98 			HtmlTag tag = (HtmlTag)o; | 
|  | 99 			String tagName = tag.getName().toLowerCase(); | 
|  | 100 			String attrName = (String)tagMap.get(tagName); | 
|  | 101 			if( attrName==null ) | 
|  | 102 				continue; | 
|  | 103 			String val = tag.getAttributeValue(attrName); | 
|  | 104 			if( val==null ) | 
|  | 105 				continue; | 
|  | 106 			String url2 = HtmlTag.unquote(val); | 
|  | 107 			url2 = new URL(new URL(url),url2).toString(); | 
|  | 108 			if( !url2.startsWith(baseUrl) ) | 
|  | 109 				continue; | 
|  | 110 			int i = url2.indexOf('#'); | 
|  | 111 			if( i != -1 ) | 
|  | 112 				url2 = url2.substring(0,i); | 
|  | 113 			String file = url2.substring( url2.lastIndexOf('/') + 1 ); | 
|  | 114 			if( file.length() > 0 && file.indexOf('.') == -1 ) | 
|  | 115 				url2 += '/'; | 
|  | 116 			try { | 
|  | 117 				crawl(url2); | 
|  | 118 			} catch(IOException e) { | 
|  | 119 //				System.err.println(e); | 
|  | 120 				System.err.println(e+"  referred to from "+url); | 
|  | 121 //				e.printStackTrace(); | 
|  | 122 			} | 
|  | 123 		} | 
|  | 124 	} | 
|  | 125 | 
|  | 126 	public static void main(String[] args) throws Exception { | 
|  | 127 		new CrawlSite(args[0]).crawl(); | 
|  | 128 	} | 
|  | 129 } |