68
|
/*
Copyright (c) 2008  Franklin Schmidt <fschmidt@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
|
|
22
|
|
23 package fschmidt.tools;
|
|
24
|
|
25 import java.io.IOException;
|
|
26 import java.net.URL;
|
|
27 import java.util.Set;
|
|
28 import java.util.HashSet;
|
|
29 import java.util.Map;
|
|
30 import java.util.HashMap;
|
|
31 import java.util.Arrays;
|
|
32 import java.util.Iterator;
|
|
33 import fschmidt.util.java.IoUtils;
|
|
34 import fschmidt.html.Html;
|
|
35 import fschmidt.html.HtmlTag;
|
|
36
|
|
37
|
|
38 public class CrawlSite {
|
|
39 private static final Set<String> endings = new HashSet<String>( Arrays.asList( new String[]{
|
|
40 "html", "htm", "ssp", "jsp", "jtp",
|
|
41 } ) );
|
|
42 private static final Map<String,String> tagMap = new HashMap<String,String>();
|
|
43 static {
|
|
44 tagMap.put("a","href");
|
|
45 tagMap.put("area","href");
|
|
46 tagMap.put("base","href");
|
|
47 tagMap.put("form","action");
|
|
48 tagMap.put("frame","src");
|
|
49 tagMap.put("img","src");
|
|
50 tagMap.put("link","href");
|
|
51 tagMap.put("input","src");
|
|
52 }
|
|
53
|
|
54 private String baseUrl;
|
|
55 private Set<String> done = new HashSet<String>();
|
|
56
|
|
57 public CrawlSite(String baseUrl) {
|
|
58 this.baseUrl = baseUrl;
|
|
59 }
|
|
60
|
|
61 protected boolean isHtml(String url) {
|
|
62 url = url.substring( url.lastIndexOf('/') + 1 );
|
|
63 int i = url.indexOf('.');
|
|
64 if( i == -1 )
|
|
65 return true;
|
|
66 String ending = url.substring(i+1);
|
|
67 return endings.contains(ending);
|
|
68 }
|
|
69
|
|
70 protected void process(String url) {
|
|
71 System.out.println(url);
|
|
72 }
|
|
73
|
|
74 public void crawl()
|
|
75 throws IOException
|
|
76 {
|
|
77 crawl(baseUrl);
|
|
78 }
|
|
79
|
|
80 private void crawl(String url)
|
|
81 throws IOException
|
|
82 {
|
|
83 if( !done.add(url) )
|
|
84 return;
|
|
85
|
|
86 process(url);
|
|
87
|
|
88 if( !isHtml(url) )
|
|
89 return;
|
|
90
|
|
91 String page = IoUtils.readPage(url);
|
|
92 Html html = new Html(page);
|
|
93 Iterator iter = html.iterator();
|
|
94 while( iter.hasNext() ) {
|
|
95 Object o = iter.next();
|
|
96 if( !(o instanceof HtmlTag) )
|
|
97 continue;
|
|
98 HtmlTag tag = (HtmlTag)o;
|
|
99 String tagName = tag.getName().toLowerCase();
|
|
100 String attrName = (String)tagMap.get(tagName);
|
|
101 if( attrName==null )
|
|
102 continue;
|
|
103 String val = tag.getAttributeValue(attrName);
|
|
104 if( val==null )
|
|
105 continue;
|
|
106 String url2 = HtmlTag.unquote(val);
|
|
107 url2 = new URL(new URL(url),url2).toString();
|
|
108 if( !url2.startsWith(baseUrl) )
|
|
109 continue;
|
|
110 int i = url2.indexOf('#');
|
|
111 if( i != -1 )
|
|
112 url2 = url2.substring(0,i);
|
|
113 String file = url2.substring( url2.lastIndexOf('/') + 1 );
|
|
114 if( file.length() > 0 && file.indexOf('.') == -1 )
|
|
115 url2 += '/';
|
|
116 try {
|
|
117 crawl(url2);
|
|
118 } catch(IOException e) {
|
|
119 // System.err.println(e);
|
|
120 System.err.println(e+" referred to from "+url);
|
|
121 // e.printStackTrace();
|
|
122 }
|
|
123 }
|
|
124 }
|
|
125
|
|
126 public static void main(String[] args) throws Exception {
|
|
127 new CrawlSite(args[0]).crawl();
|
|
128 }
|
|
129 }
|