annotate src/fschmidt/tools/CrawlSite.java @ 68:00520880ad02

add fschmidt source
author Franklin Schmidt <fschmidt@gmail.com>
date Sun, 05 Oct 2025 17:24:15 -0600
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
68
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
1 /*
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
2 Copyright (c) 2008 Franklin Schmidt <fschmidt@gmail.com>
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
3
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
4 Permission is hereby granted, free of charge, to any person obtaining a copy
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
5 of this software and associated documentation files (the "Software"), to deal
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
6 in the Software without restriction, including without limitation the rights
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
8 copies of the Software, and to permit persons to whom the Software is
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
9 furnished to do so, subject to the following conditions:
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
10
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
11 The above copyright notice and this permission notice shall be included in
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
12 all copies or substantial portions of the Software.
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
13
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
20 THE SOFTWARE.
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
21 */
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
22
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
23 package fschmidt.tools;
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
24
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
25 import java.io.IOException;
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
26 import java.net.URL;
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
27 import java.util.Set;
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
28 import java.util.HashSet;
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
29 import java.util.Map;
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
30 import java.util.HashMap;
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
31 import java.util.Arrays;
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
32 import java.util.Iterator;
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
33 import fschmidt.util.java.IoUtils;
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
34 import fschmidt.html.Html;
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
35 import fschmidt.html.HtmlTag;
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
36
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
37
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
38 public class CrawlSite {
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
39 private static final Set<String> endings = new HashSet<String>( Arrays.asList( new String[]{
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
40 "html", "htm", "ssp", "jsp", "jtp",
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
41 } ) );
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
42 private static final Map<String,String> tagMap = new HashMap<String,String>();
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
43 static {
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
44 tagMap.put("a","href");
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
45 tagMap.put("area","href");
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
46 tagMap.put("base","href");
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
47 tagMap.put("form","action");
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
48 tagMap.put("frame","src");
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
49 tagMap.put("img","src");
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
50 tagMap.put("link","href");
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
51 tagMap.put("input","src");
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
52 }
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
53
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
54 private String baseUrl;
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
55 private Set<String> done = new HashSet<String>();
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
56
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
57 public CrawlSite(String baseUrl) {
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
58 this.baseUrl = baseUrl;
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
59 }
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
60
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
61 protected boolean isHtml(String url) {
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
62 url = url.substring( url.lastIndexOf('/') + 1 );
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
63 int i = url.indexOf('.');
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
64 if( i == -1 )
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
65 return true;
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
66 String ending = url.substring(i+1);
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
67 return endings.contains(ending);
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
68 }
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
69
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
70 protected void process(String url) {
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
71 System.out.println(url);
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
72 }
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
73
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
74 public void crawl()
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
75 throws IOException
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
76 {
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
77 crawl(baseUrl);
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
78 }
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
79
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
80 private void crawl(String url)
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
81 throws IOException
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
82 {
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
83 if( !done.add(url) )
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
84 return;
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
85
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
86 process(url);
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
87
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
88 if( !isHtml(url) )
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
89 return;
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
90
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
91 String page = IoUtils.readPage(url);
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
92 Html html = new Html(page);
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
93 Iterator iter = html.iterator();
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
94 while( iter.hasNext() ) {
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
95 Object o = iter.next();
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
96 if( !(o instanceof HtmlTag) )
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
97 continue;
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
98 HtmlTag tag = (HtmlTag)o;
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
99 String tagName = tag.getName().toLowerCase();
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
100 String attrName = (String)tagMap.get(tagName);
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
101 if( attrName==null )
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
102 continue;
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
103 String val = tag.getAttributeValue(attrName);
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
104 if( val==null )
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
105 continue;
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
106 String url2 = HtmlTag.unquote(val);
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
107 url2 = new URL(new URL(url),url2).toString();
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
108 if( !url2.startsWith(baseUrl) )
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
109 continue;
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
110 int i = url2.indexOf('#');
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
111 if( i != -1 )
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
112 url2 = url2.substring(0,i);
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
113 String file = url2.substring( url2.lastIndexOf('/') + 1 );
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
114 if( file.length() > 0 && file.indexOf('.') == -1 )
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
115 url2 += '/';
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
116 try {
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
117 crawl(url2);
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
118 } catch(IOException e) {
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
119 // System.err.println(e);
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
120 System.err.println(e+" referred to from "+url);
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
121 // e.printStackTrace();
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
122 }
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
123 }
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
124 }
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
125
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
126 public static void main(String[] args) throws Exception {
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
127 new CrawlSite(args[0]).crawl();
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
128 }
00520880ad02 add fschmidt source
Franklin Schmidt <fschmidt@gmail.com>
parents:
diff changeset
129 }