comparison of src/fschmidt/tools/CrawlSite.java @ 68:00520880ad02

add fschmidt source

author: Franklin Schmidt <fschmidt@gmail.com>
date:   Sun, 05 Oct 2025 17:24:15 -0600
comparing 67:9d0fefce6985 with 68:00520880ad02
/*
Copyright (c) 2008 Franklin Schmidt <fschmidt@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

package fschmidt.tools;

import java.io.IOException;
import java.net.URL;
import java.util.Set;
import java.util.HashSet;
import java.util.Map;
import java.util.HashMap;
import java.util.Arrays;
import java.util.Iterator;
import fschmidt.util.java.IoUtils;
import fschmidt.html.Html;
import fschmidt.html.HtmlTag;


public class CrawlSite {
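	// URL "extensions" that are treated as HTML and parsed for further links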
	private static final Set<String> endings = new HashSet<String>( Arrays.asList( new String[]{
		"html", "htm", "ssp", "jsp", "jtp",
	} ) );
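	// for each tag of interest, the attribute that may hold a URL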
	private static final Map<String,String> tagMap = new HashMap<String,String>();
	static {
		tagMap.put("a","href");
		tagMap.put("area","href");
		tagMap.put("base","href");
		tagMap.put("form","action");
		tagMap.put("frame","src");
		tagMap.put("img","src");
		tagMap.put("link","href");
		tagMap.put("input","src");
	}

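	// the root URL of the site being crawled, and the URLs crawled so far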
	private String baseUrl;
	private Set<String> done = new HashSet<String>();

	public CrawlSite(String baseUrl) {
		this.baseUrl = baseUrl;
	}

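	// a URL with no extension is assumed to be HTML;
	// otherwise its extension must be one of the endings above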
	protected boolean isHtml(String url) {
		url = url.substring( url.lastIndexOf('/') + 1 );
		int i = url.indexOf('.');
		if( i == -1 )
			return true;
		String ending = url.substring(i+1);
		return endings.contains(ending);
	}

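	// hook for subclasses; called once for each crawled URL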
	protected void process(String url) {
		System.out.println(url);
	}

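	// starts the crawl at baseUrl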
	public void crawl()
		throws IOException
	{
		crawl(baseUrl);
	}

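	// recursively crawls a URL, following every link that stays within baseUrl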
	private void crawl(String url)
		throws IOException
	{
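		// skip URLs that have already been crawled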
		if( !done.add(url) )
			return;

		process(url);

		if( !isHtml(url) )
			return;

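		// fetch the page and scan its tags for URL-bearing attributes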
		String page = IoUtils.readPage(url);
		Html html = new Html(page);
		Iterator iter = html.iterator();
		while( iter.hasNext() ) {
			Object o = iter.next();
			if( !(o instanceof HtmlTag) )
				continue;
			HtmlTag tag = (HtmlTag)o;
			String tagName = tag.getName().toLowerCase();
			String attrName = (String)tagMap.get(tagName);
			if( attrName==null )
				continue;
			String val = tag.getAttributeValue(attrName);
			if( val==null )
				continue;
			String url2 = HtmlTag.unquote(val);
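			// resolve relative links against the current page and skip links that leave the site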
			url2 = new URL(new URL(url),url2).toString();
			if( !url2.startsWith(baseUrl) )
				continue;
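			// drop any #fragment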
			int i = url2.indexOf('#');
			if( i != -1 )
				url2 = url2.substring(0,i);
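			// normalize extensionless paths to end with '/'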
			String file = url2.substring( url2.lastIndexOf('/') + 1 );
			if( file.length() > 0 && file.indexOf('.') == -1 )
				url2 += '/';
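			// report broken links but keep crawling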
			try {
				crawl(url2);
			} catch(IOException e) {
				// System.err.println(e);
				System.err.println(e+" referred to from "+url);
				// e.printStackTrace();
			}
		}
	}

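	// command-line entry point: args[0] is the base URL to crawl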
	public static void main(String[] args) throws Exception {
		new CrawlSite(args[0]).crawl();
	}
}