Mercurial Hosting > nabble
comparison src/fschmidt/tools/CrawlSite.java @ 68:00520880ad02
add fschmidt source
| author | Franklin Schmidt <fschmidt@gmail.com> |
|---|---|
| date | Sun, 05 Oct 2025 17:24:15 -0600 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| 67:9d0fefce6985 | 68:00520880ad02 |
|---|---|
| 1 /* | |
| 2 Copyright (c) 2008 Franklin Schmidt <fschmidt@gmail.com> | |
| 3 | |
| 4 Permission is hereby granted, free of charge, to any person obtaining a copy | |
| 5 of this software and associated documentation files (the "Software"), to deal | |
| 6 in the Software without restriction, including without limitation the rights | |
| 7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| 8 copies of the Software, and to permit persons to whom the Software is | |
| 9 furnished to do so, subject to the following conditions: | |
| 10 | |
| 11 The above copyright notice and this permission notice shall be included in | |
| 12 all copies or substantial portions of the Software. | |
| 13 | |
| 14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| 15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| 16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| 17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| 18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| 19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
| 20 THE SOFTWARE. | |
| 21 */ | |
| 22 | |
| 23 package fschmidt.tools; | |
| 24 | |
| 25 import java.io.IOException; | |
| 26 import java.net.URL; | |
| 27 import java.util.Set; | |
| 28 import java.util.HashSet; | |
| 29 import java.util.Map; | |
| 30 import java.util.HashMap; | |
| 31 import java.util.Arrays; | |
| 32 import java.util.Iterator; | |
| 33 import fschmidt.util.java.IoUtils; | |
| 34 import fschmidt.html.Html; | |
| 35 import fschmidt.html.HtmlTag; | |
| 36 | |
| 37 | |
| 38 public class CrawlSite { | |
| 39 private static final Set<String> endings = new HashSet<String>( Arrays.asList( new String[]{ | |
| 40 "html", "htm", "ssp", "jsp", "jtp", | |
| 41 } ) ); | |
| 42 private static final Map<String,String> tagMap = new HashMap<String,String>(); | |
| 43 static { | |
| 44 tagMap.put("a","href"); | |
| 45 tagMap.put("area","href"); | |
| 46 tagMap.put("base","href"); | |
| 47 tagMap.put("form","action"); | |
| 48 tagMap.put("frame","src"); | |
| 49 tagMap.put("img","src"); | |
| 50 tagMap.put("link","href"); | |
| 51 tagMap.put("input","src"); | |
| 52 } | |
| 53 | |
| 54 private String baseUrl; | |
| 55 private Set<String> done = new HashSet<String>(); | |
| 56 | |
| 57 public CrawlSite(String baseUrl) { | |
| 58 this.baseUrl = baseUrl; | |
| 59 } | |
| 60 | |
| 61 protected boolean isHtml(String url) { | |
| 62 url = url.substring( url.lastIndexOf('/') + 1 ); | |
| 63 int i = url.indexOf('.'); | |
| 64 if( i == -1 ) | |
| 65 return true; | |
| 66 String ending = url.substring(i+1); | |
| 67 return endings.contains(ending); | |
| 68 } | |
| 69 | |
| 70 protected void process(String url) { | |
| 71 System.out.println(url); | |
| 72 } | |
| 73 | |
| 74 public void crawl() | |
| 75 throws IOException | |
| 76 { | |
| 77 crawl(baseUrl); | |
| 78 } | |
| 79 | |
| 80 private void crawl(String url) | |
| 81 throws IOException | |
| 82 { | |
| 83 if( !done.add(url) ) | |
| 84 return; | |
| 85 | |
| 86 process(url); | |
| 87 | |
| 88 if( !isHtml(url) ) | |
| 89 return; | |
| 90 | |
| 91 String page = IoUtils.readPage(url); | |
| 92 Html html = new Html(page); | |
| 93 Iterator iter = html.iterator(); | |
| 94 while( iter.hasNext() ) { | |
| 95 Object o = iter.next(); | |
| 96 if( !(o instanceof HtmlTag) ) | |
| 97 continue; | |
| 98 HtmlTag tag = (HtmlTag)o; | |
| 99 String tagName = tag.getName().toLowerCase(); | |
| 100 String attrName = (String)tagMap.get(tagName); | |
| 101 if( attrName==null ) | |
| 102 continue; | |
| 103 String val = tag.getAttributeValue(attrName); | |
| 104 if( val==null ) | |
| 105 continue; | |
| 106 String url2 = HtmlTag.unquote(val); | |
| 107 url2 = new URL(new URL(url),url2).toString(); | |
| 108 if( !url2.startsWith(baseUrl) ) | |
| 109 continue; | |
| 110 int i = url2.indexOf('#'); | |
| 111 if( i != -1 ) | |
| 112 url2 = url2.substring(0,i); | |
| 113 String file = url2.substring( url2.lastIndexOf('/') + 1 ); | |
| 114 if( file.length() > 0 && file.indexOf('.') == -1 ) | |
| 115 url2 += '/'; | |
| 116 try { | |
| 117 crawl(url2); | |
| 118 } catch(IOException e) { | |
| 119 // System.err.println(e); | |
| 120 System.err.println(e+" referred to from "+url); | |
| 121 // e.printStackTrace(); | |
| 122 } | |
| 123 } | |
| 124 } | |
| 125 | |
| 126 public static void main(String[] args) throws Exception { | |
| 127 new CrawlSite(args[0]).crawl(); | |
| 128 } | |
| 129 } |
