Mercurial Hosting > nabble
diff src/fschmidt/tools/CrawlSite.java @ 68:00520880ad02
add fschmidt source
author | Franklin Schmidt <fschmidt@gmail.com> |
---|---|
date | Sun, 05 Oct 2025 17:24:15 -0600 |
parents | |
children |
line wrap: on
line diff
/*
Copyright (c) 2008 Franklin Schmidt <fschmidt@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

package fschmidt.tools;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Set;
import java.util.HashSet;
import java.util.Map;
import java.util.HashMap;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Locale;
import fschmidt.util.java.IoUtils;
import fschmidt.html.Html;
import fschmidt.html.HtmlTag;


/**
 * Walks the links of a web site starting from a base URL.
 *
 * <p>Every distinct URL that is reached and whose absolute form starts with
 * the base URL string is handed to {@link #process(String)} exactly once.
 * URLs that {@link #isHtml(String)} accepts are additionally fetched and
 * scanned for further links.
 *
 * <p>Subclasses customise the crawl by overriding {@link #process(String)}
 * and/or {@link #isHtml(String)}.
 *
 * <p>NOTE(review): the crawl is recursive, one stack frame per followed link,
 * so a very long chain of pages can exhaust the thread stack.
 */
public class CrawlSite {
    /** File-name extensions (lower case, no dot) that are treated as HTML pages. */
    private static final Set<String> endings = new HashSet<String>( Arrays.asList(
        "html", "htm", "ssp", "jsp", "jtp"
    ) );

    /** Maps a lower-case tag name to the attribute of that tag which carries a URL. */
    private static final Map<String,String> tagMap = new HashMap<String,String>();
    static {
        tagMap.put("a","href");
        tagMap.put("area","href");
        tagMap.put("base","href");
        tagMap.put("form","action");
        tagMap.put("frame","src");
        tagMap.put("img","src");
        tagMap.put("link","href");
        tagMap.put("input","src");
    }

    /** Prefix every crawled URL must start with; also the first URL visited. */
    private final String baseUrl;

    /** URLs that have already been visited, so each one is processed only once. */
    private final Set<String> done = new HashSet<String>();

    /**
     * Creates a crawler rooted at the given URL.
     *
     * @param baseUrl the URL to start from; only URLs starting with this string are followed
     */
    public CrawlSite(String baseUrl) {
        this.baseUrl = baseUrl;
    }

    /**
     * Decides whether a URL should be fetched and scanned for links.
     *
     * <p>A URL counts as HTML when it has no path at all (bare host), when the
     * last path segment has no extension, or when that extension is one of the
     * known HTML endings (compared case-insensitively). Any query string or
     * fragment is ignored.
     *
     * @param url the absolute URL to test
     * @return {@code true} if the URL should be parsed as an HTML page
     */
    protected boolean isHtml(String url) {
        // Query string and fragment are not part of the file name.
        int q = url.indexOf('?');
        if( q != -1 )
            url = url.substring(0,q);
        int h = url.indexOf('#');
        if( h != -1 )
            url = url.substring(0,h);

        // "http://host" has no path; without this check the host name itself
        // ("host.com") would be mistaken for a file with extension "com".
        int schemeEnd = url.indexOf("://");
        if( schemeEnd != -1 && url.indexOf('/', schemeEnd + 3) == -1 )
            return true;

        String file = url.substring( url.lastIndexOf('/') + 1 );
        // Use the LAST dot so that names like "index.en.html" are recognised.
        int dot = file.lastIndexOf('.');
        if( dot == -1 )
            return true;
        String ending = file.substring(dot+1).toLowerCase(Locale.ROOT);
        return endings.contains(ending);
    }

    /**
     * Called once for every distinct URL visited, before the URL is fetched.
     * The default implementation prints the URL to standard output.
     *
     * @param url the URL being visited
     */
    protected void process(String url) {
        System.out.println(url);
    }

    /**
     * Crawls the site starting at the base URL.
     *
     * @throws IOException if the base page itself cannot be read or its URL is malformed;
     *         failures on linked pages are reported to standard error and skipped
     */
    public void crawl()
        throws IOException
    {
        crawl(baseUrl);
    }

    /**
     * Visits one URL and, if it is an HTML page, recursively visits its links.
     *
     * @param url the absolute URL to visit
     * @throws IOException if this page cannot be read or {@code url} is malformed
     */
    private void crawl(String url)
        throws IOException
    {
        if( !done.add(url) )
            return;  // already visited

        process(url);

        if( !isHtml(url) )
            return;

        String page = IoUtils.readPage(url);
        Html html = new Html(page);
        // Parsed once per page; relative links are resolved against it.
        URL context = new URL(url);
        Iterator<?> iter = html.iterator();
        while( iter.hasNext() ) {
            Object o = iter.next();
            if( !(o instanceof HtmlTag) )
                continue;
            String link = linkOf( (HtmlTag)o );
            if( link == null )
                continue;

            String url2;
            try {
                url2 = resolve(context,link);
            } catch(MalformedURLException e) {
                // One unparseable link (e.g. "javascript:...") must not stop
                // the remaining links of this page from being followed.
                System.err.println(e+" referred to from "+url);
                continue;
            }
            if( url2 == null )
                continue;  // outside the site

            try {
                crawl(url2);
            } catch(IOException e) {
                System.err.println(e+" referred to from "+url);
            }
        }
    }

    /**
     * Returns the unquoted value of the URL-carrying attribute of a tag, or
     * {@code null} if the tag is not a link-bearing tag or lacks that attribute.
     */
    private static String linkOf(HtmlTag tag) {
        // Locale.ROOT keeps "IMG" -> "img" regardless of the default locale.
        String tagName = tag.getName().toLowerCase(Locale.ROOT);
        String attrName = tagMap.get(tagName);
        if( attrName == null )
            return null;
        String val = tag.getAttributeValue(attrName);
        if( val == null )
            return null;
        return HtmlTag.unquote(val);
    }

    /**
     * Turns a link found on a page into the absolute, normalised URL to crawl,
     * or {@code null} if the link leads outside the base URL.
     *
     * @param context the URL of the page the link was found on
     * @param link the raw link text
     * @throws MalformedURLException if the link cannot be parsed as a URL
     */
    private String resolve(URL context, String link)
        throws MalformedURLException
    {
        String url2 = new URL(context,link).toString();
        if( !url2.startsWith(baseUrl) )
            return null;

        // The fragment only addresses a position inside the same document.
        int i = url2.indexOf('#');
        if( i != -1 )
            url2 = url2.substring(0,i);

        // An extension-less last segment is taken to be a directory and gets a
        // trailing slash. Skipped when a query string is present, because the
        // slash would otherwise be appended to the query value.
        if( url2.indexOf('?') == -1 ) {
            String file = url2.substring( url2.lastIndexOf('/') + 1 );
            if( file.length() > 0 && file.indexOf('.') == -1 )
                url2 += '/';
        }
        return url2;
    }

    /**
     * Command-line entry point: crawls the site whose base URL is the first argument.
     *
     * @param args {@code args[0]} is the base URL
     */
    public static void main(String[] args) throws Exception {
        if( args.length < 1 ) {
            System.err.println("usage: java fschmidt.tools.CrawlSite <baseUrl>");
            System.exit(1);
        }
        new CrawlSite(args[0]).crawl();
    }
}