comparison of src/fschmidt/tools/CrawlSite.java @ 68:00520880ad02

add fschmidt source

author: Franklin Schmidt <fschmidt@gmail.com>
date:   Sun, 05 Oct 2025 17:24:15 -0600
comparing 67:9d0fefce6985 with 68:00520880ad02
/*
Copyright (c) 2008 Franklin Schmidt <fschmidt@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

package fschmidt.tools;

import java.io.IOException;
import java.net.URL;
import java.util.Set;
import java.util.HashSet;
import java.util.Map;
import java.util.HashMap;
import java.util.Arrays;
import java.util.Iterator;
import fschmidt.util.java.IoUtils;
import fschmidt.html.Html;
import fschmidt.html.HtmlTag;


public class CrawlSite {
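	// URL "extensions" that are treated as HTML and parsed for further links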
	private static final Set<String> endings = new HashSet<String>( Arrays.asList( new String[]{
		"html", "htm", "ssp", "jsp", "jtp",
	} ) );
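	// for each tag of interest, the attribute that may hold a URL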
	private static final Map<String,String> tagMap = new HashMap<String,String>();
	static {
		tagMap.put("a","href");
		tagMap.put("area","href");
		tagMap.put("base","href");
		tagMap.put("form","action");
		tagMap.put("frame","src");
		tagMap.put("img","src");
		tagMap.put("link","href");
		tagMap.put("input","src");
	}

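	// the root URL of the site being crawled, and the URLs crawled so far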
	private String baseUrl;
	private Set<String> done = new HashSet<String>();

	public CrawlSite(String baseUrl) {
		this.baseUrl = baseUrl;
	}

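	// a URL with no extension is assumed to be HTML;
	// otherwise its extension must be one of the endings above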
	protected boolean isHtml(String url) {
		url = url.substring( url.lastIndexOf('/') + 1 );
		int i = url.indexOf('.');
		if( i == -1 )
			return true;
		String ending = url.substring(i+1);
		return endings.contains(ending);
	}

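	// hook for subclasses; called once for each crawled URL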
	protected void process(String url) {
		System.out.println(url);
	}

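	// starts the crawl at baseUrl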
	public void crawl()
		throws IOException
	{
		crawl(baseUrl);
	}

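	// recursively crawls a URL, following every link that stays within baseUrl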
	private void crawl(String url)
		throws IOException
	{
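		// skip URLs that have already been crawled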
		if( !done.add(url) )
			return;

		process(url);

		if( !isHtml(url) )
			return;

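		// fetch the page and scan its tags for URL-bearing attributes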
		String page = IoUtils.readPage(url);
		Html html = new Html(page);
		Iterator iter = html.iterator();
		while( iter.hasNext() ) {
			Object o = iter.next();
			if( !(o instanceof HtmlTag) )
				continue;
			HtmlTag tag = (HtmlTag)o;
			String tagName = tag.getName().toLowerCase();
			String attrName = (String)tagMap.get(tagName);
			if( attrName==null )
				continue;
			String val = tag.getAttributeValue(attrName);
			if( val==null )
				continue;
			String url2 = HtmlTag.unquote(val);
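			// resolve relative links against the current page and skip links that leave the site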
			url2 = new URL(new URL(url),url2).toString();
			if( !url2.startsWith(baseUrl) )
				continue;
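			// drop any #fragment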
			int i = url2.indexOf('#');
			if( i != -1 )
				url2 = url2.substring(0,i);
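			// normalize extensionless paths to end with '/'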
			String file = url2.substring( url2.lastIndexOf('/') + 1 );
			if( file.length() > 0 && file.indexOf('.') == -1 )
				url2 += '/';
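			// report broken links but keep crawling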
			try {
				crawl(url2);
			} catch(IOException e) {
				// System.err.println(e);
				System.err.println(e+" referred to from "+url);
				// e.printStackTrace();
			}
		}
	}

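	// command-line entry point: args[0] is the base URL to crawl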
	public static void main(String[] args) throws Exception {
		new CrawlSite(args[0]).crawl();
	}
}