User:Hendrik Brummermann/XHTMLDumper.java

//
// This work is licensed under CC-BY
// (Creative Commons License - Attribution 2.0).
// see: http://creativecommons.org/licenses/by/2.0/

// You need the program "tidy" in your system's search path.

/*
 * $Log: XHTMLDumper.java,v $
 * Revision 1.8  2005/01/08 12:01:30  nhb
 * Fixing invocation of wget
 *
 * Revision 1.7  2004/12/11 18:08:28  nhb
 * Store output of tidy into a file instead of reading it directly from stdout.
 * Do not depend on node.toString dumping the whole xml tree.
 *
 * Revision 1.6  2004/09/28 19:50:46  nhb
 * Bugfix: removed the duplicate head element and set the left column to 0 in the inline stylesheet
 *
 * Revision 1.5  2004/09/28 19:09:55  nhb
 * - Script-based download of all links on a page.
 * - The head section is replaced.
 * - Image URLs are rewritten accordingly.
 * - Output of a wget script for images.
 * - No more re-encoding from UTF-8 to ISO-8859-1.
 *
 * Revision 1.4  2004/08/29 18:12:50  nhb
 * New class: Book
 *
 * Revision 1.3  2004/08/28 21:27:26  nhb
 * *** empty log message ***
 *
 * Revision 1.2  2004/08/28 08:16:09  nhb
 * Refactoring
 *
 * Revision 1.1  2004/08/23 22:16:56  nhb
 * initial checkin
 */
package nhb.wikipedia;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.StringWriter;
import java.net.URL;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.xpath.XPathAPI;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.traversal.NodeIterator;
import org.xml.sax.SAXException;

/**
 * Fetches a collection of articles to disk.
 * It can rewrite links and remove the MediaWiki navigation.
 * Several articles can be combined into one file.
 *
 * @author Hendrik Brummermann
 * @link http://creativecommons.org/licenses/by/2.0/
 */
public class XHTMLDumper {
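    // Usage sketch (derived from main() below; the page title is only a placeholder):
    //     java nhb.wikipedia.XHTMLDumper Title_of_link_list
    // where "Title_of_link_list" names a wiki page whose [[links]] list the articles to dump.
    // The articles are saved below TARGET and a wget script for images is printed to stdout.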

    // --> --> --> --> --> --> --> --> --> --> --> --> --> --> -->
    private static final String URL_PREFIX = "http://localhost:10080"; // "http://wiki"
    private static final String WIKI_PATH = "/mediawiki/index.php/";
    private static final String UPLOAD_PATH = "/mediawiki/images";
    private static final String TARGET = "/tmp/wiki";
    private static final String IMAGE_FOLDER = "wiki_files";
    private static final String ID_SEP = "_____";
    // <-- <-- <-- <-- <-- <-- <-- <-- <-- <-- <-- <-- <-- <-- <--

    public XHTMLDumper() {
    }

    public class Article {
        // Variables
        // private final Namespace NS_HTML = Namespace.getNamespace("http://www.w3.org/1999/xhtml");
        private String title = null;
        private String url = null;
        private Element root = null;
        private Element content = null;
        private boolean unifyIDs = false;
        private boolean convertShortTags = true;
        private boolean fetchPageRequisites = true;
        private boolean rewriteLocalURLs = true;
        private Set pageRequisites = new HashSet();

        // replacement head element used by replaceHead(); the surrounding tags are reconstructed
        private String head = "<head>\r\n"
                + "<style type=\"text/css\">\r\n"
                + "/*<![CDATA[*/ @import \"wiki_files/main.css\"; /*]]>*/#content {margin: 0} \r\n"
                + "</style>\r\n"
                + "<title>HISLSF - Dokumentation</title>\r\n"
                + "</head>"; // DB-Interface-Admin - His

        public Article(String title) {
            this.title = title;
            url = URL_PREFIX + WIKI_PATH + title;
            fetchAsXHTML();
        }

        public void process() {
            extractContent();
            unifyIDsAndConvertShortTags(root);
        }

        /**
         * Stores the file to disk.
         *
         * @throws IOException on an I/O error
         * @throws ParserConfigurationException on an XML parser configuration error
         * @throws SAXException on an XML error
         */
        public void saveToDisk() throws SAXException, IOException, ParserConfigurationException {
            process();
            replaceHead();
            // fetchPageRequisites();
            String filename = TARGET + "/" + title.replace(' ', '_').replace('/', '-') + ".html";
            OutputStream of = new FileOutputStream(filename);
            of.write(XMLUtils.dumpXML(root).getBytes("UTF-8"));
            of.close();
        }

        /**
         * Replaces the head element.
         *
         * @throws IOException on an I/O error
         * @throws ParserConfigurationException on an XML parser configuration error
         * @throws SAXException on an XML error
         */
        private void replaceHead() throws SAXException, IOException, ParserConfigurationException {
            Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(new ByteArrayInputStream(head.getBytes()));
            Node oldHead = null;
            for (int i = 0; i < root.getChildNodes().getLength(); i++) {
                oldHead = root.getChildNodes().item(i);
                if ((oldHead instanceof Element) && ((Element) oldHead).getNodeName().equalsIgnoreCase("head")) {
                    break;
                }
            }
            Node newHead = doc.getDocumentElement();
            newHead = root.getOwnerDocument().importNode(newHead, true);
            root.insertBefore(newHead, oldHead);
            root.removeChild(oldHead);
        }

        /**
         * Downloads an HTML document, converts it into XHTML using tidy
         * and parses it into an XML object tree.
         */
        private void fetchAsXHTML() {
            try {
                // fetch
                String file = NetUtil.fetchDocumentAsFile(url);

                // run tidy: -q quiet, -asxhtml convert to XHTML, -utf8 keep UTF-8, -o write output to file
                // "tidy -asxhtml -utf8 $1 >$1.html 2> /dev/null"
                // Process process = Runtime.getRuntime().exec("tidy -q -asxhtml -utf8 " + file);
                String outFile = File.createTempFile("xhtml", ".html").getAbsolutePath();
                Process process = Runtime.getRuntime().exec("tidy -q -asxhtml -utf8 -o " + outFile + " " + file);
                /* System.out.println("sleeping");
                Thread.sleep(5000);
                System.out.println("sleeped"); */
                process.waitFor();
                // Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(process.getInputStream());
                Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(new FileInputStream(outFile));
                root = doc.getDocumentElement();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        /**
         * Extracts the content (i.e. strips the navigation).
         */
        private void extractContent() {
            try {
                content = (Element) XPathAPI.selectSingleNode(root, "//div[@id='content']");

                // delete some elements
                // XMLUtils.removeChildren(root, "//self::node[@id='contentSub' or @id='siteSub' or @id='toc' or @class='printfooter' or @id='catlinks' or @class='editsection']");
                XMLUtils.removeChildren(root, "//self::node[@id='column-one' or @id='footer' or @id='contentSub' or @id='siteSub' or @class='printfooter' or @id='catlinks' or @class='editsection']");
            } catch (TransformerException e) {
                e.printStackTrace();
            }
        }

        /* name: top   id: contentTop   id: bodyContent   id: contentSub */

        private void unifyIDsAndConvertShortTags(Element element) {
            NodeList nodes = element.getChildNodes();
            for (int i = 0; i < nodes.getLength(); i++) {
                Node node = nodes.item(i);
                if (node instanceof Element) {
                    Element e = (Element) node;
                    String nodeName = node.getNodeName();

                    // unify IDs
                    if (unifyIDs && "a".equals(nodeName)) {
                        String val = e.getAttribute("name");
                        if (!val.equals("")) {
                            e.setAttribute("name", title + ID_SEP + val);
                            e.setAttribute("id", title + ID_SEP + val);
                        }
                        val = e.getAttribute("href");
                        if ((val.length() > 1) && val.charAt(0) == '#') {
                            e.setAttribute("href", title + ID_SEP + val.substring(1));
                            System.out.println("#" + title + ID_SEP + val.substring(1));
                        }
                    }

                    // convert short tags
                    if (convertShortTags && ("a".equals(nodeName) || "div".equals(nodeName)) && (e.getFirstChild() == null)) {
                        e.appendChild(e.getOwnerDocument().createTextNode(""));
                    }

                    // convert links to other pages
                    if (rewriteLocalURLs && "a".equals(nodeName)) {
                        String val = e.getAttribute("href");
                        if (val.startsWith(WIKI_PATH)) {
                            e.setAttribute("href", val.substring(WIKI_PATH.length()));
                        }
                    }

                    // collect image URLs and rewrite img-src links
                    if (fetchPageRequisites && "img".equals(nodeName)) {
                        String url = e.getAttribute("src");
                        pageRequisites.add(url);
                        if (url.startsWith(UPLOAD_PATH)) {
                            url = IMAGE_FOLDER + url.substring(UPLOAD_PATH.length() + 5);
                            e.setAttribute("src", url);
                        }
                    }

                    // go to the next level
                    unifyIDsAndConvertShortTags((Element) node);
                }
            }
        }

        /**
         * Returns a set of page requisites (like images).
         *
         * @return Set
         */
        public Set getPageRequisites() {
            return pageRequisites;
        }

        /**
         * Returns the XML object.
         *
         * @return Element
         */
        public Element getXML() {
            if (content != null) {
                return content;
            } else {
                return root;
            }
        }
    }

    public class Book {
        private Set pages = new HashSet();
        private Set pageRequisites = new HashSet();

        /**
         * Creates a new book.
         *
         * @param name page containing a list of links
         * @throws IOException
         */
        public Book(String name) throws IOException {
            // fetch wiki text
            BufferedReader br = NetUtil.fetchDocumentAsBufferedReader(URL_PREFIX + WIKI_PATH + name + "?action=raw");
            fetchLinkList(br);
            br.close();
        }

        /**
         * Fetches all pages of this book.
         *
         * @throws IOException
         * @throws ParserConfigurationException
         * @throws SAXException
         */
        public void fetchBook() throws SAXException, IOException, ParserConfigurationException {
            Iterator itr = pages.iterator();
            while (itr.hasNext()) {
                String page = (String) itr.next();
                System.out.println("fetching " + page + "...");
                Article article = new Article(page);
                article.saveToDisk();
                pageRequisites.addAll(article.getPageRequisites());
            }
            fetchPageRequisites();
        }

        private void fetchPageRequisites() {
            System.out.println("cd " + TARGET + "/" + IMAGE_FOLDER);
            Iterator itr = pageRequisites.iterator();
            while (itr.hasNext()) {
                System.out.println("wget -N " + URL_PREFIX + itr.next());
            }
        }
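        // fetchPageRequisites() only prints a download script to stdout; with the constants
        // defined at the top of the class the output would look roughly like this
        // (the image paths are illustrative, not real files):
        //     cd /tmp/wiki/wiki_files
        //     wget -N http://localhost:10080/mediawiki/images/3/3f/Example.png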

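        // fetchLinkList() below scans raw wiki text for [[Page name]] and [[Page name|label]]
        // links; e.g. a line such as "* [[Some Article|see this]]" (hypothetical) adds the
        // page "Some_Article" to the set of pages.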
        private void fetchLinkList(BufferedReader br) throws IOException {
            String line = br.readLine();
            while (line != null) {
                int pos = line.indexOf("[[");
                while (pos > -1) {
                    line = line.substring(pos + 2);
                    int posEnd = line.indexOf("]]");
                    if (posEnd == -1) { // is the link closed?
                        break;
                    }
                    String link = line.substring(0, posEnd);
                    pos = link.indexOf("|");
                    if (pos > -1) {
                        link = link.substring(0, pos);
                    }
                    link = link.trim();
                    String page = link;
                    if (page.length() == 0) {
                        continue;
                    }
                    page = page.replace(' ', '_');
                    pages.add(page);

                    // find next link
                    line = line.substring(posEnd + 2);
                    pos = line.indexOf("[[");
                }
                line = br.readLine();
            }
        }
    }

    public class Cover {
        private Element root = null;

        public Cover(String name) {
            Article cover = new Article(name);
            root = cover.getXML();
            XMLUtils.removeChildren(root, "//div[@id='content']/*");
            XMLUtils.removeChildren(root, "//div[@id='column-one' or @id='footer']");
            try {
                Element content = (Element) XPathAPI.selectSingleNode(root, "//div[@id='content']");
            } catch (TransformerException e) {
                e.printStackTrace();
            }
        }

        public Element getXML() {
            return root;
        }
    }

    public static class XMLUtils {
        /** hide constructor */
        private XMLUtils() {
        }

        /**
         * Dumps an XML tree into a String.
         *
         * @param node xml node
         * @return String
         */
        public static String dumpXML(Node node) {
            try {
                // Message-ID:  From: "Billy Ng"
                DOMSource source = new DOMSource(node);
                TransformerFactory tfFactory = TransformerFactory.newInstance();
                Transformer transformer = tfFactory.newTransformer();
                StringWriter sw = new StringWriter();
                StreamResult result = new StreamResult(sw);
                transformer.transform(source, result);
                return sw.toString();
            } catch (TransformerConfigurationException e) {
                e.printStackTrace();
            } catch (TransformerException e) {
                e.printStackTrace();
            }
            return "";
        }

        public static void removeChildren(Element parent, String xpath) {
            try {
                NodeIterator itr = XPathAPI.selectNodeIterator(parent, xpath);
                Node node = itr.nextNode();
                Set set = new HashSet();
                while (node != null) {
                    set.add(node);
                    node = itr.nextNode();
                }
                Iterator itr2 = set.iterator();
                while (itr2.hasNext()) {
                    node = (Node) itr2.next();
                    node.getParentNode().removeChild(node);
                }
            } catch (TransformerException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Utility class for network access.
     */
    public static final class NetUtil {
        private static final int BUFFER_SIZE = 10240;

        /** Hide constructor */
        private NetUtil() {
        }

        /**
         * Returns a BufferedReader for the target of the URL.
         *
         * @param urlString URL
         * @return BufferedReader
         * @throws IOException on an I/O error
         */
        public static BufferedReader fetchDocumentAsBufferedReader(String urlString) throws IOException {
            URL url = new URL(urlString);
            InputStream is = url.openStream();
            return new BufferedReader(new InputStreamReader(is));
        }

        /**
         * Downloads a document from the network and stores it in a local file.
         *
         * @param urlString URL
         * @return file name
         * @throws IOException on an error
         */
        public static String fetchDocumentAsFile(String urlString) throws IOException {
            byte[] temp = new byte[BUFFER_SIZE + 1];
            URL url = new URL(urlString);
            BufferedInputStream is = new BufferedInputStream(url.openStream());
            File file = File.createTempFile("dump", ".html");
            file.deleteOnExit();
            String tempFile = file.getAbsolutePath();
            BufferedOutputStream os = new BufferedOutputStream(new FileOutputStream(tempFile));
            while (true) {
                int aval = is.available();
                if (aval == 0) {
                    try {
                        Thread.sleep(100);
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                }
                int count = is.read(temp, 0, BUFFER_SIZE);
                if (count == -1) {
                    break;
                }
                os.write(temp, 0, count);
            }
            is.close();
            os.close();
            return tempFile;
        }

}

    /**
     * Main entry point.
     *
     * @param args command line arguments
     * @throws Exception if something unexpected happened
     */
    public static void main(String[] args) throws Exception {
        if (args.length == 0) {
            System.err.println("Usage: nhb.wikipedia.XHTMLDumper title-of-link-list");
            System.err.println("   title-of-link-list is the title of the page containing a list of links.");
            System.exit(1);
        }
        XHTMLDumper xd = new XHTMLDumper();
        Book book = xd.new Book(args[0]);
        book.fetchBook();

        /* article.process();
        System.out.println(article.getXML()); */

System.out.println("Fertig."); } }

//