/**
 * Parses the given input with a NekoHTML {@code DOMParser} configured with a
 * two-stage filter chain: an {@code ElementRemover} that lets through only a
 * fixed set of text and link elements, followed by an {@code HTMLWriter} that
 * writes to an in-memory {@code StringWriter} using the "UTF-8" encoding name.
 *
 * NOTE(review): in the code visible here neither {@code htmlDoc} nor the
 * contents of the {@code StringWriter} are read after parsing completes.
 * Presumably the result is consumed by code that is not part of this
 * excerpt -- confirm against the full source.
 *
 * @param htmlDoc     the document being processed (not referenced in this body)
 * @param inputSource the source handed to the parser
 * @throws HTMLDocumentParserException if the filters property cannot be set
 *         on the parser, or if parsing fails with a {@code SAXException} or
 *         an {@code IOException}; the original exception is kept as the cause
 */
private void parseHTML(ProcessedDocument htmlDoc, InputSource inputSource)
        throws HTMLDocumentParserException {
    // Filter that discards every element we are not interested in.
    ElementRemover elementFilter = new ElementRemover();

    // Elements accepted with a null attribute list.
    String[] plainTags = { "html", "title", "body", "b", "i", "u", "p", "br" };
    for (String tag : plainTags) {
        elementFilter.acceptElement(tag, null);
    }

    // Elements accepted together with an explicit list of attributes.
    elementFilter.acceptElement("meta", new String[] { "name", "content" });
    elementFilter.acceptElement("base", new String[] { "href" });
    elementFilter.acceptElement("a", new String[] { "href", "rel" });

    // Elements that are removed completely.
    elementFilter.removeElement("script");
    elementFilter.removeElement("style");

    // Second stage of the chain: write the filtered markup into a buffer.
    StringWriter buffer = new StringWriter();
    XMLDocumentFilter serializer = new HTMLWriter(buffer, "UTF-8");
    XMLDocumentFilter[] filterChain = new XMLDocumentFilter[] { elementFilter, serializer };

    // NekoHTML parser; the filter chain is installed through a parser property.
    DOMParser htmlParser = new DOMParser();
    final String filtersProperty = "http://cyberneko.org/html/properties/filters";
    try {
        htmlParser.setProperty(filtersProperty, filterChain);
    } catch (SAXException propertyFailure) {
        throw new HTMLDocumentParserException("Property is not supported", propertyFailure);
    }

    try {
        htmlParser.parse(inputSource);
    } catch (SAXException parseFailure) {
        throw new HTMLDocumentParserException("Parsing error: ", parseFailure);
    } catch (IOException ioFailure) {
        throw new HTMLDocumentParserException("Parsing error: ", ioFailure);
    }
}
Etiketler: Java