org.egothor.html
Class HTMLPrinter

java.lang.Object
  extended by org.cyberneko.html.filters.DefaultFilter
      extended by org.egothor.html.HTMLPrinter
All Implemented Interfaces:
org.apache.xerces.xni.parser.XMLComponent, org.apache.xerces.xni.parser.XMLDocumentFilter, org.apache.xerces.xni.parser.XMLDocumentSource, org.apache.xerces.xni.XMLDocumentHandler, org.cyberneko.html.HTMLComponent

public class HTMLPrinter
extends org.cyberneko.html.filters.DefaultFilter

This class reformats HTML documents stored in our database into a format suitable for research use. It was used for a paper on syllable XBW.


Field Summary
protected static java.lang.String AUGMENTATIONS
           
protected  boolean charNorm
           
protected  int depth
           
protected  java.lang.String enc
           
protected static java.lang.String FILTERS
           
protected  boolean httpEqSeen
           
static java.lang.String NOTIFY_CHAR_REFS
           
static java.lang.String NOTIFY_HTML_BUILTIN_REFS
           
protected  boolean printChars
           
protected  java.io.PrintWriter pw
           
protected  boolean rootSeen
           
 
Fields inherited from class org.cyberneko.html.filters.DefaultFilter
fDocumentHandler, fDocumentSource
 
Constructor Summary
HTMLPrinter(java.io.OutputStream outputStream, java.lang.String encoding)
           
HTMLPrinter(java.io.Writer writer, java.lang.String encoding, boolean strictScript)
           
 
Method Summary
 void characters(org.apache.xerces.xni.XMLString text, org.apache.xerces.xni.Augmentations augs)
           
 void comment(org.apache.xerces.xni.XMLString text, org.apache.xerces.xni.Augmentations augs)
           
 void emptyElement(org.apache.xerces.xni.QName element, org.apache.xerces.xni.XMLAttributes attributes, org.apache.xerces.xni.Augmentations augs)
           
 void endElement(org.apache.xerces.xni.QName element, org.apache.xerces.xni.Augmentations augs)
           
 void endGeneralEntity(java.lang.String name, org.apache.xerces.xni.Augmentations augs)
           
static void filter(java.lang.String systemId, byte[] file, int file_len, java.io.PrintStream ps, java.lang.String inpEnc, java.lang.String outEnc)
           
protected  void printAttributeValue(java.lang.String text)
           
protected  void printCharacters(org.apache.xerces.xni.XMLString text, boolean normalize)
           
protected  void printEndElement(org.apache.xerces.xni.QName element)
           
protected  void printEntity(java.lang.String name)
           
protected  void printStartElement(org.apache.xerces.xni.QName element, org.apache.xerces.xni.XMLAttributes attributes)
           
protected  void printStartElement(org.apache.xerces.xni.QName element, org.apache.xerces.xni.XMLAttributes attributes, boolean empty)
           
 void startDocument(org.apache.xerces.xni.XMLLocator locator, java.lang.String encoding, org.apache.xerces.xni.Augmentations augs)
           
 void startDocument(org.apache.xerces.xni.XMLLocator locator, java.lang.String encoding, org.apache.xerces.xni.NamespaceContext nscontext, org.apache.xerces.xni.Augmentations augs)
           
 void startElement(org.apache.xerces.xni.QName element, org.apache.xerces.xni.XMLAttributes attributes, org.apache.xerces.xni.Augmentations augs)
           
 void startGeneralEntity(java.lang.String name, org.apache.xerces.xni.XMLResourceIdentifier id, java.lang.String encoding, org.apache.xerces.xni.Augmentations augs)
           
 
Methods inherited from class org.cyberneko.html.filters.DefaultFilter
doctypeDecl, endCDATA, endDocument, endPrefixMapping, getDocumentHandler, getDocumentSource, getFeatureDefault, getPropertyDefault, getRecognizedFeatures, getRecognizedProperties, ignorableWhitespace, merge, processingInstruction, reset, setDocumentHandler, setDocumentSource, setFeature, setProperty, startCDATA, startPrefixMapping, textDecl, xmlDecl
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

NOTIFY_CHAR_REFS

public static final java.lang.String NOTIFY_CHAR_REFS
See Also:
Constant Field Values

NOTIFY_HTML_BUILTIN_REFS

public static final java.lang.String NOTIFY_HTML_BUILTIN_REFS
See Also:
Constant Field Values

AUGMENTATIONS

protected static final java.lang.String AUGMENTATIONS
See Also:
Constant Field Values

FILTERS

protected static final java.lang.String FILTERS
See Also:
Constant Field Values

enc

protected java.lang.String enc

pw

protected java.io.PrintWriter pw

rootSeen

protected boolean rootSeen

httpEqSeen

protected boolean httpEqSeen

depth

protected int depth

charNorm

protected boolean charNorm

printChars

protected boolean printChars
Constructor Detail

HTMLPrinter

public HTMLPrinter(java.io.OutputStream outputStream,
                   java.lang.String encoding)
            throws java.io.UnsupportedEncodingException
Throws:
java.io.UnsupportedEncodingException

HTMLPrinter

public HTMLPrinter(java.io.Writer writer,
                   java.lang.String encoding,
                   boolean strictScript)
Method Detail

startDocument

public void startDocument(org.apache.xerces.xni.XMLLocator locator,
                          java.lang.String encoding,
                          org.apache.xerces.xni.NamespaceContext nscontext,
                          org.apache.xerces.xni.Augmentations augs)
                   throws org.apache.xerces.xni.XNIException
Specified by:
startDocument in interface org.apache.xerces.xni.XMLDocumentHandler
Overrides:
startDocument in class org.cyberneko.html.filters.DefaultFilter
Throws:
org.apache.xerces.xni.XNIException

startDocument

public void startDocument(org.apache.xerces.xni.XMLLocator locator,
                          java.lang.String encoding,
                          org.apache.xerces.xni.Augmentations augs)
                   throws org.apache.xerces.xni.XNIException
Overrides:
startDocument in class org.cyberneko.html.filters.DefaultFilter
Throws:
org.apache.xerces.xni.XNIException

comment

public void comment(org.apache.xerces.xni.XMLString text,
                    org.apache.xerces.xni.Augmentations augs)
             throws org.apache.xerces.xni.XNIException
Specified by:
comment in interface org.apache.xerces.xni.XMLDocumentHandler
Overrides:
comment in class org.cyberneko.html.filters.DefaultFilter
Throws:
org.apache.xerces.xni.XNIException

startElement

public void startElement(org.apache.xerces.xni.QName element,
                         org.apache.xerces.xni.XMLAttributes attributes,
                         org.apache.xerces.xni.Augmentations augs)
                  throws org.apache.xerces.xni.XNIException
Specified by:
startElement in interface org.apache.xerces.xni.XMLDocumentHandler
Overrides:
startElement in class org.cyberneko.html.filters.DefaultFilter
Throws:
org.apache.xerces.xni.XNIException

emptyElement

public void emptyElement(org.apache.xerces.xni.QName element,
                         org.apache.xerces.xni.XMLAttributes attributes,
                         org.apache.xerces.xni.Augmentations augs)
                  throws org.apache.xerces.xni.XNIException
Specified by:
emptyElement in interface org.apache.xerces.xni.XMLDocumentHandler
Overrides:
emptyElement in class org.cyberneko.html.filters.DefaultFilter
Throws:
org.apache.xerces.xni.XNIException

characters

public void characters(org.apache.xerces.xni.XMLString text,
                       org.apache.xerces.xni.Augmentations augs)
                throws org.apache.xerces.xni.XNIException
Specified by:
characters in interface org.apache.xerces.xni.XMLDocumentHandler
Overrides:
characters in class org.cyberneko.html.filters.DefaultFilter
Throws:
org.apache.xerces.xni.XNIException

endElement

public void endElement(org.apache.xerces.xni.QName element,
                       org.apache.xerces.xni.Augmentations augs)
                throws org.apache.xerces.xni.XNIException
Specified by:
endElement in interface org.apache.xerces.xni.XMLDocumentHandler
Overrides:
endElement in class org.cyberneko.html.filters.DefaultFilter
Throws:
org.apache.xerces.xni.XNIException

startGeneralEntity

public void startGeneralEntity(java.lang.String name,
                               org.apache.xerces.xni.XMLResourceIdentifier id,
                               java.lang.String encoding,
                               org.apache.xerces.xni.Augmentations augs)
                        throws org.apache.xerces.xni.XNIException
Specified by:
startGeneralEntity in interface org.apache.xerces.xni.XMLDocumentHandler
Overrides:
startGeneralEntity in class org.cyberneko.html.filters.DefaultFilter
Throws:
org.apache.xerces.xni.XNIException

endGeneralEntity

public void endGeneralEntity(java.lang.String name,
                             org.apache.xerces.xni.Augmentations augs)
                      throws org.apache.xerces.xni.XNIException
Specified by:
endGeneralEntity in interface org.apache.xerces.xni.XMLDocumentHandler
Overrides:
endGeneralEntity in class org.cyberneko.html.filters.DefaultFilter
Throws:
org.apache.xerces.xni.XNIException

printAttributeValue

protected void printAttributeValue(java.lang.String text)

printCharacters

protected void printCharacters(org.apache.xerces.xni.XMLString text,
                               boolean normalize)

printStartElement

protected void printStartElement(org.apache.xerces.xni.QName element,
                                 org.apache.xerces.xni.XMLAttributes attributes)

printStartElement

protected void printStartElement(org.apache.xerces.xni.QName element,
                                 org.apache.xerces.xni.XMLAttributes attributes,
                                 boolean empty)

printEndElement

protected void printEndElement(org.apache.xerces.xni.QName element)

printEntity

protected void printEntity(java.lang.String name)

filter

public static void filter(java.lang.String systemId,
                          byte[] file,
                          int file_len,
                          java.io.PrintStream ps,
                          java.lang.String inpEnc,
                          java.lang.String outEnc)
                   throws java.lang.Exception
Throws:
java.lang.Exception