|
What this is
Other links
The source code// $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/Parser.java,v 1.2 2004/02/10 13:41:10 woolfel Exp $ /* * ==================================================================== * Copyright 2002-2004 The Apache Software Foundation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ // The developers of JMeter and Apache are grateful to the developers // of HTMLParser for giving Apache Software Foundation a non-exclusive // license. The performance benefits of HTMLParser are clear and the // users of JMeter will benefit from the hard work of the HTMLParser // team. For detailed information about HTMLParser, the project is // hosted on sourceforge at http://htmlparser.sourceforge.net/. // // HTMLParser was originally created by Somik Raha in 2000. Since then // a healthy community of users has formed and helped refine the // design so that it is able to tackle the difficult task of parsing // dirty HTML. Derrick Oswald is the current lead developer and was kind // enough to assist JMeter. 
package org.htmlparser; ////////////////// // Java Imports // ////////////////// import java.io.BufferedInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.Serializable; import java.io.StringReader; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URLConnection; import java.util.HashMap; import java.util.Hashtable; import java.util.Map; import org.htmlparser.parserHelper.ParserHelper; import org.htmlparser.parserHelper.TagParser; import org.htmlparser.scanners.AppletScanner; import org.htmlparser.scanners.BodyScanner; import org.htmlparser.scanners.BulletListScanner; import org.htmlparser.scanners.DivScanner; import org.htmlparser.scanners.DoctypeScanner; import org.htmlparser.scanners.FormScanner; import org.htmlparser.scanners.FrameSetScanner; import org.htmlparser.scanners.HeadScanner; import org.htmlparser.scanners.HtmlScanner; import org.htmlparser.scanners.JspScanner; import org.htmlparser.scanners.LinkScanner; import org.htmlparser.scanners.MetaTagScanner; import org.htmlparser.scanners.ScriptScanner; import org.htmlparser.scanners.StyleScanner; import org.htmlparser.scanners.TableScanner; import org.htmlparser.scanners.TagScanner; import org.htmlparser.scanners.TitleScanner; import org.htmlparser.tags.EndTag; import org.htmlparser.tags.ImageTag; import org.htmlparser.tags.LinkTag; import org.htmlparser.tags.MetaTag; import org.htmlparser.tags.Tag; import org.htmlparser.util.DefaultParserFeedback; import org.htmlparser.util.IteratorImpl; import org.htmlparser.util.NodeIterator; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; import org.htmlparser.util.ParserFeedback; import org.htmlparser.visitors.NodeVisitor; /** * This is the class that the user will use, either to get an iterator into * the html page or to directly 
parse the page and print the results *
* Important: If you are using this constructor, and you would like to use the parser
* to parse multiple times (multiple calls to parser.elements()), you must ensure the following:
resourceLocn , url_conn , character_set
* and reader . It does not adjust the scanners list
* or feedback object. The four fields are set atomically by
* this method, either they are all set or none of them is set. Trying to
* set the connection to null is a noop.
* @param connection A fully conditioned connection. The connect()
* method will be called so it need not be connected yet.
* @exception ParserException if the character set specified in the
* HTTP header is not supported, or an i/o exception occurs creating the
* reader.
*/
public void setConnection(URLConnection connection) throws ParserException
{
    // Trying to set the connection to null is a no-op.
    if (null == connection)
        return;

    // Snapshot every field that the code below (directly, or through
    // createReader()) overwrites, so that a failure leaves the parser
    // exactly as it was before the call.
    String savedLocation = getURL();
    NodeReader savedReader = getReader();
    String savedCharset = getEncoding();
    URLConnection savedConnection = getConnection();
    BufferedInputStream savedInput = input;
    try
    {
        resourceLocn = connection.getURL().toExternalForm();
        url_conn = connection;
        url_conn.connect();
        character_set = getCharacterSet(url_conn);
        createReader();
    }
    catch (IOException ioe)
    {
        String msg =
            "setConnection() : Error in opening a connection to "
                + connection.getURL().toExternalForm();
        ParserException ex = new ParserException(msg, ioe);
        feedback.error(msg, ex);
        // Roll back. createReader() replaces 'input' before it builds the
        // character reader, so the buffered stream has to be restored as
        // well; otherwise a later recreateReader() would reset() a stream
        // that belongs to the connection that just failed.
        resourceLocn = savedLocation;
        url_conn = savedConnection;
        character_set = savedCharset;
        reader = savedReader;
        input = savedInput;
        throw ex;
    }
}
/**
 * Return the connection this parser is currently bound to.
 * @return The connection last installed through
 * {@link #setConnection(URLConnection)} (which {@link #setURL(String)}
 * also goes through), or <code>null</code> when there is none, e.g.
 * after <code>setReader()</code> has cleared it.
 * @see #setConnection(URLConnection)
 */
public URLConnection getConnection()
{
    return url_conn;
}
/**
 * Set the URL for this parser.
 * A connection for the URL is obtained from
 * <code>ParserHelper.openConnection()</code> and handed to
 * {@link #setConnection(URLConnection)}, so the same fields are affected:
 * <code>resourceLocn</code>, <code>url_conn</code>,
 * <code>character_set</code> and <code>reader</code>. The scanners list
 * and the feedback object are not adjusted.
 * Trying to set the url to <code>null</code> or an empty string is a noop.
 * @param url The location to parse.
 * @exception ParserException propagated from opening the connection or
 * from <code>setConnection()</code>.
 * @see #setConnection(URLConnection)
 */
public void setURL(String url) throws ParserException
{
    if (null == url || "".equals(url))
        return;
    setConnection(ParserHelper.openConnection(url, getFeedback()));
}
/**
 * Return the location of the resource currently being parsed.
 * @return The stored resource location: the external form of the
 * connection's URL when set through <code>setConnection()</code>, or
 * whatever the reader reports from <code>getURL()</code> when set through
 * <code>setReader()</code>.
 */
public String getURL()
{
    return resourceLocn;
}
/**
 * Set the encoding for this parser.
 * If there is no connection (<code>getConnection()</code> returns null)
 * this simply records the character set name in the parser (note: the
 * reader object already in place may or may not be using this character
 * set).
 * Otherwise the reader is rebuilt through <code>recreateReader()</code>
 * so that it decodes the connection's input with the new character set.
 * In that case the <code>character_set</code> and <code>reader</code>
 * fields change together: if rebuilding the reader fails, both (and the
 * buffered input stream) are put back to their previous values before the
 * exception is thrown. <code>resourceLocn</code>, <code>url_conn</code>,
 * the scanners and the feedback object are never touched.
 * Trying to set the encoding to null or an empty string is a noop.
 * @param encoding The name of the character set to use.
 * @exception ParserException If an i/o error occurs while the reader is
 * being recreated.
 */
public void setEncoding(String encoding) throws ParserException
{
    if (null == encoding || "".equals(encoding))
        return;

    if (null == getConnection())
    {
        // No stream to re-decode; just remember the name.
        character_set = encoding;
        return;
    }

    // Remember the current state so a failed switch can be undone.
    NodeReader previousReader = getReader();
    String previousCharset = getEncoding();
    BufferedInputStream previousInput = input;
    try
    {
        character_set = encoding;
        recreateReader();
    }
    catch (IOException ioe)
    {
        String msg =
            "setEncoding() : Error in opening a connection to "
                + getConnection().getURL().toExternalForm();
        ParserException ex = new ParserException(msg, ioe);
        feedback.error(msg, ex);
        character_set = previousCharset;
        reader = previousReader;
        input = previousInput;
        throw ex;
    }
}
/**
 * The current encoding.
 * This item is set from the HTTP header when a connection is installed,
 * but may be overridden by a Content-Type meta tag in the head, so it may
 * change once the head has been parsed.
 * @return The name of the character set currently in effect.
 */
public String getEncoding()
{
    return character_set;
}
/**
 * Set the reader for this parser.
 * Four fields of the parser are updated: <code>resourceLocn</code> (taken
 * from the reader's <code>getURL()</code>), <code>reader</code>,
 * <code>character_set</code> and <code>url_conn</code>. The scanners list
 * and the feedback object are left alone. Since neither a connection nor
 * a character set can be determined from a reader, <code>url_conn</code>
 * becomes <code>null</code> and <code>character_set</code> falls back to
 * the default character set.
 * Trying to set the reader to <code>null</code> is a noop.
 * @param rd The reader object to use. This reader will be bound to this
 * parser after this call.
 */
public void setReader(NodeReader rd)
{
    if (null == rd)
        return;

    resourceLocn = rd.getURL();
    reader = rd;
    character_set = DEFAULT_CHARSET;
    url_conn = null;
    // Bind the reader back to this parser.
    rd.setParser(this);
}
/**
 * Return the reader this parser is currently pulling nodes from.
 * @return The reader installed by <code>setReader()</code> or built
 * internally for the current connection.
 */
public NodeReader getReader()
{
    return (reader);
}
/**
 * Get the number of entries currently held in the parser's scanner map.
 * Note that <code>addScanner()</code> stores a scanner once per id it
 * reports, so a scanner handling several ids is counted several times.
 * @return int size of the scanner map
 */
public int getNumScanners()
{
    final int registered = scanners.size();
    return registered;
}
/**
 * Replace the set of scanners used by this parser.
 * @param newScanners Map holding the scanner objects to be used during
 * the parsing process (the map is used as-is, not copied). Passing
 * <code>null</code> installs a fresh, empty map instead.
 */
public void setScanners(Map newScanners)
{
    if (null == newScanners)
        scanners = new HashMap();
    else
        scanners = newScanners;
}
/**
 * Get the scanners currently registered in the parser.
 * @return The parser's own scanner map (not a copy), so changes made to
 * the returned map are changes to the parser's registry.
 */
public Map getScanners()
{
    return (scanners);
}
/**
 * Sets the feedback object used in scanning.
 * @param fb The new feedback object to use. Passing <code>null</code>
 * selects the parser's <code>noFeedback</code> object instead, so the
 * feedback field is never left null by this method.
 */
public void setFeedback(ParserFeedback fb)
{
    if (null == fb)
        feedback = noFeedback;
    else
        feedback = fb;
}
/**
 * Returns the feedback object that receives the parser's info, warning
 * and error messages.
 * @return The feedback object currently in use.
 */
public ParserFeedback getFeedback()
{
    return (feedback);
}
//
// Internal methods
//
/**
 * Open a stream reader on the buffered <code>InputStream</code> using the
 * current character set.
 * If that character set is not supported, a warning is sent to the
 * feedback object, the character set is revised to its default value and
 * the reader is opened with the default instead.
 * @return A reader decoding the input stream.
 * @exception UnsupportedEncodingException in the unlikely event that
 * the default character set is not supported on this platform.
 */
protected InputStreamReader createInputStreamReader()
    throws UnsupportedEncodingException
{
    try
    {
        return new InputStreamReader(input, character_set);
    }
    catch (UnsupportedEncodingException uee)
    {
        // Tell the user which page asked for what, then fall back.
        String message =
            url_conn.getURL().toExternalForm()
                + " has an encoding ("
                + character_set
                + ") which is not supported, using "
                + DEFAULT_CHARSET;
        feedback.warning(message);
        character_set = DEFAULT_CHARSET;
        return new InputStreamReader(input, character_set);
    }
}
/**
 * Create a new reader for the URLConnection object.
 * The connection's input stream is wrapped in a buffered stream that is
 * marked at its start (so that <code>recreateReader()</code> can later
 * <code>reset()</code> it), and the current character set is used to
 * transform that stream into a character reader bound to this parser.
 * @exception IOException if there is a problem constructing the reader.
 * @see #createInputStreamReader()
 * @see #getEncoding()
 */
protected void createReader() throws IOException
{
    input = new BufferedInputStream(url_conn.getInputStream());
    input.mark(Integer.MAX_VALUE);
    reader = new NodeReader(createInputStreamReader(), resourceLocn);
    reader.setParser(this);
}
/**
 * Create a new reader for the URLConnection object but reuse the input
 * stream.
 * The existing buffered stream is reset to its mark and marked again, and
 * the current character set is used to transform it into a character
 * reader. Defaults to <code>createReader()</code> if there is no existing
 * input stream.
 * @exception IOException if there is a problem constructing the reader.
 * @see #createReader()
 * @see #createInputStreamReader()
 * @see #getEncoding()
 */
protected void recreateReader() throws IOException
{
    if (null == input)
    {
        createReader();
        return;
    }

    // Rewind to the mark and re-arm it for any further restart.
    input.reset();
    input.mark(Integer.MAX_VALUE);
    reader = new NodeReader(createInputStreamReader(), resourceLocn);
    reader.setParser(this);
}
/**
 * Try and extract the character set from the HTTP header.
 * The <code>Content-Type</code> header field of the connection is passed
 * to <code>getCharset()</code>; when the field is absent the default
 * character set is used.
 * @param connection The connection with the charset info.
 * @return The character set name to use for this HTML page.
 */
protected String getCharacterSet(URLConnection connection)
{
    String contentType = connection.getHeaderField("Content-Type");
    return (null == contentType) ? DEFAULT_CHARSET : getCharset(contentType);
}
/**
* Get a CharacterSet name corresponding to a charset parameter.
* @param content A text line of the form:
* * text/html; charset=Shift_JIS ** which is applicable both to the HTTP header field Content-Type and * the meta tag http-equiv="Content-Type". * Note this method also handles non-compliant quoted charset directives such as: * * text/html; charset="UTF-8" ** and * * text/html; charset='UTF-8' ** @return The character set name to use when reading the input stream. * For JDKs that have the Charset class this is qualified by passing * the name to findCharset() to render it into canonical form. * If the charset parameter is not found in the given string, the default * character set is returned. * @see ParserHelper#findCharset * @see #DEFAULT_CHARSET */ protected String getCharset(String content) { int index; String ret; ret = DEFAULT_CHARSET; if (null != content) { index = content.indexOf(CHARSET_STRING); if (index != -1) { content = content.substring(index + CHARSET_STRING.length()).trim(); if (content.startsWith("=")) { content = content.substring(1).trim(); index = content.indexOf(";"); if (index != -1) content = content.substring(0, index); //remove any double quotes from around charset string if (content.startsWith("\"") && content.endsWith("\"") && (1 < content.length())) content = content.substring(1, content.length() - 1); //remove any single quote from around charset string if (content.startsWith("'") && content.endsWith("'") && (1 < content.length())) content = content.substring(1, content.length() - 1); ret = ParserHelper.findCharset(content, ret); // Charset names are not case-sensitive; // that is, case is always ignored when comparing charset names. if (!ret.equalsIgnoreCase(content)) { feedback.info( "detected charset \"" + content + "\", using \"" + ret + "\""); } } } } return (ret); } // // Public methods // /** * Add a new Tag Scanner. * In typical situations where you require a no-frills parser, use the registerScanners() method to add the most * common parsers. 
But when you wish to either compose a parser with only certain scanners registered, use this method. * It is advantageous to register only the scanners you want, in order to achieve faster parsing speed. This method * would also be of use when you have developed custom scanners, and need to register them into the parser. * @param scanner TagScanner object (or derivative) to be added to the list of registered scanners */ public void addScanner(TagScanner scanner) { String ids[] = scanner.getID(); for (int i = 0; i < ids.length; i++) { scanners.put(ids[i], scanner); } scanner.setFeedback(feedback); } /** * Returns an iterator (enumeration) to the html nodes. Each node can be a tag/endtag/ * string/link/image * This is perhaps the most important method of this class. In typical situations, you will need to use * the parser like this : * * Parser parser = new Parser("http://www.yahoo.com"); * parser.registerScanners(); * for (NodeIterator i = parser.elements();i.hasMoreElements();) { * Node node = i.nextHTMLNode(); * if (node instanceof StringNode) { * // Downcasting to StringNode * StringNode stringNode = (StringNode)node; * // Do whatever processing you want with the string node * System.out.println(stringNode.getText()); * } * // Check for the node or tag that you want * if (node instanceof ...) 
{ * // Downcast, and process * } * } **/ public NodeIterator elements() throws ParserException { boolean remove_scanner; Node node; MetaTag meta; String httpEquiv; String charset; boolean restart; EndTag end; IteratorImpl ret; remove_scanner = false; restart = false; ret = new IteratorImpl(reader, resourceLocn, feedback); ret = createIteratorImpl(remove_scanner, ret); return (ret); } public IteratorImpl createIteratorImpl( boolean remove_scanner, IteratorImpl ret) throws ParserException { Node node; MetaTag meta; String httpEquiv; String charset; EndTag end; if (null != url_conn) try { if (null == scanners.get("-m")) { addScanner(new MetaTagScanner("-m")); remove_scanner = true; } /* pre-read up to looking for charset directive */ while (null != (node = ret.peek())) { if (node instanceof MetaTag) { // check for charset on Content-Type meta = (MetaTag) node; httpEquiv = meta.getAttribute("HTTP-EQUIV"); if ("Content-Type".equalsIgnoreCase(httpEquiv)) { charset = getCharset(meta.getAttribute("CONTENT")); if (!charset.equalsIgnoreCase(character_set)) { // oops, different character set, restart character_set = charset; recreateReader(); ret = new IteratorImpl( reader, resourceLocn, feedback); } // once we see the Content-Type meta tag we're finished the pre-read break; } } else if (node instanceof EndTag) { end = (EndTag) node; if (end.getTagName().equalsIgnoreCase("HEAD")) // or, once we see the tag we're finished the pre-read break; } } } catch (UnsupportedEncodingException uee) { String msg = "elements() : The content of " + url_conn.getURL().toExternalForm() + " has an encoding which is not supported"; ParserException ex = new ParserException(msg, uee); feedback.error(msg, ex); throw ex; } catch (IOException ioe) { String msg = "elements() : Error in opening a connection to " + url_conn.getURL().toExternalForm(); ParserException ex = new ParserException(msg, ioe); feedback.error(msg, ex); throw ex; } finally { if (remove_scanner) scanners.remove("-m"); } return ret; 
} /** * Flush the current scanners registered. The registered scanners list becomes empty with this call. */ public void flushScanners() { scanners = new Hashtable(); } /** * Return the scanner registered in the parser having the * given id * @param id The id of the requested scanner * @return TagScanner The Tag Scanner */ public TagScanner getScanner(String id) { return (TagScanner) scanners.get(id); } /** * Parse the given resource, using the filter provided */ public void parse(String filter) throws Exception { Node node; for (NodeIterator e = elements(); e.hasMoreNodes();) { node = e.nextNode(); if (node != null) { if (filter == null) System.out.println(node.toString()); else { // There is a filter. Find if the associated filter of this node // matches the specified filter if (!(node instanceof Tag)) continue; Tag tag = (Tag) node; TagScanner scanner = tag.getThisScanner(); if (scanner == null) continue; String tagFilter = scanner.getFilter(); if (tagFilter == null) continue; if (tagFilter.equals(filter)) System.out.println(node.toString()); } } else System.out.println("Node is null"); } } /** * This method should be invoked in order to register some common scanners. The scanners that get added are : * LinkScanner (filter key "-l") * HTMLImageScanner (filter key "-i") * HTMLScriptScanner (filter key "-s") * HTMLStyleScanner (filter key "-t") * HTMLJspScanner (filter key "-j") * HTMLAppletScanner (filter key "-a") * HTMLMetaTagScanner (filter key "-m") * HTMLTitleScanner (filter key "-t") * HTMLDoctypeScanner (filter key "-d") * HTMLFormScanner (filter key "-f") * HTMLFrameSetScanner(filter key "-r") * HTMLBaseHREFScanner(filter key "-b") * * Call this method after creating the Parser object. e.g. 
* * Parser parser = new Parser("http://www.yahoo.com"); * parser.registerScanners(); **/ public void registerScanners() { if (scanners.size() > 0) { System.err.println( "registerScanners() should be called first, when no other scanner has been registered."); System.err.println( "Other scanners already exist, hence this method call wont have any effect"); return; } LinkScanner linkScanner = new LinkScanner(LinkTag.LINK_TAG_FILTER); // Note - The BaseHREF and Image scanners share the same // link processor - internally linked up with the factory // method in the link scanner class addScanner(linkScanner); addScanner(linkScanner.createImageScanner(ImageTag.IMAGE_TAG_FILTER)); addScanner(new ScriptScanner("-s")); addScanner(new StyleScanner("-t")); addScanner(new JspScanner("-j")); addScanner(new AppletScanner("-a")); addScanner(new MetaTagScanner("-m")); addScanner(new TitleScanner("-T")); addScanner(new DoctypeScanner("-d")); addScanner(new FormScanner("-f", this)); addScanner(new FrameSetScanner("-r")); addScanner(linkScanner.createBaseHREFScanner("-b")); addScanner(new BulletListScanner("-bulletList", this)); // addScanner(new SpanScanner("-p")); addScanner(new DivScanner("-div")); addScanner(new TableScanner(this)); } /** * Make a call to registerDomScanners(), instead of registerScanners(), * when you are interested in retrieving a Dom representation of the html * page. Upon parsing, you will receive an Html object - which will contain * children, one of which would be the body. This is still evolving, and in * future releases, you might see consolidation of Html - to provide you * with methods to access the body and the head. */ public void registerDomScanners() { registerScanners(); addScanner(new HtmlScanner()); addScanner(new BodyScanner()); addScanner(new HeadScanner()); } /** * Removes a specified scanner object. You can create * an anonymous object as a parameter. This method * will use the scanner's key and remove it from the * registry of scanners. 
* e.g. * * removeScanner(new FormScanner("")); ** @param scanner TagScanner object to be removed from the list of registered scanners */ public void removeScanner(TagScanner scanner) { scanners.remove(scanner.getID()[0]); } /** * The main program, which can be executed from the command line */ public static void main(String[] args) { System.out.println("HTMLParser v" + VERSION_STRING); if (args.length < 1 || args[0].equals("-help")) { System.out.println(); System.out.println( "Syntax : java -jar htmlparser.jar |
... this post is sponsored by my books ... | |
#1 New Release! |
FP Best Seller |
Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.
A percentage of advertising revenue from
pages under the /java/jwarehouse
URI on this website is
paid back to open source projects.