lexer html解析一个js过滤的改进

edwardpro

浏览: 300562 次
性别:

最近访客更多访客>>

fxstiandi

snai_user

1040979038

lost-java

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

技术私语

问题描述，使用htmlparser的lexer解析器进行页面解析时发现类似如下的页面会有问题：

<script>
for(i=0;i<a;i++){

}
</script>

解析后代码变成了：
<script>
for(i=0;i<a;i++){

}
></script>

通过lexer代码发现，实际上只要js代码改成：

<script>
<!--
for(i=0;i<a;i++){

}
-->
</script>

就不会有问题了。阅读lexer的代码后发现，它的解析逻辑本身其实没有问题，主要是我们平时的页面写得不够规范：它在逐字符解析文本时，只要发现 < 后面紧跟着一个字母，就会认为这是一个tag的开始：

	/**
	 * Scans a run of text, advancing the cursor until the end of the page or
	 * until a {@code <} that looks like the beginning of a tag is found.
	 * <p>
	 * A {@code <} is treated as a tag start when the character after it is
	 * {@code /}, a letter, {@code !}, {@code %} or {@code ?}; in that case both
	 * characters are pushed back so the cursor stops just before the
	 * {@code <}. This is why script text such as {@code i<a} ends the string
	 * early.
	 *
	 * @param start
	 *            position handed to {@code makeString} as the beginning of the
	 *            text node.
	 * @param quotesmart
	 *            when {@code true}, quoted sections and {@code //} and
	 *            {@code /* *}{@code /} comments are tracked/skipped so that a
	 *            {@code <} inside a quoted section or comment is not examined.
	 * @return the node produced by {@code makeString} for the range from
	 *         {@code start} to the final cursor position.
	 * @throws ParserException
	 *             declared by the signature; presumably raised by the page
	 *             reading calls - confirm against the library.
	 */
	protected Node parseString(int start, boolean quotesmart)
			throws ParserException {
		boolean done;
		char ch;
		// the quote character that opened the current quoted section, or 0
		// when not inside quotes
		char quote;

		done = false;
		quote = 0;
		while (!done) {
			ch = mPage.getCharacter(mCursor);
			if (Page.EOF == ch)
				done = true;
			else if (0x1b == ch) // escape
			{
				// look for the sequences ESC $ @ and ESC $ B; anything else
				// is pushed back and handled as ordinary text
				ch = mPage.getCharacter(mCursor);
				if (Page.EOF == ch)
					done = true;
				else if ('$' == ch) {
					ch = mPage.getCharacter(mCursor);
					if (Page.EOF == ch)
						done = true;
					// JIS X 0208-1978 and JIS X 0208-1983
					else if ('@' == ch || 'B' == ch)
						scanJIS(mCursor);
					/*
					 * // JIS X 0212-1990 else if ('(' == ch) { ch =
					 * mPage.getCharacter (mCursor); if (Page.EOF == ch) done =
					 * true; else if ('D' == ch) scanJIS (mCursor); else {
					 * mPage.ungetCharacter (mCursor); mPage.ungetCharacter
					 * (mCursor); mPage.ungetCharacter (mCursor); } }
					 */
					else {
						// not a recognised sequence: push back this character
						// and the '$'
						mPage.ungetCharacter(mCursor);
						mPage.ungetCharacter(mCursor);
					}
				} else
					mPage.ungetCharacter(mCursor);
			} else if (quotesmart && (0 == quote)
					&& (('\'' == ch) || ('"' == ch)))
				quote = ch; // enter quoted state
			// patch from Gernot Fricke to handle escaped closing quote
			else if (quotesmart && (0 != quote) && ('\\' == ch)) {
				ch = mPage.getCharacter(mCursor); // try to consume escape
				if ((Page.EOF != ch) && ('\\' != ch) // escaped backslash
						&& (ch != quote)) // escaped quote character
					// ( reflects ["] or ['] whichever opened the quotation)
					mPage.ungetCharacter(mCursor); // unconsume char if char not
													// an escape
			} else if (quotesmart && (ch == quote))
				quote = 0; // exit quoted state
			else if (quotesmart && (0 == quote) && (ch == '/')) {
				// handle multiline and double slash comments (with a quote)
				// in script like:
				// I can't handle single quotations.
				ch = mPage.getCharacter(mCursor);
				if (Page.EOF == ch)
					done = true;
				else if ('/' == ch) {
					// double slash comment: consume up to and including the
					// newline (or EOF)
					do
						ch = mPage.getCharacter(mCursor);
					while ((Page.EOF != ch) && ('\n' != ch));
				} else if ('*' == ch) {
					// multiline comment: consume until a '*' is directly
					// followed by '/' (or EOF)
					do {
						do
							ch = mPage.getCharacter(mCursor);
						while ((Page.EOF != ch) && ('*' != ch));
						ch = mPage.getCharacter(mCursor);
						// a second '*' may itself be the start of the closing
						// sequence, so push it back and test it again
						if (ch == '*')
							mPage.ungetCharacter(mCursor);
					} while ((Page.EOF != ch) && ('/' != ch));
				} else
					mPage.ungetCharacter(mCursor);
			} else if ((0 == quote) && ('<' == ch)) {
				// unquoted '<': peek at the next character to decide whether
				// this starts a tag
				ch = mPage.getCharacter(mCursor);
				if (Page.EOF == ch)
					done = true;
				// the order of these tests might be optimized for speed:
				else if ('/' == ch
						|| Character.isLetter(ch)
						|| '!' == ch || '%' == ch || '?' == ch) {
					// looks like a tag: stop here and push back both the
					// peeked character and the '<'
					done = true;
					mPage.ungetCharacter(mCursor);
					mPage.ungetCharacter(mCursor);
				} else {
					// it's not a tag, so keep going, but check for quotes
					mPage.ungetCharacter(mCursor);
				}
			}
		}

		return (makeString(start, mCursor.getPosition()));
	}

因此为了解决这个问题，现在要在上面做一个手脚：

首先在类中间增加了一个标记，script

这个标记在重写的nextNode方法中维护：在返回节点之前，判断刚解析出的标签是<script>还是</script>，并据此设置或清除标记

然后在parseString中修改其解析方法就可以了，下面是完整的代码：

import java.net.URLConnection;

import org.htmlparser.Node;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.lexer.Page;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.util.ParserException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A {@link Lexer} that tolerates unescaped {@code <} characters inside script
 * elements.
 * <p>
 * The stock {@code parseString} ends a text node as soon as it sees
 * {@code <} followed by a letter, which breaks script text such as
 * {@code for(i=0;i<a;i++)}. This subclass remembers whether the last
 * {@code SCRIPT} tag returned by {@link #nextNode(boolean)} was an opening
 * tag and, while that is the case, no longer treats {@code <} followed by a
 * letter as the start of a tag. {@code </}, {@code <!}, {@code <%} and
 * {@code <?} still end the text node, so the closing {@code </script>} is
 * found as before.
 *
 * @author edwardpro
 */
public class LexerFixed extends Lexer {
	private static final Logger logger = LoggerFactory
			.getLogger(LexerFixed.class);

	private static final long serialVersionUID = 8425806017089419815L;

	/**
	 * {@code true} while the most recent SCRIPT tag seen by
	 * {@link #nextNode(boolean)} was an opening tag, i.e. the lexer is
	 * currently positioned inside script content.
	 */
	private boolean inScript = false;

	/**
	 * Creates a lexer through the no-argument {@link Lexer} constructor.
	 */
	public LexerFixed() {
		super();
	}

	/**
	 * Creates a lexer reading from the given page.
	 *
	 * @param page
	 *            the page to tokenize.
	 */
	public LexerFixed(Page page) {
		super(page);
	}

	/**
	 * Creates a lexer reading from the given text.
	 *
	 * @param text
	 *            the markup to tokenize.
	 */
	public LexerFixed(String text) {
		super(text);
	}

	/**
	 * Creates a lexer reading from the given connection.
	 *
	 * @param connection
	 *            the connection to read the page from.
	 * @throws ParserException
	 *             if the superclass constructor fails.
	 */
	public LexerFixed(URLConnection connection) throws ParserException {
		super(connection);
	}

	/**
	 * Rewinds the lexer and forgets any script state left over from the
	 * previous pass; otherwise a document that ended inside an unclosed
	 * {@code <script>} would suppress tag detection on the next pass.
	 */
	@Override
	public void reset() {
		super.reset();
		inScript = false;
	}

	/**
	 * Returns the next node and updates the script state from it.
	 *
	 * @param quotesmart
	 *            passed through unchanged to the superclass.
	 * @return the node produced by the superclass, possibly {@code null}.
	 * @throws ParserException
	 *             if the superclass fails to produce a node.
	 */
	@Override
	public Node nextNode(boolean quotesmart) throws ParserException {
		Node ret = super.nextNode(quotesmart);
		checkTag(ret);
		return ret;
	}

	/**
	 * Updates {@link #inScript} when {@code node} is a SCRIPT tag: an opening
	 * tag switches script mode on, a closing tag switches it off. Nodes that
	 * are not tags, empty XML tags (as reported by
	 * {@code isEmptyXmlTag()}) and tags with any other name leave the state
	 * untouched.
	 *
	 * @param node
	 *            the node just produced by the lexer; may be {@code null}.
	 */
	private void checkTag(Node node) {
		// instanceof is false for null, so no separate null check is needed
		if (!(node instanceof TagNode)) {
			return;
		}
		TagNode tag = (TagNode) node;
		if (tag.isEmptyXmlTag()) {
			return;
		}
		if ("SCRIPT".equalsIgnoreCase(tag.getTagName())) {
			inScript = !tag.isEndTag();
			logger.debug("script section {}", inScript ? "entered" : "left");
		}
	}

	/**
	 * Same scanning loop as the inherited {@code parseString}, except that
	 * while {@link #inScript} is set a {@code <} followed by a letter is kept
	 * as text instead of ending the string.
	 *
	 * @param start
	 *            position handed to {@code makeString} as the beginning of the
	 *            text node.
	 * @param quotesmart
	 *            when {@code true}, quoted sections and script comments are
	 *            tracked so that a {@code <} inside them is not examined.
	 * @return the node produced by {@code makeString} for the range from
	 *         {@code start} to the final cursor position.
	 * @throws ParserException
	 *             declared by the overridden signature.
	 */
	@Override
	protected Node parseString(int start, boolean quotesmart)
			throws ParserException {
		boolean done;
		char ch;
		// the quote character that opened the current quoted section, or 0
		char quote;

		done = false;
		quote = 0;
		while (!done) {
			ch = mPage.getCharacter(mCursor);
			if (Page.EOF == ch)
				done = true;
			else if (0x1b == ch) // escape
			{
				ch = mPage.getCharacter(mCursor);
				if (Page.EOF == ch)
					done = true;
				else if ('$' == ch) {
					ch = mPage.getCharacter(mCursor);
					if (Page.EOF == ch)
						done = true;
					// JIS X 0208-1978 and JIS X 0208-1983
					else if ('@' == ch || 'B' == ch)
						scanJIS(mCursor);
					else {
						// not a recognised sequence: push back this character
						// and the '$'
						mPage.ungetCharacter(mCursor);
						mPage.ungetCharacter(mCursor);
					}
				} else
					mPage.ungetCharacter(mCursor);
			} else if (quotesmart && (0 == quote)
					&& (('\'' == ch) || ('"' == ch)))
				quote = ch; // enter quoted state
			// patch from Gernot Fricke to handle escaped closing quote
			else if (quotesmart && (0 != quote) && ('\\' == ch)) {
				ch = mPage.getCharacter(mCursor); // try to consume escape
				if ((Page.EOF != ch) && ('\\' != ch) // escaped backslash
						&& (ch != quote)) // escaped quote character
					// not an escape after all: unconsume the character
					mPage.ungetCharacter(mCursor);
			} else if (quotesmart && (ch == quote))
				quote = 0; // exit quoted state
			else if (quotesmart && (0 == quote) && (ch == '/')) {
				// skip double slash and multiline comments so that quotes
				// inside them do not start a quoted section
				ch = mPage.getCharacter(mCursor);
				if (Page.EOF == ch)
					done = true;
				else if ('/' == ch) {
					// consume up to and including the newline (or EOF)
					do
						ch = mPage.getCharacter(mCursor);
					while ((Page.EOF != ch) && ('\n' != ch));
				} else if ('*' == ch) {
					// consume until '*' is directly followed by '/' (or EOF)
					do {
						do
							ch = mPage.getCharacter(mCursor);
						while ((Page.EOF != ch) && ('*' != ch));
						ch = mPage.getCharacter(mCursor);
						// a second '*' may start the closing sequence itself
						if (ch == '*')
							mPage.ungetCharacter(mCursor);
					} while ((Page.EOF != ch) && ('/' != ch));
				} else
					mPage.ungetCharacter(mCursor);
			} else if ((0 == quote) && ('<' == ch)) {
				ch = mPage.getCharacter(mCursor);
				if (Page.EOF == ch)
					done = true;
				// Inside a script a '<' followed by a letter is almost
				// certainly a comparison (i<a), not a tag, so only the other
				// tag introducers end the string there.
				else if ('/' == ch
						|| (Character.isLetter(ch) && !inScript)
						|| '!' == ch || '%' == ch || '?' == ch) {
					// looks like a tag: stop and push back both the peeked
					// character and the '<'
					done = true;
					mPage.ungetCharacter(mCursor);
					mPage.ungetCharacter(mCursor);
				} else {
					// it's not a tag, so keep going, but check for quotes
					mPage.ungetCharacter(mCursor);
				}
			}
		}

		return makeString(start, mCursor.getPosition());
	}
}

分享到：

thrift-1-2-3 | 在linux下删除大文件的好方法

2011-09-22 11:07
浏览 1250
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

lexer html解析一个js过滤的改进

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

lexer html解析一个js过滤的改进

评论

发表评论

相关推荐

Thread In Ruby

thrift-1-2-3

在linux下删除大文件的好方法

thrift 在ubuntu 11.04下的编译过程一两点心得

StringBuilder和String中的subString方法的细微差别

用枚举来作为配置

装箱操作需注意

在JTIDY中对于inline标签的这行问题

JAVA7的不兼容表现

用SPRING AOP实现主动缓存

gson使用感受

数据对象化的思考

用json作为配置存储介质的讨论

JAVA学习笔记之泛型接口

lucene中的filter器群组及其缓存大盘点

lucene的前端集群思路

lunece 用的高亮类

自己写的lucene的高亮类

使用form dom要注意的小问题

ubuntu 7.04 下配置 ruby环境

最近访客更多访客>>