是想用JavaScript来取得还是用Java来取得?
Java的HTML2Text代码,你可以改改就可以用了
/*
* HTML to Text Converter. This utility converts HTML markup to plain text.
*/
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
/**
 * Converts text/html into text/plain.
 *
 * <p>The converter is a small hand-written scanner: it reads the input one
 * character at a time, replaces a handful of known tags and character
 * references with plain-text equivalents, drops every other tag, and
 * collapses runs of whitespace to a single space (except inside
 * {@code <pre>}). Text inside {@code <body>} is collected separately from
 * text outside it; if a {@code <body>} tag was seen, only the body text is
 * returned.
 *
 * <p>An instance keeps parser state in its fields while converting, so a
 * single instance must not be used by several threads at once. It may be
 * reused for consecutive conversions: {@link #convert(String)} resets the
 * state on entry.
 *
 * <p>Author: Omindra Kumar Rana, rana_omindra@yahoo.co.in
 *
 * @version 1.0 $Date: May 10, 2005 $
 */
public class HTML2Text
{
    /** True once a {@code <body>} tag has been seen in the current input. */
    boolean body_found = false;

    /** True while the scanner is between {@code <body>} and {@code </body>}. */
    boolean in_body = false;

    /** True while inside {@code <center>}; recorded but not used for output. */
    boolean center = false;

    /** True while inside {@code <pre>}; disables whitespace collapsing. */
    boolean pre = false;

    /** Target of the most recent {@code <a href="...">}, emitted at {@code </a>}. */
    String href = "";

    /**
     * Converts an HTML string to plain text.
     *
     * @param source the HTML markup to convert; must not be {@code null}
     * @return the extracted text with leading and trailing whitespace trimmed;
     *         the text inside {@code <body>} if such a tag was present,
     *         otherwise all text found
     * @throws Exception if reading from the underlying reader fails
     */
    public String convert(String source) throws Exception
    {
        // Start from a clean state so a previous conversion on this instance
        // cannot leak its body/pre/href flags into this one.
        body_found = false;
        in_body = false;
        center = false;
        pre = false;
        href = "";

        StringBuilder bodyText = new StringBuilder();
        StringBuilder looseText = new StringBuilder();
        StringReader input = new StringReader(source);
        try
        {
            for (int c = input.read(); c != -1; c = input.read())
            {
                String text;
                if (c == '<')
                {
                    // May flip in_body/pre, so the target buffer is chosen
                    // only after the tag has been processed.
                    text = convertTag(getTag(input));
                }
                else if (c == '&')
                {
                    text = decodeEntity(getSpecial(input));
                }
                else if (!pre && Character.isWhitespace((char) c))
                {
                    StringBuilder target = in_body ? bodyText : looseText;
                    int len = target.length();
                    boolean afterWhitespace =
                        len > 0 && Character.isWhitespace(target.charAt(len - 1));
                    text = afterWhitespace ? "" : " ";
                }
                else
                {
                    text = String.valueOf((char) c);
                }
                (in_body ? bodyText : looseText).append(text);
            }
        }
        finally
        {
            input.close();
        }
        return (body_found ? bodyText : looseText).toString().trim();
    }

    /**
     * Reads the remainder of a tag whose opening {@code '<'} has already been
     * consumed. Nested {@code '<'} characters raise the nesting level, so the
     * tag ends at the {@code '>'} that balances the first {@code '<'}.
     *
     * @param r the reader positioned just after the opening {@code '<'}
     * @return the tag text including the leading {@code '<'} and, unless the
     *         input ended first, the closing {@code '>'}
     * @throws IOException if reading fails
     */
    String getTag(Reader r) throws IOException
    {
        StringBuilder tag = new StringBuilder("<");
        int depth = 1;
        while (depth > 0)
        {
            int c = r.read();
            if (c == -1)
            {
                break; // EOF inside a tag: return what we have
            }
            tag.append((char) c);
            if (c == '<')
            {
                depth++;
            }
            else if (c == '>')
            {
                depth--;
            }
        }
        return tag.toString();
    }

    /**
     * Reads the name of a character reference whose {@code '&'} has already
     * been consumed. Accepts letters and digits, with an optional leading
     * {@code '#'} for numeric references. A terminating {@code ';'} is
     * consumed and included in the result; any other terminating character is
     * pushed back by resetting the reader to the last mark.
     *
     * @param r the reader positioned just after the {@code '&'}; it must
     *          support {@code mark}/{@code reset}
     * @return the reference name, e.g. {@code "lt;"}, {@code "#60;"} or
     *         {@code "lt"} when no semicolon followed; possibly empty
     * @throws IOException if reading, marking or resetting fails
     */
    String getSpecial(Reader r) throws IOException
    {
        StringBuilder name = new StringBuilder();
        r.mark(1); // remember the position so the terminator can be un-read
        int c = r.read();
        while (c != -1
            && (Character.isLetterOrDigit((char) c) || (c == '#' && name.length() == 0)))
        {
            name.append((char) c);
            r.mark(1);
            c = r.read();
        }
        if (c == ';')
        {
            name.append(';');
        }
        else
        {
            r.reset();
        }
        return name.toString();
    }

    /**
     * Tells whether a raw tag is an occurrence of the given element name.
     * The comparison ignores case and accepts the name followed by
     * {@code '>'}, whitespace, or {@code '/'} (self-closing form).
     *
     * @param s1 the raw tag text, starting with {@code '<'}
     * @param s2 the element name to test for, e.g. {@code "br"} or {@code "/a"}
     * @return {@code true} if {@code s1} is a tag named {@code s2}
     */
    boolean isTag(String s1,String s2)
    {
        String prefix = "<" + s2;
        int end = prefix.length();
        // regionMatches(ignoreCase) is locale-independent, unlike toLowerCase().
        if (s1.length() <= end || !s1.regionMatches(true, 0, prefix, 0, end))
        {
            return false;
        }
        char next = s1.charAt(end);
        return next == '>' || next == '/' || Character.isWhitespace(next);
    }

    /**
     * Produces the plain-text replacement for one tag and updates the parser
     * state ({@code in_body}, {@code body_found}, {@code center}, {@code pre},
     * {@code href}) accordingly.
     *
     * <p>Tags not listed here, including block-level tags such as
     * {@code p}, {@code br}, headings, {@code table} and {@code form},
     * contribute no text.
     *
     * @param t the raw tag text as returned by {@link #getTag(Reader)}
     * @return the text to emit in place of the tag; never {@code null}
     * @throws IOException declared for interface compatibility
     */
    String convertTag(String t) throws IOException
    {
        // --- tags that only change parser state ---
        if (isTag(t, "body"))
        {
            in_body = true;
            body_found = true;
            return "";
        }
        if (isTag(t, "/body"))
        {
            in_body = false;
            return "";
        }
        if (isTag(t, "center"))
        {
            center = true;
            return "";
        }
        if (isTag(t, "/center"))
        {
            center = false;
            return "";
        }
        if (isTag(t, "pre"))
        {
            pre = true;
            return "";
        }
        if (isTag(t, "/pre"))
        {
            pre = false;
            return "";
        }

        // --- tags replaced by fixed text ---
        if (isTag(t, "dd") || isTag(t, "li"))
        {
            return " * ";
        }
        if (isTag(t, "dt"))
        {
            return " ";
        }
        if (isTag(t, "hr"))
        {
            return "_________________________________________";
        }
        if (isTag(t, "b") || isTag(t, "/b"))
        {
            return "*";
        }
        if (isTag(t, "i") || isTag(t, "/i"))
        {
            return "\"";
        }

        // --- tags whose attributes carry the text ---
        if (isTag(t, "img"))
        {
            String alt = attributeValue(t, "alt");
            return alt != null ? alt : "";
        }
        if (isTag(t, "a"))
        {
            String target = attributeValue(t, "href");
            href = target != null ? target : "";
            return "";
        }
        if (isTag(t, "/a"))
        {
            if (href.length() > 0)
            {
                String link = " [ " + href + " ]";
                href = "";
                return link;
            }
            return "";
        }

        // Everything else is dropped.
        return "";
    }

    /**
     * Maps a character reference name to its plain-text replacement.
     * Named references must be terminated by {@code ';'}; the known numeric
     * references are accepted with or without it.
     *
     * @param entity the name as returned by {@link #getSpecial(Reader)}
     * @return the replacement text, or {@code "&" + entity} when the
     *         reference is not recognised
     */
    private static String decodeEntity(String entity)
    {
        boolean terminated = entity.endsWith(";");
        String name = terminated ? entity.substring(0, entity.length() - 1) : entity;

        if (name.startsWith("#"))
        {
            if (name.equals("#60")) return "<";
            if (name.equals("#62")) return ">";
            if (name.equals("#38")) return "&";
            if (name.equals("#34")) return "\"";
            if (name.equals("#160")) return " ";
            if (name.equals("#169")) return "[Copyright]";
            if (name.equals("#174")) return "[Registered]";
            // #153 is the Windows-1252 code still seen in old pages; #8482 is
            // the Unicode code point of the trade mark sign.
            if (name.equals("#153") || name.equals("#8482")) return "[Trademark]";
        }
        else if (terminated)
        {
            if (name.equals("lt")) return "<";
            if (name.equals("gt")) return ">";
            if (name.equals("amp")) return "&";
            if (name.equals("nbsp")) return " ";
            if (name.equals("quot")) return "\"";
            if (name.equals("copy")) return "[Copyright]";
            if (name.equals("reg")) return "[Registered]";
            if (name.equals("trade")) return "[Trademark]";
        }
        return "&" + entity;
    }

    /**
     * Extracts the value of a quoted attribute from a raw tag. The attribute
     * name is matched case-insensitively and must be preceded by whitespace;
     * the value may be enclosed in double or single quotes.
     *
     * @param tag  the raw tag text
     * @param name the attribute name, e.g. {@code "href"}
     * @return the attribute value without quotes, or {@code null} if the
     *         attribute is absent, unquoted, or its closing quote is missing
     */
    private static String attributeValue(String tag, String name)
    {
        String key = name + "=";
        int last = tag.length() - key.length();
        for (int i = 1; i <= last; i++)
        {
            if (!Character.isWhitespace(tag.charAt(i - 1))
                || !tag.regionMatches(true, i, key, 0, key.length()))
            {
                continue;
            }
            int open = i + key.length();
            if (open >= tag.length())
            {
                return null;
            }
            char quote = tag.charAt(open);
            if (quote != '"' && quote != '\'')
            {
                return null;
            }
            int close = tag.indexOf(quote, open + 1);
            return close == -1 ? null : tag.substring(open + 1, close);
        }
        return null;
    }

    /**
     * Command-line entry point: converts an HTML file and prints the text to
     * standard output.
     *
     * @param argv optional; {@code argv[0]} is the path of the HTML file.
     *             When absent, {@code Test.html} in the working directory is
     *             used.
     * @throws Exception if the file cannot be read or conversion fails
     */
    public static void main(String argv[]) throws Exception
    {
        File file = new File(argv.length > 0 ? argv[0] : "Test.html");
        byte[] buf = new byte[(int) file.length()];
        int filled = 0;
        FileInputStream fis = new FileInputStream(file);
        try
        {
            // A single read() may return fewer bytes than requested, so keep
            // reading until the buffer is full or the stream ends.
            while (filled < buf.length)
            {
                int n = fis.read(buf, filled, buf.length - filled);
                if (n == -1)
                {
                    break;
                }
                filled += n;
            }
        }
        finally
        {
            fis.close();
        }
        // Decoded with the platform default charset, as before.
        String s = new String(buf, 0, filled);
        HTML2Text h = new HTML2Text();
        System.out.println(h.convert(s));
    }
}
JavaScript的代码片断如下:
var vTable = document.all.tableID;
//alert(vTable);
var elementsTable = vTable.childNodes;
//alert(elementsTable.length);
for(i = 0;i < elementsTable.length;i++)
{
var elementsTR = elementsTable.item(i);
var htmlValue = elementsTR.innerHTML;
if(htmlValue != null && htmlValue != "")
{
//alert(elementsTR.innerHTML);
var elementsTR = elementsTR.childNodes;
//alert(elementsTR.length);
for(j = 0;j < elementsTR.length;j++)
{
var elementsTD = elementsTR.item(j);
//alert(elementsTD.innerHTML);
var ele = elementsTD.childNodes;
//alert(elementsTR.length);
for(k = 0;k < ele.length;k++)
{
var e1 = ele.item(k);
.......