htmalTOtext

Embed Size (px)

Citation preview

  • 8/3/2019 htmalTOtext

    1/5

    Code :import java.io.File;import java.io.FileInputStream;

    import java.io.Reader;import java.io.StringReader;

    import java.io.IOException;

    /**
     * Convert text/html into text/plain
     *
     * Author: Omindra Kumar Rana
     * Email: [email protected]
     *
     * @version 1.0 $Date: May 10, 2005 $
     */

    public class HTML2Text

    { boolean body_found = false;boolean in_body = false;boolean center = false;boolean pre = false;String href = "";

    public String convert(String source) throws Exception{

    StringBuffer result = new StringBuffer();StringBuffer result2 = new StringBuffer();StringReader input = new StringReader(source);

    try{String text = null;int c = input.read();

    while (c != -1) // Convert until EOF{text = "";if (c == '

  • 8/3/2019 htmalTOtext

    2/5

    else if (specialchar.equals("copy;") specialchar.equals("#169"))

    text = "[Copyright]";else if (specialchar.equals("reg;") specialchar.equals("#174"

    ))text = "[Registered]";else if (specialchar.equals("trade;") specialchar.equals("#15

    3"))text = "[Trademark]";elsetext = "&" + specialchar;}else if (!pre && Character.isWhitespace((char)c)){StringBuffer s = in_body ? result : result2;if (s.length() > 0 && Character.isWhitespace(s.charAt(s.length()

    -1)))text = "";

    else text = " ";

    }else{text = "" + (char)c;}

    StringBuffer s = in_body ? result : result2;s.append(text);

    c = input.read();}}catch (Exception e)

    {input.close();throw e;}

    StringBuffer s = body_found ? result : result2;return s.toString().trim();

    }

    String getTag(Reader r) throws IOException{

    StringBuffer result = new StringBuffer();int level = 1;

    result.append('

  • 8/3/2019 htmalTOtext

    3/5

    StringBuffer result = new StringBuffer();r.mark(1);//Mark the present position in the streamint c = r.read();

    while (Character.isLetter((char)c)){result.append((char)c);

    r.mark(1);c = r.read();}

    if (c == ';') result.append(';');else r.reset();

    return result.toString();}

    boolean isTag(String s1, String s2){

    s1 = s1.toLowerCase();String t1 = "";String t2 = "

  • 8/3/2019 htmalTOtext

    4/5

    isTag(t,"/h3") isTag(t,"/h4") isTag(t,"/h5") isTag(t,"/h6") isTag(t,"/h7"))

    result = "";

    else if (isTag(t,"/dl"))result = "

    ";

    else if (isTag(t,"dd"))result = "

    * ";else if (isTag(t,"dt"))result = " ";else if (isTag(t,"li"))result = "

    * ";else if (isTag(t,"/ul"))result = "

    ";else if (isTag(t,"/ol"))

    result = "";else if (isTag(t,"hr"))result = "_________________________________________

    ";else if (isTag(t,"table"))result = "

    ";else if (isTag(t,"/table"))result = "

    ";else if (isTag(t,"form"))

    result = "";

    else if (isTag(t,"/form"))result = "

    ";else if (isTag(t,"b"))result = "*";else if (isTag(t,"/b"))result = "*";else if (isTag(t,"i"))result = """;else if (isTag(t,"/i"))result = """;else if (isTag(t,"img")){int idx = t.indexOf("alt="");if (idx != -1){idx += 5;int idx2 = t.indexOf(""",idx);result = t.substring(idx,idx2);}}else if (isTag(t,"a")){

    int idx = t.indexOf("href="");if (idx != -1){

  • 8/3/2019 htmalTOtext

    5/5

    idx += 6;int idx2 = t.indexOf(""",idx);href = t.substring(idx,idx2);}else{href = "";

    }}else if (isTag(t,"/a")){if (href.length() > 0){result = " [ " + href + " ]";href = "";}}

    return result;

    }

    /**
     * Command-line entry point: reads an HTML file, converts it to plain text
     * with {@link #convert(String)} and prints the result to standard output.
     *
     * @param argv optional; {@code argv[0]} is the path of the HTML file to
     *             convert. When no argument is given, "html_test_file.html"
     *             (resolved against the current directory) is used instead.
     * @throws Exception if the file cannot be opened or read, or if the
     *                   conversion itself fails
     */
    public static void main(String argv[]) throws Exception {
        // Test the array length before touching argv[0]: indexing an empty
        // array throws ArrayIndexOutOfBoundsException, which previously made
        // the default file name unreachable.
        File file = (argv.length > 0 && argv[0] != null)
                ? new File(argv[0])
                : new File("html_test_file.html");

        // Fully qualified so that no additional import line is required.
        java.io.ByteArrayOutputStream contents = new java.io.ByteArrayOutputStream();
        FileInputStream fis = new FileInputStream(file);
        try {
            // available() is only an estimate of the bytes readable without
            // blocking, and a single read() may return fewer bytes than
            // requested, so keep reading until end of stream.
            byte[] chunk = new byte[8192];
            int n;
            while ((n = fis.read(chunk)) != -1) {
                contents.write(chunk, 0, n);
            }
        } finally {
            // Always release the file handle, on success and on failure.
            fis.close();
        }

        // Decoded with the platform default charset, as the original did.
        String s = new String(contents.toByteArray());
        HTML2Text h = new HTML2Text();
        System.out.println(h.convert(s));
    }
    } // end of class HTML2Text