WEB开发网
开发学院软件开发Java 使用Java不借助框架直接抽取rtf文本 阅读

使用Java不借助框架直接抽取rtf文本

 2012-06-26 15:01:13 来源:WEB开发网   
核心提示: 如下代码直接基于字符对rtf文件进行文本抽取,不借助任何框架。网上找到的例子大多需要依赖第三方的包,或者用swing里面的api抽取。

 如下代码直接基于字符对rtf文件进行文本抽取。网上找了一些例子,大多都需要依赖于第三方的包,或者用swing里面的api抽取:

public class RTFExtractor
{
   
    private static Map<String, String> include = new HashMap<String, String>();
    static
    {
        include.put("par",             " ");
        include.put("bullet",     "?");
        include.put("emdash",     "—");
        include.put("emspace",    "?");
        include.put("endash",     "–");
        include.put("enspace",    "?");
        include.put("ldblquote",  "“");
        include.put("lquote",     "‘");
        include.put("ltrmark",    "?");
        include.put("rdblquote",  "”");
        include.put("rquote",     "’");
        include.put("rtlmark",    "?");
        include.put("tab",        " ");
        include.put("zwj",        "?");
        include.put("zwnj",       "?");
    }
   
    public static boolean isRTFFormat(byte[] ogiBytes)
    {
        if(ogiBytes == null || ogiBytes.length < 8)
        {
            return false;
        }
        
        if(
            (ogiBytes[0] & 0xFF) == 0x7B ||
            (ogiBytes[1] & 0xFF) == 0x5C ||
            (ogiBytes[2] & 0xFF) == 0x72 ||
            (ogiBytes[3] & 0xFF) == 0x74 ||
            (ogiBytes[4] & 0xFF) == 0x66 ||
            (ogiBytes[5] & 0xFF) == 0x31 ||
            (ogiBytes[6] & 0xFF) == 0x5C ||
            (ogiBytes[7] & 0xFF) == 0x61
        ){
            return true;
        }
        
        short lines = 0;
        for(int i = 0; i < ogiBytes.length; i++)
        {
            if(ogiBytes[i] != 0x0A && ogiBytes[i] != 0x0D)
            {
                continue;
            }
            
            if(lines > 2)
            {
                break;
            }
            
            if(i+1+8 >= ogiBytes.length)
            {
                break;
            }
            
            if(
                (ogiBytes[i+1] & 0xFF) == 0x7B ||
                (ogiBytes[i+2] & 0xFF) == 0x5C ||
                (ogiBytes[i+3] & 0xFF) == 0x72 ||
                (ogiBytes[i+4] & 0xFF) == 0x74 ||
                (ogiBytes[i+5] & 0xFF) == 0x66 ||
                (ogiBytes[i+6] & 0xFF) == 0x31 ||
                (ogiBytes[i+7] & 0xFF) == 0x5C ||
                (ogiBytes[i+8] & 0xFF) == 0x61
            ){
                return true;
            }
            
            lines++;
        }
        
        return false;
    }
   
    public static String extract(String file, int[] err, int read)
    {
        String content = file;
        if(read == 0)
        {
            try {
                content = FileUtil.readAscFile(file);
            } catch (Exception e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        }
        System.out.println("Total length: "+ content.length());
        
        if(!isRTFFormat(content.getBytes()))
        {
            System.out.println("Not the rtf file!");
            err[0] = ExtractorUtil.NOT_RTF_FORMAT;
            return "";
        }
        
        int line1 = content.indexOf("}}");
        if(line1 > -1)
        {
            int line2 = content.indexOf("}}", line1+2);
            if(line2 > -1)
            {
                content = content.substring(line2+2);
            }
            else
            {
                content = content.substring(line1+2);
            }
        }

        content = content.replaceAll("\\{\\\\\\*[^\\}]*?\\}", "");
        content = content.replaceAll("\n|\r", "");
        
        StringBuilder builder = new StringBuilder();
        String[] buffers = content.split("\\\\");
        for(int i=0;i<buffers.length;i++)
        {
            String buffer = buffers[i].trim();
            String value = include.get(buffer);
            if(value != null)
            {
                builder.append(value);
                continue;
            }
            
            if(buffer.startsWith("'"))
            {
                if(i == buffers.length - 1)
                {
                    break;
                }
                
                String a = buffer.replaceAll("\\}\\{", "");
                String b = buffers[i+1].replaceAll("\\}\\{", "");
                if(!b.startsWith("'"))
                {
                    continue;
                }
                
                if(a.length() < 3 || b.length() < 3)
                {
                    continue;
                }
                
                if(a.length() > 3)
                {
                    builder.append( a.substring(3) );
                    a = a.substring(0, 3);
                }
                
                if(b.length() > 3)
                {
                    builder.append( b.substring(3) );
                    b = b.substring(0, 3);
                }
                
                a = a.replace("'", "");
                b = b.replace("'", "");
                if(a.length() != 2 || a.replaceAll("[A-Fa-f0-9]", "").length() != 0)
                {
                    continue;
                }
                
                if(b.length() != 2 || b.replaceAll("[A-Fa-f0-9]", "").length() != 0)
                {
                    continue;
                }
                
                int ch = Integer.valueOf(b+a, 16);
                byte[] temp = new byte[2];
                temp[0] = (byte) ch;
                temp[1] = (byte) (ch >> 8);
                builder.append( new String(temp) );

                i++;
                continue;
            }
            
            int spaceOffset = buffer.indexOf(" ");
            if(spaceOffset > -1)
            {
                String rest = buffer.substring(spaceOffset+1);
                if(buffer.startsWith("f0 "))
                {
                    builder.append( rest.replaceAll("\\}\\{", "") );
                    continue;
                }
                
                if("{".equals(rest) || "}".equals(rest) || rest.indexOf("}{") > -1 || rest.indexOf("}}") > -1)
                {
                    continue;
                }
                
                if(rest.startsWith("{"))
                {
                    builder.append( rest.substring(1) );
                    continue;
                }
                
                if(rest.endsWith("}"))
                {
                    builder.append( rest.substring(0, rest.length()-1) );
                    continue;
                }
                
                builder.append( rest );
                continue;
            }
            
        }
        
        err[0] = ExtractorUtil.EXTRACTING_DONE;
        return builder.toString();
    }
   
    public static void main(String[] args) throws Exception
    {
        String text = RTFExtractor.extract("E:\en.rtf", new int[1], 0);
        FileUtil.writeAscFile("E:\output.txt", text, false);
       
        System.out.println("Done!");
    }
}

Tags:使用 Java 借助

编辑录入:爽爽 [复制链接] [打 印]
赞助商链接