使用Java不借助框架直接抽取rtf文本
2012-06-26 15:01:13 来源:WEB开发网 核心提示:如下代码直接基于字符对rtf文件进行文本抽取。在网上找了一些例子,大多都需要依赖于第三方的包,或者用swing里面的api抽取。
如下代码直接基于字符对rtf文件进行文本抽取。在网上找了一些例子,大多都需要依赖于第三方的包,或者用swing里面的api抽取;这里的实现不借助任何框架:
public class RTFExtractor { private static Map<String, String> include = new HashMap<String, String>(); static { include.put("par", " "); include.put("bullet", "?"); include.put("emdash", "—"); include.put("emspace", "?"); include.put("endash", "–"); include.put("enspace", "?"); include.put("ldblquote", "“"); include.put("lquote", "‘"); include.put("ltrmark", "?"); include.put("rdblquote", "”"); include.put("rquote", "’"); include.put("rtlmark", "?"); include.put("tab", " "); include.put("zwj", "?"); include.put("zwnj", "?"); } public static boolean isRTFFormat(byte[] ogiBytes) { if(ogiBytes == null || ogiBytes.length < 8) { return false; } if( (ogiBytes[0] & 0xFF) == 0x7B || (ogiBytes[1] & 0xFF) == 0x5C || (ogiBytes[2] & 0xFF) == 0x72 || (ogiBytes[3] & 0xFF) == 0x74 || (ogiBytes[4] & 0xFF) == 0x66 || (ogiBytes[5] & 0xFF) == 0x31 || (ogiBytes[6] & 0xFF) == 0x5C || (ogiBytes[7] & 0xFF) == 0x61 ){ return true; } short lines = 0; for(int i = 0; i < ogiBytes.length; i++) { if(ogiBytes[i] != 0x0A && ogiBytes[i] != 0x0D) { continue; } if(lines > 2) { break; } if(i+1+8 >= ogiBytes.length) { break; } if( (ogiBytes[i+1] & 0xFF) == 0x7B || (ogiBytes[i+2] & 0xFF) == 0x5C || (ogiBytes[i+3] & 0xFF) == 0x72 || (ogiBytes[i+4] & 0xFF) == 0x74 || (ogiBytes[i+5] & 0xFF) == 0x66 || (ogiBytes[i+6] & 0xFF) == 0x31 || (ogiBytes[i+7] & 0xFF) == 0x5C || (ogiBytes[i+8] & 0xFF) == 0x61 ){ return true; } lines++; } return false; } public static String extract(String file, int[] err, int read) { String content = file; if(read == 0) { try { content = FileUtil.readAscFile(file); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } System.out.println("Total length: "+ content.length()); if(!isRTFFormat(content.getBytes())) { System.out.println("Not the rtf file!"); err[0] = ExtractorUtil.NOT_RTF_FORMAT; return ""; } int line1 = content.indexOf("}}"); if(line1 > -1) { int line2 = content.indexOf("}}", line1+2); if(line2 > -1) { content = content.substring(line2+2); 
} else { content = content.substring(line1+2); } } content = content.replaceAll("\\{\\\\\\*[^\\}]*?\\}", ""); content = content.replaceAll("\n|\r", ""); StringBuilder builder = new StringBuilder(); String[] buffers = content.split("\\\\"); for(int i=0;i<buffers.length;i++) { String buffer = buffers[i].trim(); String value = include.get(buffer); if(value != null) { builder.append(value); continue; } if(buffer.startsWith("'")) { if(i == buffers.length - 1) { break; } String a = buffer.replaceAll("\\}\\{", ""); String b = buffers[i+1].replaceAll("\\}\\{", ""); if(!b.startsWith("'")) { continue; } if(a.length() < 3 || b.length() < 3) { continue; } if(a.length() > 3) { builder.append( a.substring(3) ); a = a.substring(0, 3); } if(b.length() > 3) { builder.append( b.substring(3) ); b = b.substring(0, 3); } a = a.replace("'", ""); b = b.replace("'", ""); if(a.length() != 2 || a.replaceAll("[A-Fa-f0-9]", "").length() != 0) { continue; } if(b.length() != 2 || b.replaceAll("[A-Fa-f0-9]", "").length() != 0) { continue; } int ch = Integer.valueOf(b+a, 16); byte[] temp = new byte[2]; temp[0] = (byte) ch; temp[1] = (byte) (ch >> 8); builder.append( new String(temp) ); i++; continue; } int spaceOffset = buffer.indexOf(" "); if(spaceOffset > -1) { String rest = buffer.substring(spaceOffset+1); if(buffer.startsWith("f0 ")) { builder.append( rest.replaceAll("\\}\\{", "") ); continue; } if("{".equals(rest) || "}".equals(rest) || rest.indexOf("}{") > -1 || rest.indexOf("}}") > -1) { continue; } if(rest.startsWith("{")) { builder.append( rest.substring(1) ); continue; } if(rest.endsWith("}")) { builder.append( rest.substring(0, rest.length()-1) ); continue; } builder.append( rest ); continue; } } err[0] = ExtractorUtil.EXTRACTING_DONE; return builder.toString(); } public static void main(String[] args) throws Exception { String text = RTFExtractor.extract("E:\en.rtf", new int[1], 0); FileUtil.writeAscFile("E:\output.txt", text, false); System.out.println("Done!"); } }
赞助商链接