使用Java不借助框架直接抽取rtf文本
2012-06-26 15:01:13 来源:WEB开发网核心提示: 如下代码是直接基于字符对rtf文件进行文本抽取,支网上找了一些例子,使用Java不借助框架直接抽取rtf文本,大多都需要依赖于第三方的包或者用swing里面的api抽取,public class RTFExtractor{ private static Map<String, String>
如下代码直接基于字符对RTF文件进行文本抽取。在网上找了一些例子,大多都需要依赖第三方的包,或者使用Swing里面的API来抽取,而下面的实现不借助任何框架:
public class RTFExtractor
{
private static Map<String, String> include = new HashMap<String, String>();
static
{
include.put("par", " ");
include.put("bullet", "?");
include.put("emdash", "—");
include.put("emspace", "?");
include.put("endash", "–");
include.put("enspace", "?");
include.put("ldblquote", "“");
include.put("lquote", "‘");
include.put("ltrmark", "?");
include.put("rdblquote", "”");
include.put("rquote", "’");
include.put("rtlmark", "?");
include.put("tab", " ");
include.put("zwj", "?");
include.put("zwnj", "?");
}
public static boolean isRTFFormat(byte[] ogiBytes)
{
if(ogiBytes == null || ogiBytes.length < 8)
{
return false;
}
if(
(ogiBytes[0] & 0xFF) == 0x7B ||
(ogiBytes[1] & 0xFF) == 0x5C ||
(ogiBytes[2] & 0xFF) == 0x72 ||
(ogiBytes[3] & 0xFF) == 0x74 ||
(ogiBytes[4] & 0xFF) == 0x66 ||
(ogiBytes[5] & 0xFF) == 0x31 ||
(ogiBytes[6] & 0xFF) == 0x5C ||
(ogiBytes[7] & 0xFF) == 0x61
){
return true;
}
short lines = 0;
for(int i = 0; i < ogiBytes.length; i++)
{
if(ogiBytes[i] != 0x0A && ogiBytes[i] != 0x0D)
{
continue;
}
if(lines > 2)
{
break;
}
if(i+1+8 >= ogiBytes.length)
{
break;
}
if(
(ogiBytes[i+1] & 0xFF) == 0x7B ||
(ogiBytes[i+2] & 0xFF) == 0x5C ||
(ogiBytes[i+3] & 0xFF) == 0x72 ||
(ogiBytes[i+4] & 0xFF) == 0x74 ||
(ogiBytes[i+5] & 0xFF) == 0x66 ||
(ogiBytes[i+6] & 0xFF) == 0x31 ||
(ogiBytes[i+7] & 0xFF) == 0x5C ||
(ogiBytes[i+8] & 0xFF) == 0x61
){
return true;
}
lines++;
}
return false;
}
public static String extract(String file, int[] err, int read)
{
String content = file;
if(read == 0)
{
try {
content = FileUtil.readAscFile(file);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
System.out.println("Total length: "+ content.length());
if(!isRTFFormat(content.getBytes()))
{
System.out.println("Not the rtf file!");
err[0] = ExtractorUtil.NOT_RTF_FORMAT;
return "";
}
int line1 = content.indexOf("}}");
if(line1 > -1)
{
int line2 = content.indexOf("}}", line1+2);
if(line2 > -1)
{
content = content.substring(line2+2);
}
else
{
content = content.substring(line1+2);
}
}
content = content.replaceAll("\\{\\\\\\*[^\\}]*?\\}", "");
content = content.replaceAll("\n|\r", "");
StringBuilder builder = new StringBuilder();
String[] buffers = content.split("\\\\");
for(int i=0;i<buffers.length;i++)
{
String buffer = buffers[i].trim();
String value = include.get(buffer);
if(value != null)
{
builder.append(value);
continue;
}
if(buffer.startsWith("'"))
{
if(i == buffers.length - 1)
{
break;
}
String a = buffer.replaceAll("\\}\\{", "");
String b = buffers[i+1].replaceAll("\\}\\{", "");
if(!b.startsWith("'"))
{
continue;
}
if(a.length() < 3 || b.length() < 3)
{
continue;
}
if(a.length() > 3)
{
builder.append( a.substring(3) );
a = a.substring(0, 3);
}
if(b.length() > 3)
{
builder.append( b.substring(3) );
b = b.substring(0, 3);
}
a = a.replace("'", "");
b = b.replace("'", "");
if(a.length() != 2 || a.replaceAll("[A-Fa-f0-9]", "").length() != 0)
{
continue;
}
if(b.length() != 2 || b.replaceAll("[A-Fa-f0-9]", "").length() != 0)
{
continue;
}
int ch = Integer.valueOf(b+a, 16);
byte[] temp = new byte[2];
temp[0] = (byte) ch;
temp[1] = (byte) (ch >> 8);
builder.append( new String(temp) );
i++;
continue;
}
int spaceOffset = buffer.indexOf(" ");
if(spaceOffset > -1)
{
String rest = buffer.substring(spaceOffset+1);
if(buffer.startsWith("f0 "))
{
builder.append( rest.replaceAll("\\}\\{", "") );
continue;
}
if("{".equals(rest) || "}".equals(rest) || rest.indexOf("}{") > -1 || rest.indexOf("}}") > -1)
{
continue;
}
if(rest.startsWith("{"))
{
builder.append( rest.substring(1) );
continue;
}
if(rest.endsWith("}"))
{
builder.append( rest.substring(0, rest.length()-1) );
continue;
}
builder.append( rest );
continue;
}
}
err[0] = ExtractorUtil.EXTRACTING_DONE;
return builder.toString();
}
public static void main(String[] args) throws Exception
{
String text = RTFExtractor.extract("E:\en.rtf", new int[1], 0);
FileUtil.writeAscFile("E:\output.txt", text, false);
System.out.println("Done!");
}
}
赞助商链接
