I’m using Apache PDFBox to parse a PDF document and extract text into a new PDF while preserving original fonts and coordinates. the pdfbox version is '3.0.2', The following code generally works, but causes significant coordinate offsets for specific characters (e.g., the Chinese characters "一年级" in the PDF 0.pdf ). The characters "一年级" appear displaced by ~20-30 units compared to their original positions. Other text elements retain correct coordinates. How can I resolve this misalignment?
public static void main(String[] args) {
String input = "0.pdf";
String out = "0_out.pdf";
PDDocument document = null;
try {
document = Loader.loadPDF(new File(input));
PDDocument outDoc = new PDDocument();
copyPdfText(document, outDoc);
outDoc.save(new File(out));
outDoc.close();
} catch (IOException e) {
e.printStackTrace();
}
}
public static void copyPdfText(PDDocument source, PDDocument target) throws IOException {
Set<PDFont> backupFonts = new HashSet();
PDFont font_simhei = PDType0Font.load(source, new File("C:\\Users\\admin\\Desktop\\simhei.ttf"));
backupFonts.add(font_simhei );
for (int i = 0; i < source.getNumberOfPages(); i++) {
PDPage sourcePage = source.getPage(i);
PDPage targetPage = null;
if (i < target.getNumberOfPages()) {
targetPage = target.getPage(i);
} else {
float pageWidth = sourcePage.getCropBox().getWidth();
float pageHeight = sourcePage.getCropBox().getHeight();
targetPage = new PDPage(new PDRectangle(pageWidth, pageHeight));
target.addPage(targetPage);
}
copyText2(backupFonts, source, i, target, targetPage);
}
}
public static void copyText2(Set<PDFont> backupFonts, PDDocument source, int sourcePageNumber, PDDocument target, PDPage targetPage) throws IOException {
List<TextPosition> allTextPositions = new ArrayList<>();
PDFTextStripper pdfTextStripper = new PDFTextStripper() {
@Override
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
allTextPositions.addAll(textPositions);
super.writeString(text, textPositions);
}
};
pdfTextStripper.setStartPage(sourcePageNumber + 1);
pdfTextStripper.setEndPage(sourcePageNumber + 1);
pdfTextStripper.setSortByPosition(true);
pdfTextStripper.getText(source);
Iterable<COSName> fontNames = source.getPage(sourcePageNumber).getResources().getFontNames();
for (COSName cs : fontNames) {
PDFont ft = source.getPage(sourcePageNumber).getResources().getFont(cs);
backupFonts.add(ft);
}
PDPageContentStream contentStream = new PDPageContentStream(target, targetPage, PDPageContentStream.AppendMode.APPEND, true, true);
contentStream.beginText();
for (TextPosition position : allTextPositions) {
try {
// set font and fontsize
contentStream.setFont(position.getFont(), position.getFontSizeInPt());
Matrix m = position.getTextMatrix().clone();
m.concatenate(position.getFont().getFontMatrix());
int angle = (int) Math.round(Math.toDegrees(Math.atan2(m.getShearY(), m.getScaleY())));
// 获取文本的坐标和旋转信息
float tx = position.getXDirAdj();
float ty = position.getTextMatrix().getTranslateY();
float fontSize = position.getFontSize();
double theta = (angle * Math.PI) / 180;
// 构建旋转矩阵
float cos = (float) Math.cos(theta);
float sin = (float) Math.sin(theta);
// 构建完整的变换矩阵,考虑字体大小和旋转
float a = cos * fontSize;
float b = sin * fontSize;
float c = -sin * fontSize;
float d = cos * fontSize;
float e = tx;
float f = ty;
Matrix newMat = new Matrix(a, b, c, d, e, f);
contentStream.setTextMatrix(newMat);
contentStream.showText(position.getUnicode());
} catch (Exception e) {
System.out.println("--------------------------------exception");
PDFont fft = getAvaliableFont(position.getUnicode(), backupFonts);
if (fft != null) {
//适配的字体?
contentStream.setFont(fft, position.getFontSizeInPt());
// contentStream.newLineAtOffset(position.getX() - x, - (position.getY() - y));
contentStream.showText(position.getUnicode());
}
}
}
contentStream.endText();
contentStream.close();
}
public static PDFont getAvaliableFont(String text, Set<PDFont> backups) {
for (PDFont ft : backups) {
try {
if (ft.encode(text) != null) {
return ft;
}
} catch (Exception e) {
System.out.println(ft.getName() + " ");
}
}
return null;
}
I tried to chang the font to local 'simhei' when process the special chars "一年级"; like this:
` contentStream.setFont( font_simhei , position.getFontSizeInPt());`
it seems work,the coordinate is correct , but I'm not sure when the problem happend again in other pdf files and how to solve it
this is the original pdf looks like in adobe :
o_oriinal.jpg
and this is the output file looks like in Adobe :
0_dest.jpg
I print the text position ,here is the result:
TextMatrix:[29.0,0.0,0.0,29.0,42.2633,684.99915]--XDirAdj:42.2633--YDirAdj:52.008423--text:一
TextMatrix:[29.0,0.0,0.0,29.0,42.2633,651.93915]--XDirAdj:42.2633--YDirAdj:85.06842--text:年
TextMatrix:[29.0,0.0,0.0,29.0,42.2633,618.90814]--XDirAdj:42.2633--YDirAdj:118.09943--text:级
TextMatrix:[75.6,0.0,0.0,72.0,141.54689,601.7391]--XDirAdj:141.54689--YDirAdj:135.2685--text:校
TextMatrix:[75.6,0.0,0.0,72.0,220.92688,601.7391]--XDirAdj:220.92688--YDirAdj:135.2685--text:本