Document document;

Fisher810228 發表在 痞客邦 留言(0) 人氣()

PDFBox 1.8.x

import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.util.PDFOperator;

import java.util.ArrayList;
import java.util.List;

/**
 * This is an example on how to remove all text from PDF document.
 *
 * Usage: java org.apache.pdfbox.examples.util.RemoveAllText <input-pdf> <output-pdf>
 *
 * @author Ben Litchfield
 * @version $Revision: 1.2 $
 */
public class RemoveAllText
{
    /**
     * Default constructor.
     */
    private RemoveAllText()
    {
        //example class should not be instantiated
    }

    /**
     * This will remove all text from a PDF document.
     *
     * @param args The command line arguments.
     *
     * @throws Exception If there is an error parsing the document.
     */
    public static void main( String[] args ) throws Exception
    {
        if( args.length != 2 )
        {
            usage();
        }
        else
        {
            PDDocument document = null;
            try
            {
                document = PDDocument.load( args[0] );
                if( document.isEncrypted() )
                {
                    System.err.println( "Error: Encrypted documents are not supported for this example." );
                    System.exit( 1 );
                }
                List allPages = document.getDocumentCatalog().getAllPages();
                for( int i=0; i " );
    }

}

Fisher810228 發表在 痞客邦 留言(0) 人氣()

使用pdfBox 1.8.x版本

import java.io.*;
import org.apache.pdfbox.exceptions.InvalidPasswordException;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;

import java.io.IOException;
import java.util.List;

public class PrintTextLocations extends PDFTextStripper {

    public PrintTextLocations() throws IOException {
        super.setSortByPosition(true);
    }

    public static void main(String[] args) throws Exception {

        PDDocument document = null;
        try {
            File input = new File("C:\\path\\to\\PDF.pdf");
            document = PDDocument.load(input);
            if (document.isEncrypted()) {
                try {
                    document.decrypt("");
                }
            }
            PrintTextLocations printer = new PrintTextLocations();
            List allPages = document.getDocumentCatalog().getAllPages();
            //讀取每一頁
            for (int i = 0; i < allPages.size(); i++) {
                PDPage page = (PDPage) allPages.get(i);
                System.out.println("Processing page: " + i);
                PDStream contents = page.getContents();
                if (contents != null) {
                    //顯示字元資料
                    printer.processStream(page, page.findResources(), page.getContents().getStream());
                }
            }
        } finally {
            if (document != null) {
                document.close();
            }
        }
    }

    /**
     * @param text The text to be processed
     */
    @Override /* this is questionable, not sure if needed... */
    protected void processTextPosition(TextPosition text) {
        System.out.println("String[" + text.getXDirAdj() + ","
                + text.getYDirAdj() + " fs=" + text.getFontSize() + " xscale="
                + text.getXScale() + " height=" + text.getHeightDir() + " space="
                + text.getWidthOfSpace() + " width="
                + text.getWidthDirAdj() + "]" + text.getCharacter());
    }
}

Fisher810228 發表在 痞客邦 留言(0) 人氣()

※怕自己忘記特別做個簡單的操作紀錄※

目前是使用Array做暫存資料的動作,之後透過呼叫write的function去做寫入新的Excel的動作

Fisher810228 發表在 痞客邦 留言(0) 人氣()

※怕自己忘記特別做個簡單的操作紀錄※

由於分析數據會用到很多excel的檔案,故使用POI這個開源碼來進行excel的讀取及資料的抓取。

Fisher810228 發表在 痞客邦 留言(0) 人氣()