admin管理员组

文章数量:1567919

2023年12月23日发(作者:)

最近由于工作原因,需要从 PDF 中提取图片和文字。网上这方面的资料很少,费了九牛二虎之力才终于搞定。使用的编程语言是 C#,用到的工具包是 iTextSharp,主要代码如下,希望有相同需求的朋友可以少走些弯路。

方法一:从pdf中提取图片

/// <summary>
/// Walks every page of a PDF, finds the image XObjects listed in the page
/// resources and passes each one to <c>RenderImage</c>.
/// </summary>
/// <param name="pdfFile">Path of the PDF file to scan.</param>
/// <remarks>
/// Extraction is best-effort: a failure on one image is reported through
/// <c>System.Diagnostics.Trace</c> and the remaining images are still processed.
/// NOTE(review): the published listing lost the qualifier of most member accesses
/// (e.g. "OfPages", "eN(pageNumber)"); the iTextSharp 5.x calls below are a
/// reconstruction from the surviving fragments — confirm against the library
/// version actually referenced by the project.
/// </remarks>
private void ExtractImage(string pdfFile)
{
    // Open the document once. The original opened a second reader for every page
    // and never closed any of them, leaking one file handle per page.
    PdfReader pdfReader = new PdfReader(pdfFile);
    try
    {
        for (int pageNumber = 1; pageNumber <= pdfReader.NumberOfPages; pageNumber++)
        {
            PdfDictionary page = pdfReader.GetPageN(pageNumber);
            if (page == null)
            {
                continue;
            }

            // A page without a /Resources or /XObject entry simply has no images.
            PdfDictionary resources =
                PdfReader.GetPdfObject(page.Get(PdfName.RESOURCES)) as PdfDictionary;
            if (resources == null)
            {
                continue;
            }

            PdfDictionary xobjects =
                PdfReader.GetPdfObject(resources.Get(PdfName.XOBJECT)) as PdfDictionary;
            if (xobjects == null)
            {
                continue;
            }

            foreach (PdfName name in xobjects.Keys)
            {
                PdfObject obj = xobjects.Get(name);
                if (obj == null || !obj.IsIndirect())
                {
                    continue;
                }

                // Guard each image separately so one broken stream does not
                // make the remaining images on the page disappear silently.
                try
                {
                    PdfDictionary target = PdfReader.GetPdfObject(obj) as PdfDictionary;
                    if (target == null)
                    {
                        continue;
                    }

                    // /XObject also holds form XObjects; only /Subtype /Image can be rendered.
                    PdfObject subtype = PdfReader.GetPdfObject(target.Get(PdfName.SUBTYPE));
                    if (!PdfName.IMAGE.Equals(subtype))
                    {
                        continue;
                    }

                    ImageRenderInfo imgRI = ImageRenderInfo.CreateForXObject(
                        new GraphicsState(), (PRIndirectReference)obj, target);
                    RenderImage(imgRI);
                }
                catch (System.Exception ex)
                {
                    System.Diagnostics.Trace.TraceWarning(
                        "Skipping image {0} on page {1} of '{2}': {3}",
                        name, pageNumber, pdfFile, ex.Message);
                }
            }
        }
    }
    finally
    {
        pdfReader.Close();
    }
}

方法二:将图片保存到文件

/// <summary>
/// Decodes the image described by <paramref name="renderInfo"/> and saves it to disk.
/// </summary>
/// <param name="renderInfo">Render information of one PDF image XObject.</param>
/// <remarks>
/// Increments the <c>count</c> member (declared outside this block) on every call
/// and uses the new value to build a unique file name.
/// NOTE(review): the output path in the published listing was lost (it reads
/// <c>@""</c>, which would make Save throw). Files are written to the current
/// directory as <c>image_&lt;count&gt;.png</c> — confirm the intended location.
/// </remarks>
private void RenderImage(ImageRenderInfo renderInfo)
{
    count++;

    PdfImageObject image = renderInfo.GetImage();
    if (image == null)
    {
        return;
    }

    // Fully qualified because iTextSharp declares its own Image type.
    using (System.Drawing.Image dotnetImg = image.GetDrawingImage())
    {
        if (dotnetImg == null)
        {
            return;
        }

        string outputPath = System.IO.Path.Combine(
            System.IO.Directory.GetCurrentDirectory(),
            "image_" + count + ".png");

        // Copy into a Bitmap as the original did, but dispose it afterwards.
        // A memory bitmap has no file encoder of its own, so Save(string) uses PNG.
        using (Bitmap bitmap = new Bitmap(dotnetImg))
        {
            bitmap.Save(outputPath);
        }
    }
}

方法三:从pdf中提取文本

/// <summary>
/// Extracts the text of every page of a PDF.
/// </summary>
/// <param name="pdfFile">Path of the PDF file to read.</param>
/// <returns>
/// The text of all pages in page order, each page followed by a line break.
/// </returns>
/// <remarks>
/// The original listing extracted each page into a local that was immediately
/// discarded, so no text ever left the method. The text is now returned; existing
/// callers that invoke the method as a statement are unaffected.
/// NOTE(review): "tFromPage" and "OfPages" in the published listing are assumed to be
/// PdfTextExtractor.GetTextFromPage and PdfReader.NumberOfPages — confirm.
/// </remarks>
public string ExtractTextFromPDFPage(string pdfFile)
{
    System.Text.StringBuilder allText = new System.Text.StringBuilder();

    PdfReader reader = new PdfReader(pdfFile);
    try
    {
        int pageCount = reader.NumberOfPages;

        // iText page numbers are 1-based.
        for (int i = 1; i <= pageCount; i++)
        {
            allText.AppendLine(PdfTextExtractor.GetTextFromPage(reader, i));
        }
    }
    finally
    {
        // Close even when extraction throws, so the file handle is released.
        reader.Close();
    }

    return allText.ToString();
}

本文标签: 图片提取文字用到需求