Skip to content

Commit

Permalink
Add new OCR sample.
Browse files Browse the repository at this point in the history
Also, update markdown.
  • Loading branch information
datalogics-josepha committed Jan 22, 2025
1 parent 7201a36 commit 21a197c
Show file tree
Hide file tree
Showing 6 changed files with 200 additions and 0 deletions.
6 changes: 6 additions & 0 deletions OpticalCharacterRecognition/OCRDocument/App.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8" ?>
<configuration>
<startup>
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.7.2" />
</startup>
</configuration>
75 changes: 75 additions & 0 deletions OpticalCharacterRecognition/OCRDocument/OCRDocument.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
using System;
using System.Collections.Generic;
using Datalogics.PDFL;

/*
* Runs OCR on the document, recognizing text found on its rasterized pages.
*
* Copyright (c) 2007-2025, Datalogics, Inc. All rights reserved.
*
*/

namespace OCRDocument
{
    /// <summary>
    /// Console sample that runs OCR on every page of a PDF document and
    /// saves the processed document to a new file.
    /// </summary>
    class OCRDocument
    {
        /// <summary>
        /// Program entry point.
        /// </summary>
        /// <param name="args">
        /// Optional command-line arguments: <c>args[0]</c> replaces the default
        /// input PDF path and <c>args[1]</c> replaces the default output PDF path.
        /// </param>
        static void Main(string[] args)
        {
            Console.WriteLine("OCRDocument Sample:");

            using (Library library = new Library())
            {
                Console.WriteLine("Initialized the library.");

                // Start from the built-in sample paths, then let the command line override them.
                string inputPath = Library.ResourceDirectory + "Sample_Input/scanned_images.pdf";
                string outputPath = "OCRDocument-out.pdf";

                if (args.Length >= 1)
                {
                    inputPath = args[0];
                }

                if (args.Length >= 2)
                {
                    outputPath = args[1];
                }

                Console.WriteLine("Input file: " + inputPath);
                Console.WriteLine("Writing output to: " + outputPath);

                OCRParams recognitionParams = new OCRParams();

                // OCRParams.Languages controls which languages the OCR engine attempts
                // to detect. By default the OCR engine searches for English.
                //
                // To have the engine detect additional languages, add more entries
                // to this list, for example:
                //     new LanguageSetting(Language.Japanese, false)
                List<LanguageSetting> languages = new List<LanguageSetting>
                {
                    new LanguageSetting(Language.English, false)
                };
                recognitionParams.Languages = languages;

                // If the resolution of the images in your document is not 300 dpi,
                // specify a default resolution here. Specifying a correct resolution
                // gives better results for OCR, especially with automatic image
                // preprocessing.
                // recognitionParams.Resolution = 600;

                // The engine is created first and the document second, so the
                // document is disposed before the engine.
                using (OCREngine engine = new OCREngine(recognitionParams))
                using (Document document = new Document(inputPath))
                {
                    // Run OCR on each page in turn, releasing the page as soon as it is done.
                    int pageIndex = 0;
                    while (pageIndex < document.NumPages)
                    {
                        using (Page currentPage = document.GetPage(pageIndex))
                        {
                            currentPage.OCRPageContents(document, engine);
                        }

                        pageIndex++;
                    }

                    document.Save(SaveFlags.Full, outputPath);
                }
            }
        }
    }
}
58 changes: 58 additions & 0 deletions OpticalCharacterRecognition/OCRDocument/OCRDocument.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">x64</Platform>
<ProjectGuid>{C9DD37F3-545F-4346-8EF2-FAE2DD20FDCF}</ProjectGuid>
<OutputType>Exe</OutputType>
<RootNamespace>OCRDocument</RootNamespace>
<AssemblyName>OCRDocument</AssemblyName>
<TargetFrameworkVersion>v4.7.2</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<AutoGenerateBindingRedirects>true</AutoGenerateBindingRedirects>
<Deterministic>true</Deterministic>
<NuGetPackageImportStamp>
</NuGetPackageImportStamp>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|x64' ">
<PlatformTarget>x64</PlatformTarget>
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<OutputPath>..\..\..\dle\build\win-x86-64\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|x64' ">
  <!-- Settings applied when building the Release configuration for the x64 platform. -->
  <PlatformTarget>x64</PlatformTarget>
  <DebugType>pdbonly</DebugType>
  <!-- Release builds enable compiler optimizations; only the Debug configuration leaves them off. -->
  <Optimize>true</Optimize>
  <OutputPath>bin\Release\</OutputPath>
  <DefineConstants>TRACE</DefineConstants>
  <ErrorReport>prompt</ErrorReport>
  <WarningLevel>4</WarningLevel>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Adobe.PDF.Library.LM.NETFramework">
<Version>18.*</Version>
</PackageReference>
<Reference Include="System" />
<Reference Include="System.Core" />
<Reference Include="System.Xml.Linq" />
<Reference Include="System.Data.DataSetExtensions" />
<Reference Include="Microsoft.CSharp" />
<Reference Include="System.Data" />
<Reference Include="System.Net.Http" />
<Reference Include="System.Xml" />
</ItemGroup>
<ItemGroup>
<Compile Include="OCRDocument.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
</ItemGroup>
<ItemGroup>
<None Include="App.config" />
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
</Project>
25 changes: 25 additions & 0 deletions OpticalCharacterRecognition/OCRDocument/OCRDocument.sln
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.33328.57
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OCRDocument", "OCRDocument.csproj", "{C9DD37F3-545F-4346-8EF2-FAE2DD20FDCF}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{C9DD37F3-545F-4346-8EF2-FAE2DD20FDCF}.Debug|x64.ActiveCfg = Debug|x64
{C9DD37F3-545F-4346-8EF2-FAE2DD20FDCF}.Debug|x64.Build.0 = Debug|x64
{C9DD37F3-545F-4346-8EF2-FAE2DD20FDCF}.Release|x64.ActiveCfg = Release|x64
{C9DD37F3-545F-4346-8EF2-FAE2DD20FDCF}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {CEA60573-4A7F-49A3-8EC5-6DCC54E2E30B}
EndGlobalSection
EndGlobal
33 changes: 33 additions & 0 deletions OpticalCharacterRecognition/OCRDocument/Properties/AssemblyInfo.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;

// General Information about an assembly is controlled through the following
// set of attributes. Change these attribute values to modify the information
// associated with an assembly.
[assembly: AssemblyTitle("OCRDocument")]
[assembly: AssemblyDescription("")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("Datalogics, Inc.")]
[assembly: AssemblyProduct("OCRDocument")]
[assembly: AssemblyCopyright("Copyright © Datalogics 2019-2025")]
[assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")]

// Setting ComVisible to false makes the types in this assembly not visible
// to COM components. If you need to access a type in this assembly from
// COM, set the ComVisible attribute to true on that type.
[assembly: ComVisible(false)]

// The following GUID is for the ID of the typelib if this project is exposed to COM
[assembly: Guid("a1a2f184-6250-4843-8d6b-3a72776dd27d")]

// Version information for an assembly consists of the following four values:
//
// Major Version
// Minor Version
// Build Number
// Revision
//
[assembly: AssemblyVersion("1.0.0.0")]
[assembly: AssemblyFileVersion("1.0.0.0")]
3 changes: 3 additions & 0 deletions OpticalCharacterRecognition/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,6 @@ Places recognized text behind the OCR images found on a PDF page.

## ***AddTextToImage***
Adds an image file to a PDF page, runs OCR on the image, and places the recognized text behind it.

## ***OCRDocument***
Runs OCR on the document, recognizing text found on its rasterized pages.

0 comments on commit 21a197c

Please sign in to comment.