module Pdftext: sig
.. end
Parsing Fonts and Extracting Text
Data Types
type
type3_glpyhs = {
}
type
simple_fonttype =
type fontmetrics = int array
type fontdescriptor
type differences
type encoding =
  | ImplicitInFontFile
  | StandardEncoding
  | MacRomanEncoding
  | WinAnsiEncoding
  | MacExpertEncoding
  | CustomEncoding of encoding * differences
  | FillUndefinedWithStandard of encoding
type
simple_font = {
}
type standard_font =
  | TimesRoman
  | TimesBold
  | TimesItalic
  | TimesBoldItalic
  | Helvetica
  | HelveticaBold
  | HelveticaOblique
  | HelveticaBoldOblique
  | Courier
  | CourierBold
  | CourierOblique
  | CourierBoldOblique
  | Symbol
  | ZapfDingbats
type cid_system_info = {
  registry : string;
  ordering : string;
  supplement : int;
}
type composite_CIDfont = {
  cid_system_info : cid_system_info;
  cid_basefont : string;
  cid_fontdescriptor : fontdescriptor;
  cid_widths : (int * int) list;
  cid_default_width : int;
}
type cmap_encoding =
  | Predefined of string
  | CMap of int
type
font =
Reading a Font
val read_font : Pdf.pdfdoc -> Pdf.pdfobject -> font
Read a font from a given document and object
Text Extraction
type text_extractor
The type of text extractors.
val text_extractor_of_font : Pdf.pdfdoc -> Pdf.pdfobject -> text_extractor
Build a text extractor from a document and font object
val codepoints_of_text : text_extractor -> string -> int list
Return a list of Unicode code points from a given extractor and string (for example, from a Pdfpages.Op_Tj or Op_TJ operator).
val utf16be_of_text : text_extractor -> string -> string
Same as codepoints_of_text, but return the text as a UTF-16BE string.
val latin1_string_of_text : text_extractor -> string -> string
Same as codepoints_of_text, but return the text as a Latin-1 string (lossy).
val decode_char : encoding -> char -> char
Decode a single character code in a standard font
val decode_type3_char : encoding -> char -> string
Decode a single character code in a type3 font to a glyph name