533 lines
18 KiB
Python
533 lines
18 KiB
Python
from binascii import unhexlify
|
||
from math import ceil
|
||
from typing import Any, Dict, List, Tuple, Union, cast
|
||
|
||
from ._codecs import adobe_glyphs, charset_encoding
|
||
from ._utils import logger_error, logger_warning
|
||
from .generic import (
|
||
DecodedStreamObject,
|
||
DictionaryObject,
|
||
StreamObject,
|
||
is_null_or_none,
|
||
)
|
||
|
||
|
||
# code freely inspired from @twiggy ; see #711
|
||
def build_char_map(
    font_name: str, space_width: float, obj: DictionaryObject
) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any], DictionaryObject]:
    """
    Determine information about a font.

    Args:
        font_name: font name as a string
        space_width: default space width if no data is found.
        obj: XObject or Page where you can find a /Resource dictionary

    Returns:
        Font sub-type, space_width criteria (50% of width), encoding,
        map character-map, font-dictionary.
        The font-dictionary itself is suitable for the curious.

    """
    font_dict: DictionaryObject = obj["/Resources"]["/Font"][font_name]  # type: ignore
    subtype, half_space, enc, cmap = build_char_map_from_dict(space_width, font_dict)
    return subtype, half_space, enc, cmap, font_dict
|
||
|
||
|
||
def build_char_map_from_dict(
    space_width: float, ft: DictionaryObject
) -> Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]]:
    """
    Determine information about a font.

    Args:
        space_width: default space width if no data is found
            (normally half the width of a character).
        ft: Font Dictionary

    Returns:
        Font sub-type, space_width criteria (50% of width), encoding,
        map character-map.
        The font-dictionary itself is suitable for the curious.

    """
    subtype = cast(str, ft["/Subtype"].get_object())
    enc, cmap = get_encoding(ft)

    # locate the character code actually used for the space character
    space_code = get_actual_str_key(" ", enc, cmap)
    widths = build_font_width_map(ft, space_width * 2.0)
    half_space = compute_space_width(widths, space_code) / 2.0

    # https://github.com/python/mypy/issues/4374
    return subtype, half_space, enc, cmap
|
||
|
||
|
||
# used when missing data, e.g. font def missing
|
||
unknown_char_map: Tuple[str, float, Union[str, Dict[int, str]], Dict[Any, Any]] = (
|
||
"Unknown",
|
||
9999,
|
||
dict(zip(range(256), ["<EFBFBD>"] * 256)),
|
||
{},
|
||
)
|
||
|
||
|
||
_predefined_cmap: Dict[str, str] = {
|
||
"/Identity-H": "utf-16-be",
|
||
"/Identity-V": "utf-16-be",
|
||
"/GB-EUC-H": "gbk",
|
||
"/GB-EUC-V": "gbk",
|
||
"/GBpc-EUC-H": "gb2312",
|
||
"/GBpc-EUC-V": "gb2312",
|
||
"/GBK-EUC-H": "gbk",
|
||
"/GBK-EUC-V": "gbk",
|
||
"/GBK2K-H": "gb18030",
|
||
"/GBK2K-V": "gb18030",
|
||
"/ETen-B5-H": "cp950",
|
||
"/ETen-B5-V": "cp950",
|
||
"/ETenms-B5-H": "cp950",
|
||
"/ETenms-B5-V": "cp950",
|
||
"/UniCNS-UTF16-H": "utf-16-be",
|
||
"/UniCNS-UTF16-V": "utf-16-be",
|
||
"/UniGB-UTF16-H": "gb18030",
|
||
"/UniGB-UTF16-V": "gb18030",
|
||
# UCS2 in code
|
||
}
|
||
|
||
# manually extracted from http://mirrors.ctan.org/fonts/adobe/afm/Adobe-Core35_AFMs-229.tar.gz
|
||
_default_fonts_space_width: Dict[str, int] = {
|
||
"/Courier": 600,
|
||
"/Courier-Bold": 600,
|
||
"/Courier-BoldOblique": 600,
|
||
"/Courier-Oblique": 600,
|
||
"/Helvetica": 278,
|
||
"/Helvetica-Bold": 278,
|
||
"/Helvetica-BoldOblique": 278,
|
||
"/Helvetica-Oblique": 278,
|
||
"/Helvetica-Narrow": 228,
|
||
"/Helvetica-NarrowBold": 228,
|
||
"/Helvetica-NarrowBoldOblique": 228,
|
||
"/Helvetica-NarrowOblique": 228,
|
||
"/Times-Roman": 250,
|
||
"/Times-Bold": 250,
|
||
"/Times-BoldItalic": 250,
|
||
"/Times-Italic": 250,
|
||
"/Symbol": 250,
|
||
"/ZapfDingbats": 278,
|
||
}
|
||
|
||
|
||
def get_encoding(
    ft: DictionaryObject
) -> Tuple[Union[str, Dict[int, str]], Dict[Any, Any]]:
    """
    Return the (encoding, character-map) pair for the font dictionary.

    Args:
        ft: Font Dictionary

    Returns:
        Encoding (codec name or code-to-string table) and the map
        extracted from /ToUnicode.

    """
    enc = _parse_encoding(ft)
    cmap, int_keys = _parse_to_unicode(ft)

    # Apply rule from PDF ref 1.7 §5.9.1, 1st bullet:
    # if cmap not empty encoding should be discarded
    # (here transformed into identity for those characters)
    # If encoding is a string it is expected to be an identity translation.
    if isinstance(enc, dict):
        for code in int_keys:
            if code <= 255:
                enc[code] = chr(code)

    return enc, cmap
|
||
|
||
|
||
def _parse_encoding(
|
||
ft: DictionaryObject
|
||
) -> Union[str, Dict[int, str]]:
|
||
encoding: Union[str, List[str], Dict[int, str]] = []
|
||
if "/Encoding" not in ft:
|
||
if "/BaseFont" in ft and cast(str, ft["/BaseFont"]) in charset_encoding:
|
||
encoding = dict(
|
||
zip(range(256), charset_encoding[cast(str, ft["/BaseFont"])])
|
||
)
|
||
else:
|
||
encoding = "charmap"
|
||
return encoding
|
||
enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore
|
||
if isinstance(enc, str):
|
||
try:
|
||
# already done : enc = NameObject.unnumber(enc.encode()).decode()
|
||
# for #xx decoding
|
||
if enc in charset_encoding:
|
||
encoding = charset_encoding[enc].copy()
|
||
elif enc in _predefined_cmap:
|
||
encoding = _predefined_cmap[enc]
|
||
elif "-UCS2-" in enc:
|
||
encoding = "utf-16-be"
|
||
else:
|
||
raise Exception("not found")
|
||
except Exception:
|
||
logger_error(f"Advanced encoding {enc} not implemented yet", __name__)
|
||
encoding = enc
|
||
elif isinstance(enc, DictionaryObject) and "/BaseEncoding" in enc:
|
||
try:
|
||
encoding = charset_encoding[cast(str, enc["/BaseEncoding"])].copy()
|
||
except Exception:
|
||
logger_error(
|
||
f"Advanced encoding {encoding} not implemented yet",
|
||
__name__,
|
||
)
|
||
encoding = charset_encoding["/StandardCoding"].copy()
|
||
else:
|
||
encoding = charset_encoding["/StandardCoding"].copy()
|
||
if "/Differences" in enc:
|
||
x: int = 0
|
||
o: Union[int, str]
|
||
for o in cast(DictionaryObject, cast(DictionaryObject, enc)["/Differences"]):
|
||
if isinstance(o, int):
|
||
x = o
|
||
else: # isinstance(o,str):
|
||
try:
|
||
if x < len(encoding):
|
||
encoding[x] = adobe_glyphs[o] # type: ignore
|
||
except Exception:
|
||
encoding[x] = o # type: ignore
|
||
x += 1
|
||
if isinstance(encoding, list):
|
||
encoding = dict(zip(range(256), encoding))
|
||
return encoding
|
||
|
||
|
||
def _parse_to_unicode(
|
||
ft: DictionaryObject
|
||
) -> Tuple[Dict[Any, Any], List[int]]:
|
||
# will store all translation code
|
||
# and map_dict[-1] we will have the number of bytes to convert
|
||
map_dict: Dict[Any, Any] = {}
|
||
|
||
# will provide the list of cmap keys as int to correct encoding
|
||
int_entry: List[int] = []
|
||
|
||
if "/ToUnicode" not in ft:
|
||
if ft.get("/Subtype", "") == "/Type1":
|
||
return _type1_alternative(ft, map_dict, int_entry)
|
||
else:
|
||
return {}, []
|
||
process_rg: bool = False
|
||
process_char: bool = False
|
||
multiline_rg: Union[
|
||
None, Tuple[int, int]
|
||
] = None # tuple = (current_char, remaining size) ; cf #1285 for example of file
|
||
cm = prepare_cm(ft)
|
||
for line in cm.split(b"\n"):
|
||
process_rg, process_char, multiline_rg = process_cm_line(
|
||
line.strip(b" \t"),
|
||
process_rg,
|
||
process_char,
|
||
multiline_rg,
|
||
map_dict,
|
||
int_entry,
|
||
)
|
||
|
||
return map_dict, int_entry
|
||
|
||
|
||
def get_actual_str_key(
    value_char: str, encoding: Union[str, Dict[int, str]], map_dict: Dict[Any, Any]
) -> str:
    """
    Find the character-code key that translates to ``value_char``.

    When the encoding is a table, the key is looked up there (as a chr of
    the code); otherwise the character map is searched. Falls back to
    ``value_char`` itself when no reverse entry exists.
    """
    if isinstance(encoding, dict):
        reverse = {
            val: chr(code) for code, val in encoding.items() if val == value_char
        }
    else:
        reverse = {val: code for code, val in map_dict.items() if val == value_char}
    return reverse.get(value_char, value_char)
|
||
|
||
|
||
def prepare_cm(ft: DictionaryObject) -> bytes:
    """
    Return the /ToUnicode CMap of the font as normalized bytes.

    The stream is rewritten so that every bfchar/bfrange keyword and every
    hex string sits space-separated on its own line, making the subsequent
    line-oriented parsing reliable (needed e.g. for PDFs printed from Word
    that lack line returns).
    """
    tu = ft["/ToUnicode"]
    cm: bytes
    if isinstance(tu, StreamObject):
        cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
    else:  # if (tu is None) or cast(str, tu).startswith("/Identity"):
        # the full range 0000-FFFF will be processed
        cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
    if isinstance(cm, str):
        cm = cm.encode()
    # we need to prepare cm before due to missing return line in pdf printed
    # to pdf from word
    cm = (
        cm.strip()
        .replace(b"beginbfchar", b"\nbeginbfchar\n")
        .replace(b"endbfchar", b"\nendbfchar\n")
        .replace(b"beginbfrange", b"\nbeginbfrange\n")
        .replace(b"endbfrange", b"\nendbfrange\n")
        .replace(b"<<", b"\n{\n")  # text between << and >> not used but
        .replace(b">>", b"\n}\n")  # some solution to find it back
    )
    pieces = cm.split(b"<")
    for idx, piece in enumerate(pieces):
        close = piece.find(b">")
        if close < 0:
            continue
        if close == 0:
            # string is empty: stash a placeholder here (consumed later)
            # see https://github.com/py-pdf/pypdf/issues/1111
            content = b"."
        else:
            content = piece[:close].replace(b" ", b"")
        pieces[idx] = content + b" " + piece[close + 1 :]
    return (
        b" ".join(pieces)
        .replace(b"[", b" [ ")
        .replace(b"]", b" ]\n ")
        .replace(b"\r", b"\n")
    )
|
||
|
||
|
||
def process_cm_line(
    line: bytes,
    process_rg: bool,
    process_char: bool,
    multiline_rg: Union[None, Tuple[int, int]],
    map_dict: Dict[Any, Any],
    int_entry: List[int],
) -> Tuple[bool, bool, Union[None, Tuple[int, int]]]:
    """
    Process one normalized line of a CMap.

    Section keywords toggle the bfrange/bfchar flags; any other line is
    handed to the parser of the section currently open.

    Returns:
        Updated (process_rg, process_char, multiline_rg) state.

    """
    # skip blanks and comments (37 == ord("%"))
    if not line or line[0] == 37:
        return process_rg, process_char, multiline_rg
    line = line.replace(b"\t", b" ")
    if b"beginbfrange" in line:
        process_rg = True
    elif b"endbfrange" in line:
        process_rg = False
    elif b"beginbfchar" in line:
        process_char = True
    elif b"endbfchar" in line:
        process_char = False
    elif process_rg:
        multiline_rg = parse_bfrange(line, map_dict, int_entry, multiline_rg)
    elif process_char:
        parse_bfchar(line, map_dict, int_entry)
    return process_rg, process_char, multiline_rg
|
||
|
||
|
||
def parse_bfrange(
    line: bytes,
    map_dict: Dict[Any, Any],
    int_entry: List[int],
    multiline_rg: Union[None, Tuple[int, int]],
) -> Union[None, Tuple[int, int]]:
    """
    Parse one bfrange line of a CMap, filling map_dict and int_entry.

    Args:
        line: normalized bfrange line (hex strings space separated).
        map_dict: translation map under construction; map_dict[-1] holds
            the number of bytes per character code.
        int_entry: list of the character codes seen, as ints.
        multiline_rg: (next_char, last_char) when a ``[ ... ]`` list opened
            on an earlier line is still unterminated; cf #1285.

    Returns:
        None when the range was closed on this line, otherwise the
        (next_char, last_char) state to resume with on the next line.

    """
    tokens = [t for t in line.split(b" ") if t]

    def record(code: int, dest: bytes) -> None:
        # source key is rendered at the cmap's byte width; single-byte keys
        # decode as charmap, wider ones as utf-16-be
        key = unhexlify(fmt % code).decode(
            "charmap" if map_dict[-1] == 1 else "utf-16-be",
            "surrogatepass",
        )
        map_dict[key] = dest.decode("utf-16-be", "surrogatepass")
        int_entry.append(code)

    closed = False
    if multiline_rg is not None:
        # resume the bracketed destination list from the previous line
        fmt = b"%%0%dX" % (map_dict[-1] * 2)
        a, b = multiline_rg  # a, b not in the current line
        for sq in tokens:
            if sq == b"]":
                closed = True
                break
            record(a, unhexlify(sq))
            a += 1
    else:
        a = int(tokens[0], 16)
        b = int(tokens[1], 16)
        nbi = max(len(tokens[0]), len(tokens[1]))
        map_dict[-1] = ceil(nbi / 2)
        fmt = b"%%0%dX" % (map_dict[-1] * 2)
        if tokens[2] == b"[":
            # "first last [ dst1 dst2 ... ]" form
            for sq in tokens[3:]:
                if sq == b"]":
                    closed = True
                    break
                record(a, unhexlify(sq))
                a += 1
        else:
            # "first last start" form: destinations increment from start
            c = int(tokens[2], 16)
            fmt2 = b"%%0%dX" % max(4, len(tokens[2]))
            closed = True
            while a <= b:
                record(a, unhexlify(fmt2 % c))
                a += 1
                c += 1
    return None if closed else (a, b)
|
||
|
||
|
||
def parse_bfchar(line: bytes, map_dict: Dict[Any, Any], int_entry: List[int]) -> None:
    """
    Parse one bfchar line of a CMap: pairs of (source hex, destination hex).

    Fills map_dict in place (map_dict[-1] = bytes per source code) and
    appends every source code, as int, to int_entry.
    """
    tokens = [t for t in line.split(b" ") if t]
    map_dict[-1] = len(tokens[0]) // 2
    while len(tokens) > 1:
        src, dst = tokens[0], tokens[1]
        mapped = ""
        # "." placeholder (inserted by prepare_cm) means empty string
        if dst != b".":
            # join is here as some cases where the code was split
            mapped = unhexlify(dst).decode(
                "charmap" if len(dst) < 4 else "utf-16-be", "surrogatepass"
            )
        key = unhexlify(src).decode(
            "charmap" if map_dict[-1] == 1 else "utf-16-be", "surrogatepass"
        )
        map_dict[key] = mapped
        int_entry.append(int(src, 16))
        tokens = tokens[2:]
|
||
|
||
|
||
def build_font_width_map(
    ft: DictionaryObject, default_font_width: float
) -> Dict[Any, float]:
    """
    Build a map from character to glyph width for the font.

    Args:
        ft: Font Dictionary.
        default_font_width: width to use when the font provides none.

    Returns:
        Dict mapping characters to widths; the "default" entry holds the
        fallback width used for characters without their own entry.

    """
    font_width_map: Dict[Any, float] = {}
    st: int = 0
    en: int = 0
    try:
        # fix: get_object must be *called* — previously the bound method
        # itself was used as the lookup key, so this table never matched
        # and the standard-font defaults were silently skipped
        default_font_width = (
            _default_fonts_space_width[cast(str, ft["/BaseFont"].get_object())] * 2.0
        )
    except KeyError:
        # no /BaseFont, or not one of the standard fonts
        pass
    if "/DescendantFonts" in ft:  # ft["/Subtype"].startswith("/CIDFontType"):
        # §9.7.4.3 of the 1.7 reference ("Glyph Metrics in CIDFonts")
        # Widths for a CIDFont are defined using the DW and W entries.
        # DW2 and W2 are for vertical use. Vertical type is not implemented.
        ft1 = ft["/DescendantFonts"][0].get_object()  # type: ignore
        try:
            font_width_map["default"] = cast(float, ft1["/DW"])
        except Exception:
            font_width_map["default"] = default_font_width
        if "/W" in ft1:
            w = ft1["/W"].get_object()
        else:
            w = []
        while len(w) > 0:
            st = w[0] if isinstance(w[0], int) else w[0].get_object()
            second = w[1].get_object()
            if isinstance(second, int):
                # C_first C_last same_W
                en = second
                for c_code in range(st, en + 1):
                    font_width_map[chr(c_code)] = w[2]
                w = w[3:]
            elif isinstance(second, list):
                # Starting_C [W1 W2 ... Wn]
                c_code = st
                for width in second:
                    font_width_map[chr(c_code)] = width
                    c_code += 1
                w = w[2:]
            else:
                logger_warning(
                    "unknown widths : \n" + (ft1["/W"]).__repr__(),
                    __name__,
                )
                break
    elif "/Widths" in ft:
        w = ft["/Widths"].get_object()
        if "/FontDescriptor" in ft and "/MissingWidth" in cast(
            DictionaryObject, ft["/FontDescriptor"]
        ):
            font_width_map["default"] = ft["/FontDescriptor"]["/MissingWidth"].get_object()  # type: ignore
        else:
            # will consider width of char as avg(width)
            m = 0
            cpt = 0
            for xx in w:
                xx = xx.get_object()
                if xx > 0:
                    m += xx
                    cpt += 1
            font_width_map["default"] = m / max(1, cpt)
        st = cast(int, ft["/FirstChar"])
        en = cast(int, ft["/LastChar"])
        for c_code in range(st, en + 1):
            try:
                width = w[c_code - st].get_object()
                font_width_map[chr(c_code)] = width
            except (IndexError, KeyError):
                # The PDF structure is invalid. The array is too small
                # for the specified font width.
                pass
    if is_null_or_none(font_width_map.get("default")):
        # ensure a usable fallback is always present
        font_width_map["default"] = default_font_width if default_font_width else 0.0
    return font_width_map
|
||
|
||
|
||
def compute_space_width(
    font_width_map: Dict[Any, float], space_char: str
) -> float:
    """
    Return the width of the space character for this font.

    When the space character has no usable (non-zero) entry in
    font_width_map, half of the font's default width is used, since the
    default is a full-character width.
    """
    width = font_width_map.get(space_char, 0)
    if not width:
        # if using default we consider space will be only half size
        width = font_width_map["default"] / 2.0
    return width
|
||
|
||
|
||
def compute_font_width(
    font_width_map: Dict[Any, float],
    char: str
) -> float:
    """
    Return the width of ``char`` in this font.

    Characters without an entry of their own fall back to the font's
    "default" width.
    """
    if char in font_width_map:
        return font_width_map[char]
    return font_width_map["default"]
|
||
|
||
|
||
def _type1_alternative(
|
||
ft: DictionaryObject,
|
||
map_dict: Dict[Any, Any],
|
||
int_entry: List[int],
|
||
) -> Tuple[Dict[Any, Any], List[int]]:
|
||
if "/FontDescriptor" not in ft:
|
||
return map_dict, int_entry
|
||
ft_desc = cast(DictionaryObject, ft["/FontDescriptor"]).get("/FontFile")
|
||
if is_null_or_none(ft_desc):
|
||
return map_dict, int_entry
|
||
assert ft_desc is not None, "mypy"
|
||
txt = ft_desc.get_object().get_data()
|
||
txt = txt.split(b"eexec\n")[0] # only clear part
|
||
txt = txt.split(b"/Encoding")[1] # to get the encoding part
|
||
lines = txt.replace(b"\r", b"\n").split(b"\n")
|
||
for li in lines:
|
||
if li.startswith(b"dup"):
|
||
words = [_w for _w in li.split(b" ") if _w != b""]
|
||
if len(words) > 3 and words[3] != b"put":
|
||
continue
|
||
try:
|
||
i = int(words[1])
|
||
except ValueError: # pragma: no cover
|
||
continue
|
||
try:
|
||
v = adobe_glyphs[words[2].decode()]
|
||
except KeyError:
|
||
if words[2].startswith(b"/uni"):
|
||
try:
|
||
v = chr(int(words[2][4:], 16))
|
||
except ValueError: # pragma: no cover
|
||
continue
|
||
map_dict[chr(i)] = v
|
||
int_entry.append(i)
|
||
return map_dict, int_entry
|