Module fusus.lib
Expand source code Browse git
import os
import io
from itertools import chain, groupby
from tempfile import NamedTemporaryFile
import pprint as pp
import numpy as np
import PIL.Image
from IPython.display import HTML, Image, display
import cv2
from tf.core.helpers import rangesFromList, specFromRanges, setFromSpec
# Shared pretty printer for the whole module (2-space indentation).
PP = pp.PrettyPrinter(indent=2)


def pprint(x):
    """Pretty-print a value with the module's shared printer `PP`.

    Parameters
    ----------
    x: any
        The value to print.
    """
    printer = PP
    printer.pprint(x)
def dh(html):
    """Display a string of HTML by wrapping it in `HTML` and passing it to `display`.

    Parameters
    ----------
    html: string
        The HTML source to display.
    """
    rendered = HTML(html)
    display(rendered)
EXTENSIONS = {"jpeg", "jpg", "png", "tif", "tiff"}
"""Supported image file extensions (lower case, without the leading dot).
"""
# Default image file extension.
# NOTE(review): not referenced in the visible part of this module; confirm where it is used.
DEFAULT_EXTENSION = "png"
# OpenCV font constant.
# NOTE(review): presumably used by callers that draw text on images; confirm.
FONT = cv2.FONT_HERSHEY_SIMPLEX
# Base URL that getNbLink and getNbPath prepend to the path components after `~/github`.
NB_VIEWER = "https://nbviewer.jupyter.org/github"
def parseNums(numSpec):
    """Interpret a value as a collection of numbers.

    Parameters
    ----------
    numSpec: None | int | string | iterable
        Any falsy value (`None`, `0`, the empty string, an empty iterable)
        results in `None`.
        An `int` stands for just that int.
        A `string` is handed to `setFromSpec`; it is meant to be a comma
        separated list of numbers or ranges, where a range is a lower bound
        and an upper bound separated by a `-`.
        Anything else should be an iterable of `int` values.

        Examples:

            50
            "50"
            "50,70"
            "50-70,91,92,300-350"
            (50, 70, 91, 92, 300)
            [50, 70, 90]
            range(300, 350)

    Returns
    -------
    None | iterable of int
        `None`, a one-element list, the result of `setFromSpec`,
        or a list built from the iterable, depending on the value.
    """
    if not numSpec:
        return None

    kind = type(numSpec)
    if kind is int:
        return [numSpec]
    if kind is str:
        return setFromSpec(numSpec)
    return list(numSpec)
def getNbLink(path, text):
    """Make an HTML link to the notebook viewer for a path below `~/github`.

    Parameters
    ----------
    path: string
        A path. Only paths that start with `~/github` are turned into links.
    text: string
        The text of the link.

    Returns
    -------
    string | None
        If `path` does not start with `~/github`, the path itself, unchanged.
        If nothing follows `~/github`, `None`.
        Otherwise an `<a>` element that opens in a new tab and points to
        `NB_VIEWER` plus the first two path components; when there are more
        components, they follow after `/blob/master/`.
    """
    if not path.startswith("~/github"):
        return path

    # drop the leading "~" and "github" components
    parts = path.rstrip("/").split("/")[2:]
    if not parts:
        return None

    url = f"{NB_VIEWER}/" + "/".join(parts[0:2])
    if len(parts) > 2:
        rest = "/".join(parts[2:])
        url = f"{url}/blob/master/{rest}"
    return f'<a target="_blank" href="{url}">{text}</a>'
def getNbPath(path):
    """Translate a path below `~/github` into a notebook viewer URL.

    Parameters
    ----------
    path: string
        A path. Only paths that start with `~/github` are translated.

    Returns
    -------
    tuple
        `(False, path)` if the path does not start with `~/github` or if
        nothing follows `~/github`.
        Otherwise `(True, url)` where the url is `NB_VIEWER` plus the first
        two path components; when there are more components, they follow
        after `/blob/master/`.
    """
    if not path.startswith("~/github"):
        return (False, path)

    # drop the leading "~" and "github" components
    parts = path.rstrip("/").split("/")[2:]
    if not parts:
        return (False, path)

    url = f"{NB_VIEWER}/" + "/".join(parts[0:2])
    if len(parts) > 2:
        rest = "/".join(parts[2:])
        url = f"{url}/blob/master/{rest}"
    return (True, url)
def tempFile():
    """Create a named temporary file in the current directory.

    Returns
    -------
    file-like object
        The result of `NamedTemporaryFile`, opened in text write mode (`"w"`).
    """
    handle = NamedTemporaryFile(mode="w", dir=".")
    return handle
def imgElem(data):
    """Wrap image data in a HTML <img> element with a `data:image/jpeg;base64` URL.

    Parameters
    ----------
    data: string
        The image data, inserted verbatim after `base64,` in the `src` attribute.

    Returns
    -------
    string
        The `<img>` element.
    """
    prefix = '<img src="data:image/jpeg;base64,'
    suffix = '">'
    return prefix + f"{data}" + suffix
def PILFromArray(a):
    """Convert an array into a PIL image by means of `PIL.Image.fromarray`.

    Parameters
    ----------
    a: np array
        The image data.

    Returns
    -------
    PIL image
    """
    image = PIL.Image.fromarray(a)
    return image
def arrayFromPIL(img):
    """Convert an image into a numpy array by means of `np.asarray`.

    Parameters
    ----------
    img: PIL image
        The image to convert.

    Returns
    -------
    np array
    """
    array = np.asarray(img)
    return array
def showImage(a, fmt="jpeg", **kwargs):
    """Show one or more images.

    Every image is clipped to the range 0-255, converted to unsigned bytes,
    encoded by PIL in the requested format and then displayed.

    Parameters
    ----------
    a: np array | list | tuple
        A single image, or a list or tuple of images.
        Multiple images are shown together inside one `<div>`.
    fmt: string, optional `jpeg`
        The format that is passed to PIL when the image data is encoded.
    kwargs: dict
        Passed on to `IPython.display.Image`.
    """

    def encode(img):
        # clip first, so that out-of-range values do not wrap around
        # when they are converted to unsigned bytes
        clipped = np.uint8(np.clip(img, 0, 255))
        buffer = io.BytesIO()
        PIL.Image.fromarray(clipped).save(buffer, fmt)
        return buffer.getvalue()

    if type(a) in {list, tuple}:
        # Encode the clipped array here as well: the raw array used to be
        # encoded, unlike in the single-image branch.
        # NOTE(review): this branch relies on `_repr_jpeg_` and on the jpeg
        # data url of `imgElem`, whatever `fmt` is -- confirm it behaves as
        # intended for formats other than jpeg.
        ads = [Image(data=encode(ae), **kwargs)._repr_jpeg_() for ae in a]
        display(HTML(f"<div>{''.join(imgElem(ad) for ad in ads)}</div>"))
    else:
        display(Image(data=encode(a), **kwargs))
def writeImage(a, path, **kwargs):
    """Write an image to disk.

    The image is clipped to the range 0-255 and converted to unsigned bytes
    before it is saved by PIL.

    Parameters
    ----------
    a: np array
        The image data.
    path: string
        The path of the file to write; it is opened in binary write mode.
    kwargs: dict
        Accepted but not used.
    """
    clipped = np.clip(a, 0, 255)
    pixels = np.uint8(clipped)
    with open(path, "wb") as out:
        picture = PIL.Image.fromarray(pixels)
        picture.save(out)
def overlay(img, left, top, right, bottom, srcColor, dstColor):
    """Recolors the pixels of one exact color inside a region of an image.

    Within the selected region, only the pixels whose color is exactly
    `srcColor` get the color `dstColor`.
    In this way you can replace all the white with gray, for example,
    without wiping out existing non-white pixels.

    The image is modified in place; nothing is returned.
    An empty region (`right <= left` or `bottom <= top`) leaves the image
    untouched.

    Parameters
    ----------
    img: np array
        The image to be overlain with a new color.
        The colors are compared along axis 2, so the image must have
        a color axis.
    (left, top, right, bottom): (int, int, int, int)
        The region in the image to be colored
    srcColor: RGB color
        The color of the pixels that may be replaced.
    dstColor:
        The new color of the replaced pixels.
    """
    if right <= left or bottom <= top:
        return

    # a view on the image: assigning to it changes the image itself
    region = img[top:bottom, left:right]
    matches = np.all(region == list(srcColor), axis=2)
    region[matches] = dstColor
def splitext(f, withDot=True):
    """Splits a file name into its main part and its extension.

    Parameters
    ----------
    f: string
        The file name
    withDot: boolean, optional `True`
        If True, the `.` in the extension is considered part of the extension,
        else the dot is stripped from it.

    Returns
    -------
    tuple
        The main part and the extension.
        The extension is the empty string if the file name has none.
    """
    (main, extension) = os.path.splitext(f)
    # a non-empty extension starts with the dot, so dropping the first
    # character removes exactly the dot (and is harmless when it is empty)
    return (main, extension if withDot else extension[1:])
def imageFileList(imDir):
    """Gets a sorted list of image files from a directory.

    Only regular files are listed whose name does not start with a `.`
    and whose extension is in `EXTENSIONS`.

    Parameters
    ----------
    imDir: string
        Path to the image directory

    Returns
    -------
    list
        Alphabetically sorted list of file names (without directory, with extension).
        The list is empty if the directory does not exist.
    """
    if not os.path.exists(imDir):
        return []

    with os.scandir(imDir) as entries:
        names = [
            entry.name
            for entry in entries
            if not entry.name.startswith(".")
            and entry.is_file()
            and splitext(entry.name, withDot=False)[1] in EXTENSIONS
        ]
    names.sort()
    return names
def imageFileListSub(imDir):
    """Gets sorted lists of image files from the subdirectories of a directory.

    Subdirectories whose name starts with a `.` are skipped.
    For each remaining subdirectory the files are listed by `imageFileList`.

    Parameters
    ----------
    imDir: string
        Path to the image directory

    Returns
    -------
    dict
        Keyed by subdirectory names, valued by
        alphabetically sorted list of file names (without directory, with extension).
        The dict is empty if the directory does not exist.
    """
    if not os.path.exists(imDir):
        return {}

    with os.scandir(imDir) as entries:
        return {
            entry.name: imageFileList(f"{imDir}/{entry.name}")
            for entry in entries
            if not entry.name.startswith(".") and entry.is_dir()
        }
def pagesRep(source, asList=False):
    """Represents a set of pages as a string in a compact way or as a list.

    The page number of a file is its name without extension, read as an integer.

    Parameters
    ----------
    source: list
        A list of file names, without directory, with extension
    asList: boolean, optional `False`
        Whether to return the result as a list of integers or as a compact string.

    Returns
    -------
    list or string
        Depending on `asList` a list of page numbers (integers) or a string
        mentioning the page numbers, using intervals where possible.

    Raises
    ------
    ValueError
        If a file name without its extension is not an integer.
    """
    # int() copes with leading zeros by itself. Stripping them beforehand
    # turned a name like "000.jpg" into the empty string, which made int() fail.
    pages = [int(os.path.splitext(f)[0]) for f in source]
    return pages if asList else specFromRanges(rangesFromList(pages))
def select(source, selection):
    """Choose files from a list by their page numbers.

    The page number of a file is its name without extension, read as an integer.

    Parameters
    ----------
    source: iterable of string
        The file names to choose from, without directory, with extension.
    selection: int or string or iterable of int or `None`
        If None, selects all items, otherwise specifies what numbers to select.
        If a number is in the selection, but not in the source, it will not be selected.
        The selection can be an integer, an iterable of integers,
        or a compact string that specifies integers, using ranges and commas.
        In a range, a missing lower bound stands for the lowest page number
        in the source, and a missing upper bound for the highest one.
        Empty items in the string (such as after a trailing comma) are ignored.

    Returns
    -------
    list
        Sorted list of selected file names

    Raises
    ------
    ValueError
        If a file name without its extension is not an integer,
        or if a bound in the selection string is not an integer.
    """
    if selection is None:
        return sorted(source)

    # int() copes with leading zeros by itself. Stripping them beforehand
    # turned a name like "000.jpg" into the empty string, which made int() fail.
    index = {int(os.path.splitext(f)[0]): f for f in source}
    universe = set(index)

    if isinstance(selection, int):
        wanted = {selection}
    elif isinstance(selection, str):
        minu = min(universe, default=0)
        maxu = max(universe, default=0)
        wanted = set()
        for rng in selection.split(","):
            rng = rng.strip()
            if not rng:
                continue
            parts = [part.strip() for part in rng.split("-")]
            if len(parts) == 2:
                (lower, upper) = parts
                lower = minu if lower == "" else int(lower)
                upper = maxu if upper == "" else int(upper)
            else:
                lower = int(parts[0])
                upper = lower
            # filter the existing pages instead of materializing the range,
            # which can be huge
            wanted |= {n for n in universe if lower <= n <= upper}
    else:
        wanted = set(selection)

    return sorted(index[n] for n in wanted & universe)
def cropBorders(img, tolerance=10):
    """Get the bounding box of the image without black borders, if any.

    The image is white writing on black background.
    The outer frame is white, if any.

    We find the region within a white outer frame
    by identifying all black pixels and computing a bounding box around it.

    Thanks to
    [stackoverflow](https://codereview.stackexchange.com/a/132933).

    Parameters
    ----------
    img: numpy array
        The image. We assume it is grayscale, and inverted.
        For best results, it should be blurred before thresholding.
    tolerance: integer
        This parameter is the upper limit of what counts as black:
        pixels with a value below it are black.

    Returns
    -------
    int, int, int, int
        The (x0, x1, y0, y1) of the crop region.
        If there are black pixels, these are the lowest and highest
        column and row that contain one.
        If there are none, nothing is cropped and the result is
        (0, width, 0, height).
        This will be used in `removeBorders` to whiten the margins outside it.
    """
    # Mask of black pixels
    mask = img < tolerance

    # No black pixels at all (or an empty image): we do not crop.
    # Nothing is printed here: a library function should not write
    # debug output to stdout.
    if not mask.any():
        (imH, imW) = img.shape[0:2]
        return (0, imW, 0, imH)

    # Coordinates of black pixels, as (row, column) pairs.
    coords = np.argwhere(mask)

    # Bounding box of black pixels.
    (y0, x0) = coords.min(axis=0)
    (y1, x1) = coords.max(axis=0)
    return (x0, x1, y0, y1)
def removeBorders(img, crop, white):
    """Remove black borders around an image.

    When an image has been unskewed, sharp triangle-shape strokes in the corners
    may have been introduced.
    Or it might be the result of scanning a page.

    This function removes them by drawing four filled rectangles in the color
    `white`: to the left of, above, to the right of, and below the crop region.

    The crop region is meant to be calculated by `cropBorders`.

    Parameters
    ----------
    img: image as np array
        the image to operate on
    crop: (int, int, int, int)
        the x0, x1, y0, y1 values which indicate the region
        outside which the white may be applied
    white: color
        the exact white color with which we color the borders.

    Returns
    -------
    None
        The source image receives a modification.
    """
    (imH, imW) = img.shape[0:2]
    (x0, x1, y0, y1) = crop

    # a thickness of -1 makes cv2 fill the rectangle
    filled = -1

    # left margin
    cv2.rectangle(img, (0, 0), (x0, imH), white, filled)
    # top margin
    cv2.rectangle(img, (0, 0), (imW, y0), white, filled)
    # right margin
    cv2.rectangle(img, (x1, 0), (imW, imH), white, filled)
    # bottom margin
    cv2.rectangle(img, (0, y1), (imW, imH), white, filled)
def parseStages(stage, allStages, sortedStages, error):
    """Parses a string that specifies stages.

    Stages are steps in the image processing.
    Each stage has an intermediate processing result.

    Parameters
    ----------
    stage: string or None or iterable
        If None: it means all stages.
        If empty (e.g. the empty string): it means no stages.
        If a string: a comma-separated list of names of stages.
        If an iterable: the items must be names of stages.
    allStages: tuple or set
        Names of all stages.
    sortedStages:
        Sorted list of all stages.
    error: function
        Method to write error messages.
        It is called with one message if there are illegal stages.

    Returns
    -------
    tuple
        The stages as parsed, without the illegal ones,
        in the order of `sortedStages`.
    """
    # The set arithmetic below needs a set, while callers may well pass
    # a tuple (as documented) or a list.
    allStages = set(allStages)

    if stage is None:
        doStages = allStages
    elif not stage:
        doStages = set()
    elif type(stage) is str:
        doStages = set(stage.split(","))
    else:
        doStages = set(stage)

    illegalStages = doStages - allStages
    if illegalStages:
        error(f"Will skip illegal stages: {', '.join(sorted(illegalStages))}")
        doStages = doStages - illegalStages

    return tuple(s for s in sortedStages if s in doStages)
def parseBands(band, allBands, error):
    """Parses a string that specifies bands.

    Bands are horizontal rectangles defined with respect to lines.
    They correspond with regions of interest where we try to find specific
    marks, such as commas and accents.

    Parameters
    ----------
    band: string or None or iterable
        If None: it means all bands.
        If a string: a comma-separated list of names of bands.
        If an iterable: the items must be names of bands.
    allBands: tuple or set
        Names of all bands.
    error: function
        Method to write error messages.
        It is called with one message if there are illegal bands.

    Returns
    -------
    tuple
        The bands as parsed, without the illegal ones, in sorted order.
    """
    sortedBands = sorted(allBands)

    # The set arithmetic below needs a set, while callers may well pass
    # a tuple (as documented) or a list.
    allBands = set(allBands)

    if band is None:
        # a copy, so that the in-place removal below never touches allBands
        doBands = set(allBands)
    elif type(band) is str:
        doBands = set(band.split(","))
    else:
        doBands = set(band)

    illegalBands = doBands - allBands
    if illegalBands:
        error(f"Will skip illegal bands: {', '.join(sorted(illegalBands))}")
        doBands -= illegalBands

    return tuple(b for b in sortedBands if b in doBands)
def parseMarks(mark, allMarks, bands, error):
    """Parses a string that specifies marks.

    Marks are strokes that we need to find on the page in order to remove them.
    They are organized in bands: the regions of interest with respect to the lines
    where we expect them to occur.

    Parameters
    ----------
    mark: string or None or iterable
        If None: no marks are selected.
        If the empty string: all marks of the bands in `bands`.
        If another string: a comma-separated list of names of marks.
        If an iterable: the items must be names of marks.
    allMarks: dict
        Keyed by band name, valued by the names of the marks in that band.
    bands: iterable
        Names of the bands whose marks are taken when `mark` is the empty string.
        Bands that do not occur in `allMarks` contribute nothing.
    error: function
        Method to write error messages.
        It is called with one message if there are illegal marks.

    Returns
    -------
    set
        The marks as parsed, without the illegal ones.
        A mark is illegal if it does not occur in any band of `allMarks`.
    """
    knownMarks = set(chain.from_iterable(allMarks.values()))

    if mark is None:
        doMarks = set()
    elif mark == "":
        doMarks = set(chain.from_iterable(allMarks.get(b, ()) for b in bands))
    elif type(mark) is str:
        doMarks = set(mark.split(","))
    else:
        doMarks = set(mark)

    illegalMarks = doMarks - knownMarks
    if illegalMarks:
        error(f"Will skip illegal marks: {', '.join(sorted(illegalMarks))}")
        doMarks -= illegalMarks

    return doMarks
def findRuns(x):
    """Find runs of consecutive equal items in a one-dimensional array.

    Credits:
    [Alistair Miles](https://gist.github.com/alimanfoo/c5977e87111abe8127453b21204c1065)

    Parameters
    ----------
    x: array-like
        The items, one-dimensional.

    Returns
    -------
    tuple of np array
        The value, the start position and the length of each run.
        For empty input these are three empty arrays.

    Raises
    ------
    ValueError
        If the input is not one-dimensional.
    """
    items = np.asanyarray(x)
    if items.ndim != 1:
        raise ValueError("only 1D array supported")

    size = items.shape[0]
    if size == 0:
        return np.array([]), np.array([]), np.array([])

    # a run starts at position 0 and wherever an item differs from its predecessor
    isStart = np.empty(size, dtype=bool)
    isStart[0] = True
    isStart[1:] = items[:-1] != items[1:]

    starts = np.flatnonzero(isStart)
    values = items[isStart]
    # the total size closes the last run
    lengths = np.diff(np.append(starts, size))
    return values, starts, lengths
def applyBandOffset(C, height, bandName, lines, inter=False):
    """Produce bands from a list of lines.

    Bands are defined relative to lines by means of offsets of the top
    and bottom heights of the lines.

    Bands may also be interlinear: defined between the bottom of one line and the top
    of the next line.

    Parameters
    ----------
    C: object
        Configuration settings.
        Its `offsetBand` member gives, per band name, the top and bottom offset.
    height:
        The height of the page or block.
        Band boundaries are kept within the range 0 to `height`.
    bandName: string
        The name of the bands
    lines: tuple
        The lines relative to which the bands have to be determined.
        Lines are given as a tuple of tuples of top and bottom heights.
    inter: boolean, optional `False`
        Whether the bands are relative to the lines, or relative to
        the interlinear spaces.

    Returns
    -------
    tuple
        The bands named bandName, specified by top and bottom heights:
        one for each line, or, if `inter` is true, one for each space
        between two consecutive lines.
    """
    (topOffset, bottomOffset) = C.offsetBand[bandName]

    def clamp(value):
        if value < 0:
            return 0
        if value > height:
            return height
        return value

    if inter:
        # from the bottom of each line to the top of the next line
        spans = zip((line[1] for line in lines), (line[0] for line in lines[1:]))
    else:
        spans = lines

    return tuple(
        (clamp(upper + topOffset), clamp(lower + bottomOffset))
        for (upper, lower) in spans
    )
def getMargins(hist, width, threshold):
    """Get margins from a histogram.

    The margins of a histogram are the coordinates where the histogram reaches a
    threshold for the first time and for the last time.

    We deliver the pairs (0, xFirst) and (xLast, maxWidth) if there are points
    at or above the threshold, and (0, maxWidth) otherwise,
    where maxWidth is the length of the histogram.

    Parameters
    ----------
    hist: [int]
        Source array of pixel values
    width: int
        Maximum index of the source array.
        Not used: the length of `hist` is taken instead.
    threshold: int
        Value below which pixels count as zero

    Returns
    -------
    tuple
        Either the two margins `((0, xFirst), (xLast, maxWidth))`
        or the single pair `((0, maxWidth),)`.
    """
    # The positions that reach the threshold. The previous implementation
    # compared a boolean group key with the threshold itself, so that for any
    # threshold above 1 no position was ever recognized.
    reached = [i for (i, value) in enumerate(hist) if value >= threshold]
    w = len(hist)
    return ((0, reached[0]), (reached[-1], w)) if reached else ((0, w),)
def pureAverage(data, supplied):
    """Get the rounded average of a list of values after removing the outliers.

    It is used for calculating lineheights from a sequence of distances between
    histogram peaks.

    In practice, some peaks are missing due to short line lengths, and that
    causes some abnormal peak distances which we want to remove.

    A value is an outlier if its distance to the median is at least twice
    the median of all such distances.
    If that median distance is zero, no value is removed.

    Parameters
    ----------
    data: np array
        The list of values whose average we compute.
    supplied: integer
        Value to return if there is no data,
        or if no data is left after removing the outliers.

    Returns
    -------
    integer
        The average, rounded to an integer, or `supplied`.
    """
    if data.size == 0:
        return supplied
    if data.size == 1:
        return int(round(data[0]))

    cutoff = 2.0
    distance = np.abs(data - np.median(data))
    medianDistance = np.median(distance)

    if medianDistance:
        kept = data[distance / medianDistance < cutoff]
    else:
        # all values count as having a relative distance of 0.0: keep them all
        kept = data

    if kept.size == 0:
        return supplied
    if kept.size == 1:
        return int(round(kept[0]))
    return int(round(np.average(kept)))
Global variables
var EXTENSIONS
-
Supported image file extensions.
Functions
def PILFromArray(a)
-
Expand source code Browse git
def PILFromArray(a): return PIL.Image.fromarray(a)
def applyBandOffset(C, height, bandName, lines, inter=False)
-
Produce bands from a list of lines.
Bands are defined relative to lines by means of offsets of the top and bottom heights of the lines.
Bands may also be interlinear: defined between the bottom of one line and the top of the next line.
Parameters
C
:object
- Configuration settings
- height:
- The height of the page or block
bandName
:string
- The name of the bands
lines
:tuple
- The lines relative to which the bands have to be determined. Lines are given as a tuple of tuples of top and bottom heights.
inter
:boolean
, optional False
- Whether the bands are relative to the lines, or relative to the interlinear spaces.
Returns
tuple
- For each line the band named bandName specified by top and bottom heights.
Expand source code Browse git
def applyBandOffset(C, height, bandName, lines, inter=False): """Produce bands from a list of lines. Bands are defined relative to lines by means of offsets of the top and bottom heights of the lines. Bands may also be interlinear: defined between the bottom of one line and the top of the next line. Parameters ---------- C: object Configuration settings height: The height of the page or block bandName: string The name of the bands lines: tuple The lines relative to which the bands have to be determined. Lines are given as a tuple of tuples of top and bottom heights. inter: boolean, optional `False` Whether the bands are relative the lines, or relative the interlinear spaces. Returns ------- tuple For each line the band named bandName specified by top and bottom heights. """ offsetBand = C.offsetBand (top, bottom) = offsetBand[bandName] def offset(x, off): x += off return 0 if x < 0 else height if x > height else x return tuple( (offset(up, top), offset(lo, bottom)) for (up, lo) in ( zip((x[1] for x in lines), (x[0] for x in lines[1:])) if inter else lines ) )
def arrayFromPIL(img)
-
Expand source code Browse git
def arrayFromPIL(img): return np.asarray(img)
def cropBorders(img, tolerance=10)
-
Get the bounding box of the image without black borders, if any.
The image is white writing on black background. The outer frame is white, if any. We find the region within a white outer frame by identifying all black pixels and computing a bounding box around it.
Thanks to stackoverflow.
Parameters
img
:numpy array
- The image. We assume it is grayscale, and inverted. For best results, it should be blurred before thresholding.
tolerance
:integer
- This parameter is the upper limit of what counts as black.
Returns
int, int, int, int
- The (x0, x1, y0, y1) of the crop region.
This will be used in
removeBorders()
to whiten the margins outside it.
Expand source code Browse git
def cropBorders(img, tolerance=10): """Get the bounding box of the image without black borders, if any. The image is white writing on black background. The outer frame is white, if any. We find the region within a white outer frame by identifying all black pixels and computing a bounding box around it. Thanks to [stackoverflow](https://codereview.stackexchange.com/a/132933). Parameters ---------- img: numpy array The image. We assume it is grayscale, and inverted. For best results, it should be blurred before thresholding. tolerance: integer This parameter is the upper limit of what counts as black. Returns ------- int, int, int, int The (x0, x1, y0, y1) of the crop region. This will be used in `removeBorders` to whiten the margins outside it. """ # check whether image is completely non-black # then we do not crop if np.amin(img) >= tolerance: (imH, imW) = img.shape[0:2] print("*", 0, imW, 0, imH) return (0, imW, 0, imH) # Mask of black pixels mask = img < tolerance # Coordinates of black pixels. coords = np.argwhere(mask) # Bounding box of black pixels. (y0, x0) = coords.min(axis=0) (y1, x1) = coords.max(axis=0) # Get the contents of the bounding box. return (x0, x1, y0, y1)
def dh(html)
-
Expand source code Browse git
def dh(html): display(HTML(html))
def findRuns(x)
-
Find runs of consecutive items in an array.
Credits: Alistair Miles
Expand source code Browse git
def findRuns(x): """Find runs of consecutive items in an array. Credits: [Alistair Miles](https://gist.github.com/alimanfoo/c5977e87111abe8127453b21204c1065) """ # ensure array x = np.asanyarray(x) if x.ndim != 1: raise ValueError("only 1D array supported") n = x.shape[0] # handle empty array if n == 0: return np.array([]), np.array([]), np.array([]) else: # find run starts loc_run_start = np.empty(n, dtype=bool) loc_run_start[0] = True np.not_equal(x[:-1], x[1:], out=loc_run_start[1:]) run_starts = np.nonzero(loc_run_start)[0] # find run values run_values = x[loc_run_start] # find run lengths run_lengths = np.diff(np.append(run_starts, n)) return run_values, run_starts, run_lengths
def getMargins(hist, width, threshold)
-
Get margins from a histogram.
The margins of a histogram are the coordinates where the histogram reaches a threshold for the first time and for the last time.
We deliver the pairs (0, xFirst) and (xLast, maxWidth) if there are points above the threshold, and (0, maxW) otherwise.
Parameters
hist
:[int]
- Source array of pixel values
width
:int
- Maximum index of the source array
threshold
:int
- Value below which pixels count as zero
Expand source code Browse git
def getMargins(hist, width, threshold): """Get margins from a histogram. The margins of a histogram are the coordinates where the histogram reaches a threshold for the first time and for the last time. We deliver the pairs (0, xFirst) and (xLast, maxWidth) if there are points above the threshold, and (0, maxW) otherwise. Parameters ---------- hist: [int] Source array of pixel values width: int Maximum index of the source array threshold: int Value below which pixels count as zero """ chunks = [ [i for (i, value) in it] for (key, it) in groupby(enumerate(hist), key=lambda x: x[1] >= threshold) if key >= threshold ] w = len(hist) return ((0, chunks[0][0]), (chunks[-1][-1], w)) if chunks else ((0, w),)
def getNbLink(path, text)
-
Expand source code Browse git
def getNbLink(path, text): if path.startswith("~/github"): components = path.rstrip("/").split("/")[2:] if not components: return None linkA = '<a target="_blank" href="' linkB = f'">{text}</a>' nbPathA = f"{NB_VIEWER}/" + "/".join(components[0:2]) if len(components) <= 2: return f"{linkA}{nbPathA}{linkB}" else: nbPathB = "/".join(components[2:]) return f"{linkA}{nbPathA}/blob/master/{nbPathB}{linkB}" return path
def getNbPath(path)
-
Expand source code Browse git
def getNbPath(path): if path.startswith("~/github"): components = path.rstrip("/").split("/")[2:] if not components: return (False, path) nbPathA = f"{NB_VIEWER}/" + "/".join(components[0:2]) if len(components) <= 2: return (True, nbPathA) else: nbPathB = "/".join(components[2:]) return (True, f"{nbPathA}/blob/master/{nbPathB}") return (False, path)
def imageFileList(imDir)
-
Gets a sorted list of image files from a directory.
Only files having an image extension (defined in
EXTENSIONS
) are listed.
Parameters
imDir
:string
- Path to the image directory
Returns
list
- Alphabetically sorted list of file names (without directory, with extension)
Expand source code Browse git
def imageFileList(imDir): """Gets a sorted list of image files from a directory. Only files having an image extension (defined in `EXTENSIONS`) are listed. Parameters ---------- imDir: string Path to the image directory Returns ------- list Alphabetically sorted list of file names (without directory, with extension) """ if not os.path.exists(imDir): return [] imageFiles = [] with os.scandir(imDir) as it: for entry in it: name = entry.name (bare, ext) = splitext(name, withDot=False) if not name.startswith(".") and entry.is_file() and ext in EXTENSIONS: imageFiles.append(name) return sorted(imageFiles)
def imageFileListSub(imDir)
-
Gets sorted lists of image files from the subdirectories of a directory.
Only files having an image extension (defined in
EXTENSIONS
) are listed.
Parameters
imDir
:string
- Path to the image directory
Returns
dict
- Keyed by subdirectory names, valued by alphabetically sorted list of file names (without directory, with extension)
Expand source code Browse git
def imageFileListSub(imDir): """Gets sorted lisst of image files from the subdirectories of a directory. Only files having an image extension (defined in `EXTENSIONS`) are listed. Parameters ---------- imDir: string Path to the image directory Returns ------- dict Keyed by subdirectory names, valued by alphabetically sorted list of file names (without directory, with extension) """ if not os.path.exists(imDir): return {} imageFiles = {} with os.scandir(imDir) as it: for entry in it: name = entry.name if not name.startswith(".") and entry.is_dir(): imageFiles[name] = imageFileList(f"{imDir}/{name}") return imageFiles
def imgElem(data)
-
Produce an image with its data packaged into an HTML <img> element.
Expand source code Browse git
def imgElem(data): """Produce an image with its data packaged into a HTML <img> element. """ return f"""<img src="data:image/jpeg;base64,{data}">"""
def overlay(img, left, top, right, bottom, srcColor, dstColor)
-
Colors a region of an image with care.
A selected region of an image can be given a uniform color, where only pixels are changed that have an exact given color.
In this way you can replace all the white with gray, for example, without wiping out existing non-white pixels.
Parameters
img
:np array
- The image to be overlain with a new color
- (left, top, right, bottom): (int, int, int, int)
- The region in the image to be colored
srcColor
:RGB color
- The color of the pixels that may be replaced.
dstColor: The new color of the replaced pixels.
Expand source code Browse git
def overlay(img, left, top, right, bottom, srcColor, dstColor): """Colors a region of an image with care. A selected region of an image can be given a uniform color, where only pixels are changed that have an exact given color. In this way you can replace all the white with gray, for example, without wiping out existing non-white pixels. Parameters ---------- img: np array The image to be overlain with a new color (left, top, right, bottom): (int, int, int, int) The region in the image to be colored srcColor: RGB color The color of the pixels that may be replaced. dstColor: The new color of the replaced pixels. """ if right > left and bottom > top: roi = img[top:bottom, left:right] roi[np.where((roi == list(srcColor)).all(axis=2))] = dstColor
def pagesRep(source, asList=False)
-
Represents a set of pages as a string in a compact way or as a list.
Parameters
source
:list
- A list of file names, without directory, with extension
asList
:boolean
, optional False
- Whether to return the result as a list of integers or as a compact string.
Returns
list
or string
- Depending on
asList
a list of page numbers (integers) or a string mentioning the page numbers, using intervals where possible.
Expand source code Browse git
def pagesRep(source, asList=False): """Represents a set of pages as a string in a compact way or as a list. Parameters ---------- source: list A list of file names, without directory, with extension asList: boolean, optional `False` Whether to return the result as a list of integers or as a compact string. Returns ------- list or string Depending on `asList` a list of page numbers (integers) or a string mentioning the page numbers, using intervals where possible. """ pages = [int(splitext(f)[0].lstrip("0")) for f in source] return pages if asList else specFromRanges(rangesFromList(pages))
def parseBands(band, allBands, error)
-
Parses a string that specifies bands.
Bands are horizontal rectangles defined with respect to lines. They correspond with regions of interest where we try to find specific marks, such as commas and accents.
Parameters
band
:string
or None
or iterable
- If None: it means all bands. If a string: the name of a band. If an iterable: the items must be names of bands.
allBands
:tuple
- Names of all bands.
error
:function
- Method to write error messages.
Returns
tuple
- The bands as parsed.
Expand source code Browse git
def parseBands(band, allBands, error): """Parses a string that specifies bands. Bands are horizontal rectangles defined with respect to lines. They correspond with regions of interest where we try to find specific marks, such as commas and accents. Parameters ---------- band: string or None or iterable If None: it means all bands. If a string: the name of a band. If an iterable: the items must be names of bands. allBands: tuple Names of all bands. error: function Method to write error messages. Returns ------- tuple The bands as parsed. """ sortedBands = sorted(allBands) doBands = ( allBands if band is None else set(band.split(",")) if type(band) is str else set(band) ) illegalBands = doBands - allBands if illegalBands: error(f"Will skip illegal bands: {', '.join(sorted(illegalBands))}") doBands -= illegalBands return tuple(b for b in sortedBands if b in doBands)
def parseMarks(mark, allMarks, bands, error)
-
Parses a string that specifies Marks.
Marks are strokes that we need to find on the page in order to remove them. They are organized in bands: the regions of interest with respect to the lines where we expect them to occur.
Parameters
mark
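:string
or None
or iterable
- If None: no marks are selected. If the empty string: all marks of the given bands. If another string: a comma-separated list of names of marks. If an iterable: the items must be names of marks.
allMarks
:dict
- The names of the marks, keyed by the name of the band they belong to.
bands
:iterable
- The names of the bands whose marks are selected when mark is the empty string.
error
:function
- Method to write error messages.
Returns
set
- The marks as parsed, without the illegal ones.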
Expand source code Browse git
def parseMarks(mark, allMarks, bands, error): """Parses a string that specifies Marks. Marks are strokes that we need to find on the page in order to remove them. They are organized in bands: the regions of interest with respect to the lines where we expect them to occur. Parameters ---------- mark: string or None or iterable If None: it means all marks. If a string: the name of a mark. If an iterable: the items must be names of marks. allMarks: tuple Names of all marks. error: function Method to write error messages. Returns ------- tuple The marks as parsed. """ markIndex = {} for (band, bandMarks) in allMarks.items(): for m in bandMarks: markIndex.setdefault(m, set()).add(band) doMarks = ( set() if mark is None else set(chain.from_iterable(allMarks.get(band, ()) for band in bands)) if mark == "" else set(mark.split(",")) if type(mark) is str else set(mark) ) illegalMarks = doMarks - set(markIndex) if illegalMarks: error(f"Will skip illegal marks: {', '.join(sorted(illegalMarks))}") doMarks -= illegalMarks return doMarks
def parseNums(numSpec)
-
Parses a value as one or more numbers.
Parameters
numSpec
:None | int | string | iterable
- If None, results in None. If an int, it stands for that int. If a string, it is allowed to be a comma separated list of numbers or ranges, where a range is a lower bound and an upper bound separated by a -. If none of these, it should be an iterable of int values.
Examples:
50 "50" "50,70" "50-70,91,92,300-350" (50, 70, 91, 92, 300) [50, 70, 90] range(300, 350)
Returns
None | iterable of int
- Depending on the value.
Expand source code Browse git
def parseNums(numSpec): """Parses a value as one or more numbers. Parameters ---------- numSpec: None | int | string | iterable If `None` results in `None`. If an `int`, it stands for that int. If a `string`, it is allowed to be a comma separated list of numbers or ranges, where a range is a lower bound and an upper bound separated by a `-`. If none of these, it should be an iterable of `int` values. Examples: 50 "50" "50,70" "50-70,91,92,300-350" (50, 70, 91, 92, 300) [50, 70, 90] range(300, 350) Returns ------- None | iterable of int Depending on the value. """ return ( None if not numSpec else [numSpec] if type(numSpec) is int else setFromSpec(numSpec) if type(numSpec) is str else list(numSpec) )
def parseStages(stage, allStages, sortedStages, error)
-
Parses a string that specifies stages.
Stages are steps in the image processing. Each stage has an intermediate processing result.
Parameters
stage
:string
or None
or iterable
- If None: it means all stages. If a string: the name of a stage. If an iterable: the items must be names of stages.
allStages
:tuple
- Names of all stages.
- sortedStages:
- Sorted list of all stages.
error
:function
- Method to write error messages.
Returns
tuple
- The stages as parsed.
Expand source code Browse git
def parseStages(stage, allStages, sortedStages, error): """Parses a string that specifies stages. Stages are steps in the image processing. Each stage has an intermediate processing result. Parameters ---------- stage: string or None or iterable If None: it means all stages. If a string: the name of a stage. If an iterable: the items must be names of stages. allStages: tuple Names of all stages. sortedStages: Sorted list of all stages. error: function Method to write error messages. Returns ------- tuple The stages as parsed. """ doStages = ( allStages if stage is None else set() if not stage else set(stage.split(",")) if type(stage) is str else set(stage) ) illegalStages = doStages - allStages if illegalStages: error(f"Will skip illegal stages: {', '.join(sorted(illegalStages))}") doStages = doStages - illegalStages return tuple(s for s in sortedStages if s in doStages)
def pprint(x)
-
Expand source code Browse git
def pprint(x):
    """Pretty-print a value with the module-level pretty printer `PP`.

    Parameters
    ----------
    x: any
        The value to print.
    """
    PP.pprint(x)
def pureAverage(data, supplied)
-
Get the average of a list of values after removing the outliers.
It is used for calculating line heights from a sequence of distances between histogram peaks. In practice, some peaks are missing due to short line lengths, and that causes some abnormal peak distances which we want to remove.
Parameters
data
:np array
- The list of values whose average we compute.
supplied
:integer
- Value to return if there is no data.
Expand source code Browse git
def pureAverage(data, supplied):
    """Average a series of values, ignoring the outliers.

    It is used for calculating line heights from a sequence of distances
    between histogram peaks.
    In practice, some peaks are missing due to short line lengths,
    and that causes some abnormal peak distances which we want to remove.

    A value counts as an outlier when its distance to the median is at least
    twice the median of all such distances.

    Parameters
    ----------
    data: np array
        The list of values whose average we compute.
    supplied: integer
        Value to return if there is no data.

    Returns
    -------
    integer
        The rounded average of the values that are not outliers,
        or `supplied` if no values are left.
    """
    nValues = data.size
    if nValues == 0:
        return supplied
    if nValues == 1:
        return int(round(data[0]))

    # remove outliers
    threshold = 2.0
    distances = np.abs(data - np.median(data))
    typicalDistance = np.median(distances)

    if typicalDistance:
        pure = data[distances / typicalDistance < threshold]
    else:
        # No spread to scale by: every value is kept.
        pure = data

    if pure.size == 0:
        return supplied
    if pure.size == 1:
        return int(round(pure[0]))
    return int(round(np.average(pure)))
def removeBorders(img, crop, white)
-
Remove black borders around an image.
When an image has been unskewed, sharp triangle-shape strokes in the corners may have been introduced. Or it might be the result of scanning a page.
This function removes them by coloring all image borders with white.
The exact borders to be whitened are calculated by cropBorders().
Parameters
img
:image as np array
- the image to operate on
crop
:(int, int, int, int)
- the x1, x2, y1, y2 values which indicate the region outside which the white may be applied
white
:color
- the exact white color with which we color the borders.
Returns
None
- The source image receives a modification.
Expand source code Browse git
def removeBorders(img, crop, white):
    """Whiten the margins of an image to get rid of black borders.

    When an image has been unskewed, sharp triangle-shape strokes
    in the corners may have been introduced.
    Or it might be the result of scanning a page.

    This function removes them by coloring all image borders with white.

    The exact borders to be whitened are calculated by `cropBorders`.

    Parameters
    ----------
    img: image as np array
        the image to operate on
    crop: (int, int, int, int)
        the left x, right x, top y, bottom y values which indicate
        the region outside which the white may be applied
    white: color
        the exact white color with which we color the borders.

    Returns
    -------
    None
        The source image receives a modification.
    """
    height, width = img.shape[0:2]
    left, right, top, bottom = crop

    # Each margin is given by two opposite corners, as (x, y) points.
    margins = (
        ((0, 0), (left, height)),  # left strip
        ((0, 0), (width, top)),  # top strip
        ((right, 0), (width, height)),  # right strip
        ((0, bottom), (width, height)),  # bottom strip
    )
    for (cornerA, cornerB) in margins:
        # thickness -1 makes cv2 draw a filled rectangle
        cv2.rectangle(img, cornerA, cornerB, white, -1)
def select(source, selection)
-
Choose items from a bunch of integers.
Parameters
source
:iterable of int
- The items to choose from
selection
:iterable of int or string or None
- If None, selects all items, otherwise specifies what numbers to select. If a number is in the selection, but not in the source, it will not be selected. The selection can be an integer or a compact string that specifies integers, using ranges and commas.
Returns
list
- Sorted list of selected items
Expand source code Browse git
def select(source, selection):
    """Choose items from a bunch of numbered file names.

    Parameters
    ----------
    source: iterable of string
        The items to choose from. Each item is a file name whose main part
        (the name without extension) is an integer,
        possibly padded with leading zeroes, e.g. `007.png`.
    selection: int or string or iterable of int or `None`
        If None, selects all items,
        otherwise specifies what numbers to select.
        If a number is in the selection, but not in the source, it will
        not be selected.
        The selection can be an integer, an iterable of integers,
        or a compact string that specifies integers, using ranges and commas.
        In a range `lower-upper` either bound may be left out, in which case
        the smallest resp. largest number in the source is used.

    Returns
    -------
    list
        Sorted list of selected items
    """
    if selection is None:
        return sorted(source)

    # int() copes with leading zeroes by itself; stripping them first
    # turned an all-zero name such as "000" into "" and made int() fail.
    # os.path.splitext is what this module's `splitext` helper uses as well.
    index = {int(os.path.splitext(f)[0]): f for f in source}
    universe = set(index)

    if type(selection) is int:
        selected = {selection} & universe
    elif isinstance(selection, str):
        minu = min(universe, default=0)
        maxu = max(universe, default=0)
        selected = set()

        for rng in selection.split(","):
            if not rng.strip():
                # tolerate empty selections and stray commas
                continue
            parts = rng.split("-")
            if len(parts) == 2:
                (lower, upper) = parts
                lower = minu if lower.strip() == "" else int(lower)
                upper = maxu if upper.strip() == "" else int(upper)
            else:
                lower = int(parts[0])
                upper = lower
            # Filter the universe instead of materializing the whole range,
            # which may be far bigger than the source.
            selected |= {n for n in universe if lower <= n <= upper}
    else:
        selected = set(selection) & universe

    return sorted(index[n] for n in selected)
def showImage(a, fmt='jpeg', **kwargs)
-
Show one or more images.
Expand source code Browse git
def showImage(a, fmt="jpeg", **kwargs):
    """Show one or more images.

    Parameters
    ----------
    a: image as np array, or list or tuple of such images
        The image(s) to show. Pixel values are clipped to the range 0..255
        and converted to unsigned 8-bit integers before encoding.
    fmt: string, optional `jpeg`
        The image format in which the data is encoded for display.
    **kwargs
        Passed on to `IPython.display.Image`.

    Returns
    -------
    None
        The image(s) are shown through `IPython.display.display`.
    """

    def encode(img):
        # Clip and convert to 8-bit, then serialize in the requested format.
        clipped = np.uint8(np.clip(img, 0, 255))
        buffer = io.BytesIO()
        PIL.Image.fromarray(clipped).save(buffer, fmt)
        return buffer.getvalue()

    if type(a) in {list, tuple}:
        # The clipped array must be encoded here too; previously the raw
        # array was passed on, so out-of-range or non-uint8 data was not
        # normalized the way it is for a single image.
        # NOTE(review): `_repr_jpeg_` presumably only yields data when the
        # image is a JPEG - confirm for other values of `fmt`.
        ads = [Image(data=encode(ae), **kwargs)._repr_jpeg_() for ae in a]
        # NOTE(review): `imgElem` is expected to be defined elsewhere in this
        # module - confirm.
        display(HTML(f"<div>{''.join(imgElem(ad) for ad in ads)}</div>"))
    else:
        display(Image(data=encode(a), **kwargs))
def splitext(f, withDot=True)
-
Splits a file name into its main part and its extension.
Parameters
f
:string
- The file name
withDot
:boolean, optional, default True
- If True, the `.` in the extension is considered part of the extension, else the dot is stripped from it.
Returns
tuple
- The main part and the extension
Expand source code Browse git
def splitext(f, withDot=True):
    """Break a file name into its main part and its extension.

    Parameters
    ----------
    f: string
        The file name
    withDot: boolean, optional `True`
        If True, the `.` in the extension is considered part of the extension,
        else the dot is stripped from it.

    Returns
    -------
    tuple
        The main part and the extension
    """
    pieces = os.path.splitext(f)
    main = pieces[0]
    extension = pieces[1]
    # Slicing an empty extension leaves it empty, so no separate check needed.
    return (main, extension if withDot else extension[1:])
def tempFile()
-
Get a temporary file.
Expand source code Browse git
def tempFile():
    """Get a temporary file.

    Returns
    -------
    file object
        A named temporary file, opened for writing text,
        located in the current directory.
    """
    handle = NamedTemporaryFile(dir=".", mode="w")
    return handle
def writeImage(a, path, **kwargs)
-
Write an image to disk
Expand source code Browse git
def writeImage(a, path, **kwargs):
    """Write an image to disk

    Parameters
    ----------
    a: image as np array
        The image to write. Pixel values are clipped to the range 0..255
        and converted to unsigned 8-bit integers first.
    path: string
        The path of the file to write; it is opened in binary write mode.
    **kwargs
        Accepted, but not used by this function.

    Returns
    -------
    None
    """
    pixels = np.clip(a, 0, 255)
    pixels = np.uint8(pixels)

    with open(path, "wb") as handle:
        picture = PIL.Image.fromarray(pixels)
        picture.save(handle)