Module fusus.tfFromTsv

Convert TSV data to Text-Fabric.

The TSV data consists of one-word-per-line files, one file per page; for each word, the line specifies its text, its bounding boxes in the original, and the layout units that contain it on the page (line, block, etc.).

The TSV data from OCRed pages is slightly different from that of the textual extraction of the Lakhnawi PDF, but they share most fields.

The code here can deal with both kinds of input.
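
To make the row layout concrete, here is a minimal sketch of reading one such TSV line outside this module. The column order follows the non-OCRed branch of `getData` in the source below; the sample line itself is invented.

    # invented sample row: page, line, column, span, direction, box, text, punctuation
    sample = "131\t7\t1\t1\tr\t250\t380\t310\t420\tكتاب\t"

    fields = sample.rstrip("\n").split("\t")
    page, line, column, span = (int(f) for f in fields[0:4])
    direction = fields[4]  # writing direction: "r" or "l"
    boxl, boxt, boxr, boxb = (
        None if f in {"", "?"} else int(f) for f in fields[5:9]
    )
    letters, punc = fields[9:11]  # word text and trailing punctuation

    print(page, line, column, span, direction, (boxl, boxt, boxr, boxb), letters)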

See also

* `fusus.convert`
* [Text-Fabric](https://annotation.github.io/text-fabric/tf/index.html)

Source code

"""Convert TSV data to Text-Fabric.

The TSV data consists of one-word-per-line files, one file per page;
for each word, the line specifies its text, its bounding boxes in the original,
and the layout units that contain it on the page (line, block, etc.).

The TSV data from OCRed pages is slightly different from that of the
textual extraction of the Lakhnawi PDF, but they share most fields.

The code here can deal with both kinds of input.

See also

* `fusus.convert`
* [Text-Fabric](https://annotation.github.io/text-fabric/tf/index.html)
"""

import collections
import re

from tf.fabric import Fabric
from tf.convert.walker import CV
from tf.writing.transcription import Transcription as Tr

from .char import UChar
from .works import BASE, WORKS, getFile, getTfDest
from .lib import parseNums


EXT = ".tsv"
VERSION_TF = None

# TF CONFIGURATION
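#
# The otext, intFeatures, and featureMeta dicts below are keyed by input kind:
#   None    = parts shared by all inputs
#   False   = textual extraction of the Lakhnawi PDF (OCRED == False)
#   True    = OCRed page images (OCRED == True)
#   "extra" / "merged" = optional layers, enabled per work in WORKS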

slotType = "word"

GENERIC = dict(
    language="ara",
    institute="Univ Utrecht NL/DANS",
    project="Fusus",
    researcher="Cornelis van Lit",
    converters="Cornelis van Lit, Dirk Roorda (Text-Fabric)",
    sourceFormat="CSV (tab-separated)",
)


def generic(source):
    return {**GENERIC, **WORKS[source]["meta"]}


otext = {}
otext[None] = {}
otext[False] = {
    "fmt:text-orig-full": "{letters}{punc} ",
    "fmt:text-orig-plain": "{lettersp}{punca} ",
    "fmt:text-orig-nice": "{lettersn}{punca} ",
    "fmt:text-orig-trans": "{letterst}{punca} ",
    "sectionFeatures": "n,n,ln",
    "sectionTypes": "piece,page,line",
}
otext[True] = {
    "fmt:text-orig-full": "{letters}{punc}",
    "fmt:text-orig-plain": "{lettersp}{punca}",
    "fmt:text-orig-nice": "{lettersn}{punca}",
    "fmt:text-orig-trans": "{letterst}{punca}",
    "sectionFeatures": "n,b,ln",
    "sectionTypes": "page,block,line",
}
otext["extra"] = {}
otext["merged"] = {
    "fmt:text-afifi-full": "{letters_af}{punc_af}",
    "fmt:text-afifi-plain": "{lettersp_af}{punca_af}",
    "fmt:text-afifi-nice": "{lettersn_af}{punca_af}",
    "fmt:text-afifi-trans": "{letterst_af}{punca_af}",
}

intFeatures = {}
intFeatures[None] = set(
    """
        n
        ln
    """.strip().split()
)
intFeatures[False] = {"np"}
intFeatures[True] = set()
intFeatures["extra"] = {"poetryverse", "fass"}
intFeatures["merged"] = {
    "slot_lk",
    "slot_af",
    "combine_lk",
    "combine_af",
    "editdistance",
    "ratio",
    "page_af",
    "line_af",
}

featureMeta = {}

featureMeta[None] = {
    "boxl": {
        "description": "left x-coordinate of word",
        "format": "number",
    },
    "boxt": {
        "description": "top y-coordinate of word",
        "format": "number",
    },
    "boxr": {
        "description": "right x-coordinate of word",
        "format": "number",
    },
    "boxb": {
        "description": "bottom y-coordinate of word",
        "format": "number",
    },
    "letters": {
        "description": "text string of a word without punctuation",
        "format": "string",
    },
    "lettersn": {
        "description": "text string of a word in latin transcription (beta code)",
        "format": "string, latin with diacritics",
    },
    "lettersp": {
        "description": "text string of a word in ascii transcription (beta code)",
        "format": "string, ascii",
    },
    "letterst": {
        "description": (
            "text string of a word in romanized transcription" " (Library of Congress)"
        ),
        "format": "string, latin with diacritics",
    },
    "punc": {
        "description": "punctuation and/or space immediately after a word",
        "format": "string",
    },
    "punca": {
        "description": "punctuation and/or space immediately after a word",
        "format": "string, ascii",
    },
}

featureMeta["merged"] = {
    "letters_af": {
        "description": "text string of a word without punctuation (Afifi edition)",
        "format": "string",
    },
    "lettersn_af": {
        "description": (
            "text string of a word in latin transcription (beta code)"
            " (Afifi edition)"
        ),
        "format": "string, latin with diacritics",
    },
    "lettersp_af": {
        "description": (
            "text string of a word in ascii transcription (beta code)"
            " (Afifi edition)"
        ),
        "format": "string, ascii",
    },
    "letterst_af": {
        "description": (
            "text string of a word in romanized transcription"
            " (Library of Congress)"
            " (Afifi edition)"
        ),
        "format": "string, latin with diacritics",
    },
    "punc_af": {
        "description": (
            "punctuation and/or space immediately after a word" " (Afifi edition)"
        ),
        "format": "string",
    },
    "punca_af": {
        "description": (
            "punctuation and/or space immediately after a word" " (Afifi edition)"
        ),
        "format": "string, ascii",
    },
    "slot_lk": {
        "description": (
            "slot number in the raw fususl dataset, "
            "which is obtained from reverse-engineering the Lakhnawi pdf"
        ),
        "format": "integer",
    },
    "slot_af": {
        "description": (
            "slot number in the raw fususa dataset, "
            "which is obtained from ocr-ing the Afifi page images"
        ),
        "format": "integer",
    },
    "combine_lk": {
        "description": (
            "number of consecutive words in the Lakhnawi text"
            "that form an alignment entry with 1 or more words in the Afifi text"
        ),
        "format": "integer",
    },
    "combine_af": {
        "description": (
            "number of consecutive words in the Afifi text"
            "that form an alignment entry with 1 or more words in the Lakhnawi text"
        ),
        "format": "integer",
    },
    "editdistance": {
        "description": (
            "edit distance between the Lakhnawi part in an alignment entry"
            "and its Afifi counterpart"
        ),
        "format": "integer, number of edits between the counterparts",
    },
    "ratio": {
        "description": (
            "ratio (=similarity) between the Lakhnawi part in an alignment entry"
            "and its Afifi counterpart"
        ),
        "format": "integer, scale 1 to 10, higher is more similar",
    },
    "page_af": {
        "description": (
            "page number in the raw fususa dataset, "
            "which is obtained from ocr-ing the Afifi page images"
        ),
        "format": "integer",
    },
    "line_af": {
        "description": (
            "line number in the raw fususa dataset, "
            "which is obtained from ocr-ing the Afifi page images"
        ),
        "format": "integer",
    },
}

featureMeta[False] = {
    "dir": {
        "description": "writing direction of a span",
        "format": "string, either r or l",
    },
    "ln": {
        "description": "sequence number of a line within a page",
        "format": "number",
    },
    "n": {
        "description": (
            "sequence number of a piece, page, column within a line, or span"
        ),
        "format": "number",
    },
    "np": {
        "description": "sequence number of a proper content piece",
        "format": "number",
    },
    "title": {
        "description": "title of a piece",
        "format": "string",
    },
}

featureMeta[True] = {
    "b": {
        "description": "name of a block inside a stripe",
        "format": "string, either r or l",
    },
    "confidence": {
        "description": "confidence of OCR recognition of the word",
        "format": "number between 0 and 100 (including)",
    },
    "n": {
        "description": "sequence number of a piece, page, or stripe",
        "format": "number",
    },
    "ln": {
        "description": "sequence number of a line within a block",
        "format": "number",
    },
}

featureMeta["extra"] = {
    "raw": {
        "description": "letters of the word straight from the pdf",
        "format": "string",
    },
    "puncb": {
        "description": "",
        "format": "string",
    },
    "puncba": {
        "description": "",
        "format": "string, ascii",
    },
    "qunawims": {
        "description": (
            "on which folio of the oldest manuscript, "
            "penned by Qunawi himself, is this word attested?"
        ),
        "format": "string",
    },
    "poetrymeter": {
        "description": "meter in which this verse is written",
        "format": "string",
    },
    "poetryverse": {
        "description": (
            "word is start of a verse of poetry, " "value is the number of the verse"
        ),
        "format": "number",
    },
    "fass": {
        "description": "number of the piece (bezel) that the word belongs to",
        "format": "number",
    },
    "lwcvl": {
        "description": "personal notes by Cornelis van Lit",
        "format": "string",
    },
    "quran": {
        "description": "word is part of a quran citation (sura:aya)",
        "format": "string",
    },
}

# DISTILL TABLE of CONTENTS

TOC_PAGES = (4, 5)

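# a TOC line has the shape <seq>‐ <title> …… <page>,
# with the numbers written in Arabic-Indic digits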
TOC_LINE_RE = re.compile(
    r"""
    ^
    ([٠-٩]+)
    ‐
    \s*
    ([^…]+)
    …+
    ([٠-٩]+)
    $
""",
    re.X,
)

PIECE_RE = re.compile(
    r"""
    ^
    \[
    ([٠-٩]+)
    \]
    (.*)
    $
""",
    re.X,
)


def getToc(data):
    (start, end) = TOC_PAGES

    lines = []
    curLine = []

    prevLine = None

    for fields in data:
        page = fields[0]
        if page < start:
            continue
        if page > end:
            break

        line = fields[1]
        if prevLine is None or prevLine != line:
            if curLine:
                lines.append("".join(curLine))
                curLine = []

        curLine.append(f"{fields[-2]}{fields[-1]}")
        prevLine = line

    if curLine:
        lines.append("".join(curLine))

    toc = {}

    for line in lines:
        match = TOC_LINE_RE.match(line)
        if not match:
            continue
        (seq, title, pg) = match.group(1, 2, 3)
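        # the matched digit strings are in reverse order,
        # so flip them before converting to int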
        seq = int(seq[::-1])
        pg = int(pg[::-1])
        pSeq = None
        matchP = PIECE_RE.match(title)
        if matchP:
            (pSeq, title) = matchP.group(1, 2)
            pSeq = int(pSeq[::-1])
        toc[pg] = (seq, pSeq, title)
    return toc


# SET UP CONVERSION

pageNums = None


TYPE_MAPS = {
    False: ["page", "line", "column", "span"],
    True: ["page", "stripe", "block", "line"],
    "extra": [
        "short",
        "haspunct",
        "punctafter",
        "punctbefore",
        "qunawims",
        "poetrymeter",
        "poetryverse",
        "fass",
        "lwcvl",
        "quran",
    ],
}


def convert(source, ocred, pages, versionTf):
    global pageNums
    global SRC_FILE
    global TYPE_MAP
    global HAS_TOC
    global TOC_SOURCE
    global OCRED
    global U
    global VERSION_TF
    global SEP
    global SKIPCOL
    global MERGED
    global EXTRA

    U = UChar()

    pageNums = parseNums(pages)

    workInfo = WORKS[source]
    dest = getTfDest(source, versionTf)
    (SRC_FILE, OCRED) = getFile(source, ocred)
    HAS_TOC = workInfo.get("toc", False)
    TOC_SOURCE = workInfo.get("sourceToc", None)
    TYPE_MAP = TYPE_MAPS[OCRED]
    VERSION_TF = versionTf
    SEP = workInfo["sep"]
    SKIPCOL = workInfo.get("skipcol", None)
    MERGED = workInfo.get("merged", False)
    EXTRA = workInfo.get("extra", False)

    cv = CV(Fabric(locations=dest))

    thisFeatureMeta = (
        featureMeta[None]
        | featureMeta[OCRED]
        | (featureMeta["extra"] if EXTRA else {})
        | (featureMeta["merged"] if MERGED else {})
    )
    if MERGED:
        for feat in ("boxl", "boxt", "boxr", "boxb"):
            del thisFeatureMeta[feat]

    return cv.walk(
        director,
        slotType,
        otext=otext[None]
        | otext[OCRED]
        | (otext["extra"] if EXTRA else {})
        | (otext["merged"] if MERGED else {}),
        generic=generic(source),
        intFeatures=intFeatures[None]
        | intFeatures[OCRED]
        | (intFeatures["extra"] if EXTRA else {})
        | (intFeatures["merged"] if MERGED else {}),
        featureMeta=thisFeatureMeta,
        generateTf=True,
    )


# DIRECTOR


def director(cv):
    """Read tsv data fields.

    This is a function that does the work as indicated in the
    [walker conversion engine of Text-Fabric](https://annotation.github.io/text-fabric/tf/convert/walker.html).
    See `fusus.convert` for a description of the fields in the TSV files.
    """

    stops = U.stops

    errors = collections.defaultdict(set)

    cur = [None, None, None, None]
    prev = [None, None, None, None]
    nSec = len(prev)

    def getData(dataFile, sep, extra, merged):
        data = []

        with open(dataFile) as fh:
            next(fh)
            for line in fh:
                row = line.rstrip("\n").split(sep)
                if SKIPCOL is not None:
                    del row[SKIPCOL : SKIPCOL + 1]
                page = int(row[0])
                if pageNums is not None and page not in pageNums:
                    continue

                if OCRED:
                    row = (
                        page,
                        int(row[1]),
                        row[2],
                        int(row[3]),
                        *(None if c in {"", "?"} else int(c) for c in row[4:8]),
                        int(row[8]),
                        *row[9:11],
                    )
                elif merged:
                    row = (
                        page,
                        int(row[1]),
                        int(row[2]),
                        int(row[3]),
                        row[4],
                        *row[6:9],
                        row[5],
                        *row[9:11],
                        int(row[11]) if row[11] else None,
                        int(row[12]) if row[12] else None,
                        *row[13:15],
                        int(row[15]) if row[15] else None,
                        int(row[16]) if row[16] else None,
                        int(row[17]) if row[17] else None,
                        int(round(float(row[18]) * 10)) if row[18] else None,
                        int(row[19]) if row[19] else None,
                        int(row[20]) if row[20] else None,
                        int(row[21]),
                        int(row[22]),
                        *row[23:],
                    )
                else:
                    tail = (*row[10:13], row[9], *row[13:]) if extra else row[9:11]
                    row = (
                        page,
                        *(int(c) for c in row[1:4]),
                        row[4],
                        *(None if c in {"", "?"} else int(c) for c in row[5:9]),
                        *tail,
                    )

                data.append(row)
        return data

    data = getData(SRC_FILE, SEP, EXTRA, MERGED)

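    # index of the first bounding box field: non-OCR rows have an extra
    # writing-direction field between the section fields and the boxes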
    boxL = nSec if OCRED else nSec + 1

    if HAS_TOC:
        tocData = data
        if TOC_SOURCE:
            tocFile = f"{TOC_SOURCE['dir']}/{TOC_SOURCE['file']}"
            sep = TOC_SOURCE["sep"]
            tocData = getData(f"{BASE}/{tocFile}", sep, False, False)
        toc = getToc(tocData)
        curPiece = cv.node("piece")
        cv.feature(curPiece, n=1, title="front")

    curSentence = cv.node("sentence")
    nSentence = 1
    cv.feature(curSentence, n=nSentence)

    for (r, fields) in enumerate(data):
        if HAS_TOC:
            page = fields[0]
            if page in toc and page != prev[0]:
                for i in reversed(range(nSec)):
                    cv.terminate(cur[i])

                cv.terminate(curSentence)
                cv.terminate(curPiece)
                nSentence = 1
                curSentence = cv.node("sentence")
                cv.feature(curSentence, n=nSentence)

                (n, np, title) = toc[page]
                curPiece = cv.node("piece")
                cv.feature(curPiece, n=n, title=title)
                if np is not None:
                    cv.feature(curPiece, np=np)

        for i in range(nSec):
            if fields[i] != prev[i]:
                for j in reversed(range(i, nSec)):
                    cv.terminate(cur[j])
                for j in range(i, nSec):
                    cn = cv.node(TYPE_MAP[j])
                    cur[j] = cn
                    if OCRED and j == 2:
                        cv.feature(cn, b=fields[j])
                    elif OCRED and j == 3 or not OCRED and j == 1:
                        cv.feature(cn, ln=fields[j])
                    else:
                        cv.feature(cn, n=fields[j])
                    if not OCRED and j == nSec - 1:
                        cv.feature(cn, dir=fields[nSec])
                break
        for i in range(nSec):
            prev[i] = fields[i]

        lettersIndex = 5 if MERGED else 9 if EXTRA else -2
        puncIndex = 6 if MERGED else 10 if EXTRA else -1
        letters = fields[lettersIndex]
        punc = fields[puncIndex]

        puncBefore = None
        raw = None
        if EXTRA:
            puncBefore = fields[puncIndex + 1]
            raw = fields[puncIndex + 2]

        lettersp = Tr.asciiFromArabic(letters) if letters else ""
        lettersn = Tr.latinFromArabic(letters) if letters else ""
        letterst = Tr.standardFromArabic(letters) if letters else ""
        punca = Tr.asciiFromArabic(punc) if punc else ""

        s = cv.slot()
        if not MERGED:
            cv.feature(
                s,
                boxl=fields[boxL],
                boxt=fields[boxL + 1],
                boxr=fields[boxL + 2],
                boxb=fields[boxL + 3],
            )
        cv.feature(
            s,
            letters=letters,
            lettersp=lettersp,
            lettersn=lettersn,
            letterst=letterst,
        )
        cv.feature(s, punc=punc, punca=punca)
        if puncBefore is not None:
            puncba = Tr.asciiFromArabic(puncBefore) if puncBefore else ""
            cv.feature(s, puncb=puncBefore, puncba=puncba)
        if raw is not None:
            cv.feature(s, raw=raw)

        if EXTRA:
            extraData = {}
            if fields[13]:
                extraData["qunawims"] = fields[13]
            if fields[14]:
                extraData["poetrymeter"] = fields[14]
            if fields[15]:
                extraData["poetryverse"] = int(fields[15])
            if fields[16]:
                extraData["fass"] = int(fields[16])
            if fields[17]:
                extraData["lwcvl"] = fields[17]
            if fields[18]:
                extraData["quran"] = fields[18]
            cv.feature(s, **extraData)

        if MERGED:
            letters_af = fields[-2]
            punc_af = fields[-1]

            lettersp_af = Tr.asciiFromArabic(letters_af) if letters_af else ""
            lettersn_af = Tr.latinFromArabic(letters_af) if letters_af else ""
            letterst_af = Tr.standardFromArabic(letters_af) if letters_af else ""
            punca_af = Tr.asciiFromArabic(punc_af) if punc_af else ""
            cv.feature(
                s,
                letters_af=letters_af,
                lettersp_af=lettersp_af,
                lettersn_af=lettersn_af,
                letterst_af=letterst_af,
                punc_af=punc_af,
                punca_af=punca_af,
                slot_lk=fields[15],
                combine_lk=fields[16],
                editdistance=fields[17],
                ratio=fields[18],
                combine_af=fields[19],
                slot_af=fields[20],
                page_af=fields[21],
                line_af=fields[22],
            )

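        # a stop character in the word's trailing punctuation ends the
        # current sentence and starts a new one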
        if any(c in stops for c in punc):
            cv.terminate(curSentence)
            curSentence = cv.node("sentence")
            nSentence += 1
            cv.feature(curSentence, n=nSentence)
        if OCRED:
            cv.feature(s, confidence=fields[-3])

    cv.terminate(curSentence)

    for i in reversed(range(nSec)):
        if cur[i]:
            cv.terminate(cur[i])

    if HAS_TOC:
        cv.terminate(curPiece)

    # remove the metadata of declared features that do not occur in the data;
    # iterate over the feature names themselves, not over the group keys
    # of the featureMeta dict
    for featGroup in featureMeta.values():
        for feat in featGroup:
            if not cv.occurs(feat):
                cv.meta(feat)

    if errors:
        for kind in sorted(errors):
            instances = sorted(errors[kind])
            nInstances = len(instances)
            showInstances = instances[0:20]
            print(f"ERROR {kind}: {nInstances} x")
            print(", ".join(showInstances))


# TF LOADING (to test the generated TF)


def loadTf(outDir):
    TF = Fabric(locations=[outDir])
    allFeatures = TF.explore(silent=True, show=True)
    loadableFeatures = allFeatures["nodes"] + allFeatures["edges"]
    api = TF.load(loadableFeatures, silent=False)
    if api:
        print(f"max node = {api.F.otype.maxNode}")
        print("Frequencies of words")
        for (word, n) in api.F.letters.freqList()[0:20]:
            print(f"{n:>6} x {word}")

Functions

def convert(source, ocred, pages, versionTf)
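
A hypothetical invocation; the work key, page specification, and TF version below are invented examples, not values defined in this module (real work keys come from `fusus.works.WORKS`):

    good = convert(
        "fususl",  # a work key in WORKS (assumed)
        False,     # take the textual extraction rather than the OCR output
        None,      # page spec for parseNums; None is assumed to select all pages
        "0.7",     # version directory for the generated TF files
    )
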
def director(cv)

Read tsv data fields.

This function does the work as indicated in the walker conversion engine of Text-Fabric. See `fusus.convert` for a description of the fields in the TSV files.
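
Schematically, `director` follows the standard walker pattern: open container nodes with cv.node, add word slots with cv.slot, attach features with cv.feature, and close nodes with cv.terminate. A reduced skeleton of that pattern (invented data, section structure cut down to a single page):

    def tinyDirector(cv):
        page = cv.node("page")  # open a section node
        cv.feature(page, n=1)
        for word in ("kitab", "hikam"):
            s = cv.slot()  # one slot per word
            cv.feature(s, letters=word)
        cv.terminate(page)  # close the section node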

def generic(source)
def getToc(data)
def loadTf(outDir)
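
A hypothetical smoke test of freshly generated TF data; the directory below is an invented example and should be whatever `getTfDest` produced for your work and version:

    loadTf("~/github/among/fusus/tf/fususl/0.7")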