|
<!DOCTYPE html> |
|
<html lang="en"> |
|
<head> |
|
<!-- Character encoding: short HTML5 form, equivalent to the legacy http-equiv declaration. -->
<meta charset="utf-8">
<!-- Let small screens lay the page out at device width instead of a zoomed-out desktop width. -->
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="generator" content="AsciiDoc 10.2.0">
|
<title>COMBINE_TESSDATA(1)</title> |
|
<style type="text/css"> |
|
|
|
|
|
|
|
body { |
|
font-family: Georgia,serif; |
|
} |
|
|
|
|
|
h1, h2, h3, h4, h5, h6, |
|
div.title, caption.title, |
|
thead, p.table.header, |
|
#toctitle, |
|
#author, #revnumber, #revdate, #revremark, |
|
#footer { |
|
font-family: Arial,Helvetica,sans-serif; |
|
} |
|
|
|
body { |
|
margin: 1em 5% 1em 5%; |
|
} |
|
|
|
a { |
|
color: blue; |
|
text-decoration: underline; |
|
} |
|
a:visited { |
|
color: fuchsia; |
|
} |
|
|
|
em { |
|
font-style: italic; |
|
color: navy; |
|
} |
|
|
|
strong { |
|
font-weight: bold; |
|
color: #083194; |
|
} |
|
|
|
h1, h2, h3, h4, h5, h6 { |
|
color: #527bbd; |
|
margin-top: 1.2em; |
|
margin-bottom: 0.5em; |
|
line-height: 1.3; |
|
} |
|
|
|
h1, h2, h3 { |
|
border-bottom: 2px solid silver; |
|
} |
|
h2 { |
|
padding-top: 0.5em; |
|
} |
|
h3 { |
|
float: left; |
|
} |
|
h3 + * { |
|
clear: left; |
|
} |
|
h5 { |
|
font-size: 1.0em; |
|
} |
|
|
|
div.sectionbody { |
|
margin-left: 0; |
|
} |
|
|
|
hr { |
|
border: 1px solid silver; |
|
} |
|
|
|
p { |
|
margin-top: 0.5em; |
|
margin-bottom: 0.5em; |
|
} |
|
|
|
ul, ol, li > p { |
|
margin-top: 0; |
|
} |
|
ul > li { color: #aaa; } |
|
ul > li > * { color: black; } |
|
|
|
.monospaced, code, pre { |
|
font-family: "Courier New", Courier, monospace; |
|
font-size: inherit; |
|
color: navy; |
|
padding: 0; |
|
margin: 0; |
|
} |
|
pre { |
|
white-space: pre-wrap; |
|
} |
|
|
|
#author { |
|
color: #527bbd; |
|
font-weight: bold; |
|
font-size: 1.1em; |
|
} |
|
#email { |
|
} |
|
#revnumber, #revdate, #revremark { |
|
} |
|
|
|
#footer { |
|
font-size: small; |
|
border-top: 2px solid silver; |
|
padding-top: 0.5em; |
|
margin-top: 4.0em; |
|
} |
|
#footer-text { |
|
float: left; |
|
padding-bottom: 0.5em; |
|
} |
|
#footer-badges { |
|
float: right; |
|
padding-bottom: 0.5em; |
|
} |
|
|
|
#preamble { |
|
margin-top: 1.5em; |
|
margin-bottom: 1.5em; |
|
} |
|
div.imageblock, div.exampleblock, div.verseblock, |
|
div.quoteblock, div.literalblock, div.listingblock, div.sidebarblock, |
|
div.admonitionblock { |
|
margin-top: 1.0em; |
|
margin-bottom: 1.5em; |
|
} |
|
div.admonitionblock { |
|
margin-top: 2.0em; |
|
margin-bottom: 2.0em; |
|
margin-right: 10%; |
|
color: #606060; |
|
} |
|
|
|
div.content { |
|
padding: 0; |
|
} |
|
|
|
|
|
div.title, caption.title { |
|
color: #527bbd; |
|
font-weight: bold; |
|
text-align: left; |
|
margin-top: 1.0em; |
|
margin-bottom: 0.5em; |
|
} |
|
div.title + * { |
|
margin-top: 0; |
|
} |
|
|
|
td div.title:first-child { |
|
margin-top: 0.0em; |
|
} |
|
div.content div.title:first-child { |
|
margin-top: 0.0em; |
|
} |
|
div.content + div.title { |
|
margin-top: 0.0em; |
|
} |
|
|
|
div.sidebarblock > div.content { |
|
background: #ffffee; |
|
border: 1px solid #dddddd; |
|
border-left: 4px solid #f0f0f0; |
|
padding: 0.5em; |
|
} |
|
|
|
div.listingblock > div.content { |
|
border: 1px solid #dddddd; |
|
border-left: 5px solid #f0f0f0; |
|
background: #f8f8f8; |
|
padding: 0.5em; |
|
} |
|
|
|
div.quoteblock, div.verseblock { |
|
padding-left: 1.0em; |
|
margin-left: 1.0em; |
|
margin-right: 10%; |
|
border-left: 5px solid #f0f0f0; |
|
color: #888; |
|
} |
|
|
|
div.quoteblock > div.attribution { |
|
padding-top: 0.5em; |
|
text-align: right; |
|
} |
|
|
|
div.verseblock > pre.content { |
|
font-family: inherit; |
|
font-size: inherit; |
|
} |
|
div.verseblock > div.attribution { |
|
padding-top: 0.75em; |
|
text-align: left; |
|
} |
|
|
|
div.verseblock + div.attribution { |
|
text-align: left; |
|
} |
|
|
|
div.admonitionblock .icon { |
|
vertical-align: top; |
|
font-size: 1.1em; |
|
font-weight: bold; |
|
text-decoration: underline; |
|
color: #527bbd; |
|
padding-right: 0.5em; |
|
} |
|
div.admonitionblock td.content { |
|
padding-left: 0.5em; |
|
border-left: 3px solid #dddddd; |
|
} |
|
|
|
div.exampleblock > div.content { |
|
border-left: 3px solid #dddddd; |
|
padding-left: 0.5em; |
|
} |
|
|
|
div.imageblock div.content { padding-left: 0; } |
|
span.image img { border-style: none; vertical-align: text-bottom; } |
|
a.image:visited { color: white; } |
|
|
|
dl { |
|
margin-top: 0.8em; |
|
margin-bottom: 0.8em; |
|
} |
|
dt { |
|
margin-top: 0.5em; |
|
margin-bottom: 0; |
|
font-style: normal; |
|
color: navy; |
|
} |
|
dd > *:first-child { |
|
margin-top: 0.1em; |
|
} |
|
|
|
ul, ol { |
|
list-style-position: outside; |
|
} |
|
ol.arabic { |
|
list-style-type: decimal; |
|
} |
|
ol.loweralpha { |
|
list-style-type: lower-alpha; |
|
} |
|
ol.upperalpha { |
|
list-style-type: upper-alpha; |
|
} |
|
ol.lowerroman { |
|
list-style-type: lower-roman; |
|
} |
|
ol.upperroman { |
|
list-style-type: upper-roman; |
|
} |
|
|
|
/* Tighten the vertical spacing of lists, paragraphs and nested divs inside
   "compact" blocks. Each selector is listed once; repeating a selector in
   the list has no effect on the computed style. */
div.compact ul, div.compact ol,
div.compact p, div.compact div {
  margin-top: 0.1em;
  margin-bottom: 0.1em;
}
|
|
|
tfoot { |
|
font-weight: bold; |
|
} |
|
td > div.verse { |
|
white-space: pre; |
|
} |
|
|
|
div.hdlist { |
|
margin-top: 0.8em; |
|
margin-bottom: 0.8em; |
|
} |
|
div.hdlist tr { |
|
padding-bottom: 15px; |
|
} |
|
dt.hdlist1.strong, td.hdlist1.strong { |
|
font-weight: bold; |
|
} |
|
td.hdlist1 { |
|
vertical-align: top; |
|
font-style: normal; |
|
padding-right: 0.8em; |
|
color: navy; |
|
} |
|
td.hdlist2 { |
|
vertical-align: top; |
|
} |
|
div.hdlist.compact tr { |
|
margin: 0; |
|
padding-bottom: 0; |
|
} |
|
|
|
.comment { |
|
background: yellow; |
|
} |
|
|
|
.footnote, .footnoteref { |
|
font-size: 0.8em; |
|
} |
|
|
|
span.footnote, span.footnoteref { |
|
vertical-align: super; |
|
} |
|
|
|
#footnotes { |
|
margin: 20px 0 20px 0; |
|
padding: 7px 0 0 0; |
|
} |
|
|
|
#footnotes div.footnote { |
|
margin: 0 0 5px 0; |
|
} |
|
|
|
#footnotes hr { |
|
border: none; |
|
border-top: 1px solid silver; |
|
height: 1px; |
|
text-align: left; |
|
margin-left: 0; |
|
width: 20%; |
|
min-width: 100px; |
|
} |
|
|
|
div.colist td { |
|
padding-right: 0.5em; |
|
padding-bottom: 0.3em; |
|
vertical-align: top; |
|
} |
|
div.colist td img { |
|
margin-top: 0.3em; |
|
} |
|
|
|
@media print { |
|
#footer-badges { display: none; } |
|
} |
|
|
|
#toc { |
|
margin-bottom: 2.5em; |
|
} |
|
|
|
#toctitle { |
|
color: #527bbd; |
|
font-size: 1.1em; |
|
font-weight: bold; |
|
margin-top: 1.0em; |
|
margin-bottom: 0.1em; |
|
} |
|
|
|
div.toclevel0, div.toclevel1, div.toclevel2, div.toclevel3, div.toclevel4 { |
|
margin-top: 0; |
|
margin-bottom: 0; |
|
} |
|
div.toclevel2 { |
|
margin-left: 2em; |
|
font-size: 0.9em; |
|
} |
|
div.toclevel3 { |
|
margin-left: 4em; |
|
font-size: 0.9em; |
|
} |
|
div.toclevel4 { |
|
margin-left: 6em; |
|
font-size: 0.9em; |
|
} |
|
|
|
span.aqua { color: aqua; } |
|
span.black { color: black; } |
|
span.blue { color: blue; } |
|
span.fuchsia { color: fuchsia; } |
|
span.gray { color: gray; } |
|
span.green { color: green; } |
|
span.lime { color: lime; } |
|
span.maroon { color: maroon; } |
|
span.navy { color: navy; } |
|
span.olive { color: olive; } |
|
span.purple { color: purple; } |
|
span.red { color: red; } |
|
span.silver { color: silver; } |
|
span.teal { color: teal; } |
|
span.white { color: white; } |
|
span.yellow { color: yellow; } |
|
|
|
span.aqua-background { background: aqua; } |
|
span.black-background { background: black; } |
|
span.blue-background { background: blue; } |
|
span.fuchsia-background { background: fuchsia; } |
|
span.gray-background { background: gray; } |
|
span.green-background { background: green; } |
|
span.lime-background { background: lime; } |
|
span.maroon-background { background: maroon; } |
|
span.navy-background { background: navy; } |
|
span.olive-background { background: olive; } |
|
span.purple-background { background: purple; } |
|
span.red-background { background: red; } |
|
span.silver-background { background: silver; } |
|
span.teal-background { background: teal; } |
|
span.white-background { background: white; } |
|
span.yellow-background { background: yellow; } |
|
|
|
span.big { font-size: 2em; } |
|
span.small { font-size: 0.6em; } |
|
|
|
span.underline { text-decoration: underline; } |
|
span.overline { text-decoration: overline; } |
|
span.line-through { text-decoration: line-through; } |
|
|
|
div.unbreakable { page-break-inside: avoid; } |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
div.tableblock { |
|
margin-top: 1.0em; |
|
margin-bottom: 1.5em; |
|
} |
|
div.tableblock > table { |
|
border: 3px solid #527bbd; |
|
} |
|
thead, p.table.header { |
|
font-weight: bold; |
|
color: #527bbd; |
|
} |
|
p.table { |
|
margin-top: 0; |
|
} |
|
|
|
div.tableblock > table[frame="void"] { |
|
border-style: none; |
|
} |
|
div.tableblock > table[frame="hsides"] { |
|
border-left-style: none; |
|
border-right-style: none; |
|
} |
|
div.tableblock > table[frame="vsides"] { |
|
border-top-style: none; |
|
border-bottom-style: none; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
table.tableblock { |
|
margin-top: 1.0em; |
|
margin-bottom: 1.5em; |
|
} |
|
thead, p.tableblock.header { |
|
font-weight: bold; |
|
color: #527bbd; |
|
} |
|
p.tableblock { |
|
margin-top: 0; |
|
} |
|
table.tableblock { |
|
border-width: 3px; |
|
border-spacing: 0px; |
|
border-style: solid; |
|
border-color: #527bbd; |
|
border-collapse: collapse; |
|
} |
|
th.tableblock, td.tableblock { |
|
border-width: 1px; |
|
padding: 4px; |
|
border-style: solid; |
|
border-color: #527bbd; |
|
} |
|
|
|
table.tableblock.frame-topbot { |
|
border-left-style: hidden; |
|
border-right-style: hidden; |
|
} |
|
table.tableblock.frame-sides { |
|
border-top-style: hidden; |
|
border-bottom-style: hidden; |
|
} |
|
table.tableblock.frame-none { |
|
border-style: hidden; |
|
} |
|
|
|
th.tableblock.halign-left, td.tableblock.halign-left { |
|
text-align: left; |
|
} |
|
th.tableblock.halign-center, td.tableblock.halign-center { |
|
text-align: center; |
|
} |
|
th.tableblock.halign-right, td.tableblock.halign-right { |
|
text-align: right; |
|
} |
|
|
|
th.tableblock.valign-top, td.tableblock.valign-top { |
|
vertical-align: top; |
|
} |
|
th.tableblock.valign-middle, td.tableblock.valign-middle { |
|
vertical-align: middle; |
|
} |
|
th.tableblock.valign-bottom, td.tableblock.valign-bottom { |
|
vertical-align: bottom; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
body.manpage h1 { |
|
padding-top: 0.5em; |
|
padding-bottom: 0.5em; |
|
border-top: 2px solid silver; |
|
border-bottom: 2px solid silver; |
|
} |
|
body.manpage h2 { |
|
border-style: none; |
|
} |
|
body.manpage div.sectionbody { |
|
margin-left: 3em; |
|
} |
|
|
|
@media print { |
|
body.manpage div#toc { display: none; } |
|
} |
|
|
|
|
|
</style> |
|
<script type="text/javascript"> |
|
|
|
var asciidoc = { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
toc: function (toclevels) { |
|
|
|
function getText(el) { |
|
var text = ""; |
|
for (var i = el.firstChild; i != null; i = i.nextSibling) { |
|
if (i.nodeType == 3 ) |
|
text += i.data; |
|
else if (i.firstChild != null) |
|
text += getText(i); |
|
} |
|
return text; |
|
} |
|
|
|
function TocEntry(el, text, toclevel) { |
|
this.element = el; |
|
this.text = text; |
|
this.toclevel = toclevel; |
|
} |
|
|
|
function tocEntries(el, toclevels) { |
|
var result = new Array; |
|
var re = new RegExp('[hH]([1-'+(toclevels+1)+'])'); |
|
|
|
|
|
|
|
var iterate = function (el) { |
|
for (var i = el.firstChild; i != null; i = i.nextSibling) { |
|
if (i.nodeType == 1 ) { |
|
var mo = re.exec(i.tagName); |
|
if (mo && (i.getAttribute("class") || i.getAttribute("className")) != "float") { |
|
result[result.length] = new TocEntry(i, getText(i), mo[1]-1); |
|
} |
|
iterate(i); |
|
} |
|
} |
|
} |
|
iterate(el); |
|
return result; |
|
} |
|
|
|
var toc = document.getElementById("toc"); |
|
if (!toc) { |
|
return; |
|
} |
|
|
|
|
|
var tocEntriesToRemove = []; |
|
var i; |
|
for (i = 0; i < toc.childNodes.length; i++) { |
|
var entry = toc.childNodes[i]; |
|
if (entry.nodeName.toLowerCase() == 'div' |
|
&& entry.getAttribute("class") |
|
&& entry.getAttribute("class").match(/^toclevel/)) |
|
tocEntriesToRemove.push(entry); |
|
} |
|
for (i = 0; i < tocEntriesToRemove.length; i++) { |
|
toc.removeChild(tocEntriesToRemove[i]); |
|
} |
|
|
|
|
|
var entries = tocEntries(document.getElementById("content"), toclevels); |
|
for (var i = 0; i < entries.length; ++i) { |
|
var entry = entries[i]; |
|
if (entry.element.id == "") |
|
entry.element.id = "_toc_" + i; |
|
var a = document.createElement("a"); |
|
a.href = "#" + entry.element.id; |
|
a.appendChild(document.createTextNode(entry.text)); |
|
var div = document.createElement("div"); |
|
div.appendChild(a); |
|
div.className = "toclevel" + entry.toclevel; |
|
toc.appendChild(div); |
|
} |
|
if (entries.length == 0) |
|
toc.parentNode.removeChild(toc); |
|
}, |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
footnotes: function () { |
|
|
|
var i; |
|
var noteholder = document.getElementById("footnotes"); |
|
if (!noteholder) { |
|
return; |
|
} |
|
var entriesToRemove = []; |
|
for (i = 0; i < noteholder.childNodes.length; i++) { |
|
var entry = noteholder.childNodes[i]; |
|
if (entry.nodeName.toLowerCase() == 'div' && entry.getAttribute("class") == "footnote") |
|
entriesToRemove.push(entry); |
|
} |
|
for (i = 0; i < entriesToRemove.length; i++) { |
|
noteholder.removeChild(entriesToRemove[i]); |
|
} |
|
|
|
|
|
var cont = document.getElementById("content"); |
|
var spans = cont.getElementsByTagName("span"); |
|
var refs = {}; |
|
var n = 0; |
|
for (i=0; i<spans.length; i++) { |
|
if (spans[i].className == "footnote") { |
|
n++; |
|
var note = spans[i].getAttribute("data-note"); |
|
if (!note) { |
|
|
|
|
|
note = spans[i].innerHTML.match(/\s*\[([\s\S]*)]\s*/)[1]; |
|
spans[i].innerHTML = |
|
"[<a id='_footnoteref_" + n + "' href='#_footnote_" + n + |
|
"' title='View footnote' class='footnote'>" + n + "</a>]"; |
|
spans[i].setAttribute("data-note", note); |
|
} |
|
noteholder.innerHTML += |
|
"<div class='footnote' id='_footnote_" + n + "'>" + |
|
"<a href='#_footnoteref_" + n + "' title='Return to text'>" + |
|
n + "</a>. " + note + "</div>"; |
|
var id =spans[i].getAttribute("id"); |
|
if (id != null) refs["#"+id] = n; |
|
} |
|
} |
|
if (n == 0) |
|
noteholder.parentNode.removeChild(noteholder); |
|
else { |
|
|
|
for (i=0; i<spans.length; i++) { |
|
if (spans[i].className == "footnoteref") { |
|
var href = spans[i].getElementsByTagName("a")[0].getAttribute("href"); |
|
href = href.match(/#.*/)[0]; |
|
n = refs[href]; |
|
spans[i].innerHTML = |
|
"[<a href='#_footnote_" + n + |
|
"' title='View footnote' class='footnote'>" + n + "</a>]"; |
|
} |
|
} |
|
} |
|
}, |
|
|
|
install: function(toclevels) { |
|
var timerId; |
|
|
|
function reinstall() { |
|
asciidoc.footnotes(); |
|
if (toclevels) { |
|
asciidoc.toc(toclevels); |
|
} |
|
} |
|
|
|
function reinstallAndRemoveTimer() { |
|
clearInterval(timerId); |
|
reinstall(); |
|
} |
|
|
|
timerId = setInterval(reinstall, 500); |
|
if (document.addEventListener) |
|
document.addEventListener("DOMContentLoaded", reinstallAndRemoveTimer, false); |
|
else |
|
window.onload = reinstallAndRemoveTimer; |
|
} |
|
|
|
} |
|
asciidoc.install(); |
|
|
|
</script> |
|
</head> |
|
<body class="article"> |
|
<div id="header"> |
|
<h1>COMBINE_TESSDATA(1)</h1> |
|
</div> |
|
<div id="content"> |
|
<div class="sect1"> |
|
<h2 id="_name">NAME</h2> |
|
<div class="sectionbody"> |
|
<div class="paragraph"><p>combine_tessdata - combine/extract/overwrite/list/compact Tesseract data</p></div> |
|
</div> |
|
</div> |
|
<div class="sect1"> |
|
<h2 id="_synopsis">SYNOPSIS</h2> |
|
<div class="sectionbody"> |
|
<div class="paragraph"><p><strong>combine_tessdata</strong> [<em>OPTION</em>] <em>FILE</em>…</p></div> |
|
</div> |
|
</div> |
|
<div class="sect1"> |
|
<h2 id="_description">DESCRIPTION</h2> |
|
<div class="sectionbody"> |
|
<div class="paragraph"><p>combine_tessdata(1) is the main program to combine/extract/overwrite/list/compact |
|
tessdata components in [lang].traineddata files.</p></div> |
|
<div class="paragraph"><p>To combine all the individual tessdata components (unicharset, DAWGs, |
|
classifier templates, ambiguities, language configs) located at, say, |
|
/home/$USER/temp/eng.* run:</p></div> |
|
<div class="literalblock"> |
|
<div class="content monospaced"> |
|
<pre>combine_tessdata /home/$USER/temp/eng.</pre> |
|
</div></div> |
|
<div class="paragraph"><p>The result will be a combined tessdata file /home/$USER/temp/eng.traineddata</p></div> |
|
<div class="paragraph"><p>Specify option -e if you would like to extract individual components |
|
from a combined traineddata file. For example, to extract the language config
|
file and the unicharset from tessdata/eng.traineddata run:</p></div> |
|
<div class="literalblock"> |
|
<div class="content monospaced"> |
|
<pre>combine_tessdata -e tessdata/eng.traineddata \ |
|
/home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset</pre> |
|
</div></div> |
|
<div class="paragraph"><p>The desired config file and unicharset will be written to |
|
/home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset</p></div> |
|
<div class="paragraph"><p>Specify option -o to overwrite individual components of the given |
|
[lang].traineddata file. For example, to overwrite the language config
|
and unichar ambiguities files in tessdata/eng.traineddata use:</p></div> |
|
<div class="literalblock"> |
|
<div class="content monospaced"> |
|
<pre>combine_tessdata -o tessdata/eng.traineddata \ |
|
/home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs</pre> |
|
</div></div> |
|
<div class="paragraph"><p>As a result, tessdata/eng.traineddata will contain the new language config |
|
and unichar ambigs, plus all the original DAWGs, classifier templates, etc.</p></div> |
|
<div class="paragraph"><p>Note: the file names of the files to extract to and to overwrite from should |
|
have the appropriate file suffixes (extensions) indicating their tessdata |
|
component type (.unicharset for the unicharset, .unicharambigs for unichar |
|
ambigs, etc.). See the k*FileSuffix variables in ccutil/tessdatamanager.h.</p></div>
|
<div class="paragraph"><p>Specify option -u to unpack all the components to the specified path:</p></div> |
|
<div class="literalblock"> |
|
<div class="content monospaced"> |
|
<pre>combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.</pre> |
|
</div></div> |
|
<div class="paragraph"><p>This will create /home/$USER/temp/eng.* files with individual tessdata |
|
components from tessdata/eng.traineddata.</p></div> |
|
</div> |
|
</div> |
|
<div class="sect1"> |
|
<h2 id="_options">OPTIONS</h2> |
|
<div class="sectionbody"> |
|
<div class="paragraph"><p><strong>-c</strong> <em>.traineddata</em> <em>FILE</em>…: |
|
Compacts the LSTM component in the .traineddata file to int.</p></div> |
|
<div class="paragraph"><p><strong>-d</strong> <em>.traineddata</em> <em>FILE</em>…: |
|
Lists directory of components from the .traineddata file.</p></div> |
|
<div class="paragraph"><p><strong>-e</strong> <em>.traineddata</em> <em>FILE</em>…: |
|
Extracts the specified components from the .traineddata file.</p></div>
|
<div class="paragraph"><p><strong>-l</strong> <em>.traineddata</em> <em>FILE</em>…: |
|
Lists the network information.</p></div>
|
<div class="paragraph"><p><strong>-o</strong> <em>.traineddata</em> <em>FILE</em>…: |
|
Overwrites the specified components of the .traineddata file |
|
with those provided on the command line.</p></div> |
|
<div class="paragraph"><p><strong>-u</strong> <em>.traineddata</em> <em>PATHPREFIX</em>:
|
Unpacks the .traineddata using the provided prefix.</p></div> |
|
</div> |
|
</div> |
|
<div class="sect1"> |
|
<h2 id="_caveats">CAVEATS</h2> |
|
<div class="sectionbody"> |
|
<div class="paragraph"><p><em>Prefix</em> refers to the full file prefix, including the period (.).</p></div>
|
</div> |
|
</div> |
|
<div class="sect1"> |
|
<h2 id="_components">COMPONENTS</h2> |
|
<div class="sectionbody"> |
|
<div class="paragraph"><p>The components in a Tesseract lang.traineddata file as of |
|
Tesseract 4.0 are briefly described below. For more information on
|
many of these files, see |
|
<a href="https://tesseract-ocr.github.io/tessdoc/Training-Tesseract.html">https://tesseract-ocr.github.io/tessdoc/Training-Tesseract.html</a> |
|
and |
|
<a href="https://tesseract-ocr.github.io/tessdoc/TrainingTesseract-4.00.html">https://tesseract-ocr.github.io/tessdoc/TrainingTesseract-4.00.html</a></p></div> |
|
<div class="dlist"><dl> |
|
<dt class="hdlist1"> |
|
lang.config |
|
</dt> |
|
<dd> |
|
<p> |
|
(Optional) Language-specific overrides to default config variables. |
|
For 4.0 traineddata files, lang.config provides control parameters which |
|
can affect layout analysis, and sub-languages. |
|
</p> |
|
</dd> |
|
<dt class="hdlist1"> |
|
lang.unicharset |
|
</dt> |
|
<dd> |
|
<p> |
|
(Required - 3.0x legacy tesseract) The list of symbols that Tesseract recognizes, with properties. |
|
See unicharset(5). |
|
</p> |
|
</dd> |
|
<dt class="hdlist1"> |
|
lang.unicharambigs |
|
</dt> |
|
<dd> |
|
<p> |
|
(Optional - 3.0x legacy tesseract) This file contains information on pairs of recognized symbols |
|
which are often confused. For example, <em>rn</em> and <em>m</em>. |
|
</p> |
|
</dd> |
|
<dt class="hdlist1"> |
|
lang.inttemp |
|
</dt> |
|
<dd> |
|
<p> |
|
(Required - 3.0x legacy tesseract) Character shape templates for each unichar. Produced by |
|
mftraining(1). |
|
</p> |
|
</dd> |
|
<dt class="hdlist1"> |
|
lang.pffmtable |
|
</dt> |
|
<dd> |
|
<p> |
|
(Required - 3.0x legacy tesseract) The number of features expected for each unichar. |
|
Produced by mftraining(1) from <strong>.tr</strong> files. |
|
</p> |
|
</dd> |
|
<dt class="hdlist1"> |
|
lang.normproto |
|
</dt> |
|
<dd> |
|
<p> |
|
(Required - 3.0x legacy tesseract) Character normalization prototypes generated by cntraining(1) |
|
from <strong>.tr</strong> files. |
|
</p> |
|
</dd> |
|
<dt class="hdlist1"> |
|
lang.punc-dawg |
|
</dt> |
|
<dd> |
|
<p> |
|
(Optional - 3.0x legacy tesseract) A dawg made from punctuation patterns found around words. |
|
The "word" part is replaced by a single space. |
|
</p> |
|
</dd> |
|
<dt class="hdlist1"> |
|
lang.word-dawg |
|
</dt> |
|
<dd> |
|
<p> |
|
(Optional - 3.0x legacy tesseract) A dawg made from dictionary words from the language. |
|
</p> |
|
</dd> |
|
<dt class="hdlist1"> |
|
lang.number-dawg |
|
</dt> |
|
<dd> |
|
<p> |
|
(Optional - 3.0x legacy tesseract) A dawg made from tokens which originally contained digits. |
|
Each digit is replaced by a space character. |
|
</p> |
|
</dd> |
|
<dt class="hdlist1"> |
|
lang.freq-dawg |
|
</dt> |
|
<dd> |
|
<p> |
|
(Optional - 3.0x legacy tesseract) A dawg made from the most frequent words which would have |
|
gone into word-dawg. |
|
</p> |
|
</dd> |
|
<dt class="hdlist1"> |
|
lang.fixed-length-dawgs |
|
</dt> |
|
<dd> |
|
<p> |
|
(Optional - 3.0x legacy tesseract) Several dawgs of different fixed lengths — useful for |
|
languages like Chinese. |
|
</p> |
|
</dd> |
|
<dt class="hdlist1"> |
|
lang.shapetable |
|
</dt> |
|
<dd> |
|
<p> |
|
(Optional - 3.0x legacy tesseract) When present, a shapetable is an extra layer between the character |
|
classifier and the word recognizer that allows the character classifier to |
|
return a collection of unichar ids and fonts instead of a single unichar-id |
|
and font. |
|
</p> |
|
</dd> |
|
<dt class="hdlist1"> |
|
lang.bigram-dawg |
|
</dt> |
|
<dd> |
|
<p> |
|
(Optional - 3.0x legacy tesseract) A dawg of word bigrams where the words are separated by a space |
|
and each digit is replaced by a <em>?</em>. |
|
</p> |
|
</dd> |
|
<dt class="hdlist1"> |
|
lang.unambig-dawg |
|
</dt> |
|
<dd> |
|
<p> |
|
(Optional - 3.0x legacy tesseract) . |
|
</p> |
|
</dd> |
|
<dt class="hdlist1"> |
|
lang.params-model |
|
</dt> |
|
<dd> |
|
<p> |
|
(Optional - 3.0x legacy tesseract) . |
|
</p> |
|
</dd> |
|
<dt class="hdlist1"> |
|
lang.lstm |
|
</dt> |
|
<dd> |
|
<p> |
|
(Required - 4.0 LSTM) Neural net trained recognition model generated by lstmtraining. |
|
</p> |
|
</dd> |
|
<dt class="hdlist1"> |
|
lang.lstm-punc-dawg |
|
</dt> |
|
<dd> |
|
<p> |
|
(Optional - 4.0 LSTM) A dawg made from punctuation patterns found around words. |
|
The "word" part is replaced by a single space. Uses lang.lstm-unicharset. |
|
</p> |
|
</dd> |
|
<dt class="hdlist1"> |
|
lang.lstm-word-dawg |
|
</dt> |
|
<dd> |
|
<p> |
|
(Optional - 4.0 LSTM) A dawg made from dictionary words from the language. |
|
Uses lang.lstm-unicharset. |
|
</p> |
|
</dd> |
|
<dt class="hdlist1"> |
|
lang.lstm-number-dawg |
|
</dt> |
|
<dd> |
|
<p> |
|
(Optional - 4.0 LSTM) A dawg made from tokens which originally contained digits. |
|
Each digit is replaced by a space character. Uses lang.lstm-unicharset. |
|
</p> |
|
</dd> |
|
<dt class="hdlist1"> |
|
lang.lstm-unicharset |
|
</dt> |
|
<dd> |
|
<p> |
|
(Required - 4.0 LSTM) The unicode character set that Tesseract recognizes, with properties. |
|
The same unicharset must be used to train the LSTM and build the lstm-*-dawgs files.
|
</p> |
|
</dd> |
|
<dt class="hdlist1"> |
|
lang.lstm-recoder |
|
</dt> |
|
<dd> |
|
<p> |
|
(Required - 4.0 LSTM) Unicharcompress, aka the recoder, which maps the unicharset |
|
further to the codes actually used by the neural network recognizer. This is created as |
|
part of the starter traineddata by combine_lang_model. |
|
</p> |
|
</dd> |
|
<dt class="hdlist1"> |
|
lang.version |
|
</dt> |
|
<dd> |
|
<p> |
|
(Optional) Version string for the traineddata file. |
|
First appeared in version 4.0 of Tesseract. |
|
Older traineddata files report Version:Pre-4.0.0.
Version 4.0 traineddata files may include the network spec
used for LSTM training as part of the version string.
|
</p> |
|
</dd> |
|
</dl></div> |
|
</div> |
|
</div> |
|
<div class="sect1"> |
|
<h2 id="_history">HISTORY</h2> |
|
<div class="sectionbody"> |
|
<div class="paragraph"><p>combine_tessdata(1) first appeared in version 3.00 of Tesseract.</p></div>
|
</div> |
|
</div> |
|
<div class="sect1"> |
|
<h2 id="_see_also">SEE ALSO</h2> |
|
<div class="sectionbody"> |
|
<div class="paragraph"><p>tesseract(1), wordlist2dawg(1), cntraining(1), mftraining(1), unicharset(5), |
|
unicharambigs(5)</p></div> |
|
</div> |
|
</div> |
|
<div class="sect1"> |
|
<h2 id="_copying">COPYING</h2> |
|
<div class="sectionbody"> |
|
<div class="paragraph"><p>Copyright (C) 2009, Google Inc. |
|
Licensed under the Apache License, Version 2.0</p></div> |
|
</div> |
|
</div> |
|
<div class="sect1"> |
|
<h2 id="_author">AUTHOR</h2> |
|
<div class="sectionbody"> |
|
<div class="paragraph"><p>The Tesseract OCR engine was written by Ray Smith and his research groups |
|
at Hewlett Packard (1985-1995) and Google (2006-present).</p></div> |
|
</div> |
|
</div> |
|
</div> |
|
<div id="footnotes"><hr></div> |
|
<div id="footer"> |
|
<div id="footer-text"> |
|
Last updated |
|
2022-01-22 13:50:22 CET |
|
</div> |
|
</div> |
|
</body> |
|
</html> |
|
|