diff --git a/api/baseapi.cpp b/api/baseapi.cpp index 521359b8f4..e35c425b43 100644 --- a/api/baseapi.cpp +++ b/api/baseapi.cpp @@ -1378,18 +1378,42 @@ static void AddBaselineCoordsTohOCR(const PageIterator *it, hocr_str->add_str_double(" ", round(p0 * 1000.0) / 1000.0); } -static void AddBoxTohOCR(const PageIterator *it, +static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1, int num2) { + unsigned long bufsize = base.length() + 2 * kMaxIntSize; + char id_buffer[bufsize]; + if (num2 >= 0) { + snprintf(id_buffer, bufsize - 1, "%s_%d_%d", base.c_str(), num1, num2); + } else { + snprintf(id_buffer, bufsize - 1, "%s_%d", base.c_str(), num1); + } + id_buffer[bufsize - 1] = '\0'; + *hocr_str += " id='"; + *hocr_str += id_buffer; + *hocr_str += "'"; +} + +static void AddBoxTohOCR(const ResultIterator *it, PageIteratorLevel level, STRING* hocr_str) { int left, top, right, bottom; it->BoundingBox(level, &left, &top, &right, &bottom); - hocr_str->add_str_int("' title=\"bbox ", left); + // This is the only place we use double quotes instead of single quotes, + // but it may too late to change for consistency + hocr_str->add_str_int(" title=\"bbox ", left); hocr_str->add_str_int(" ", top); hocr_str->add_str_int(" ", right); hocr_str->add_str_int(" ", bottom); - // Add baseline coordinates for textlines only. - if (level == RIL_TEXTLINE) + // Add baseline coordinates & heights for textlines only. + if (level == RIL_TEXTLINE) { AddBaselineCoordsTohOCR(it, level, hocr_str); + // add custom height measures + float row_height, descenders, ascenders; // row attributes + it->RowAttributes(&row_height, &descenders, &ascenders); + // TODO: Do we want to limit these to a single decimal place? + hocr_str->add_str_double("; x_size ", row_height); + hocr_str->add_str_double("; x_descenders ", descenders * -1); + hocr_str->add_str_double("; x_ascenders ", ascenders); + } *hocr_str += "\">"; } @@ -1408,7 +1432,6 @@ char* TessBaseAPI::GetHOCRText(int page_number) { int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1; int page_id = page_number + 1; // hOCR uses 1-based page numbers. - float row_height, descenders, ascenders; // row attributes bool font_info = false; GetBoolVariable("hocr_font_info", &font_info); @@ -1434,8 +1457,9 @@ char* TessBaseAPI::GetHOCRText(int page_number) { delete[] utf8_str; #endif - hocr_str.add_str_int("
string()); } else { @@ -1457,36 +1481,29 @@ char* TessBaseAPI::GetHOCRText(int page_number) { // Open any new block/paragraph/textline. if (res_it->IsAtBeginningOf(RIL_BLOCK)) { - hocr_str.add_str_int("
IsAtBeginningOf(RIL_PARA)) { + hocr_str += "\n

ParagraphIsLtr()) { - hocr_str.add_str_int("\n

IsAtBeginningOf(RIL_TEXTLINE)) { - int fontsize; - hocr_str.add_str_int("\n WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, &smallcaps, &pointsize, &font_id); - hocr_str.add_str_int("' title='bbox ", left); + hocr_str.add_str_int(" title='bbox ", left); hocr_str.add_str_int(" ", top); hocr_str.add_str_int(" ", right); hocr_str.add_str_int(" ", bottom);