Commit bbd28aa8 authored by Mohammad Mahfuzur Rahman Mamun's avatar Mohammad Mahfuzur Rahman Mamun
Browse files

calculation of column space rectangle and a try to separate columns from a two...

calculation of column space rectangle and a try to separate columns from a two column document on that basis
parent 4c5d844e
......@@ -971,19 +971,7 @@ void TextPage::makeAndSortLines(){
// for(i = 0 ; i < d->m_lines.length() ; i++){
// // list is a line
// TextList list = d->m_lines.at(i);
// if(!i){
// QRect rect = d->m_line_rects.at(i);
// cout << "L:" << rect.left() << " R:" << rect.right() << " T:" << rect.top() << " B:" << rect.bottom() << endl;
// cout << "Line " << i << ": ";
// for(j = 0 ; j < list.length() ; j++){
// TinyTextEntity* ent = list.at(j);
// cout << ent->text().toAscii().data();
// }
// cout << endl;
// d->printTextList(i,list);
// }
// }
......@@ -999,24 +987,73 @@ void TextPage::makeAndSortLines(){
d->m_lines.replace(i,list);
//print lines after sorting
// if(1){
//d->printTextList(i,list);
}
}
// QRect rect = d->m_line_rects.at(i);
// cout << "L:" << rect.left() << " R:" << rect.right() << " T:" << rect.top() << " B:" << rect.bottom() << endl;
// cout << "Line " << i << ": ";
// if the horizontal arm of one rectangle fully contains the other (example below)
// -------- ---- ----- first
// ---- -------- ----- second
// for(j = 0 ; j < list.length() ; j++){
// TinyTextEntity* ent = list.at(j);
// cout << ent->text().toAscii().data();
// }
// cout << endl;
// }
// or we can make it overlap of spaces by 80%
bool doesConsumeX(QRect first, QRect second){
// int threshold = 2;
// if one consumes another fully
if(first.left() >= second.left() && first.right() >= second.right()) return true;
if(first.left() <= second.left() && first.right() <= second.right()) return true;
//or if there is overlap of space by more than 80%
// there is overlap
int overlap;
if(second.right() >= first.left() && first.right() >= second.left()){
int percentage;
if(second.right() > first.right()) overlap = first.right() - second.left();
else overlap = second.right() - first.left();
//we will divide by the smaller rectangle to calculate the overlap
if( first.width() < second.width()){
percentage = overlap * 100 / (first.width());
}
else{
percentage = overlap * 100 / (second.width());
}
if(percentage > 80) return true;
}
return false;
}
void printRect(QRect rect){
cout << "l: " << rect.left() << " r: " << rect.right() << " t: " << rect.top() << " b: " << rect.bottom() << endl;
}
void TextPagePrivate::printTextList(int i, TextList list){
QRect rect = m_line_rects.at(i);
cout << "L:" << rect.left() << " R:" << rect.right() << " T:" << rect.top() << " B:" << rect.bottom() << endl;
cout << "Line " << i << ": ";
for(int j = 0 ; j < list.length() ; j++){
TinyTextEntity* ent = list.at(j);
cout << ent->text().toAscii().data();
}
cout << endl;
}
//correct the textOrder, all layout recognition works here
void TextPage::correctTextOrder(){
......@@ -1042,8 +1079,11 @@ void TextPage::correctTextOrder(){
//we would like to use QMap instead of QHash as it will keep the keys sorted
QMap<int,int> hor_space_stat; //this is to find word spacing
QMap<int,int> col_space_stat; //this is to find column spacing
QList< QList<QRect> > space_rects; // to save all the word spacing or column spacing rects
QList<QRect> max_hor_space_rects;
int i,j;
for(i = 0 ; i < d->m_lines.length() ; i++){
......@@ -1051,23 +1091,13 @@ void TextPage::correctTextOrder(){
TextList list = d->m_lines.at(i);
QList<QRect> line_space_rects;
// if(1){
// QRect rect = d->m_line_rects.at(i);
// cout << "L:" << rect.left() << " R:" << rect.right() << " T:" << rect.top() << " B:" << rect.bottom() << endl;
// cout << "Line " << i << ": ";
// for(j = 0 ; j < list.length() ; j++){
// TinyTextEntity* ent = list.at(j);
// cout << ent->text().toAscii().data();
// }
// cout << endl;
// }
int maxSpace = 0, minSpace = d->m_page->m_page->width();
// for every TinyTextEntity element in the line
TextList::Iterator it = list.begin(), itEnd = list.end();
QRect max_area1,max_area2;
// cout << "Line " << i << ":";
for( ; it != itEnd ; it++ ){
// cout << (*it)->text().toAscii().data();
......@@ -1076,7 +1106,13 @@ void TextPage::correctTextOrder(){
QRect area2 = (*(it+1))->area.geometry(d->m_page->m_page->width(),d->m_page->m_page->height());
int space = area2.left() - area1.right();
if(space > maxSpace) maxSpace = space;
if(space > maxSpace){
max_area1 = area1;
max_area2 = area2;
maxSpace = space;
}
if(space < minSpace && space != 0) minSpace = space;
//if we found a real space, whose length is not zero and also less than the pageWidth
......@@ -1089,9 +1125,6 @@ void TextPage::correctTextOrder(){
left = area1.right();
right = area2.left();
// cout << "left: " << left << ", right: " << right << endl;
area1.top() > area2.top() ? top = area2.top() : top = area1.top();
area1.bottom() < area2.bottom() ? bottom = area2.bottom() : bottom = area1.bottom();
......@@ -1116,13 +1149,27 @@ void TextPage::correctTextOrder(){
if (col_space_stat.contains(maxSpace))
col_space_stat[maxSpace] = col_space_stat[maxSpace]++;
else col_space_stat[maxSpace] = 1;
//store the max rect of each line
int left,right,top,bottom;
left = max_area1.right();
right = max_area2.left();
max_area1.top() > max_area2.top() ? top = max_area2.top() : top = max_area1.top();
max_area1.bottom() < max_area2.bottom() ? bottom = max_area2.bottom() : bottom = max_area1.bottom();
QRect rect(QPoint(left,top),QPoint(right,bottom));
max_hor_space_rects.append(rect);
}
// cout << endl;
// cout << minSpace << " "<< maxSpace << endl;
}
// All the space counts are in hor_space_stat
// All the between word space counts are in hor_space_stat
/*int word_spacing;
cout << "Word Spacing: " << endl;
QMapIterator<int, int> iterate(hor_space_stat);
while (iterate.hasNext()) {
......@@ -1137,28 +1184,172 @@ void TextPage::correctTextOrder(){
iterate_col.next();
cout << iterate_col.key() << ": " << iterate_col.value() << endl;
if(iterate_col.value() > col_spacing) col_spacing = iterate_col.value();
}
}*/
cout << "Column Spacing is: " << col_spacing << endl;
//show all space rects (between words, word spacing or column spacing)
// for( i = 0 ; i < space_rects.length() ; i++){
//print some space rects
for( i = 0 ; i < space_rects.length() ; i++){
// QList<QRect> rectList = space_rects.at(i);
QList<QRect> rectList = space_rects.at(i);
// for( j = 0 ; j < rectList.length() ; j++){
for( j = 0 ; j < rectList.length() ; j++){
// QRect rect = rectList.at(j);
// cout << "rect:(left,right,top,bottom) : " << rect.left() << "," << rect.right() << ","
// << rect.top() << "," << rect.bottom() << endl;
QRect rect = rectList.at(j);
cout << "rect:(left,right,top,bottom) : " << rect.left() << "," << rect.right() << ","
<< rect.top() << "," << rect.bottom() << endl;
// }
// }
// cout << "max_hor_space_rectangle:" << endl;
// for(j = 0 ; j < max_hor_space_rects.length() ; j++){
// QRect rect = max_hor_space_rects.at(j);
// cout << "rect:(left,right,top,bottom) : " << rect.left() << "," << rect.right() << ","
// << rect.top() << "," << rect.bottom() << endl;
// }
}
}
/** Step 2: ........................................................................ **/
/**
We will start with the max whitespace rectangle within the first line, if any. Then, we will get the max whitespace
rectangle of the second line.
If both of them are at the same position, we can say, they creates a column.
Else we will check
if there is any whitespace rectangle under the previous line's maximum whitespace rectangle. In this cae, we can say
its a noisy line, which do not preserve the column separation. if we find 3(col_threshold) lines of this type consecutively,
we can break the column separation, and say that these 3 lines fully are in the same column.
else, the line is a single line in a column. We do not need to separate this.
**/
// int col_threshold = 3;
// int col_spacing_threshold = 2;
int length_line_list = d->m_lines.length();
bool consume12 = false, consume23 = false, consume13 = false;
for(i = 0 ; i < length_line_list ; i++){
int index1 = i % length_line_list, index2 = (i + 1) % length_line_list,
index3 = (i + 2) % length_line_list;
// We will take 3 lines at a time, so that one noisy data do not give wrong idea.
// We will see whether they creates a column or not
TextList line1 = d->m_lines.at(index1);
TextList line2 = d->m_lines.at(index2);
// TextList line3 = d->m_lines.at(index3);
// the estimated column space rectangles of those lines
QRect columnRect1 = max_hor_space_rects.at(index1);
QRect columnRect2 = max_hor_space_rects.at(index2);
// QRect columnRect3 = max_hor_space_rects.at(index3);
QRect rect1,rect2,rect3;
cout << "rect1: ";
printRect(columnRect1);
cout << "rect2: ";
printRect(columnRect2);
//if the maxRectangle of line1 and line2 are at the same place, they may create a column
if(doesConsumeX(columnRect1,columnRect2)){
consume12 = true;
rect1 = columnRect1;
rect2 = columnRect2;
// d->printTextList(index1,line1);
// cout << "rect1: " << columnRect1.left() << " , " << columnRect1.right() << endl;
// d->printTextList(index2,line2);
// cout << "rect2: " << columnRect2.left() << " , " << columnRect2.right() << endl;
}
//else if one of the lines is noisy and do not maintain column spacing correctly,
//so that, maxSpacing is not column spacing but, some other word spacing, so we search
//if some rectangle smaller than some word spacing rectangle remains which is consumed by
//the other lines maxSpacing rectangle.
else{
//1. see whether maxSpacing of line1 consumes any space rectangle in line2
rect1 = columnRect1;
QList<QRect> line2_space_rect = space_rects.at(index2);
for(j = 0 ; j < line2_space_rect.length() ; j++){
rect2 = line2_space_rect.at(j);
if(doesConsumeX(rect1,rect2)){
consume12 = true;
break;
}
}
if(consume12) break;
//2. see whether maxSpacing of line2 consumes any space rectangle in line1
rect2 = columnRect2;
QList<QRect> line1_space_rect = space_rects.at(index1);
for(j = 0 ; j < line1_space_rect.length() ; j++){
rect1 = line1_space_rect.at(j);
if(doesConsumeX(rect1,rect2)){
consume12 = true;
break;
}
}
}
// if consume12 is still false, then we do not get some column spacing, the spacing are random,
// so, possibly line1 and line2 are not column separated lines. so, we don't need to split them.
// possibly we have got a column separator, so, we break the lines in two parts, and
// 1. edit previous lines(delete the part after column)
// 2. add two new lines and append them to the last of the list
if(consume12){
//the separating rectangles are rect1 and rect2
QRect linerect1 = d->m_line_rects.at(i),linerect2 = linerect1;
TextList tmp;
TinyTextEntity* tmp_entity;
for(j = line1.length() - 1 ; j >= 0 ; j --){
tmp_entity = line1.at(j);
QRect area = tmp_entity->area.geometry(d->m_page->m_page->width(),d->m_page->m_page->height());
// we have got maxSpace rect
if(area.left() == rect1.right()){
linerect1.setRight(rect1.left());
linerect2.setLeft(rect1.right());
tmp.push_front(tmp_entity);
line1.pop_back();
break;
}
//push in front in the new line and pop from the back of the old line
tmp.push_front(tmp_entity);
line1.pop_back();
}
d->m_lines.replace(i,line1);
d->m_line_rects.replace(i,linerect1);
d->m_lines.append(tmp);
d->m_line_rects.append(linerect2);
d->printTextList(i,d->m_lines.at(i));
d->printTextList(length_line_list + i,d->m_lines.at(i+length_line_list));
}
// break;
}
/**
......
......@@ -52,6 +52,8 @@ class TextPagePrivate
TextComparisonFunction comparer,
const TextList::ConstIterator &start,
const TextList::ConstIterator &end );
/** prints a line **/
void printTextList(int i, TextList list);
TextList m_words;
QMap< int, SearchPoint* > m_searchPoints;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment