Commit 33d0facf authored by Mohammad Mahfuzur Rahman Mamun's avatar Mohammad Mahfuzur Rahman Mamun
Browse files

Selection of text by character is done; it still needs some testing and debugging.

parent 3d0564fa
......@@ -143,6 +143,8 @@ class TinyTextEntity
class RegionText{
public:
RegionText(){};
RegionText(TextList &list,QRect &area)
: m_region_text(list) ,m_area(area)
{
......@@ -959,6 +961,14 @@ bool compareTinyTextEntityY(TinyTextEntity* first, TinyTextEntity* second){
return firstArea.top() < secondArea.top();
}
// Less-than predicate on the vertical position of two regions:
// returns true when the top coordinate of first's area is strictly
// smaller than the top coordinate of second's area.
bool compareRegionTextY(RegionText first, RegionText second){
    const QRect firstArea = first.area();
    const QRect secondArea = second.area();
    return secondArea.top() > firstArea.top();
}
// Less-than predicate on the horizontal position of two regions:
// returns true when the left coordinate of first's area is strictly
// smaller than the left coordinate of second's area.
bool compareRegionTextX(RegionText first, RegionText second){
    const QRect firstArea = first.area();
    const QRect secondArea = second.area();
    return secondArea.left() > firstArea.left();
}
void TextPagePrivate::printTextList(int i, TextList list){
......@@ -1093,11 +1103,14 @@ void TextPage::makeWord(){
int newLeft,newRight,newTop,newBottom;
int pageWidth = d->m_page->m_page->width(), pageHeight = d->m_page->m_page->height();
int index = 0;
QString spaceString(" ");
//For RegionTextList
// It will contain a list of RegionText, where each RegionText holds one word:
// a TextList (a list of TinyTextEntity holding the characters' info) and a QRect
// holding the area of the region.
RegionTextList regionWordList;
//WordTocharacterList d->m_word_char_map
//for every non-space texts(characters/words) in the textList
for( ; it != itEnd ; it++){
......@@ -1109,15 +1122,10 @@ void TextPage::makeWord(){
tmpIt = it;
// cout << "first : ";
// printRect(lineArea) ;
int space = 0;
while(space <= 1){
// if(textString == spaceString) break;
// this check has to come before the it==itEnd condition,
// otherwise the last character can be missed
if(textString.length()){
......@@ -1189,15 +1197,23 @@ void TextPage::makeWord(){
if(newString.length()){
NormalizedRect newRect(lineArea,pageWidth,pageHeight);
TinyTextEntity *ent = new TinyTextEntity(newString.normalized
(QString::NormalizationForm_KC), newRect );
newList.append(ent);
newList.append( new TinyTextEntity(newString.normalized
(QString::NormalizationForm_KC), newRect ));
// cout << "newString: " << newString.toAscii().data() << endl;
QRect rect = newRect.geometry(pageWidth,pageHeight);
RegionText regionWord(word,rect);
regionWordList.append(regionWord);
int keyRect = rect.left() * rect.top()
+ rect.right() * rect.bottom();
// more than one element can share the same key, so insertMulti keeps all of them
d->m_word_chars_map.insertMulti(keyRect,regionWord);
index++;
}
......@@ -1209,23 +1225,21 @@ void TextPage::makeWord(){
d->m_region_words = regionWordList;
cout << "words: " << index << endl;
// cout << " ............................................................ " << endl;
d->copy(newList);
// for(int i = 0 ; i < d->m_words.length() ; i++){
// TinyTextEntity *ent = d->m_words.at(i);
// cout << ent->text().toAscii().data() << endl;
// printRect(ent->area.roundedGeometry(pageWidth,pageHeight));
// }
// QRect entArea = ent->area.geometry(pageWidth,pageHeight);
// int key = entArea.top() * entArea.left() + entArea.right() * entArea.bottom();
// cout << endl;
// RegionText text_list = d->m_word_chars_map.value(key);
// TextList list = text_list.text();
// for(int i = 0 ; i < d->m_region_words.length() ; i++){
// RegionText word = d->m_region_words.at(i);
// TextList text = word.text();
// for( int j = 0 ; j < text.length() ; j++){
// TinyTextEntity* ent = text.at(j);
// cout << "key: " << key << " text: ";
// for( int l = 0 ; l < list.length() ; l++){
// ent = list.at(l);
// cout << ent->text().toAscii().data();
// }
// cout << endl;
......@@ -1253,7 +1267,6 @@ void TextPage::makeAndSortLines(){
TextList tmpList = d->m_words;
qSort(tmpList.begin(),tmpList.end(),compareTinyTextEntityY);
// d->printTextList(0,tmpList);
// Step 2: .......................................
......@@ -1347,9 +1360,6 @@ void TextPage::makeAndSortLines(){
qSort(list.begin(),list.end(),compareTinyTextEntityX);
d->m_lines.replace(i,list);
// d->printTextList(i,list);
// printRect(d->m_line_rects.at(i));
}
}
......@@ -1429,14 +1439,12 @@ void TextPage::XYCutForBoundingBoxes(int tcx,int tcy){
if (proj_on_yaxis[j] > maxY) maxY = proj_on_yaxis[j];
// cout << "index: " << j << " value: " << proj_on_yaxis[j] << endl;
}
// cout << endl;
// cout << "projection on x axis " << endl << endl;
for( j = 0 ; j < size_proj_x ; j++ ){
if(proj_on_xaxis[j] > maxX) maxX = proj_on_xaxis[j];
// cout << "index: " << j << " value: " << proj_on_xaxis[j] << endl;
}
// cout << endl;
/** 2. Cleanup Boundary White Spaces and removal of noise ..................... **/
......@@ -1458,7 +1466,6 @@ void TextPage::XYCutForBoundingBoxes(int tcx,int tcy){
yend--;
}
// printRect(regionRect);
//update the regionRect
int old_left = regionRect.left(), old_top = regionRect.top();
......@@ -1574,39 +1581,19 @@ void TextPage::XYCutForBoundingBoxes(int tcx,int tcy){
regionRect.height());
// horizontal split (top rect, bottom rect)
cout << "main: ";
printRect(regionRect);
if(gap_y >= gap_x && gap_y > tcy){
// cout << "toprect: ";
// printRect(topRect);
// cout << "bottomrect: ";
// printRect(bottomRect);
cut_hor = true;
}
//vertical cut (left rect, right rect)
else if(gap_y >= gap_x && gap_y <= tcy && gap_x > tcx){
// cout << "leftrect: ";
// printRect(leftRect);
// cout << "rightrect: ";
// printRect(rightRect);
cut_ver = true;
}
//vertical cut
else if(gap_x >= gap_y && gap_x > tcx){
// cout << "leftrect: ";
// printRect(leftRect);
// cout << "rightrect: ";
// printRect(rightRect);
cut_ver = true;
}
//horizontal cut
else if(gap_x >= gap_y && gap_x <= tcx && gap_y > tcy){
// cout << "toprect: ";
// printRect(topRect);
// cout << "bottomrect: ";
// printRect(bottomRect);
cut_hor = true;
}
//no cut possible
......@@ -1692,8 +1679,6 @@ void TextPage::XYCutForBoundingBoxes(int tcx,int tcy){
//correct the textOrder, all layout recognition works here
void TextPage::correctTextOrder(){
// create words from characters (crashes)
removeSpace();
makeWord();
......@@ -1771,19 +1756,15 @@ void TextPage::correctTextOrder(){
QRect max_area1,max_area2;
QString before_max, after_max;
// d->printTextList(i,list);
// for every line
for( ; it != itEnd ; it++ ){
// cout << (*it)->text().toAscii().data() << endl;
QRect area1 = (*it)->area.roundedGeometry(pageWidth,pageHeight);
if( it+1 == itEnd ) break;
// printRect(area1);
QRect area2 = (*(it+1))->area.roundedGeometry(pageWidth,pageHeight);
int space = area2.left() - area1.right();
// printRect(area2);
if(space > maxSpace){
max_area1 = area1;
......@@ -1795,9 +1776,6 @@ void TextPage::correctTextOrder(){
after_max = (*(it+1))->text();
}
// cout << (*it)->text().toAscii().data() << " " << (*(it+1))->text().toAscii().data();
// cout << " space: " << space << endl;
if(space < minSpace && space != 0) minSpace = space;
//if we found a real space, whose length is not zero and also less than the pageWidth
......@@ -1819,14 +1797,10 @@ void TextPage::correctTextOrder(){
QRect rect(left,top,right-left,bottom-top);
line_space_rects.append(rect);
// cout << space << " ";
}
// cout << "space: " << space << " " << area1.right() << " " << area2.left() << endl;
}
// cout << endl << "maxSpace " << maxSpace << " ----------------------------------------------- " << endl << endl;
space_rects.append(line_space_rects);
if(hor_space_stat.contains(maxSpace)){
......@@ -2073,12 +2047,13 @@ void TextPagePrivate::addNecessarySpace(RegionTextList tree){
// we will use the concept of line and line sorting here once again
/**
1. we will first add spaces regionWise
2. Then we will sort all the texts in the region by Y
3. After that, we will create a line containing all overlapping Y
4. Now, we will sort texts in every line by X
5. And, finally we will extract all the space separated texts from each region and
1. We will sort all the texts in the region by Y
2. After that, we will create a line containing all overlapping Y
3. Now, we will sort texts in every line by X
4. We will now add spaces between two words in a line
5. And, then we will extract all the space separated texts from each region and
make m_words nice again.
6. Then we will merge all the texts from every region to make one TextList and assign it to m_words
**/
// m_spaces;m_words;
......@@ -2089,38 +2064,21 @@ void TextPagePrivate::addNecessarySpace(RegionTextList tree){
// we will only change the texts under RegionTexts, not the area
for(j = 0 ; j < tree.length() ; j++){
RegionText tmp = tree.at(j);
QRect area = tmp.area();
TextList tmpList = tmp.text();
// 1. adding space
// TextList::Iterator it1 = m_tmp_words.begin(), itEnd1 = m_tmp_words.end();
// for( ; it1 != itEnd1 ; it1++){
// QRect entArea = (*it1)->area.geometry(pageWidth,pageHeight);
// QPoint center = entArea.center();
// QString text = (*it1)->text();
// // if some space is in the region, add its TinyTextEntity to the tmpList
// if(area.contains(center) && text == spaceStr){
// tmpList.append((*it1));
// }
// }
// now we have to keep tmpList in order and then set tmp with the tmpList
// 2. sorting by Y
// 1. sorting by Y
qSort(tmpList.begin(),tmpList.end(),compareTinyTextEntityY);
//print the tmpList
cout << "printing the tmpList " << " ..................................... " << endl;
for( i = 0 ; i < tmpList.length() ; i++){
TinyTextEntity* ent = tmpList.at(i);
cout << ent->text().toAscii().data();
}
cout << endl << endl;
// cout << "printing the tmpList " << " ..................................... " << endl;
// for( i = 0 ; i < tmpList.length() ; i++){
// TinyTextEntity* ent = tmpList.at(i);
// cout << ent->text().toAscii().data();
// }
// cout << endl << endl;
// 3. create line by Y overlap
// 2. create line by Y overlap
TextList::Iterator it = tmpList.begin(), itEnd = tmpList.end();
int newLeft,newRight,newTop,newBottom;
......@@ -2180,7 +2138,6 @@ void TextPagePrivate::addNecessarySpace(RegionTextList tree){
}
// // when we have found a new line
// // create a new TextList containing only one element and append it to the m_lines
if(!found){
TextList tmp;
tmp.append((*it));
......@@ -2189,7 +2146,7 @@ void TextPagePrivate::addNecessarySpace(RegionTextList tree){
}
}
// 4. sort texts in each line by X
// 3. sort texts in each line by X
for(i = 0 ; i < m_lines.length() ; i++){
TextList list = m_lines.at(i);
......@@ -2197,10 +2154,10 @@ void TextPagePrivate::addNecessarySpace(RegionTextList tree){
qSort(list.begin(),list.end(),compareTinyTextEntityX);
m_lines.replace(i,list);
printTextList(i,list);
// printTextList(i,list);
}
// Bonus ;): Now, we add space in between texts in a region
// 4. Now, we add space in between texts in a region
for(i = 0 ; i < m_lines.length() ; i++){
TextList list = m_lines.at(i);
......@@ -2258,26 +2215,74 @@ void TextPagePrivate::addNecessarySpace(RegionTextList tree){
tree.replace(j,tmp);
}
TextList tmp;
int count = 0;
// Merge all the texts from each region
TextList tmp;
for(i = 0 ; i < tree.length() ; i++){
TextList list = tree.at(i).text();
cout << "node: " << i << endl << endl;
for(j = 0 ; j < list.length() ; j++){
TinyTextEntity *ent = list.at(j);
cout << ent->text().toAscii().data();
if(ent->text() == spaceStr)
count++;
tmp.append(ent);
}
cout << endl << endl;
}
copy(tmp);
// break the words back into the characters/smallest parts they were originally made of
while(tmp.length()) tmp.pop_back();
int count = 0;
for(int i = 0 ; i < m_words.length() ; i++){
TinyTextEntity *ent = m_words.at(i);
QRect rect = ent->area.geometry(pageWidth,pageHeight);
// a space contains only one character, so it needs no lookup and is appended as is
if(ent->text() == spaceStr){
tmp.append(ent);
}
else{
int key = rect.left() * rect.top()
+ rect.right() * rect.bottom();
RegionText word_text = m_word_chars_map.value(key);
TextList list = word_text.text();
count = m_word_chars_map.count(key);
if(count > 1){
cout << "count : " << count << endl;
QMap<int, RegionText>::iterator it = m_word_chars_map.find(key);
while( it != m_word_chars_map.end() && it.key() == key ){
word_text = it.value();
it++;
list = word_text.text();
QRect regionRect = word_text.area();
if(regionRect.left() == rect.left() && regionRect.top() == rect.top())
break;
}
}
tmp.append(list);
}
}
copy(tmp);
// print the final text
for( i = 0 ; i < m_words.length() ; i++){
TinyTextEntity* ent = m_words.at(i);
cout << ent->text().toAscii().data();
}
}
......@@ -25,9 +25,6 @@ namespace Okular
class PagePrivate;
typedef QList< TinyTextEntity* > TextList;
/** list of RegionText -- keeps a bunch of TextList with their bounding rectangles **/
typedef QList<RegionText> RegionTextList;
typedef bool ( *TextComparisonFunction )( const QStringRef & from, const QStringRef & to,
int *fromLength, int *toLength );
......@@ -39,6 +36,8 @@ We will make a line of TextList and also store the bounding rectangle of line
typedef QList<TextList> SortedTextList;
typedef QList<QRect> LineRect;
/** list of RegionText -- keeps a bunch of TextList with their bounding rectangles **/
typedef QList<RegionText> RegionTextList;
class TextPagePrivate
{
......@@ -68,6 +67,8 @@ class TextPagePrivate
**/
void addNecessarySpace(RegionTextList tree);
QMap<int, RegionText> m_word_chars_map;
RegionTextList m_region_words;
TextList m_spaces;
TextList m_words;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment