diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cfb3081 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.vscode +*.o +src/hunalign/hunalign \ No newline at end of file diff --git a/src/hunalign/alignerTool.cpp b/src/hunalign/alignerTool.cpp index b32810a..996c850 100644 --- a/src/hunalign/alignerTool.cpp +++ b/src/hunalign/alignerTool.cpp @@ -69,6 +69,8 @@ class AlignParameters int minCooccForDictBuild; bool utfCharCountingMode; + + bool noNumberScoreBoost; std::string autoDictionaryDumpFilename; // Empty string means do not dump. }; @@ -291,7 +293,7 @@ double alignerToolWithObjects( const DictionaryItems& dictionary, AlignMatrix similarityMatrix( huBookSize, enBookSize, thickness, outsideOfRadiusValue ); - sentenceListsToAlignMatrixIdentity( huSentenceListGarbled, enSentenceListGarbled, similarityMatrix ); + sentenceListsToAlignMatrixIdentity( huSentenceListGarbled, enSentenceListGarbled, similarityMatrix, alignParameters.noNumberScoreBoost ); std::cerr << std::endl; // temporaryDumpOfAlignMatrix( std::cerr, similarityMatrix ); @@ -411,7 +413,7 @@ double alignerToolWithObjects( const DictionaryItems& dictionary, huSentenceListPretty, enSentenceList, huSentenceListGarbled, enSentenceListGarbled ); - sentenceListsToAlignMatrixIdentity( huSentenceListGarbled, enSentenceListGarbled, similarityMatrixDetailed ); + sentenceListsToAlignMatrixIdentity( huSentenceListGarbled, enSentenceListGarbled, similarityMatrixDetailed, alignParameters.noNumberScoreBoost ); } } @@ -708,6 +710,10 @@ int main_alignerTool(int argC, char* argV[]) alignParameters.realignType = AlignParameters::UpgradeDictRealign; } + // Assign unconditionally so the flag has a defined value even when the switch is absent. + alignParameters.noNumberScoreBoost = args.getSwitchCompact("nonumberscoreboost"); + + bool batchMode = args.getSwitchCompact("batch") ; bool justDictBuilding = false; diff --git a/src/hunalign/bookToMatrix.cpp b/src/hunalign/bookToMatrix.cpp index c0d52a5..daf1673 100644 --- a/src/hunalign/bookToMatrix.cpp +++ 
b/src/hunalign/bookToMatrix.cpp @@ -61,7 +61,7 @@ bool isNumber( const std::string& s ) } // (!!!) We assert that sx and sy are ordered sets of Word-s! -int specializedIntersectionSize( const WordList& sx, const WordList& sy ) +int specializedIntersectionSize( const WordList& sx, const WordList& sy, bool noNumberScoreBoost ) { int inter=0; WordList::const_iterator sxt = sx.begin(); @@ -103,9 +103,12 @@ int specializedIntersectionSize( const WordList& sx, const WordList& sy ) } // TODO miert pont. - if ( (numberOfSameNumbers>0) && ( numberOfDifferingNumbers <= numberOfSameNumbers/5 ) ) + if (!noNumberScoreBoost) { - inter += 10; + if ( (numberOfSameNumbers>0) && ( numberOfDifferingNumbers <= numberOfSameNumbers/5 ) ) + { + inter += 10; + } } return inter; @@ -151,12 +154,12 @@ bool exceptionalScoring( const Phrase& hu, const Phrase& en, double& score ) const double maximumScore = 3.0; -double scoreByIdentity( const Phrase& hu, const Phrase& en ) +double scoreByIdentity( const Phrase& hu, const Phrase& en, bool noNumberScoreBoost ) { double score = 0; if ( ! exceptionalScoring( hu, en, score ) ) { - score = specializedIntersectionSize( hu, en ); + score = specializedIntersectionSize( hu, en, noNumberScoreBoost ); // If we divide with max here, we are better at avoiding global mistakes. // If we divide with min here, we are better at avoiding local mistakes. @@ -179,7 +182,7 @@ double scoreByIdentity( const Phrase& hu, const Phrase& en ) //x // Ezt akkor csereltem ki 3.0-rol 5.0-re, amikor a minimumot maximumra csreltem alabb. //x const double maximumScore = 5.0; //x -//x double scoreByIdentity( const Phrase& hu, const Phrase& en ) +//x double scoreByIdentity( const Phrase& hu, const Phrase& en, bool noNumberScoreBoost ) //x { //x double score = 0; //x if ( ! 
exceptionalScoring( hu, en, score ) ) @@ -195,7 +198,7 @@ double scoreByIdentity( const Phrase& hu, const Phrase& en ) //x return score; //x } -void sentenceListsToAlignMatrixIdentity( const SentenceList& huSentenceList, const SentenceList& enSentenceList, AlignMatrix& alignMatrix ) +void sentenceListsToAlignMatrixIdentity( const SentenceList& huSentenceList, const SentenceList& enSentenceList, AlignMatrix& alignMatrix, bool noNumberScoreBoost ) { int huPos,enPos; @@ -211,7 +214,7 @@ void sentenceListsToAlignMatrixIdentity( const SentenceList& huSentenceList, con const Phrase& hu = huSentenceList[huPos].words; const Phrase& en = enSentenceList[enPos].words; - alignMatrix.setCell( huPos, enPos, scoreByIdentity(hu,en) ); + alignMatrix.setCell( huPos, enPos, scoreByIdentity(hu, en, noNumberScoreBoost) ); } bool rarelyLogging = true; diff --git a/src/hunalign/bookToMatrix.h b/src/hunalign/bookToMatrix.h index be85dfc..5973130 100644 --- a/src/hunalign/bookToMatrix.h +++ b/src/hunalign/bookToMatrix.h @@ -26,11 +26,11 @@ bool isParagraph( const Phrase& phrase ); // (!!!) We assert that sx and sy are ordered sets of Word-s! 
int intersectionSize( const WordList& sx, const WordList& sy ); -void sentenceListsToAlignMatrixIdentity( const SentenceList& huSentenceList, const SentenceList& enSentenceList, AlignMatrix& alignMatrix ); +void sentenceListsToAlignMatrixIdentity( const SentenceList& huSentenceList, const SentenceList& enSentenceList, AlignMatrix& alignMatrix, bool noNumberScoreBoost ); class TransLex; -double scoreByIdentity( const Phrase& hu, const Phrase& en ); +double scoreByIdentity( const Phrase& hu, const Phrase& en, bool noNumberScoreBoost ); double scoreByTranslation( const Phrase& hu, const Phrase& en, const TransLex& transLex ); diff --git a/src/hunalign/help.h b/src/hunalign/help.h index 6950e3e..be48d1f 100644 --- a/src/hunalign/help.h +++ b/src/hunalign/help.h @@ -47,6 +47,12 @@ Arguments:\n\ -autodict=filename\n\ The dictionary built during realign is saved to this file. By default, it is not saved.\n\ \n\ +-nonumberscoreboost\n\ + If this option is not set (default), lines that contain exactly the\n\ + same numbers will be given a very high priority when being matched.\n\ + Use this flag to disable this behaviour (e.g. 
when you need to match\n\ + decimal chapter numbers to Roman ones).\n\ +\n\ \n\ -onebyteencoding\n\ The system uses the character counts of the sentences as information for the\n\ diff --git a/src/hunalign/oldAlignTest.cpp b/src/hunalign/oldAlignTest.cpp index 75bd3dd..66f24dd 100644 --- a/src/hunalign/oldAlignTest.cpp +++ b/src/hunalign/oldAlignTest.cpp @@ -462,7 +462,7 @@ void main_alignTest() AlignMatrix alignMatrix( huBookSize, enBookSize, thickness ); - sentenceListsToAlignMatrixIdentity( huSentenceList, enSentenceList, alignMatrix ); + sentenceListsToAlignMatrixIdentity( huSentenceList, enSentenceList, alignMatrix, false ); bool visualize = false; bool graphical = false; diff --git a/src/hunalign/similarityEvaluator.cpp b/src/hunalign/similarityEvaluator.cpp index 0e03771..31fb8b3 100644 --- a/src/hunalign/similarityEvaluator.cpp +++ b/src/hunalign/similarityEvaluator.cpp @@ -69,7 +69,7 @@ class IdentityScorer : public SimilarityScorer public: double operator()( const Phrase& hu, const Phrase& en ) const { - return scoreByIdentity(hu,en); + return scoreByIdentity(hu, en, false); } };