.gitignore

-Original file line number
+Diff line change
@@ -0,0 +1,3 @@
+    .vscode
+    *.o
+    src/hunalign/hunalign

src/hunalign/alignerTool.cpp

-Original file line number
+Diff line change
@@ Expand Up / @@ -69,6 +69,8 @@ class AlignParameters @@
       int minCooccForDictBuild;
       bool utfCharCountingMode;
+      bool noNumberScoreBoost;
       std::string autoDictionaryDumpFilename; // Empty string means do not dump.
     };
@@ Expand Down Expand Up @@
       AlignMatrix similarityMatrix( huBookSize, enBookSize, thickness, outsideOfRadiusValue );
-      sentenceListsToAlignMatrixIdentity( huSentenceListGarbled, enSentenceListGarbled, similarityMatrix );
+      sentenceListsToAlignMatrixIdentity( huSentenceListGarbled, enSentenceListGarbled, similarityMatrix, alignParameters.noNumberScoreBoost );
       std::cerr << std::endl;
       // temporaryDumpOfAlignMatrix( std::cerr, similarityMatrix );
@@ Expand Down Expand Up @@
                                          huSentenceListPretty,  enSentenceList,
                                          huSentenceListGarbled, enSentenceListGarbled );
-              sentenceListsToAlignMatrixIdentity( huSentenceListGarbled, enSentenceListGarbled, similarityMatrixDetailed );
+              sentenceListsToAlignMatrixIdentity( huSentenceListGarbled, enSentenceListGarbled, similarityMatrixDetailed, alignParameters.noNumberScoreBoost );
             }
           }
@@ Expand Down Expand Up / @@ -708,6 +710,12 @@ int main_alignerTool(int argC, char* argV[]) @@
           alignParameters.realignType = AlignParameters::UpgradeDictRealign;
         }
+        if (args.getSwitchCompact("nonumberscoreboost"))
+        {
+          alignParameters.noNumberScoreBoost = true;
+        }
         bool batchMode = args.getSwitchCompact("batch") ;
         bool justDictBuilding = false;
@@ Expand Down @@

src/hunalign/bookToMatrix.cpp

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -61,7 +61,7 @@ bool isNumber( const std::string& s )
  
    }

    // (!!!) We assert that sx and sy are ordered sets of Word-s!

    int specializedIntersectionSize( const WordList& sx, const WordList& sy )

    int specializedIntersectionSize( const WordList& sx, const WordList& sy, bool noNumberScoreBoost )

    {

      int inter=0;

      WordList::const_iterator sxt = sx.begin();

    @@ -103,9 +103,12 @@ int specializedIntersectionSize( const WordList& sx, const WordList& sy )
  
      }

      // TODO: why exactly this?

      if ( (numberOfSameNumbers>0) && ( numberOfDifferingNumbers <= numberOfSameNumbers/5 ) )

      if (!noNumberScoreBoost) 

      {

        inter += 10;

        if ( (numberOfSameNumbers>0) && ( numberOfDifferingNumbers <= numberOfSameNumbers/5 ) )

        {

          inter += 10;

        }

      }

      return inter;

    @@ -151,12 +154,12 @@ bool exceptionalScoring( const Phrase& hu, const Phrase& en, double& score )
  
    const double maximumScore = 3.0;

    double scoreByIdentity( const Phrase& hu, const Phrase& en )

    double scoreByIdentity( const Phrase& hu, const Phrase& en, bool noNumberScoreBoost )

    {

      double score = 0;

      if ( ! exceptionalScoring( hu, en, score ) )

      {

        score = specializedIntersectionSize( hu, en );

        score = specializedIntersectionSize( hu, en, noNumberScoreBoost );

        // If we divide with max here, we are better at avoiding global mistakes.

        // If we divide with min here, we are better at avoiding local mistakes.

    @@ -179,7 +182,7 @@ double scoreByIdentity( const Phrase& hu, const Phrase& en )
  
    //x // I changed this from 3.0 to 5.0 when I replaced the minimum with the maximum below.

    //x const double maximumScore = 5.0;

    //x 

    //x double scoreByIdentity( const Phrase& hu, const Phrase& en )

    //x double scoreByIdentity( const Phrase& hu, const Phrase& en, bool noNumberScoreBoost )

    //x {

    //x   double score = 0;

    //x   if ( ! exceptionalScoring( hu, en, score ) )

    @@ -195,7 +198,7 @@ double scoreByIdentity( const Phrase& hu, const Phrase& en )
  
    //x   return score;

    //x }

    void sentenceListsToAlignMatrixIdentity( const SentenceList& huSentenceList, const SentenceList& enSentenceList, AlignMatrix& alignMatrix )

    void sentenceListsToAlignMatrixIdentity( const SentenceList& huSentenceList, const SentenceList& enSentenceList, AlignMatrix& alignMatrix, bool noNumberScoreBoost )

    {

      int huPos,enPos;

    @@ -211,7 +214,7 @@ void sentenceListsToAlignMatrixIdentity( const SentenceList& huSentenceList, con
  
          const Phrase& hu = huSentenceList[huPos].words;

          const Phrase& en = enSentenceList[enPos].words;

          alignMatrix.setCell( huPos, enPos, scoreByIdentity(hu,en) );

          alignMatrix.setCell( huPos, enPos, scoreByIdentity(hu, en, noNumberScoreBoost) );

        }

        bool rarelyLogging = true;

src/hunalign/bookToMatrix.h

-Original file line number
+Diff line change
@@ Expand Up / @@ -26,11 +26,11 @@ bool isParagraph( const Phrase& phrase ); @@
     // (!!!) We assert that sx and sy are ordered sets of Word-s!
     int intersectionSize( const WordList& sx, const WordList& sy );
-    void sentenceListsToAlignMatrixIdentity( const SentenceList& huSentenceList, const SentenceList& enSentenceList, AlignMatrix& alignMatrix );
+    void sentenceListsToAlignMatrixIdentity( const SentenceList& huSentenceList, const SentenceList& enSentenceList, AlignMatrix& alignMatrix, bool noNumberScoreBoost );
     class TransLex;
-    double scoreByIdentity( const Phrase& hu, const Phrase& en );
+    double scoreByIdentity( const Phrase& hu, const Phrase& en, bool noNumberScoreBoost );
     double scoreByTranslation( const Phrase& hu, const Phrase& en, const TransLex& transLex );
@@ Expand Down @@

src/hunalign/help.h

-Original file line number
+Diff line change
@@ Expand Up / @@ -47,6 +47,12 @@ Arguments:\n\ @@
     -autodict=filename\n\
     	The dictionary built during realign is saved to this file. By default, it is not saved.\n\
     \n\
+    -nonumberscoreboost\n\
+    	If this option is not set (default), lines that contain exactly the\n\
+    	same numbers will be given a very high priority when being matched.\n\
+    	Use this flag to disable this behaviour (e.g. when you need to match\n\
+    	decimal chapter numbers to Roman ones).\n\
+    \n\
     \n\
     -onebyteencoding\n\
     	The system uses the character counts of the sentences as information for the\n\
@@ Expand Down @@

src/hunalign/oldAlignTest.cpp

-Original file line number
+Diff line change
@@ Expand Up / @@ -462,7 +462,7 @@ void main_alignTest() @@
       AlignMatrix alignMatrix( huBookSize, enBookSize, thickness );
-      sentenceListsToAlignMatrixIdentity( huSentenceList, enSentenceList, alignMatrix );
+      sentenceListsToAlignMatrixIdentity( huSentenceList, enSentenceList, alignMatrix, false );
       bool visualize = false;
       bool graphical = false;
@@ Expand Down @@

src/hunalign/similarityEvaluator.cpp

-Original file line number
+Diff line change
@@ Expand Up / @@ -69,7 +69,7 @@ class IdentityScorer : public SimilarityScorer @@
     public:
       double operator()( const Phrase& hu, const Phrase& en ) const
       {
-        return scoreByIdentity(hu,en);
+        return scoreByIdentity(hu, en, false);
       }
     };
@@ Expand Down @@

Add a flag to disable match score boosting for numbers #3

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open

kyegupov wants to merge 2 commits into danielvarga:master from kyegupov:number-matching-optional

-Original file line number
+Diff line change
@@ -0,0 +1,3 @@
+    .vscode
+    *.o
+    src/hunalign/hunalign

-Original file line number
+Diff line change
@@ Expand Up / @@ -69,6 +69,8 @@ class AlignParameters @@
       int minCooccForDictBuild;
       bool utfCharCountingMode;
+      bool noNumberScoreBoost;
       std::string autoDictionaryDumpFilename; // Empty string means do not dump.
     };
@@ Expand Down Expand Up @@
       AlignMatrix similarityMatrix( huBookSize, enBookSize, thickness, outsideOfRadiusValue );
-      sentenceListsToAlignMatrixIdentity( huSentenceListGarbled, enSentenceListGarbled, similarityMatrix );
+      sentenceListsToAlignMatrixIdentity( huSentenceListGarbled, enSentenceListGarbled, similarityMatrix, alignParameters.noNumberScoreBoost );
       std::cerr << std::endl;
       // temporaryDumpOfAlignMatrix( std::cerr, similarityMatrix );
@@ Expand Down Expand Up @@
                                          huSentenceListPretty,  enSentenceList,
                                          huSentenceListGarbled, enSentenceListGarbled );
-              sentenceListsToAlignMatrixIdentity( huSentenceListGarbled, enSentenceListGarbled, similarityMatrixDetailed );
+              sentenceListsToAlignMatrixIdentity( huSentenceListGarbled, enSentenceListGarbled, similarityMatrixDetailed, alignParameters.noNumberScoreBoost );
             }
           }
@@ Expand Down Expand Up / @@ -708,6 +710,12 @@ int main_alignerTool(int argC, char* argV[]) @@
           alignParameters.realignType = AlignParameters::UpgradeDictRealign;
         }
+        if (args.getSwitchCompact("nonumberscoreboost"))
+        {
+          alignParameters.noNumberScoreBoost = true;
+        }
         bool batchMode = args.getSwitchCompact("batch") ;
         bool justDictBuilding = false;
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -26,11 +26,11 @@ bool isParagraph( const Phrase& phrase ); @@
     // (!!!) We assert that sx and sy are ordered sets of Word-s!
     int intersectionSize( const WordList& sx, const WordList& sy );
-    void sentenceListsToAlignMatrixIdentity( const SentenceList& huSentenceList, const SentenceList& enSentenceList, AlignMatrix& alignMatrix );
+    void sentenceListsToAlignMatrixIdentity( const SentenceList& huSentenceList, const SentenceList& enSentenceList, AlignMatrix& alignMatrix, bool noNumberScoreBoost );
     class TransLex;
-    double scoreByIdentity( const Phrase& hu, const Phrase& en );
+    double scoreByIdentity( const Phrase& hu, const Phrase& en, bool noNumberScoreBoost );
     double scoreByTranslation( const Phrase& hu, const Phrase& en, const TransLex& transLex );
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -47,6 +47,12 @@ Arguments:\n\ @@
     -autodict=filename\n\
     	The dictionary built during realign is saved to this file. By default, it is not saved.\n\
     \n\
+    -nonumberscoreboost\n\
+    	If this option is not set (default), lines that contain exactly the\n\
+    	same numbers will be given a very high priority when being matched.\n\
+    	Use this flag to disable this behaviour (e.g. when you need to match\n\
+    	decimal chapter numbers to Roman ones).\n\
+    \n\
     \n\
     -onebyteencoding\n\
     	The system uses the character counts of the sentences as information for the\n\
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -462,7 +462,7 @@ void main_alignTest() @@
       AlignMatrix alignMatrix( huBookSize, enBookSize, thickness );
-      sentenceListsToAlignMatrixIdentity( huSentenceList, enSentenceList, alignMatrix );
+      sentenceListsToAlignMatrixIdentity( huSentenceList, enSentenceList, alignMatrix, false );
       bool visualize = false;
       bool graphical = false;
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -69,7 +69,7 @@ class IdentityScorer : public SimilarityScorer @@
     public:
       double operator()( const Phrase& hu, const Phrase& en ) const
       {
-        return scoreByIdentity(hu,en);
+        return scoreByIdentity(hu, en, false);
       }
     };
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add a flag to disable match score boosting for numbers #3

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Add a flag to disable match score boosting for numbers #3

Are you sure you want to change the base?

Uh oh!

Add a flag to disable match score boosting for numbers #3

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing