Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
.vscode
*.o
src/hunalign/hunalign
12 changes: 10 additions & 2 deletions src/hunalign/alignerTool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ class AlignParameters
int minCooccForDictBuild;

bool utfCharCountingMode;

bool noNumberScoreBoost;

std::string autoDictionaryDumpFilename; // Empty string means do not dump.
};
Expand Down Expand Up @@ -291,7 +293,7 @@ double alignerToolWithObjects( const DictionaryItems& dictionary,

AlignMatrix similarityMatrix( huBookSize, enBookSize, thickness, outsideOfRadiusValue );

sentenceListsToAlignMatrixIdentity( huSentenceListGarbled, enSentenceListGarbled, similarityMatrix );
sentenceListsToAlignMatrixIdentity( huSentenceListGarbled, enSentenceListGarbled, similarityMatrix, alignParameters.noNumberScoreBoost );
std::cerr << std::endl;

// temporaryDumpOfAlignMatrix( std::cerr, similarityMatrix );
Expand Down Expand Up @@ -411,7 +413,7 @@ double alignerToolWithObjects( const DictionaryItems& dictionary,
huSentenceListPretty, enSentenceList,
huSentenceListGarbled, enSentenceListGarbled );

sentenceListsToAlignMatrixIdentity( huSentenceListGarbled, enSentenceListGarbled, similarityMatrixDetailed );
sentenceListsToAlignMatrixIdentity( huSentenceListGarbled, enSentenceListGarbled, similarityMatrixDetailed, alignParameters.noNumberScoreBoost );
}
}

Expand Down Expand Up @@ -708,6 +710,12 @@ int main_alignerTool(int argC, char* argV[])
alignParameters.realignType = AlignParameters::UpgradeDictRealign;
}

if (args.getSwitchCompact("nonumberscoreboost"))
{
alignParameters.noNumberScoreBoost = true;
}


bool batchMode = args.getSwitchCompact("batch") ;

bool justDictBuilding = false;
Expand Down
19 changes: 11 additions & 8 deletions src/hunalign/bookToMatrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ bool isNumber( const std::string& s )
}

// (!!!) We assert that sx and sy are ordered sets of Word-s!
int specializedIntersectionSize( const WordList& sx, const WordList& sy )
int specializedIntersectionSize( const WordList& sx, const WordList& sy, bool noNumberScoreBoost )
{
int inter=0;
WordList::const_iterator sxt = sx.begin();
Expand Down Expand Up @@ -103,9 +103,12 @@ int specializedIntersectionSize( const WordList& sx, const WordList& sy )
}

// TODO: why exactly this? (original Hungarian note: "miert pont")
if ( (numberOfSameNumbers>0) && ( numberOfDifferingNumbers <= numberOfSameNumbers/5 ) )
if (!noNumberScoreBoost)
{
inter += 10;
if ( (numberOfSameNumbers>0) && ( numberOfDifferingNumbers <= numberOfSameNumbers/5 ) )
{
inter += 10;
}
}

return inter;
Expand Down Expand Up @@ -151,12 +154,12 @@ bool exceptionalScoring( const Phrase& hu, const Phrase& en, double& score )

const double maximumScore = 3.0;

double scoreByIdentity( const Phrase& hu, const Phrase& en )
double scoreByIdentity( const Phrase& hu, const Phrase& en, bool noNumberScoreBoost )
{
double score = 0;
if ( ! exceptionalScoring( hu, en, score ) )
{
score = specializedIntersectionSize( hu, en );
score = specializedIntersectionSize( hu, en, noNumberScoreBoost );

// If we divide with max here, we are better at avoiding global mistakes.
// If we divide with min here, we are better at avoiding local mistakes.
Expand All @@ -179,7 +182,7 @@ double scoreByIdentity( const Phrase& hu, const Phrase& en )
//x // I changed this from 3.0 to 5.0 when I replaced the minimum with the maximum below.
//x const double maximumScore = 5.0;
//x
//x double scoreByIdentity( const Phrase& hu, const Phrase& en )
//x double scoreByIdentity( const Phrase& hu, const Phrase& en, bool noNumberScoreBoost )
//x {
//x double score = 0;
//x if ( ! exceptionalScoring( hu, en, score ) )
Expand All @@ -195,7 +198,7 @@ double scoreByIdentity( const Phrase& hu, const Phrase& en )
//x return score;
//x }

void sentenceListsToAlignMatrixIdentity( const SentenceList& huSentenceList, const SentenceList& enSentenceList, AlignMatrix& alignMatrix )
void sentenceListsToAlignMatrixIdentity( const SentenceList& huSentenceList, const SentenceList& enSentenceList, AlignMatrix& alignMatrix, bool noNumberScoreBoost )
{
int huPos,enPos;

Expand All @@ -211,7 +214,7 @@ void sentenceListsToAlignMatrixIdentity( const SentenceList& huSentenceList, con
const Phrase& hu = huSentenceList[huPos].words;
const Phrase& en = enSentenceList[enPos].words;

alignMatrix.setCell( huPos, enPos, scoreByIdentity(hu,en) );
alignMatrix.setCell( huPos, enPos, scoreByIdentity(hu, en, noNumberScoreBoost) );
}

bool rarelyLogging = true;
Expand Down
4 changes: 2 additions & 2 deletions src/hunalign/bookToMatrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@ bool isParagraph( const Phrase& phrase );
// (!!!) We assert that sx and sy are ordered sets of Word-s!
int intersectionSize( const WordList& sx, const WordList& sy );

void sentenceListsToAlignMatrixIdentity( const SentenceList& huSentenceList, const SentenceList& enSentenceList, AlignMatrix& alignMatrix );
void sentenceListsToAlignMatrixIdentity( const SentenceList& huSentenceList, const SentenceList& enSentenceList, AlignMatrix& alignMatrix, bool noNumberScoreBoost );

class TransLex;

double scoreByIdentity( const Phrase& hu, const Phrase& en );
double scoreByIdentity( const Phrase& hu, const Phrase& en, bool noNumberScoreBoost );

double scoreByTranslation( const Phrase& hu, const Phrase& en, const TransLex& transLex );

Expand Down
6 changes: 6 additions & 0 deletions src/hunalign/help.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,12 @@ Arguments:\n\
-autodict=filename\n\
The dictionary built during realign is saved to this file. By default, it is not saved.\n\
\n\
-nonumberscoreboost\n\
If this option is not set (default), lines that contain exactly the\n\
same numbers will be given a very high priority when being matched.\n\
Use this flag to disable this behaviour (e.g. when you need to match\n\
decimal chapter numbers to Roman ones).\n\
\n\
\n\
-onebyteencoding\n\
The system uses the character counts of the sentences as information for the\n\
Expand Down
2 changes: 1 addition & 1 deletion src/hunalign/oldAlignTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,7 @@ void main_alignTest()

AlignMatrix alignMatrix( huBookSize, enBookSize, thickness );

sentenceListsToAlignMatrixIdentity( huSentenceList, enSentenceList, alignMatrix );
sentenceListsToAlignMatrixIdentity( huSentenceList, enSentenceList, alignMatrix, false );

bool visualize = false;
bool graphical = false;
Expand Down
2 changes: 1 addition & 1 deletion src/hunalign/similarityEvaluator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class IdentityScorer : public SimilarityScorer
public:
double operator()( const Phrase& hu, const Phrase& en ) const
{
return scoreByIdentity(hu,en);
return scoreByIdentity(hu, en, false);
}
};

Expand Down