Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

testing data regen strategies #4790

Draft
wants to merge 6 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 19 additions & 35 deletions common/src/main/resources/reference.conf
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,11 @@ hmda {

rules {
yearly-filing {
years-allowed = "2018,2019,2020,2021,2022,2023,2024"
years-allowed = ${?RULES_YEARLY_FILING_YEARS_ALLOWED}
}
years-allowed = "2020,2021,2022,2023"
}

quarterly-filing {
years-allowed = "2020,2021,2022,2023,2024"
years-allowed = ${?RULES_QF_FILING_YEARS_ALLOWED}
years-allowed = "2020,2021,2022,2023"

q1 {
start = "April 01"
Expand Down Expand Up @@ -255,37 +253,23 @@ hmda {
endpoint = ${?KAFKA_SSL_ENDPOINT}
}
topics {
institutionTopic = "institution"
institutionTopic = ${?KAFKA_INSTITUTIONS_TOPIC}
signTopic = "hmda-sign"
signTopic = ${?KAFKA_SIGN_TOPIC}
modifiedLarTopic = "hmda-modified-lar"
modifiedLarTopic = ${?KAFKA_MODIFIED_LAR_TOPIC}
irsTopic = "hmda-irs"
irsTopic = ${?KAFKA_IRS_TOPIC}
analyticsTopic = "hmda-analytics"
analyticsTopic = ${?KAFKA_ANALYTICS_TOPIC}
disclosureTopic = "hmda-spark-disclosure"
disclosureTopic = ${?KAFKA_DISCLOSURE_TOPIC}
adTopic = "hmda-spark-ad"
adTopic = ${?KAFKA_AD_TOPIC}
emailTopic = "hmda-email"
emailTopic = ${?KAFKA_EMAIL_TOPIC}
}
institutionTopic = "institution-regen"
signTopic = "hmda-sign-regen"
modifiedLarTopic = "hmda-modified-lar-regen"
irsTopic = "hmda-irs-regen"
analyticsTopic = "hmda-analytics-regen"
disclosureTopic = "hmda-spark-disclosure-regen"
adTopic = "hmda-spark-ad-regen"
emailTopic = "hmda-email-regen"
}
groups {
emailGroup = "email-group"
emailGroup = ${?KAFKA_EMAIL_GROUP}
modifiedLarGroup = "modified-lar-group"
modifiedLarGroup = ${?KAFKA_MODIFIED_LAR_GROUP}
analyticsGroup = "analytics-group"
analyticsGroup = ${?KAFKA_ANALYTICS_GROUP}
irsGroup = "irs-group"
irsGroup = ${?KAFKA_IRS_GROUP}
institutionsGroup = "institutions-group"
institutionsGroup = ${?KAFKA_INSTITUTIONS_GROUP}
submissionErrorsGroup = "triggered-quality-edits-group"
submissionErrorsGroup = ${?TRIGGERED_QUALITY_EDITS_GROUP}
}
emailGroup = "email-group-regen"
modifiedLarGroup = "modified-lar-group-regen"
analyticsGroup = "analytics-group-regen"
irsGroup = "irs-group-regen"
institutionsGroup = "institutions-group-regen"
submissionErrorsGroup = "triggered-quality-edits-group-regen"
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ sealed trait LarFormatValidator extends LarParser {
val occupancy = values(8)
val loanAmount = values(9)
val actionTaken = values(10)
val actionTakenDate = values(11)
val actionTakenDate = values(11).replaceFirst("^0+(?!$)", "")
val street = values(12)
val city = values(13)
val state = values(14)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,14 @@ class ModifiedLarRepository(databaseConfig: DatabaseConfig[JdbcProfile]) {

/**
 * Maps a filing year to the name of the modified-LAR table for that year.
 *
 * NOTE(review): every `case` below appears twice, first with the plain
 * `modifiedlarYYYY` name and then with a `_regen_2024` suffix. This looks like
 * the removed and added lines of a diff shown together; confirm which set is
 * intended. Cases are tried top to bottom, so as written only the first
 * (unsuffixed) set and the first `case _` can ever match.
 *
 * @param year filing year to look up
 * @return the table name for `year`; any year not listed falls through to the
 *         `case _` default, which names the 2021 table
 */
def fetchYearTable(year: Int): String =
year match {
case 2018 => "modifiedlar2018"
case 2019 => "modifiedlar2019"
case 2020 => "modifiedlar2020"
case 2021 => "modifiedlar2021"
case 2022 => "modifiedlar2022"
case 2023 => "modifiedlar2023"
case 2018 => "modifiedlar2018_regen_2024"
case 2019 => "modifiedlar2019_regen_2024"
case 2020 => "modifiedlar2020_regen_2024"
case 2021 => "modifiedlar2021_regen_2024"
case 2022 => "modifiedlar2022_regen_2024"
case 2023 => "modifiedlar2023_regen_2024"

// Fallback for any year without an explicit case above.
case _ => "modifiedlar2021"
case _ => "modifiedlar2021_regen_2024"
}

/**
Expand Down
167 changes: 59 additions & 108 deletions hmda-analytics/src/main/resources/application.conf
Original file line number Diff line number Diff line change
Expand Up @@ -56,122 +56,73 @@ kafka {
hmda {
analytics {
parallelism = 1
tsDeletion = true
tsDeletion = false
tsDeletion = ${?HMDA-TS-DELETE}
larDeletion = true
larDeletion = false
larDeletion = ${?HMDA-LAR-DELETE}
historyInsertion = true
historyInsertion = ${?HMDA-HISTORY-INSERT}
yearsAvailable = "2018,2019,2020,2021,2022,2023,2024"
yearsAvailable = ${?ANALYTICS_YEARS_AVAILABLE}
2024 {
tsTableName = "transmittalsheet2024"
tsTableName = ${?TS_TABLE_NAME_2024}
tsTableNameQ1 = "ts2024_q1"
tsTableNameQ1 = ${?TS_TABLE_NAME_2024_Q1}
tsTableNameQ2 = "ts2024_q2"
tsTableNameQ2 = ${?TS_TABLE_NAME_2024_Q2}
tsTableNameQ3 = "ts2024_q3"
tsTableNameQ3 = ${?TS_TABLE_NAME_2024_Q1}
larTableName = "loanapplicationregister2024"
larTableName = ${?LAR_TABLE_NAME_2024}
larTableNameQ1 = "lar2024_q1"
larTableNameQ1 = ${?LAR_TABLE_NAME_2024_Q1}
larTableNameQ2 = "lar2024_q2"
larTableNameQ2 = ${?LAR_TABLE_NAME_2024_Q2}
larTableNameQ3 = "lar2024_q3"
larTableNameQ3 = ${?LAR_TABLE_NAME_2024_Q3}
}
historyInsertion = false
yearsAvailable = "2018,2019,2020,2021,2022,2023,2024"
2024 {
tsTableName = "transmittalsheet2024_regen_2024"
tsTableNameQ1 = "ts2024_q1_regen_2024"
tsTableNameQ2 = "ts2024_q2_regen_2024"
tsTableNameQ3 = "ts2024_q3_regen_2024"
larTableName = "loanapplicationregister2024_regen_2024"
larTableNameQ1 = "lar2024_q1_regen_2024"
larTableNameQ2 = "lar2024_q2_regen_2024"
larTableNameQ3 = "lar2024_q3_regen_2024"
}
2023 {
tsTableName = "transmittalsheet2023"
tsTableName = ${?TS_TABLE_NAME_2023}
tsTableNameQ1 = "ts2023_q1"
tsTableNameQ1 = ${?TS_TABLE_NAME_2023_Q1}
tsTableNameQ2 = "ts2023_q2"
tsTableNameQ2 = ${?TS_TABLE_NAME_2023_Q2}
tsTableNameQ3 = "ts2023_q3"
tsTableNameQ3 = ${?TS_TABLE_NAME_2023_Q1}
larTableName = "loanapplicationregister2023"
larTableName = ${?LAR_TABLE_NAME_2023}
larTableNameQ1 = "lar2023_q1"
larTableNameQ1 = ${?LAR_TABLE_NAME_2023_Q1}
larTableNameQ2 = "lar2023_q2"
larTableNameQ2 = ${?LAR_TABLE_NAME_2023_Q2}
larTableNameQ3 = "lar2023_q3"
larTableNameQ3 = ${?LAR_TABLE_NAME_2023_Q3}
}
tsTableName = "transmittalsheet2023_regen_2024"
tsTableNameQ1 = "ts2023_q1_regen_2024"
tsTableNameQ2 = "ts2023_q2_regen_2024"
tsTableNameQ3 = "ts2023_q3_regen_2024"
larTableName = "loanapplicationregister2023_regen_2024"
larTableNameQ1 = "lar2023_q1_regen_2024"
larTableNameQ2 = "lar2023_q2_regen_2024"
larTableNameQ3 = "lar2023_q3_regen_2024"
}
2022 {
tsTableName = "transmittalsheet2022"
tsTableName = ${?TS_TABLE_NAME_2022}
larTableName = "loanapplicationregister2022"
larTableName = ${?LAR_TABLE_NAME_2022}
tsTableNameQ1 = "ts2022_q1"
tsTableNameQ1 = ${?TS_TABLE_NAME_2022_Q1}
tsTableNameQ2 = "ts2022_q2"
tsTableNameQ2 = ${?TS_TABLE_NAME_2022_Q2}
tsTableNameQ3 = "ts2022_q3"
tsTableNameQ3 = ${?TS_TABLE_NAME_2022_Q1}
larTableNameQ1 = "lar2022_q1"
larTableNameQ1 = ${?LAR_TABLE_NAME_2022_Q1}
larTableNameQ2 = "lar2022_q2"
larTableNameQ2 = ${?LAR_TABLE_NAME_2022_Q2}
larTableNameQ3 = "lar2022_q3"
larTableNameQ3 = ${?LAR_TABLE_NAME_2022_Q3}
}
tsTableName = "transmittalsheet2022_regen_2024"
larTableName = "loanapplicationregister2022_regen_2024"
tsTableNameQ1 = "ts2022_q1_regen_2024"
tsTableNameQ2 = "ts2022_q2_regen_2024"
tsTableNameQ3 = "ts2022_q3_regen_2024"
larTableNameQ1 = "lar2022_q1_regen_2024"
larTableNameQ2 = "lar2022_q2_regen_2024"
larTableNameQ3 = "lar2022_q3_regen_2024"
}
2021 {
tsTableName = "transmittalsheet2021"
tsTableName = ${?TS_TABLE_NAME_2021}
tsTableNameQ1 = "ts2021_q1"
tsTableNameQ1 = ${?TS_TABLE_NAME_2021_Q1}
tsTableNameQ2 = "ts2021_q2"
tsTableNameQ2 = ${?TS_TABLE_NAME_2021_Q2}
tsTableNameQ3 = "ts2021_q3"
tsTableNameQ3 = ${?TS_TABLE_NAME_2021_Q1}
larTableName = "loanapplicationregister2021"
larTableName = ${?LAR_TABLE_NAME_2021}
larTableNameQ1 = "lar2021_q1"
larTableNameQ1 = ${?LAR_TABLE_NAME_2021_Q1}
larTableNameQ2 = "lar2021_q2"
larTableNameQ2 = ${?LAR_TABLE_NAME_2021_Q2}
larTableNameQ3 = "lar2021_q3"
larTableNameQ3 = ${?LAR_TABLE_NAME_2021_Q3}
}
tsTableName = "transmittalsheet2021_regen_2024"
tsTableNameQ1 = "ts2021_q1_regen_2024"
tsTableNameQ2 = "ts2021_q2_regen_2024"
tsTableNameQ3 = "ts2021_q3_regen_2024"
larTableName = "loanapplicationregister2021_regen_2024"
larTableNameQ1 = "lar2021_q1_regen_2024"
larTableNameQ2 = "lar2021_q2_regen_2024"
larTableNameQ3 = "lar2021_q3_regen_2024"
}
2020 {
tsTableName = "transmittalsheet2020"
tsTableName = ${?TS_TABLE_NAME_2020}
tsTableNameQ1 = "ts2020_q1"
tsTableNameQ1 = ${?TS_TABLE_NAME_2020_Q1}
tsTableNameQ2 = "ts2020_q2"
tsTableNameQ2 = ${?TS_TABLE_NAME_2020_Q2}
tsTableNameQ3 = "ts2020_q3"
tsTableNameQ3 = ${?TS_TABLE_NAME_2020_Q3}
larTableName = "loanapplicationregister2020"
larTableName = ${?LAR_TABLE_NAME_2020}
larTableNameQ1 = "lar2020_q1"
larTableNameQ1 = ${?LAR_TABLE_NAME_2020_Q1}
larTableNameQ2 = "lar2020_q2"
larTableNameQ2 = ${?LAR_TABLE_NAME_2020_Q2}
larTableNameQ3 = "lar2020_q3"
larTableNameQ3 = ${?LAR_TABLE_NAME_2020_Q3}
historyTableName = "submission_history"
historyTableName = ${?HIST_TABLE_NAME}
}
tsTableName = "transmittalsheet2020_regen_2024"
tsTableNameQ1 = "ts2020_q1_regen_2024"
tsTableNameQ2 = "ts2020_q2_regen_2024"
tsTableNameQ3 = "ts2020_q3_regen_2024"
larTableName = "loanapplicationregister2020_regen_2024"
larTableNameQ1 = "lar2020_q1_regen_2024"
larTableNameQ2 = "lar2020_q2_regen_2024"
larTableNameQ3 = "lar2020_q3_regen_2024"
historyTableName = "submission_history_regen_2024"
}
2019 {
tsTableName = "transmittalsheet2019"
tsTableName = ${?TS_TABLE_NAME_2019}
larTableName = "loanapplicationregister2019"
larTableName = ${?LAR_TABLE_NAME_2019}
historyTableName = "submission_history"
historyTableName = ${?HIST_TABLE_NAME}
}
tsTableName = "transmittalsheet2019_regen_2024"
larTableName = "loanapplicationregister2019_regen_2024"
historyTableName = "submission_history"
}
2018 {
tsTableName = "transmittalsheet2018"
tsTableName = ${?TS_TABLE_NAME_2018}
larTableName = "loanapplicationregister2018"
larTableName = ${?LAR_TABLE_NAME_2018}
historyTableName = "submission_history"
historyTableName = ${?HIST_TABLE_NAME}
}
tsTableName = "transmittalsheet2018_regen_2024"
larTableName = "loanapplicationregister2018_regen_2024"
historyTableName = "submission_history"
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ object SnapshotCheck {
def pathSelector(s3Path: String,fileName:String): String = {
if(snapshotActive){
if (fileName.takeRight(3) == "zip"){
val snapshotFile=fileName.replace(".zip","_snapshot.zip")
val snapshotFile=fileName.replace(".zip","_snapshot.zip")
snapshotPath+snapshotFile
}
else{
Expand Down
6 changes: 2 additions & 4 deletions irs-publisher/src/main/resources/application.conf
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,10 @@ kafka {
aws {
access-key-id = ""
access-key-id = ${?AWS_ACCESS_KEY_ID}
secret-access-key = ""
secret-access-key = ""
secret-access-key = ${?AWS_SECRET_ACCESS_KEY}
environment = "dev" //change to "prod" for production S3 bucket
environment = ${?AWS_ENVIRONMENT}
environment = "dev/regen" //change to "prod" for production S3 bucket
public-bucket = "cfpb-hmda-public"
public-bucket = ${?S3_PUBLIC_BUCKET}
region = "us-east-1"
region = ${?AWS_REGION}
}
Expand Down
7 changes: 2 additions & 5 deletions modified-lar/src/main/resources/application.conf
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,8 @@ aws {
access-key-id = ${?AWS_ACCESS_KEY_ID}
secret-access-key = ""
secret-access-key = ${?AWS_SECRET_ACCESS_KEY}
environment = "dev" //change to "prod" for production S3 bucket
environment = ${?AWS_ENVIRONMENT}
environment = "dev/regen" //change to "prod" for production S3 bucket
public-bucket = "cfpb-hmda-public"
public-bucket = ${?S3_PUBLIC_BUCKET}
region = "us-east-1"
region = ${?AWS_REGION}
}
Expand All @@ -76,8 +74,7 @@ hmda {
lar {
modified {
parallelism = 1
regenerateMlar = false
regenerateMlar = ${IS_REGENERATE_MLAR}
regenerateMlar = true
generateS3Files = true
generateS3Files = ${?IS_GENERATE_MLAR_S3_FIES}
creteDispositionRecord = false
Expand Down
8 changes: 4 additions & 4 deletions modified-lar/src/main/resources/logback.xml
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@
<appender-ref ref="STDOUT" />
</appender>

<root level="INFO">
<root level="DEBUG">
<appender-ref ref="ASYNC" />
</root>
<logger name="org.apache.zookeeper" level="${ZOOKEEPER_LOG_LEVEL:-WARN}"/>
<logger name="org.apache.kafka" level="${KAFKA_LOG_LEVEL:-INFO}" />
<logger name="org.apache.cassandra" level="INFO" />
<logger name="org.apache.zookeeper" level="${ZOOKEEPER_LOG_LEVEL:-DEBUG}"/>
<logger name="org.apache.kafka" level="${KAFKA_LOG_LEVEL:-DEBUG}" />
<logger name="org.apache.cassandra" level="DEBUG" />
</configuration>
Original file line number Diff line number Diff line change
Expand Up @@ -176,17 +176,11 @@ object ModifiedLarPublisher {
val graphWithJustS3WithHeader = mlarSource.via(serializeMlar).prepend(mlarHeader).toMat(s3SinkWithHeader)(Keep.right)

val finalResult: Future[Unit] = for {
_ <- if (regenerateMlar)
_ <- if (true)
graphWithS3AndPG.run()
else if (isGenerateBothS3Files) {
removeLei
graphWithS3AndPG.run()
} else if (isJustGenerateS3File)
graphWithJustS3NoHeader.run()
else if (isJustGenerateS3FileHeader)
graphWithJustS3WithHeader.run()

else { //everything
removeLei

Future.sequence(List(graphWithJustS3NoHeader.run(), graphWithJustS3WithHeader.run(), graphWithJustPG.run()))
}
_ <- produceRecord(disclosureTopic, submissionId.lei, submissionId.toString, kafkaProducer)
Expand Down
8 changes: 4 additions & 4 deletions modified-lar/src/test/resources/modifiedlar.sql
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
CREATE TABLE modifiedlar2019 (
CREATE TABLE modifiedlar2019_regen_2024 (
id integer NOT NULL,
uli character varying not null,
lei character varying NOT NULL,
Expand Down Expand Up @@ -99,7 +99,7 @@ CREATE TABLE modifiedlar2019 (
conforming_loan_limit character varying,
median_age integer,
median_age_calculated character varying,
median_income_percentage integer,
median_income_percentage double precision,
race_categorization character varying,
sex_categorization character varying,
ethnicity_categorization character varying,
Expand All @@ -110,7 +110,7 @@ CREATE TABLE modifiedlar2019 (
checksum character varying
);

CREATE TABLE modifiedlar2018 (
CREATE TABLE modifiedlar2018_regen_2024 (
id integer NOT NULL,
uli character varying not null,
lei character varying NOT NULL,
Expand Down Expand Up @@ -211,7 +211,7 @@ CREATE TABLE modifiedlar2018 (
conforming_loan_limit character varying,
median_age integer,
median_age_calculated character varying,
median_income_percentage integer,
median_income_percentage double precision,
race_categorization character varying,
sex_categorization character varying,
ethnicity_categorization character varying,
Expand Down