Skip to content

Commit

Permalink
Truncate Firestore data before export (#56)
Browse files Browse the repository at this point in the history
* firestore truncate option

* conditional db environment

* modularized functions

* trigger using both env

* truncate required

* lint

* lint
  • Loading branch information
max-ostapenko authored Jan 26, 2025
1 parent 24ecd03 commit ef00169
Show file tree
Hide file tree
Showing 16 changed files with 321 additions and 151 deletions.
20 changes: 11 additions & 9 deletions infra/bigquery-export/firestore.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@ export class FirestoreBatch {
constructor () {
this.firestore = new Firestore()
this.bigquery = new BigQueryExport()
this.firestore.settings({
databaseId: 'tech-report-apis-prod'
})
this.batchSize = 500
this.maxConcurrentBatches = 200
}
Expand Down Expand Up @@ -139,13 +136,18 @@ export class FirestoreBatch {
console.info(`Transfer to ${this.collectionName} complete. Total rows processed: ${totalRowsProcessed}. Time: ${duration} seconds`)
}

// Export entry point: records the target date/collection on the instance,
// optionally deletes the existing documents, then streams the BigQuery
// results for `query` through streamFromBigQuery.
async export (config, query) {
this.date = config.date
this.collectionName = config.name
this.collectionType = config.type
async export (exportConfig, query) {
this.date = exportConfig.date
this.collectionName = exportConfig.name
this.collectionType = exportConfig.type
// The Firestore database id is derived from exportConfig.environment.
// NOTE(review): if `environment` is missing this becomes
// 'tech-report-apis-undefined' — confirm every caller sets it.
// NOTE(review): settings() now runs on every export() call; confirm the
// Firestore client tolerates being re-configured if one FirestoreBatch
// instance is reused for several exports.
this.firestore.settings({
databaseId: 'tech-report-apis-' + exportConfig.environment
})

// Delete documents before writing new ones
await this.batchDelete()
// Delete all the documents before writing the new ones
// NOTE(review): truncation is skipped only for the exact string 'false';
// a boolean false or an absent `truncate` still deletes.
if (exportConfig.truncate !== 'false') {
await this.batchDelete()
}

const rowStream = await this.bigquery.queryResultsStream(query)
await this.streamFromBigQuery(rowStream)
Expand Down
2 changes: 1 addition & 1 deletion infra/bigquery-export/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"main": "index.js",
"scripts": {
"start": "node index.js",
"buildpack": "rm -rf node_modules; gcloud builds submit --pack image=gcr.io/httparchive/bigquery-export"
"buildpack": "rm -rf node_modules; gcloud builds submit --pack image=us.gcr.io/httparchive/cloud-run/bigquery-export"
},
"type": "module",
"dependencies": {
Expand Down
38 changes: 19 additions & 19 deletions infra/bigquery-export/reports.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ export class ReportsExporter {
}

// Export timeseries reports
async exportTimeseries (exportData) {
const metric = exportData.name
async exportTimeseries (exportConfig) {
const metric = exportConfig.name
const query = `
SELECT
FORMAT_DATE('%Y_%m_%d', date) AS date,
Expand All @@ -22,9 +22,9 @@ FROM reports.${metric}_timeseries
}

// Export monthly histogram report
async exportHistogram (exportData) {
const metric = exportData.name
const date = exportData.date
async exportHistogram (exportConfig) {
const metric = exportConfig.name
const date = exportConfig.date

const query = `
SELECT * EXCEPT(date)
Expand All @@ -35,16 +35,16 @@ WHERE date = '${date}'
await this.storage.exportToJson(rows, `${this.storagePath}${date.replaceAll('-', '_')}/${metric}.json`)
}

async export (exportData) {
if (exportData.dataform_trigger !== 'report_complete') {
async export (exportConfig) {
if (exportConfig.dataform_trigger !== 'report_complete') {
console.error('Invalid dataform trigger')
return
}

if (exportData.type === 'histogram') {
await this.exportHistogram(exportData)
} else if (exportData.type === 'timeseries') {
await this.exportTimeseries(exportData)
if (exportConfig.type === 'histogram') {
await this.exportHistogram(exportConfig)
} else if (exportConfig.type === 'timeseries') {
await this.exportTimeseries(exportConfig)
} else {
console.error('Invalid report type')
}
Expand All @@ -56,30 +56,30 @@ export class TechReportsExporter {
this.firestore = new FirestoreBatch()
}

// Builds the BigQuery query for a tech report ('report' or 'dict' type) and
// hands it to the Firestore exporter. Only runs for the
// 'report_cwv_tech_complete' dataform trigger.
async export (exportData) {
if (exportData.dataform_trigger !== 'report_cwv_tech_complete') {
async export (exportConfig) {
if (exportConfig.dataform_trigger !== 'report_cwv_tech_complete') {
console.error('Invalid dataform trigger')
return
}

// NOTE(review): `name` and `date` are interpolated directly into the SQL
// text below; presumably they come from a trusted config — confirm, or
// validate them before building the query.
let query = ''
if (exportData.type === 'report') {
if (exportConfig.type === 'report') {
query = `
SELECT
STRING(date) AS date,
* EXCEPT(date)
FROM httparchive.reports.cwv_tech_${exportData.name}
WHERE date = '${exportData.date}'
FROM httparchive.reports.cwv_tech_${exportConfig.name}
WHERE date = '${exportConfig.date}'
`
} else if (exportData.type === 'dict') {
} else if (exportConfig.type === 'dict') {
query = `
SELECT *
FROM reports.cwv_tech_${exportData.name}
FROM reports.cwv_tech_${exportConfig.name}
`
} else {
// NOTE(review): this branch only logs; there is no `return`, so the call
// below still runs with an empty query string. FirestoreBatch.export
// deletes existing documents before querying unless truncate is 'false',
// so an invalid type could clear the collection — consider returning here.
console.error('Invalid export type')
}

await this.firestore.export(exportData, query)
await this.firestore.export(exportConfig, query)
}
}
11 changes: 11 additions & 0 deletions infra/dataform-export/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,12 @@ functions.http('dataform-export', async (req, res) => {
res.status(400).send('Bad Request: no query found')
}

// Read the Dataform repository id from the job labels in the incoming message.
// NOTE(review): the property chain is unguarded; a payload missing any
// intermediate level throws a TypeError before the check below runs.
const repoEnvironment = messageData.protoPayload.serviceData.jobCompletedEvent.job.jobConfiguration.labels.dataform_repository_id
if (!repoEnvironment) {
console.log(`no repo environment found: ${JSON.stringify(messageData)}`)
// NOTE(review): there is no `return` after this response, so the handler
// keeps executing the statements that follow even though a 400 was sent.
res.status(400).send('Bad Request: no repo environment found')
}

const regex = /\/\* ({"dataform_trigger":.+) \*\//
const reportConfig = regex.exec(query)
if (!reportConfig) {
Expand All @@ -62,6 +68,11 @@ functions.http('dataform-export', async (req, res) => {
}

// Parse the JSON config captured from the query comment by the regex.
const eventData = JSON.parse(reportConfig[1])
// NOTE(review): JSON.parse throws on malformed input instead of returning a
// falsy value, and the regex only captures text starting with
// '{"dataform_trigger":', so this branch looks unreachable — confirm.
// It also lacks a `return`, so a 400 here would be followed by the job run
// and the 200 response below.
if (!eventData) {
console.log(`no event data found: ${reportConfig[1]}`)
res.status(400).send('Bad Request: no event data found')
}
// Map the repository id to a target environment: 'crawl-data' is prod,
// anything else is treated as dev.
eventData.environment = repoEnvironment === 'crawl-data' ? 'prod' : 'dev'
await callRunJob(eventData)

res.status(200).send('OK')
Expand Down
52 changes: 26 additions & 26 deletions infra/tf/.terraform.lock.hcl

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

54 changes: 54 additions & 0 deletions infra/tf/bigquery_export/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Toolchain constraints for this module: Terraform 1.9.7 or newer, the
# archive provider pinned to exactly 2.6.0, and the google provider at
# 6.13.0 or newer.
terraform {
required_version = ">= 1.9.7"

required_providers {
archive = {
source = "hashicorp/archive"
version = "2.6.0"
}
google = {
source = "hashicorp/google"
version = ">= 6.13.0"
}
}
}

# Zips the directory named after var.function_name (one level above the
# working directory) into ./tmp/<function_name>.zip.
data "archive_file" "zip" {
type = "zip"
source_dir = "../${var.function_name}/"
output_path = "./tmp/${var.function_name}.zip"
}

# Uploads the zip produced by data.archive_file.zip to the
# gcf-v2-uploads-<project_number>-<region> bucket. The object name embeds the
# archive id, so a new archive id yields a new object name.
# NOTE(review): the Cloud Run job in this file runs a container image and does
# not reference this object — confirm the upload is still needed.
# NOTE(review): "zource" looks like a typo for "source"; renaming it would
# change the resource address, so it is left as-is.
resource "google_storage_bucket_object" "zource" {
bucket = "gcf-v2-uploads-${var.project_number}-${var.region}"
name = "${var.function_name}_${data.archive_file.zip.id}.zip"
source = data.archive_file.zip.output_path
}

# Cloud Run job named after var.function_name, deployed in var.region and run
# as var.function_identity. Each task gets 4 CPUs / 4Gi of memory and a
# 3600s timeout.
resource "google_cloud_run_v2_job" "bigquery_export" {
name = var.function_name
location = var.region

# Deletion protection is disabled for this job.
deletion_protection = false

template {
template {
containers {
# NOTE(review): the image is referenced by the mutable ":latest" tag, so
# this configuration does not change when a new image is pushed — confirm
# how new builds are expected to reach the job.
image = "${var.location}.gcr.io/${var.project}/cloud-run/${var.function_name}:latest"
resources {
limits = {
cpu = "4"
memory = "4Gi"
}
}
# EXPORT_CONFIG is declared with an empty value here.
# NOTE(review): presumably overridden per execution by the caller — confirm.
env {
name = "EXPORT_CONFIG"
value = ""
}
}
timeout = "3600s"
service_account = var.function_identity
}
}
}

23 changes: 23 additions & 0 deletions infra/tf/bigquery_export/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Project id; used in the container image path of the Cloud Run job.
variable "project" {
type = string
}

# Project number; used in the name of the gcf-v2-uploads bucket.
variable "project_number" {
type = string
}

# Region; used as the Cloud Run job location and in the uploads bucket name.
variable "region" {
type = string
}

# Service account the Cloud Run job runs as.
variable "function_identity" {
type = string
}

# Name of the job; also selects the source directory, the zip file name and
# the image name.
variable "function_name" {
type = string
}

# Prefix of the gcr.io registry host in the image path
# ("<location>.gcr.io/...").
variable "location" {
type = string
}
25 changes: 0 additions & 25 deletions infra/tf/bigquery_export_job.tf

This file was deleted.

Loading

0 comments on commit ef00169

Please sign in to comment.