irworkshop
diff --git a/‎state/fl/lobbying/reg/docs/disk%20file%20definition.pdf?cp=0.3379601757893852
106 KB b/‎state/fl/lobbying/reg/docs/disk%20file%20definition.pdf?cp=0.3379601757893852
106 KB
diff --git a/‎state/fl/lobbying/reg/docs/fl_lobby_reg_diary.Rmd
Lines changed: 6 additions & 6 deletions b/‎state/fl/lobbying/reg/docs/fl_lobby_reg_diary.Rmd
Lines changed: 6 additions & 6 deletions
diff --git a/‎state/hi/lobbying/docs/hi_lobby_diary.Rmd
Lines changed: 10 additions & 73 deletions b/‎state/hi/lobbying/docs/hi_lobby_diary.Rmd
Lines changed: 10 additions & 73 deletions
diff --git a/‎state/ia/contribs/docs/ia_contribs_diary.Rmd
Lines changed: 28 additions & 24 deletions b/‎state/ia/contribs/docs/ia_contribs_diary.Rmd
Lines changed: 28 additions & 24 deletions
@@ -1,6 +1,6 @@
 ---
 title: "Florida Lobbyist Registration"
-author: "Kiernan Nicholls"
+author: "Kiernan Nicholls & Yanqi Xu"
 date: "`r Sys.time()`"
 output:
   github_document: 
@@ -32,7 +32,7 @@ set.seed(5)
 ```
 
 ```{r create_docs_dir, eval=FALSE, echo=FALSE, include=FALSE}
-fs::dir_create(here::here("fl", "lobbying", "reg", "docs"))
+fs::dir_create(here::here("state","fl", "lobbying", "reg", "docs"))
 ```
 
 ## Project
@@ -118,7 +118,7 @@ to view the file format. The datasets are updated daily.
 
 ```{r download_key}
 key_url <- "https://floridalobbyist.gov/reports/disk%20file%20definition.pdf?cp=0.3379601757893852"
-download.file(key_url, destfile = url2path(key_url, here("fl", "lobbying", "reg", "docs")))
+download.file(key_url, destfile = url2path(key_url, here("state","fl", "lobbying", "reg", "docs")))
 ```
 
 ## Import
@@ -136,7 +136,7 @@ processor, spreadsheet, or database program.
 We can download these two files to our raw directory.
 
 ```{r create_raw_dir}
-raw_dir <- here("fl", "lobbying", "reg", "data", "raw")
+raw_dir <- here("state","fl", "lobbying", "reg", "data", "raw")
 dir_create(raw_dir)
 ```
 
@@ -165,8 +165,8 @@ fllr <- vroom(
   skip = 2,
   col_types = cols(
     .default = col_character(),
-    `Eff Date` = col_date_usa(),
-    `WD Date` = col_date_usa(),
+    `Eff Date` = col_date_mdy(),
+    `WD Date` = col_date_mdy(),
   )
 )
 ```
 
@@ -1,6 +1,6 @@
 ---
 title: "Hawaii Lobbyists"
-author: "Kiernan Nicholls"
+author: "Kiernan Nicholls & Yanqi Xu"
 date: "`r Sys.time()`"
 output:
   github_document: 
@@ -32,7 +32,7 @@ set.seed(5)
 ```
 
 ```{r create_docs_dir, eval=FALSE, echo=FALSE, include=FALSE}
-fs::dir_create(here::here("hi", "lobbying", "docs"))
+fs::dir_create(here::here("state","hi", "lobbying", "docs"))
 ```
 
 ## Project
@@ -107,94 +107,31 @@ here::here()
 
 ## Data
 
-Data can be obtained by the [Hawaii State Ethics Commission][hec] via their [Socrata portal][hsp].
+Data can be obtained from the [Hawaii State Ethics Commission][hec] via their [Socrata portal][hsp]. The [search tool][download] lets us export all lobbying registration data from 2019-2023 in CSV format.
 
 [hec]: https://ethics.hawaii.gov/
 [hsp]: https://data.hawaii.gov/
+[download]: https://hawaiiethics.my.site.com/public/s/reports?report=Lobbyist%20Registrations
 
-```{r meta_read, echo=FALSE}
-hi_meta <- fromJSON("https://data.hawaii.gov/api/views/gdxe-t5ff")
-```
-
-The relavent file is named `r quote(hi_meta$name)` with the ID of `r md_code(hi_meta$id)`. The file
-was created at `r as_datetime(hi_meta$createdAt)`.
-
-There are `r nrow(hi_meta$columns)` columns in the database.
-
-```{r meta_cols}
-hi_meta %>% 
-  use_series("columns") %>% 
-  as_tibble() %>% 
-  select(
-    position,
-    fieldName,
-    name,
-    dataTypeName
-  ) %>%
-  mutate(fieldName = md_code(fieldName)) %>% 
-  kable(col.names = c("col", "variable", "name", "type"))
-```
 
-This data does _not_ include the mailing addresses of the lobbyists and principal organizations.
-After contacting the Ethics Commission, I was provided with an Excel file containing the additional
-mailing address variable; this data will be processed and added to the site.
 
 ## Import
 
 If the file containing addresses is found on disc, the wrangling will continue; otherwise, the
 raw file will be read from the portal and not wrangled any further.
 
 ```{r raw_dir}
-raw_dir <- here("hi", "lobbying", "data", "raw")
+raw_dir <- here("state","hi", "lobbying", "data", "raw")
 dir_create(raw_dir)
 ```
 
-```{r raw_read}
-geo_file <- dir_ls(raw_dir, type = "file", glob = "*.xlsx") 
-raw_file <- "https://data.hawaii.gov/api/views/gdxe-t5ff/rows.csv"
-
-if (file_exists(geo_file)) {
-  # read the excel file
-  hilr <- 
-     read_csv(
-      file = format_csv(read_excel(geo_file)),
-      skip = 1,
-      col_names = c(
-        "reg_name", "lob_name", "org_name", "lob_firm", "date_filed", "status", 
-        "date_reg", "date_term", "lob_email", "lob_phone", "lob_ext", "lob_geo", 
-        "lob_city", "lob_state", "lob_zip"
-      ),
-      col_types = cols(
-        .default = col_character(),
-        date_filed = col_date_usa(),
-        date_reg = col_date_usa(),
-        date_term = col_date_usa()
-      )
-    )
-} else {
-  # read the portal file
-  hilr <- 
-    read_csv(
-      file = raw_file,
-      col_types = cols(
-        Registration = col_date_usa(),
-        Termination = col_date_usa()
-      )
-    ) %>%
-    # rename, reorder, and clean
-    rename(
-      lob_name = `Lobbyist Name`,
-      org_name = `Organization Name`,
-      session = `Lobby Year`
-    ) %>% 
-    select(-View, View) %>% 
-    clean_names() %>% 
-    mutate(view = str_extract(view, "(?<=\\()(.*)(?=\\))"))
-  # stop the document
-  knit_exit()
-}
+```{r}
+hilr <- read_csv(dir_ls(raw_dir, regex = ".+registration.+"))
+
+hilr <- hilr %>% clean_names()
 ```
 
+
 ## Wrangle
 
 ### Phone
 
@@ -1,6 +1,6 @@
 ---
-title: "Iowa Contributions"
-author: "Kiernan Nicholls"
+title: "Iowa Campaign Contributions"
+author: "Kiernan Nicholls & Yanqi Xu"
 date: "`r Sys.time()`"
 output:
   github_document: 
@@ -34,7 +34,7 @@ if (!interactive()) {
 ```
 
 ```{r create_docs_dir, eval=FALSE, echo=FALSE, include=FALSE}
-fs::dir_create(here::here("ia", "contribs", "docs"))
+fs::dir_create(here::here("state","ia", "contribs", "docs"))
 ```
 
 ## Project
@@ -114,7 +114,7 @@ here::here()
 
 ## Data
 
-Data is obtained from the [Iowa Ethics and Campaign Disclosure Board][iae].
+Data is obtained from the [Iowa Ethics and Campaign Disclosure Board][iae]. The API returns contributions dating back to `2003-01-01`. 
 
 > In order to accomplish its Mission, the Board will enforce the provisions of
 the "Campaign Disclosure Act" in Iowa Code chapter 68A, the "Government Ethics
@@ -125,12 +125,13 @@ rules in Chapter 351 of the Iowa Administrative Code.
 [iae]: https://ethics.iowa.gov/
 
 The Board provides the file through the [state open data portal][odp] under the
-title "Iowa Campaign Contributions Received." The data can be accessed as a
+title ["Iowa Campaign Contributions Received."][cont] The data can be accessed as a
 tabular CSV file or through a number of direct APIs.
 
-The database was created June 18, 2015 and last updated December 10, 2019.
+The database was created June 18, 2015 and last updated July 1, 2023.
 
 [odp]: https://data.iowa.gov/
+[cont]: https://data.iowa.gov/Campaigns-Elections/Iowa-Campaign-Contributions-Received/smfg-ds7h
 
 > This dataset contains information on contributions and in kind donations made
 by organizations and individuals to state-wide, legislative or local candidate
@@ -173,7 +174,7 @@ These fixed files can be read into a single data frame with `purrr::map_df()`
 and `readr::read_delim()`.
 
 ```{r raw_dir}
-raw_dir <- dir_create(here("ia", "contribs", "data", "raw"))
+raw_dir <- dir_create(here("state","ia", "contribs", "data", "raw"))
 raw_url <- "https://data.iowa.gov/api/views/smfg-ds7h/rows.csv"
 raw_path <- path(raw_dir, basename(raw_url))
 if (!this_file_new(raw_path)) {
@@ -187,32 +188,29 @@ iac <- vroom(
   na = c("", "N/A", "NA", "n/a", "na"),
   col_types = cols(
     .default = col_character(),
-    `Date` = col_date_usa(),
+    `Date` = col_date_mdy(),
     `Contribution Amount` = col_double()
   )
 )
 ```
 
 ```{r raw_rename, echo=FALSE}
 iac <- iac %>% 
-  clean_names("snake") %>%
+  clean_names("snake") %>% 
   rename(
     code = committee_code,
     committee = committee_name,
     type = transaction_type,
     cont_comm_cd = contributing_committee_code,
     cont_org = contributing_organization,
     first = first_name,
-    mi = middle_initial,
     last = last_name,
     addr1 = address_line_1,
     addr2 = address_line_2,
     zip = zip_code,
-    amount = contribution_amount,
-    tx = transaction_id,
+    amount = contribution_amount
   ) %>% 
-  mutate(last = coalesce(last, cont_org)) %>% 
-  select(-cont_org, -cont_comm_cd)
+  select(-cont_comm_cd)
 ```
 
 We can ensure this file was read correctly by counting distinct values of a 
@@ -268,7 +266,8 @@ iac %>%
 We can create a file containing every duplicate record in the data.
 
 ```{r dupe_write}
-dupe_file <- path(dirname(raw_dir), "dupes.csv")
+dupe_file <- path(raw_dir, "dupes.csv")
+iac <- rowid_to_column(iac, var = "tx")
 if (!file_exists(dupe_file)) {
   write_lines("tx,dupe_flag", dupe_file)
   iac <- mutate(iac, group = str_sub(date, end = 7))
@@ -279,7 +278,7 @@ if (!file_exists(dupe_file)) {
   pb <- txtProgressBar(max = length(ias), style = 3)
   for (i in seq_along(ias)) {
     write_csv(
-      path = dupe_file,
+      file = dupe_file,
       append = TRUE,
       x = tibble(
         tx = ia_tx[[i]],
@@ -300,7 +299,7 @@ if (!file_exists(dupe_file)) {
 dupes <- read_csv(
   file = dupe_file,
   col_types = cols(
-    tx = col_character(),
+    tx = col_integer(),
     dupe_flag = col_logical()
   )
 )
@@ -321,6 +320,10 @@ iac %>%
   arrange(date, last)
 ```
 
+```{r}
+iac <- iac %>% select(-tx)
+```
+
 ## Categorical
 
 ```{r n_distinct}
@@ -362,7 +365,7 @@ iac %>%
   filter(!is.na(amount), amount >= 1) %>% 
   ggplot(aes(x = type, y = amount)) +
   geom_violin(aes(fill = type), adjust = 3) +
-  scale_fill_brewer(palette = "Dark2", guide = FALSE) +
+  scale_fill_brewer(palette = "Dark2", guide = "none") +
   scale_y_continuous(
     breaks = c(1 %o% 10^(0:6)),
     labels = dollar,
@@ -391,7 +394,7 @@ iac %>%
   geom_col(aes(fill = even)) + 
   scale_fill_brewer(palette = "Dark2") +
   scale_y_continuous(labels = comma) +
-  scale_x_continuous(breaks = seq(1998, 2020, by = 2)) +
+  scale_x_continuous(breaks = seq(1998, 2024, by = 2)) +
   theme(legend.position = "bottom") +
   labs(
     title = "Iowa Contributions by Year",
@@ -672,7 +675,7 @@ glimpse(sample_n(iac, 50))
 Now the file can be saved on disk for upload to the Accountability server.
 
 ```{r clean_dir}
-clean_dir <- dir_create(here("ia", "contribs", "data", "clean"))
+clean_dir <- dir_create(here("state","ia", "contribs", "data", "clean"))
 clean_path <- path(clean_dir, "ia_contribs_clean.csv")
 write_csv(iac, clean_path, na = "")
 (clean_size <- file_size(clean_path))
@@ -684,7 +687,7 @@ file_encoding(clean_path) %>%
 
 We can use `aws.s3::put_object()` to upload the text file to the IRW server.
 
-```{r aws_upload, eval=TRUE}
+```{r aws_upload, eval=FALSE}
 aws_path <- path("csv", basename(clean_path))
 if (!object_exists(aws_path, "publicaccountability")) {
   put_object(
@@ -710,20 +713,21 @@ dict_raw <- tibble(
   var = md_code(names(iac)),
   type = md_code(map_chr(iac, typeof)),
   def = c(
-    "Unique transaction hash",
     "Date contribution was made",
     "Recipient committee code",
+    "Recipient committee type",
     "Recipient committee name",
     "Type of contribution (direct, in-kind)",
+    "Contributor organization",
     "Contributor first name",
-    "Contributor middle initial",
-    "Contributor last name or organization",
+    "Contributor last name",
     "Contributor street address",
     "Contributor secondary address",
     "Contributor state abbreviation",
     "Contributor city name",
     "Contributor ZIP+4 code",
     "Amount or correction",
+    "Check number",
     "Flag for missing value",
     "Flag for duplicate row",
     "Calendar year contribution made",