cboettig committed on
Commit
e0df861
·
1 Parent(s): 5107814
Files changed (5) hide show
  1. app.R +57 -97
  2. footer.md +16 -0
  3. schema.yml +26 -0
  4. system-prompt.md +34 -0
  5. test.R +71 -0
app.R CHANGED
@@ -1,20 +1,23 @@
1
  library(shiny)
2
  library(bslib)
3
  library(htmltools)
4
- library(markdown)
5
  library(fontawesome)
6
  library(bsicons)
7
  library(gt)
8
  library(glue)
9
  library(ggplot2)
10
-
11
- library(mapgl)
12
  library(dplyr)
 
13
  library(duckdbfs)
14
-
15
  duckdbfs::load_spatial()
16
 
17
- css <- HTML("<link rel='stylesheet' type='text/css' href='https://demos.creative-tim.com/material-dashboard/assets/css/material-dashboard.min.css?v=3.2.0'>")
 
 
 
 
18
 
19
 
20
  # Define the UI
@@ -23,22 +26,23 @@ ui <- page_sidebar(
23
  tags$head(css),
24
  titlePanel("Demo App"),
25
 
26
- "This is a proof-of-principle for a simple chat-driven interface to dynamically explore geospatial data.
27
- ",
28
-
 
29
 
30
  card(
31
  layout_columns(
32
- textInput("chat",
33
- label = NULL,
34
- "Which counties in California have the highest average social vulnerability?",
35
- width = "100%"),
36
- div(
37
- actionButton("user_msg", "", icon = icon("paper-plane"),
38
- class = "btn-primary btn-sm align-bottom"),
39
- class = "align-text-bottom"),
40
- col_widths = c(11, 1)),
41
- fill = FALSE
42
  ),
43
  layout_columns(
44
  card(maplibreOutput("map")),
@@ -51,12 +55,10 @@ ui <- page_sidebar(
51
  max_height = "700px"
52
  ),
53
 
54
-
55
  gt_output("table"),
56
 
57
  card(fill = TRUE,
58
  card_header(fa("robot")),
59
-
60
  accordion(
61
  open = FALSE,
62
  accordion_panel(
@@ -70,34 +72,13 @@ ui <- page_sidebar(
70
  textOutput("explanation"),
71
  )
72
  ),
73
-
74
  card(
75
  card_header("Errata"),
76
- markdown(
77
- "
78
- #### Credits
79
-
80
- Developed by Carl Boettiger, UC Berkeley, 2025. BSD License.
81
-
82
- Data from the US Census and CDC's [Social Vulnerability Index](https://www.atsdr.cdc.gov/place-health/php/svi/index.html)
83
-
84
- #### Technical details
85
-
86
- The app is written entirely in R using shiny. The app will translate natural language queries in SQL code using
87
- a small open-weights language model. The SQL code is executed using the duckdb backend against cloud-native
88
- geoparquet snapshot of the Social Vulnerability Index hosted on Source Cooperative. Summary chart data are also
89
- computed in duckdb by streaming, providing responsive updates while needing minimal RAM or disk storage despite
90
- the large size of the data sources.
91
-
92
- The map is rendered and updated using MapLibre with PMTiles, which provides responsive rendering for large feature sets.
93
- The PMTiles layer is also hosted on Source cooperative where it can be streamed efficiently.
94
- ")
95
  )
96
-
97
  ),
98
 
99
  sidebar = sidebar(
100
-
101
  input_switch("redlines", "Redlined Areas", value = FALSE),
102
  input_switch("svi", "Social Vulnerability", value = TRUE),
103
  input_switch("richness", "Biodiversity Richness", value = FALSE),
@@ -113,40 +94,14 @@ The PMTiles layer is also hosted on Source cooperative where it can be streamed
113
  )
114
 
115
 
116
-
117
-
118
  repo <- "https://data.source.coop/cboettig/social-vulnerability"
119
  pmtiles <- glue("{repo}/svi2020_us_tract.pmtiles")
120
  parquet <- glue("{repo}/svi2020_us_tract.parquet")
121
- svi <- open_dataset(parquet, tblname = "svi") |>
122
- filter(RPL_THEMES > 0)
123
-
124
-
125
  con <- duckdbfs::cached_connection()
126
- schema <- DBI::dbGetQuery(con, "PRAGMA table_info(svi)")
127
-
128
- system_prompt = glue::glue('
129
- You are a helpful agent who always replies strictly in JSON-formatted text.
130
- Your task is to translate the users question into a SQL query that will be run
131
- against the "svi" table in a duckdb database. The duckdb database has a
132
- spatial extension which understands PostGIS operations as well.
133
- Include semantically meaningful columns like COUNTY and STATE name.
134
-
135
- In the data, each row represents an individual census tract. If asked for
136
- county or state level statistics, be sure to aggregate across all the tracts
137
- in that county or state.
138
-
139
- The table schema is <schema>
140
-
141
- The column called "RPL_THEMES" corresponds to the overall "Social vulnerability index" number.
142
-
143
- Format your answer as follows:
144
-
145
- {
146
- "query": "your raw SQL response goes here",
147
- "explanation": "your explanation of the query"
148
- }
149
- ', .open = "<", .close = ">")
150
 
151
  chat <- ellmer::chat_vllm(
152
  base_url = "https://llm.nrp-nautilus.io/",
@@ -168,18 +123,20 @@ filter_column <- function(full_data, filtered_data, id_col = "FIPS") {
168
  list("in", list("get", id_col), list("literal", values))
169
  }
170
 
171
- chart1_data <- svi |>
172
- group_by(COUNTY) |>
173
- summarise(mean_svi = mean(RPL_THEMES)) |>
174
- collect()
175
-
176
- chart1 <- chart1_data |>
177
- ggplot(aes(mean_svi)) + geom_density(fill="darkred") +
178
- ggtitle("County-level vulnerability nation-wide")
179
 
180
 
181
  # Define the server
182
  server <- function(input, output, session) {
 
 
 
 
 
 
 
 
 
 
183
  data <- reactiveValues(df = tibble())
184
  output$chart1 <- renderPlot(chart1)
185
 
@@ -191,29 +148,32 @@ server <- function(input, output, session) {
191
 
192
  # Parse response
193
  response <- jsonlite::fromJSON(stream)
194
- output$sql_code <- renderText(stringr::str_wrap(response$query, width = 60))
195
- output$explanation <- renderText(response$explanation)
196
 
197
- # Actually execute the SQL query generated:
198
- df <- DBI::dbGetQuery(con, response$query)
 
 
 
 
199
 
200
- # don't display shape column in render
201
- df <- df |> select(-any_of("Shape"))
202
- output$table <- render_gt(df, height = 300)
203
 
204
 
205
- y_axis <- colnames(df)[!colnames(df) %in% colnames(svi)]
206
- chart2 <- df |>
207
- rename(social_vulnerability = y_axis) |>
208
- ggplot(aes(social_vulnerability)) +
209
- geom_density(fill = "darkred") +
210
- xlim(c(0, 1)) +
211
- ggtitle("Vulnerability of selected areas")
212
 
213
- output$chart2 <- renderPlot(chart2)
214
 
215
- # We need to somehow trigger this df to update the map.
216
- data$df <- df
 
217
 
218
  })
219
 
 
1
  library(shiny)
2
  library(bslib)
3
  library(htmltools)
4
+ #library(markdown)
5
  library(fontawesome)
6
  library(bsicons)
7
  library(gt)
8
  library(glue)
9
  library(ggplot2)
10
+ library(readr)
 
11
  library(dplyr)
12
+ library(mapgl)
13
  library(duckdbfs)
 
14
  duckdbfs::load_spatial()
15
 
16
+ css <-
17
+ HTML(paste0("<link rel='stylesheet' type='text/css' ",
18
+ "href='https://demos.creative-tim.com/",
19
+ "material-dashboard/assets/css/",
20
+ "material-dashboard.min.css?v=3.2.0'>"))
21
 
22
 
23
  # Define the UI
 
26
  tags$head(css),
27
  titlePanel("Demo App"),
28
 
29
+ "
30
+ This is a proof-of-principle for a simple chat-driven interface
31
+ to dynamically explore geospatial data.
32
+ ",
33
 
34
  card(
35
  layout_columns(
36
+ textInput("chat",
37
+ label = NULL,
38
+ "Which counties in California have the highest average social vulnerability?",
39
+ width = "100%"),
40
+ div(
41
+ actionButton("user_msg", "", icon = icon("paper-plane"),
42
+ class = "btn-primary btn-sm align-bottom"),
43
+ class = "align-text-bottom"),
44
+ col_widths = c(11, 1)),
45
+ fill = FALSE
46
  ),
47
  layout_columns(
48
  card(maplibreOutput("map")),
 
55
  max_height = "700px"
56
  ),
57
 
 
58
  gt_output("table"),
59
 
60
  card(fill = TRUE,
61
  card_header(fa("robot")),
 
62
  accordion(
63
  open = FALSE,
64
  accordion_panel(
 
72
  textOutput("explanation"),
73
  )
74
  ),
 
75
  card(
76
  card_header("Errata"),
77
+ shiny::markdown(readr::read_file("footer.md")),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  )
 
79
  ),
80
 
81
  sidebar = sidebar(
 
82
  input_switch("redlines", "Redlined Areas", value = FALSE),
83
  input_switch("svi", "Social Vulnerability", value = TRUE),
84
  input_switch("richness", "Biodiversity Richness", value = FALSE),
 
94
  )
95
 
96
 
 
 
97
  repo <- "https://data.source.coop/cboettig/social-vulnerability"
98
  pmtiles <- glue("{repo}/svi2020_us_tract.pmtiles")
99
  parquet <- glue("{repo}/svi2020_us_tract.parquet")
 
 
 
 
100
  con <- duckdbfs::cached_connection()
101
+ svi <- open_dataset(parquet, tblname = "svi") |> filter(RPL_THEMES > 0)
102
+ schema <- read_file("schema.yml")
103
+ system_prompt <- glue::glue(readr::read_file("system-prompt.md"),
104
+ .open = "<", .close = ">")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
  chat <- ellmer::chat_vllm(
107
  base_url = "https://llm.nrp-nautilus.io/",
 
123
  list("in", list("get", id_col), list("literal", values))
124
  }
125
 
 
 
 
 
 
 
 
 
126
 
127
 
128
  # Define the server
129
  server <- function(input, output, session) {
130
+
131
+ chart1_data <- svi |>
132
+ group_by(COUNTY) |>
133
+ summarise(mean_svi = mean(RPL_THEMES)) |>
134
+ collect()
135
+
136
+ chart1 <- chart1_data |>
137
+ ggplot(aes(mean_svi)) + geom_density(fill="darkred") +
138
+ ggtitle("County-level vulnerability nation-wide")
139
+
140
  data <- reactiveValues(df = tibble())
141
  output$chart1 <- renderPlot(chart1)
142
 
 
148
 
149
  # Parse response
150
  response <- jsonlite::fromJSON(stream)
 
 
151
 
152
+ if ("query" %in% names(response)) {
153
+ output$sql_code <- renderText(stringr::str_wrap(response$query, width = 60))
154
+ output$explanation <- renderText(response$explanation)
155
+
156
+ # Actually execute the SQL query generated:
157
+ df <- DBI::dbGetQuery(con, response$query)
158
 
159
+ # don't display shape column in render
160
+ df <- df |> select(-any_of("Shape"))
161
+ output$table <- render_gt(df, height = 300)
162
 
163
 
164
+ y_axis <- colnames(df)[!colnames(df) %in% colnames(svi)]
165
+ chart2 <- df |>
166
+ rename(social_vulnerability = y_axis) |>
167
+ ggplot(aes(social_vulnerability)) +
168
+ geom_density(fill = "darkred") +
169
+ xlim(c(0, 1)) +
170
+ ggtitle("Vulnerability of selected areas")
171
 
172
+ output$chart2 <- renderPlot(chart2)
173
 
174
+ # We need to somehow trigger this df to update the map.
175
+ data$df <- df
176
+ }
177
 
178
  })
179
 
footer.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #### Credits
2
+
3
+ Developed by Carl Boettiger, UC Berkeley, 2025. BSD License.
4
+
5
+ Data from the US Census and CDC's [Social Vulnerability Index](https://www.atsdr.cdc.gov/place-health/php/svi/index.html)
6
+
7
+ #### Technical details
8
+
9
+ The app is written entirely in R using shiny. The app will translate natural language queries into SQL code using
10
+ a small open-weights language model. The SQL code is executed using the duckdb backend against a cloud-native
11
+ geoparquet snapshot of the Social Vulnerability Index hosted on Source Cooperative. Summary chart data are also
12
+ computed in duckdb by streaming, providing responsive updates while needing minimal RAM or disk storage despite
13
+ the large size of the data sources.
14
+
15
+ The map is rendered and updated using MapLibre with PMTiles, which provides responsive rendering for large feature sets.
16
+ The PMTiles layer is also hosted on Source Cooperative, where it can be streamed efficiently.
schema.yml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - VARIABLE_NAME: ST
2
+ DESCRIPTION: State-level FIPS code (two-digit integer)
3
+ - VARIABLE_NAME: STATE
4
+ DESCRIPTION: State name
5
+ - VARIABLE_NAME: ST_ABBR
6
+ DESCRIPTION: State abbreviation
7
+ - VARIABLE_NAME: STCNTY
8
+ DESCRIPTION: County-level FIPS code (5 digit integer)
9
+ - VARIABLE_NAME: COUNTY
10
+ DESCRIPTION: County name
11
+ - VARIABLE_NAME: FIPS
12
+ DESCRIPTION: Tract-level geographic identification (full Census Bureau FIPS code)
13
+ - VARIABLE_NAME: LOCATION
14
+ DESCRIPTION: Text description of tract, county, and state
15
+ - VARIABLE_NAME: AREA_SQMI
16
+ DESCRIPTION: Tract area in square miles
17
+ - VARIABLE_NAME: RPL_THEMES
18
+ DESCRIPTION: Overall social vulnerability. Should always be used unless explicit sub-theme is called for.
19
+ - VARIABLE_NAME: RPL_THEME1
20
+ DESCRIPTION: Subtheme for socio-economic status social vulnerability score
21
+ - VARIABLE_NAME: RPL_THEME2
22
+ DESCRIPTION: Subtheme for Household characteristics vulnerability score
23
+ - VARIABLE_NAME: RPL_THEME3
24
+ DESCRIPTION: Subtheme for Racial and Ethnic Minority status based vulnerability score
25
+ - VARIABLE_NAME: RPL_THEME4
26
+ DESCRIPTION: Subtheme for Housing and transportation-based vulnerability score
system-prompt.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ You are a helpful agent who always replies strictly in JSON-formatted text.
3
+ Your task is to translate the user's questions about the data into a SQL query
4
+ that will be run against the "svi" table in a duckdb database.
5
+ The duckdb database has a spatial extension which understands PostGIS operations as well.
6
+ Include semantically meaningful columns like COUNTY and STATE name.
7
+
8
+ If your answer involves the construction of a SQL query, you must format your answer as follows:
9
+
10
+ {
11
+ "query": "your raw SQL response goes here",
12
+ "explanation": "your explanation of the query"
13
+ }
14
+
15
+ If your answer does not involve a SQL query, please reply with the following format instead:
16
+
17
+ {
18
+ "user": "user question goes here",
19
+ "agent": "your response goes here"
20
+ }
21
+
22
+ If you are asked to describe the data or for information about the data schema, give only a human-readable response with SQL.
23
+
24
+ In the data, each row represents an individual census tract. If asked for
25
+ county or state level statistics, be sure to aggregate across all the tracts
26
+ in that county or state.
27
+
28
+ Refer to these descriptions of each of the columns (VARIABLE_NAME) from the metadata table:
29
+ <schema>
30
+
31
+ Note that the column called "RPL_THEMES" corresponds to the overall "Social vulnerability index" number. Whenever you SELECT the COUNTY, you must include the STCNTY
32
+ column as well because county names are not unique across states!
33
+
34
+
test.R ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Illustrate/test core app functionality without shiny
2
+
3
+ library(tidyverse)
4
+ library(duckdbfs)
5
+ library(mapgl)
6
+ library(ellmer)
7
+ library(glue)
8
+
9
+ repo <- "https://data.source.coop/cboettig/social-vulnerability"
10
+ pmtiles <- glue("{repo}/svi2020_us_tract.pmtiles")
11
+ parquet <- glue("{repo}/svi2020_us_tract.parquet")
12
+ svi <- open_dataset(parquet, tblname = "svi") |> filter(RPL_THEMES > 0)
13
+
14
+ schema <- read_file("schema.yml")
15
+ system_prompt <- glue::glue(readr::read_file("system-prompt.md"),
16
+ .open = "<", .close = ">")
17
+
18
+ chat <- ellmer::chat_vllm(
19
+ base_url = "https://llm.nrp-nautilus.io/",
20
+ model = "llama3",
21
+ api_key = Sys.getenv("NRP_API_KEY"),
22
+ system_prompt = system_prompt,
23
+ api_args = list(temperature = 0)
24
+ )
25
+
26
+ chat <- ellmer::chat_vllm(
27
+ base_url = "https://llm.cirrus.carlboettiger.info/v1/",
28
+ model = "kosbu/Llama-3.3-70B-Instruct-AWQ",
29
+ api_key = Sys.getenv("CIRRUS_LLM_KEY"),
30
+ system_prompt = system_prompt,
31
+ api_args = list(temperature = 0)
32
+ )
33
+
34
+ # Test a chat-based response
35
+ chat$chat("Which columns describes racial components of social vulnerability?")
36
+ ## A query-based response
37
+ stream <- chat$chat("Which counties in California have the highest average social vulnerability?")
38
+ response <- jsonlite::fromJSON(stream)
39
+
40
+ con <- duckdbfs::cached_connection()
41
+ filtered_data <- DBI::dbGetQuery(con, response$query)
42
+ full_data <- svi
43
+ response_query <- "
44
+ SELECT COUNTY, AVG(RPL_THEME1) as avg_soc_vuln FROM
45
+ svi WHERE STATE = 'California' GROUP BY COUNTY ORDER BY
46
+ avg_soc_vuln DESC LIMIT 10;
47
+ "
48
+
49
+
50
+ filter_column <- function(full_data, filtered_data, id_col) {
51
+ if (nrow(filtered_data) < 1) return(NULL)
52
+ values <- full_data |>
53
+ inner_join(filtered_data, copy = TRUE) |>
54
+ pull(id_col)
55
+ # maplibre syntax for the filter of PMTiles
56
+ list("in", list("get", id_col), list("literal", values))
57
+ }
58
+
59
+ maplibre(center = c(-102.9, 41.3), zoom = 3) |>
60
+ add_fill_layer(
61
+ id = "svi_layer",
62
+ source = list(type = "vector", url = paste0("pmtiles://", pmtiles)),
63
+ source_layer = "SVI2000_US_tract",
64
+ filter = filter_column(full_data, filtered_data, "FIPS"),
65
+ fill_opacity = 0.5,
66
+ fill_color = interpolate(column = "RPL_THEMES",
67
+ values = c(0, 1),
68
+ stops = c("#e19292c0", "darkblue"),
69
+ na_color = "lightgrey")
70
+ )
71
+