Spaces:
Running
Running
:broom:
Browse files- app.R +57 -97
- footer.md +16 -0
- schema.yml +26 -0
- system-prompt.md +34 -0
- test.R +71 -0
app.R
CHANGED
@@ -1,20 +1,23 @@
|
|
1 |
library(shiny)
|
2 |
library(bslib)
|
3 |
library(htmltools)
|
4 |
-
library(markdown)
|
5 |
library(fontawesome)
|
6 |
library(bsicons)
|
7 |
library(gt)
|
8 |
library(glue)
|
9 |
library(ggplot2)
|
10 |
-
|
11 |
-
library(mapgl)
|
12 |
library(dplyr)
|
|
|
13 |
library(duckdbfs)
|
14 |
-
|
15 |
duckdbfs::load_spatial()
|
16 |
|
17 |
-
css <-
|
|
|
|
|
|
|
|
|
18 |
|
19 |
|
20 |
# Define the UI
|
@@ -23,22 +26,23 @@ ui <- page_sidebar(
|
|
23 |
tags$head(css),
|
24 |
titlePanel("Demo App"),
|
25 |
|
26 |
-
"
|
27 |
-
|
28 |
-
|
|
|
29 |
|
30 |
card(
|
31 |
layout_columns(
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
),
|
43 |
layout_columns(
|
44 |
card(maplibreOutput("map")),
|
@@ -51,12 +55,10 @@ ui <- page_sidebar(
|
|
51 |
max_height = "700px"
|
52 |
),
|
53 |
|
54 |
-
|
55 |
gt_output("table"),
|
56 |
|
57 |
card(fill = TRUE,
|
58 |
card_header(fa("robot")),
|
59 |
-
|
60 |
accordion(
|
61 |
open = FALSE,
|
62 |
accordion_panel(
|
@@ -70,34 +72,13 @@ ui <- page_sidebar(
|
|
70 |
textOutput("explanation"),
|
71 |
)
|
72 |
),
|
73 |
-
|
74 |
card(
|
75 |
card_header("Errata"),
|
76 |
-
markdown(
|
77 |
-
"
|
78 |
-
#### Credits
|
79 |
-
|
80 |
-
Developed by Carl Boettiger, UC Berkeley, 2025. BSD License.
|
81 |
-
|
82 |
-
Data from the US Census and CDC's [Social Vulnerability Index](https://www.atsdr.cdc.gov/place-health/php/svi/index.html)
|
83 |
-
|
84 |
-
#### Technical details
|
85 |
-
|
86 |
-
The app is written entirely in R using shiny. The app will translate natural language queries in SQL code using
|
87 |
-
a small open-weights language model. The SQL code is executed using the duckdb backend against cloud-native
|
88 |
-
geoparquet snapshot of the Social Vulnerability Index hosted on Source Cooperative. Summary chart data are also
|
89 |
-
computed in duckdb by streaming, providing responsive updates while needing minimal RAM or disk storage despite
|
90 |
-
the large size of the data sources.
|
91 |
-
|
92 |
-
The map is rendered and updated using MapLibre with PMTiles, which provides responsive rendering for large feature sets.
|
93 |
-
The PMTiles layer is also hosted on Source cooperative where it can be streamed efficiently.
|
94 |
-
")
|
95 |
)
|
96 |
-
|
97 |
),
|
98 |
|
99 |
sidebar = sidebar(
|
100 |
-
|
101 |
input_switch("redlines", "Redlined Areas", value = FALSE),
|
102 |
input_switch("svi", "Social Vulnerability", value = TRUE),
|
103 |
input_switch("richness", "Biodiversity Richness", value = FALSE),
|
@@ -113,40 +94,14 @@ The PMTiles layer is also hosted on Source cooperative where it can be streamed
|
|
113 |
)
|
114 |
|
115 |
|
116 |
-
|
117 |
-
|
118 |
repo <- "https://data.source.coop/cboettig/social-vulnerability"
|
119 |
pmtiles <- glue("{repo}/svi2020_us_tract.pmtiles")
|
120 |
parquet <- glue("{repo}/svi2020_us_tract.parquet")
|
121 |
-
svi <- open_dataset(parquet, tblname = "svi") |>
|
122 |
-
filter(RPL_THEMES > 0)
|
123 |
-
|
124 |
-
|
125 |
con <- duckdbfs::cached_connection()
|
126 |
-
|
127 |
-
|
128 |
-
system_prompt
|
129 |
-
|
130 |
-
Your task is to translate the users question into a SQL query that will be run
|
131 |
-
against the "svi" table in a duckdb database. The duckdb database has a
|
132 |
-
spatial extension which understands PostGIS operations as well.
|
133 |
-
Include semantically meaningful columns like COUNTY and STATE name.
|
134 |
-
|
135 |
-
In the data, each row represents an individual census tract. If asked for
|
136 |
-
county or state level statistics, be sure to aggregate across all the tracts
|
137 |
-
in that county or state.
|
138 |
-
|
139 |
-
The table schema is <schema>
|
140 |
-
|
141 |
-
The column called "RPL_THEMES" corresponds to the overall "Social vulnerability index" number.
|
142 |
-
|
143 |
-
Format your answer as follows:
|
144 |
-
|
145 |
-
{
|
146 |
-
"query": "your raw SQL response goes here",
|
147 |
-
"explanation": "your explanation of the query"
|
148 |
-
}
|
149 |
-
', .open = "<", .close = ">")
|
150 |
|
151 |
chat <- ellmer::chat_vllm(
|
152 |
base_url = "https://llm.nrp-nautilus.io/",
|
@@ -168,18 +123,20 @@ filter_column <- function(full_data, filtered_data, id_col = "FIPS") {
|
|
168 |
list("in", list("get", id_col), list("literal", values))
|
169 |
}
|
170 |
|
171 |
-
chart1_data <- svi |>
|
172 |
-
group_by(COUNTY) |>
|
173 |
-
summarise(mean_svi = mean(RPL_THEMES)) |>
|
174 |
-
collect()
|
175 |
-
|
176 |
-
chart1 <- chart1_data |>
|
177 |
-
ggplot(aes(mean_svi)) + geom_density(fill="darkred") +
|
178 |
-
ggtitle("County-level vulnerability nation-wide")
|
179 |
|
180 |
|
181 |
# Define the server
|
182 |
server <- function(input, output, session) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
183 |
data <- reactiveValues(df = tibble())
|
184 |
output$chart1 <- renderPlot(chart1)
|
185 |
|
@@ -191,29 +148,32 @@ server <- function(input, output, session) {
|
|
191 |
|
192 |
# Parse response
|
193 |
response <- jsonlite::fromJSON(stream)
|
194 |
-
output$sql_code <- renderText(stringr::str_wrap(response$query, width = 60))
|
195 |
-
output$explanation <- renderText(response$explanation)
|
196 |
|
197 |
-
|
198 |
-
|
|
|
|
|
|
|
|
|
199 |
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
|
204 |
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
|
213 |
-
|
214 |
|
215 |
-
|
216 |
-
|
|
|
217 |
|
218 |
})
|
219 |
|
|
|
1 |
library(shiny)
|
2 |
library(bslib)
|
3 |
library(htmltools)
|
4 |
+
#library(markdown)
|
5 |
library(fontawesome)
|
6 |
library(bsicons)
|
7 |
library(gt)
|
8 |
library(glue)
|
9 |
library(ggplot2)
|
10 |
+
library(readr)
|
|
|
11 |
library(dplyr)
|
12 |
+
library(mapgl)
|
13 |
library(duckdbfs)
|
|
|
14 |
duckdbfs::load_spatial()
|
15 |
|
16 |
+
css <-
|
17 |
+
HTML(paste0("<link rel='stylesheet' type='text/css' ",
|
18 |
+
"href='https://demos.creative-tim.com/",
|
19 |
+
"material-dashboard/assets/css/",
|
20 |
+
"material-dashboard.min.css?v=3.2.0'>"))
|
21 |
|
22 |
|
23 |
# Define the UI
|
|
|
26 |
tags$head(css),
|
27 |
titlePanel("Demo App"),
|
28 |
|
29 |
+
"
|
30 |
+
This is a proof-of-principle for a simple chat-driven interface
|
31 |
+
to dynamically explore geospatial data.
|
32 |
+
",
|
33 |
|
34 |
card(
|
35 |
layout_columns(
|
36 |
+
textInput("chat",
|
37 |
+
label = NULL,
|
38 |
+
"Which counties in California have the highest average social vulnerability?",
|
39 |
+
width = "100%"),
|
40 |
+
div(
|
41 |
+
actionButton("user_msg", "", icon = icon("paper-plane"),
|
42 |
+
class = "btn-primary btn-sm align-bottom"),
|
43 |
+
class = "align-text-bottom"),
|
44 |
+
col_widths = c(11, 1)),
|
45 |
+
fill = FALSE
|
46 |
),
|
47 |
layout_columns(
|
48 |
card(maplibreOutput("map")),
|
|
|
55 |
max_height = "700px"
|
56 |
),
|
57 |
|
|
|
58 |
gt_output("table"),
|
59 |
|
60 |
card(fill = TRUE,
|
61 |
card_header(fa("robot")),
|
|
|
62 |
accordion(
|
63 |
open = FALSE,
|
64 |
accordion_panel(
|
|
|
72 |
textOutput("explanation"),
|
73 |
)
|
74 |
),
|
|
|
75 |
card(
|
76 |
card_header("Errata"),
|
77 |
+
shiny::markdown(readr::read_file("footer.md")),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
)
|
|
|
79 |
),
|
80 |
|
81 |
sidebar = sidebar(
|
|
|
82 |
input_switch("redlines", "Redlined Areas", value = FALSE),
|
83 |
input_switch("svi", "Social Vulnerability", value = TRUE),
|
84 |
input_switch("richness", "Biodiversity Richness", value = FALSE),
|
|
|
94 |
)
|
95 |
|
96 |
|
|
|
|
|
97 |
repo <- "https://data.source.coop/cboettig/social-vulnerability"
|
98 |
pmtiles <- glue("{repo}/svi2020_us_tract.pmtiles")
|
99 |
parquet <- glue("{repo}/svi2020_us_tract.parquet")
|
|
|
|
|
|
|
|
|
100 |
con <- duckdbfs::cached_connection()
|
101 |
+
svi <- open_dataset(parquet, tblname = "svi") |> filter(RPL_THEMES > 0)
|
102 |
+
schema <- read_file("schema.yml")
|
103 |
+
system_prompt <- glue::glue(readr::read_file("system-prompt.md"),
|
104 |
+
.open = "<", .close = ">")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
|
106 |
chat <- ellmer::chat_vllm(
|
107 |
base_url = "https://llm.nrp-nautilus.io/",
|
|
|
123 |
list("in", list("get", id_col), list("literal", values))
|
124 |
}
|
125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
126 |
|
127 |
|
128 |
# Define the server
|
129 |
server <- function(input, output, session) {
|
130 |
+
|
131 |
+
chart1_data <- svi |>
|
132 |
+
group_by(COUNTY) |>
|
133 |
+
summarise(mean_svi = mean(RPL_THEMES)) |>
|
134 |
+
collect()
|
135 |
+
|
136 |
+
chart1 <- chart1_data |>
|
137 |
+
ggplot(aes(mean_svi)) + geom_density(fill="darkred") +
|
138 |
+
ggtitle("County-level vulnerability nation-wide")
|
139 |
+
|
140 |
data <- reactiveValues(df = tibble())
|
141 |
output$chart1 <- renderPlot(chart1)
|
142 |
|
|
|
148 |
|
149 |
# Parse response
|
150 |
response <- jsonlite::fromJSON(stream)
|
|
|
|
|
151 |
|
152 |
+
if ("query" %in% names(response)) {
|
153 |
+
output$sql_code <- renderText(stringr::str_wrap(response$query, width = 60))
|
154 |
+
output$explanation <- renderText(response$explanation)
|
155 |
+
|
156 |
+
# Actually execute the SQL query generated:
|
157 |
+
df <- DBI::dbGetQuery(con, response$query)
|
158 |
|
159 |
+
# don't display shape column in render
|
160 |
+
df <- df |> select(-any_of("Shape"))
|
161 |
+
output$table <- render_gt(df, height = 300)
|
162 |
|
163 |
|
164 |
+
y_axis <- colnames(df)[!colnames(df) %in% colnames(svi)]
|
165 |
+
chart2 <- df |>
|
166 |
+
rename(social_vulnerability = y_axis) |>
|
167 |
+
ggplot(aes(social_vulnerability)) +
|
168 |
+
geom_density(fill = "darkred") +
|
169 |
+
xlim(c(0, 1)) +
|
170 |
+
ggtitle("Vulnerability of selected areas")
|
171 |
|
172 |
+
output$chart2 <- renderPlot(chart2)
|
173 |
|
174 |
+
# We need to somehow trigger this df to update the map.
|
175 |
+
data$df <- df
|
176 |
+
}
|
177 |
|
178 |
})
|
179 |
|
footer.md
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#### Credits
|
2 |
+
|
3 |
+
Developed by Carl Boettiger, UC Berkeley, 2025. BSD License.
|
4 |
+
|
5 |
+
Data from the US Census and CDC's [Social Vulnerability Index](https://www.atsdr.cdc.gov/place-health/php/svi/index.html)
|
6 |
+
|
7 |
+
#### Technical details
|
8 |
+
|
9 |
+
The app is written entirely in R using shiny. The app will translate natural language queries into SQL code using
|
10 |
+
a small open-weights language model. The SQL code is executed using the duckdb backend against a cloud-native
|
11 |
+
geoparquet snapshot of the Social Vulnerability Index hosted on Source Cooperative. Summary chart data are also
|
12 |
+
computed in duckdb by streaming, providing responsive updates while needing minimal RAM or disk storage despite
|
13 |
+
the large size of the data sources.
|
14 |
+
|
15 |
+
The map is rendered and updated using MapLibre with PMTiles, which provides responsive rendering for large feature sets.
|
16 |
+
The PMTiles layer is also hosted on Source Cooperative where it can be streamed efficiently.
|
schema.yml
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
- VARIABLE_NAME: ST
|
2 |
+
DESCRIPTION: State-level FIPS code (two-digit integer)
|
3 |
+
- VARIABLE_NAME: STATE
|
4 |
+
DESCRIPTION: State name
|
5 |
+
- VARIABLE_NAME: ST_ABBR
|
6 |
+
DESCRIPTION: State abbreviation
|
7 |
+
- VARIABLE_NAME: STCNTY
|
8 |
+
DESCRIPTION: County-level FIPS code (5 digit integer)
|
9 |
+
- VARIABLE_NAME: COUNTY
|
10 |
+
DESCRIPTION: County name
|
11 |
+
- VARIABLE_NAME: FIPS
|
12 |
+
DESCRIPTION: Tract-level geographic identification (full Census Bureau FIPS code)
|
13 |
+
- VARIABLE_NAME: LOCATION
|
14 |
+
DESCRIPTION: Text description of tract county state
|
15 |
+
- VARIABLE_NAME: AREA_SQMI
|
16 |
+
DESCRIPTION: Tract area in square miles
|
17 |
+
- VARIABLE_NAME: RPL_THEMES
|
18 |
+
DESCRIPTION: Overall social vulnerability. Should always be used unless explicit sub-theme is called for.
|
19 |
+
- VARIABLE_NAME: RPL_THEME1
|
20 |
+
DESCRIPTION: Subtheme for socio-economic status social vulnerability score
|
21 |
+
- VARIABLE_NAME: RPL_THEME2
|
22 |
+
DESCRIPTION: Subtheme for Household characteristics vulnerability score
|
23 |
+
- VARIABLE_NAME: RPL_THEME3
|
24 |
+
DESCRIPTION: Subtheme for Racial and Ethnic Minority status based vulnerability score
|
25 |
+
- VARIABLE_NAME: RPL_THEME4
|
26 |
+
DESCRIPTION: Subtheme for Housing and transportation-based vulnerability score
|
system-prompt.md
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
You are a helpful agent who always replies strictly in JSON-formatted text.
|
3 |
+
Your task is to translate the user's questions about the data into a SQL query
|
4 |
+
that will be run against the "svi" table in a duckdb database.
|
5 |
+
The duckdb database has a spatial extension which understands PostGIS operations as well.
|
6 |
+
Include semantically meaningful columns like COUNTY and STATE name.
|
7 |
+
|
8 |
+
If your answer involves the construction of a SQL query, you must format your answer as follows:
|
9 |
+
|
10 |
+
{
|
11 |
+
"query": "your raw SQL response goes here",
|
12 |
+
"explanation": "your explanation of the query"
|
13 |
+
}
|
14 |
+
|
15 |
+
If your answer does not involve a SQL query, please reply with the following format instead:
|
16 |
+
|
17 |
+
{
|
18 |
+
"user": "user question goes here",
|
19 |
+
"agent": "your response goes here"
|
20 |
+
}
|
21 |
+
|
22 |
+
If you are asked to describe the data or for information about the data schema, give only a human-readable response with SQL.
|
23 |
+
|
24 |
+
In the data, each row represents an individual census tract. If asked for
|
25 |
+
county or state level statistics, be sure to aggregate across all the tracts
|
26 |
+
in that county or state.
|
27 |
+
|
28 |
+
Refer to these descriptions of each of the columns (VARIABLE_NAME) from the metadata table:
|
29 |
+
<schema>
|
30 |
+
|
31 |
+
Note that the column called "RPL_THEMES" corresponds to the overall "Social vulnerability index" number. Whenever you SELECT the COUNTY, you must include the STCNTY
|
32 |
+
column as well because county names are not unique across states!
|
33 |
+
|
34 |
+
|
test.R
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Illustrate/test core app functionality without shiny
|
2 |
+
|
3 |
+
library(tidyverse)
|
4 |
+
library(duckdbfs)
|
5 |
+
library(mapgl)
|
6 |
+
library(ellmer)
|
7 |
+
library(glue)
|
8 |
+
|
9 |
+
# Remote locations of the tract-level SVI data (tiles and parquet).
repo <- "https://data.source.coop/cboettig/social-vulnerability"
pmtiles <- glue("{repo}/svi2020_us_tract.pmtiles")
parquet <- glue("{repo}/svi2020_us_tract.parquet")

# Open the parquet file as the "svi" table and keep rows with a positive
# overall score.
svi_all <- open_dataset(parquet, tblname = "svi")
svi <- filter(svi_all, RPL_THEMES > 0)

# The prompt template refers to <schema>, so `schema` must exist before the
# template is interpolated with "<" / ">" as delimiters.
schema <- read_file("schema.yml")
prompt_template <- readr::read_file("system-prompt.md")
system_prompt <- glue::glue(prompt_template, .open = "<", .close = ">")

# First endpoint.
chat <- ellmer::chat_vllm(
  base_url = "https://llm.nrp-nautilus.io/",
  model = "llama3",
  api_key = Sys.getenv("NRP_API_KEY"),
  system_prompt = system_prompt,
  api_args = list(temperature = 0)
)

# Second endpoint. This reassigns `chat`, so it is the one used from here on.
chat <- ellmer::chat_vllm(
  base_url = "https://llm.cirrus.carlboettiger.info/v1/",
  model = "kosbu/Llama-3.3-70B-Instruct-AWQ",
  api_key = Sys.getenv("CIRRUS_LLM_KEY"),
  system_prompt = system_prompt,
  api_args = list(temperature = 0)
)
|
33 |
+
|
34 |
+
# Test a chat-based response
chat$chat("Which columns describes racial components of social vulnerability?")

## A query-based response
stream <- chat$chat("Which counties in California have the highest average social vulnerability?")
response <- jsonlite::fromJSON(stream)

# The system prompt also permits a non-SQL reply (fields "user" / "agent")
# that carries no "query" field. Fail with a clear message here rather than
# handing a NULL statement to DBI.
if (!"query" %in% names(response)) {
  stop("Model reply did not contain a \"query\" field.", call. = FALSE)
}

# Run the generated SQL against the cached duckdb connection.
con <- duckdbfs::cached_connection()
filtered_data <- DBI::dbGetQuery(con, response$query)
full_data <- svi

# Hand-written example query. It is not referenced by the rest of this script.
response_query <- "
SELECT COUNTY, AVG(RPL_THEME1) as avg_soc_vuln FROM
svi WHERE STATE = 'California' GROUP BY COUNTY ORDER BY
avg_soc_vuln DESC LIMIT 10;
"
|
48 |
+
|
49 |
+
|
50 |
+
# Build a MapLibre filter expression for the rows matched by a query result.
#
# Joins `filtered_data` back onto `full_data` on the columns the two tables
# share, pulls the `id_col` values of the matching rows, and wraps them in an
# "in" expression suitable for a layer `filter`.
#
# Arguments:
#   full_data      table to match against
#   filtered_data  data frame of query results
#   id_col         name of the identifier column to filter on
#
# Returns list("in", list("get", id_col), list("literal", values)), or NULL
# when `filtered_data` is NULL or empty, or when the two tables share no
# columns to join on.
filter_column <- function(full_data, filtered_data, id_col = "FIPS") {
  if (is.null(filtered_data) || nrow(filtered_data) < 1) {
    return(NULL)
  }

  # Name the join keys explicitly: same keys a natural join would pick, but
  # without the "Joining with `by = ...`" message, and with a clean NULL
  # instead of an error when nothing is shared.
  shared_cols <- intersect(colnames(full_data), colnames(filtered_data))
  if (length(shared_cols) < 1) {
    return(NULL)
  }

  values <- full_data |>
    inner_join(filtered_data, by = shared_cols, copy = TRUE) |>
    pull(id_col) |>
    unique()

  # maplibre syntax for the filter of PMTiles
  list("in", list("get", id_col), list("literal", values))
}
|
58 |
+
|
59 |
+
# Draw the "svi_layer" fill layer from the PMTiles source, limited to the
# tracts selected by filter_column() and coloured by RPL_THEMES.
svi_source <- list(type = "vector", url = paste0("pmtiles://", pmtiles))
tract_filter <- filter_column(full_data, filtered_data, "FIPS")
svi_colors <- interpolate(
  column = "RPL_THEMES",
  values = c(0, 1),
  stops = c("#e19292c0", "darkblue"),
  na_color = "lightgrey"
)

base_map <- maplibre(center = c(-102.9, 41.3), zoom = 3)
add_fill_layer(
  base_map,
  id = "svi_layer",
  source = svi_source,
  source_layer = "SVI2000_US_tract",
  filter = tract_filter,
  fill_opacity = 0.5,
  fill_color = svi_colors
)
|
71 |
+
|