Commit

test
cboettig committed Dec 19, 2024
1 parent a255d8d commit b4b03d4
Showing 9 changed files with 357 additions and 2 deletions.
20 changes: 20 additions & 0 deletions .github/workflows/deploy.yml
@@ -0,0 +1,20 @@
name: Sync to Hugging Face hub
on:
  push:
    branches: [main]

  # to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
          lfs: true
      - name: Push to hub
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: git push https://cboettig:[email protected]/spaces/${GITHUB_ACTOR}/streamlit main
11 changes: 11 additions & 0 deletions Dockerfile
@@ -0,0 +1,11 @@
FROM gitlab-registry.nrp-nautilus.io/cboettig/images
WORKDIR /app

COPY . .

# huggingface uses port 7860 by default
CMD streamlit run app.py \
    --server.address 0.0.0.0 \
    --server.port 7860 \
    --server.headless true \
    --server.fileWatcherType none
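
# to test the image locally (sketch; the tag name is illustrative):
#   docker build -t streamlit-demo .
#   docker run -p 7860:7860 streamlit-demo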
23 changes: 23 additions & 0 deletions Makefile
@@ -0,0 +1,23 @@
BASE="nature.datahub.berkeley.edu"
MAKEFLAGS += s

.PHONY: serve
serve:
	@echo "\n 🌎 preview at: \033[1m https://${BASE}${JUPYTERHUB_SERVICE_PREFIX}proxy/8501/ \033[0m \n"
	streamlit run app.py --server.port 8501 1> /dev/null 2>&1


.PHONY: chat
chat:
	@echo "\n 🌎 preview at: \033[1m https://${BASE}${JUPYTERHUB_SERVICE_PREFIX}proxy/8501/ \033[0m \n"
	streamlit run chat.py --server.port 8501 1> /dev/null 2>&1

.PHONY: rag
rag:
	@echo "\n 🌎 preview at: \033[1m https://${BASE}${JUPYTERHUB_SERVICE_PREFIX}proxy/8501/ \033[0m \n"
	streamlit run rag.py --server.port 8501 1> /dev/null 2>&1

.PHONY: sql
sql:
	@echo "\n 🌎 preview at: \033[1m https://${BASE}${JUPYTERHUB_SERVICE_PREFIX}proxy/8501/ \033[0m \n"
	streamlit run sql.py --server.port 8501 1> /dev/null 2>&1
21 changes: 19 additions & 2 deletions README.md
@@ -1,2 +1,19 @@
# app-python-template
Creating Web applications using Streamlit & LLM Agents
---
title: Streamlit Demo
emoji: 🌍
colorFrom: yellow
colorTo: indigo
sdk: docker
pinned: false
license: bsd-2-clause
---

For ESPM-157 students using <https://nature.datahub.berkeley.edu> servers:
To preview locally, use

```
make
```

and click the link provided
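
The Makefile in this commit also defines targets for the individual demo pages, e.g.

```
make chat
make rag
make sql
```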

42 changes: 42 additions & 0 deletions app.py
@@ -0,0 +1,42 @@
import streamlit as st

st.set_page_config(
    page_title="Streamlit demos",
)

st.sidebar.success("Select a demo above.")

st.title("Exploring LLM Agent Use")

'''
Select any of the demos on the sidebar. Each illustrates a different way we can incorporate an LLM tool to perform reliable data retrieval (sometimes called retrieval-augmented generation, RAG) from specified data resources.

In this module, you will adapt one or more of these agents into an interactive application exploring the redlining data we encountered in Module 3 (as seen below).
'''

import leafmap.maplibregl as leafmap
import ibis
from ibis import _

con = ibis.duckdb.connect()


# fixme could create drop-down selection of the 300 cities
city_name = st.text_input("Select a city", "Oakland")
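# One possible drop-down alternative (sketch only; assumes the distinct city names can be
# read from the same GeoPackage before the widget is drawn):
#
# cities = sorted(
#     con.read_geo("/vsicurl/https://dsl.richmond.edu/panorama/redlining/static/mappinginequality.gpkg")
#        .select(_.city).distinct().execute()["city"].dropna().tolist()
# )
# city_name = st.selectbox("Select a city", cities, index=cities.index("Oakland"))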

# Extract the specified city
city = (con
    .read_geo("/vsicurl/https://dsl.richmond.edu/panorama/redlining/static/mappinginequality.gpkg")
    .filter(_.city == city_name, _.residential)
    .execute()
)

# Render the map
m = leafmap.Map(style="positron")
if city_name == "Oakland":
    m.add_cog_layer("https://espm-157-f24.github.io/spatial-carl-amanda-tyler/ndvi.tif", name="ndvi", palette = "greens")
m.add_gdf(city, "fill", paint = {"fill-color": ["get", "fill"], "fill-opacity": 0.8})
m.add_layer_control()
m.to_streamlit()

54 changes: 54 additions & 0 deletions pages/chat.py
@@ -0,0 +1,54 @@
import streamlit as st
from openai import OpenAI

st.title("Chat Demo")

'''
This application presents a traditional chat interface to a range of open-source or open-weights models running on the National Research Platform (<https://nrp.ai>). Unlike the other two demos, this pattern does not draw on specified data resources.
'''


with st.sidebar:
    model = st.radio("Select an LLM:", ['olmo', 'gemma2', 'phi3', 'llama3', 'embed-mistral', 'mixtral', 'gorilla', 'groq-tools', 'llava'])
    st.session_state["model"] = model

## the dockerized streamlit app reads the API key from os.getenv(); otherwise fall back to st.secrets
import os
api_key = os.getenv("LITELLM_KEY")
if api_key is None:
    api_key = st.secrets["LITELLM_KEY"]


if "messages" not in st.session_state:
st.session_state.messages = []

for message in st.session_state.messages:
with st.chat_message(message["role"]):
st.markdown(message["content"])

client = OpenAI(
    api_key = api_key,
    base_url = "https://llm.nrp-nautilus.io"
)
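# note: the OpenAI client only needs a base_url to talk to an OpenAI-compatible endpoint;
# here it points at the NRP gateway rather than api.openai.com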

# Button to clear session state
if st.button('Clear History'):
    st.session_state.clear()

if prompt := st.chat_input("What is up?"):
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    with st.chat_message("assistant"):
        stream = client.chat.completions.create(
            model=st.session_state["model"],
            messages=[
                {"role": m["role"], "content": m["content"]}
                for m in st.session_state.messages
            ],
            stream=True,
        )
        response = st.write_stream(stream)
    st.session_state.messages.append({"role": "assistant", "content": response})
105 changes: 105 additions & 0 deletions pages/rag.py
@@ -0,0 +1,105 @@
import streamlit as st
from langchain_community.document_loaders import PyPDFLoader

## the dockerized streamlit app reads the API key from os.getenv(); otherwise fall back to st.secrets
import os
api_key = os.getenv("LITELLM_KEY")
if api_key is None:
    api_key = st.secrets["LITELLM_KEY"]


st.title("RAG Demo")


'''
This demonstration combines an LLM trained specifically for text embedding (`embed-mistral` in our case) with a traditional "instruct"-tuned LLM (`llama3`) to create a retrieval-augmented generation (RAG) interface to a provided PDF document. We can ask the model to return relatively precise citations to the matched text in the PDF document so we can verify its answers.

Provide a URL to a PDF document you want to ask questions about.
Once the document has been uploaded and parsed, ask your questions in the chat dialog that will appear below. The default example comes from a recent report on California's initiative for biodiversity conservation.
'''

# Create a file uploader?
# st.sidebar.file_uploader("Choose a PDF file", type=["pdf"])
url = st.text_input("PDF URL", "https://www.resources.ca.gov/-/media/CNRA-Website/Files/2024_30x30_Pathways_Progress_Report.pdf")

# +
import bs4
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter


# cache the parsed PDF so Streamlit does not re-download and re-parse it on every rerun
@st.cache_data
def pdf_loader(url):
    loader = PyPDFLoader(url)
    return loader.load()

docs = pdf_loader(url)


# Set up the language model
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model = "llama3", api_key = api_key, base_url = "https://llm.nrp-nautilus.io", temperature=0)

# Set up the embedding model
from langchain_openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings(
    model = "embed-mistral",
    api_key = api_key,
    base_url = "https://llm.nrp-nautilus.io"
)

# Build a retrieval agent
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = InMemoryVectorStore.from_documents(documents=splits, embedding=embedding)
retriever = vectorstore.as_retriever()
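# the retriever embeds each question with the same embed-mistral model and returns the
# most similar chunks, which are "stuffed" into the llama3 prompt assembled below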

from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)
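# rag_chain.invoke() returns a dict: "answer" holds the generated reply and "context"
# holds the retrieved document chunks (both are displayed below)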


# Place agent inside a streamlit application:

if prompt := st.chat_input("What is the goal of CA 30x30?"):
    with st.chat_message("user"):
        st.markdown(prompt)

    with st.chat_message("assistant"):
        results = rag_chain.invoke({"input": prompt})
        st.write(results['answer'])

    with st.expander("See context matched"):
        st.write(results['context'][0].page_content)
        st.write(results['context'][0].metadata)


# adapt for memory / multi-question interaction with:
# https://python.langchain.com/docs/tutorials/qa_chat_history/

# Also see structured outputs.
82 changes: 82 additions & 0 deletions pages/sql.py
@@ -0,0 +1,82 @@
import streamlit as st

st.title("SQL demo")


'''
This demonstration illustrates building an LLM-based agent that performs tasks by generating and executing code in response to plain-text questions. In this example, a custom system prompt instructs the LLM to generate SQL, which is then executed against the parquet data we generated in Module 3. Note that the SQL query itself is shown, as well as the table produced by the query.
'''

## the dockerized streamlit app reads the API key from os.getenv(); otherwise fall back to st.secrets
import os
api_key = os.getenv("LITELLM_KEY")
if api_key is None:
    api_key = st.secrets["LITELLM_KEY"]


parquet = st.text_input("parquet file:", "https://espm-157-f24.github.io/spatial-carl-amanda-tyler/new_haven_stats.parquet")

# create sharable low-level connection, see: https://github.com/Mause/duckdb_engine
import sqlalchemy
eng = sqlalchemy.create_engine("duckdb:///:memory:")

# ibis can talk to this connection and create the VIEW
import ibis
from ibis import _
con = ibis.duckdb.from_connection(eng.raw_connection())
tbl = con.read_parquet(parquet, "mydata")

# langchain can also talk to this connection and see the table:
from langchain_community.utilities import SQLDatabase
db = SQLDatabase(eng, view_support=True)
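# view_support=True lets SQLDatabase include the ibis-created view ("mydata") when it
# introspects the usable tables for the prompt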

#db.run(f"create or replace view mydata as select * from read_parquet('{parquet}');")
#print(db.get_usable_table_names())

# Build the template for system prompt
template = '''
You are a {dialect} expert. Given an input question, first create a syntactically correct {dialect} query to run, then look at the results of the query and return the answer to the input question.
Always return all columns from a query (select *) unless otherwise instructed.
Wrap each column name in double quotes (") to denote them as delimited identifiers.
Pay attention to use only the column names you can see in the tables below.
Be careful to not query for columns that do not exist.
Also, pay attention to which column is in which table.
Pay attention to use today() function to get the current date, if the question involves "today".
Respond with only the SQL query to run. Do not repeat the question or explanation. Just the raw SQL query.
Only use the following tables:
{table_info}
Question: {input}
'''

with st.sidebar:
    model = st.selectbox("LLM:", ["gorilla", "llama3", "olmo"])


from langchain_core.prompts import PromptTemplate
prompt = PromptTemplate.from_template(template, partial_variables = {"dialect": "duckdb", "top_k": 10})

# Now we are ready to create our model and start querying!
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model=model,  # use the model selected in the sidebar (try llama3, gorilla, or olmo)
                 temperature=0,
                 api_key=api_key,
                 base_url = "https://llm.nrp-nautilus.io")


from langchain.chains import create_sql_query_chain
chain = create_sql_query_chain(llm, db, prompt)
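# the chain maps {"question": ...} to a raw SQL string; executing that SQL (via ibis/duckdb)
# is handled separately below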

question = st.chat_input("What is the mean ndvi by grade?")

if question:
    response = chain.invoke({"question": question})
    with st.chat_message("ai"):
        st.write(response)
    df = tbl.sql(response).head(10).execute()
    df  # bare variable: Streamlit "magic" renders the dataframe




1 change: 1 addition & 0 deletions requirements.txt
@@ -0,0 +1 @@
streamlit
