From 7b44707a33bd66e2fc2f18d7d803960d77cac679 Mon Sep 17 00:00:00 2001 From: Yue Wu <14996988+yue-here@users.noreply.github.com> Date: Tue, 22 Oct 2024 12:26:43 -0700 Subject: [PATCH 1/6] inital commit for yellowhammer jupyter assistant --- examples/001-getting-started.ipynb | 2 +- examples/Demo.ipynb | 133 ++++++++ pyproject.toml | 1 + src/yellowhammer/__init__.py | 11 + src/yellowhammer/datalab-api-prompt.md | 447 +++++++++++++++++++++++++ src/yellowhammer/llm.py | 61 ++++ src/yellowhammer/magics.py | 86 +++++ src/yellowhammer/prompt.py | 13 + src/yellowhammer/system-prompt.md | 1 + src/yellowhammer/test.ipynb | 189 +++++++++++ src/yellowhammmer/__init__.py | 5 - uv.lock | 4 +- 12 files changed, 946 insertions(+), 7 deletions(-) create mode 100644 examples/Demo.ipynb create mode 100755 src/yellowhammer/datalab-api-prompt.md create mode 100644 src/yellowhammer/llm.py create mode 100644 src/yellowhammer/magics.py create mode 100644 src/yellowhammer/prompt.py create mode 100644 src/yellowhammer/system-prompt.md create mode 100644 src/yellowhammer/test.ipynb delete mode 100644 src/yellowhammmer/__init__.py diff --git a/examples/001-getting-started.ipynb b/examples/001-getting-started.ipynb index 5bf27ef..3c8eb0e 100644 --- a/examples/001-getting-started.ipynb +++ b/examples/001-getting-started.ipynb @@ -123,7 +123,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.10.9" } }, "nbformat": 4, diff --git a/examples/Demo.ipynb b/examples/Demo.ipynb new file mode 100644 index 0000000..20475f3 --- /dev/null +++ b/examples/Demo.ipynb @@ -0,0 +1,133 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "2671fc38-c9ca-49ef-948a-abb7815ca2b9", + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext yellowhammer" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "32c5ffc5-ea83-458c-8e5a-75bb15da8a2d", + "metadata": {}, + "outputs": [], + "source": [ + "import os, getpass\n", + "os.environ['LLM_PROVIDER'] = \"OPENAI\"\n", + "os.environ['LLM_API_KEY'] = getpass.getpass(\"Enter your LLM API key\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2f61e78f-b8a3-4f0c-b5e4-503af30019f3", + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"DATALAB_API_KEY\"] = getpass.getpass(\"Enter your Datalab API key\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f44cabeb-58ef-4ccd-86d7-5cb04a3f8643", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "Datalab is a data management platform designed for scientists and researchers to manage their experimental data efficiently. It allows users to organize, analyze, and visualize their data through a structured interface. Datalab uses the concept of \"data blocks\" to handle files associated with samples, enabling users to parse data according to scientific schemas and create plots for better data interpretation. The platform also provides an API for programmatic access, allowing users to automate tasks such as data entry, file uploads, and data analysis." + ], + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%llm\n", + "What is datalab?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d10442e5-7c5c-40cd-becf-9c2e9a79fd45", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "To create a new sample in Datalab with the specified ID, name, and formula, we will use the `create_item` method of the `DatalabClient`. This method requires the item ID, item type, and the data that matches the schema for samples. In this case, we will set the `item_id` to 'llm-test3', the `name` to 'virtual sample', and the `chemform` to 'RbCl'." + ], + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%llm\n", + "Create a sample with ID llm-test3, sample name \"virtual sample\", formula RbCl" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "33c0e59e-586d-48f3-aa5f-f6fa743dcc96", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/yue/Library/CloudStorage/OneDrive-Personal/code/yellowhammer/.venv/lib/python3.10/site-packages/datalab_api/_base.py:165: UserWarning: Found API URL https://demo-api.datalab-org.io in HTML meta tag. Creating client with this URL instead.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "from datalab_api import DatalabClient\n", + "import os\n", + "with DatalabClient(\"https://demo.datalab-org.io\") as client:\n", + " json_data = {\n", + " \"name\": \"virtual sample\",\n", + " \"chemform\": \"RbCl\"\n", + " }\n", + " client.create_item(item_id=\"llm-test3\", item_type=\"samples\", item_data=json_data)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "yellowhammer", + "language": "python", + "name": "yellowhammer" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml index ed363dc..a8ab03f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ dependencies = [ "datalab-api>=0.2.4", "jupyter-ai>=2.24.1", "jupyterlab>=4.2.5", + "langchain>=0.2.16", "langchain-anthropic>=0.1.23", "langchain-openai>=0.1.25", "notebook>=7.2.2", diff --git a/src/yellowhammer/__init__.py b/src/yellowhammer/__init__.py index 77003b9..9c8eac8 100644 --- a/src/yellowhammer/__init__.py +++ b/src/yellowhammer/__init__.py @@ -6,3 +6,14 @@ __version__ = "develop" __all__ = ("__version__",) + + +from .magics import datalabMagics + +def load_ipython_extension(ipython): + """ + Any module file that define a function named `load_ipython_extension` + can be loaded via `%load_ext module.path` or be configured to be + autoloaded by IPython at startup time. + """ + ipython.register_magics(datalabMagics) diff --git a/src/yellowhammer/datalab-api-prompt.md b/src/yellowhammer/datalab-api-prompt.md new file mode 100755 index 0000000..5497102 --- /dev/null +++ b/src/yellowhammer/datalab-api-prompt.md @@ -0,0 +1,447 @@ +Use the datalab Python API package to query entries on the datalab instance at https://demo.datalab-org.io/. +Each method of the DatalabClient class will return a dictionary constructed directly +from the JSON response of the Datalab API. + +Datalab uses "data blocks" to take a file attached to a sample, parse it +according to some scientific schema, and then make a plot. 
+ +The rest of this prompt contains the README for the datalab python API module `datalab_api`, which you already have installed. + +Assume the `DATALAB_API_KEY` has been set an environment variable. + +Python API +This package implements basic functionality for displaying and manipulating entries: + +```python +from datalab_api import DatalabClient + +with DatalabClient("https://demo.datalab-org.io") as client: + + # Get the info about this datalab instance + client.get_info() + + # Get the current user's info + client.authenticate() + + # Search for items with the string + items = client.search_items("search-values") + + # List all items of a given type + # Types can be 'samples' or 'starting_materials' + items = client.get_items(item_type="samples") + + # Get more info on a particular item called 'test' + item = client.get_item(item_id="test") + + # Create a new item with some data that matches the corresponding `item_type` schema + json_data = {"chemform": "NaCl"} + client.create_item(item_id="test_new", item_type="samples", item_data=json_data) + + # Attach a file to an item and get the uploaded ID + file_response = client.upload_file(filepath="my_echem_data.mpr", item_id="test") + file_id = file_response["file_id"] + + # Create a data block for a sample, then show the plot + client.create_data_block(item_id="test", file_ids=file_id) + + # Download all files attached to a sample and return their paths + file_paths = client.get_item_files(item_id="test") + + # Get the item graph, useful for finding relationships + graph = client.get_item_graph() + +``` + +Here is an abridged JSONSchema for a sample, that also has some info about other +types. + +```json +{ + "title": "Sample", + "description": "A model for representing an experimental sample.", + "type": "object", + "properties": { + "blocks_obj": { + "title": "Blocks Obj", + "default": {}, + "type": "object" + }, + "display_order": { + "title": "Display Order", + "default": [], + "type": "array", + "items": { + "type": "string" + } + }, + "collections": { + "title": "Collections", + "default": [], + "type": "array", + "items": { + "$ref": "#/definitions/Collection" + } + }, + "revision": { + "title": "Revision", + "default": 1, + "type": "integer" + }, + "revisions": { + "title": "Revisions", + "type": "object" + }, + "creator_ids": { + "title": "Creator Ids", + "default": [], + "type": "array", + "items": { + "type": "string" + } + }, + "creators": { + "title": "Creators", + "type": "array", + "items": { + "$ref": "#/definitions/Person" + } + }, + "type": { + "title": "Type", + "default": "samples", + "const": "samples", + "pattern": "^samples$", + "type": "string" + }, + "immutable_id": { + "title": "Immutable ID", + "type": "string" + }, + "last_modified": { + "title": "Last Modified", + "type": "date", + "format": "date-time" + }, + "relationships": { + "title": "Relationships", + "type": "array", + "items": { + "$ref": "#/definitions/TypedRelationship" + } + }, + "refcode": { + "title": "Refcode", + "minLength": 1, + "maxLength": 40, + "pattern": "^[a-z]{2,10}:(?:[a-zA-Z0-9]+|[a-zA-Z0-9][a-zA-Z0-9._-]+[a-zA-Z0-9])$", + "type": "string" + }, + "item_id": { + "title": "Item Id", + "minLength": 1, + "maxLength": 40, + "pattern": "^(?:[a-zA-Z0-9]+|[a-zA-Z0-9][a-zA-Z0-9._-]+[a-zA-Z0-9])$", + "type": "string" + }, + "description": { + "title": "Description", + "type": "string" + }, + "date": { + "title": "Date", + "type": "date", + "format": "date-time" + }, + "name": { + "title": "Name", + "type": "string" + }, + "files": { + "title": 
"Files", + "type": "array", + "items": { + "$ref": "#/definitions/File" + } + }, + "file_ObjectIds": { + "title": "File Objectids", + "default": [], + "type": "array", + "items": { + "type": "string" + } + }, + "chemform": { + "title": "Chemform", + "example": [ + "Na3P", + "LiNiO2@C" + ], + "type": "string" + }, + "synthesis_constituents": { + "title": "Synthesis Constituents", + "default": [], + "type": "array", + "items": { + "$ref": "#/definitions/Constituent" + } + }, + "synthesis_description": { + "title": "Synthesis Description", + "type": "string" + } + }, + "required": [ + "item_id" + ], + "definitions": { + "KnownType": { + "title": "KnownType", + "description": "An enumeration of the types of entry known by this implementation, should be made dynamic in the future.", + "enum": [ + "samples", + "starting_materials", + "blocks", + "files", + "people", + "collections" + ], + "type": "string" + }, + "File": { + "title": "File", + "description": "A model for representing a file that has been tracked or uploaded to datalab.", + "type": "object", + "properties": { + "revision": { + "title": "Revision", + "default": 1, + "type": "integer" + }, + "revisions": { + "title": "Revisions", + "type": "object" + }, + "creator_ids": { + "title": "Creator Ids", + "default": [], + "type": "array", + "items": { + "type": "string" + } + }, + "creators": { + "title": "Creators", + "type": "array", + "items": { + "$ref": "#/definitions/Person" + } + }, + "type": { + "title": "Type", + "default": "files", + "const": "files", + "pattern": "^files$", + "type": "string" + }, + "immutable_id": { + "title": "Immutable ID", + "type": "string" + }, + "last_modified": { + "title": "Last Modified", + "type": "date", + "format": "date-time" + }, + "relationships": { + "title": "Relationships", + "type": "array", + "items": { + "$ref": "#/definitions/TypedRelationship" + } + }, + "size": { + "title": "Size", + "description": "The size of the file on disk in bytes.", + "type": "integer" + }, + "last_modified_remote": { + "title": "Last Modified Remote", + "description": "The last date/time at which the remote file was modified.", + "type": "date", + "format": "date-time" + }, + "item_ids": { + "title": "Item Ids", + "description": "A list of item IDs associated with this file.", + "type": "array", + "items": { + "type": "string" + } + }, + "blocks": { + "title": "Blocks", + "description": "A list of block IDs associated with this file.", + "type": "array", + "items": { + "type": "string" + } + }, + "name": { + "title": "Name", + "description": "The filename on disk.", + "type": "string" + }, + "extension": { + "title": "Extension", + "description": "The file extension that the file was uploaded with.", + "type": "string" + }, + "original_name": { + "title": "Original Name", + "description": "The raw filename as uploaded.", + "type": "string" + }, + "location": { + "title": "Location", + "description": "The location of the file on disk.", + "type": "string" + }, + "url_path": { + "title": "Url Path", + "description": "The path to a remote file.", + "type": "string" + }, + "source": { + "title": "Source", + "description": "The source of the file, e.g. 
'remote' or 'uploaded'.", + "type": "string" + }, + "time_added": { + "title": "Time Added", + "description": "The timestamp for the original file upload.", + "type": "string", + "format": "date-time" + }, + "metadata": { + "title": "Metadata", + "description": "Any additional metadata.", + "type": "object" + }, + "representation": { + "title": "Representation" + }, + "source_server_name": { + "title": "Source Server Name", + "description": "The server name at which the file is stored.", + "type": "string" + }, + "source_path": { + "title": "Source Path", + "description": "The path to the file on the remote resource.", + "type": "string" + }, + "is_live": { + "title": "Is Live", + "description": "Whether or not the file should be watched for future updates.", + "type": "boolean" + } + }, + "required": [ + "item_ids", + "blocks", + "name", + "extension", + "time_added", + "is_live" + ] + }, + "EntryReference": { + "title": "EntryReference", + "description": "A reference to a database entry by ID and type.\n\nCan include additional arbitarary metadata useful for\ninlining the item data.", + "type": "object", + "properties": { + "type": { + "title": "Type", + "type": "string" + }, + "name": { + "title": "Name", + "type": "string" + }, + "immutable_id": { + "title": "Immutable Id", + "type": "string" + }, + "item_id": { + "title": "Item Id", + "minLength": 1, + "maxLength": 40, + "pattern": "^(?:[a-zA-Z0-9]+|[a-zA-Z0-9][a-zA-Z0-9._-]+[a-zA-Z0-9])$", + "type": "string" + }, + "refcode": { + "title": "Refcode", + "minLength": 1, + "maxLength": 40, + "pattern": "^[a-z]{2,10}:(?:[a-zA-Z0-9]+|[a-zA-Z0-9][a-zA-Z0-9._-]+[a-zA-Z0-9])$", + "type": "string" + } + }, + "required": [ + "type" + ] + }, + "InlineSubstance": { + "title": "InlineSubstance", + "type": "object", + "properties": { + "name": { + "title": "Name", + "type": "string" + }, + "chemform": { + "title": "Chemform", + "type": "string" + } + }, + "required": [ + "name" + ] + }, + "Constituent": { + "title": "Constituent", + "description": "A constituent of a sample.", + "type": "object", + "properties": { + "item": { + "title": "Item", + "anyOf": [ + { + "$ref": "#/definitions/EntryReference" + }, + { + "$ref": "#/definitions/InlineSubstance" + } + ] + }, + "quantity": { + "title": "Quantity", + "minimum": 0, + "type": "number" + }, + "unit": { + "title": "Unit", + "default": "g", + "type": "string" + } + }, + "required": [ + "item", + "quantity" + ] + } + } +} +``` diff --git a/src/yellowhammer/llm.py b/src/yellowhammer/llm.py new file mode 100644 index 0000000..bed2644 --- /dev/null +++ b/src/yellowhammer/llm.py @@ -0,0 +1,61 @@ +from langchain_core.prompts import ChatPromptTemplate +from pydantic import BaseModel, Field +from .prompt import SYSTEM_PROMPT +from typing import Union + +# Pydantic schema to use with .with_structured_output() +class code(BaseModel): + """Schema for code solutions""" + prefix: str = Field(description="Description of the problem and approach") + imports: str = Field(description="Code block import statements") + code: str = Field(description="Code block not including import statements") + +class ConversationalResponse(BaseModel): + """Respond in a conversational manner. 
Be kind and helpful.""" + response: str = Field(description="A conversational response to the user's query") + +class FinalResponse(BaseModel): + """The final response can be either a code solution or a conversational response""" + final_output: Union[code, ConversationalResponse] + +def get_chain( + api_provider, + api_key, + api_model=None, + api_temperature=0, + ): + + # API provider logic + if api_provider.lower() == "openai": + from langchain_openai import ChatOpenAI + if api_model is None: + api_model = "gpt-4o-mini" + llm = ChatOpenAI( + model=api_model, + temperature=api_temperature, + openai_api_key=api_key + ) + + elif api_provider.lower() == "anthropic": + from langchain_anthropic import ChatAnthropic + if api_model is None: + api_model = "claude-3-5-sonnet-20240620" + llm = ChatAnthropic( + model=api_model, + temperature=api_temperature, + anthropic_api_key=api_key, + ) + + # Prompt + prompt = ChatPromptTemplate.from_messages( + [ + ("system", SYSTEM_PROMPT), # datalab API info is passed via {context} to the system prompt + ("placeholder", "{messages}"), + ] + ) + + # Create a chain where the final output takes the FinalResponse schema + chain = prompt | llm.with_structured_output(FinalResponse, include_raw=False) + + # Returns a runnable chain which accepts datalab API documentation "context" and user question "messages" + return chain \ No newline at end of file diff --git a/src/yellowhammer/magics.py b/src/yellowhammer/magics.py new file mode 100644 index 0000000..8bd94cd --- /dev/null +++ b/src/yellowhammer/magics.py @@ -0,0 +1,86 @@ +""" +Magics to support LLM interactions in IPython/Jupyter. +Adapted from fperez/jupytee and jan-janssen/LangSim. +""" +import os + +from IPython import get_ipython +from IPython.core.magic import ( + Magics, + magics_class, + line_cell_magic, +) +from IPython.core.magic_arguments import ( + magic_arguments, + argument, + parse_argstring, +) +from IPython.display import Markdown +from .llm import get_chain, code, FinalResponse, ConversationalResponse +from .prompt import API_PROMPT + + +def get_output(messages, temp=0.1): + env = os.environ + agent_executor = get_chain( + api_provider=env.get("LLM_PROVIDER", "OPENAI"), + api_key=env.get("LLM_API_KEY"), + api_model=env.get("LLM_MODEL", None), + api_temperature=env.get("LLM_TEMP", temp), + ) + + return agent_executor.invoke({"context": API_PROMPT, "messages": messages}) + +# Class to manage state and expose the main magics +@magics_class +class datalabMagics(Magics): + def __init__(self, shell): + super(datalabMagics, self).__init__(shell) + self.messages = [] + + # A datalab magic that returns a code block + @magic_arguments() + @argument( + "prompt", + nargs="*", + help="""Prompt for code generation. When used as a line magic, + it runs to the end of the line. In cell mode, the entire cell + is considered the code generation prompt. + """, + ) + + @argument( + "-T", + "--temp", + type=float, + default=0.0, + help="""Temperature, float in [0,1]. Higher values push the algorithm + to generate more aggressive/"creative" output. [default=0.1].""", + ) + + @line_cell_magic + def llm(self, line, cell=None): + """ + Chat with the LLM. Return either conversation or code. 
+ """ + args = parse_argstring(self.llm, line) #self.llm is a bound method + + if cell is None: + prompt = " ".join(args.prompt) + else: + prompt = cell + + self.messages.append(("human", prompt)) + response = get_output(self.messages).final_output + + if isinstance(response, ConversationalResponse): + output = response.response + self.messages.append(("ai", output)) + return Markdown(output) + + elif isinstance(response, code): + output = response + self.messages.append(("ai", output.prefix)) + cell_fill = output.imports + "\n" + output.code + get_ipython().set_next_input(cell_fill) + return Markdown(output.prefix) \ No newline at end of file diff --git a/src/yellowhammer/prompt.py b/src/yellowhammer/prompt.py new file mode 100644 index 0000000..efb5d12 --- /dev/null +++ b/src/yellowhammer/prompt.py @@ -0,0 +1,13 @@ +from pathlib import Path + +def load_file_content(file_path): + with open(file_path, 'r', encoding='utf-8') as file: + return file.read() + +# Load API_PROMPT +api_prompt_path = Path(__file__).parent / "datalab-api-prompt.md" +API_PROMPT = load_file_content(api_prompt_path) + +# Load SYSTEM_PROMPT +system_prompt_path = Path(__file__).parent / "system-prompt.md" +SYSTEM_PROMPT = load_file_content(system_prompt_path) \ No newline at end of file diff --git a/src/yellowhammer/system-prompt.md b/src/yellowhammer/system-prompt.md new file mode 100644 index 0000000..18ce00c --- /dev/null +++ b/src/yellowhammer/system-prompt.md @@ -0,0 +1 @@ +You are a virtual assistant that helps scientists use the data management platform datalab to manage their experimental data, plan experiments, analyse data and plot results. Here is the documentation of the datalab API: {context}. Answer user questions and use the provided documentation to interact with datalab. Your code responses will be output to a Jupyter notebook cell that has access to the datalab API and common python scientific libraries. Ensure any code you provide can be executed with all required imports and variables defined. Structure your answer with a description of the code solution. Then list the imports. Finally list the functioning code block. 
Here is the user question: \ No newline at end of file diff --git a/src/yellowhammer/test.ipynb b/src/yellowhammer/test.ipynb new file mode 100644 index 0000000..4961327 --- /dev/null +++ b/src/yellowhammer/test.ipynb @@ -0,0 +1,189 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "from langchain_community.document_loaders import TextLoader\n", + "\n", + "path = \"datalab-api-prompt.md\"\n", + "loader = TextLoader(path)\n", + "API_PROMPT = loader.load()[0].page_content\n", + "\n", + "path = \"system-prompt.md\"\n", + "loader = TextLoader(path)\n", + "SYSTEM_PROMPT = loader.load()[0].page_content\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Use the datalab Python API package to query entries on the datalab instance at https://demo.datalab-org.io/.\\nEach method of the DatalabClient class will return a dictionary constructed directly\\nfrom the JSON response of the Datalab API.\\n\\nDatalab uses \"data blocks\" to take a file attached to a sample, parse it\\naccording to some scientific schema, and then make a plot.\\n\\nThe rest of this prompt contains the README for the datalab python API module `datalab_api`, which you already have installed.\\n\\nAssume the `DATALAB_API_KEY` has been set an environment variable.\\n\\nPython API\\nThis package implements basic functionality for displaying and manipulating entries:\\n\\n```python\\nfrom datalab_api import DatalabClient\\n\\nwith DatalabClient(\"https://demo.datalab-org.io\") as client:\\n\\n # Get the info about this datalab instance\\n client.get_info()\\n\\n # Get the current user\\'s info\\n client.authenticate()\\n\\n # Search for items with the string\\n items = client.search_items(\"search-values\")\\n\\n # List all items of a given type\\n # Types can be \\'samples\\' or \\'starting_materials\\'\\n items = client.get_items(item_type=\"samples\")\\n\\n # Get more info on a particular item called \\'test\\'\\n item = client.get_item(item_id=\"test\")\\n\\n # Create a new item with some data that matches the corresponding `item_type` schema\\n json_data = {\"chemform\": \"NaCl\"}\\n client.create_item(item_id=\"test_new\", item_type=\"samples\", item_data=json_data)\\n\\n # Attach a file to an item and get the uploaded ID\\n file_response = client.upload_file(filepath=\"my_echem_data.mpr\", item_id=\"test\")\\n file_id = file_response[\"file_id\"]\\n\\n # Create a data block for a sample, then show the plot\\n client.create_data_block(item_id=\"test\", file_ids=file_id)\\n\\n # Download all files attached to a sample and return their paths\\n file_paths = client.get_item_files(item_id=\"test\")\\n\\n # Get the item graph, useful for finding relationships\\n graph = client.get_item_graph()\\n\\n```\\n\\nHere is an abridged JSONSchema for a sample, that also has some info about other\\ntypes.\\n\\n```json \\n{\\n \"title\": \"Sample\",\\n \"description\": \"A model for representing an experimental sample.\",\\n \"type\": \"object\",\\n \"properties\": {\\n \"blocks_obj\": {\\n \"title\": \"Blocks Obj\",\\n \"default\": {},\\n \"type\": \"object\"\\n },\\n \"display_order\": {\\n \"title\": \"Display Order\",\\n \"default\": [],\\n \"type\": \"array\",\\n \"items\": {\\n \"type\": \"string\"\\n }\\n },\\n \"collections\": {\\n \"title\": \"Collections\",\\n \"default\": [],\\n \"type\": \"array\",\\n \"items\": {\\n \"$ref\": 
\"#/definitions/Collection\"\\n }\\n },\\n \"revision\": {\\n \"title\": \"Revision\",\\n \"default\": 1,\\n \"type\": \"integer\"\\n },\\n \"revisions\": {\\n \"title\": \"Revisions\",\\n \"type\": \"object\"\\n },\\n \"creator_ids\": {\\n \"title\": \"Creator Ids\",\\n \"default\": [],\\n \"type\": \"array\",\\n \"items\": {\\n \"type\": \"string\"\\n }\\n },\\n \"creators\": {\\n \"title\": \"Creators\",\\n \"type\": \"array\",\\n \"items\": {\\n \"$ref\": \"#/definitions/Person\"\\n }\\n },\\n \"type\": {\\n \"title\": \"Type\",\\n \"default\": \"samples\",\\n \"const\": \"samples\",\\n \"pattern\": \"^samples$\",\\n \"type\": \"string\"\\n },\\n \"immutable_id\": {\\n \"title\": \"Immutable ID\",\\n \"type\": \"string\"\\n },\\n \"last_modified\": {\\n \"title\": \"Last Modified\",\\n \"type\": \"date\",\\n \"format\": \"date-time\"\\n },\\n \"relationships\": {\\n \"title\": \"Relationships\",\\n \"type\": \"array\",\\n \"items\": {\\n \"$ref\": \"#/definitions/TypedRelationship\"\\n }\\n },\\n \"refcode\": {\\n \"title\": \"Refcode\",\\n \"minLength\": 1,\\n \"maxLength\": 40,\\n \"pattern\": \"^[a-z]{2,10}:(?:[a-zA-Z0-9]+|[a-zA-Z0-9][a-zA-Z0-9._-]+[a-zA-Z0-9])$\",\\n \"type\": \"string\"\\n },\\n \"item_id\": {\\n \"title\": \"Item Id\",\\n \"minLength\": 1,\\n \"maxLength\": 40,\\n \"pattern\": \"^(?:[a-zA-Z0-9]+|[a-zA-Z0-9][a-zA-Z0-9._-]+[a-zA-Z0-9])$\",\\n \"type\": \"string\"\\n },\\n \"description\": {\\n \"title\": \"Description\",\\n \"type\": \"string\"\\n },\\n \"date\": {\\n \"title\": \"Date\",\\n \"type\": \"date\",\\n \"format\": \"date-time\"\\n },\\n \"name\": {\\n \"title\": \"Name\",\\n \"type\": \"string\"\\n },\\n \"files\": {\\n \"title\": \"Files\",\\n \"type\": \"array\",\\n \"items\": {\\n \"$ref\": \"#/definitions/File\"\\n }\\n },\\n \"file_ObjectIds\": {\\n \"title\": \"File Objectids\",\\n \"default\": [],\\n \"type\": \"array\",\\n \"items\": {\\n \"type\": \"string\"\\n }\\n },\\n \"chemform\": {\\n \"title\": \"Chemform\",\\n \"example\": [\\n \"Na3P\",\\n \"LiNiO2@C\"\\n ],\\n \"type\": \"string\"\\n },\\n \"synthesis_constituents\": {\\n \"title\": \"Synthesis Constituents\",\\n \"default\": [],\\n \"type\": \"array\",\\n \"items\": {\\n \"$ref\": \"#/definitions/Constituent\"\\n }\\n },\\n \"synthesis_description\": {\\n \"title\": \"Synthesis Description\",\\n \"type\": \"string\"\\n }\\n },\\n \"required\": [\\n \"item_id\"\\n ],\\n \"definitions\": {\\n \"KnownType\": {\\n \"title\": \"KnownType\",\\n \"description\": \"An enumeration of the types of entry known by this implementation, should be made dynamic in the future.\",\\n \"enum\": [\\n \"samples\",\\n \"starting_materials\",\\n \"blocks\",\\n \"files\",\\n \"people\",\\n \"collections\"\\n ],\\n \"type\": \"string\"\\n },\\n \"File\": {\\n \"title\": \"File\",\\n \"description\": \"A model for representing a file that has been tracked or uploaded to datalab.\",\\n \"type\": \"object\",\\n \"properties\": {\\n \"revision\": {\\n \"title\": \"Revision\",\\n \"default\": 1,\\n \"type\": \"integer\"\\n },\\n \"revisions\": {\\n \"title\": \"Revisions\",\\n \"type\": \"object\"\\n },\\n \"creator_ids\": {\\n \"title\": \"Creator Ids\",\\n \"default\": [],\\n \"type\": \"array\",\\n \"items\": {\\n \"type\": \"string\"\\n }\\n },\\n \"creators\": {\\n \"title\": \"Creators\",\\n \"type\": \"array\",\\n \"items\": {\\n \"$ref\": \"#/definitions/Person\"\\n }\\n },\\n \"type\": {\\n \"title\": \"Type\",\\n \"default\": \"files\",\\n \"const\": \"files\",\\n \"pattern\": \"^files$\",\\n \"type\": 
\"string\"\\n },\\n \"immutable_id\": {\\n \"title\": \"Immutable ID\",\\n \"type\": \"string\"\\n },\\n \"last_modified\": {\\n \"title\": \"Last Modified\",\\n \"type\": \"date\",\\n \"format\": \"date-time\"\\n },\\n \"relationships\": {\\n \"title\": \"Relationships\",\\n \"type\": \"array\",\\n \"items\": {\\n \"$ref\": \"#/definitions/TypedRelationship\"\\n }\\n },\\n \"size\": {\\n \"title\": \"Size\",\\n \"description\": \"The size of the file on disk in bytes.\",\\n \"type\": \"integer\"\\n },\\n \"last_modified_remote\": {\\n \"title\": \"Last Modified Remote\",\\n \"description\": \"The last date/time at which the remote file was modified.\",\\n \"type\": \"date\",\\n \"format\": \"date-time\"\\n },\\n \"item_ids\": {\\n \"title\": \"Item Ids\",\\n \"description\": \"A list of item IDs associated with this file.\",\\n \"type\": \"array\",\\n \"items\": {\\n \"type\": \"string\"\\n }\\n },\\n \"blocks\": {\\n \"title\": \"Blocks\",\\n \"description\": \"A list of block IDs associated with this file.\",\\n \"type\": \"array\",\\n \"items\": {\\n \"type\": \"string\"\\n }\\n },\\n \"name\": {\\n \"title\": \"Name\",\\n \"description\": \"The filename on disk.\",\\n \"type\": \"string\"\\n },\\n \"extension\": {\\n \"title\": \"Extension\",\\n \"description\": \"The file extension that the file was uploaded with.\",\\n \"type\": \"string\"\\n },\\n \"original_name\": {\\n \"title\": \"Original Name\",\\n \"description\": \"The raw filename as uploaded.\",\\n \"type\": \"string\"\\n },\\n \"location\": {\\n \"title\": \"Location\",\\n \"description\": \"The location of the file on disk.\",\\n \"type\": \"string\"\\n },\\n \"url_path\": {\\n \"title\": \"Url Path\",\\n \"description\": \"The path to a remote file.\",\\n \"type\": \"string\"\\n },\\n \"source\": {\\n \"title\": \"Source\",\\n \"description\": \"The source of the file, e.g. 
\\'remote\\' or \\'uploaded\\'.\",\\n \"type\": \"string\"\\n },\\n \"time_added\": {\\n \"title\": \"Time Added\",\\n \"description\": \"The timestamp for the original file upload.\",\\n \"type\": \"string\",\\n \"format\": \"date-time\"\\n },\\n \"metadata\": {\\n \"title\": \"Metadata\",\\n \"description\": \"Any additional metadata.\",\\n \"type\": \"object\"\\n },\\n \"representation\": {\\n \"title\": \"Representation\"\\n },\\n \"source_server_name\": {\\n \"title\": \"Source Server Name\",\\n \"description\": \"The server name at which the file is stored.\",\\n \"type\": \"string\"\\n },\\n \"source_path\": {\\n \"title\": \"Source Path\",\\n \"description\": \"The path to the file on the remote resource.\",\\n \"type\": \"string\"\\n },\\n \"is_live\": {\\n \"title\": \"Is Live\",\\n \"description\": \"Whether or not the file should be watched for future updates.\",\\n \"type\": \"boolean\"\\n }\\n },\\n \"required\": [\\n \"item_ids\",\\n \"blocks\",\\n \"name\",\\n \"extension\",\\n \"time_added\",\\n \"is_live\"\\n ]\\n },\\n \"EntryReference\": {\\n \"title\": \"EntryReference\",\\n \"description\": \"A reference to a database entry by ID and type.\\\\n\\\\nCan include additional arbitarary metadata useful for\\\\ninlining the item data.\",\\n \"type\": \"object\",\\n \"properties\": {\\n \"type\": {\\n \"title\": \"Type\",\\n \"type\": \"string\"\\n },\\n \"name\": {\\n \"title\": \"Name\",\\n \"type\": \"string\"\\n },\\n \"immutable_id\": {\\n \"title\": \"Immutable Id\",\\n \"type\": \"string\"\\n },\\n \"item_id\": {\\n \"title\": \"Item Id\",\\n \"minLength\": 1,\\n \"maxLength\": 40,\\n \"pattern\": \"^(?:[a-zA-Z0-9]+|[a-zA-Z0-9][a-zA-Z0-9._-]+[a-zA-Z0-9])$\",\\n \"type\": \"string\"\\n },\\n \"refcode\": {\\n \"title\": \"Refcode\",\\n \"minLength\": 1,\\n \"maxLength\": 40,\\n \"pattern\": \"^[a-z]{2,10}:(?:[a-zA-Z0-9]+|[a-zA-Z0-9][a-zA-Z0-9._-]+[a-zA-Z0-9])$\",\\n \"type\": \"string\"\\n }\\n },\\n \"required\": [\\n \"type\"\\n ]\\n },\\n \"InlineSubstance\": {\\n \"title\": \"InlineSubstance\",\\n \"type\": \"object\",\\n \"properties\": {\\n \"name\": {\\n \"title\": \"Name\",\\n \"type\": \"string\"\\n },\\n \"chemform\": {\\n \"title\": \"Chemform\",\\n \"type\": \"string\"\\n }\\n },\\n \"required\": [\\n \"name\"\\n ]\\n },\\n \"Constituent\": {\\n \"title\": \"Constituent\",\\n \"description\": \"A constituent of a sample.\",\\n \"type\": \"object\",\\n \"properties\": {\\n \"item\": {\\n \"title\": \"Item\",\\n \"anyOf\": [\\n {\\n \"$ref\": \"#/definitions/EntryReference\"\\n },\\n {\\n \"$ref\": \"#/definitions/InlineSubstance\"\\n }\\n ]\\n },\\n \"quantity\": {\\n \"title\": \"Quantity\",\\n \"minimum\": 0,\\n \"type\": \"number\"\\n },\\n \"unit\": {\\n \"title\": \"Unit\",\\n \"default\": \"g\",\\n \"type\": \"string\"\\n }\\n },\\n \"required\": [\\n \"item\",\\n \"quantity\"\\n ]\\n }\\n }\\n}\\n```\\n'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "API_PROMPT" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def load_file_content(file_path):\n", + " with open(file_path, 'r', encoding='utf-8') as file:\n", + " return file.read()\n", + "\n", + "# Load API_PROMPT\n", + "api_prompt_path = \"datalab-api-prompt.md\"\n", + "API_PROMPT_ = load_file_content(api_prompt_path)\n", + "\n", + "# Load SYSTEM_PROMPT\n", + "system_prompt_path = \"system-prompt.md\"\n", + "SYSTEM_PROMPT_ = load_file_content(system_prompt_path)" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "11679" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(API_PROMPT_)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "11679" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(API_PROMPT)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "698" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(SYSTEM_PROMPT_)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "698" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(SYSTEM_PROMPT)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'You are a virtual assistant that helps scientists use the data management platform datalab to manage their experimental data, plan experiments, analyse data and plot results. Here is the documentation of the datalab API: {context}. Answer user questions and use the provided documentation to interact with datalab. Your code responses will be output to a Jupyter notebook cell that has access to the datalab API and common python scientific libraries. Ensure any code you provide can be executed with all required imports and variables defined. Structure your answer with a description of the code solution. Then list the imports. Finally list the functioning code block. Here is the user question:'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "SYSTEM_PROMPT" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/yellowhammmer/__init__.py b/src/yellowhammmer/__init__.py deleted file mode 100644 index 9485887..0000000 --- a/src/yellowhammmer/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from importlib import metadata - -__version__ = metadata.version("yellowhammer") - -__all__ = ("__version__",) diff --git a/uv.lock b/uv.lock index 5939b9a..c78ff28 100644 --- a/uv.lock +++ b/uv.lock @@ -3053,12 +3053,13 @@ wheels = [ [[package]] name = "yellowhammer" -version = "0.1.dev8+g117550e.d20241008" +version = "0.1.dev11+gf3789f6.d20241022" source = { editable = "." 
} dependencies = [ { name = "datalab-api" }, { name = "jupyter-ai" }, { name = "jupyterlab" }, + { name = "langchain" }, { name = "langchain-anthropic" }, { name = "langchain-openai" }, { name = "notebook" }, @@ -3076,6 +3077,7 @@ requires-dist = [ { name = "datalab-api", specifier = ">=0.2.4" }, { name = "jupyter-ai", specifier = ">=2.24.1" }, { name = "jupyterlab", specifier = ">=4.2.5" }, + { name = "langchain", specifier = ">=0.2.16" }, { name = "langchain-anthropic", specifier = ">=0.1.23" }, { name = "langchain-openai", specifier = ">=0.1.25" }, { name = "notebook", specifier = ">=7.2.2" }, From bf94201ed72f709757a52674d0f87954edda6a8a Mon Sep 17 00:00:00 2001 From: Yue Wu <14996988+yue-here@users.noreply.github.com> Date: Wed, 23 Oct 2024 17:51:27 -0700 Subject: [PATCH 2/6] minor changes for initial commit --- src/yellowhammer/magics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/yellowhammer/magics.py b/src/yellowhammer/magics.py index 8bd94cd..4da2e57 100644 --- a/src/yellowhammer/magics.py +++ b/src/yellowhammer/magics.py @@ -33,9 +33,9 @@ def get_output(messages, temp=0.1): # Class to manage state and expose the main magics @magics_class -class datalabMagics(Magics): +class DatalabMagics(Magics): def __init__(self, shell): - super(datalabMagics, self).__init__(shell) + super(DatalabMagics, self).__init__(shell) self.messages = [] # A datalab magic that returns a code block From d93d90e4ea2d3f9a826413c4481775716c10055d Mon Sep 17 00:00:00 2001 From: Yue Wu <14996988+yue-here@users.noreply.github.com> Date: Wed, 23 Oct 2024 17:52:18 -0700 Subject: [PATCH 3/6] minor changes for initial commit --- examples/001-getting-started.ipynb | 131 ------------ examples/Demo_anthropic.ipynb | 197 ++++++++++++++++++ examples/{Demo.ipynb => Demo_openai.ipynb} | 3 +- .../datalab-api-prompt.md | 0 .../yellowhammer => prompts}/system-prompt.md | 0 pyproject.toml | 6 +- src/yellowhammer/__init__.py | 5 +- src/yellowhammer/llm.py | 33 +-- src/yellowhammer/magics.py | 19 +- src/yellowhammer/prompt.py | 14 +- src/yellowhammer/test.ipynb | 39 +++- uv.lock | 2 +- 12 files changed, 285 insertions(+), 164 deletions(-) delete mode 100644 examples/001-getting-started.ipynb create mode 100644 examples/Demo_anthropic.ipynb rename examples/{Demo.ipynb => Demo_openai.ipynb} (98%) rename {src/yellowhammer => prompts}/datalab-api-prompt.md (100%) rename {src/yellowhammer => prompts}/system-prompt.md (100%) diff --git a/examples/001-getting-started.ipynb b/examples/001-getting-started.ipynb deleted file mode 100644 index 3c8eb0e..0000000 --- a/examples/001-getting-started.ipynb +++ /dev/null @@ -1,131 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "929301f6-5982-4662-b68f-9e72269cace7", - "metadata": {}, - "outputs": [], - "source": [ - "%reload_ext jupyter_ai " - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "f7f8325c-18b8-4943-b7e8-14a839060958", - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "Here's the information about Datalab in markdown format:\n", - "\n", - "# Datalab\n", - "\n", - "Datalab is a tool developed by Google Cloud Platform that provides an interactive environment for data exploration, analysis, and machine learning. 
Key features include:\n", - "\n", - "## Features\n", - "\n", - "- **Jupyter Notebooks**: Uses Jupyter notebooks for interactive coding and visualization\n", - "- **Cloud Integration**: Seamlessly integrates with Google Cloud services\n", - "- **Big Data Support**: Designed to work with large datasets using BigQuery and other Google Cloud data services\n", - "- **Pre-installed Libraries**: Comes with popular data science libraries like pandas, numpy, and scikit-learn\n", - "- **Collaborative**: Allows for easy sharing and collaboration on data projects\n", - "\n", - "## Use Cases\n", - "\n", - "- Data exploration and visualization\n", - "- Machine learning model development\n", - "- Big data analysis\n", - "- Prototyping data pipelines\n", - "\n", - "## Advantages\n", - "\n", - "- Easy setup and configuration\n", - "- Cost-effective (pay only for resources used)\n", - "- Scalable to handle large datasets\n", - "- Integrates well with other Google Cloud services\n", - "\n", - "Datalab is particularly useful for data scientists and analysts who work with Google Cloud Platform and need a powerful, cloud-based environment for their data projects." - ], - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": { - "text/markdown": { - "jupyter_ai": { - "model_id": "claude-3-5-sonnet-20240620", - "provider_id": "anthropic-chat" - } - } - }, - "output_type": "execute_result" - } - ], - "source": [ - "%%ai anthropic-chat:claude-3-5-sonnet-20240620\n", - "Do you know what datalab is?" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "12aaa0b1-6ccd-4645-b578-d1fde5cd2614", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "UsageError: Line magic function `%%yellowhammer` not found.\n" - ] - } - ], - "source": [ - "\"\"\" (mock up ) \"\"\"\n", - "%%yellowhammer\n", - "Do you know what datalab is?" - ] - }, - { - "cell_type": "markdown", - "id": "b943d5eb-a83a-4ed5-b531-e69a44d6d701", - "metadata": {}, - "source": [ - "Yes, *datalab* is ACTUALLY an open source research data management tool for chemistry and materials science, and here is the functionality I know about..." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8d93f30b-3dde-457a-8aa9-9d08cc19f3ad", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/Demo_anthropic.ipynb b/examples/Demo_anthropic.ipynb new file mode 100644 index 0000000..ab73625 --- /dev/null +++ b/examples/Demo_anthropic.ipynb @@ -0,0 +1,197 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "2671fc38-c9ca-49ef-948a-abb7815ca2b9", + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext yellowhammer" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "32c5ffc5-ea83-458c-8e5a-75bb15da8a2d", + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Enter your LLM API key ········\n" + ] + } + ], + "source": [ + "import os\n", + "import getpass\n", + "os.environ['LLM_PROVIDER'] = \"ANTHROPIC\"\n", + "os.environ['LLM_API_KEY'] = getpass.getpass(\"Enter your LLM API key\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2f61e78f-b8a3-4f0c-b5e4-503af30019f3", + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Enter your Datalab API key ········\n" + ] + } + ], + "source": [ + "os.environ[\"DATALAB_API_KEY\"] = getpass.getpass(\"Enter your Datalab API key\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f44cabeb-58ef-4ccd-86d7-5cb04a3f8643", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "Datalab is a data management platform designed to help scientists manage their experimental data, plan experiments, analyze data, and plot results. It provides a structured way to store, organize, and interact with scientific data, particularly in the context of materials science and chemistry experiments. \n", + "\n", + "Some key features of Datalab include:\n", + "\n", + "1. Sample management: You can create, store, and retrieve information about experimental samples, including their chemical composition, synthesis methods, and related metadata.\n", + "\n", + "2. File attachments: Datalab allows you to attach files (such as raw data or images) to sample entries, making it easy to keep all relevant information together.\n", + "\n", + "3. Data blocks: These are used to parse attached files according to scientific schemas and generate plots, facilitating data analysis and visualization.\n", + "\n", + "4. Search functionality: You can search for items (samples, materials, etc.) across the Datalab instance.\n", + "\n", + "5. Relationship tracking: Datalab can track relationships between different items, helping to maintain the context of experiments and materials.\n", + "\n", + "6. API access: Datalab provides a Python API that allows programmatic interaction with the platform, enabling integration with other tools and scripts.\n", + "\n", + "Datalab is particularly useful for maintaining experimental workflows, ensuring data provenance, and facilitating collaboration among scientists. 
It helps in organizing complex scientific data in a structured manner, making it easier to retrieve, analyze, and share research findings." + ], + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%llm\n", + "What is datalab?" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d10442e5-7c5c-40cd-becf-9c2e9a79fd45", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "To create a sample with the specified details using the Datalab API, we'll use the `create_item` method of the `DatalabClient`. Here's what the code will do:\n", + "\n", + "1. Import the necessary module\n", + "2. Create a DatalabClient instance\n", + "3. Prepare the sample data as a dictionary\n", + "4. Use the create_item method to create the sample\n", + "5. Print the response to confirm the sample creation" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%llm\n", + "Create a sample with ID llm-test4, sample name \"virtual sample (Claude)\", formula FrCl" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6ef8a1c2-50c8-4807-b84b-008c0bfb4efa", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/yue/Library/CloudStorage/OneDrive-Personal/code/yellowhammer/.venv/lib/python3.10/site-packages/datalab_api/_base.py:165: UserWarning: Found API URL https://demo-api.datalab-org.io in HTML meta tag. Creating client with this URL instead.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sample creation response:\n", + "{'collections': [], 'creator_ids': [{'$oid': '66abc00dcb992f4b299aa60a'}], 'creators': [{'contact_email': None, 'display_name': 'Yue Wu'}], 'date': '2024-10-22T20:06:55.888755', 'item_id': 'llm-test4', 'name': 'virtual sample (Claude)', 'nblocks': 0, 'refcode': 'demo:YASTWQ', 'type': 'samples'}\n" + ] + } + ], + "source": [ + "from datalab_api import DatalabClient\n", + "# Create a DatalabClient instance\n", + "with DatalabClient(\"https://demo.datalab-org.io\") as client:\n", + " # Prepare the sample data\n", + " sample_data = {\n", + " \"item_id\": \"llm-test4\",\n", + " \"name\": \"virtual sample (Claude)\",\n", + " \"chemform\": \"FrCl\",\n", + " \"type\": \"samples\"\n", + " }\n", + " \n", + " # Create the sample\n", + " response = client.create_item(item_id=\"llm-test4\", item_type=\"samples\", item_data=sample_data)\n", + " \n", + " # Print the response\n", + " print(\"Sample creation response:\")\n", + " print(response)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f3241bc-ba1f-4807-babb-3a928baad6ea", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "yellowhammer", + "language": "python", + "name": "yellowhammer" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/Demo.ipynb b/examples/Demo_openai.ipynb similarity index 98% rename from examples/Demo.ipynb rename to examples/Demo_openai.ipynb index 20475f3..02c507e 100644 --- a/examples/Demo.ipynb +++ b/examples/Demo_openai.ipynb @@ -17,7 +17,8 @@ "metadata": {}, 
"outputs": [], "source": [ - "import os, getpass\n", + "import os\n", + "import getpass\n", "os.environ['LLM_PROVIDER'] = \"OPENAI\"\n", "os.environ['LLM_API_KEY'] = getpass.getpass(\"Enter your LLM API key\")" ] diff --git a/src/yellowhammer/datalab-api-prompt.md b/prompts/datalab-api-prompt.md similarity index 100% rename from src/yellowhammer/datalab-api-prompt.md rename to prompts/datalab-api-prompt.md diff --git a/src/yellowhammer/system-prompt.md b/prompts/system-prompt.md similarity index 100% rename from src/yellowhammer/system-prompt.md rename to prompts/system-prompt.md diff --git a/pyproject.toml b/pyproject.toml index a8ab03f..e4bff8f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,4 +36,8 @@ ignore_missing_imports = true follow_imports = "skip" [tool.uv] -dev-dependencies = ["ipykernel>=6.29.5", "pre-commit>=4.0.0", "pytest>=8.3.3"] +dev-dependencies = [ + "ipykernel>=6.29.5", + "pre-commit>=4.0.0", + "pytest>=8.3.3", +] diff --git a/src/yellowhammer/__init__.py b/src/yellowhammer/__init__.py index 9c8eac8..c883b66 100644 --- a/src/yellowhammer/__init__.py +++ b/src/yellowhammer/__init__.py @@ -1,4 +1,5 @@ from importlib.metadata import PackageNotFoundError, version +from .magics import DatalabMagics try: __version__ = version("yellowhammer") @@ -8,12 +9,10 @@ __all__ = ("__version__",) -from .magics import datalabMagics - def load_ipython_extension(ipython): """ Any module file that define a function named `load_ipython_extension` can be loaded via `%load_ext module.path` or be configured to be autoloaded by IPython at startup time. """ - ipython.register_magics(datalabMagics) + ipython.register_magics(DatalabMagics) diff --git a/src/yellowhammer/llm.py b/src/yellowhammer/llm.py index bed2644..a2601ac 100644 --- a/src/yellowhammer/llm.py +++ b/src/yellowhammer/llm.py @@ -3,41 +3,45 @@ from .prompt import SYSTEM_PROMPT from typing import Union + # Pydantic schema to use with .with_structured_output() class code(BaseModel): """Schema for code solutions""" + prefix: str = Field(description="Description of the problem and approach") imports: str = Field(description="Code block import statements") code: str = Field(description="Code block not including import statements") + class ConversationalResponse(BaseModel): """Respond in a conversational manner. 
Be kind and helpful.""" + response: str = Field(description="A conversational response to the user's query") + class FinalResponse(BaseModel): """The final response can be either a code solution or a conversational response""" + final_output: Union[code, ConversationalResponse] -def get_chain( - api_provider, - api_key, - api_model=None, - api_temperature=0, - ): +def get_chain( + api_provider, + api_key, + api_model=None, + api_temperature=0, +): # API provider logic if api_provider.lower() == "openai": from langchain_openai import ChatOpenAI + if api_model is None: api_model = "gpt-4o-mini" - llm = ChatOpenAI( - model=api_model, - temperature=api_temperature, - openai_api_key=api_key - ) + llm = ChatOpenAI(model=api_model, temperature=api_temperature, openai_api_key=api_key) elif api_provider.lower() == "anthropic": from langchain_anthropic import ChatAnthropic + if api_model is None: api_model = "claude-3-5-sonnet-20240620" llm = ChatAnthropic( @@ -49,7 +53,10 @@ def get_chain( # Prompt prompt = ChatPromptTemplate.from_messages( [ - ("system", SYSTEM_PROMPT), # datalab API info is passed via {context} to the system prompt + ( + "system", + SYSTEM_PROMPT, + ), # datalab API info is passed via {context} to the system prompt ("placeholder", "{messages}"), ] ) @@ -58,4 +65,4 @@ def get_chain( chain = prompt | llm.with_structured_output(FinalResponse, include_raw=False) # Returns a runnable chain which accepts datalab API documentation "context" and user question "messages" - return chain \ No newline at end of file + return chain diff --git a/src/yellowhammer/magics.py b/src/yellowhammer/magics.py index 4da2e57..fef7f6b 100644 --- a/src/yellowhammer/magics.py +++ b/src/yellowhammer/magics.py @@ -2,6 +2,7 @@ Magics to support LLM interactions in IPython/Jupyter. Adapted from fperez/jupytee and jan-janssen/LangSim. """ + import os from IPython import get_ipython @@ -16,7 +17,7 @@ parse_argstring, ) from IPython.display import Markdown -from .llm import get_chain, code, FinalResponse, ConversationalResponse +from .llm import get_chain, code, ConversationalResponse from .prompt import API_PROMPT @@ -31,11 +32,12 @@ def get_output(messages, temp=0.1): return agent_executor.invoke({"context": API_PROMPT, "messages": messages}) + # Class to manage state and expose the main magics @magics_class class DatalabMagics(Magics): def __init__(self, shell): - super(DatalabMagics, self).__init__(shell) + super().__init__(shell) self.messages = [] # A datalab magic that returns a code block @@ -48,7 +50,6 @@ def __init__(self, shell): is considered the code generation prompt. """, ) - @argument( "-T", "--temp", @@ -57,13 +58,12 @@ def __init__(self, shell): help="""Temperature, float in [0,1]. Higher values push the algorithm to generate more aggressive/"creative" output. [default=0.1].""", ) - @line_cell_magic def llm(self, line, cell=None): """ Chat with the LLM. Return either conversation or code. 
""" - args = parse_argstring(self.llm, line) #self.llm is a bound method + args = parse_argstring(self.llm, line) # self.llm is a bound method if cell is None: prompt = " ".join(args.prompt) @@ -77,10 +77,15 @@ def llm(self, line, cell=None): output = response.response self.messages.append(("ai", output)) return Markdown(output) - + elif isinstance(response, code): output = response self.messages.append(("ai", output.prefix)) cell_fill = output.imports + "\n" + output.code get_ipython().set_next_input(cell_fill) - return Markdown(output.prefix) \ No newline at end of file + return Markdown(output.prefix) + + +# If testing interactively, it's convenient to %run as a script in Jupyter +if __name__ == "__main__": + get_ipython().register_magics(DatalabMagics) diff --git a/src/yellowhammer/prompt.py b/src/yellowhammer/prompt.py index efb5d12..1658829 100644 --- a/src/yellowhammer/prompt.py +++ b/src/yellowhammer/prompt.py @@ -1,13 +1,19 @@ from pathlib import Path + def load_file_content(file_path): - with open(file_path, 'r', encoding='utf-8') as file: + with open(file_path, encoding="utf-8") as file: return file.read() + # Load API_PROMPT -api_prompt_path = Path(__file__).parent / "datalab-api-prompt.md" +api_prompt_path = Path(__file__).parent.parent.parent / "prompts" / "datalab-api-prompt.md" API_PROMPT = load_file_content(api_prompt_path) # Load SYSTEM_PROMPT -system_prompt_path = Path(__file__).parent / "system-prompt.md" -SYSTEM_PROMPT = load_file_content(system_prompt_path) \ No newline at end of file +system_prompt_path = Path(__file__).parent.parent.parent / "prompts" / "system-prompt.md" +SYSTEM_PROMPT = load_file_content(system_prompt_path) + +if __name__ == "__main__": + print(SYSTEM_PROMPT) + print(API_PROMPT) diff --git a/src/yellowhammer/test.ipynb b/src/yellowhammer/test.ipynb index 4961327..7ff7249 100644 --- a/src/yellowhammer/test.ipynb +++ b/src/yellowhammer/test.ipynb @@ -15,7 +15,7 @@ "\n", "path = \"system-prompt.md\"\n", "loader = TextLoader(path)\n", - "SYSTEM_PROMPT = loader.load()[0].page_content\n" + "SYSTEM_PROMPT = loader.load()[0].page_content" ] }, { @@ -45,9 +45,10 @@ "outputs": [], "source": [ "def load_file_content(file_path):\n", - " with open(file_path, 'r', encoding='utf-8') as file:\n", + " with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", " return file.read()\n", "\n", + "\n", "# Load API_PROMPT\n", "api_prompt_path = \"datalab-api-prompt.md\"\n", "API_PROMPT_ = load_file_content(api_prompt_path)\n", @@ -114,7 +115,7 @@ } ], "source": [ - "len(SYSTEM_PROMPT_)\n" + "len(SYSTEM_PROMPT_)" ] }, { @@ -157,6 +158,38 @@ "SYSTEM_PROMPT" ] }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name '__file__' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m file\u001b[38;5;241m.\u001b[39mread()\n\u001b[1;32m 7\u001b[0m \u001b[38;5;66;03m# Load API_PROMPT\u001b[39;00m\n\u001b[0;32m----> 8\u001b[0m api_prompt_path \u001b[38;5;241m=\u001b[39m Path(\u001b[38;5;18;43m__file__\u001b[39;49m)\u001b[38;5;241m.\u001b[39mparent \u001b[38;5;241m/\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mprompts\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m/\u001b[39m 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdatalab-api-prompt.md\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 9\u001b[0m API_PROMPT \u001b[38;5;241m=\u001b[39m load_file_content(api_prompt_path)\n\u001b[1;32m 11\u001b[0m \u001b[38;5;66;03m# Load SYSTEM_PROMPT\u001b[39;00m\n", + "\u001b[0;31mNameError\u001b[0m: name '__file__' is not defined" + ] + } + ], + "source": [ + "def load_file_content(file_path):\n", + " with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", + " return file.read()\n", + "\n", + "\n", + "# Load API_PROMPT\n", + "api_prompt_path = Path(__file__).parent / \"prompts\" / \"datalab-api-prompt.md\"\n", + "API_PROMPT = load_file_content(api_prompt_path)\n", + "\n", + "# Load SYSTEM_PROMPT\n", + "system_prompt_path = Path(__file__).parent / \"prompts\" / \"system-prompt.md\"\n", + "SYSTEM_PROMPT = load_file_content(system_prompt_path)" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/uv.lock b/uv.lock index c78ff28..d6a0291 100644 --- a/uv.lock +++ b/uv.lock @@ -3053,7 +3053,7 @@ wheels = [ [[package]] name = "yellowhammer" -version = "0.1.dev11+gf3789f6.d20241022" +version = "0.1.dev12+g7b44707.d20241024" source = { editable = "." } dependencies = [ { name = "datalab-api" }, From 4f3128f6728ec7bc529eb1b9b38441d2fc1ce48f Mon Sep 17 00:00:00 2001 From: Yue Wu <14996988+yue-here@users.noreply.github.com> Date: Wed, 23 Oct 2024 17:53:02 -0700 Subject: [PATCH 4/6] pre-commit update --- examples/Demo_anthropic.ipynb | 12 +++++++----- examples/Demo_openai.ipynb | 11 +++++------ prompts/datalab-api-prompt.md | 2 +- prompts/system-prompt.md | 2 +- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/examples/Demo_anthropic.ipynb b/examples/Demo_anthropic.ipynb index ab73625..b56cf27 100644 --- a/examples/Demo_anthropic.ipynb +++ b/examples/Demo_anthropic.ipynb @@ -27,8 +27,9 @@ "source": [ "import os\n", "import getpass\n", - "os.environ['LLM_PROVIDER'] = \"ANTHROPIC\"\n", - "os.environ['LLM_API_KEY'] = getpass.getpass(\"Enter your LLM API key\")" + "\n", + "os.environ[\"LLM_PROVIDER\"] = \"ANTHROPIC\"\n", + "os.environ[\"LLM_API_KEY\"] = getpass.getpass(\"Enter your LLM API key\")" ] }, { @@ -146,6 +147,7 @@ ], "source": [ "from datalab_api import DatalabClient\n", + "\n", "# Create a DatalabClient instance\n", "with DatalabClient(\"https://demo.datalab-org.io\") as client:\n", " # Prepare the sample data\n", @@ -153,12 +155,12 @@ " \"item_id\": \"llm-test4\",\n", " \"name\": \"virtual sample (Claude)\",\n", " \"chemform\": \"FrCl\",\n", - " \"type\": \"samples\"\n", + " \"type\": \"samples\",\n", " }\n", - " \n", + "\n", " # Create the sample\n", " response = client.create_item(item_id=\"llm-test4\", item_type=\"samples\", item_data=sample_data)\n", - " \n", + "\n", " # Print the response\n", " print(\"Sample creation response:\")\n", " print(response)" diff --git a/examples/Demo_openai.ipynb b/examples/Demo_openai.ipynb index 02c507e..59cfd16 100644 --- a/examples/Demo_openai.ipynb +++ b/examples/Demo_openai.ipynb @@ -19,8 +19,9 @@ "source": [ "import os\n", "import getpass\n", - "os.environ['LLM_PROVIDER'] = \"OPENAI\"\n", - "os.environ['LLM_API_KEY'] = getpass.getpass(\"Enter your LLM API key\")" + "\n", + "os.environ[\"LLM_PROVIDER\"] = \"OPENAI\"\n", + "os.environ[\"LLM_API_KEY\"] = getpass.getpass(\"Enter your LLM API key\")" ] }, { @@ -101,11 +102,9 @@ "source": [ "from datalab_api import DatalabClient\n", "import os\n", + "\n", "with DatalabClient(\"https://demo.datalab-org.io\") as client:\n", - " json_data = {\n", - " 
\"name\": \"virtual sample\",\n", - " \"chemform\": \"RbCl\"\n", - " }\n", + " json_data = {\"name\": \"virtual sample\", \"chemform\": \"RbCl\"}\n", " client.create_item(item_id=\"llm-test3\", item_type=\"samples\", item_data=json_data)" ] } diff --git a/prompts/datalab-api-prompt.md b/prompts/datalab-api-prompt.md index 5497102..9beb219 100755 --- a/prompts/datalab-api-prompt.md +++ b/prompts/datalab-api-prompt.md @@ -55,7 +55,7 @@ with DatalabClient("https://demo.datalab-org.io") as client: Here is an abridged JSONSchema for a sample, that also has some info about other types. -```json +```json { "title": "Sample", "description": "A model for representing an experimental sample.", diff --git a/prompts/system-prompt.md b/prompts/system-prompt.md index 18ce00c..18e81c6 100644 --- a/prompts/system-prompt.md +++ b/prompts/system-prompt.md @@ -1 +1 @@ -You are a virtual assistant that helps scientists use the data management platform datalab to manage their experimental data, plan experiments, analyse data and plot results. Here is the documentation of the datalab API: {context}. Answer user questions and use the provided documentation to interact with datalab. Your code responses will be output to a Jupyter notebook cell that has access to the datalab API and common python scientific libraries. Ensure any code you provide can be executed with all required imports and variables defined. Structure your answer with a description of the code solution. Then list the imports. Finally list the functioning code block. Here is the user question: \ No newline at end of file +You are a virtual assistant that helps scientists use the data management platform datalab to manage their experimental data, plan experiments, analyse data and plot results. Here is the documentation of the datalab API: {context}. Answer user questions and use the provided documentation to interact with datalab. Your code responses will be output to a Jupyter notebook cell that has access to the datalab API and common python scientific libraries. Ensure any code you provide can be executed with all required imports and variables defined. Structure your answer with a description of the code solution. Then list the imports. Finally list the functioning code block. Here is the user question: From 82ce847e3bf0cb11eab7b197c7a13958af64160b Mon Sep 17 00:00:00 2001 From: Yue Wu <14996988+yue-here@users.noreply.github.com> Date: Sat, 26 Oct 2024 22:26:46 -0700 Subject: [PATCH 5/6] add error parsing logic to llm chain --- .gitignore | 5 + src/yellowhammer/llm.py | 22 +++- src/yellowhammer/magics.py | 2 +- src/yellowhammer/test.ipynb | 222 ------------------------------------ 4 files changed, 26 insertions(+), 225 deletions(-) delete mode 100644 src/yellowhammer/test.ipynb diff --git a/.gitignore b/.gitignore index 82f9275..6a7416b 100644 --- a/.gitignore +++ b/.gitignore @@ -160,3 +160,8 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
#.idea/ + +# Test notebook +examples/Demo_yue.ipynb +examples/yue_examples +examples/FA-Mn-H2PO2.raw \ No newline at end of file diff --git a/src/yellowhammer/llm.py b/src/yellowhammer/llm.py index a2601ac..fff1ee3 100644 --- a/src/yellowhammer/llm.py +++ b/src/yellowhammer/llm.py @@ -25,6 +25,25 @@ class FinalResponse(BaseModel): final_output: Union[code, ConversationalResponse] +def error_parser(output): + """ + Parse the API output to handle errors gracefully. + """ + if output["parsing_error"]: + raw_output = str(output["raw"].content) + error = output["parsing_error"] + out_string = f"Error parsing LLM output. Parse error: {error}. \n Raw output: {raw_output}" + return FinalResponse(final_output=ConversationalResponse(response=out_string)) + + elif not output["parsed"]: + raw_output = str(output["raw"].content) + out_string = f"Error in LLM response. \n Raw output: {raw_output}" + return FinalResponse(final_output=ConversationalResponse(response=out_string)) + + else: + # Return the parsed output (should be FinalResponse) + return output["parsed"] + def get_chain( api_provider, api_key, @@ -62,7 +81,6 @@ def get_chain( ) # Create a chain where the final output takes the FinalResponse schema - chain = prompt | llm.with_structured_output(FinalResponse, include_raw=False) - + chain = prompt | llm.with_structured_output(FinalResponse, include_raw=True) | error_parser # Returns a runnable chain which accepts datalab API documentation "context" and user question "messages" return chain diff --git a/src/yellowhammer/magics.py b/src/yellowhammer/magics.py index fef7f6b..5342c2a 100644 --- a/src/yellowhammer/magics.py +++ b/src/yellowhammer/magics.py @@ -54,7 +54,7 @@ def __init__(self, shell): "-T", "--temp", type=float, - default=0.0, + default=0.1, help="""Temperature, float in [0,1]. Higher values push the algorithm to generate more aggressive/"creative" output. 
[default=0.1].""", ) diff --git a/src/yellowhammer/test.ipynb b/src/yellowhammer/test.ipynb deleted file mode 100644 index 7ff7249..0000000 --- a/src/yellowhammer/test.ipynb +++ /dev/null @@ -1,222 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "from langchain_community.document_loaders import TextLoader\n", - "\n", - "path = \"datalab-api-prompt.md\"\n", - "loader = TextLoader(path)\n", - "API_PROMPT = loader.load()[0].page_content\n", - "\n", - "path = \"system-prompt.md\"\n", - "loader = TextLoader(path)\n", - "SYSTEM_PROMPT = loader.load()[0].page_content" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Use the datalab Python API package to query entries on the datalab instance at https://demo.datalab-org.io/.\\nEach method of the DatalabClient class will return a dictionary constructed directly\\nfrom the JSON response of the Datalab API.\\n\\nDatalab uses \"data blocks\" to take a file attached to a sample, parse it\\naccording to some scientific schema, and then make a plot.\\n\\nThe rest of this prompt contains the README for the datalab python API module `datalab_api`, which you already have installed.\\n\\nAssume the `DATALAB_API_KEY` has been set an environment variable.\\n\\nPython API\\nThis package implements basic functionality for displaying and manipulating entries:\\n\\n```python\\nfrom datalab_api import DatalabClient\\n\\nwith DatalabClient(\"https://demo.datalab-org.io\") as client:\\n\\n # Get the info about this datalab instance\\n client.get_info()\\n\\n # Get the current user\\'s info\\n client.authenticate()\\n\\n # Search for items with the string\\n items = client.search_items(\"search-values\")\\n\\n # List all items of a given type\\n # Types can be \\'samples\\' or \\'starting_materials\\'\\n items = client.get_items(item_type=\"samples\")\\n\\n # Get more info on a particular item called \\'test\\'\\n item = client.get_item(item_id=\"test\")\\n\\n # Create a new item with some data that matches the corresponding `item_type` schema\\n json_data = {\"chemform\": \"NaCl\"}\\n client.create_item(item_id=\"test_new\", item_type=\"samples\", item_data=json_data)\\n\\n # Attach a file to an item and get the uploaded ID\\n file_response = client.upload_file(filepath=\"my_echem_data.mpr\", item_id=\"test\")\\n file_id = file_response[\"file_id\"]\\n\\n # Create a data block for a sample, then show the plot\\n client.create_data_block(item_id=\"test\", file_ids=file_id)\\n\\n # Download all files attached to a sample and return their paths\\n file_paths = client.get_item_files(item_id=\"test\")\\n\\n # Get the item graph, useful for finding relationships\\n graph = client.get_item_graph()\\n\\n```\\n\\nHere is an abridged JSONSchema for a sample, that also has some info about other\\ntypes.\\n\\n```json \\n{\\n \"title\": \"Sample\",\\n \"description\": \"A model for representing an experimental sample.\",\\n \"type\": \"object\",\\n \"properties\": {\\n \"blocks_obj\": {\\n \"title\": \"Blocks Obj\",\\n \"default\": {},\\n \"type\": \"object\"\\n },\\n \"display_order\": {\\n \"title\": \"Display Order\",\\n \"default\": [],\\n \"type\": \"array\",\\n \"items\": {\\n \"type\": \"string\"\\n }\\n },\\n \"collections\": {\\n \"title\": \"Collections\",\\n \"default\": [],\\n \"type\": \"array\",\\n \"items\": {\\n \"$ref\": \"#/definitions/Collection\"\\n }\\n },\\n 
\"revision\": {\\n \"title\": \"Revision\",\\n \"default\": 1,\\n \"type\": \"integer\"\\n },\\n \"revisions\": {\\n \"title\": \"Revisions\",\\n \"type\": \"object\"\\n },\\n \"creator_ids\": {\\n \"title\": \"Creator Ids\",\\n \"default\": [],\\n \"type\": \"array\",\\n \"items\": {\\n \"type\": \"string\"\\n }\\n },\\n \"creators\": {\\n \"title\": \"Creators\",\\n \"type\": \"array\",\\n \"items\": {\\n \"$ref\": \"#/definitions/Person\"\\n }\\n },\\n \"type\": {\\n \"title\": \"Type\",\\n \"default\": \"samples\",\\n \"const\": \"samples\",\\n \"pattern\": \"^samples$\",\\n \"type\": \"string\"\\n },\\n \"immutable_id\": {\\n \"title\": \"Immutable ID\",\\n \"type\": \"string\"\\n },\\n \"last_modified\": {\\n \"title\": \"Last Modified\",\\n \"type\": \"date\",\\n \"format\": \"date-time\"\\n },\\n \"relationships\": {\\n \"title\": \"Relationships\",\\n \"type\": \"array\",\\n \"items\": {\\n \"$ref\": \"#/definitions/TypedRelationship\"\\n }\\n },\\n \"refcode\": {\\n \"title\": \"Refcode\",\\n \"minLength\": 1,\\n \"maxLength\": 40,\\n \"pattern\": \"^[a-z]{2,10}:(?:[a-zA-Z0-9]+|[a-zA-Z0-9][a-zA-Z0-9._-]+[a-zA-Z0-9])$\",\\n \"type\": \"string\"\\n },\\n \"item_id\": {\\n \"title\": \"Item Id\",\\n \"minLength\": 1,\\n \"maxLength\": 40,\\n \"pattern\": \"^(?:[a-zA-Z0-9]+|[a-zA-Z0-9][a-zA-Z0-9._-]+[a-zA-Z0-9])$\",\\n \"type\": \"string\"\\n },\\n \"description\": {\\n \"title\": \"Description\",\\n \"type\": \"string\"\\n },\\n \"date\": {\\n \"title\": \"Date\",\\n \"type\": \"date\",\\n \"format\": \"date-time\"\\n },\\n \"name\": {\\n \"title\": \"Name\",\\n \"type\": \"string\"\\n },\\n \"files\": {\\n \"title\": \"Files\",\\n \"type\": \"array\",\\n \"items\": {\\n \"$ref\": \"#/definitions/File\"\\n }\\n },\\n \"file_ObjectIds\": {\\n \"title\": \"File Objectids\",\\n \"default\": [],\\n \"type\": \"array\",\\n \"items\": {\\n \"type\": \"string\"\\n }\\n },\\n \"chemform\": {\\n \"title\": \"Chemform\",\\n \"example\": [\\n \"Na3P\",\\n \"LiNiO2@C\"\\n ],\\n \"type\": \"string\"\\n },\\n \"synthesis_constituents\": {\\n \"title\": \"Synthesis Constituents\",\\n \"default\": [],\\n \"type\": \"array\",\\n \"items\": {\\n \"$ref\": \"#/definitions/Constituent\"\\n }\\n },\\n \"synthesis_description\": {\\n \"title\": \"Synthesis Description\",\\n \"type\": \"string\"\\n }\\n },\\n \"required\": [\\n \"item_id\"\\n ],\\n \"definitions\": {\\n \"KnownType\": {\\n \"title\": \"KnownType\",\\n \"description\": \"An enumeration of the types of entry known by this implementation, should be made dynamic in the future.\",\\n \"enum\": [\\n \"samples\",\\n \"starting_materials\",\\n \"blocks\",\\n \"files\",\\n \"people\",\\n \"collections\"\\n ],\\n \"type\": \"string\"\\n },\\n \"File\": {\\n \"title\": \"File\",\\n \"description\": \"A model for representing a file that has been tracked or uploaded to datalab.\",\\n \"type\": \"object\",\\n \"properties\": {\\n \"revision\": {\\n \"title\": \"Revision\",\\n \"default\": 1,\\n \"type\": \"integer\"\\n },\\n \"revisions\": {\\n \"title\": \"Revisions\",\\n \"type\": \"object\"\\n },\\n \"creator_ids\": {\\n \"title\": \"Creator Ids\",\\n \"default\": [],\\n \"type\": \"array\",\\n \"items\": {\\n \"type\": \"string\"\\n }\\n },\\n \"creators\": {\\n \"title\": \"Creators\",\\n \"type\": \"array\",\\n \"items\": {\\n \"$ref\": \"#/definitions/Person\"\\n }\\n },\\n \"type\": {\\n \"title\": \"Type\",\\n \"default\": \"files\",\\n \"const\": \"files\",\\n \"pattern\": \"^files$\",\\n \"type\": \"string\"\\n },\\n \"immutable_id\": {\\n 
\"title\": \"Immutable ID\",\\n \"type\": \"string\"\\n },\\n \"last_modified\": {\\n \"title\": \"Last Modified\",\\n \"type\": \"date\",\\n \"format\": \"date-time\"\\n },\\n \"relationships\": {\\n \"title\": \"Relationships\",\\n \"type\": \"array\",\\n \"items\": {\\n \"$ref\": \"#/definitions/TypedRelationship\"\\n }\\n },\\n \"size\": {\\n \"title\": \"Size\",\\n \"description\": \"The size of the file on disk in bytes.\",\\n \"type\": \"integer\"\\n },\\n \"last_modified_remote\": {\\n \"title\": \"Last Modified Remote\",\\n \"description\": \"The last date/time at which the remote file was modified.\",\\n \"type\": \"date\",\\n \"format\": \"date-time\"\\n },\\n \"item_ids\": {\\n \"title\": \"Item Ids\",\\n \"description\": \"A list of item IDs associated with this file.\",\\n \"type\": \"array\",\\n \"items\": {\\n \"type\": \"string\"\\n }\\n },\\n \"blocks\": {\\n \"title\": \"Blocks\",\\n \"description\": \"A list of block IDs associated with this file.\",\\n \"type\": \"array\",\\n \"items\": {\\n \"type\": \"string\"\\n }\\n },\\n \"name\": {\\n \"title\": \"Name\",\\n \"description\": \"The filename on disk.\",\\n \"type\": \"string\"\\n },\\n \"extension\": {\\n \"title\": \"Extension\",\\n \"description\": \"The file extension that the file was uploaded with.\",\\n \"type\": \"string\"\\n },\\n \"original_name\": {\\n \"title\": \"Original Name\",\\n \"description\": \"The raw filename as uploaded.\",\\n \"type\": \"string\"\\n },\\n \"location\": {\\n \"title\": \"Location\",\\n \"description\": \"The location of the file on disk.\",\\n \"type\": \"string\"\\n },\\n \"url_path\": {\\n \"title\": \"Url Path\",\\n \"description\": \"The path to a remote file.\",\\n \"type\": \"string\"\\n },\\n \"source\": {\\n \"title\": \"Source\",\\n \"description\": \"The source of the file, e.g. 
\\'remote\\' or \\'uploaded\\'.\",\\n \"type\": \"string\"\\n },\\n \"time_added\": {\\n \"title\": \"Time Added\",\\n \"description\": \"The timestamp for the original file upload.\",\\n \"type\": \"string\",\\n \"format\": \"date-time\"\\n },\\n \"metadata\": {\\n \"title\": \"Metadata\",\\n \"description\": \"Any additional metadata.\",\\n \"type\": \"object\"\\n },\\n \"representation\": {\\n \"title\": \"Representation\"\\n },\\n \"source_server_name\": {\\n \"title\": \"Source Server Name\",\\n \"description\": \"The server name at which the file is stored.\",\\n \"type\": \"string\"\\n },\\n \"source_path\": {\\n \"title\": \"Source Path\",\\n \"description\": \"The path to the file on the remote resource.\",\\n \"type\": \"string\"\\n },\\n \"is_live\": {\\n \"title\": \"Is Live\",\\n \"description\": \"Whether or not the file should be watched for future updates.\",\\n \"type\": \"boolean\"\\n }\\n },\\n \"required\": [\\n \"item_ids\",\\n \"blocks\",\\n \"name\",\\n \"extension\",\\n \"time_added\",\\n \"is_live\"\\n ]\\n },\\n \"EntryReference\": {\\n \"title\": \"EntryReference\",\\n \"description\": \"A reference to a database entry by ID and type.\\\\n\\\\nCan include additional arbitarary metadata useful for\\\\ninlining the item data.\",\\n \"type\": \"object\",\\n \"properties\": {\\n \"type\": {\\n \"title\": \"Type\",\\n \"type\": \"string\"\\n },\\n \"name\": {\\n \"title\": \"Name\",\\n \"type\": \"string\"\\n },\\n \"immutable_id\": {\\n \"title\": \"Immutable Id\",\\n \"type\": \"string\"\\n },\\n \"item_id\": {\\n \"title\": \"Item Id\",\\n \"minLength\": 1,\\n \"maxLength\": 40,\\n \"pattern\": \"^(?:[a-zA-Z0-9]+|[a-zA-Z0-9][a-zA-Z0-9._-]+[a-zA-Z0-9])$\",\\n \"type\": \"string\"\\n },\\n \"refcode\": {\\n \"title\": \"Refcode\",\\n \"minLength\": 1,\\n \"maxLength\": 40,\\n \"pattern\": \"^[a-z]{2,10}:(?:[a-zA-Z0-9]+|[a-zA-Z0-9][a-zA-Z0-9._-]+[a-zA-Z0-9])$\",\\n \"type\": \"string\"\\n }\\n },\\n \"required\": [\\n \"type\"\\n ]\\n },\\n \"InlineSubstance\": {\\n \"title\": \"InlineSubstance\",\\n \"type\": \"object\",\\n \"properties\": {\\n \"name\": {\\n \"title\": \"Name\",\\n \"type\": \"string\"\\n },\\n \"chemform\": {\\n \"title\": \"Chemform\",\\n \"type\": \"string\"\\n }\\n },\\n \"required\": [\\n \"name\"\\n ]\\n },\\n \"Constituent\": {\\n \"title\": \"Constituent\",\\n \"description\": \"A constituent of a sample.\",\\n \"type\": \"object\",\\n \"properties\": {\\n \"item\": {\\n \"title\": \"Item\",\\n \"anyOf\": [\\n {\\n \"$ref\": \"#/definitions/EntryReference\"\\n },\\n {\\n \"$ref\": \"#/definitions/InlineSubstance\"\\n }\\n ]\\n },\\n \"quantity\": {\\n \"title\": \"Quantity\",\\n \"minimum\": 0,\\n \"type\": \"number\"\\n },\\n \"unit\": {\\n \"title\": \"Unit\",\\n \"default\": \"g\",\\n \"type\": \"string\"\\n }\\n },\\n \"required\": [\\n \"item\",\\n \"quantity\"\\n ]\\n }\\n }\\n}\\n```\\n'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "API_PROMPT" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "def load_file_content(file_path):\n", - " with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", - " return file.read()\n", - "\n", - "\n", - "# Load API_PROMPT\n", - "api_prompt_path = \"datalab-api-prompt.md\"\n", - "API_PROMPT_ = load_file_content(api_prompt_path)\n", - "\n", - "# Load SYSTEM_PROMPT\n", - "system_prompt_path = \"system-prompt.md\"\n", - "SYSTEM_PROMPT_ = load_file_content(system_prompt_path)" - 
] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "11679" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(API_PROMPT_)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "11679" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(API_PROMPT)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "698" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(SYSTEM_PROMPT_)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "698" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(SYSTEM_PROMPT)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'You are a virtual assistant that helps scientists use the data management platform datalab to manage their experimental data, plan experiments, analyse data and plot results. Here is the documentation of the datalab API: {context}. Answer user questions and use the provided documentation to interact with datalab. Your code responses will be output to a Jupyter notebook cell that has access to the datalab API and common python scientific libraries. Ensure any code you provide can be executed with all required imports and variables defined. Structure your answer with a description of the code solution. Then list the imports. Finally list the functioning code block. 
Here is the user question:'" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "SYSTEM_PROMPT" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name '__file__' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[1], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m file\u001b[38;5;241m.\u001b[39mread()\n\u001b[1;32m 7\u001b[0m \u001b[38;5;66;03m# Load API_PROMPT\u001b[39;00m\n\u001b[0;32m----> 8\u001b[0m api_prompt_path \u001b[38;5;241m=\u001b[39m Path(\u001b[38;5;18;43m__file__\u001b[39;49m)\u001b[38;5;241m.\u001b[39mparent \u001b[38;5;241m/\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mprompts\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m/\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdatalab-api-prompt.md\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 9\u001b[0m API_PROMPT \u001b[38;5;241m=\u001b[39m load_file_content(api_prompt_path)\n\u001b[1;32m 11\u001b[0m \u001b[38;5;66;03m# Load SYSTEM_PROMPT\u001b[39;00m\n", - "\u001b[0;31mNameError\u001b[0m: name '__file__' is not defined" - ] - } - ], - "source": [ - "def load_file_content(file_path):\n", - " with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", - " return file.read()\n", - "\n", - "\n", - "# Load API_PROMPT\n", - "api_prompt_path = Path(__file__).parent / \"prompts\" / \"datalab-api-prompt.md\"\n", - "API_PROMPT = load_file_content(api_prompt_path)\n", - "\n", - "# Load SYSTEM_PROMPT\n", - "system_prompt_path = Path(__file__).parent / \"prompts\" / \"system-prompt.md\"\n", - "SYSTEM_PROMPT = load_file_content(system_prompt_path)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From bdb22c207bfb9c0937b19a847acf5320f29f0210 Mon Sep 17 00:00:00 2001 From: Yue Wu <14996988+yue-here@users.noreply.github.com> Date: Sun, 27 Oct 2024 17:40:01 -0700 Subject: [PATCH 6/6] pre-commit autoupdate --- .gitignore | 3 ++- src/yellowhammer/llm.py | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 6a7416b..167efc0 100644 --- a/.gitignore +++ b/.gitignore @@ -164,4 +164,5 @@ cython_debug/ # Test notebook examples/Demo_yue.ipynb examples/yue_examples -examples/FA-Mn-H2PO2.raw \ No newline at end of file +examples/FA-Mn-H2PO2.raw +src/yellowhammer/yue_test.ipynb diff --git a/src/yellowhammer/llm.py b/src/yellowhammer/llm.py index fff1ee3..effde10 100644 --- a/src/yellowhammer/llm.py +++ b/src/yellowhammer/llm.py @@ -34,16 +34,17 @@ def error_parser(output): error = output["parsing_error"] out_string = f"Error parsing LLM output. Parse error: {error}. 
\n Raw output: {raw_output}" return FinalResponse(final_output=ConversationalResponse(response=out_string)) - + elif not output["parsed"]: raw_output = str(output["raw"].content) out_string = f"Error in LLM response. \n Raw output: {raw_output}" return FinalResponse(final_output=ConversationalResponse(response=out_string)) - + else: # Return the parsed output (should be FinalResponse) return output["parsed"] + def get_chain( api_provider, api_key,
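
For reference, here is a minimal sketch of how the chain returned by `get_chain` (now post-processed by `error_parser`) is meant to be driven outside the `%%llm` magic. This is illustrative only and not part of the patch: the provider, API key, and question are placeholders, and the imports assume the `src/yellowhammer` package layout introduced in this series.

```python
# Illustrative usage sketch (assumptions: installed yellowhammer package,
# a valid OpenAI key in place of the placeholder below).
from yellowhammer.llm import get_chain, code, ConversationalResponse
from yellowhammer.prompt import API_PROMPT

# Build the runnable chain; it accepts the datalab API docs as "context"
# and a message list as "messages", per the comment in get_chain.
chain = get_chain(api_provider="openai", api_key="YOUR_API_KEY", api_temperature=0.1)

result = chain.invoke(
    {"context": API_PROMPT, "messages": [("user", "What is datalab?")]}
)

# error_parser returns a FinalResponse; branch on its final_output,
# mirroring what the %%llm magic does in magics.py.
response = result.final_output
if isinstance(response, ConversationalResponse):
    print(response.response)   # plain conversational answer
elif isinstance(response, code):
    print(response.prefix)     # description of the code solution
    print(response.imports)    # required imports
    print(response.code)       # functioning code block
```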