AI Agent Memory in Python:
A Complete Implementation Guide
Giving an AI agent persistent memory is deceptively hard. A dict works until it doesn't. SQLite is better until you need semantic search. This guide walks through every approach — from naive to production-ready — with full Python code at each step.
The 3 memory types and when to use each in Python
Cognitive science distinguishes three long-term memory systems. Agent memory maps naturally onto these same categories: episodic (what happened), semantic (stable facts), and procedural (how to do things).
Most implementations default to episodic-only, storing every conversation turn. This works initially but becomes noisy as the episodic store fills with low-signal exchanges. A mature memory system uses all three types, stored with different retrieval weights.
Naive approach: in-memory dict
The simplest possible implementation stores messages in a Python dict keyed by session ID. This is what most tutorials show, and it has one critical flaw: it dies with the process.
class NaiveAgentMemory: def __init__(self): self._store: dict[str, list[str]] = {} def remember(self, session_id: str, content: str) -> None: self._store.setdefault(session_id, []).append(content) def recall(self, session_id: str, query: str = None) -> list[str]: # No semantic search — returns ALL memories for session return self._store.get(session_id, []) # Problems: # 1. Dies on process restart # 2. No semantic search — returns everything or nothing # 3. No TTL, no type differentiation, no confidence scoring # 4. Unbounded memory growth per session
Better: SQLite with embeddings
SQLite survives process restarts and is zero-infrastructure. Adding OpenAI embeddings and cosine similarity gives you basic semantic search. This is a legitimate approach for prototypes and single-machine deployments.
pip install openai numpy
import sqlite3, json, os
import numpy as np
from openai import OpenAI

_openai = OpenAI()


def embed(text: str) -> list[float]:
    """Embed `text` with OpenAI's text-embedding-3-small model (network call)."""
    return _openai.embeddings.create(
        model="text-embedding-3-small",
        input=text
    ).data[0].embedding


def cosine(a: list[float], b: list[float]) -> float:
    """Cosine similarity of two vectors.

    Returns 0.0 when either vector has zero norm — the original division
    would otherwise produce NaN for a degenerate (all-zero) embedding.
    """
    a, b = np.array(a), np.array(b)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0.0:
        return 0.0
    return float(np.dot(a, b) / denom)


class SQLiteMemory:
    """Persistent memory: SQLite rows with JSON-serialized embeddings.

    Survives process restarts; semantic search is a brute-force O(n)
    cosine scan over all rows for the session (no ANN index).
    """

    def __init__(self, db_path: str = "agent_memory.db"):
        # check_same_thread=False allows cross-thread use, but sqlite3
        # connections are not themselves thread-safe — callers must
        # serialize access if they share this object across threads.
        self.conn = sqlite3.connect(db_path, check_same_thread=False)
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS memories (
                id INTEGER PRIMARY KEY,
                session_id TEXT,
                content TEXT,
                embedding TEXT,  -- JSON-serialized float list
                created_at REAL DEFAULT (unixepoch('now', 'subsec'))
            )
        """)
        self.conn.commit()

    def remember(self, session_id: str, content: str) -> None:
        """Embed `content` (network call) and persist it for `session_id`."""
        vec = embed(content)
        self.conn.execute(
            "INSERT INTO memories (session_id, content, embedding) VALUES (?,?,?)",
            (session_id, content, json.dumps(vec))
        )
        self.conn.commit()

    def recall(self, session_id: str, query: str, top_k: int = 5) -> list[str]:
        """Return up to `top_k` stored contents most similar to `query`."""
        q_vec = embed(query)
        rows = self.conn.execute(
            "SELECT content, embedding FROM memories WHERE session_id = ?",
            (session_id,)
        ).fetchall()
        # In-process cosine similarity — O(n) scan, no index
        scored = sorted(
            rows,
            key=lambda r: cosine(q_vec, json.loads(r[1])),
            reverse=True
        )
        return [r[0] for r in scored[:top_k]]
Production: managed API approach
For production agents, the right approach is to delegate memory to a service that handles the hard parts: ANN indexing, confidence scoring, TTL management, multi-tenant isolation, and GDPR-compliant EU hosting. This is what Kronvex provides.
The trade-off is a network round-trip (~20–50ms depending on region). For most agent architectures, this is acceptable — you're already waiting on LLM inference at 500ms+. The memory call runs in parallel or is fast enough to be negligible.
Full class: AgentWithMemory using Kronvex SDK
Here is a complete, production-ready AgentWithMemory class that encapsulates the full memory lifecycle:
pip install kronvex openai
from __future__ import annotations import os from dataclasses import dataclass, field from typing import Literal from kronvex import Kronvex from openai import OpenAI MemoryType = Literal["episodic", "semantic", "procedural"] @dataclass class AgentWithMemory: """ A production-ready AI agent with persistent, typed memory. Memory persists across process restarts, sessions, and deployments. """ kronvex_api_key: str = field(default_factory=lambda: os.environ["KRONVEX_API_KEY"]) kronvex_agent_id: str = field(default_factory=lambda: os.environ["KRONVEX_AGENT_ID"]) openai_api_key: str = field(default_factory=lambda: os.environ["OPENAI_API_KEY"]) model: str = "gpt-4o" system_prompt: str = "You are a helpful AI assistant." top_k: int = 6 def __post_init__(self): self._kv = Kronvex(self.kronvex_api_key) self._agent = self._kv.agent(self.kronvex_agent_id) self._openai = OpenAI(api_key=self.openai_api_key) def learn( self, content: str, memory_type: MemoryType = "semantic", session_id: str = None, ttl_days: int = None, ) -> None: """Explicitly store a fact or lesson in long-term memory.""" kwargs = {} if session_id: kwargs["session_id"] = session_id if ttl_days: kwargs["ttl_days"] = ttl_days self._agent.remember(content, memory_type=memory_type, **kwargs) def chat( self, user_message: str, session_id: str = None, remember_exchange: bool = True, ) -> str: """ Send a message, auto-inject relevant memory context, get a response, and optionally store the exchange. """ # 1. Fetch relevant memories ctx = self._agent.inject_context( query=user_message, top_k=self.top_k, session_id=session_id, ) # 2. Build system prompt with memory context system = self.system_prompt if ctx.context: system += f"\n\n--- Relevant memory context ---\n{ctx.context}" # 3. Call the LLM response = self._openai.chat.completions.create( model=self.model, messages=[ {"role": "system", "content": system}, {"role": "user", "content": user_message}, ], ) ai_reply = response.choices[0].message.content # 4. 
Store the exchange as episodic memory if remember_exchange: kw = dict(session_id=session_id) if session_id else {} self._agent.remember(user_message, memory_type="episodic", ttl_days=30, **kw) self._agent.remember(ai_reply, memory_type="episodic", ttl_days=30, **kw) return ai_reply
agent = AgentWithMemory( system_prompt="You are a customer support agent for Acme Corp." ) # Teach the agent a durable fact agent.learn( "User alice@acme.com is on the Enterprise plan, account since 2023", memory_type="semantic", session_id="alice", ) # Session 1 r1 = agent.chat("What integrations do I have access to?", session_id="alice") # → "As an Enterprise customer since 2023, you have access to all integrations..." # Session 2 (new process, same session_id) r2 = agent.chat("Can you remind me what plan I'm on?", session_id="alice") # → "You're on the Enterprise plan, and you've been with us since 2023."
Async version with httpx
For FastAPI, async frameworks, or high-concurrency deployments, use the async client to avoid blocking the event loop:
pip install "kronvex[async]" openai
from kronvex import AsyncKronvex
from openai import AsyncOpenAI
import asyncio, os

_kv = AsyncKronvex(os.environ["KRONVEX_API_KEY"])
_agent = _kv.agent(os.environ["KRONVEX_AGENT_ID"])
_llm = AsyncOpenAI()

# Strong references to fire-and-forget tasks: the event loop keeps only a
# weak reference to scheduled tasks, so without this set a pending memory
# write could be garbage-collected before it completes.
_background_tasks: set[asyncio.Task] = set()


def _spawn(coro) -> None:
    """Schedule `coro` without awaiting it, keeping the task alive until done."""
    task = asyncio.create_task(coro)
    _background_tasks.add(task)
    task.add_done_callback(_background_tasks.discard)


async def chat_async(user_msg: str, session_id: str) -> str:
    """Answer `user_msg` with injected memory context; store the exchange
    as episodic memory in the background (writes do not block the reply).
    """
    # Fetch relevant memory context. This must complete before the prompt
    # is built, so awaiting it directly is correct — the original's
    # create_task-then-immediately-await added no concurrency.
    ctx = await _agent.inject_context(user_msg, top_k=6, session_id=session_id)

    system = "You are a helpful assistant."
    if ctx.context:
        system += f"\n\n[MEMORY]\n{ctx.context}"

    resp = await _llm.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user_msg},
        ],
    )
    reply = resp.choices[0].message.content

    # Fire-and-forget memory writes (don't block response)
    _spawn(_agent.remember(user_msg, memory_type="episodic",
                           session_id=session_id, ttl_days=30))
    _spawn(_agent.remember(reply, memory_type="episodic",
                           session_id=session_id, ttl_days=30))
    return reply


# FastAPI usage
from fastapi import FastAPI

app = FastAPI()


@app.post("/chat/{user_id}")
async def chat_endpoint(user_id: str, message: str):
    return {"reply": await chat_async(message, session_id=user_id)}
Testing memory recall quality
Testing that your memory integration works is different from unit testing. You need to verify that the right memories are recalled for a given query — which requires inspecting confidence scores.
import os  # was missing: os.environ is read below, which raised NameError

import pytest

from kronvex import Kronvex

AGENT_ID = "test-agent-id"  # dedicated test agent


@pytest.fixture(autouse=True)
def clean_agent():
    """Clear all memories before each test."""
    kv = Kronvex(os.environ["KRONVEX_API_KEY"])
    kv.agent(AGENT_ID).clear_all()  # test helper endpoint
    yield


def test_semantic_recall_precision():
    kv = Kronvex(os.environ["KRONVEX_API_KEY"])
    agent = kv.agent(AGENT_ID)

    # Store semantically related and unrelated memories
    agent.remember("User prefers formal tone", memory_type="semantic")
    agent.remember("User is on the Pro plan", memory_type="semantic")
    agent.remember("Company uses Slack for comms", memory_type="semantic")
    agent.remember("Last deployment was on March 10", memory_type="episodic")

    result = agent.recall(query="how should I write my responses?", top_k=2)

    # The tone preference should be the top result
    assert result.memories[0].content == "User prefers formal tone"
    # Confidence should be high (semantic match)
    assert result.memories[0].confidence >= 0.60
    # Deployment info should NOT be in top-2
    contents = [m.content for m in result.memories]
    assert "Last deployment was on March 10" not in contents


def test_session_isolation():
    """Memories from session A must not appear in session B queries."""
    kv = Kronvex(os.environ["KRONVEX_API_KEY"])
    agent = kv.agent(AGENT_ID)

    agent.remember("Alice is VP of Engineering", memory_type="semantic", session_id="alice")
    agent.remember("Bob is on the free plan", memory_type="semantic", session_id="bob")

    bob_result = agent.recall(query="what role does this user have?", session_id="bob")
    bob_contents = [m.content for m in bob_result.memories]
    assert "Alice is VP of Engineering" not in bob_contents
For complete isolation, also call clear_all() in your fixture teardown (after the yield), so memories written during one test cannot leak into later test runs.