A practical guide to implementing memory systems for AI agents. Memory allows agents to maintain context across conversations, personalize responses, and learn from past interactions.
| Memory Type | How It Works | Best For | Trade-offs |
|---|---|---|---|
| Conversation Buffer | Stores full recent conversation | Simple chatbots, short sessions | High token usage, forgets old context |
| Sliding Window | Fixed-size window of the most recent N turns | Ongoing sessions needing recency | Anything older than the window is permanently lost |
| Vector Store | Embeds interactions, retrieves by similarity | Semantic recall over large histories | Weak on relationships, embedding cost |
| Knowledge Graph | Entities as nodes, relations as edges | Multi-hop reasoning, complex domains | Setup complexity, graph query overhead |
The simplest form — store the full conversation history and pass it as context.
from openai import OpenAI

client = OpenAI()


class ConversationBufferMemory:
    """Unbounded short-term memory: stores every turn of the conversation.

    Simple and lossless, but token usage grows linearly with conversation
    length, so it only suits short sessions.
    """

    def __init__(self):
        # Chronological list of {"role": ..., "content": ...} message dicts.
        self.messages = []

    def add_user_message(self, content: str):
        """Append a user turn to the history."""
        self.messages.append({"role": "user", "content": content})

    def add_assistant_message(self, content: str):
        """Append an assistant turn to the history."""
        self.messages.append({"role": "assistant", "content": content})

    def get_context(self) -> list:
        """Return a copy of the history (a copy, so callers can't mutate it)."""
        return self.messages.copy()

    def clear(self):
        """Discard all stored turns."""
        self.messages = []


memory = ConversationBufferMemory()


def chat(user_input: str) -> str:
    """Run one conversational turn, sending the full history as context."""
    memory.add_user_message(user_input)
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."}
        ] + memory.get_context(),
    )
    reply = response.choices[0].message.content
    memory.add_assistant_message(reply)
    return reply


# Usage
print(chat("My name is Alice"))
print(chat("What is my name?"))  # Remembers: "Alice"
Keep only the last N exchanges to control token usage while maintaining recent context.
from collections import deque
from openai import OpenAI

client = OpenAI()


class SlidingWindowMemory:
    """Bounded short-term memory that keeps only the most recent turns.

    Token usage stays constant, at the cost of forgetting anything that
    falls out of the window.
    """

    def __init__(self, window_size: int = 10):
        # Each exchange contributes two entries (user + assistant),
        # hence the doubling of the deque's capacity.
        self.window = deque(maxlen=window_size * 2)

    def _remember(self, role: str, content: str):
        # Shared append path; the deque silently evicts the oldest
        # entry once capacity is reached.
        self.window.append({"role": role, "content": content})

    def add_user_message(self, content: str):
        """Store a user turn."""
        self._remember("user", content)

    def add_assistant_message(self, content: str):
        """Store an assistant turn."""
        self._remember("assistant", content)

    def get_context(self) -> list:
        """Return the windowed history as a plain list of message dicts."""
        return list(self.window)


memory = SlidingWindowMemory(window_size=5)  # Keep last 5 exchanges


def chat(user_input: str) -> str:
    """Run one conversational turn using only the recent window as context."""
    memory.add_user_message(user_input)
    prompt = [{"role": "system", "content": "You are a helpful assistant."}]
    prompt.extend(memory.get_context())
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=prompt,
    )
    reply = completion.choices[0].message.content
    memory.add_assistant_message(reply)
    return reply
Embed past interactions and retrieve the most relevant ones via semantic search. Scales to thousands of past conversations.
import uuid

from openai import OpenAI
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct

client = OpenAI()
qdrant = QdrantClient(":memory:")  # Use url="http://localhost:6333" for production

# Collection sized for text-embedding-3-small vectors (1536 dims).
qdrant.create_collection(
    collection_name="agent_memory",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)


def embed(text: str) -> list:
    """Return the embedding vector for *text*."""
    result = client.embeddings.create(
        model="text-embedding-3-small",
        input=text,
    )
    return result.data[0].embedding


def store_memory(text: str, metadata: dict = None):
    """Embed *text* and upsert it (with optional metadata) into the collection."""
    payload = {"text": text}
    if metadata:
        payload.update(metadata)
    record = PointStruct(
        id=str(uuid.uuid4()),
        vector=embed(text),
        payload=payload,
    )
    qdrant.upsert(collection_name="agent_memory", points=[record])


def recall(query: str, top_k: int = 3) -> list[str]:
    """Return the *top_k* stored texts most semantically similar to *query*."""
    hits = qdrant.search(
        collection_name="agent_memory",
        query_vector=embed(query),
        limit=top_k,
    )
    return [hit.payload["text"] for hit in hits]


# Store interactions
store_memory("User prefers dark mode and works in fintech", {"user_id": "user123"})
store_memory("User's project deadline is March 2026", {"user_id": "user123"})
store_memory("User likes Python over JavaScript", {"user_id": "user123"})

# Recall relevant memories
memories = recall("What programming language does the user prefer?")
print(memories)  # Returns the Python preference memory


def chat_with_memory(user_input: str) -> str:
    """Answer *user_input* with retrieved memories injected into the system
    prompt, then persist the new exchange."""
    context = "\n".join(recall(user_input, top_k=3))
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": f"You are a helpful assistant. Relevant memories:\n{context}"},
            {"role": "user", "content": user_input},
        ],
    )
    reply = response.choices[0].message.content
    store_memory(f"User: {user_input}\nAssistant: {reply}")
    return reply
Model memories as entities and relationships for structured, multi-hop reasoning.
from neo4j import GraphDatabase

driver = GraphDatabase.driver(
    "bolt://localhost:7687",
    auth=("neo4j", "password"),
)


def add_entity_relation(entity1: str, relation: str, entity2: str):
    """Idempotently record (entity1)-[relation]->(entity2) in the graph.

    MERGE ensures repeated calls with the same triple create no duplicates.
    """
    cypher = (
        "MERGE (a:Entity {name: $e1}) "
        "MERGE (b:Entity {name: $e2}) "
        "MERGE (a)-[:RELATES {type: $rel}]->(b)"
    )
    with driver.session() as session:
        session.run(cypher, e1=entity1, e2=entity2, rel=relation)


def query_relations(entity: str, relation_type: str = None) -> list:
    """List outgoing relations of *entity*, optionally filtered by type.

    Returns a list of dicts: {"target": ...} when filtered, otherwise
    {"relation": ..., "target": ...}.
    """
    with driver.session() as session:
        if relation_type:
            records = session.run(
                "MATCH (a:Entity {name: $name})-[r:RELATES {type: $rel}]->(b) "
                "RETURN b.name AS target",
                name=entity,
                rel=relation_type,
            )
        else:
            records = session.run(
                "MATCH (a:Entity {name: $name})-[r:RELATES]->(b) "
                "RETURN r.type AS relation, b.name AS target",
                name=entity,
            )
        # Materialize inside the session so the result isn't consumed
        # after the session closes.
        return [dict(row) for row in records]


# Build knowledge graph from interactions
add_entity_relation("Alice", "WORKS_IN", "Fintech")
add_entity_relation("Alice", "PREFERS", "Dark Mode")
add_entity_relation("Alice", "USES", "Python")
add_entity_relation("Alice", "MANAGES", "Project Alpha")
add_entity_relation("Project Alpha", "DEADLINE", "March 2026")

# Multi-hop query
relations = query_relations("Alice")
print(relations)
# [{'relation': 'WORKS_IN', 'target': 'Fintech'},
#  {'relation': 'PREFERS', 'target': 'Dark Mode'}, ...]
Mem0 provides a managed memory layer that combines vector, graph, and SQL storage with automatic summarization.
pip install mem0ai
import os

from mem0 import MemoryClient

os.environ["OPENAI_API_KEY"] = "your-key"
client = MemoryClient(api_key="your-mem0-api-key")

# Add memories (auto-categorized: episodic, semantic, procedural)
client.add("User prefers dark mode and works in fintech.", user_id="user123")
client.add("User reviewed product X for project Y.", user_id="user123")

# Semantic search across all memory types
memories = client.search(query="user preferences", user_id="user123")
for m in memories:
    print(f"[{m['score']:.2f}] {m['text']}")


def agent_with_mem0(query: str, user_id: str) -> str:
    """Answer *query* grounded in the user's stored memories, then persist
    the new exchange back to Mem0."""
    from openai import OpenAI

    # Retrieve relevant memories
    hits = client.search(query=query, user_id=user_id)
    context = "\n".join(hit["text"] for hit in hits)

    # Generate response with memory context
    llm = OpenAI()
    completion = llm.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": f"Known about user:\n{context}"},
            {"role": "user", "content": query},
        ],
    )
    reply = completion.choices[0].message.content

    # Store the new interaction
    client.add(f"Q: {query} A: {reply}", user_id=user_id)
    return reply
Letta provides persistent agents with built-in memory management, including episodic and archival memory tiers.
pip install letta
from letta import create_client

# Connect to Letta server (run: letta server)
client = create_client()

# Create an agent with built-in memory management; human/persona blocks
# seed its core memory.
agent_state = client.create_agent(
    name="memory-agent",
    memory_human="User is a developer working on AI projects.",
    memory_persona="I am a helpful assistant with long-term memory.",
)

# Chat — Letta auto-manages memory tiers
first_response = client.send_message(
    agent_id=agent_state.id,
    message="I prefer Python and work in fintech.",
)
print(first_response.messages)

# Memory persists across sessions automatically
followup_response = client.send_message(
    agent_id=agent_state.id,
    message="What do you know about me?",
)
# Agent recalls preferences from previous interaction
For production systems, combine multiple memory types:
from dataclasses import dataclass, field


@dataclass
class HybridMemory:
    """Hybrid memory combining a short-term buffer with optional semantic
    (vector) and relational (graph) long-term stores.

    Fix over the naive version: `vector_store`/`graph_store` default to
    None, so both overflow handling and context assembly now guard against
    missing stores instead of raising AttributeError.
    """

    buffer: list = field(default_factory=list)  # Short-term
    window_size: int = 10
    vector_store: object = None  # Qdrant/Chroma for semantic recall
    graph_store: object = None  # Neo4j for relational recall

    def add_interaction(self, user_msg: str, assistant_msg: str):
        """Record one exchange; overflow is summarized into the vector store."""
        # 1. Add to buffer (short-term)
        self.buffer.append({"user": user_msg, "assistant": assistant_msg})
        if len(self.buffer) > self.window_size:
            # 2. Summarize and store overflow in vector store (if configured)
            overflow = self.buffer.pop(0)
            summary = f"User asked: {overflow['user']}. Response: {overflow['assistant']}"
            if self.vector_store is not None:
                self.vector_store.store_memory(summary)
            # 3. Extract entities for knowledge graph
            # Use NER or LLM to extract (entity, relation, entity) triples
            # self.graph_store.add_entity_relation(...)

    def get_context(self, query: str) -> str:
        """Combine recent turns with semantically relevant past memories."""
        recent = "\n".join(
            f"User: {m['user']}\nAssistant: {m['assistant']}"
            for m in self.buffer[-3:]
        )
        semantic = ""
        if self.vector_store is not None:
            semantic = "\n".join(self.vector_store.recall(query, top_k=3))
        # graph = self.graph_store.query_relations(...)
        return f"Recent:\n{recent}\n\nRelevant past:\n{semantic}"
memory agents vector-store knowledge-graph mem0 letta how-to