====== How to Add Memory to an Agent ======

A practical guide to implementing memory systems for AI agents. Memory allows agents to maintain context across conversations, personalize responses, and learn from past interactions.

===== Memory Architecture Overview =====

<code>
graph TB
    subgraph Short-Term
        A[Conversation Buffer] --> C[Working Memory]
        B[Sliding Window] --> C
    end
    subgraph Long-Term
        D[Vector Store Memory]
        E[Knowledge Graph Memory]
        F[SQL/Structured Store]
    end
    C -->|Summarize & Persist| D
    C -->|Extract Entities| E
    C -->|Store Facts| F
    G[User Query] --> H[Memory Router]
    H --> C
    H --> D
    H --> E
    D --> I[Context for LLM]
    E --> I
    C --> I
    I --> J[Agent Response]
</code>

===== Memory Types Compared =====

^ Memory Type ^ How It Works ^ Best For ^ Trade-offs ^
| **Conversation Buffer** | Stores full recent conversation | Simple chatbots, short sessions | High token usage, forgets old context |
| **Sliding Window** | Fixed-size window of recent N turns | Ongoing sessions needing recency | Fixed size limits depth |
| **Vector Store** | Embeds interactions, retrieves by similarity | Semantic recall over large histories | Weak on relationships, embedding cost |
| **Knowledge Graph** | Entities as nodes, relations as edges | Multi-hop reasoning, complex domains | Setup complexity, graph query overhead |

===== When to Use Which =====

<code>
graph TD
    A[Need memory?] --> B{Session length?}
    B -->|Short, < 10 turns| C[Conversation Buffer]
    B -->|Medium, 10-50 turns| D[Sliding Window]
    B -->|Long / Cross-session| E{What kind of recall?}
    E -->|Fuzzy / Semantic| F[Vector Store Memory]
    E -->|Relational / Structured| G[Knowledge Graph Memory]
    E -->|Both| H[Hybrid: Vector + Graph]
    F --> I[Use Mem0 or Qdrant]
    G --> J[Use Neo4j]
    H --> K[Use Mem0 with graph mode]
</code>

===== Approach 1: Conversation Buffer Memory =====

The simplest form — store the full conversation history and pass it as context.
from collections import deque from openai import OpenAI client = OpenAI() class ConversationBufferMemory: def __init__(self): self.messages = [] def add_user_message(self, content: str): self.messages.append({"role": "user", "content": content}) def add_assistant_message(self, content: str): self.messages.append({"role": "assistant", "content": content}) def get_context(self) -> list: return self.messages.copy() def clear(self): self.messages = [] memory = ConversationBufferMemory() def chat(user_input: str) -> str: memory.add_user_message(user_input) response = client.chat.completions.create( model="gpt-4o-mini", messages=[ {"role": "system", "content": "You are a helpful assistant."} ] + memory.get_context() ) reply = response.choices[0].message.content memory.add_assistant_message(reply) return reply # Usage print(chat("My name is Alice")) print(chat("What is my name?")) # Remembers: "Alice" ===== Approach 2: Sliding Window Memory ===== Keep only the last N exchanges to control token usage while maintaining recent context. 
from collections import deque from openai import OpenAI client = OpenAI() class SlidingWindowMemory: def __init__(self, window_size: int = 10): self.window = deque(maxlen=window_size * 2) # *2 for user+assistant pairs def add_user_message(self, content: str): self.window.append({"role": "user", "content": content}) def add_assistant_message(self, content: str): self.window.append({"role": "assistant", "content": content}) def get_context(self) -> list: return list(self.window) memory = SlidingWindowMemory(window_size=5) # Keep last 5 exchanges def chat(user_input: str) -> str: memory.add_user_message(user_input) response = client.chat.completions.create( model="gpt-4o-mini", messages=[ {"role": "system", "content": "You are a helpful assistant."} ] + memory.get_context() ) reply = response.choices[0].message.content memory.add_assistant_message(reply) return reply ===== Approach 3: Vector Store Memory ===== Embed past interactions and retrieve the most relevant ones via semantic search. Scales to thousands of past conversations. 
from openai import OpenAI from qdrant_client import QdrantClient from qdrant_client.models import Distance, VectorParams, PointStruct import uuid client = OpenAI() qdrant = QdrantClient(":memory:") # Use url="http://localhost:6333" for production # Create collection qdrant.create_collection( collection_name="agent_memory", vectors_config=VectorParams(size=1536, distance=Distance.COSINE) ) def embed(text: str) -> list: response = client.embeddings.create( model="text-embedding-3-small", input=text ) return response.data[0].embedding def store_memory(text: str, metadata: dict = None): vector = embed(text) point = PointStruct( id=str(uuid.uuid4()), vector=vector, payload={"text": text, **(metadata or {})} ) qdrant.upsert(collection_name="agent_memory", points=[point]) def recall(query: str, top_k: int = 3) -> list[str]: vector = embed(query) results = qdrant.search( collection_name="agent_memory", query_vector=vector, limit=top_k ) return [hit.payload["text"] for hit in results] # Store interactions store_memory("User prefers dark mode and works in fintech", {"user_id": "user123"}) store_memory("User's project deadline is March 2026", {"user_id": "user123"}) store_memory("User likes Python over JavaScript", {"user_id": "user123"}) # Recall relevant memories memories = recall("What programming language does the user prefer?") print(memories) # Returns the Python preference memory # Use in agent loop def chat_with_memory(user_input: str) -> str: relevant_memories = recall(user_input, top_k=3) context = "\n".join(relevant_memories) response = client.chat.completions.create( model="gpt-4o-mini", messages=[ {"role": "system", "content": f"You are a helpful assistant. 
Relevant memories:\n{context}"}, {"role": "user", "content": user_input} ] ) reply = response.choices[0].message.content store_memory(f"User: {user_input}\nAssistant: {reply}") return reply ===== Approach 4: Knowledge Graph Memory ===== Model memories as entities and relationships for structured, multi-hop reasoning. from neo4j import GraphDatabase driver = GraphDatabase.driver( "bolt://localhost:7687", auth=("neo4j", "password") ) def add_entity_relation(entity1: str, relation: str, entity2: str): with driver.session() as session: session.run( "MERGE (a:Entity {name: $e1}) " "MERGE (b:Entity {name: $e2}) " "MERGE (a)-[:RELATES {type: $rel}]->(b)", e1=entity1, e2=entity2, rel=relation ) def query_relations(entity: str, relation_type: str = None) -> list: with driver.session() as session: if relation_type: result = session.run( "MATCH (a:Entity {name: $name})-[r:RELATES {type: $rel}]->(b) " "RETURN b.name AS target", name=entity, rel=relation_type ) else: result = session.run( "MATCH (a:Entity {name: $name})-[r:RELATES]->(b) " "RETURN r.type AS relation, b.name AS target", name=entity ) return [dict(record) for record in result] # Build knowledge graph from interactions add_entity_relation("Alice", "WORKS_IN", "Fintech") add_entity_relation("Alice", "PREFERS", "Dark Mode") add_entity_relation("Alice", "USES", "Python") add_entity_relation("Alice", "MANAGES", "Project Alpha") add_entity_relation("Project Alpha", "DEADLINE", "March 2026") # Multi-hop query relations = query_relations("Alice") print(relations) # [{'relation': 'WORKS_IN', 'target': 'Fintech'}, # {'relation': 'PREFERS', 'target': 'Dark Mode'}, ...] ===== Using Mem0 (Production-Ready Memory Layer) ===== Mem0 provides a managed memory layer that combines vector, graph, and SQL storage with automatic summarization. 
pip install mem0ai from mem0 import MemoryClient import os os.environ["OPENAI_API_KEY"] = "your-key" client = MemoryClient(api_key="your-mem0-api-key") # Add memories (auto-categorized: episodic, semantic, procedural) client.add("User prefers dark mode and works in fintech.", user_id="user123") client.add("User reviewed product X for project Y.", user_id="user123") # Semantic search across all memory types memories = client.search(query="user preferences", user_id="user123") for m in memories: print(f"[{m['score']:.2f}] {m['text']}") # Use in agent loop def agent_with_mem0(query: str, user_id: str) -> str: # Retrieve relevant memories memories = client.search(query=query, user_id=user_id) context = "\n".join([m["text"] for m in memories]) # Generate response with memory context from openai import OpenAI llm = OpenAI() response = llm.chat.completions.create( model="gpt-4o-mini", messages=[ {"role": "system", "content": f"Known about user:\n{context}"}, {"role": "user", "content": query} ] ) reply = response.choices[0].message.content # Store the new interaction client.add(f"Q: {query} A: {reply}", user_id=user_id) return reply ===== Using Letta (Stateful Agent Framework) ===== Letta provides persistent agents with built-in memory management, including episodic and archival memory tiers. pip install letta from letta import create_client # Connect to Letta server (run: letta server) client = create_client() # Create an agent with built-in memory management agent_state = client.create_agent( name="memory-agent", memory_human="User is a developer working on AI projects.", memory_persona="I am a helpful assistant with long-term memory." ) # Chat — Letta auto-manages memory tiers response = client.send_message( agent_id=agent_state.id, message="I prefer Python and work in fintech." ) print(response.messages) # Memory persists across sessions automatically response = client.send_message( agent_id=agent_state.id, message="What do you know about me?" 
) # Agent recalls preferences from previous interaction ===== Hybrid Architecture: Best of All Worlds ===== For production systems, combine multiple memory types: from dataclasses import dataclass, field @dataclass class HybridMemory: buffer: list = field(default_factory=list) # Short-term window_size: int = 10 vector_store: object = None # Qdrant/Chroma for semantic recall graph_store: object = None # Neo4j for relational recall def add_interaction(self, user_msg: str, assistant_msg: str): # 1. Add to buffer (short-term) self.buffer.append({"user": user_msg, "assistant": assistant_msg}) if len(self.buffer) > self.window_size: # 2. Summarize and store overflow in vector store overflow = self.buffer.pop(0) summary = f"User asked: {overflow['user']}. Response: {overflow['assistant']}" self.vector_store.store_memory(summary) # 3. Extract entities for knowledge graph # Use NER or LLM to extract (entity, relation, entity) triples # self.graph_store.add_entity_relation(...) def get_context(self, query: str) -> str: # Combine all memory sources recent = "\n".join([ f"User: {m['user']}\nAssistant: {m['assistant']}" for m in self.buffer[-3:] ]) semantic = "\n".join(self.vector_store.recall(query, top_k=3)) # graph = self.graph_store.query_relations(...) 
return f"Recent:\n{recent}\n\nRelevant past:\n{semantic}" ===== Best Practices ===== * **Start simple** — Use conversation buffer for prototypes, upgrade to vector/graph as needed * **Scope memories** — Isolate by user_id, session_id, or agent_id to prevent cross-contamination * **Add decay** — Score memories by recency and relevance; auto-forget low-scoring entries * **Summarize** — Periodically summarize old memories to reduce storage and improve retrieval * **Compliance** — Encrypt stored memories; implement GDPR/HIPAA deletion on request * **Test recall** — Regularly verify that the right memories surface for given queries ===== See Also ===== * [[how_to_build_a_rag_pipeline|How to Build a RAG Pipeline]] * [[how_to_deploy_an_agent|How to Deploy an Agent]] * [[how_to_evaluate_an_agent|How to Evaluate an Agent]] {{tag>memory agents vector-store knowledge-graph mem0 letta how-to}}