Table of Contents

How to Add Memory to an Agent

A practical guide to implementing memory systems for AI agents. Memory allows agents to maintain context across conversations, personalize responses, and learn from past interactions.

Memory Architecture Overview

```mermaid
graph TB
    subgraph Short-Term
        A[Conversation Buffer] --> C[Working Memory]
        B[Sliding Window] --> C
    end
    subgraph Long-Term
        D[Vector Store Memory]
        E[Knowledge Graph Memory]
        F[SQL/Structured Store]
    end
    C -->|Summarize & Persist| D
    C -->|Extract Entities| E
    C -->|Store Facts| F
    G[User Query] --> H[Memory Router]
    H --> C
    H --> D
    H --> E
    D --> I[Context for LLM]
    E --> I
    C --> I
    I --> J[Agent Response]
```

Memory Types Compared

| Memory Type | How It Works | Best For | Trade-offs |
|---|---|---|---|
| Conversation Buffer | Stores full recent conversation | Simple chatbots, short sessions | High token usage, forgets old context |
| Sliding Window | Fixed-size window of recent N turns | Ongoing sessions needing recency | Fixed size limits depth |
| Vector Store | Embeds interactions, retrieves by similarity | Semantic recall over large histories | Weak on relationships, embedding cost |
| Knowledge Graph | Entities as nodes, relations as edges | Multi-hop reasoning, complex domains | Setup complexity, graph query overhead |

When to Use Which

```mermaid
graph TD
    A[Need memory?] --> B{Session length?}
    B -->|Short, < 10 turns| C[Conversation Buffer]
    B -->|Medium, 10-50 turns| D[Sliding Window]
    B -->|Long / Cross-session| E{What kind of recall?}
    E -->|Fuzzy / Semantic| F[Vector Store Memory]
    E -->|Relational / Structured| G[Knowledge Graph Memory]
    E -->|Both| H[Hybrid: Vector + Graph]
    F --> I[Use Mem0 or Qdrant]
    G --> J[Use Neo4j]
    H --> K[Use Mem0 with graph mode]
```

Approach 1: Conversation Buffer Memory

The simplest form — store the full conversation history and pass it as context.

from collections import deque
from openai import OpenAI
 
client = OpenAI()
 
class ConversationBufferMemory:
    def __init__(self):
        self.messages = []
 
    def add_user_message(self, content: str):
        self.messages.append({"role": "user", "content": content})
 
    def add_assistant_message(self, content: str):
        self.messages.append({"role": "assistant", "content": content})
 
    def get_context(self) -> list:
        return self.messages.copy()
 
    def clear(self):
        self.messages = []
 
memory = ConversationBufferMemory()
 
def chat(user_input: str) -> str:
    memory.add_user_message(user_input)
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."}
        ] + memory.get_context()
    )
    reply = response.choices[0].message.content
    memory.add_assistant_message(reply)
    return reply
 
# Usage
print(chat("My name is Alice"))
print(chat("What is my name?"))  # Remembers: "Alice"

Approach 2: Sliding Window Memory

Keep only the last N exchanges to control token usage while maintaining recent context.

from collections import deque
from openai import OpenAI
 
client = OpenAI()
 
class SlidingWindowMemory:
    def __init__(self, window_size: int = 10):
        self.window = deque(maxlen=window_size * 2)  # *2 for user+assistant pairs
 
    def add_user_message(self, content: str):
        self.window.append({"role": "user", "content": content})
 
    def add_assistant_message(self, content: str):
        self.window.append({"role": "assistant", "content": content})
 
    def get_context(self) -> list:
        return list(self.window)
 
memory = SlidingWindowMemory(window_size=5)  # Keep last 5 exchanges
 
def chat(user_input: str) -> str:
    memory.add_user_message(user_input)
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."}
        ] + memory.get_context()
    )
    reply = response.choices[0].message.content
    memory.add_assistant_message(reply)
    return reply

Approach 3: Vector Store Memory

Embed past interactions and retrieve the most relevant ones via semantic search. Scales to thousands of past conversations.

from openai import OpenAI
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import uuid
 
client = OpenAI()
qdrant = QdrantClient(":memory:")  # Use url="http://localhost:6333" for production
 
# Create collection
qdrant.create_collection(
    collection_name="agent_memory",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE)
)
 
def embed(text: str) -> list:
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=text
    )
    return response.data[0].embedding
 
def store_memory(text: str, metadata: dict = None):
    vector = embed(text)
    point = PointStruct(
        id=str(uuid.uuid4()),
        vector=vector,
        payload={"text": text, **(metadata or {})}
    )
    qdrant.upsert(collection_name="agent_memory", points=[point])
 
def recall(query: str, top_k: int = 3) -> list[str]:
    vector = embed(query)
    results = qdrant.search(
        collection_name="agent_memory",
        query_vector=vector,
        limit=top_k
    )
    return [hit.payload["text"] for hit in results]
 
# Store interactions
store_memory("User prefers dark mode and works in fintech", {"user_id": "user123"})
store_memory("User's project deadline is March 2026", {"user_id": "user123"})
store_memory("User likes Python over JavaScript", {"user_id": "user123"})
 
# Recall relevant memories
memories = recall("What programming language does the user prefer?")
print(memories)  # Returns the Python preference memory
 
# Use in agent loop
def chat_with_memory(user_input: str) -> str:
    relevant_memories = recall(user_input, top_k=3)
    context = "\n".join(relevant_memories)
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": f"You are a helpful assistant. Relevant memories:\n{context}"},
            {"role": "user", "content": user_input}
        ]
    )
    reply = response.choices[0].message.content
    store_memory(f"User: {user_input}\nAssistant: {reply}")
    return reply

Approach 4: Knowledge Graph Memory

Model memories as entities and relationships for structured, multi-hop reasoning.

from neo4j import GraphDatabase
 
driver = GraphDatabase.driver(
    "bolt://localhost:7687",
    auth=("neo4j", "password")
)
 
def add_entity_relation(entity1: str, relation: str, entity2: str):
    with driver.session() as session:
        session.run(
            "MERGE (a:Entity {name: $e1}) "
            "MERGE (b:Entity {name: $e2}) "
            "MERGE (a)-[:RELATES {type: $rel}]->(b)",
            e1=entity1, e2=entity2, rel=relation
        )
 
def query_relations(entity: str, relation_type: str = None) -> list:
    with driver.session() as session:
        if relation_type:
            result = session.run(
                "MATCH (a:Entity {name: $name})-[r:RELATES {type: $rel}]->(b) "
                "RETURN b.name AS target",
                name=entity, rel=relation_type
            )
        else:
            result = session.run(
                "MATCH (a:Entity {name: $name})-[r:RELATES]->(b) "
                "RETURN r.type AS relation, b.name AS target",
                name=entity
            )
        return [dict(record) for record in result]
 
# Build knowledge graph from interactions
add_entity_relation("Alice", "WORKS_IN", "Fintech")
add_entity_relation("Alice", "PREFERS", "Dark Mode")
add_entity_relation("Alice", "USES", "Python")
add_entity_relation("Alice", "MANAGES", "Project Alpha")
add_entity_relation("Project Alpha", "DEADLINE", "March 2026")
 
# Multi-hop query
relations = query_relations("Alice")
print(relations)
# [{'relation': 'WORKS_IN', 'target': 'Fintech'},
#  {'relation': 'PREFERS', 'target': 'Dark Mode'}, ...]

Using Mem0 (Production-Ready Memory Layer)

Mem0 provides a managed memory layer that combines vector, graph, and SQL storage with automatic summarization.

pip install mem0ai
from mem0 import MemoryClient
import os
 
os.environ["OPENAI_API_KEY"] = "your-key"
 
client = MemoryClient(api_key="your-mem0-api-key")
 
# Add memories (auto-categorized: episodic, semantic, procedural)
client.add("User prefers dark mode and works in fintech.", user_id="user123")
client.add("User reviewed product X for project Y.", user_id="user123")
 
# Semantic search across all memory types
memories = client.search(query="user preferences", user_id="user123")
for m in memories:
    print(f"[{m['score']:.2f}] {m['text']}")
 
# Use in agent loop
def agent_with_mem0(query: str, user_id: str) -> str:
    # Retrieve relevant memories
    memories = client.search(query=query, user_id=user_id)
    context = "\n".join([m["text"] for m in memories])
 
    # Generate response with memory context
    from openai import OpenAI
    llm = OpenAI()
    response = llm.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": f"Known about user:\n{context}"},
            {"role": "user", "content": query}
        ]
    )
    reply = response.choices[0].message.content
 
    # Store the new interaction
    client.add(f"Q: {query} A: {reply}", user_id=user_id)
    return reply

Using Letta (Stateful Agent Framework)

Letta provides persistent agents with built-in memory management, including episodic and archival memory tiers.

pip install letta
from letta import create_client
 
# Connect to Letta server (run: letta server)
client = create_client()
 
# Create an agent with built-in memory management
agent_state = client.create_agent(
    name="memory-agent",
    memory_human="User is a developer working on AI projects.",
    memory_persona="I am a helpful assistant with long-term memory."
)
 
# Chat — Letta auto-manages memory tiers
response = client.send_message(
    agent_id=agent_state.id,
    message="I prefer Python and work in fintech."
)
print(response.messages)
 
# Memory persists across sessions automatically
response = client.send_message(
    agent_id=agent_state.id,
    message="What do you know about me?"
)
# Agent recalls preferences from previous interaction

Hybrid Architecture: Best of All Worlds

For production systems, combine multiple memory types:

from dataclasses import dataclass, field
 
@dataclass
class HybridMemory:
    buffer: list = field(default_factory=list)       # Short-term
    window_size: int = 10
    vector_store: object = None   # Qdrant/Chroma for semantic recall
    graph_store: object = None    # Neo4j for relational recall
 
    def add_interaction(self, user_msg: str, assistant_msg: str):
        # 1. Add to buffer (short-term)
        self.buffer.append({"user": user_msg, "assistant": assistant_msg})
        if len(self.buffer) > self.window_size:
            # 2. Summarize and store overflow in vector store
            overflow = self.buffer.pop(0)
            summary = f"User asked: {overflow['user']}. Response: {overflow['assistant']}"
            self.vector_store.store_memory(summary)
 
        # 3. Extract entities for knowledge graph
        # Use NER or LLM to extract (entity, relation, entity) triples
        # self.graph_store.add_entity_relation(...)
 
    def get_context(self, query: str) -> str:
        # Combine all memory sources
        recent = "\n".join([
            f"User: {m['user']}\nAssistant: {m['assistant']}"
            for m in self.buffer[-3:]
        ])
        semantic = "\n".join(self.vector_store.recall(query, top_k=3))
        # graph = self.graph_store.query_relations(...)
        return f"Recent:\n{recent}\n\nRelevant past:\n{semantic}"

Best Practices

See Also

memory agents vector-store knowledge-graph mem0 letta how-to