feat: init

2026-02-26 13:55:13 -05:00
commit b96277064a
8 changed files with 459 additions and 0 deletions
@@ -0,0 +1,3 @@
 MAILDIR_PATH=/path/to/your/maildir/dump
 QDRANT_URL=http://localhost:6333
 COLLECTION_NAME=my_emails
@@ -0,0 +1,28 @@
 # Virtual environments
 venv/
 .venv/
 env/
 env.bak/
 # Environment variables
 .env
 .env.local
 # Python cache and compiled files
 __pycache__/
 *.py[cod]
 *$py.class
 *.so
 # Distribution / packaging
 build/
 dist/
 *.egg-info/
 .eggs/
 # Vector database local storage
 qdrant_storage/
 # OS generated files
 .DS_Store
 Thumbs.db
@@ -0,0 +1,81 @@
 # mcp-maildir
 **mcp-maildir** is an MCP (Model Context Protocol) server allowing AI agents (Claude, OpenHands, Cursor, etc.) to explore, search, and read your email archives in a **100% offline, local, and strict Read-Only (R/O)** manner.
 This project uses an email dump in the `Maildir` format (e.g., generated via `offlineimap`), vectorizes the content locally for semantic search, and stores everything in **Qdrant** to provide ultra-fast hybrid search (Semantic + Exact filtering on metadata like dates or senders).
 ## ✨ Features
 * 🔒 **Strict Read-Only**: The AI interacts with a Qdrant index; it has no direct access to your actual mailbox or the `maildir` files. Zero risk of accidental deletion or sending.
 * 🧠 **Hybrid Search**:
  * *Semantic*: Find concepts ("farewell party organization") via local `sentence-transformers`.
  * *Factual*: Filter deterministically by sender or exact date ranges (thanks to Qdrant's native payload indexes).
 * 🚀 **MCP Standard**: Instantly compatible with any client supporting the Model Context Protocol.
 ## 🏗️ Architecture
 1. **Source**: Your local `Maildir` folder.
 2. **Indexer (`indexer.py`)**: A Python script that parses emails, extracts raw text, generates local embeddings, and pushes everything to Qdrant along with metadata.
 3. **Database**: Qdrant (running locally via Docker).
 4. **MCP Server (`server.py`)**: Exposes search and read tools to the AI agent via `FastMCP`.
 ## 🛠️ Prerequisites
 * Python 3.10+
 * Docker (to run Qdrant)
 * An email folder in `Maildir` format
 ## 🚀 Installation & Setup
 ### 1. Clone and prepare the environment
 ```bash
 git clone https://github.com/your-username/mcp-maildir.git
 cd mcp-maildir
 # Create a virtual environment
 python -m venv venv
 source venv/bin/activate  # On Windows: venv\Scripts\activate
 # Install dependencies
 pip install -r requirements.txt
 ```
 ### 2. Start Qdrant (Vector Database)
 Run Qdrant locally using Docker (add `-d` to run it in the background):
 ```sh
 docker run -p 6333:6333 -p 6334:6334 \
    -v $(pwd)/qdrant_storage:/qdrant/storage:z \
    qdrant/qdrant
 ```
 ### 3. Configuration
 Create a `.env` file (or export the variables in your shell) to point the scripts at your Maildir folder and your Qdrant instance. All three variables are required:
 ```
 MAILDIR_PATH=/path/to/your/maildir/dump
 QDRANT_URL=http://localhost:6333
 COLLECTION_NAME=my_emails
 ```
 ### 4. Initial Indexing (Ingestion)
 Before the AI can search, you need to index your emails. Run the ingestion script (to be executed every time you sync new emails):
 ```sh
 python indexer.py
 ```
 Note: The `indexer.py` script automatically configures Qdrant payload indexes for metadata (`sender` and `receiver` as KEYWORD, `date` as DATETIME) so that metadata filters are fast and deterministic.
 ## 🤖 Usage with an MCP Client
 ### Tools exposed by the server
 The `server.py` script exposes the following tools to the AI:
 * `search_emails(query: str, sender: str | None = None, start_date: str | None = None, end_date: str | None = None)`: Performs a hybrid search. The metadata parameters (`sender`, `start_date`, `end_date`) are optional.
 * `read_email(message_id: str)`: Returns the full text content (cleaned of HTML) of a specific email.
@@ -0,0 +1,31 @@
 services:
  qdrant:
    image: docker.io/qdrant/qdrant:latest
    container_name: mcp_maildir_qdrant
    ports:
      - "6333:6333" # REST API
      - "6334:6334" # gRPC API
    volumes:
      - ./qdrant_storage:/qdrant/storage:z
    restart: unless-stopped
  mcp-server:
    build:
      context: .
      dockerfile: pkg/Dockerfile
    container_name: mcp_maildir_server
    ports:
      - "8000:8000" # Expose the MCP HTTP (SSE) server
    env_file:
      - .env
    environment:
      # Override Qdrant URL to point to the docker-compose service
      - QDRANT_URL=http://qdrant:6333
      # .env carries the HOST path of the maildir dump. Inside the container the
      # dump is always mounted at /maildir (see volumes below), so override the
      # variable to the in-container location.
      - MAILDIR_PATH=/maildir
    volumes:
      # Mount the source code for hot-reloading (optional)
      - ./src:/app/src:ro,z
      # Mount the maildir dump as read-only at a fixed in-container path.
      # The host side comes from MAILDIR_PATH in your .env file (or the shell),
      # falling back to ./maildir_dump when it is unset.
      - ${MAILDIR_PATH:-./maildir_dump}:/maildir:ro,z
    depends_on:
      - qdrant
@@ -0,0 +1,21 @@
 # Base image: Python 3.14 (slim variant)
 FROM docker.io/library/python:3.14-slim
 # Set environment variables
 ENV PYTHONDONTWRITEBYTECODE=1
 ENV PYTHONUNBUFFERED=1
 # Set the working directory in the container
 WORKDIR /app
 # Copy the requirements file into the container
 COPY requirements.txt .
 # Install dependencies using buildkit cache
 RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements.txt
 # Copy the source code
 COPY src/ ./src/
 # Command to run the MCP server
 CMD ["python", "src/server.py"]
@@ -0,0 +1,9 @@
 mcp
 fastmcp
 qdrant-client
 sentence-transformers
 python-dotenv
 uvicorn
 starlette
 beautifulsoup4
 python-dateutil
@@ -0,0 +1,254 @@
 """
 Indexer script to parse emails from Maildir and push them to Qdrant.
 """
 import os
 import email
 import mailbox
 from datetime import datetime
 from email.utils import parsedate_to_datetime
 from email.header import decode_header
 from typing import List, Dict, Any, Tuple
 import uuid
 from dotenv import load_dotenv
 from qdrant_client import QdrantClient
 from qdrant_client.http import models
 from sentence_transformers import SentenceTransformer
 from bs4 import BeautifulSoup
 # Load .env config
 load_dotenv()


 def _require_env(name: str) -> str:
    """Return environment variable *name*; raise ValueError when unset or empty."""
    value = os.environ.get(name, "")
    if not value:
        raise ValueError(f"{name} environment variable is required.")
    return value


 # Configuration: every setting is mandatory and read from the environment,
 # checked in this order so the first missing one is the one reported.
 MAILDIR_PATH = _require_env("MAILDIR_PATH")
 QDRANT_URL = _require_env("QDRANT_URL")
 COLLECTION_NAME = _require_env("COLLECTION_NAME")
 # Sentence-transformers model used to embed emails
 EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
 # Number of points accumulated before each upsert to Qdrant
 BATCH_SIZE = 50
 def decode_mime_words(s: str) -> str:
    """Turn a MIME encoded-word header value into a plain string.

    Each chunk reported by ``decode_header`` is decoded with its declared
    charset. Chunks without a charset, or with a charset label Python does not
    know, are decoded as UTF-8. Undecodable bytes are replaced rather than
    raising. An empty (falsy) value yields ``""``.
    """
    if not s:
        return ""

    def _as_text(chunk, charset):
        # decode_header hands back str chunks when nothing needed decoding
        if not isinstance(chunk, bytes):
            return chunk
        try:
            return chunk.decode(charset or 'utf-8', errors='replace')
        except LookupError:
            # Unknown charset label: fall back to UTF-8 instead of failing
            return chunk.decode('utf-8', errors='replace')

    return "".join(_as_text(chunk, charset) for chunk, charset in decode_header(s))
 def extract_text_from_html(html_content: str) -> str:
    """Reduce an HTML document to its visible text.

    Text nodes are stripped and joined with single spaces. This is best
    effort: if parsing fails for any reason the input is returned unchanged.
    """
    try:
        parsed = BeautifulSoup(html_content, "html.parser")
        plain_text = parsed.get_text(separator=" ", strip=True)
    except Exception:
        # Keep the raw markup rather than lose the content
        return html_content
    return plain_text
 def parse_email_message(msg: mailbox.Message) -> Tuple[str, List[str]]:
    """Extract the text body and the attachment filenames of an email.

    Walks every leaf MIME part of *msg*. A part that carries a filename or an
    ``attachment`` disposition is treated as an attachment: only its decoded
    filename is recorded and its content is never read. ``text/plain`` and
    ``text/html`` parts are decoded to text (HTML is reduced to plain text)
    and joined with newlines in walk order.

    A part whose declared charset is unknown to Python is decoded as UTF-8
    with replacement characters instead of being dropped, the same fallback
    ``decode_mime_words`` applies to headers.

    Returns:
        ``(body, attachments)``: the stripped, newline-joined body text and
        the list of decoded attachment filenames.
    """
    body_parts = []
    attachments = []
    for part in msg.walk():
        # Containers hold no content of their own; walk() also yields their children
        if part.is_multipart():
            continue
        filename = part.get_filename()
        content_disposition = str(part.get("Content-Disposition", ""))
        if filename or "attachment" in content_disposition:
            if filename:
                attachments.append(decode_mime_words(filename))
            continue
        content_type = part.get_content_type()
        if content_type not in ("text/plain", "text/html"):
            continue
        # NOTE(review): both halves of a multipart/alternative message are kept,
        # so the body can contain the plain and the HTML rendering of one text.
        try:
            payload = part.get_payload(decode=True)
            if not payload:
                continue
            if isinstance(payload, bytes):
                charset = part.get_content_charset('utf-8') or 'utf-8'
                try:
                    text = payload.decode(charset, errors='replace')
                except LookupError:
                    # Unknown charset label (e.g. "unknown-8bit"): keep the
                    # content by decoding it as UTF-8 instead of losing the part
                    text = payload.decode('utf-8', errors='replace')
            else:
                text = str(payload)
            if content_type == "text/html":
                text = extract_text_from_html(text)
            body_parts.append(text)
        except Exception as e:
            # Best effort: one malformed part must not discard the whole message
            print(f"Error extracting payload: {e}")
    return "\n".join(body_parts).strip(), attachments
 def init_qdrant_collection(client: QdrantClient, model: SentenceTransformer):
    """Create the target collection if needed and declare its payload indexes.

    The vector size is taken from *model*; a collection created here uses
    cosine distance. Payload indexes are then requested for ``date``
    (DATETIME), ``sender`` and ``receiver`` (KEYWORD), in that order,
    whether or not the collection already existed.
    """
    vector_size = model.get_sentence_embedding_dimension()
    # Look the collection up by name among the existing ones
    already_present = any(
        existing.name == COLLECTION_NAME
        for existing in client.get_collections().collections
    )
    if already_present:
        print(f"Collection '{COLLECTION_NAME}' already exists.")
    else:
        print(f"Creating collection '{COLLECTION_NAME}' with vector size {vector_size}...")
        vectors_config = models.VectorParams(size=vector_size, distance=models.Distance.COSINE)
        client.create_collection(
            collection_name=COLLECTION_NAME,
            vectors_config=vectors_config,
        )
    # Payload indexes back the exact (non-semantic) metadata filters
    print("Ensuring payload indexes exist...")
    indexed_fields = (
        ("date", models.PayloadSchemaType.DATETIME),
        ("sender", models.PayloadSchemaType.KEYWORD),
        ("receiver", models.PayloadSchemaType.KEYWORD),
    )
    for field_name, field_schema in indexed_fields:
        client.create_payload_index(
            collection_name=COLLECTION_NAME,
            field_name=field_name,
            field_schema=field_schema,
        )
 def _parse_message_date(msg) -> datetime:
    """Return the message's Date header as a datetime, or the current UTC time if it is missing or malformed."""
    # Local import: the module header only imports ``datetime`` itself
    from datetime import timezone

    raw_date = msg.get("Date")
    if raw_date:
        try:
            return parsedate_to_datetime(str(raw_date))
        except (TypeError, ValueError, OverflowError):
            # Malformed Date header: fall through to the indexing time
            pass
    # Timezone-aware on purpose: a naive local time would serialise without an
    # offset and could be read back as a different instant
    return datetime.now(timezone.utc)


 def _build_point(key, msg, model):
    """Convert one Maildir message into a Qdrant point (embedding + metadata payload)."""
    subject = decode_mime_words(msg.get("Subject", "No Subject"))
    sender = decode_mime_words(msg.get("From", "Unknown"))
    receiver = decode_mime_words(msg.get("To", "Unknown"))
    # Messages without a Message-ID fall back to their Maildir key so the
    # derived point ID is identical on every run; a random UUID here would
    # insert a duplicate point each time the archive is re-indexed.
    message_id = str(msg.get("Message-ID") or "") or f"maildir-key:{key}"
    # ISO 8601 string for the Qdrant DATETIME index
    iso_date = _parse_message_date(msg).isoformat()
    body_text, attachments = parse_email_message(msg)
    attachments_str = ", ".join(attachments) if attachments else "None"
    # Headers are embedded together with the body so they influence semantic search
    vector_text = (
        f"Date: {iso_date}\n"
        f"From: {sender}\n"
        f"To: {receiver}\n"
        f"Subject: {subject}\n\n"
        f"{body_text}\n\n"
        f"Attachments: {attachments_str}"
    )
    return models.PointStruct(
        # Deterministic point ID derived from the message ID, so re-indexing
        # overwrites the existing point instead of adding a new one
        id=str(uuid.uuid5(uuid.NAMESPACE_OID, message_id)),
        vector=model.encode(vector_text).tolist(),
        payload={
            "message_id": message_id,
            "date": iso_date,
            "sender": sender,
            "receiver": receiver,
            "subject": subject,
            "body_text": body_text,
            "attachments": attachments,
        },
    )


 def main():
    """
    Main ingestion function.
    Reads Maildir, extracts text, generates local embeddings, and pushes to Qdrant.

    Every directory under MAILDIR_PATH that contains ``cur``, ``new`` and
    ``tmp`` is treated as a Maildir. Messages are streamed one at a time,
    converted to points and upserted in batches of BATCH_SIZE. A failure on a
    single message is logged and does not stop the run. Returns early, after
    printing an error, when MAILDIR_PATH does not exist.
    """
    print(f"Indexing emails from {MAILDIR_PATH} into {QDRANT_URL}...")
    if not os.path.exists(MAILDIR_PATH):
        print(f"Error: Maildir path not found: {MAILDIR_PATH}")
        return
    # Initialize model
    print(f"Loading embedding model: {EMBEDDING_MODEL_NAME}...")
    model = SentenceTransformer(EMBEDDING_MODEL_NAME)
    # Initialize Qdrant
    print("Connecting to Qdrant...")
    qdrant_client = QdrantClient(url=QDRANT_URL)
    init_qdrant_collection(qdrant_client, model)
    # Pending points; the batch is shared across directories
    points = []
    for root, dirs, _files in os.walk(MAILDIR_PATH):
        # A valid Maildir has 'cur', 'new', and 'tmp' subdirectories
        if not all(subdir in dirs for subdir in ('cur', 'new', 'tmp')):
            continue
        print(f"Processing Maildir found at: {root}")
        # create=False: the archive is a read-only source, never create anything in it
        mbox = mailbox.Maildir(root, create=False)
        total_emails_in_dir = len(mbox)
        print(f"Found {total_emails_in_dir} emails in this directory.")
        # iteritems() yields messages lazily; items() would first load every
        # message of the folder into memory
        for idx, (key, msg) in enumerate(mbox.iteritems()):
            try:
                points.append(_build_point(key, msg, model))
                # Push in batches
                if len(points) >= BATCH_SIZE:
                    qdrant_client.upsert(
                        collection_name=COLLECTION_NAME,
                        points=points
                    )
                    print(f"Processed {idx + 1}/{total_emails_in_dir} emails in current directory...")
                    points = []
            except Exception as e:
                # Top-level boundary for one message: log it and keep indexing the rest
                print(f"Error processing email key={key}: {e}")
    # Push remaining points
    if points:
        qdrant_client.upsert(
            collection_name=COLLECTION_NAME,
            points=points
        )
    print("Indexing completed successfully!")
 if __name__ == "__main__":
    main()
@@ -0,0 +1,32 @@
 """
 MCP Server exposing search and read tools for the indexed emails.
 """
 import os
 from fastmcp import FastMCP
 from dotenv import load_dotenv
 # Load .env config
 load_dotenv()
 # Initialize FastMCP server ("mcp-maildir" is the server name); the
 # @mcp.tool() decorators below register their functions on this instance
 mcp = FastMCP("mcp-maildir")
@mcp.tool()
 def search_emails(query: str, sender: str | None = None, start_date: str | None = None, end_date: str | None = None):
    """
    Performs a hybrid search (Semantic + Exact filtering on metadata).
    """
    # NOTE: placeholder implementation. `sender`, `start_date` and `end_date`
    # are accepted but ignored and no Qdrant query is issued; the tool only
    # echoes the query back.
    # NOTE(review): FastMCP presumably publishes the docstring above as the
    # tool description shown to clients, which currently overstates what the
    # tool does. Confirm and align once the search is implemented.
    # TODO: Implement Qdrant search
    return f"Searching for '{query}'..."
@mcp.tool()
 def read_email(message_id: str):
    """
    Returns the full text content (cleaned of HTML) of a specific email.
    """
    # NOTE: placeholder implementation. Nothing is fetched yet; the tool only
    # echoes the requested message_id back.
    # TODO: Implement fetching email by message_id
    return f"Reading email {message_id}..."
 if __name__ == "__main__":
    # Start the MCP server using SSE (Server-Sent Events) over HTTP
    # host="0.0.0.0" listens on all interfaces; port 8000 is the one the
    # docker-compose service publishes ("8000:8000")
    mcp.run(transport="sse", host="0.0.0.0", port=8000)