# config-sample.yml

# This is a sample config file.
# Rename this file to `config.yml` and modify its contents.

# Database connection settings.
db:
  driver: sqlite
  # Location of the database file. Use ':memory:' instead to avoid creating a file.
  connectionString: './data.db'

# Host and port of the HTTP server; the search results page is served at this address.
http:
  listen: localhost
  port: 8080

resultsPage:
  # Enables the default search results page, served at the address specified in the
  # `http` block above (localhost:8080 in this sample).
  enabled: true
  # An HTML fragment injected at the bottom of the <head> element on the search results page.
  # The example below colors every <h1> on the page green.
  customHTML: |
    <style type="text/css">
      h1 {
        color: green;
      }
    </style>

# The sites to crawl. Each list entry configures one site.
sources:
  # The internal ID of this site (`brendan` here). All API requests have to reference this ID.
  - id: brendan
    # The base URL, where crawling starts.
    url: https://www.bswanson.dev
    # Crawling is only allowed on these domains. The list must include the domain of the base URL.
    allowedDomains:
      - 'www.bswanson.dev'
    # The maximum number of links the crawler will follow away from the base URL.
    maxDepth: 100
    # The number of requests **per minute** that the crawler makes to your site.
    # A scheduled task is started from this number, so keep it modest to conserve CPU cycles.
    speed: 30
    refresh:
      # Set `enabled` to `true` to recrawl old content after a set number of days.
      enabled: true
      # The minimum time between refreshes, **in days**.
      # With the value below, pages are recrawled weekly.
      minAge: 7
    # The maximum number of characters of text content to index per page.
    sizeLimit: 200000 # Content is truncated after 200,000 characters
    embeddings:
      enabled: true
      # The maximum number of requests per minute sent to the embeddings API.
      # A scheduled task is started from this number, so keep it modest to conserve CPU cycles.
      speed: 30
      # The maximum number of chunks to include in a single embeddings API request.
      batchSize: 64

      # Option 1: OpenAI's embedding model. To use it, uncomment these lines, fill in
      # your API key, and remove the Ollama settings below (keys must not be duplicated):
      # openaiBaseUrl: https://api.openai.com/v1/
      # model: text-embedding-3-small
      # dimensions: 1536
      # apiKey: sk-*************************************

      # Option 2: any OpenAI-compatible API, such as a local Ollama server:
      openaiBaseUrl: http://localhost:11434/v1/
      model: bge-m3
      dimensions: 1024

      chunkSize: 200
      chunkOverlap: 30 # 15% of chunkSize