> helicone

LLM observability proxy that sits between your app and LLM providers. Logs every request, enables caching, rate limiting, and provides cost analytics. Works with OpenAI, Anthropic, and other providers with a one-line integration change.

fetch
$ curl "https://skillshub.wtf/TerminalSkills/skills/helicone?format=md"
SKILL.md — helicone

Helicone

Proxy Integration (OpenAI Python)

# helicone_proxy.py — Send OpenAI traffic via the Helicone proxy so each call is logged
from openai import OpenAI

# The Helicone key goes in its own header; the OpenAI key is still passed as api_key.
helicone_headers = {"Helicone-Auth": "Bearer sk-helicone-xxxx"}

client = OpenAI(
    api_key="sk-your-openai-key",
    base_url="https://oai.helicone.ai/v1",  # proxy endpoint instead of the default API host
    default_headers=helicone_headers,
)

chat_messages = [{"role": "user", "content": "Hello!"}]
response = client.chat.completions.create(model="gpt-4", messages=chat_messages)

# Print the text of the first (and here only) completion choice.
reply = response.choices[0].message
print(reply.content)

Proxy Integration (Anthropic)

# helicone_anthropic.py — Send Anthropic traffic via the Helicone proxy
from anthropic import Anthropic

# The Helicone key goes in its own header; the Anthropic key is still passed as api_key.
helicone_headers = {"Helicone-Auth": "Bearer sk-helicone-xxxx"}

client = Anthropic(
    api_key="sk-ant-xxxx",
    base_url="https://anthropic.helicone.ai",  # proxy endpoint instead of the default API host
    default_headers=helicone_headers,
)

prompt = [{"role": "user", "content": "Explain caching."}]
message = client.messages.create(
    model="claude-3-5-sonnet-20241022",
    max_tokens=1024,
    messages=prompt,
)

Custom Properties and User Tracking

# custom_properties.py — Tag requests with metadata you can filter on in the dashboard
from openai import OpenAI

client = OpenAI(
    default_headers={"Helicone-Auth": "Bearer sk-helicone-xxxx"},
    base_url="https://oai.helicone.ai/v1",
)

# Per-request metadata: user and session identifiers plus free-form
# "Helicone-Property-<Name>" tags.
request_tags = {
    "Helicone-User-Id": "user-123",
    "Helicone-Session-Id": "session-abc",
    "Helicone-Property-Feature": "support-chat",
    "Helicone-Property-Environment": "production",
    "Helicone-Property-Ticket-Id": "T-5678",
}

billing_prompt = [{"role": "user", "content": "Help me with billing"}]
response = client.chat.completions.create(
    model="gpt-4",
    messages=billing_prompt,
    extra_headers=request_tags,  # sent with this call only, on top of default_headers
)

Caching

# caching.py — Turn on response caching so repeated queries cost less
from openai import OpenAI

client = OpenAI(
    base_url="https://oai.helicone.ai/v1",
    default_headers={
        "Helicone-Auth": "Bearer sk-helicone-xxxx",
        "Helicone-Cache-Enabled": "true",  # header values are strings, not booleans
    },
)

MODEL = "gpt-3.5-turbo"
arithmetic_prompt = [{"role": "user", "content": "What is 2+2?"}]

# First call: nothing is cached yet, so it goes to the API
response1 = client.chat.completions.create(model=MODEL, messages=arithmetic_prompt)

# Second, identical call: answered from the cache (no API cost)
response2 = client.chat.completions.create(model=MODEL, messages=arithmetic_prompt)

# Per-request bucket size of 3.
# NOTE(review): per the Helicone caching docs this header caps how many distinct
# responses are stored for one identical request; it does not name or group a
# bucket — confirm against the current docs.
response3 = client.chat.completions.create(
    model=MODEL,
    messages=[{"role": "user", "content": "Summarize our FAQ"}],
    extra_headers={"Helicone-Cache-Bucket-Max-Size": "3"},
)

Rate Limiting

# rate_limiting.py — Enforce request quotas (per user or global) through a policy header
from openai import OpenAI

# Policy string used here: 10 requests per 60-second window, counted per user.
RATE_LIMIT_POLICY = "10;w=60;s=user"

client = OpenAI(
    base_url="https://oai.helicone.ai/v1",
    default_headers={
        "Helicone-Auth": "Bearer sk-helicone-xxxx",
        "Helicone-RateLimit-Policy": RATE_LIMIT_POLICY,
    },
)

# Presumably the user ID is what the per-user segment keys on — confirm in the docs.
per_user_headers = {"Helicone-User-Id": "user-123"}

response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "Hello"}],
    extra_headers=per_user_headers,
)

Retries and Fallbacks

# retries.py — Have the proxy retry failed calls automatically
from openai import OpenAI

# Retry settings are plain string-valued headers applied to every request.
retry_headers = {
    "Helicone-Retry-Enabled": "true",
    "Helicone-Retry-Num": "3",
    "Helicone-Retry-Factor": "2",  # multiplier for exponential backoff
}

client = OpenAI(
    base_url="https://oai.helicone.ai/v1",
    default_headers={
        "Helicone-Auth": "Bearer sk-helicone-xxxx",
        **retry_headers,
    },
)

analysis_prompt = [{"role": "user", "content": "Analyze this data"}]
response = client.chat.completions.create(model="gpt-4", messages=analysis_prompt)

Async Logging (Node.js)

// helicone_async.ts — Use async logging to avoid proxy latency in the request path
// NOTE(review): the package name ("@helicone/helicone") and the `logger.log(response, …)`
// call below could not be confirmed from this page — verify both against the current
// Helicone async-logging docs before copying this snippet.
import OpenAI from "openai";
import { HeliconeAsyncLogger } from "@helicone/helicone";

// The logger only needs the Helicone key; the OpenAI client below is created with
// default options and no Helicone base URL.
const logger = new HeliconeAsyncLogger({
  apiKey: "sk-helicone-xxxx",
});
logger.init();

const openai = new OpenAI();

// Top-level await: this snippet must run as an ES module (or inside an async function).
const response = await openai.chat.completions.create({
  model: "gpt-4",
  messages: [{ role: "user", content: "Hello" }],
});

// Log asynchronously — no added latency to user requests
await logger.log(response, {
  userId: "user-123",
  properties: { feature: "chat" },
});

Feedback and Scoring

# Rate a logged request through the Helicone feedback endpoint
# ({request-id} is a placeholder for the ID of the request being rated)
curl --request POST \
  --header "Authorization: Bearer sk-helicone-xxxx" \
  --header "Content-Type: application/json" \
  --data '{"rating": true}' \
  https://api.helicone.ai/v1/request/{request-id}/feedback
# scoring_api.py — Score requests programmatically
import requests


def score_request(request_id: str, rating: bool, timeout: float = 10.0) -> None:
    """Send a boolean feedback rating for a logged Helicone request.

    Args:
        request_id: ID of the request to rate; interpolated into the feedback URL.
        rating: Value sent as the ``rating`` field of the JSON body.
        timeout: Seconds to wait for the Helicone API before giving up.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.post(
        f"https://api.helicone.ai/v1/request/{request_id}/feedback",
        headers={"Authorization": "Bearer sk-helicone-xxxx"},
        json={"rating": rating},
        timeout=timeout,  # without this, requests can block indefinitely
    )
    # Surface failures (bad key, unknown request ID) instead of dropping them silently.
    response.raise_for_status()

Key Concepts

  • Proxy mode: Change base_url and add the Helicone-Auth header to route through Helicone — no other code changes needed
  • Async logging: Log after the fact for zero-latency overhead in production
  • Headers-based config: All features controlled via HTTP headers — no SDK lock-in
  • Cost tracking: Automatic token counting and cost calculation per request, user, and model
  • Custom properties: Tag requests with arbitrary key-value pairs for filtering and analytics
  • Cache: Reduce costs by caching identical requests; configurable TTL and bucket sizes

┌ stats

installs/wk: 0
░░░░░░░░░░
github stars: 38
████████░░
first seen: Mar 17, 2026
└────────────

┌ repo

TerminalSkills/skills
by TerminalSkills
└────────────

┌ tags

└────────────