API Reference#
Complete reference for every public function and class in AI Cortex.
All symbols are importable directly from the aicortex package.
Quick Import Reference#
# The essentials — most users only need these
from aicortex import chat, models, families
# Model & server discovery
from aicortex import get_model_info, list_model_servers, get_server_info
# LangChain / OllamaLLM integration
from aicortex import get_llm_params, get_random_llm_params
# Advanced: build a raw request payload without sending it
from aicortex import build_api_request
# Streaming types
from aicortex import Stream, StreamEvent
# Tools sub-package
from aicortex import tools
Core Functions#
chat()#
The primary entry point for interacting with any Ollama-served language model. Handles model validation, server selection, automatic failover across multiple endpoints, and both synchronous and streaming response modes.
def chat(
prompt: str,
*,
model: str = "gpt-oss:20b",
stream: bool = False,
temperature: float = 0.7,
max_tokens: int | None = None,
top_p: float = 1.0,
stop: list[str] | None = None,
) -> str | Stream
Parameters
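| Name | Type | Default | Description |
|---|---|---|---|
| `prompt` | `str` | (required) | The input text to send to the model |
| `model` | `str` | `"gpt-oss:20b"` | Model identifier — must exist in the bundled registry. Use `models()` to list the available identifiers. |
| `stream` | `bool` | `False` | When `True`, returns a `Stream` of events instead of the full response text. |
| `temperature` | `float` | `0.7` | Controls randomness. Lower values give more deterministic output; higher values give more varied output. |
| `max_tokens` | `int \| None` | `None` | Maximum number of tokens to generate. |
| `top_p` | `float` | `1.0` | Nucleus sampling: at each step, only the top tokens whose cumulative probability reaches `top_p` are considered. |
| `stop` | `list[str] \| None` | `None` | One or more strings that cause generation to stop immediately when produced. Useful for structured output. |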
Returns
- str — full response text when stream=False
- Stream — iterable event container when stream=True
Raises
| Exception | When |
|---|---|
| `ValueError` | The `model` is not found in the bundled registry |
| `RuntimeError` | No Ollama servers are available for the model, or all tried servers failed |
Behaviour Notes
- AI Cortex shuffles the list of known servers for the model and tries each one in random order. Only when every server fails does it raise RuntimeError. Transient network failures are silently retried.
- Passing model=None (internal API) selects a random model from the full registry.
Examples
from aicortex import chat
# Minimal — uses default model
response = chat("What year did the Berlin Wall fall?")
print(response)
# Deterministic code generation
code = chat(
"Write a Python function that checks if a number is prime.",
model="llama3.2:3b",
temperature=0.1,
max_tokens=300,
)
# Creative writing with stop sequence
poem = chat(
"Write a haiku about autumn:",
model="mistral:7b",
temperature=0.9,
stop=["\n\n"],
)
# Streaming
stream = chat("Explain how GPT works.", stream=True)
for event in stream:
if event.type == "token":
print(event.content, end="", flush=True)
families()#
Returns the list of model families whose metadata is bundled with the package.
Family names are derived from the JSON filenames in aicortex/models/.
def families() -> List[str]
Returns — List[str]: alphabetically sorted family names.
from aicortex import families
print(families())
# ['deepseek', 'gemma', 'llama', 'mistral', 'qwen']
💡 Tip: Family names are always lowercase and correspond 1-to-1 with files in
aicortex/models/ (e.g. llama.json → "llama").
models()#
Lists all available model names, optionally filtered to a single family.
def models(family: Optional[str] = None) -> List[str]
Parameters
| Name | Type | Default | Description |
|---|---|---|---|
| `family` | `Optional[str]` | `None` | Family name to filter by (case-insensitive). Returns all models when `None`. |
Returns — List[str]: model name strings (e.g. "llama3.2:3b", "mistral:7b").
from aicortex import models
# Every model across all families
all_models = models()
print(f"{len(all_models)} models available")
# Models in a specific family
llama = models("llama")
mistral = models("Mistral") # case-insensitive
unknown = models("unknown") # returns [] — never raises
get_model_info()#
Returns the complete metadata record for a named model from the bundled JSON registry. Useful for inspecting model size, quantization, server location, and performance data.
def get_model_info(model: str) -> Dict[str, Any]
Parameters
| Name | Type | Description |
|---|---|---|
| `model` | `str` | Exact model name (e.g. `"llama3.2:3b"`) |
Returns — Dict[str, Any]: the full metadata dict for that model.
Raises — ValueError if the model is not found.
Example metadata fields
Field |
Description |
|---|---|
|
Canonical model identifier |
|
Ollama server URL where this model is hosted |
|
Human-readable parameter count (e.g. |
|
Quantization format (e.g. |
|
Model family ( |
|
File format (e.g. |
|
City of the hosting server |
|
Country of the hosting server |
|
Measured inference speed |
from aicortex import get_model_info
info = get_model_info("llama3.2:3b")
print(f"Size: {info['parameter_size']}")
print(f"Quantization: {info['quantization_level']}")
print(f"Server: {info['ip_port']}")
print(f"Location: {info.get('ip_city_name_en')}, {info.get('ip_country_name_en')}")
list_model_servers()#
Returns every known Ollama server that hosts a specific model, with location and performance metadata for each.
def list_model_servers(model: str) -> List[Dict[str, Any]]
Parameters
| Name | Type | Description |
|---|---|---|
| `model` | `str` | Model name to look up |
Returns — List[Dict] where each dict has:
{
"url": "http://1.2.3.4:11434",
"location": {
"city": "Frankfurt",
"country": "Germany",
"continent": "Europe",
},
"organization": "Hetzner Online GmbH",
"performance": {
"tokens_per_second": 42.3,
"last_tested": "2025-04-01T12:00:00Z",
},
}
from aicortex import list_model_servers
servers = list_model_servers("llama3.2:3b")
print(f"{len(servers)} server(s) found")
for s in servers:
tps = s["performance"]["tokens_per_second"]
loc = f"{s['location']['city']}, {s['location']['country']}"
print(f" {s['url']} | {loc} | {tps} tok/s")
get_server_info()#
Returns metadata for a single Ollama server hosting a given model.
If server_url is omitted, returns the first server in the list.
def get_server_info(
model: str,
server_url: Optional[str] = None,
) -> Dict[str, Any]
Parameters
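| Name | Type | Default | Description |
|---|---|---|---|
| `model` | `str` | (required) | Model name |
| `server_url` | `Optional[str]` | `None` | If provided, returns info for that specific URL. Raises `ValueError` if that URL is not found among the model's servers. |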
Raises
- ValueError — model has no known servers, or server_url was specified but not found.
from aicortex import get_server_info
# First available server
server = get_server_info("llama3.2:3b")
print(server["url"])
# Specific server
server = get_server_info("llama3.2:3b", server_url="http://1.2.3.4:11434")
build_api_request()#
Constructs the raw Ollama JSON payload for a chat request without sending it. Use this when you need to inspect, log, or manually submit the request.
def build_api_request(
model: str,
prompt: str,
**kwargs: Any,
) -> Dict[str, Any]
Parameters
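| Name | Type | Description |
|---|---|---|
| `model` | `str` | Model name — validated against the registry |
| `prompt` | `str` | Input prompt text |
| `**kwargs` | `Any` | Any of: `temperature`, `top_p`, `stop`, `num_predict`, `seed` |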
Returns — Dict[str, Any]: Ollama-compatible request payload.
Raises — ValueError if the model is not in the registry.
from aicortex import build_api_request
payload = build_api_request(
model="llama3.2:3b",
prompt="What is 2 + 2?",
temperature=0.0,
seed=42,
num_predict=20,
)
# {
# 'model': 'llama3.2:3b',
# 'prompt': 'What is 2 + 2?',
# 'options': {
# 'temperature': 0.0,
# 'top_p': 0.9,
# 'stop': [],
# 'num_predict': 20,
# 'seed': 42
# }
# }
get_llm_params()#
Returns a {"model": ..., "base_url": ...} dict for a specific model,
selecting a random live server. Designed for direct use with LangChain's OllamaLLM.
def get_llm_params(model: Optional[str] = None) -> Dict[str, str]
Parameters
| Name | Type | Default | Description |
|---|---|---|---|
| `model` | `Optional[str]` | `None` | Model name. When `None`, a random model is selected. |
Returns — Dict[str, str] with keys "model" and "base_url".
Raises
- ValueError — specified model not found in registry.
- RuntimeError — no models available, or no servers for the model.
from aicortex import get_llm_params
from langchain_ollama import OllamaLLM
params = get_llm_params("mistral:7b")
# {'model': 'mistral:7b', 'base_url': 'http://...'}
llm = OllamaLLM(**params)
print(llm.invoke("Summarise the Turing test in one sentence."))
get_random_llm_params()#
Equivalent to get_llm_params(model=None). Picks a random model and
a random server — useful for distributing load or experimentation.
def get_random_llm_params() -> Dict[str, str]
Returns — Dict[str, str] with keys "model" and "base_url".
Raises — RuntimeError if no models or servers are available.
from aicortex import get_random_llm_params
from langchain_ollama import OllamaLLM
# Every call may land on a different model and server
params = get_random_llm_params()
print(f"Using model: {params['model']} at {params['base_url']}")
llm = OllamaLLM(**params)
Classes#
StreamEvent#
A dataclass representing a single event emitted during a streaming response.
Every iteration of a Stream yields one StreamEvent.
@dataclass
class StreamEvent:
type: EventType # What kind of event this is
content: str | None # Text payload (present on 'token' events)
index: int | None # Sequential token index (0-based)
tool_name: str | None # Name of the tool being called
tool_args: dict | None # Arguments passed to the tool
tool_result: Any # Return value from tool execution
meta: dict | None # Arbitrary server-side metadata
timestamp: float | None # Unix timestamp of event creation
EventType values
Type |
When it fires |
|
|---|---|---|
|
Once, before any tokens — signals the stream has begun |
|
|
Once per generated token — the core data event |
The token text (may be a word, sub-word, or punctuation) |
|
Once, after all tokens — signals clean completion |
|
|
When a server or generation error occurs |
Error message string |
|
When the model invokes a tool |
Tool invocation info |
|
When a tool execution completes |
Tool result value |
|
For server-side metadata or diagnostics |
Varies |
⚠️ Important: Always check
event.type == "token" before reading event.content. On "start" and "end" events, content is an empty string, not None.
from aicortex import chat, StreamEvent
stream = chat("Name three planets.", stream=True)
for event in stream:
if event.type == "start":
print(f"[{event.timestamp:.2f}] Stream started")
elif event.type == "token":
print(event.content, end="", flush=True)
# event.index tells you which token number this is
elif event.type == "end":
print(f"\n[{event.timestamp:.2f}] Stream complete")
elif event.type == "error":
print(f"\n⚠️ Error: {event.content}")
Stream#
A container returned by chat(..., stream=True).
Collects StreamEvent objects as they arrive and exposes them as an iterable.
After iteration, all events remain accessible via stream.events.
class Stream:
events: list[StreamEvent] # All collected events
def __iter__(self) -> Iterator[StreamEvent]: ...
def add(self, event: StreamEvent) -> None: ...
def text(self) -> str: ...
Methods
__iter__() — Iterate over events#
stream = chat("Hello", stream=True)
for event in stream:
...
add(event) — Append an event#
Used internally by the streaming engine. You can also use it to build
Stream objects manually for testing:
from aicortex import Stream, StreamEvent
s = Stream()
s.add(StreamEvent(type="token", content="Hello", index=0))
s.add(StreamEvent(type="token", content=" world", index=1))
print(s.text()) # "Hello world"
text() — Extract full response text#
Concatenates the content of every "token" event in order.
Non-token events (start, end, error, meta) are excluded.
stream = chat("What is Python?", stream=True)
# Consume the stream silently, then get the full text
full_response = stream.text()
print(full_response)
# Or iterate AND retain the text
for event in stream:
if event.type == "token":
print(event.content, end="", flush=True)
print(f"\n\n{len(stream.text())} characters generated.")
💡 Note:
stream.events persists after iteration. You can inspect the full event log, count tokens, extract timestamps, or replay events.
stream = chat("Brief answer please.", stream=True)
for event in stream: pass # consume
token_events = [e for e in stream.events if e.type == "token"]
print(f"Generated {len(token_events)} tokens")
print(f"First token at t={token_events[0].timestamp:.3f}")
print(f"Last token at t={token_events[-1].timestamp:.3f}")
Tools Sub-Package#
See the dedicated Tools Reference for full documentation of:
- tools.find_valid_endpoints() — ping-test all known Ollama IP endpoints
- tools.fetch_models() — pull model lists from validated URLs
- tools.resolve_models() — merge fetched data with IP metadata
- tools.apply_valid_models() — write resolved data into family JSON files
- tools.run_server() — launch the OpenAI-compatible FastAPI proxy
Quick import:
from aicortex.tools import (
find_valid_endpoints,
fetch_models,
resolve_models,
apply_valid_models,
run_server,
)