| "use strict"; |
| |
| |
| |
| |
| Object.defineProperty(exports, "__esModule", { value: true }); |
| exports.LearningEngine = void 0; |
| |
/**
 * Default mapping from learning task to RL algorithm + hyperparameters.
 * Copied into LearningEngine.configs at construction and used as a fallback
 * by configure()/getConfig().
 *
 * Common keys: learningRate (update step size), discountFactor (gamma),
 * epsilon (exploration rate used by epsilon-greedy action selection).
 * Algorithm-specific keys: clipRange (PPO ratio clip), lambda (TD-lambda
 * trace decay), sequenceLength (decision-transformer context window).
 * NOTE(review): entropyCoef is declared here but is not read by any update
 * rule in this file — presumably reserved for future use.
 */
const TASK_ALGORITHM_MAP = {
    // Double Q-learning: reduces maximization bias for routing choices.
    'agent-routing': {
        algorithm: 'double-q',
        learningRate: 0.1,
        discountFactor: 0.95,
        epsilon: 0.1,
    },
    // SARSA: on-policy and conservative — low learning rate, low exploration.
    'error-avoidance': {
        algorithm: 'sarsa',
        learningRate: 0.05,
        discountFactor: 0.99,
        epsilon: 0.05,
    },
    // Actor-critic: TD-error-driven value + policy nudges for confidence.
    'confidence-scoring': {
        algorithm: 'actor-critic',
        learningRate: 0.01,
        discountFactor: 0.95,
        epsilon: 0.1,
        entropyCoef: 0.01,
    },
    // Decision transformer: episodic, return-conditioned; no exploration.
    'trajectory-learning': {
        algorithm: 'decision-transformer',
        learningRate: 0.001,
        discountFactor: 0.99,
        epsilon: 0,
        sequenceLength: 20,
    },
    // PPO: clipped advantage updates for stable preference learning.
    'context-ranking': {
        algorithm: 'ppo',
        learningRate: 0.0003,
        discountFactor: 0.99,
        epsilon: 0.2,
        clipRange: 0.2,
        entropyCoef: 0.01,
    },
    // TD(lambda): eligibility traces spread credit across recent states.
    'memory-recall': {
        algorithm: 'td-lambda',
        learningRate: 0.1,
        discountFactor: 0.9,
        epsilon: 0.1,
        lambda: 0.8,
    },
};
/**
 * LearningEngine — a tabular, multi-algorithm reinforcement-learning core.
 *
 * Each task name maps to one classic RL update rule (see TASK_ALGORITHM_MAP).
 * All algorithms share one discrete representation: per-state Q-tables
 * (state -> Map(action -> value)) plus, where needed, critic state values,
 * eligibility traces and an episode (trajectory) buffer.
 *
 * Experiences are plain objects: { state, action, reward, nextState, done }
 * with string states/actions and numeric rewards (shape inferred from usage
 * in this file — confirm against callers).
 */
class LearningEngine {
    constructor() {
        // Per-task algorithm configuration, seeded from TASK_ALGORITHM_MAP.
        this.configs = new Map();
        // Primary Q-table: state -> (action -> estimated return).
        this.qTables = new Map();
        // Secondary Q-table; only double Q-learning reads/writes it.
        this.qTables2 = new Map();
        // TD(lambda) eligibility traces: state -> (action -> trace weight).
        this.eligibilityTraces = new Map();
        // Reserved for a parameterized actor; not used by the current updates.
        this.actorWeights = new Map();
        // Critic state-value estimates V(s) (actor-critic, PPO).
        this.criticValues = new Map();
        // Episode buffer shared by monte-carlo, decision-transformer and DQN replay.
        this.trajectories = [];
        // Per-algorithm bookkeeping (updates, avgReward, convergenceScore).
        this.stats = new Map();
        // Rolling window of recent rewards, shared across all algorithms (max 1000).
        this.rewardHistory = [];
        // Seed per-task configs with copies of the defaults.
        for (const [task, config] of Object.entries(TASK_ALGORITHM_MAP)) {
            this.configs.set(task, { ...config });
        }
        const algorithms = [
            'q-learning', 'sarsa', 'double-q', 'actor-critic',
            'ppo', 'decision-transformer', 'monte-carlo', 'td-lambda', 'dqn'
        ];
        for (const alg of algorithms) {
            this.stats.set(alg, {
                algorithm: alg,
                updates: 0,
                avgReward: 0,
                convergenceScore: 0,
                lastUpdate: Date.now(),
            });
        }
    }

    /**
     * Merge `config` over the existing configuration for `task`.
     * Creates the entry if the task was previously unknown.
     */
    configure(task, config) {
        const existing = this.configs.get(task) || TASK_ALGORITHM_MAP[task];
        this.configs.set(task, { ...existing, ...config });
    }

    /**
     * Effective configuration for `task`, or `undefined` when the task is
     * neither configured nor present in TASK_ALGORITHM_MAP.
     */
    getConfig(task) {
        return this.configs.get(task) || TASK_ALGORITHM_MAP[task];
    }

    /**
     * Apply a single experience using the algorithm configured for `task`.
     *
     * @returns {number} signed learning delta (TD error / advantage), or 0
     *          when nothing was learned (unknown task or algorithm, or an
     *          episodic algorithm that is still mid-episode).
     */
    update(task, experience) {
        const config = this.getConfig(task);
        // Fix: previously an unknown task threw a TypeError on `config.algorithm`.
        if (!config) {
            return 0;
        }
        let delta = 0;
        switch (config.algorithm) {
            case 'q-learning':
                delta = this.qLearningUpdate(experience, config);
                break;
            case 'sarsa':
                delta = this.sarsaUpdate(experience, config);
                break;
            case 'double-q':
                delta = this.doubleQUpdate(experience, config);
                break;
            case 'actor-critic':
                delta = this.actorCriticUpdate(experience, config);
                break;
            case 'ppo':
                delta = this.ppoUpdate(experience, config);
                break;
            case 'td-lambda':
                delta = this.tdLambdaUpdate(experience, config);
                break;
            case 'monte-carlo':
                // Episodic: buffer the step; learn only at episode end.
                this.addToCurrentTrajectory(experience);
                if (experience.done) {
                    delta = this.monteCarloUpdate(config);
                }
                break;
            case 'decision-transformer':
                this.addToCurrentTrajectory(experience);
                if (experience.done) {
                    delta = this.decisionTransformerUpdate(config);
                }
                break;
            case 'dqn':
                delta = this.dqnUpdate(experience, config);
                break;
            default:
                // Unknown algorithm name: delta stays 0; updateStats below is
                // a no-op for unregistered algorithm names.
                break;
        }
        this.updateStats(config.algorithm, experience.reward, Math.abs(delta));
        return delta;
    }

    /**
     * Epsilon-greedy action selection over the primary Q-table.
     * With probability epsilon returns a uniformly random action (confidence
     * 0.5); otherwise returns the greedy action with a softmax confidence.
     *
     * @returns {{action: string|undefined, confidence: number}}
     */
    getBestAction(task, state, actions) {
        const config = this.getConfig(task);
        // Fix: previously an unknown task threw on `config.epsilon`; fall
        // back to pure greedy selection instead.
        const epsilon = config ? config.epsilon : 0;
        if (Math.random() < epsilon) {
            const randomAction = actions[Math.floor(Math.random() * actions.length)];
            return { action: randomAction, confidence: 0.5 };
        }
        let bestAction = actions[0];
        let bestValue = -Infinity;
        const values = [];
        const qTable = this.getQTable(state);
        for (const action of actions) {
            const value = qTable.get(action) || 0;
            values.push(value);
            if (value > bestValue) {
                bestValue = value;
                bestAction = action;
            }
        }
        const confidence = this.softmaxConfidence(values, actions.indexOf(bestAction));
        return { action: bestAction, confidence };
    }

    /**
     * Softmax distribution over `actions` from their Q-values in `state`.
     * Unseen actions default to Q = 0; an empty action list yields an empty map.
     */
    getActionProbabilities(state, actions) {
        const probs = new Map();
        const qTable = this.getQTable(state);
        const values = actions.map(a => qTable.get(a) || 0);
        // Subtract the max before exponentiating for numerical stability.
        const maxVal = Math.max(...values);
        const expValues = values.map(v => Math.exp(v - maxVal));
        const sumExp = expValues.reduce((a, b) => a + b, 0);
        for (let i = 0; i < actions.length; i++) {
            probs.set(actions[i], expValues[i] / sumExp);
        }
        return probs;
    }

    /**
     * Off-policy Q-learning: Q(s,a) += α · (r + γ·max_a' Q(s',a') − Q(s,a)).
     * @returns {number} the TD error.
     */
    qLearningUpdate(exp, config) {
        const { state, action, reward, nextState, done } = exp;
        const { learningRate: α, discountFactor: γ } = config;
        const qTable = this.getQTable(state);
        const nextQTable = this.getQTable(nextState);
        const currentQ = qTable.get(action) || 0;
        // Fix: the old `Math.max(0, ...)` clamped the bootstrap at zero and
        // overestimated next-state value whenever every known Q was negative.
        // Unseen next states still bootstrap from 0.
        const nextValues = Array.from(nextQTable.values());
        const maxNextQ = done || nextValues.length === 0 ? 0 : Math.max(...nextValues);
        const target = reward + γ * maxNextQ;
        const delta = target - currentQ;
        const newQ = currentQ + α * delta;
        qTable.set(action, newQ);
        return delta;
    }

    /**
     * Expected SARSA: on-policy bootstrap using the epsilon-greedy policy's
     * expected next-state value, (1−ε)·maxQ + ε·avgQ over known actions.
     * @returns {number} the TD error.
     */
    sarsaUpdate(exp, config) {
        const { state, action, reward, nextState, done } = exp;
        const { learningRate: α, discountFactor: γ, epsilon } = config;
        const qTable = this.getQTable(state);
        const nextQTable = this.getQTable(nextState);
        const currentQ = qTable.get(action) || 0;
        let nextQ = 0;
        if (!done) {
            const nextActions = Array.from(nextQTable.keys());
            if (nextActions.length > 0) {
                const maxQ = Math.max(...Array.from(nextQTable.values()));
                const avgQ = Array.from(nextQTable.values()).reduce((a, b) => a + b, 0) / nextActions.length;
                // Expected value under the epsilon-greedy behavior policy.
                nextQ = (1 - epsilon) * maxQ + epsilon * avgQ;
            }
        }
        const target = reward + γ * nextQ;
        const delta = target - currentQ;
        const newQ = currentQ + α * delta;
        qTable.set(action, newQ);
        return delta;
    }

    /**
     * Double Q-learning: randomly pick one table to update; select the
     * argmax action from that table's next-state entries but evaluate it
     * with the OTHER table, reducing maximization bias.
     * @returns {number} the TD error.
     */
    doubleQUpdate(exp, config) {
        const { state, action, reward, nextState, done } = exp;
        const { learningRate: α, discountFactor: γ } = config;
        const useFirst = Math.random() < 0.5;
        const qTable = useFirst ? this.getQTable(state) : this.getQTable2(state);
        const otherQTable = useFirst ? this.getQTable2(nextState) : this.getQTable(nextState);
        const nextQTable = useFirst ? this.getQTable(nextState) : this.getQTable2(nextState);
        const currentQ = qTable.get(action) || 0;
        let nextQ = 0;
        if (!done) {
            // Argmax from the updated table's family...
            let bestAction = '';
            let bestValue = -Infinity;
            for (const [a, v] of nextQTable) {
                if (v > bestValue) {
                    bestValue = v;
                    bestAction = a;
                }
            }
            // ...evaluated with the other table.
            if (bestAction) {
                nextQ = otherQTable.get(bestAction) || 0;
            }
        }
        const target = reward + γ * nextQ;
        const delta = target - currentQ;
        const newQ = currentQ + α * delta;
        qTable.set(action, newQ);
        return delta;
    }

    /**
     * One-step actor-critic: the critic learns V(s) from the TD error, and
     * the "actor" here is the shared Q-table nudged by the same TD error.
     * @returns {number} the TD error.
     */
    actorCriticUpdate(exp, config) {
        const { state, action, reward, nextState, done } = exp;
        const { learningRate: α, discountFactor: γ } = config;
        const V = this.criticValues.get(state) || 0;
        const V_next = done ? 0 : (this.criticValues.get(nextState) || 0);
        const tdError = reward + γ * V_next - V;
        this.criticValues.set(state, V + α * tdError);
        const qTable = this.getQTable(state);
        const currentQ = qTable.get(action) || 0;
        const newQ = currentQ + α * tdError;
        qTable.set(action, newQ);
        return tdError;
    }

    /**
     * PPO-style tabular update: a one-step advantage estimate with the
     * characteristic clipped-ratio objective approximated on Q-values.
     * NOTE(review): this is a heuristic stand-in for PPO, not the full
     * minibatch policy-gradient algorithm.
     * @returns {number} the advantage estimate.
     */
    ppoUpdate(exp, config) {
        const { state, action, reward, nextState, done } = exp;
        const { learningRate: α, discountFactor: γ, clipRange = 0.2 } = config;
        const V = this.criticValues.get(state) || 0;
        const V_next = done ? 0 : (this.criticValues.get(nextState) || 0);
        const advantage = reward + γ * V_next - V;
        this.criticValues.set(state, V + α * advantage);
        const qTable = this.getQTable(state);
        const oldQ = qTable.get(action) || 0;
        // Surrogate "probability ratio" driven by the advantage sign/size.
        const ratio = Math.exp(α * advantage);
        const clippedRatio = Math.max(1 - clipRange, Math.min(1 + clipRange, ratio));
        // Pessimistic (min) of clipped and unclipped objectives, as in PPO.
        const update = Math.min(ratio * advantage, clippedRatio * advantage);
        const newQ = oldQ + α * update;
        qTable.set(action, newQ);
        return advantage;
    }

    /**
     * TD(lambda) with accumulating eligibility traces: the TD error is
     * applied to every traced (state, action) pair, weighted by its trace,
     * and traces decay by γ·λ each step.
     * @returns {number} the TD error.
     */
    tdLambdaUpdate(exp, config) {
        const { state, action, reward, nextState, done } = exp;
        const { learningRate: α, discountFactor: γ, lambda = 0.8 } = config;
        const qTable = this.getQTable(state);
        const nextQTable = this.getQTable(nextState);
        const currentQ = qTable.get(action) || 0;
        // Fix: same zero-clamp bug as qLearningUpdate — do not floor the
        // bootstrap at 0 when known next-state values are negative.
        const nextValues = Array.from(nextQTable.values());
        const maxNextQ = done || nextValues.length === 0 ? 0 : Math.max(...nextValues);
        const tdError = reward + γ * maxNextQ - currentQ;
        // Accumulating trace for the visited pair.
        const traces = this.getEligibilityTraces(state);
        traces.set(action, (traces.get(action) || 0) + 1);
        // Propagate the TD error along all traces, then decay them.
        for (const [s, sTraces] of this.eligibilityTraces) {
            const sQTable = this.getQTable(s);
            for (const [a, trace] of sTraces) {
                const q = sQTable.get(a) || 0;
                sQTable.set(a, q + α * tdError * trace);
                // Fix: prune negligible traces instead of letting the trace
                // maps grow without bound (traces only ever decay, never
                // reach zero exactly).
                const decayed = γ * lambda * trace;
                if (decayed < 1e-8) {
                    sTraces.delete(a);
                } else {
                    sTraces.set(a, decayed);
                }
            }
            if (sTraces.size === 0) {
                this.eligibilityTraces.delete(s);
            }
        }
        return tdError;
    }

    /**
     * First-visit-style Monte Carlo: at episode end, walk the trajectory
     * backwards accumulating the discounted return G and regress each
     * Q(s,a) toward it.
     * @returns {number} mean absolute delta over the episode (0 if empty).
     */
    monteCarloUpdate(config) {
        const { learningRate: α, discountFactor: γ } = config;
        const trajectory = this.trajectories[this.trajectories.length - 1];
        if (!trajectory || trajectory.experiences.length === 0)
            return 0;
        let G = 0;
        let totalDelta = 0;
        for (let t = trajectory.experiences.length - 1; t >= 0; t--) {
            const exp = trajectory.experiences[t];
            G = exp.reward + γ * G;
            const qTable = this.getQTable(exp.state);
            const currentQ = qTable.get(exp.action) || 0;
            const delta = G - currentQ;
            qTable.set(exp.action, currentQ + α * delta);
            totalDelta += Math.abs(delta);
        }
        trajectory.completed = true;
        trajectory.totalReward = G;
        return totalDelta / trajectory.experiences.length;
    }

    /**
     * Return-conditioned episodic update over the last `sequenceLength`
     * steps: Q-values are nudged proportionally to normalized returns-to-go.
     * NOTE(review): a lightweight heuristic standing in for a real decision
     * transformer — there is no sequence model or attention here.
     * @returns {number} mean absolute delta over the window (0 if empty).
     */
    decisionTransformerUpdate(config) {
        const { learningRate: α, sequenceLength = 20 } = config;
        const trajectory = this.trajectories[this.trajectories.length - 1];
        if (!trajectory || trajectory.experiences.length === 0)
            return 0;
        let totalDelta = 0;
        const experiences = trajectory.experiences.slice(-sequenceLength);
        // Undiscounted returns-to-go; returns[0] is the window's total reward.
        const returns = [];
        let R = 0;
        for (let i = experiences.length - 1; i >= 0; i--) {
            R += experiences[i].reward;
            returns.unshift(R);
        }
        for (let i = 0; i < experiences.length; i++) {
            const exp = experiences[i];
            const qTable = this.getQTable(exp.state);
            const currentQ = qTable.get(exp.action) || 0;
            // +1 in the denominator avoids division by zero on all-zero returns.
            const normalizedReturn = returns[i] / (Math.abs(returns[0]) + 1);
            const target = currentQ + α * normalizedReturn * exp.reward;
            const delta = target - currentQ;
            qTable.set(exp.action, target);
            totalDelta += Math.abs(delta);
        }
        trajectory.completed = true;
        trajectory.totalReward = returns[0];
        return totalDelta / experiences.length;
    }

    /**
     * DQN-flavored update: store the new experience in the trajectory buffer
     * (the replay memory), then run a Q-learning step on a uniformly sampled
     * past experience. Falls back to an online update while the buffer is empty.
     * @returns {number} the TD error of the trained-on experience.
     */
    dqnUpdate(exp, config) {
        this.addToCurrentTrajectory(exp);
        const replayExp = this.sampleFromReplay();
        if (!replayExp)
            return this.qLearningUpdate(exp, config);
        return this.qLearningUpdate(replayExp, config);
    }

    /** Primary per-state action-value table, created on first access. */
    getQTable(state) {
        if (!this.qTables.has(state)) {
            this.qTables.set(state, new Map());
        }
        return this.qTables.get(state);
    }

    /** Secondary table for double Q-learning, created on first access. */
    getQTable2(state) {
        if (!this.qTables2.has(state)) {
            this.qTables2.set(state, new Map());
        }
        return this.qTables2.get(state);
    }

    /** Per-state eligibility-trace map, created on first access. */
    getEligibilityTraces(state) {
        if (!this.eligibilityTraces.has(state)) {
            this.eligibilityTraces.set(state, new Map());
        }
        return this.eligibilityTraces.get(state);
    }

    /**
     * Softmax probability of the value at `selectedIdx` among `values`.
     * Returns the neutral 0.5 for an empty value list.
     */
    softmaxConfidence(values, selectedIdx) {
        if (values.length === 0)
            return 0.5;
        // Max-subtraction keeps Math.exp from overflowing on large values.
        const maxVal = Math.max(...values);
        const expValues = values.map(v => Math.exp(v - maxVal));
        const sumExp = expValues.reduce((a, b) => a + b, 0);
        return expValues[selectedIdx] / sumExp;
    }

    /**
     * Append an experience to the open trajectory, starting a fresh one when
     * the previous episode has completed.
     *
     * Fixes two unbounded-growth issues: a terminal experience now closes
     * the trajectory here (previously only the monte-carlo / decision-
     * transformer updates did, so the DQN path accumulated one endless
     * episode), and the buffer is capped at the most recent 1000 trajectories.
     */
    addToCurrentTrajectory(exp) {
        const last = this.trajectories[this.trajectories.length - 1];
        if (!last || last.completed) {
            this.trajectories.push({
                experiences: [],
                totalReward: 0,
                completed: false,
            });
            if (this.trajectories.length > 1000) {
                this.trajectories.shift();
            }
        }
        const current = this.trajectories[this.trajectories.length - 1];
        current.experiences.push(exp);
        if (exp.done) {
            current.completed = true;
        }
    }

    /**
     * Uniformly sample one experience from the whole trajectory buffer, or
     * null when the buffer is empty. O(total experiences) per call.
     */
    sampleFromReplay() {
        const allExperiences = [];
        for (const traj of this.trajectories) {
            allExperiences.push(...traj.experiences);
        }
        if (allExperiences.length === 0)
            return null;
        return allExperiences[Math.floor(Math.random() * allExperiences.length)];
    }

    /**
     * Record one update for `algorithm`. Silently ignores unregistered names.
     * NOTE(review): avgReward is derived from the shared global reward window,
     * not a per-algorithm history, so algorithms running concurrently see the
     * same average.
     */
    updateStats(algorithm, reward, delta) {
        const stats = this.stats.get(algorithm);
        if (!stats)
            return;
        stats.updates++;
        stats.lastUpdate = Date.now();
        this.rewardHistory.push(reward);
        if (this.rewardHistory.length > 1000) {
            this.rewardHistory.shift();
        }
        stats.avgReward = this.rewardHistory.reduce((a, b) => a + b, 0) / this.rewardHistory.length;
        // Small recent deltas => score near 1 (converged); large => near 0.
        stats.convergenceScore = 1 / (1 + delta);
    }

    /**
     * Snapshot of the stats map. The Map itself is a copy, but the stat
     * objects are shared — treat them as read-only.
     */
    getStats() {
        return new Map(this.stats);
    }

    /**
     * Aggregate view: best-performing algorithm (by avgReward × convergence,
     * among those with at least one update), total updates, overall average
     * reward, and the list of active algorithms' stats.
     */
    getStatsSummary() {
        let bestAlgorithm = 'q-learning';
        let bestScore = -Infinity;
        let totalUpdates = 0;
        const algorithms = [];
        for (const [alg, stats] of this.stats) {
            algorithms.push(stats);
            totalUpdates += stats.updates;
            const score = stats.avgReward * stats.convergenceScore;
            if (score > bestScore && stats.updates > 0) {
                bestScore = score;
                bestAlgorithm = alg;
            }
        }
        return {
            bestAlgorithm,
            totalUpdates,
            avgReward: this.rewardHistory.length > 0
                ? this.rewardHistory.reduce((a, b) => a + b, 0) / this.rewardHistory.length
                : 0,
            algorithms: algorithms.filter(a => a.updates > 0),
        };
    }

    /**
     * Serialize learned state to a plain-JSON-friendly object. Trajectories
     * and reward history are truncated to bound the payload size.
     */
    export() {
        const qTables = {};
        for (const [state, actions] of this.qTables) {
            qTables[state] = Object.fromEntries(actions);
        }
        const qTables2 = {};
        for (const [state, actions] of this.qTables2) {
            qTables2[state] = Object.fromEntries(actions);
        }
        const criticValues = Object.fromEntries(this.criticValues);
        const stats = {};
        for (const [alg, s] of this.stats) {
            stats[alg] = s;
        }
        const configs = {};
        for (const [task, config] of this.configs) {
            configs[task] = config;
        }
        return {
            qTables,
            qTables2,
            criticValues,
            trajectories: this.trajectories.slice(-100),
            stats,
            configs,
            rewardHistory: this.rewardHistory.slice(-1000),
        };
    }

    /**
     * Restore state previously produced by export(). Missing sections are
     * treated as empty; stats/config entries merge over constructor defaults.
     */
    import(data) {
        this.qTables.clear();
        for (const [state, actions] of Object.entries(data.qTables || {})) {
            this.qTables.set(state, new Map(Object.entries(actions)));
        }
        this.qTables2.clear();
        for (const [state, actions] of Object.entries(data.qTables2 || {})) {
            this.qTables2.set(state, new Map(Object.entries(actions)));
        }
        // Fix: drop in-flight eligibility traces — they reference pre-import
        // episodes and would otherwise smear TD(lambda) updates onto the
        // freshly imported Q-tables.
        this.eligibilityTraces.clear();
        this.criticValues = new Map(Object.entries(data.criticValues || {}));
        this.trajectories = data.trajectories || [];
        for (const [alg, s] of Object.entries(data.stats || {})) {
            this.stats.set(alg, s);
        }
        for (const [task, config] of Object.entries(data.configs || {})) {
            this.configs.set(task, config);
        }
        this.rewardHistory = data.rewardHistory || [];
    }

    /**
     * Discard all learned state and zero the stats counters. Task configs
     * are left untouched.
     */
    clear() {
        this.qTables.clear();
        this.qTables2.clear();
        this.eligibilityTraces.clear();
        this.actorWeights.clear();
        this.criticValues.clear();
        this.trajectories = [];
        this.rewardHistory = [];
        for (const stats of this.stats.values()) {
            stats.updates = 0;
            stats.avgReward = 0;
            stats.convergenceScore = 0;
        }
    }

    /** Catalog of supported algorithms with human-readable descriptions. */
    static getAlgorithms() {
        return [
            { algorithm: 'q-learning', description: 'Simple off-policy learning', bestFor: 'General routing' },
            { algorithm: 'sarsa', description: 'On-policy, conservative', bestFor: 'Error avoidance' },
            { algorithm: 'double-q', description: 'Reduces overestimation', bestFor: 'Precise routing' },
            { algorithm: 'actor-critic', description: 'Policy gradient + value', bestFor: 'Confidence scoring' },
            { algorithm: 'ppo', description: 'Stable policy updates', bestFor: 'Preference learning' },
            { algorithm: 'decision-transformer', description: 'Sequence modeling', bestFor: 'Trajectory patterns' },
            { algorithm: 'monte-carlo', description: 'Full episode learning', bestFor: 'Unbiased estimates' },
            { algorithm: 'td-lambda', description: 'Eligibility traces', bestFor: 'Credit assignment' },
            { algorithm: 'dqn', description: 'Experience replay', bestFor: 'High-dim states' },
        ];
    }
}
| exports.LearningEngine = LearningEngine; |
| exports.default = LearningEngine; |
|
|