Tutorial
⏱️ ~60 minutes 🎓 Intermediate–Advanced 🛠️ JavaScript · Math · Machine Learning

Neural Network from Scratch in Vanilla JS

No TensorFlow. No libraries. Build a fully-connected neural network from first principles: matrix-free forward pass, sigmoid and ReLU activations, mean-squared-error loss, backpropagation using the chain rule, and stochastic gradient descent. Test it on XOR and sine function approximation.

Prerequisites

Layer and Network Data Structure

Each layer stores weights, biases and cached activations needed for backprop. We use plain arrays (no matrices — easier to follow):

/**
 * Draws a small random initial weight.
 * @returns {number} A value in [-0.25, 0.25), derived from Math.random().
 */
function randomWeight() {
  const centered = Math.random() - 0.5; // shift [0, 1) to [-0.5, 0.5)
  return centered / 2;
}

/**
 * One fully-connected layer: a weight matrix, a bias vector, the name of its
 * activation, and the values cached by the forward pass for use in backprop.
 */
class Layer {
  /**
   * @param {number} inputSize - Number of neurons feeding into this layer.
   * @param {number} outputSize - Number of neurons in this layer.
   * @param {string} [activation='sigmoid'] - Activation name stored on the
   *   layer (forward/backward use it as a key into the `activations` table).
   */
  constructor(inputSize, outputSize, activation = 'sigmoid') {
    // w[j][i] = weight from input i to output j, one row per output neuron.
    const weights = [];
    for (let j = 0; j < outputSize; j++) {
      const row = [];
      for (let i = 0; i < inputSize; i++) {
        row.push(randomWeight());
      }
      weights.push(row);
    }
    this.w = weights;

    // Biases all start at zero.
    this.b = new Array(outputSize).fill(0);
    this.activation = activation;

    // Backprop cache — stays null until a forward pass fills it in.
    this.input = null; // vector fed into this layer
    this.z = null;     // pre-activation values (w·x + b)
    this.a = null;     // post-activation output
  }
}

/**
 * A feed-forward network: an ordered list of fully-connected layers.
 */
class NeuralNetwork {
  /**
   * @param {number[]} topology - Neuron count per level, e.g. [2, 4, 4, 1]
   *   means 2 inputs, two hidden layers of 4, and 1 output.
   * @param {string[]} [activations] - Activation name per layer (index 0 is
   *   the first layer after the inputs); missing entries use 'sigmoid'.
   */
  constructor(topology, activations) {
    // Every size after the first becomes a layer fed by the size before it.
    this.layers = topology.slice(1).map((outputSize, idx) => {
      const inputSize = topology[idx];
      const activationName = activations?.[idx] ?? 'sigmoid';
      return new Layer(inputSize, outputSize, activationName);
    });
  }
}

Activation Functions and Derivatives

/**
 * Activation lookup table. Each entry pairs the function (`fn`) with its
 * derivative (`der`); both take the pre-activation value and return a number.
 */
const activations = (() => {
  // Logistic curve, shared by sigmoid.fn and sigmoid.der.
  const logistic = (x) => 1 / (1 + Math.exp(-x));

  return {
    sigmoid: {
      fn: (x) => logistic(x),
      // σ'(x) = σ(x) · (1 − σ(x))
      der: (x) => {
        const s = logistic(x);
        return s * (1 - s);
      },
    },
    relu: {
      fn: (x) => Math.max(0, x),
      // Slope is 1 for strictly positive inputs, otherwise 0.
      der: (x) => Number(x > 0),
    },
    tanh: {
      fn: (x) => Math.tanh(x),
      // tanh'(x) = 1 − tanh²(x)
      der: (x) => 1 - Math.tanh(x) ** 2,
    },
    linear: {
      fn: (x) => x,
      der: () => 1,
    },
  };
})();

Forward Pass

Propagate input through every layer. For each layer, compute z = W·x + b then a = activation(z):

/**
 * Runs one input vector through every layer in order, caching each layer's
 * input, pre-activation (z) and activation (a) for the backward pass.
 * @param {number[]} input - Values for the input neurons.
 * @returns {number[]} Activations of the last layer.
 */
NeuralNetwork.prototype.forward = function(input) {
  let signal = input;

  for (const layer of this.layers) {
    layer.input = signal;
    const act = activations[layer.activation];

    // z[j] = b[j] + sum_i(w[j][i] * signal[i]); the sum starts from the bias
    layer.z = layer.w.map((weights, j) => {
      let sum = layer.b[j];
      for (let i = 0; i < weights.length; i++) {
        sum += weights[i] * signal[i];
      }
      return sum;
    });

    // a[j] = activation(z[j])
    const activate = act.fn;
    const output = [];
    for (let j = 0; j < layer.z.length; j++) {
      output.push(activate(layer.z[j]));
    }
    layer.a = output;

    // This layer's output is the next layer's input.
    signal = output;
  }

  return signal;
};

Loss Function

/**
 * Mean squared error: L = (1/N) * Σ(pred[i] - target[i])².
 * @param {number[]} pred - Predicted values.
 * @param {number[]} target - Expected values, indexed like `pred`.
 * @returns {number} Sum of squared differences divided by pred.length.
 */
function mseLoss(pred, target) {
  let squaredErrorSum = 0;
  for (let i = 0; i < pred.length; i++) {
    const diff = pred[i] - target[i];
    squaredErrorSum += diff ** 2;
  }
  return squaredErrorSum / pred.length;
}

/**
 * Gradient of the MSE loss with respect to each prediction:
 * dL/d(pred[i]) = 2 * (pred[i] - target[i]) / N.
 * @param {number[]} pred - Predicted values.
 * @param {number[]} target - Expected values, indexed like `pred`.
 * @returns {number[]} One gradient entry per prediction.
 */
function mseLossGrad(pred, target) {
  const count = pred.length;
  const gradient = [];
  for (let i = 0; i < count; i++) {
    const error = pred[i] - target[i];
    gradient.push((2 * error) / count);
  }
  return gradient;
}

Backpropagation

The chain rule propagates gradients backward from output to input. For each layer (in reverse):

/**
 * Backpropagates a loss gradient through the layers, last to first, using the
 * values cached by the most recent forward pass.
 * @param {number[]} lossGrad - dL/d(output) for the final layer's activations.
 * @returns {{dw: number[][], db: number[]}[]} Weight and bias gradients, one
 *   entry per layer, in the same order as `this.layers`.
 */
NeuralNetwork.prototype.backward = function(lossGrad) {
  const grads = [];
  // Gradient w.r.t. the current layer's output; for every layer but the last
  // it arrives from the layer that follows it (the one nearer the output).
  let upstream = lossGrad;

  for (let l = this.layers.length - 1; l >= 0; l--) {
    const layer = this.layers[l];
    const act = activations[layer.activation];

    // dL/dz[j] = upstream[j] * activation'(z[j])
    const dz = [];
    for (let j = 0; j < layer.z.length; j++) {
      dz.push(upstream[j] * act.der(layer.z[j]));
    }

    // dL/dw[j][i] = dz[j] * input[i]
    const dw = [];
    for (let j = 0; j < layer.w.length; j++) {
      const gradRow = [];
      for (let i = 0; i < layer.w[j].length; i++) {
        gradRow.push(dz[j] * layer.input[i]);
      }
      dw.push(gradRow);
    }

    // dL/db[j] = dz[j]
    const db = [...dz];

    // dL/d(input[i]) = sum_j(dz[j] * w[j][i]) — handed to the layer before this one
    upstream = layer.input.map((_, i) => {
      let sum = 0;
      for (let j = 0; j < layer.w.length; j++) {
        sum += dz[j] * layer.w[j][i];
      }
      return sum;
    });

    // Store by layer index so the result lines up with this.layers.
    grads[l] = { dw, db };
  }

  return grads;
};

SGD Weight Update

/**
 * Applies one gradient-descent step in place to every layer's weights and
 * biases.
 * @param {{dw: number[][], db: number[]}[]} grads - Per-layer gradients,
 *   ordered like `this.layers` (the shape `backward` returns).
 * @param {number} [lr=0.01] - Learning rate.
 */
NeuralNetwork.prototype.update = function(grads, lr = 0.01) {
  this.layers.forEach((layer, l) => {
    const { dw, db } = grads[l];

    layer.w.forEach((row, j) => {
      // w[j][i] -= lr * dL/dw[j][i]
      for (let i = 0; i < row.length; i++) {
        row[i] -= lr * dw[j][i];
      }
      // b[j] -= lr * dL/db[j]
      layer.b[j] -= lr * db[j];
    });
  });
};

/**
 * One pass over the dataset: for each sample, run forward, measure the loss,
 * backpropagate, and update the weights immediately (per-sample updates).
 * @param {object} net - Network exposing forward, backward and update.
 * @param {number[][]} inputs - One input vector per sample.
 * @param {number[][]} targets - One target vector per sample.
 * @param {number} lr - Learning rate passed through to net.update.
 * @returns {number} Mean of the per-sample MSE losses.
 */
function trainStep(net, inputs, targets, lr) {
  const sampleCount = inputs.length;
  let lossSum = 0;

  inputs.forEach((sample, k) => {
    const expected = targets[k];
    const output = net.forward(sample);
    lossSum += mseLoss(output, expected);
    net.update(net.backward(mseLossGrad(output, expected)), lr);
  });

  return lossSum / sampleCount;
}

Train on XOR and Visualise

// XOR dataset
const XOR_X = [[0,0],[0,1],[1,0],[1,1]];
const XOR_Y = [[0],[1],[1],[0]];

// 2 inputs → two hidden layers of 4 (sigmoid) → 1 output (sigmoid)
const net = new NeuralNetwork([2, 4, 4, 1], ['sigmoid','sigmoid','sigmoid']);

// Training loop
const canvas = document.getElementById('loss-chart');
const ctx = canvas.getContext('2d');
const lossHistory = [];

/**
 * Logs the network's prediction for every XOR input next to its target.
 * Expected output after successful training: values near 0, 1, 1, 0.
 */
function logXorResults() {
  console.log('XOR results after training:');
  for (let k = 0; k < XOR_X.length; k++) {
    const pred = net.forward(XOR_X[k])[0].toFixed(3);
    console.log(`[${XOR_X[k]}] → ${pred} (expected ${XOR_Y[k][0]})`);
  }
}

let epoch = 0;

/**
 * Trains for `steps` epochs, redraws the loss chart, then either schedules
 * the next batch on the following animation frame or, once 3000 epochs have
 * run, logs the final XOR predictions.
 * @param {number} [steps=50] - Epochs to run before yielding to the browser.
 */
function trainEpochs(steps = 50) {
  for (let i = 0; i < steps; i++) {
    const loss = trainStep(net, XOR_X, XOR_Y, 0.3);
    lossHistory.push(loss);
    epoch++;
  }

  // Quick loss chart (loss is clamped to 1 so early spikes stay on canvas)
  ctx.clearRect(0, 0, canvas.width, canvas.height);
  ctx.strokeStyle = '#22c55e';
  ctx.lineWidth = 2;
  ctx.beginPath();
  lossHistory.forEach((l, i) => {
    const x = i / lossHistory.length * canvas.width;
    const y = (1 - Math.min(l, 1)) * canvas.height;
    if (i === 0) {
      ctx.moveTo(x, y);
    } else {
      ctx.lineTo(x, y);
    }
  });
  ctx.stroke();

  if (epoch < 3000) {
    requestAnimationFrame(() => trainEpochs(50));
  } else {
    // Training is spread across animation frames, so code placed after the
    // initial trainEpochs() call would run after only the first batch.
    // Report here instead, once the final batch has actually finished.
    logXorResults();
  }
}

trainEpochs();

XOR is the classic "non-linearly separable" test — a network with no hidden layer can't solve it, but a network with at least one hidden layer can. The code above uses two hidden layers (2-4-4-1); a smaller 2-4-1 network usually solves XOR in ~2000 epochs with LR=0.3. If training doesn't converge, re-randomise the weights (local minima are common with XOR).

Continue Learning

🛠

Experiment in Playground

Extend the network — train on a different problem or add more layers in the browser.

Open Playground → View Simulation ↗