Build a Neural Network from Scratch in JavaScript
Modern deep learning frameworks hide thousands of lines behind a
single model.fit(). This tutorial strips everything away.
We build a fully-connected neural network in pure JavaScript — matrix
multiply, activation functions, forward pass, mean-squared-error loss,
backpropagation, and SGD — with zero external dependencies and full
source code you can run in any browser.
1. Neurons and Layers
A single artificial neuron computes a weighted sum of its inputs plus a bias, then applies a non-linear activation function $f$:
$$a = f\left(\sum_{i} w_i x_i + b\right)$$
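A layer is a column of neurons processing the same input vector in parallel. Its entire computation is one matrix-vector product, followed by a bias add and an element-wise activation:
$$\mathbf{a} = f(W\mathbf{x} + \mathbf{b})$$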
A network chains these layers: the output of one becomes the input of the next. Because each layer applies an affine transform followed by a non-linearity, a network with even a single, sufficiently wide hidden layer can approximate any continuous function on a compact domain to arbitrary accuracy (Universal Approximation Theorem, Cybenko 1989).
2. Matrix Operations
We represent matrices as flat Float64Arrays for speed. Key operations we need:
// Lightweight matrix library — row-major, flat array
/**
 * Builds a matrix record: its dimensions plus flat Float64Array storage.
 *
 * @param {number} rows - Number of rows.
 * @param {number} cols - Number of columns.
 * @param {ArrayLike<number>} [data] - Initial values; handed to the
 *   Float64Array constructor, so an array or typed array is copied.
 *   When omitted (or falsy) the storage is zero-filled with rows * cols slots.
 * @returns {{rows: number, cols: number, d: Float64Array}} The matrix record.
 */
function mat(rows, cols, data) {
  let d;
  if (data) {
    d = new Float64Array(data);
  } else {
    d = new Float64Array(rows * cols);
  }
  return { rows, cols, d };
}
/**
 * Matrix product C = A · B for row-major matrices.
 *
 * @param {{rows: number, cols: number, d: Float64Array}} A - Left operand (m×k).
 * @param {{rows: number, cols: number, d: Float64Array}} B - Right operand (k×n).
 * @returns {{rows: number, cols: number, d: Float64Array}} New m×n matrix.
 * @throws {RangeError} If A.cols differs from B.rows. Without this check a
 *   mismatch reads past the intended rows and silently produces NaN or
 *   wrong values.
 */
function matMul(A, B) {
  if (A.cols !== B.rows) {
    throw new RangeError(
      `matMul: inner dimensions differ (${A.rows}x${A.cols} times ${B.rows}x${B.cols})`
    );
  }
  const m = A.rows;
  const k = A.cols;
  const n = B.cols;
  const C = mat(m, n);
  for (let i = 0; i < m; i++) {
    // Row offsets are loop-invariant for the two inner loops.
    const rowA = i * k;
    const rowC = i * n;
    for (let j = 0; j < n; j++) {
      let s = 0;
      for (let p = 0; p < k; p++) {
        s += A.d[rowA + p] * B.d[p * n + j];
      }
      C.d[rowC + j] = s;
    }
  }
  return C;
}
/**
 * Returns the transpose of a row-major matrix as a new matrix.
 *
 * @param {{rows: number, cols: number, d: Float64Array}} A - Source matrix (r×c).
 * @returns {{rows: number, cols: number, d: Float64Array}} New c×r matrix
 *   whose element (j, i) equals A's element (i, j).
 */
function matT(A) {
  const { rows: srcRows, cols: srcCols, d: src } = A;
  const T = mat(srcCols, srcRows);
  // Walk the destination row by row; each destination row is a source column.
  for (let c = 0; c < srcCols; c += 1) {
    const destRow = c * srcRows;
    for (let r = 0; r < srcRows; r += 1) {
      T.d[destRow + r] = src[r * srcCols + c];
    }
  }
  return T;
}
/**
 * Adds a column vector to every column of a matrix (row-wise broadcast).
 *
 * @param {{rows: number, cols: number, d: Float64Array}} A - Matrix (r×c).
 * @param {{d: Float64Array}} b - Column vector; entry i is added to row i of A.
 * @returns {{rows: number, cols: number, d: Float64Array}} New r×c matrix;
 *   A and b are left untouched.
 */
function matAdd(A, b) {
  const { rows, cols } = A;
  const sum = mat(rows, cols);
  for (let r = 0; r < rows; r += 1) {
    const offset = r * cols;
    const bias = b.d[r]; // same bias for every column of this row
    for (let c = 0; c < cols; c += 1) {
      sum.d[offset + c] = A.d[offset + c] + bias;
    }
  }
  return sum;
}
3. Activation Functions
Sigmoid
σ(z) = 1/(1+e⁻ᶻ) — squashes to (0,1), classic but prone to vanishing gradients. σ'(z) = σ(z)(1−σ(z)).
Tanh
tanh(z) — zero-centered output in (−1,1), generally better than sigmoid for hidden layers. tanh'(z) = 1−tanh²(z).
ReLU
max(0,z) — no saturation for positive values, sparse activation, simple derivative: 0 or 1. Default choice for deep nets.
Softmax
eᶻᵢ/Σeᶻⱼ — converts logit vector to probability distribution, used in output layer for multi-class classification.
/**
 * Registry of element-wise activation functions, keyed by name.
 *
 * Each entry exposes:
 *   f(z)  — the activation applied to a pre-activation value z;
 *   df(a) — the derivative expressed in terms of the ACTIVATED value
 *           a = f(z), not in terms of z.
 */
const activations = {
  sigmoid: {
    f(z) {
      return 1 / (1 + Math.exp(-z));
    },
    // σ'(z) = σ(z)(1 − σ(z)), with a = σ(z)
    df(a) {
      return a * (1 - a);
    },
  },
  tanh: {
    f(z) {
      return Math.tanh(z);
    },
    // tanh'(z) = 1 − tanh²(z), with a = tanh(z)
    df(a) {
      return 1 - a * a;
    },
  },
  relu: {
    f(z) {
      return Math.max(0, z);
    },
    // a > 0 exactly when z > 0, so the slope can be read off the output.
    df(a) {
      return Number(a > 0);
    },
  },
};
4. Forward Pass
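The forward pass propagates input through every layer, storing the pre-activation Z and post-activation A at each layer — we will need them during backprop. Starting from $A^{[0]} = X$, each layer $\ell$ computes:
$$Z^{[\ell]} = W^{[\ell]} A^{[\ell-1]} + b^{[\ell]}, \qquad A^{[\ell]} = f^{[\ell]}\!\left(Z^{[\ell]}\right)$$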
5. Loss Functions
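The loss measures how wrong the network's prediction Ŷ is compared to the ground truth Y. For mean squared error over $n$ outputs and $m$ samples:
$$\mathcal{L}_{\text{MSE}} = \frac{1}{n\,m} \sum_{j=1}^{m} \sum_{i=1}^{n} \left(\hat{y}_{ij} - y_{ij}\right)^2$$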
The gradient of the loss with respect to the output layer's activations is the starting point for backpropagation.
6. Backpropagation
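Backprop applies the chain rule to propagate the loss gradient backward through each layer. For layer ℓ (going from L down to 1), given the incoming gradient $\delta^{[\ell]}$ with respect to the layer's activations and a batch of $m$ samples:
$$dZ^{[\ell]} = \delta^{[\ell]} \odot f'^{[\ell]}\!\left(Z^{[\ell]}\right)$$
$$dW^{[\ell]} = \frac{1}{m}\, dZ^{[\ell]} \left(A^{[\ell-1]}\right)^{\top}, \qquad db^{[\ell]} = \frac{1}{m} \sum_{j=1}^{m} dZ^{[\ell]}_{:,j}$$
$$\delta^{[\ell-1]} = \left(W^{[\ell]}\right)^{\top} dZ^{[\ell]}$$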
7. Full Network Implementation
// Fully-connected network — pure JS, zero dependencies
/**
 * One fully-connected layer computing A = f(W · A_prev + b).
 *
 * Matrices hold one sample per column. `forward` caches A_prev, Z and A so
 * that `backward` can compute the gradients dW and db, which `update` then
 * applies with a plain gradient-descent step.
 */
class DenseLayer {
  /**
   * @param {number} inSize - Number of inputs (columns of W).
   * @param {number} outSize - Number of neurons (rows of W).
   * @param {string} [activation='sigmoid'] - Key into the `activations` registry.
   * @throws {Error} If `activation` does not name a registered activation.
   */
  constructor(inSize, outSize, activation = 'sigmoid') {
    const spec = activations[activation];
    // Fail here rather than with an opaque TypeError deep inside forward().
    if (!spec || typeof spec.f !== 'function' || typeof spec.df !== 'function') {
      throw new Error(`Unknown activation "${String(activation)}"`);
    }
    this.activation = spec;
    // Xavier / Glorot *uniform* initialisation: W ~ U(-limit, +limit) with
    // limit = sqrt(6 / (fan_in + fan_out)), which gives the target variance
    // 2 / (fan_in + fan_out). (sqrt(2 / (fan_in + fan_out)) is the standard
    // deviation of the normal variant, not a uniform bound.)
    const limit = Math.sqrt(6 / (inSize + outSize));
    this.W = mat(outSize, inSize,
      Array.from({ length: outSize * inSize }, () => (Math.random() * 2 - 1) * limit));
    this.b = mat(outSize, 1); // zeros
    // Caches filled by forward().
    this.Z = null;
    this.A = null;
    this.A_prev = null;
    // Gradients filled by backward().
    this.dW = null;
    this.db = null;
  }

  /**
   * Forward pass for a batch.
   *
   * @param {{rows: number, cols: number, d: Float64Array}} A_prev - Input
   *   activations, inSize × m (one sample per column).
   * @returns {{rows: number, cols: number, d: Float64Array}} Output
   *   activations, outSize × m.
   */
  forward(A_prev) {
    this.A_prev = A_prev;
    this.Z = matAdd(matMul(this.W, A_prev), this.b);
    // Wrap f so it only ever receives the value, not map's index/array args.
    this.A = mat(this.Z.rows, this.Z.cols,
      this.Z.d.map((z) => this.activation.f(z)));
    return this.A;
  }

  /**
   * Backward pass: stores dW and db and returns the gradient for the layer below.
   *
   * dW and db are averaged over the m batch columns here, so `delta_next`
   * should be the per-sample gradient, not one already divided by the batch size.
   *
   * @param {{rows: number, cols: number, d: Float64Array}} delta_next -
   *   Gradient of the loss with respect to this layer's activations
   *   (already multiplied by W[ℓ+1]ᵀ by the layer above, if any).
   * @returns {{rows: number, cols: number, d: Float64Array}} Wᵀ · dZ, the
   *   gradient to pass to layer ℓ−1.
   * @throws {Error} If called before forward().
   */
  backward(delta_next) {
    if (this.Z === null) {
      throw new Error('DenseLayer.backward() called before forward()');
    }
    // dZ = δ ⊙ f'(Z); df takes the cached activation A = f(Z).
    const dZ = mat(this.Z.rows, this.Z.cols,
      delta_next.d.map((v, i) => v * this.activation.df(this.A.d[i])));
    const m = this.A_prev.cols;
    this.dW = scalarDiv(matMul(dZ, matT(this.A_prev)), m);
    this.db = rowMean(dZ);
    return matMul(matT(this.W), dZ); // δ to pass to layer ℓ−1
  }

  /**
   * Gradient-descent step: W -= lr · dW, b -= lr · db (in place).
   *
   * @param {number} lr - Learning rate.
   * @throws {Error} If called before backward().
   */
  update(lr) {
    if (this.dW === null || this.db === null) {
      throw new Error('DenseLayer.update() called before backward()');
    }
    for (let i = 0; i < this.W.d.length; i++) this.W.d[i] -= lr * this.dW.d[i];
    for (let i = 0; i < this.b.d.length; i++) this.b.d[i] -= lr * this.db.d[i];
  }
}
/**
 * A feed-forward network: an ordered list of layers, each exposing
 * forward(A), backward(delta) and update(lr).
 */
class NeuralNetwork {
  /**
   * @param {...object} layers - Layers in input-to-output order.
   */
  constructor(...layers) { this.layers = layers; }

  /**
   * Runs the forward pass through every layer.
   *
   * @param {{rows: number, cols: number, d: Float64Array}} X - Inputs, one
   *   sample per column.
   * @returns {{rows: number, cols: number, d: Float64Array}} Output of the
   *   last layer (X itself when there are no layers).
   */
  predict(X) {
    let A = X;
    for (const layer of this.layers) A = layer.forward(A);
    return A;
  }

  /**
   * One full-batch gradient-descent step on the mean-squared-error loss.
   *
   * @param {{rows: number, cols: number, d: Float64Array}} X - Inputs.
   * @param {{rows: number, cols: number, d: Float64Array}} Y - Targets, same
   *   shape as the network output.
   * @param {number} [lr=0.1] - Learning rate.
   * @returns {number} MSE of the predictions made BEFORE this step's update.
   * @throws {RangeError} If Y's shape differs from the network output's shape.
   */
  train(X, Y, lr = 0.1) {
    const Yhat = this.predict(X);
    if (Y.rows !== Yhat.rows || Y.cols !== Yhat.cols) {
      // A mismatch would otherwise turn the gradients (and weights) into NaN.
      throw new RangeError(
        `train: target shape ${Y.rows}x${Y.cols} does not match output shape ${Yhat.rows}x${Yhat.cols}`
      );
    }
    // mseLoss averages over outputs × samples, so dL/dŶ = 2(Ŷ − Y) / (n · m).
    // Each layer's backward() already averages over the m samples
    // (DenseLayer divides dW and db by the batch size), so only the 1/n factor
    // belongs here; dividing by m as well would shrink every gradient by an
    // extra 1/m.
    const outputs = Yhat.rows;
    let delta = mat(Yhat.rows, Yhat.cols,
      Yhat.d.map((v, i) => (2 * (v - Y.d[i])) / outputs));
    for (let i = this.layers.length - 1; i >= 0; i--)
      delta = this.layers[i].backward(delta);
    for (const layer of this.layers) layer.update(lr);
    return mseLoss(Yhat, Y);
  }
}
// Utility: scalar divide, row mean
/**
 * Divides every element of a matrix by a scalar.
 *
 * @param {{rows: number, cols: number, d: Float64Array}} M - Source matrix.
 * @param {number} s - Divisor applied to each element.
 * @returns {{rows: number, cols: number, d: Float64Array}} New matrix with
 *   the same dimensions; M is not modified.
 */
function scalarDiv(M, s) {
  const quotients = Float64Array.from(M.d, (value) => value / s);
  return mat(M.rows, M.cols, quotients);
}
/**
 * Averages each row of a matrix across its columns.
 *
 * @param {{rows: number, cols: number, d: Float64Array}} M - Source matrix (r×c).
 * @returns {{rows: number, cols: number, d: Float64Array}} r×1 column vector
 *   whose entry i is the mean of row i of M.
 */
function rowMean(M) {
  const { rows, cols, d } = M;
  const means = mat(rows, 1);
  let cursor = 0; // flat index; rows are contiguous in row-major storage
  for (let r = 0; r < rows; r += 1) {
    const rowEnd = cursor + cols;
    let total = 0;
    while (cursor < rowEnd) {
      total += d[cursor];
      cursor += 1;
    }
    means.d[r] = total / cols;
  }
  return means;
}
/**
 * Mean squared error between predictions and targets.
 *
 * @param {{d: ArrayLike<number>}} Yhat - Predictions.
 * @param {{d: ArrayLike<number>}} Y - Targets, indexed in step with Yhat.d.
 * @returns {number} Sum of squared element-wise differences divided by the
 *   number of elements in Yhat.d.
 */
function mseLoss(Yhat, Y) {
  const predicted = Yhat.d;
  const expected = Y.d;
  const count = predicted.length;
  let squaredError = 0;
  for (let i = 0; i < count; i += 1) {
    squaredError += (predicted[i] - expected[i]) ** 2;
  }
  return squaredError / count;
}
8. Training on XOR
XOR is the classic non-linearly-separable problem that a single-layer perceptron cannot solve. A hidden layer with at least 2 neurons can:
// XOR truth table as a single batch: every column is one sample.
// X has 2 rows (the two input bits) × 4 columns; Y holds the target per column.
const X = mat(2, 4, [
  0, 0, 1, 1,
  0, 1, 0, 1
]);
const Y = mat(1, 4, [0, 1, 1, 0]);

// Architecture: 2 inputs → 4 tanh hidden neurons → 1 sigmoid output.
const net = new NeuralNetwork(
  new DenseLayer(2, 4, 'tanh'),
  new DenseLayer(4, 1, 'sigmoid')
);

// Full-batch training; report the loss once every 1000 epochs.
for (let epoch = 0; epoch < 10000; epoch += 1) {
  const loss = net.train(X, Y, 0.5);
  if (epoch % 1000 !== 0) continue;
  console.log(`Epoch ${epoch}: loss = ${loss.toFixed(5)}`);
}

// After training, predictions should approximate [0, 1, 1, 0]
const pred = net.predict(X);
console.log('Predictions:', Array.from(pred.d, (v) => v.toFixed(3)));
Running this in the browser console trains in milliseconds. After 10 000 epochs the network reliably outputs values close to [0.02, 0.97, 0.97, 0.02] — correctly solving XOR with no library at all.