Tutorial · Beginner · ~40 min
Markov Chain Text Generator
A Markov chain generates text by remembering the last N words and randomly choosing what comes next — based on frequencies observed in a training corpus. Order-2 chains produce surprisingly coherent gibberish; order-4 chains often reproduce exact source sentences. Build one in 60 lines of JavaScript.
1. Tokenise and build n-gram table
Split the training text into tokens (words, punctuation). For each n-gram (window of N consecutive tokens), record what token follows it:
/**
 * Split text into lowercase word and punctuation tokens.
 *
 * Words are runs of Unicode letters (plus combining marks) and apostrophes,
 * so accented words such as "café" and contractions typed with a curly
 * apostrophe ("don’t") stay in one piece. Each of . , ! ? ; : " - is its own
 * token. Everything else (digits, whitespace, other symbols) is dropped.
 *
 * @param {string} text - Raw input text.
 * @returns {string[]} Tokens in order of appearance; [] when nothing matches.
 */
function tokenize(text) {
  return (
    text
      // Compose decomposed accents so "e" + U+0301 and "é" become the same token.
      .normalize('NFC')
      .toLowerCase()
      .match(/[\p{L}\p{M}'’]+|[.,!?;:"-]/gu) || []
  );
}

/**
 * Build an order-N Markov model from raw text.
 *
 * @param {string} text - Training corpus.
 * @param {number} [order=2] - Number of consecutive tokens that form a state.
 * @returns {Map<string, string[]>} Map from a state (N tokens joined by a
 *   single space) to every token observed right after it. A token appears
 *   once per observation, so frequencies are encoded as repetitions.
 */
function buildModel(text, order = 2) {
  const tokens = tokenize(text);
  const model = new Map();

  // Stop while there is still a token after the window to record.
  for (let i = 0; i + order < tokens.length; i++) {
    const key = tokens.slice(i, i + order).join(' ');
    const next = tokens[i + order];
    const followers = model.get(key);
    if (followers) {
      followers.push(next);
    } else {
      model.set(key, [next]);
    }
  }
  return model;
}

// Example usage with a Shakespeare excerpt
const corpus = `To be or not to be that is the question
whether tis nobler in the mind to suffer
the slings and arrows of outrageous fortune …`;

const model = buildModel(corpus, 2);
// model.get("to be") → ["or", "that"] for this excerpt.
// With a longer corpus the array grows and repeats, e.g.
// ["or", "that", "or", "or", ...] (frequencies encoded as repetitions)
Storing frequencies as repeated values (e.g.
["the","the","the","a"]) rather than count maps makes
sampling trivially simple: just pick a random index. Memory usage is
slightly higher but negligible for typical corpora.
2. Generate text by sampling
/**
 * Generate text by repeatedly sampling a follower of the current state.
 *
 * @param {Map<string, string[]>} model - State → followers map, keyed by
 *   `order` tokens joined with single spaces.
 * @param {number} order - Number of tokens per state; should match the order
 *   the model was built with.
 * @param {number} [maxWords=100] - Upper bound on the number of tokens in the
 *   result, including the starting state or seed.
 * @param {?string} [seed=null] - Optional starting phrase. Only its last
 *   `order` words are used as the first lookup key, so longer seeds work.
 *   It must be spelled the way the model's keys are (assumes lowercase keys
 *   when the model comes from a lowercasing tokenizer — confirm for yours).
 * @returns {string} Generated tokens joined by spaces; '' when the model is
 *   empty and no seed was given.
 */
function generate(model, order, maxWords = 100, seed = null) {
  let words;
  if (seed == null) {
    // No random key can be drawn from an empty model.
    if (model.size === 0) return '';
    const keys = [...model.keys()];
    words = keys[Math.floor(Math.random() * keys.length)].split(' ');
  } else {
    words = seed.trim().split(/\s+/);
  }

  // The lookup key is always the trailing window of `order` words.
  let current = words.slice(-order).join(' ');

  // Count the starting words toward the limit so the result never exceeds it.
  while (words.length < maxWords) {
    const candidates = model.get(current);
    if (!candidates || candidates.length === 0) break; // dead end
    words.push(candidates[Math.floor(Math.random() * candidates.length)]);
    current = words.slice(-order).join(' ');
  }

  return words.slice(0, maxWords).join(' ');
}

console.log(generate(model, 2, 80));
// → "to be that is the question whether tis nobler in the mind to
//    suffer the slings and arrows of outrageous fortune whether ..."
3. Temperature sampling
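Raw frequency sampling picks each continuation in proportion to how often it followed the current state, so the most common continuations dominate. Temperature reshapes that distribution: values below 1 sharpen it (more predictable), values above 1 flatten it (more random), and exactly 1 leaves it unchanged: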
/**
 * Pick one candidate, with temperature-controlled randomness.
 *
 * Each distinct candidate's probability is its relative frequency raised to
 * the power 1/temperature and then renormalised.
 *
 * @param {string[]} candidates - Observed followers; repeats encode frequency.
 * @param {number} [temperature=1.0] - 1 samples by raw frequency, values
 *   below 1 favour common candidates, values above 1 flatten the
 *   distribution. Zero or negative values select the most frequent candidate
 *   deterministically (first seen wins ties).
 * @returns {string|undefined} The chosen candidate, or undefined when
 *   `candidates` is empty.
 * @throws {TypeError} If `temperature` is NaN.
 */
function temperatureSample(candidates, temperature = 1.0) {
  if (Number.isNaN(temperature)) {
    throw new TypeError('temperature must be a number');
  }
  if (candidates.length === 0) return undefined;

  // Fast path: repetitions already encode the distribution.
  if (temperature === 1.0) {
    return candidates[Math.floor(Math.random() * candidates.length)];
  }

  // Count frequencies
  const freq = new Map();
  for (const w of candidates) freq.set(w, (freq.get(w) ?? 0) + 1);
  const words = [...freq.keys()];

  // Dividing by a non-positive temperature yields NaN/garbage weights, so
  // treat it as the limiting case: always take the most frequent candidate.
  if (temperature <= 0) {
    let best = words[0];
    for (const w of words) {
      if (freq.get(w) > freq.get(best)) best = w;
    }
    return best;
  }

  // Convert to log-probabilities and apply temperature.
  const logits = words.map(
    (w) => Math.log(freq.get(w) / candidates.length) / temperature
  );
  // Subtract the max before exponentiating to keep exp() in range.
  const maxLogit = Math.max(...logits);
  const weights = logits.map((l) => Math.exp(l - maxLogit));
  const total = weights.reduce((a, b) => a + b, 0);
  const probs = weights.map((p) => p / total);

  // Sample from the distribution by walking the cumulative sum.
  const r = Math.random();
  let cumulative = 0;
  for (let i = 0; i < words.length; i++) {
    cumulative += probs[i];
    if (r <= cumulative) return words[i];
  }
  // Rounding can leave the cumulative sum a hair below 1.
  return words[words.length - 1];
}
4. Start tokens and sentence boundaries
/**
 * Build an order-N model that knows where sentences begin and end.
 *
 * The text is split on runs of . ! ? and each sentence is tokenized
 * separately. Every sentence is padded with `order` '<START>' markers in
 * front and one '<END>' marker behind, so the model records which tokens
 * open a sentence and when a sentence stops.
 *
 * @param {string} text - Training corpus.
 * @param {number} [order=2] - Number of tokens per state.
 * @returns {Map<string, string[]>} State → followers map; followers may
 *   include '<END>'.
 */
function buildModelWithBoundaries(text, order = 2) {
  const model = new Map();
  const sentences = text
    .split(/[.!?]+/)
    .map((s) => s.trim())
    .filter(Boolean);

  for (const sentence of sentences) {
    const tokens = tokenize(sentence);
    // A fragment with no tokens (e.g. "42") would record '<END>' directly
    // after the start state and make the generator emit empty sentences.
    if (tokens.length === 0) continue;

    const padded = [...Array(order).fill('<START>'), ...tokens, '<END>'];
    for (let i = 0; i + order < padded.length; i++) {
      const key = padded.slice(i, i + order).join(' ');
      const next = padded[i + order];
      const followers = model.get(key);
      if (followers) {
        followers.push(next);
      } else {
        model.set(key, [next]);
      }
    }
  }
  return model;
}

/**
 * Generate one sentence: start from the '<START>' state and stop at '<END>'.
 *
 * @param {Map<string, string[]>} model - Model from buildModelWithBoundaries.
 * @param {number} order - Order the model was built with.
 * @param {number} [temperature=1.0] - Passed through to temperatureSample.
 * @param {number} [maxWords=200] - Safety cap on sentence length, in tokens.
 * @returns {string} Tokens joined by spaces, without the boundary markers;
 *   '' when the start state is not in the model.
 */
function generateSentence(model, order, temperature = 1.0, maxWords = 200) {
  // Sliding window of the last `order` tokens; it doubles as the lookup key.
  const state = Array(order).fill('<START>');
  const words = [];

  while (words.length < maxWords) {
    const candidates = model.get(state.join(' '));
    if (!candidates) break; // state never seen in training
    const next = temperatureSample(candidates, temperature);
    if (next === '<END>') break;
    words.push(next);
    // Advance the window by one token.
    state.push(next);
    state.shift();
  }
  return words.join(' ');
}
5. Build a browser UI
<!-- HTML skeleton -->
<textarea id="corpus" rows="8" placeholder="Paste training text…"></textarea>
<label>Order: <input id="order" type="range" min="1" max="5" value="2"> <span id="orderVal">2</span></label>
<label>Temperature: <input id="temp" type="range" min="1" max="20" value="10"> <span id="tempVal">1.0</span></label>
<button id="train">Train</button>
<button id="gen">Generate</button>
<div id="output"></div>
<script>
  // `model` is null until Train is clicked.
  let model = null;
  // Order currently selected on the slider; used for the next training run.
  let currentOrder = 2;
  // Order the existing model was built with. Generation must use this one:
  // if the slider moves after training, the start key for a different order
  // is not in the model and every sentence would come out empty.
  let trainedOrder = currentOrder;

  const outputEl = document.getElementById('output');
  const tempEl = document.getElementById('temp');

  document.getElementById('order').addEventListener('input', (e) => {
    currentOrder = Number(e.target.value);
    document.getElementById('orderVal').textContent = currentOrder;
  });

  // The slider runs 1–20; divide by 10 to get a temperature of 0.1–2.0.
  tempEl.addEventListener('input', (e) => {
    document.getElementById('tempVal').textContent =
      (Number(e.target.value) / 10).toFixed(1);
  });

  document.getElementById('train').addEventListener('click', () => {
    const corpus = document.getElementById('corpus').value;
    model = buildModelWithBoundaries(corpus, currentOrder);
    trainedOrder = currentOrder;
    outputEl.textContent = `Trained. ${model.size} unique states.`;
  });

  document.getElementById('gen').addEventListener('click', () => {
    if (!model) return;
    const temp = Number(tempEl.value) / 10;
    const text = Array.from({ length: 5 }, () =>
      generateSentence(model, trainedOrder, temp)
    ).join(' ');
    // textContent (not innerHTML) keeps corpus-derived text from being parsed as markup.
    outputEl.textContent = text;
  });
</script>
Try order 1 (very random), order 3 (coherent paragraphs), and order 5
(nearly memorises the source). The sweet spot for interesting
generated text is usually order 2–3 with temperature 0.8–1.2.