"""Process the WikiText dataset for training the BERT model. Using Hugging Face
datasets library.
"""
import time
import random
from typing import Iterator
import tokenizers
from datasets import load_dataset, Dataset
# Path and configuration name of each supported dataset, in the order they are
# passed to ``load_dataset``.
DATASETS = {
    "wikitext-2": ("wikitext", "wikitext-2-raw-v1"),
    "wikitext-103": ("wikitext", "wikitext-103-raw-v1"),
}
# Dataset selected for this run.
PATH, NAME = DATASETS["wikitext-103"]
# Tokenizer file loaded with ``tokenizers.Tokenizer.from_file``.
TOKENIZER_PATH = "wikitext-103_wordpiece.json"
def create_docs(path: str, title: str, tokenizer: tokenizers.Tokenizer) -> list[list[list[int]]]:
    """Load the WikiText training split and group its lines into tokenized documents.

    An empty line or a line starting with "=" opens a new document. Every
    other line is encoded with ``tokenizer`` and appended to the current
    document as one chunk of token ids.

    Args:
        path: Dataset path forwarded to ``load_dataset``.
        title: Dataset configuration name forwarded to ``load_dataset``.
        tokenizer: Tokenizer whose ``encode(line).ids`` provides the token ids.

    Returns:
        A list of non-empty documents. Each document is a list of chunks and
        each chunk is a list of token ids.
    """
    dataset = load_dataset(path, title, split="train")
    docs: list[list[list[int]]] = []
    for line in dataset["text"]:
        line = line.strip()
        if not line or line.startswith("="):
            docs.append([])  # new document encountered
            continue
        if not docs:
            # Text appeared before any separator line: open the first
            # document implicitly instead of failing on docs[-1].
            docs.append([])
        docs[-1].append(tokenizer.encode(line).ids)
    # Consecutive separator lines leave empty documents behind; drop them.
    return [doc for doc in docs if doc]
def create_dataset(
    docs: list[list[list[int]]],
    tokenizer: tokenizers.Tokenizer,
    max_seq_length: int = 512,
    doc_repeat: int = 10,
    mask_prob: float = 0.15,
    short_seq_prob: float = 0.1,
    max_predictions_per_seq: int = 20,
) -> Iterator[dict]:
    """Generate samples from all documents.

    Every document is visited ``doc_repeat`` times, one full pass over the
    documents after another. Sample generation is randomized, so each pass
    can produce different samples from the same document.

    Args:
        docs: Documents as lists of token-id chunks.
        tokenizer: Tokenizer forwarded to ``generate_samples``.
        max_seq_length: Length of each generated token sequence.
        doc_repeat: Number of passes over ``docs``.
        mask_prob: Forwarded to ``generate_samples``.
        short_seq_prob: Forwarded to ``generate_samples``.
        max_predictions_per_seq: Forwarded to ``generate_samples``.

    Yields:
        The sample dicts produced by ``generate_samples``.
    """
    # Nested loops visit indices in the same order as
    # ``list(range(len(docs))) * doc_repeat`` without building that list.
    for _ in range(doc_repeat):
        for doc_idx in range(len(docs)):
            yield from generate_samples(
                doc_idx,
                docs,
                tokenizer,
                max_seq_length,
                mask_prob,
                short_seq_prob,
                max_predictions_per_seq,
            )
def generate_samples(
    doc_idx: int,
    all_docs: list[list[list[int]]],
    tokenizer: tokenizers.Tokenizer,
    max_seq_length: int = 512,
    mask_prob: float = 0.15,
    short_seq_prob: float = 0.1,
    max_predictions_per_seq: int = 20,
) -> Iterator[dict]:
    """Generate samples from a given document.

    The chunks of ``all_docs[doc_idx]`` are consumed front to back. Each
    step gathers chunks up to a target token length, splits them at a random
    point into sentence A and sentence B, and with probability 0.5 (or
    always, when nothing is left for B) replaces B with text starting at a
    random chunk of a different document.

    Args:
        doc_idx: Index in ``all_docs`` of the document to sample from.
        all_docs: All documents, each a list of token-id chunks. At least
            two are required so that a different document can be drawn.
        tokenizer: Tokenizer forwarded to ``create_sample``.
        max_seq_length: Sequence length forwarded to ``create_sample``.
        mask_prob: Forwarded to ``create_sample``.
        short_seq_prob: Probability of using a randomly shortened target
            length for this document.
        max_predictions_per_seq: Forwarded to ``create_sample``.

    Yields:
        One sample dict per sentence pair, as built by ``create_sample``.

    Raises:
        ValueError: If ``all_docs`` holds fewer than two documents.
    """
    if len(all_docs) < 2:
        # Without a second document the random-B branch cannot pick a
        # document other than doc_idx; fail with a clear message.
        raise ValueError("generate_samples needs at least two documents to draw a random sentence B")

    # number of tokens to extract from this doc, excluding [CLS], [SEP], [SEP]
    target_length = max_seq_length - 3
    if random.random() < short_seq_prob:
        # a shorter sequence is used with probability short_seq_prob
        target_length = random.randint(2, target_length)

    # shallow copy so the caller's document is not consumed
    chunks = list(all_docs[doc_idx])

    # exhaust chunks and create samples
    while chunks:
        # scan until the target token length is reached
        running_length = 0
        end = 1
        while end < len(chunks) and running_length < target_length:
            running_length += len(chunks[end - 1])
            end += 1

        # randomly separate the scanned chunks into two segments
        sep = random.randint(1, end - 1) if end > 1 else 1
        sentence_a = [tok for chunk in chunks[:sep] for tok in chunk]
        sentence_b = [tok for chunk in chunks[sep:end] for tok in chunk]

        # sentence B may be taken from another document
        if not sentence_b or random.random() < 0.5:
            # pick another document (must not be the same as doc_idx)
            b_idx = random.randint(0, len(all_docs) - 2)
            if b_idx >= doc_idx:
                b_idx += 1
            other_doc = all_docs[b_idx]
            # sentence B starts from a random chunk of the other document
            sentence_b = []
            running_length = len(sentence_a)
            i = random.randint(0, len(other_doc) - 1)
            # Take at least one chunk even when sentence A already fills the
            # target length, so B is never empty; create_sample truncates
            # the pair to fit.
            while i < len(other_doc) and (not sentence_b or running_length < target_length):
                sentence_b.extend(other_doc[i])
                running_length += len(other_doc[i])
                i += 1
            is_random_next = True
            # chunks originally scanned for B were not used; keep them
            chunks = chunks[sep:]
        else:
            is_random_next = False
            chunks = chunks[end:]

        # create a sample from the pair
        yield create_sample(
            sentence_a,
            sentence_b,
            is_random_next,
            tokenizer,
            max_seq_length,
            mask_prob,
            max_predictions_per_seq,
        )
def create_sample(
    sentence_a: list[int],
    sentence_b: list[int],
    is_random_next: bool,
    tokenizer: tokenizers.Tokenizer,
    max_seq_length: int = 512,
    mask_prob: float = 0.15,
    max_predictions_per_seq: int = 20,
) -> dict:
    """Create a sample from a pair of sentences.

    The pair is truncated to fit, laid out as
    ``[CLS] A [SEP] B [SEP] [PAD]...`` with exactly ``max_seq_length``
    tokens, and a random subset of the non-special positions is selected as
    prediction targets.

    Args:
        sentence_a: Token ids of the first segment (not modified).
        sentence_b: Token ids of the second segment (not modified).
        is_random_next: Stored as-is in the returned sample.
        tokenizer: Provides the special-token ids, the pad id and the
            vocabulary size.
        max_seq_length: Total length of the output sequence.
        mask_prob: Fraction of sentence tokens selected for prediction.
        max_predictions_per_seq: Upper bound on selected positions.

    Returns:
        A dict with keys ``tokens``, ``segment_ids``, ``is_random_next``,
        ``masked_positions`` and ``masked_labels``.
    """
    # obtain ids of the special tokens
    cls_id = tokenizer.token_to_id("[CLS]")
    sep_id = tokenizer.token_to_id("[SEP]")
    mask_id = tokenizer.token_to_id("[MASK]")
    # NOTE(review): requires padding to be configured on the tokenizer;
    # presumably `padding` is None otherwise - confirm.
    pad_id = tokenizer.padding["pad_id"]

    # work on copies so the caller's lists are not truncated in place
    sentence_a = list(sentence_a)
    sentence_b = list(sentence_b)

    # adjust length to fit the max sequence length
    truncate_seq_pair(sentence_a, sentence_b, max_seq_length - 3)
    num_pad = max_seq_length - len(sentence_a) - len(sentence_b) - 3

    # create the unmodified token sequence
    tokens = [cls_id] + sentence_a + [sep_id] + sentence_b + [sep_id] + ([pad_id] * num_pad)
    # segment 0 covers [CLS] A [SEP], segment 1 covers B [SEP], -1 marks padding
    seg_id = [0] * (len(sentence_a) + 2) + [1] * (len(sentence_b) + 1) + [-1] * num_pad
    assert len(tokens) == len(seg_id) == max_seq_length

    # create the prediction targets from positions not holding a special token
    special_ids = {cls_id, sep_id, pad_id}
    cand_indices = [i for i, tok in enumerate(tokens) if tok not in special_ids]
    random.shuffle(cand_indices)
    num_predictions = int(round((len(sentence_a) + len(sentence_b)) * mask_prob))
    num_predictions = min(max_predictions_per_seq, max(1, num_predictions))
    mlm_positions = sorted(cand_indices[:num_predictions])

    # randomly mask the selected tokens
    mlm_labels = []
    for i in mlm_positions:
        mlm_labels.append(tokens[i])
        # prob 0.8 replace with [MASK], prob 0.1 replace with random word,
        # prob 0.1 keep original (the second draw splits the remaining 0.2)
        if random.random() < 0.8:
            tokens[i] = mask_id
        elif random.random() < 0.5:
            # NOTE(review): the lower bound 4 presumably skips special-token
            # ids 0-3 - confirm against the tokenizer's vocabulary.
            tokens[i] = random.randint(4, tokenizer.get_vocab_size() - 1)

    return {
        "tokens": tokens,
        "segment_ids": seg_id,
        "is_random_next": is_random_next,
        "masked_positions": mlm_positions,
        "masked_labels": mlm_labels,
    }
def truncate_seq_pair(sentence_a: list[int], sentence_b: list[int], max_num_tokens: int) -> None:
    """Truncate a pair of sequences in place until they fit a maximum total length.

    One token at a time is removed from whichever sentence is currently
    longer (``sentence_b`` on a tie), from its front or its back with equal
    probability, until the combined length is at most ``max_num_tokens``.

    Args:
        sentence_a: First token list; modified in place.
        sentence_b: Second token list; modified in place.
        max_num_tokens: Maximum combined length. Negative values are treated
            as 0, so both lists end up empty instead of raising.
    """
    limit = max(max_num_tokens, 0)
    # Track the surviving [lo, hi) window of each sentence and delete once at
    # the end; popping from the front of a list on every step is quadratic.
    lo_a, hi_a = 0, len(sentence_a)
    lo_b, hi_b = 0, len(sentence_b)
    while (hi_a - lo_a) + (hi_b - lo_b) > limit:
        # pick the longer sentence to remove a token from
        trim_a = (hi_a - lo_a) > (hi_b - lo_b)
        # remove one token from either end with equal probability
        drop_front = random.random() < 0.5
        if trim_a:
            if drop_front:
                lo_a += 1
            else:
                hi_a -= 1
        elif drop_front:
            lo_b += 1
        else:
            hi_b -= 1
    # delete the tail first so the front offsets stay valid
    del sentence_a[hi_a:]
    del sentence_a[:lo_a]
    del sentence_b[hi_b:]
    del sentence_b[:lo_b]
# Script entry point: tokenize the corpus, generate the samples, write them
# to a parquet file, then read the file back and print a few samples.
if __name__ == "__main__":
    print(time.time(), "began")
    tokenizer = tokenizers.Tokenizer.from_file(TOKENIZER_PATH)
    print(time.time(), "loaded tokenizer")
    docs = create_docs(PATH, NAME, tokenizer)
    print(time.time(), "created docs with %d documents" % len(docs))
    dataset = Dataset.from_generator(create_dataset, gen_kwargs={"docs": docs, "tokenizer": tokenizer})
    print(time.time(), "created dataset from generator")
    # Save dataset to parquet file
    dataset.to_parquet("wikitext-103_train_data.parquet")
    print(time.time(), "saved dataset to parquet file")
    # Load dataset from parquet file
    dataset = Dataset.from_parquet("wikitext-103_train_data.parquet", streaming=True)
    print(time.time(), "loaded dataset from parquet file")
    # Print a few samples (indices 0 through 3)
    for i, sample in enumerate(dataset):
        print(i)
        print(sample)
        print()
        if i >= 3:
            break
    print(time.time(), "completed")
















