wip gpt2

2023-01-11 06:34:35 +00:00 · 2023-01-11 06:34:35 +00:00 · 96a1c51d74
parent 290bb33538
commit 96a1c51d74
2 changed files with 34 additions and 6 deletions
--- a/generators/gpt2.py
+++ b/generators/gpt2.py
@ -0,0 +1,32 @@
 # Train a small GPT-2 model from scratch on a local text file with aitextgen.
 # This is a flat script: running it trains a tokenizer, builds a dataset and
 # then starts model training.
 import pytorch_lightning.utilities
 # HACK: force the TPU-availability flag off before aitextgen is imported.
 # Workaround until https://github.com/minimaxir/aitextgen/issues/200 is fixed.
 # NOTE(review): presumably aitextgen reads this attribute at import time, which
 # is why the assignment sits above the aitextgen imports -- confirm against the issue.
 pytorch_lightning.utilities._TPU_AVAILABLE = False
 from aitextgen.TokenDataset import TokenDataset
 from aitextgen.tokenizers import train_tokenizer
 from aitextgen.utils import GPT2ConfigCPU
 from aitextgen import aitextgen
 # Name of the plain-text file used for training. It is a relative path and
 # nothing in this script downloads it, so it is expected to already be present
 # in the working directory.
 file_name = "littlethief.txt"
 # Train a custom BPE Tokenizer on the training text
 # This will save one file: `aitextgen.tokenizer.json`, which contains the
 # information needed to rebuild the tokenizer.
 train_tokenizer(file_name)
 tokenizer_file = "aitextgen.tokenizer.json"
 # GPT2ConfigCPU is a mini variant of GPT-2 optimized for CPU-training
 # e.g. the # of input tokens here is 64 vs. 1024 for base GPT-2.
 config = GPT2ConfigCPU()
 # Instantiate aitextgen using the created tokenizer and config
 ai = aitextgen(tokenizer_file=tokenizer_file, config=config)
 # You can build datasets for training by creating TokenDatasets,
 # which automatically processes the dataset with the appropriate size.
 # block_size=64 matches the 64 input tokens noted above for GPT2ConfigCPU.
 data = TokenDataset(file_name, tokenizer_file=tokenizer_file, block_size=64)
 # Train the model! It will save pytorch_model.bin periodically and after completion to the `trained_model` folder.
 # Going by the argument names, training runs for 50000 steps, generating a
 # sample and saving a checkpoint every 5000 steps.
 # On a 2020 8-core iMac, this took ~25 minutes to run.
 # NOTE(review): the timing figure above was not necessarily measured on this
 # corpus -- treat it as a rough guide only.
 ai.train(data, batch_size=8, num_steps=50000, generate_every=5000, save_every=5000)
--- a/requirements/gpt2.txt
+++ b/requirements/gpt2.txt
@ -1,6 +1,2 @@
-fire ~= 0.1.3
+pytorch-lightning==1.7.7
-# this package claims perpetual backwards compatibility with the "re" module so we allow any version
+aitextgen
 regex
 requests ~= 2.0
 tqdm ~= 4.0
 toposort ~= 1.0