| import os |
| import tqdm |
|
|
|
|
| |
# Marker appended after every source file so the concatenated corpus can be
# split back into individual files later.
_FILE_SEPARATOR = "\nprint('---FILESEP---')\n"


def _concat_files(input_path, filenames, output_path):
    """Write the contents of *filenames* (relative to *input_path*) to *output_path*.

    Each file's text is followed by ``_FILE_SEPARATOR``. An empty *filenames*
    sequence still creates (or truncates) *output_path*.
    """
    # Explicit UTF-8 on both sides: the platform default encoding may be unable
    # to encode characters that were just decoded, and would make the output
    # differ from machine to machine.
    with open(output_path, "w", encoding="utf-8") as outfile:
        for fname in tqdm.tqdm(filenames):
            source = os.path.join(input_path, fname)
            # errors="ignore" is deliberate best-effort: undecodable bytes are
            # dropped instead of aborting the whole run.
            with open(source, encoding="utf-8", errors="ignore") as infile:
                outfile.write(infile.read())
            outfile.write(_FILE_SEPARATOR)


def split_files(input_path, output_path1, output_path2, split_ratio=0.8):
    """Concatenate the ``.py`` files of a directory into two output files.

    The regular files directly inside *input_path* whose names end in ``.py``
    are sorted by name. The first ``int(len(files) * split_ratio)`` of them are
    concatenated into *output_path1* and the remainder into *output_path2*.
    Every file's content is followed by a ``print('---FILESEP---')`` line.

    Args:
        input_path: Directory to scan (not recursive).
        output_path1: File that receives the leading share of the files.
        output_path2: File that receives the remaining files.
        split_ratio: Fraction of the files, between 0 and 1 inclusive, that
            goes to *output_path1*. Defaults to 0.8.

    Raises:
        ValueError: If *split_ratio* is outside ``[0, 1]``.
        OSError: If *input_path* cannot be listed or a file cannot be opened.
    """
    if not 0.0 <= split_ratio <= 1.0:
        raise ValueError(f"split_ratio must be within [0, 1], got {split_ratio!r}")

    # Check the suffix first so the stat call is only made for candidates.
    # Sorting makes the split reproducible, since os.listdir order is arbitrary.
    files = sorted(
        name
        for name in os.listdir(input_path)
        if name.endswith(".py") and os.path.isfile(os.path.join(input_path, name))
    )

    split_index = int(len(files) * split_ratio)
    _concat_files(input_path, files[:split_index], output_path1)
    _concat_files(input_path, files[split_index:], output_path2)
|
|
|
|
| |
# Locations of the corpus directory and the two concatenated output files,
# all under the current user's home directory.
input_path = os.path.expanduser("~/torch_datasets/github-python/all_trains")
output_path1 = os.path.expanduser("~/torch_datasets/github-python/80")
output_path2 = os.path.expanduser("~/torch_datasets/github-python/20")

if __name__ == "__main__":
    # Run the split only when executed as a script, so that importing this
    # module does not rewrite the output files as a side effect.
    split_files(input_path, output_path1, output_path2)
|
|