{"payload":{"feedbackUrl":"https://github.com/orgs/community/discussions/53140","repo":{"id":653623369,"defaultBranch":"main","name":"datatrove","ownerLogin":"huggingface","currentUserCanPush":false,"isFork":false,"isEmpty":false,"createdAt":"2023-06-14T12:05:28.000Z","ownerAvatar":"https://github.com/avatars/u/25720743?v=4","public":true,"private":false,"isOrgOwned":true},"refInfo":{"name":"","listCacheKey":"v0:1721475039.0","currentOid":""},"activityList":{"items":[{"before":"734990228d305bdd38c2c3bab4e697d988c9ae68","after":"b5443d2b8ef473262bc97b3d7717a217b6eaf1f3","ref":"refs/heads/main","pushedAt":"2024-07-22T09:17:54.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"},"commit":{"message":"fix correct type inference for cache filesystems (#257)","shortMessageHtmlLink":"fix correct type inference for cache filesystems (#257)"}},{"before":null,"after":"5f2a316498a22cdeb70dc453a934091c754ee522","ref":"refs/heads/filecache_handling","pushedAt":"2024-07-20T11:30:39.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"hynky1999","name":"Hynek Kydlíček","path":"/hynky1999","primaryAvatarUrl":"https://github.com/avatars/u/39408646?s=80&v=4"},"commit":{"message":"fix correct type inference for cache filesystems","shortMessageHtmlLink":"fix correct type inference for cache filesystems"}},{"before":"452e69ab17c8e5f25243e0b89a8c6c385e4c9a5a","after":"734990228d305bdd38c2c3bab4e697d988c9ae68","ref":"refs/heads/main","pushedAt":"2024-07-17T12:03:19.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"},"commit":{"message":"not -> is None","shortMessageHtmlLink":"not -> is None"}},{"before":"71f94cfcea63d3765e6c85eeb1d62d739260bd8f","after":null,"ref":"refs/heads/hist_token_counts","pushedAt":"2024-07-15T15:47:57.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"}},{"before":"c279f269a306958211bb576d06c25dbde440aabc","after":"452e69ab17c8e5f25243e0b89a8c6c385e4c9a5a","ref":"refs/heads/main","pushedAt":"2024-07-15T15:47:53.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"},"commit":{"message":"Add token and char count to histogram stats (#251)\n\n* add token and char count to histogram stats\r\n\r\n* fixes for merger\r\n\r\n* revert group change","shortMessageHtmlLink":"Add token and char count to histogram stats (#251)"}},{"before":"1687ba3db5529e564f28bd41421542f996f37f2f","after":"71f94cfcea63d3765e6c85eeb1d62d739260bd8f","ref":"refs/heads/hist_token_counts","pushedAt":"2024-07-15T15:46:36.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"},"commit":{"message":"revert group change","shortMessageHtmlLink":"revert group change"}},{"before":"359d9fa6de7d703ff3434d6d7dbba28888b7bbf0","after":"1687ba3db5529e564f28bd41421542f996f37f2f","ref":"refs/heads/hist_token_counts","pushedAt":"2024-07-15T15:41:49.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"},"commit":{"message":"fixes for merger","shortMessageHtmlLink":"fixes for merger"}},{"before":null,"after":"359d9fa6de7d703ff3434d6d7dbba28888b7bbf0","ref":"refs/heads/hist_token_counts","pushedAt":"2024-07-15T15:32:50.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"},"commit":{"message":"add token and char count to histogram stats","shortMessageHtmlLink":"add token and char count to histogram stats"}},{"before":"0814338c24824f6b3ca9780fc6dc7af1612932b8","after":"84dd1263ec408ce3dfe33b0cb01beefe25075c4b","ref":"refs/heads/slurm_nodes","pushedAt":"2024-07-12T00:13:57.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"},"commit":{"message":"do not kill everything when a single task fails","shortMessageHtmlLink":"do not kill everything when a single task fails"}},{"before":"bc19878e28fd024ed69b6da1b21284eff9943d0d","after":"0814338c24824f6b3ca9780fc6dc7af1612932b8","ref":"refs/heads/slurm_nodes","pushedAt":"2024-07-10T22:37:23.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"},"commit":{"message":"fix merge stats","shortMessageHtmlLink":"fix merge stats"}},{"before":"61420733313601cbfc81f9a203b139f0abd82fdd","after":"bc19878e28fd024ed69b6da1b21284eff9943d0d","ref":"refs/heads/slurm_nodes","pushedAt":"2024-07-10T22:31:13.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"},"commit":{"message":"bugfixes","shortMessageHtmlLink":"bugfixes"}},{"before":null,"after":"61420733313601cbfc81f9a203b139f0abd82fdd","ref":"refs/heads/slurm_nodes","pushedAt":"2024-07-10T16:37:10.000Z","pushType":"branch_creation","commitsCount":0,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"},"commit":{"message":"push slurm nodes executor","shortMessageHtmlLink":"push slurm nodes executor"}},{"before":"aa43e3fb6c94fdf2e669534b5e12244c051497ec","after":"c279f269a306958211bb576d06c25dbde440aabc","ref":"refs/heads/main","pushedAt":"2024-07-09T10:18:51.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"},"commit":{"message":"Add withdirs to extra_options only when not using glob_pattern (#244)\n\nCo-authored-by: Olga Bakkari ","shortMessageHtmlLink":"Add withdirs to extra_options only when not using glob_pattern (#244)"}},{"before":"8391d120402e6ba5e0b14c53c128af48be3aabc0","after":"aa43e3fb6c94fdf2e669534b5e12244c051497ec","ref":"refs/heads/main","pushedAt":"2024-07-08T16:14:40.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"},"commit":{"message":"fix shard check","shortMessageHtmlLink":"fix shard check"}},{"before":"e01bd0a82558226cd9296c5ab4fa4777b7f45e88","after":"8391d120402e6ba5e0b14c53c128af48be3aabc0","ref":"refs/heads/main","pushedAt":"2024-07-08T11:18:15.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"},"commit":{"message":"fix linter","shortMessageHtmlLink":"fix linter"}},{"before":"55a9072e75323f7994e8f82bb761ccbd7543aee1","after":"e01bd0a82558226cd9296c5ab4fa4777b7f45e88","ref":"refs/heads/main","pushedAt":"2024-07-08T10:25:57.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"},"commit":{"message":"fix split_dataset_by_node on HuggingFaceReader","shortMessageHtmlLink":"fix split_dataset_by_node on HuggingFaceReader"}},{"before":"af63762ce338e511baa1ba1e0eb936d590f75bfd","after":"55a9072e75323f7994e8f82bb761ccbd7543aee1","ref":"refs/heads/main","pushedAt":"2024-07-08T10:09:35.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"},"commit":{"message":"add dependencies lid.py, io.py #239 (#241)\n\n* add dependencies lid.py, io.py #239\r\n\r\n* Update src/datatrove/utils/lid.py\r\n\r\nCo-authored-by: Guilherme Penedo \r\n\r\n---------\r\n\r\nCo-authored-by: Guilherme Penedo ","shortMessageHtmlLink":"add dependencies lid.py, io.py #239 (#241)"}},{"before":"061d4db7f26336bd6223af72f282e30db4916121","after":"af63762ce338e511baa1ba1e0eb936d590f75bfd","ref":"refs/heads/main","pushedAt":"2024-07-08T09:42:32.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"},"commit":{"message":"index file read fix (#229)\n\nFix `.ds.index` file read in shuffler code. Also fix the logging where `path` field is accessed from string.","shortMessageHtmlLink":"index file read fix (#229)"}},{"before":"7ba873fc87086098657e488e7365f8c14aeb4d06","after":"061d4db7f26336bd6223af72f282e30db4916121","ref":"refs/heads/main","pushedAt":"2024-07-05T15:40:49.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"},"commit":{"message":"option to keep more language scores","shortMessageHtmlLink":"option to keep more language scores"}},{"before":"898efc0fc6ee2050f8ef78f7236cace2b26f2824","after":"7ba873fc87086098657e488e7365f8c14aeb4d06","ref":"refs/heads/main","pushedAt":"2024-07-05T10:33:55.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"},"commit":{"message":"add batching to filters","shortMessageHtmlLink":"add batching to filters"}},{"before":"b045794b5f84ad4e351d5b321b6827e147b3da35","after":"898efc0fc6ee2050f8ef78f7236cace2b26f2824","ref":"refs/heads/main","pushedAt":"2024-07-03T23:25:51.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"},"commit":{"message":"nit","shortMessageHtmlLink":"nit"}},{"before":"1e27cc8819465d5246d89cd929423b76eb0bc5dd","after":"b045794b5f84ad4e351d5b321b6827e147b3da35","ref":"refs/heads/main","pushedAt":"2024-07-03T23:24:11.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"},"commit":{"message":"add tasks total to jobs_status","shortMessageHtmlLink":"add tasks total to jobs_status"}},{"before":"1cece66bcefb43ca7e9ce3b4a44ae2cf7600e942","after":"1e27cc8819465d5246d89cd929423b76eb0bc5dd","ref":"refs/heads/main","pushedAt":"2024-06-28T12:30:11.000Z","pushType":"pr_merge","commitsCount":2,"pusher":{"login":"hynky1999","name":"Hynek Kydlíček","path":"/hynky1999","primaryAvatarUrl":"https://github.com/avatars/u/39408646?s=80&v=4"},"commit":{"message":"Merge pull request #232 from QasidSaleem/optimize_num_sentences_computation\n\nchecks if min_num_sentences is disabled or not before computing the n…","shortMessageHtmlLink":"Merge pull request #232 from QasidSaleem/optimize_num_sentences_compu…"}},{"before":"f0ad3efc5e20e2920cf61f73400fda74d54d0d11","after":null,"ref":"refs/heads/add_filter_example","pushedAt":"2024-06-25T15:21:58.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"}},{"before":"01a017b1bd03a6cdc59c4515545e80788de4e03d","after":"1cece66bcefb43ca7e9ce3b4a44ae2cf7600e942","ref":"refs/heads/main","pushedAt":"2024-06-25T15:21:53.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"},"commit":{"message":"Add an example for filtering an HF dataset and push to hub (#201)\n\n* Create filter_hf_dataset.py\n\n* fix linting and add comment\n\n* Update filter_hf_dataset.py","shortMessageHtmlLink":"Add an example for filtering an HF dataset and push to hub (#201)"}},{"before":"08f07ed78a9bf0925e64abd7d6dd00bba09e90b8","after":"f0ad3efc5e20e2920cf61f73400fda74d54d0d11","ref":"refs/heads/add_filter_example","pushedAt":"2024-06-25T15:15:21.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"loubnabnl","name":"Loubna Ben Allal","path":"/loubnabnl","primaryAvatarUrl":"https://github.com/avatars/u/44069155?s=80&v=4"},"commit":{"message":"Update filter_hf_dataset.py","shortMessageHtmlLink":"Update filter_hf_dataset.py"}},{"before":"240f65988d2970fed82cbe835bc533729a28d50d","after":"08f07ed78a9bf0925e64abd7d6dd00bba09e90b8","ref":"refs/heads/add_filter_example","pushedAt":"2024-06-25T15:10:20.000Z","pushType":"push","commitsCount":1,"pusher":{"login":"loubnabnl","name":"Loubna Ben Allal","path":"/loubnabnl","primaryAvatarUrl":"https://github.com/avatars/u/44069155?s=80&v=4"},"commit":{"message":"fix linting and add comment","shortMessageHtmlLink":"fix linting and add comment"}},{"before":"18f5f9863a188ea5096f8500fb7c9397aabe833e","after":null,"ref":"refs/heads/paths_file","pushedAt":"2024-06-19T15:35:10.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"}},{"before":"4fa87f78826827aa5b82a9f4e043b7c96f40a312","after":"01a017b1bd03a6cdc59c4515545e80788de4e03d","ref":"refs/heads/main","pushedAt":"2024-06-19T15:35:07.000Z","pushType":"pr_merge","commitsCount":1,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"},"commit":{"message":"Adds paths_file to readers (#228)\n\n* allow to pass a paths_file instead of always listing files in readers\r\n\r\n* Update src/datatrove/pipeline/readers/warc.py\r\n\r\nCo-authored-by: Hynek Kydlíček \r\n\r\n* Update src/datatrove/pipeline/readers/parquet.py\r\n\r\nCo-authored-by: Hynek Kydlíček \r\n\r\n* Update src/datatrove/pipeline/readers/jsonl.py\r\n\r\nCo-authored-by: Hynek Kydlíček \r\n\r\n* Update src/datatrove/pipeline/readers/ipc.py\r\n\r\nCo-authored-by: Hynek Kydlíček \r\n\r\n* Update src/datatrove/pipeline/readers/csv.py\r\n\r\nCo-authored-by: Hynek Kydlíček \r\n\r\n* Update src/datatrove/pipeline/readers/base.py\r\n\r\nCo-authored-by: Hynek Kydlíček \r\n\r\n* fix typo\r\n\r\n---------\r\n\r\nCo-authored-by: Hynek Kydlíček ","shortMessageHtmlLink":"Adds paths_file to readers (#228)"}},{"before":"437cdddaae0ad7f961cabea5c74600147e874b29","after":null,"ref":"refs/heads/glotlid","pushedAt":"2024-06-19T14:36:44.000Z","pushType":"branch_deletion","commitsCount":0,"pusher":{"login":"guipenedo","name":"Guilherme Penedo","path":"/guipenedo","primaryAvatarUrl":"https://github.com/avatars/u/3883401?s=80&v=4"}}],"hasNextPage":true,"hasPreviousPage":false,"activityType":"all","actor":null,"timePeriod":"all","sort":"DESC","perPage":30,"cursor":"djE6ks8AAAAEhZOCrwA","startCursor":null,"endCursor":null}},"title":"Activity · huggingface/datatrove"}