diff --git a/data/datasets.dvc b/data/datasets.dvc
index 66359ef4..6d5b7485 100644
--- a/data/datasets.dvc
+++ b/data/datasets.dvc
@@ -1,6 +1,6 @@
 outs:
-- md5: b073343b2883fe4537cbdb98bd447cc2.dir
-  size: 728780256
-  nfiles: 61
+- md5: b45a782128aebf6786df801a75fbd46d.dir
+  size: 734199009
+  nfiles: 62
   path: datasets
   hash: md5
diff --git a/data/raw.dvc b/data/raw.dvc
index bc5cdb43..f13f5629 100644
--- a/data/raw.dvc
+++ b/data/raw.dvc
@@ -1,6 +1,6 @@
 outs:
-- md5: 7ba2a5b698999a9bfa635252b5090ff7.dir
-  size: 446381456
-  nfiles: 406
+- md5: 0916e33f6eef6c80a87e319427005f5e.dir
+  size: 446720790
+  nfiles: 408
   path: raw
   hash: md5
diff --git a/data/report.txt b/data/report.txt
index dcb2ab64..46f3000b 100644
--- a/data/report.txt
+++ b/data/report.txt
@@ -512,3 +512,12 @@ eo_data_export_failed 227
 ✔ training amount: 293, positive class: 16.4%
 ✔ validation amount: 235, positive class: 18.3%
 ✔ testing amount: 245, positive class: 16.7%
+
+
+
+Uganda_NorthCEO2017 (Timesteps: 24)
+----------------------------------------------------------------------------
+eo_data_complete 1000
+✔ training amount: 387, positive class: 1.3%
+✔ validation amount: 294, positive class: 1.0%
+✔ testing amount: 319, positive class: 1.3%
diff --git a/datasets.py b/datasets.py
index 7e690524..e6d5282e 100644
--- a/datasets.py
+++ b/datasets.py
@@ -508,6 +508,38 @@ def load_labels(self) -> pd.DataFrame:
         return df
 
 
+class Uganda_NorthCEO2017(LabeledDataset):
+    """Labels loaded from the two CSV exports under raw_dir / "Uganda_North_2017"."""
+
+    def load_labels(self) -> pd.DataFrame:
+        """Return one aggregated label row per (LON, LAT) pair.
+
+        Samples without an answer are dropped, an answer of "Crop" is scored 1
+        and any other answer 0, and rows that share coordinates are merged.
+        """
+        question = "Does this pixel contain active cropland?"
+        csv_names = (
+            "ceo-UNHCR-North-Uganda-Feb-2017---Feb-2018-(Set-1)-sample-data-2024-08-29.csv",
+            "ceo-UNHCR-North-Uganda-Feb-2017---Feb-2018-(Set-2)-sample-data-2024-08-29.csv",
+        )
+        source_dir = raw_dir / "Uganda_North_2017"
+        labels = pd.concat([pd.read_csv(source_dir / name) for name in csv_names])
+
+        # Discard rows with no label
+        labels = labels[labels[question].notna()].copy()
+        labels[CLASS_PROB] = (labels[question] == "Crop").astype(int)
+        labels["num_labelers"] = 1
+        # Mean of the 0/1 scores per location; ids and emails go through join_unique.
+        aggregations = {CLASS_PROB: "mean", "num_labelers": "sum"}
+        for id_column in ("plotid", "sampleid", "email"):
+            aggregations[id_column] = join_unique
+        labels = labels.groupby([LON, LAT], as_index=False, sort=False).agg(aggregations)
+        labels[START] = date(2017, 1, 1)
+        labels[END] = date(2018, 12, 31)
+        labels[SUBSET] = train_val_test_split(labels.index, 0.3, 0.3)
+        return labels
+
+
 class Uganda_NorthCEO2021(LabeledDataset):
     def load_labels(self) -> pd.DataFrame:
         raw_folder = raw_dir / "Uganda_North_2021"
@@ -1538,6 +1570,7 @@ def load_labels(self) -> pd.DataFrame:
     TanzaniaCropArea2019(),
     FranceCropArea2020(),
     Uganda_NorthCEO2016(),
+    Uganda_NorthCEO2017(),
 ]
 
 if __name__ == "__main__":