Skip to content

Commit

Permalink
Merge pull request #35 from baraldian/patch-2
Browse files: browse the repository at this point in the history
Improved load_acs
  • Loading branch information
mrtzh committed May 26, 2023
2 parents 41d33f1 + 22f6dc1 commit 731b8d1
Showing 1 changed file with 9 additions and 32 deletions.
41 changes: 9 additions & 32 deletions folktables/load_acs.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -111,38 +111,15 @@ def load_acs(root_dir, states=None, year=2018, horizon='1-Year',
initialize_and_download(base_datadir, state, year, horizon, survey, download=download)
)

sample = io.StringIO()

first = True

dtypes = {'PINCP': np.float64, 'RT': str, 'SOCP': str, 'SERIALNO': str, 'NAICSP': str}
df_list = []
for file_name in file_names:

with open(file_name, 'r') as f:

if first:
sample.write(next(f))
first = False
else:
next(f)

if serial_filter_list is None:
for line in f:
if random.uniform(0, 1) < density:
# strip whitespace found in some early files
sample.write(line.replace(' ',''))
else:
for line in f:
serialno = line.split(',')[1]
if serialno in serial_filter_list:
# strip whitespace found in some early files
sample.write(line.replace(' ',''))


sample.seek(0)

dtypes = {'PINCP' : np.float64, 'RT' : str, 'SOCP' : str, 'SERIALNO' : str, 'NAICSP' : str}

return pd.read_csv(sample, dtype=dtypes)
df = pd.read_csv(file_name, dtype=dtypes).replace(' ','')
if serial_filter_list is not None:
df = df[df['SERIALNO'].isin(serial_filter_list)]
df_list.append(df)
all_df = pd.concat(df_list)
return all_df


def load_definitions(root_dir, year=2018, horizon='1-Year', download=False):
Expand Down Expand Up @@ -214,4 +191,4 @@ def generate_categories(features, definition_df):
del mapping_dict[-99999999999999.0]

categories[feature] = mapping_dict
return categories
return categories

0 comments on commit 731b8d1

Please sign in to comment.