I have CSV files with multiple columns of data retrieved from APIs, where each cell may contain either a single value or a list/array. The list size is consistent across columns within a file (e.g., a column named ALPHANUMS with a cell like "['A', 'B', '4']" has the same list size as a column named COLOR with a cell like "['red', 'blue', 'green']"), but the list sizes can vary per CSV file depending on the API response. I would like to use pandas to create separate CSV files for each element in a list column, while retaining the rest of the data in each file.
Here's an example of what the data might look like from this mockup function:
import random
import csv

# Predefined lists for NAME, CARS, and PHONE OS
NAMES = ["John Doe", "Jane Smith", "Alice Johnson", "Bob Brown", "Charlie Davis", "Eve White", "David Wilson", "Emma Taylor", "Frank Harris", "Grace Clark"]
CAR_BRANDS = ["Toyota", "Ford", "BMW", "Tesla", "Honda", "Chevrolet", "Nissan", "Audi"]
PHONE_OS = ["Android", "iOS"]

def create_csv(file_name, num_records):
    cur_random_list_size = random.randint(1, min(len(NAMES), len(CAR_BRANDS)))
    with open(file_name, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["ID", "NAME", "MONTH", "CARS", "PHONE OS"])
        for i in range(num_records):
            record = {
                "id": i + 1,
                "name": [NAMES[n] for n in range(cur_random_list_size)],
                "month": random.randint(1, 12),
                "cars": [random.choice(CAR_BRANDS) for _ in range(cur_random_list_size)],
                "phone": random.choice(PHONE_OS)
            }
            writer.writerow(record.values())
    print(f"CSV file '{file_name}' created with {num_records} records.")

create_csv("people_data.csv", 5)
ID | NAME | MONTH | CARS | PHONE OS |
---|---|---|---|---|
1 | "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" | 2 | "['Toyota', 'Nissan', 'Nissan', 'Nissan', 'Audi', 'Honda']" | iOS |
2 | "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" | 4 | "['Nissan', 'Ford', 'Honda', 'Toyota', 'Ford', 'Honda']" | iOS |
3 | "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" | 8 | "['BMW', 'Honda', 'Tesla', 'Tesla', 'Tesla', 'Nissan']" | Android |
4 | "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" | 3 | "['Tesla', 'Audi', 'Chevrolet', 'Audi', 'Chevrolet', 'BMW']" | iOS |
5 | "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" | 8 | "['Ford', 'Tesla', 'BMW', 'Toyota', 'Nissan', 'Ford']" | Android |
And ideally, I'd like to separate this into individual CSV files, one per name. For example, john_doe_people_data.csv would contain:
ID | NAME | MONTH | CARS | PHONE OS |
---|---|---|---|---|
1 | John Doe | 2 | Toyota | iOS |
2 | John Doe | 4 | Nissan | iOS |
3 | John Doe | 8 | BMW | Android |
4 | John Doe | 3 | Tesla | iOS |
5 | John Doe | 8 | Ford | Android |
All in all, how can I use pandas to create separate CSV files for each element in a list column, while keeping the rest of the data in each file?
- These lists - they are encoded as strings in the CSV. What format are they? JSON, or perhaps python (e.g., use ast.literal_eval)? – tdelaney Commented Nov 18, 2024 at 23:51
- I think this would be easier with the CSV module. Create a dict mapping name to an empty table (defaultdict works for this). Iterate each row, break open the list and put the row (after some tidying) in that mapped list. Now each of those lists can be saved with csv.writer. So, are you absolutely wed to pandas? (see the sketch after these comments) – tdelaney Commented Nov 18, 2024 at 23:54
- @tdelaney yes they are encoded as strings – kyrlon Commented Nov 19, 2024 at 2:00
- That they are strings is normal for a CSV. It's a question of how they are encoded. How do we convert that string into a list? I gave two common options. JSON makes sense, but your example isn't JSON. Python makes less sense (a python only solution in a CSV is limiting) but does decode your example. – tdelaney Commented Nov 19, 2024 at 2:31
- @kyrlon Please check my answer below that completes the task with a few lines of code. – LMC Commented Nov 19, 2024 at 20:54
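A rough sketch of the csv-module idea from the comments above, assuming the list cells are Python-style literals that ast.literal_eval can parse; the column names and the output file-name pattern are taken from the question's example:

import ast
import csv
from collections import defaultdict

# map each name to the rows that belong in its output file
tables = defaultdict(list)
with open("people_data.csv", newline="") as f:
    reader = csv.DictReader(f)
    header = reader.fieldnames
    for row in reader:
        names = ast.literal_eval(row["NAME"])  # "['John Doe', ...]" -> list
        cars = ast.literal_eval(row["CARS"])
        for name, car in zip(names, cars):     # lists are the same size per row
            tables[name].append({**row, "NAME": name, "CARS": car})

for name, rows in tables.items():
    out_name = f"{name.lower().replace(' ', '_')}_people_data.csv"
    with open(out_name, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=header)
        writer.writeheader()
        writer.writerows(rows)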
5 Answers
Another possible solution, whose steps are (where df is the dataframe resulting from reading the csv file):

- Each column's string values are cleaned by removing the square brackets and quotes with replace, and then split into lists with split.
- The resulting dataframe is then expanded with explode, transforming each element of the list-like columns into separate rows.
- Finally, the dataframe is grouped by the NAME column using groupby, and a list comprehension is employed to generate a list of dataframes, each corresponding to a unique NAME.
[g for _, g in df.assign(**{
    x: df[x].str.replace(r"[\[\]']", '', regex=True).str.split(', ')
    for x in ['NAME', 'CARS']})
 .explode(['NAME', 'CARS']).groupby('NAME')]
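If each group should end up in its own file, the same pipeline can feed a loop that calls to_csv directly; a minimal sketch, where the cleaned and exploded frame is first assigned to a variable (here out) and the output file-name pattern is assumed from the question's example:

out = df.assign(**{
    x: df[x].str.replace(r"[\[\]']", '', regex=True).str.split(', ')
    for x in ['NAME', 'CARS']}).explode(['NAME', 'CARS'])
for name, g in out.groupby('NAME'):
    # e.g. john_doe_people_data.csv
    g.to_csv(f"{name.lower().replace(' ', '_')}_people_data.csv", index=False)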
Columns containing lists as strings can easily be parsed as JSON with a custom converter when reading the CSV. Then a double explode() would complete the task:
# import csv
import pandas as pd
from io import StringIO

def CustomParser(data):
    # parse the Python-style list string as JSON by swapping single for double quotes
    j1 = pd.read_json(StringIO(data.replace("'", '"')))
    return j1[0].values.tolist()

f1 = "/home/lmc/tmp/people_data.csv"
# create_csv(f1, 5)
df = pd.read_csv(f1, converters={'NAME': CustomParser, 'CARS': CustomParser}, header=0)
df = df.explode('NAME').explode('CARS')
df_doe = df[df['NAME'] == 'John Doe']
print(df_doe)
Result for first item in NAMES
ID NAME MONTH CARS PHONE OS
0 1 John Doe 10 Honda Android
0 1 John Doe 10 Toyota Android
1 2 John Doe 9 Audi iOS
1 2 John Doe 9 Honda iOS
2 3 John Doe 12 Audi iOS
2 3 John Doe 12 Honda iOS
3 4 John Doe 10 Ford iOS
3 4 John Doe 10 BMW iOS
4 5 John Doe 5 Chevrolet iOS
4 5 John Doe 5 Nissan iOS
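To write one file per name rather than filtering a single one, the same exploded frame can be grouped by NAME; a short sketch, with the output file-name pattern assumed from the question's example:

for name, group in df.groupby('NAME'):
    group.to_csv(f"{name.lower().replace(' ', '_')}_people_data.csv", index=False)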
I believe the following gives what you are looking for:
import pandas as pd
import ast

def explode_list_columns(df, list_columns):
    """
    Input:
        df: pandas DataFrame
        list_columns: list of columns in the DataFrame that contain lists
    Output:
        df: pandas DataFrame with each element of the lists in the specified columns separated into different rows
    """
    # parse the string representations of lists into actual lists
    for col in list_columns:
        df[col] = df[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    # explode all list columns together so the elements stay position-matched
    df = df.explode(list_columns, ignore_index=True)
    return df
Then you just need to iterate over each unique name, select its rows, and write them to a CSV file, for example:
file_name = "people_data.csv"
df = pd.read_csv(file_name)
list_columns = ["NAME", "CARS"]
df_exploded = explode_list_columns(df, list_columns)

for name in df_exploded["NAME"].unique():
    name_df = df_exploded[df_exploded["NAME"] == name]
    output_file_name = f"{name.lower().replace(' ', '_')}_people_data.csv"
    name_df.to_csv(output_file_name, index=False)
# df = pd.read_csv('people_data.csv')
# df
# above df is reading the file created from your function

people = eval(df.iloc[0, 1])  # parse the NAME list string from the first row
len_people = len(people)
print(len_people)

for i in range(len_people):
    df1 = pd.DataFrame()
    df1[['ID', 'MONTH', 'PHONE OS']] = df[['ID', 'MONTH', 'PHONE OS']]
    # print(people[i])
    df1['NAME'] = people[i]
    # print(df['CARS'].apply(lambda x: eval(x)[i]))
    df1['CARS'] = df['CARS'].apply(lambda x: eval(x)[i])
    print(df1)
    # write your csv file here
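    # one possible way to fill in the placeholder above; the file-name pattern
    # (e.g. john_doe_people_data.csv) is assumed from the question's example
    df1.to_csv(f"{people[i].lower().replace(' ', '_')}_people_data.csv", index=False)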
I ended up using a combination of explode, map, and ast.literal_eval to break out the columns with string lists into different CSV files. Instead of hard-coding column names like NAME or CARS, the program now dynamically checks which columns contain string representations of lists. This is done by iterating over all columns, using the map_check_if_list_literal function to identify list-like columns, and later converting them to literals with map_convert_list_literal applied element-wise:
import random
import csv
import pandas as pd
import ast

# Predefined lists for NAME, CARS, and PHONE OS
NAMES = ["John Doe", "Jane Smith", "Alice Johnson", "Bob Brown", "Charlie Davis", "Eve White", "David Wilson", "Emma Taylor", "Frank Harris", "Grace Clark"]
CAR_BRANDS = ["Toyota", "Ford", "BMW", "Tesla", "Honda", "Chevrolet", "Nissan", "Audi"]
PHONE_OS = ["Android", "iOS"]

def create_csv(file_name, num_records):
    cur_random_list_size = random.randint(1, min(len(NAMES), len(CAR_BRANDS)))
    with open(file_name, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["ID", "NAME", "MONTH", "CARS", "PHONE OS"])
        for i in range(num_records):
            record = {
                "id": i + 1,
                "name": [NAMES[n] for n in range(cur_random_list_size)],
                "month": random.randint(1, 12),
                "cars": [random.choice(CAR_BRANDS) for _ in range(cur_random_list_size)],
                "phone": random.choice(PHONE_OS)
            }
            writer.writerow(record.values())
    print(f"CSV file '{file_name}' created with {num_records} records.")

def map_check_if_list_literal(element):
    # True if the element is a string that parses to a Python list
    if isinstance(element, str):
        try:
            data = ast.literal_eval(element)
            if isinstance(data, list):
                return True
            else:
                return False
        except Exception:
            return False
    else:
        return False

def map_convert_list_literal(element):
    # convert a string representation of a list into an actual list; otherwise return the element unchanged
    if isinstance(element, str):
        try:
            data = ast.literal_eval(element)
            if isinstance(data, list):
                return data
            else:
                return element
        except Exception:
            return element
    else:
        return element

if __name__ == "__main__":
    create_csv("people_data.csv", 5)
    file_name = "people_data.csv"
    df = pd.read_csv(file_name)

    # find the columns whose cells are string representations of lists
    temp_df = df.map(map_check_if_list_literal)
    columns_w_list = []
    for c in temp_df.columns:
        if temp_df[c].any():
            columns_w_list.append(c)

    # convert those columns to real lists and explode them together
    new_df = df.map(map_convert_list_literal)
    new_df = new_df.explode(columns_w_list)

    # this is the column of interest
    reference_column = ast.literal_eval(df["NAME"].mode()[0])
    for name in reference_column:
        mask = new_df["NAME"] == name
        unique_df = new_df[mask]
        unique_df.to_csv(f"{name.lower().replace(' ', '_')}_{file_name}", index=False)