最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

python - On making a time series heat map - Stack Overflow

programmeradmin2浏览0评论

In relation to this post, I have acquired the raw disdrometer data, which includes multiple pairs of Drop Size (0.062 mm, 0.187 mm, etc.) and Counts of raindrops (32 bins). I would like to create a time-series heatmap of the disdrometer data in which the independent variables are Time and Drop Size, while the dependent variable is "log N(D)".

How can I efficiently create this heatmap time series in Python? I did try to do this with the initial code, but I didn't manage to reproduce the sample figure in the previous post, correctly (the visualization). is there a way to adjust the x and y limits, the x and y tickmarks/labels as well? (as they were quite incorrect).

To compute N(D), the formula we use is: N(D) = (Sum of Counts in each time stamp) / (0.54 × 300 × 0.125) — that is, the summed counts are divided by the whole product, as in the code below.

I did try my best to do this with the initial code, but I didn't manage to reproduce the sample figure in the previous post, correctly (the visualization). Like is there a way to adjust/correct the x and y limits? Also the tickmarks/labels in x and y axis were quite incorrect.

The sample disdrometer data can be acquired through here. I'll appreciate a helping hand to this matter.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# File paths for all datasets: drop-size label (mm) -> CSV with per-timestamp counts.
all_file_paths = {
    "0.062 mm": "07292023/0.062 mm.csv",
    "0.187 mm": "07292023/0.187 mm.csv",
    "0.312 mm": "07292023/0.312 mm.csv",
    "0.437 mm": "07292023/0.437 mm.csv",
    "0.562 mm": "07292023/0.562 mm.csv",
    "0.687 mm": "07292023/0.687 mm.csv",
    "0.812 mm": "07292023/0.812 mm.csv",
    "0.937 mm": "07292023/0.937 mm.csv",
    "1.062 mm": "07292023/1.062 mm.csv",
    "1.187 mm": "07292023/1.187 mm.csv",
    "1.375 mm": "07292023/1.375 mm.csv",
    "1.625 mm": "07292023/1.625 mm.csv",
    "1.875 mm": "07292023/1.875 mm.csv",
    "2.125 mm": "07292023/2.125 mm.csv",
    "2.375 mm": "07292023/2.375 mm.csv",
    "3.25 mm": "07292023/3.25 mm.csv",
    "3.75 mm": "07292023/3.75 mm.csv",
    "4.25 mm": "07292023/4.25 mm.csv",
    "4.75 mm": "07292023/4.75 mm.csv",
    "5.5 mm": "07292023/5.5 mm.csv",
    "6.5 mm": "07292023/6.5 mm.csv",
    "7.5 mm": "07292023/7.5 mm.csv",
    "8.5 mm": "07292023/8.5 mm.csv",
}

# Normalisation constant for N(D): counts are divided by 0.54 * 300 * 0.125 (= 20.25)
factor = 0.54 * 300 * 0.125

# Read and process every drop-size file into one long-format table
combined_data = []
for size, path in all_file_paths.items():
    df = pd.read_csv(path)
    df["Size"] = float(size.split()[0])  # numeric drop size parsed from the label
    time_col = df["Time stamp"]          # keep the original time format
    nd_values = df.iloc[:, 2:].astype(float) / factor  # N(D) per count column

    # Group drop sizes into 0.25 mm intervals (integer bin index)
    size_bin = int(df["Size"].iloc[0] // 0.25)
    sum_nd = nd_values.sum(axis=1)                 # total N(D) per timestamp
    log_nd = np.log10(sum_nd.replace(0, np.nan))   # log10(0) -> NaN so zeros plot blank

    result_df = pd.DataFrame({
        "Time stamp": time_col,
        "Size_bin": size_bin,
        "log N(D)": log_nd
    })
    # BUG FIX: this append must be INSIDE the loop; at the original top-level
    # indentation only the LAST drop-size file ever reached the combined data.
    combined_data.append(result_df)

# Combine into a single DataFrame
heatmap_data = pd.concat(combined_data).reset_index(drop=True)

# Aggregate log N(D) by size bin and time
heatmap_binned = heatmap_data.groupby(["Time stamp", "Size_bin"])['log N(D)'].mean().reset_index()

# Pivot to a 2-D grid: rows = size bins, columns = timestamps
heatmap_pivot = heatmap_binned.pivot(index="Size_bin", columns="Time stamp", values="log N(D)")

# Plot heatmap using axes
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(heatmap_pivot, cmap="turbo", robust=True, vmin=0, vmax=2, 
            ax=ax, cbar_kws={'pad': 0.1})
ax.set_xlabel("Time Stamp (LST)", fontsize=15)
ax.set_ylabel("Drop Size (mm)", fontsize=15)
ax.invert_yaxis()  # smallest drop size at the bottom

# One tick per size bin, labelled 1..n.  (The original called set_yticklabels
# twice; the second call immediately overwrote the first, so one call suffices.)
ax.set_yticks(np.arange(0, heatmap_pivot.shape[0], 1))
ax.set_yticklabels(np.arange(1, heatmap_pivot.shape[0] + 1))

ax.tick_params(axis='x', which='major', labelsize=15, length=10, width=2, top=True, right=True, direction='in')
ax.tick_params(axis='y', which='major', labelsize=15, length=10, width=2, top=True, right=True, direction='in', rotation=0)

# Customize tick parameters for the colorbar
cbar = ax.collections[0].colorbar
cbar.set_label('log N(D)', fontsize=15)
cbar.ax.minorticks_on()
cbar.ax.tick_params(which='major', labelsize=15, length=10, width=2, top=True, right=True, direction='in')
cbar.ax.tick_params(which='minor', length=5, width=1.5, top=True, right=True, direction='in')

ax.grid(ls='--', alpha=0.35)

# Add border to the colorbar
cbar.outline.set_edgecolor('black')
cbar.outline.set_linewidth(2)

# Add border to the plot (ax IS the current axes, so use it directly)
for side in ('top', 'right', 'left', 'bottom'):
    ax.spines[side].set_visible(True)

plt.savefig('Test1.jpg', dpi=300, bbox_inches='tight')
plt.show()

In relation to this post, I have acquired the raw disdrometer data, which includes multiple pairs of Drop Size (0.062 mm, 0.187 mm, etc.) and Counts of raindrops (32 bins). I would like to create a time-series heatmap of the disdrometer data in which the independent variables are Time and Drop Size, while the dependent variable is "log N(D)".

How can I efficiently create this heatmap time series in Python? I did try to do this with the initial code, but I didn't manage to reproduce the sample figure in the previous post, correctly (the visualization). is there a way to adjust the x and y limits, the x and y tickmarks/labels as well? (as they were quite incorrect).

To compute N(D), the formula we use is: N(D) = (Sum of Counts in each time stamp) / (0.54 × 300 × 0.125) — that is, the summed counts are divided by the whole product, as in the code below.

I did try my best to do this with the initial code, but I didn't manage to reproduce the sample figure in the previous post, correctly (the visualization). Like is there a way to adjust/correct the x and y limits? Also the tickmarks/labels in x and y axis were quite incorrect.

The sample disdrometer data can be acquired through here. I'll appreciate a helping hand to this matter.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# File paths for all datasets: drop-size label (mm) -> CSV with per-timestamp counts.
all_file_paths = {
    "0.062 mm": "07292023/0.062 mm.csv",
    "0.187 mm": "07292023/0.187 mm.csv",
    "0.312 mm": "07292023/0.312 mm.csv",
    "0.437 mm": "07292023/0.437 mm.csv",
    "0.562 mm": "07292023/0.562 mm.csv",
    "0.687 mm": "07292023/0.687 mm.csv",
    "0.812 mm": "07292023/0.812 mm.csv",
    "0.937 mm": "07292023/0.937 mm.csv",
    "1.062 mm": "07292023/1.062 mm.csv",
    "1.187 mm": "07292023/1.187 mm.csv",
    "1.375 mm": "07292023/1.375 mm.csv",
    "1.625 mm": "07292023/1.625 mm.csv",
    "1.875 mm": "07292023/1.875 mm.csv",
    "2.125 mm": "07292023/2.125 mm.csv",
    "2.375 mm": "07292023/2.375 mm.csv",
    "3.25 mm": "07292023/3.25 mm.csv",
    "3.75 mm": "07292023/3.75 mm.csv",
    "4.25 mm": "07292023/4.25 mm.csv",
    "4.75 mm": "07292023/4.75 mm.csv",
    "5.5 mm": "07292023/5.5 mm.csv",
    "6.5 mm": "07292023/6.5 mm.csv",
    "7.5 mm": "07292023/7.5 mm.csv",
    "8.5 mm": "07292023/8.5 mm.csv",
}

# Normalisation constant for N(D): counts are divided by 0.54 * 300 * 0.125 (= 20.25)
factor = 0.54 * 300 * 0.125

# Read and process every drop-size file into one long-format table
combined_data = []
for size, path in all_file_paths.items():
    df = pd.read_csv(path)
    df["Size"] = float(size.split()[0])  # numeric drop size parsed from the label
    time_col = df["Time stamp"]          # keep the original time format
    nd_values = df.iloc[:, 2:].astype(float) / factor  # N(D) per count column

    # Group drop sizes into 0.25 mm intervals (integer bin index)
    size_bin = int(df["Size"].iloc[0] // 0.25)
    sum_nd = nd_values.sum(axis=1)                 # total N(D) per timestamp
    log_nd = np.log10(sum_nd.replace(0, np.nan))   # log10(0) -> NaN so zeros plot blank

    result_df = pd.DataFrame({
        "Time stamp": time_col,
        "Size_bin": size_bin,
        "log N(D)": log_nd
    })
    # BUG FIX: this append must be INSIDE the loop; at the original top-level
    # indentation only the LAST drop-size file ever reached the combined data.
    combined_data.append(result_df)

# Combine into a single DataFrame
heatmap_data = pd.concat(combined_data).reset_index(drop=True)

# Aggregate log N(D) by size bin and time
heatmap_binned = heatmap_data.groupby(["Time stamp", "Size_bin"])['log N(D)'].mean().reset_index()

# Pivot to a 2-D grid: rows = size bins, columns = timestamps
heatmap_pivot = heatmap_binned.pivot(index="Size_bin", columns="Time stamp", values="log N(D)")

# Plot heatmap using axes
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(heatmap_pivot, cmap="turbo", robust=True, vmin=0, vmax=2, 
            ax=ax, cbar_kws={'pad': 0.1})
ax.set_xlabel("Time Stamp (LST)", fontsize=15)
ax.set_ylabel("Drop Size (mm)", fontsize=15)
ax.invert_yaxis()  # smallest drop size at the bottom

# One tick per size bin, labelled 1..n.  (The original called set_yticklabels
# twice; the second call immediately overwrote the first, so one call suffices.)
ax.set_yticks(np.arange(0, heatmap_pivot.shape[0], 1))
ax.set_yticklabels(np.arange(1, heatmap_pivot.shape[0] + 1))

ax.tick_params(axis='x', which='major', labelsize=15, length=10, width=2, top=True, right=True, direction='in')
ax.tick_params(axis='y', which='major', labelsize=15, length=10, width=2, top=True, right=True, direction='in', rotation=0)

# Customize tick parameters for the colorbar
cbar = ax.collections[0].colorbar
cbar.set_label('log N(D)', fontsize=15)
cbar.ax.minorticks_on()
cbar.ax.tick_params(which='major', labelsize=15, length=10, width=2, top=True, right=True, direction='in')
cbar.ax.tick_params(which='minor', length=5, width=1.5, top=True, right=True, direction='in')

ax.grid(ls='--', alpha=0.35)

# Add border to the colorbar
cbar.outline.set_edgecolor('black')
cbar.outline.set_linewidth(2)

# Add border to the plot (ax IS the current axes, so use it directly)
for side in ('top', 'right', 'left', 'bottom'):
    ax.spines[side].set_visible(True)

plt.savefig('Test1.jpg', dpi=300, bbox_inches='tight')
plt.show()

Share Improve this question edited Mar 29 at 11:16 gboffi 25.1k9 gold badges60 silver badges95 bronze badges asked Mar 28 at 9:40 CGHACGHA 377 bronze badges 6
  • Your file 07292023/4.25 mm.csv contains suspicious data, because a great deal of the numbers in the second column are not 4.25, as it is in every other file you've posted, but 4.75. – gboffi Commented Mar 29 at 11:07
  • In your data, some of the counts are equal to zero ("0") and some are null strings (""). There is a semantic difference or both notations mean "nothing happened here" ? – gboffi Commented Mar 29 at 11:10
  • hi @gboffi both notations mean nothing happened here. Although, in the visualization from the previous, the y axis also include 0-1 mm drop size counts. – CGHA Commented Mar 30 at 0:59
  • oops my bad, I'll edit out the 4.25 mm csv – CGHA Commented Mar 30 at 1:01
  • drive.google/drive/u/0/folders/… I revised all of the data, @gboffi, aligned to the correct time stamps. Let me know if this helps. – CGHA Commented Mar 30 at 2:48
 |  Show 1 more comment

1 Answer 1

Reset to default 1

The image above was obtained by

$ python3 raindrops_falling.py [0-9]*.csv
Minimum count*factor = 20.250
Minimum log N(D) = 1.306
$ 

I'm rather unsure about my procedure to compute logNd below, but it's your field not mine, isn't it? so please modify that part as you will, you should be a better judge than me.

Eventually, here is the code (note that I have used neither Pandas nor Seaborn):

$ cat raindrops_falling.py
import numpy as np
import matplotlib.pyplot as plt
import fileinput
import warnings

# Build a time-series heatmap of log N(D) from disdrometer CSVs.
# Each data row read via fileinput is: timestamp, drop-size-in-mm, 32 count bins.

stamp_order = []   # timestamps in first-seen order
counts_by_um = {}  # drop size in whole micrometres -> list of per-timestamp totals

for raw in fileinput.input(encoding="utf-8"):
    stamp, diameter_mm, *bins = [field.strip() for field in raw.strip().split(',')]
    if stamp[0] == "T" : continue # skip headers

    stamp = int(stamp)
    size_um = int(float(diameter_mm)*1000) # micrometre keys sidestep float-key quirks

    # each bin is a digit string or the null string '';
    # int(b or 0) maps '' to 0 instead of raising
    row_total = sum(int(b or 0) for b in bins)

    # record first-seen timestamps, create the list for a new drop size on demand
    if stamp not in stamp_order:
        stamp_order.append(stamp)
    counts_by_um.setdefault(size_um, []).append(row_total)

time_stamps = np.array(stamp_order)
mm = np.array(list(counts_by_um))/1000
factor = 0.54 * 300 * 0.125 # = 54 * 3 / 8 = 27 * 3 / 4 = 81/4 = 20.25
factored_counts = np.array(list(counts_by_um.values()))*factor

# sanity report on the smallest non-zero scaled count
min_fc = (factored_counts[factored_counts>0]).min()
print("""\
Minimum count*factor = %.3f
Minimum log N(D) = %.3f"""%(min_fc, np.log10(min_fc)))

# turn zeros into -1 so log10 yields NaN (blank patch) rather than -Inf
factored_counts[factored_counts==0] = -1

# log10 of a negative argument emits a RuntimeWarning; silence it locally
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    logNd = np.log10(factored_counts)

# PLOTTING
fig, ax = plt.subplots(layout='constrained')
plt.pcolormesh(time_stamps, mm, logNd, cmap='turbo',
#              vmin=0 → colorbar starts from 0
#              shading='nearest' → patches centered on time, diam
               vmin=0, shading='nearest')
# check also shading='gouraud'
plt.colorbar()

# cosmetic tweaks: per-size y ticks, clipped x range, faint grid, tinted canvas
plt.yticks(mm, fontsize='xx-small')
plt.xlim(xmax=2000)
plt.grid(lw=0.15, color='#707070')
ax.patch.set_color('#e0e0e0')
fig.patch.set_color('#ddddd0')

# That's all Folks!
plt.show()
$ 
发布评论

评论列表(0)

  1. 暂无评论