python - Prediction with categorical data in semopy

I'm using semopy for the first time (I am more familiar with lavaan in R). I was able to apply the predict method for continuous, normally distributed data (for both one missing value and multiple missing values) using the example on the semopy website. However, I am struggling with using the predict method with categorical data.

I used an example dataset and tried to treat some variables (x1, x2, and x3) in the dataset as categorical.

Here is my code:

import semopy
import pandas as pd
dat = semopy.examples.political_democracy.get_data()
dat = dat.round() # make all scores integers

# Convert ordinal variables to categories
ordinal_vars = ['x1', 'x2', 'x3']
for col in ordinal_vars:
    dat[col] = dat[col].astype('category')

# model syntax
mod = '''# measurement model
ind60 =~ x1 + x2 + x3
dem60 =~ y1 + y2 + y3 + y4
dem65 =~ y5 + y6 + y7 + y8
# regressions
dem60 ~ ind60
dem65 ~ ind60 + dem60
# residual correlations
y1 ~~ y5
y2 ~~ y4 + y6
y3 ~~ y7
y4 ~~ y8
y6 ~~ y8
# DEFINE(ordinal) y1 y2 y3 y4 y5 y6 y7 y8 x1 x2 x3 # i only treat x1-3 as ordinal instead
DEFINE(ordinal) x1 x2 x3
'''

# generate missing value
i, v = 0, 'x1'
x = dat[v].values[i]

dat[v].values[i] = float('nan')

# fit model
fitmod = semopy.Model(mod)
fitmod.fit(dat, method = 'DWLS')

And here is the error I receive:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[1], line 42
     38 # print(dat)
     39 
     40 # fit model
     41 fitmod = semopy.Model(mod)
---> 42 fitmod.fit(dat, method = 'DWLS')
     43 preds = fitmod.predict(dat)
     44 print(preds)

File /opt/anaconda3/lib/python3.11/site-packages/semopy/model.py:1097, in Model.fit(self, data, cov, obj, solver, groups, clean_slate, regularization, n_samples, **kwargs)
   1054 def fit(self, data=None, cov=None, obj='MLW', solver='SLSQP', groups=None,
   1055         clean_slate=False, regularization=None, n_samples=None, **kwargs):
   1056     """
   1057     Fit model to data.
   1058 
   (...)
   1095 
   1096     """
-> 1097     self.load(data=data, cov=cov, groups=groups,
   1098               clean_slate=clean_slate, n_samples=n_samples)
   1099     if obj == 'FIML':
   1100         if not hasattr(self, 'mx_data'):

File /opt/anaconda3/lib/python3.11/site-packages/semopy/model.py:1038, in Model.load(self, data, cov, groups, clean_slate, n_samples)
   1036     raise KeyError('Variables {} are missing from data.'.format(t))
   1037 if data is not None:
-> 1038     self.load_data(data, covariance=cov, groups=groups)
   1039 else:
   1040     self.load_cov(cov)

File /opt/anaconda3/lib/python3.11/site-packages/semopy/model.py:901, in Model.load_data(self, data, covariance, groups)
    899 else:
    900     inds = [obs.index(v) for v in self.vars['ordinal']]
--> 901     self.load_cov(hetcor(self.mx_data, inds))
    902 self.n_samples, self.n_obs = self.mx_data.shape

File /opt/anaconda3/lib/python3.11/site-packages/semopy/polycorr.py:267, in hetcor(data, ords, nearest)
    265 c_z = {v: (data[v] - c_means[v]) / c_vars[v] for v in conts}
    266 c_pdfs = {v: norm.logpdf(data[v], c_means[v], c_vars[v]) for v in conts}
--> 267 o_ints = {v: estimate_intervals(data[v]) for v in ords}
    269 for c, o in product(conts, ords):
    270     cov[c][o] = polyserial_corr(data[c], data[o], x_mean=c_means[c],
    271                                 x_var=c_vars[c], x_z=c_z[c],
    272                                 x_pdfs=c_pdfs[c], y_ints=o_ints[o])

File /opt/anaconda3/lib/python3.11/site-packages/semopy/polycorr.py:267, in <dictcomp>(.0)
    265 c_z = {v: (data[v] - c_means[v]) / c_vars[v] for v in conts}
    266 c_pdfs = {v: norm.logpdf(data[v], c_means[v], c_vars[v]) for v in conts}
--> 267 o_ints = {v: estimate_intervals(data[v]) for v in ords}
    269 for c, o in product(conts, ords):
    270     cov[c][o] = polyserial_corr(data[c], data[o], x_mean=c_means[c],
    271                                 x_var=c_vars[c], x_z=c_z[c],
    272                                 x_pdfs=c_pdfs[c], y_ints=o_ints[o])

File /opt/anaconda3/lib/python3.11/site-packages/semopy/polycorr.py:95, in estimate_intervals(x, inf)
     93 sz = len(x_f)
     94 cumcounts = np.cumsum(counts[:-1])
---> 95 u = [np.where(u == sample)[0][0] + 1 for sample in x]
     96 return list(chain([-inf], (norm.ppf(n / sz) for n in cumcounts), [inf])), u

File /opt/anaconda3/lib/python3.11/site-packages/semopy/polycorr.py:95, in <listcomp>(.0)
     93 sz = len(x_f)
     94 cumcounts = np.cumsum(counts[:-1])
---> 95 u = [np.where(u == sample)[0][0] + 1 for sample in x]
     96 return list(chain([-inf], (norm.ppf(n / sz) for n in cumcounts), [inf])), u

IndexError: index 0 is out of bounds for axis 0 with size 0

I don't fully understand what the error means. I found this question, but was not able to fix the issue in my case. I am also relatively unfamiliar with Python, so perhaps there is a basic syntax error that I'm overlooking? Alternatively, perhaps it is not yet possible to predict with categorical data in semopy?

I used an example dataset and tried to treat some variables (x1, x2, and x3) in the dataset as categorical.

Here is my code:

import semopy
import pandas as pd
dat = semopy.examples.political_democracy.get_data()
dat = dat.round() # make all scores integers

# Convert ordinal variables to categories
ordinal_vars = ['x1', 'x2', 'x3']
for col in ordinal_vars:
    dat[col] = dat[col].astype('category')

# model syntax
mod = '''# measurement model
ind60 =~ x1 + x2 + x3
dem60 =~ y1 + y2 + y3 + y4
dem65 =~ y5 + y6 + y7 + y8
# regressions
dem60 ~ ind60
dem65 ~ ind60 + dem60
# residual correlations
y1 ~~ y5
y2 ~~ y4 + y6
y3 ~~ y7
y4 ~~ y8
y6 ~~ y8
# DEFINE(ordinal) y1 y2 y3 y4 y5 y6 y7 y8 x1 x2 x3 # i only treat x1-3 as ordinal instead
DEFINE(ordinal) x1 x2 x3
'''

# generate missing value
i, v = 0, 'x1'
x = dat[v].values[i]

dat[v].values[i] = float('nan')

# fit model
fitmod = semopy.Model(mod)
fitmod.fit(dat, method = 'DWLS')

And here is the error I receive:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[1], line 42
     38 # print(dat)
     39 
     40 # fit model
     41 fitmod = semopy.Model(mod)
---> 42 fitmod.fit(dat, method = 'DWLS')
     43 preds = fitmod.predict(dat)
     44 print(preds)

File /opt/anaconda3/lib/python3.11/site-packages/semopy/model.py:1097, in Model.fit(self, data, cov, obj, solver, groups, clean_slate, regularization, n_samples, **kwargs)
   1054 def fit(self, data=None, cov=None, obj='MLW', solver='SLSQP', groups=None,
   1055         clean_slate=False, regularization=None, n_samples=None, **kwargs):
   1056     """
   1057     Fit model to data.
   1058 
   (...)
   1095 
   1096     """
-> 1097     self.load(data=data, cov=cov, groups=groups,
   1098               clean_slate=clean_slate, n_samples=n_samples)
   1099     if obj == 'FIML':
   1100         if not hasattr(self, 'mx_data'):

File /opt/anaconda3/lib/python3.11/site-packages/semopy/model.py:1038, in Model.load(self, data, cov, groups, clean_slate, n_samples)
   1036     raise KeyError('Variables {} are missing from data.'.format(t))
   1037 if data is not None:
-> 1038     self.load_data(data, covariance=cov, groups=groups)
   1039 else:
   1040     self.load_cov(cov)

File /opt/anaconda3/lib/python3.11/site-packages/semopy/model.py:901, in Model.load_data(self, data, covariance, groups)
    899 else:
    900     inds = [obs.index(v) for v in self.vars['ordinal']]
--> 901     self.load_cov(hetcor(self.mx_data, inds))
    902 self.n_samples, self.n_obs = self.mx_data.shape

File /opt/anaconda3/lib/python3.11/site-packages/semopy/polycorr.py:267, in hetcor(data, ords, nearest)
    265 c_z = {v: (data[v] - c_means[v]) / c_vars[v] for v in conts}
    266 c_pdfs = {v: norm.logpdf(data[v], c_means[v], c_vars[v]) for v in conts}
--> 267 o_ints = {v: estimate_intervals(data[v]) for v in ords}
    269 for c, o in product(conts, ords):
    270     cov[c][o] = polyserial_corr(data[c], data[o], x_mean=c_means[c],
    271                                 x_var=c_vars[c], x_z=c_z[c],
    272                                 x_pdfs=c_pdfs[c], y_ints=o_ints[o])

File /opt/anaconda3/lib/python3.11/site-packages/semopy/polycorr.py:267, in <dictcomp>(.0)
    265 c_z = {v: (data[v] - c_means[v]) / c_vars[v] for v in conts}
    266 c_pdfs = {v: norm.logpdf(data[v], c_means[v], c_vars[v]) for v in conts}
--> 267 o_ints = {v: estimate_intervals(data[v]) for v in ords}
    269 for c, o in product(conts, ords):
    270     cov[c][o] = polyserial_corr(data[c], data[o], x_mean=c_means[c],
    271                                 x_var=c_vars[c], x_z=c_z[c],
    272                                 x_pdfs=c_pdfs[c], y_ints=o_ints[o])

File /opt/anaconda3/lib/python3.11/site-packages/semopy/polycorr.py:95, in estimate_intervals(x, inf)
     93 sz = len(x_f)
     94 cumcounts = np.cumsum(counts[:-1])
---> 95 u = [np.where(u == sample)[0][0] + 1 for sample in x]
     96 return list(chain([-inf], (norm.ppf(n / sz) for n in cumcounts), [inf])), u

File /opt/anaconda3/lib/python3.11/site-packages/semopy/polycorr.py:95, in <listcomp>(.0)
     93 sz = len(x_f)
     94 cumcounts = np.cumsum(counts[:-1])
---> 95 u = [np.where(u == sample)[0][0] + 1 for sample in x]
     96 return list(chain([-inf], (norm.ppf(n / sz) for n in cumcounts), [inf])), u

IndexError: index 0 is out of bounds for axis 0 with size 0

Share Improve this question asked Dec 4, 2024 at 11:15 ambiguditi 1213 bronze badges

1 Hi, ambiguditi. This is not my field, but at a glance it seems more of a programming issue rather than an inherent statistical one, and that would make it off-topic . If it is otherwise, I would say make that explicit. – User1865345 Commented Dec 4, 2024 at 11:32
1 Hi, @User1865345, I think you're right. I will try to have the question migrated to stack overflow. – ambiguditi Commented Dec 4, 2024 at 11:40
1 Flag the post for mod's attention and tell them to migrate it to SO. Or you can do it manually by deleting it and posting the same at SO :-) – User1865345 Commented Dec 4, 2024 at 11:43
1 Yes, already flagged the question :) I'll wait for a few hours otherwise delete and repost myself. Thanks for the help! – ambiguditi Commented Dec 4, 2024 at 11:44
1 No problem. SO would be better place for your query. – User1865345 Commented Dec 4, 2024 at 11:45

Add a comment |

1 Answer 1

Sorted by: Reset to default 2

I wrote to the developers of semopy and they confirmed that the package currently only supports prediction for continuous data.

科技改变生活-雨落星辰 - 所有的伟大,都源于一个勇敢的开始

python - Prediction with categorical data in semopy - Stack Overflow

1 Answer 1

与本文相关的文章

评论列表(0)