import stata_setup
# Initialize pystata against the local Stata installation: the first argument
# is the installation directory (machine-specific), the second selects the
# "mp" edition.
stata_setup.config("C:/Program Files/Stata17/", "mp")

  ___  ____  ____  ____  ____ ©
 /__    /   ____/   /   ____/      17.0
___/   /   /___/   /   /___/       MP—Parallel Edition

 Statistics and Data Science       Copyright 1985-2021 StataCorp LLC
                                   StataCorp
                                   4905 Lakeway Drive
                                   College Station, Texas 77845 USA
                                   800-STATA-PC        https://www.stata.com
                                   979-696-4600        [email protected]

Stata license: Single-user 8-core  perpetual
Serial number: 100
  Licensed to: zxx
               StataCorp LLC

Notes:
      1. Unicode is supported; see help unicode_advice.
      2. More than 2 billion observations are allowed; see help obs_advice.
      3. Maximum number of variables is set to 5,000; see help set_maxvar.


%%stata?


%%stata
// load the 1978 automobile example dataset and list its variables
sysuse auto, clear
describe

. sysuse auto, clear
(1978 automobile data)

. describe

Contains data from C:\Program Files\Stata17/ado\base/a/auto.dta
 Observations:            74                  1978 automobile data
    Variables:            12                  13 Apr 2020 17:45
                                              (_dta has notes)
-------------------------------------------------------------------------------
Variable      Storage   Display    Value
    name         type    format    label      Variable label
-------------------------------------------------------------------------------
make            str18   %-18s                 Make and model
price           int     %8.0gc                Price
mpg             int     %8.0g                 Mileage (mpg)
rep78           int     %8.0g                 Repair record 1978
headroom        float   %6.1f                 Headroom (in.)
trunk           int     %8.0g                 Trunk space (cu. ft.)
weight          int     %8.0gc                Weight (lbs.)
length          int     %8.0g                 Length (in.)
turn            int     %8.0g                 Turn circle (ft.)
displacement    int     %8.0g                 Displacement (cu. in.)
gear_ratio      float   %6.2f                 Gear ratio
foreign         byte    %8.0g      origin     Car origin
-------------------------------------------------------------------------------
Sorted by: foreign

.


%%stata
// scatterplot of mpg against weight, one panel per level of foreign;
// the total suboption adds a panel for all cars combined
scatter mpg weight, by(foreign, total)


%stata summarize mpg

    Variable |        Obs        Mean    Std. dev.       Min        Max
-------------+---------------------------------------------------------
         mpg |         74     21.2973    5.785503         12         41


%%stata -qui 
// -qui runs the commands but suppresses their output
summarize mpg


%%stata -gw 12cm
// -gw 12cm sets the width of the graph rendered in the notebook
scatter mpg weight, by(foreign, total)


import pandas as pd
import io
import requests

# Download the NHANES II extract as CSV.  The timeout keeps the cell from
# hanging forever on a stalled connection, and raise_for_status() stops on an
# HTTP error instead of handing an error page to the CSV parser.
response = requests.get(
    "https://www.stata.com/python/pystata/misc/nhanes2.csv", timeout=30
)
response.raise_for_status()
data = response.content
# Decode the payload explicitly as UTF-8 and parse it into a DataFrame.
nhanes2 = pd.read_csv(io.StringIO(data.decode('utf-8')))
# Last expression of the cell: displays the DataFrame.
nhanes2


%%stata -d nhanes2 -force
// -d nhanes2 loads the pandas DataFrame nhanes2 as the current Stata dataset;
// -force lets it replace data already in memory (see the pystata magic docs)
// encode the string variables sex and agegrp as labeled numeric variables
label define sex2 1 "Male" 2 "Female"
encode sex, generate(sex2) label(sex2)

label define agegrp 1 "20-29" 2 "30-39" 3 "40-49" 4 "50-59" 5 "60-69" 6 "70+"
encode agegrp, generate(agegrp2) label(agegrp)

// attach descriptive variable labels
label variable bpsystol "systolic blood pressure"
label variable agegrp2 "Age Group"
label variable sex2 "1=Male, 2=Female"

describe bpsystol agegrp2 sex2

. label define sex2 1 "Male" 2 "Female"

. encode sex, generate(sex2) label(sex2)

. 
. label define agegrp 1 "20-29" 2 "30-39" 3 "40-49" 4 "50-59" 5 "60-69" 6 "70+"

. encode agegrp, generate(agegrp2) label(agegrp)

. 
. label variable bpsystol "systolic blood pressure"

. label variable agegrp2 "Age Group"

. label variable sex2 "1=Male, 2=Female"

. 
. describe bpsystol agegrp2 sex2

Variable      Storage   Display    Value
    name         type    format    label      Variable label
-------------------------------------------------------------------------------
bpsystol        long    %12.0g                systolic blood pressure
agegrp2         long    %8.0g      agegrp     Age Group
sex2            long    %8.0g      sex2       1=Male, 2=Female

.


%%stata -eret steret
// -eret steret copies the e() results left by the last estimation command
// into the Python dictionary steret
// fit a regression model
regress bpsystol agegrp2##sex2

// e() results
ereturn list

. // fit a regression model
. regress bpsystol agegrp2##sex2

      Source |       SS           df       MS      Number of obs   =    10,351
-------------+----------------------------------   F(11, 10339)    =    312.88
       Model |  1407229.28        11  127929.935   Prob > F        =    0.0000
    Residual |  4227440.75    10,339  408.882943   R-squared       =    0.2497
-------------+----------------------------------   Adj R-squared   =    0.2489
       Total |  5634670.03    10,350  544.412563   Root MSE        =    20.221

------------------------------------------------------------------------------
    bpsystol | Coefficient  Std. err.      t    P>|t|     [95% conf. interval]
-------------+----------------------------------------------------------------
     agegrp2 |
      30-39  |   .7956175   .9473117     0.84   0.401    -1.061297    2.652532
      40-49  |   5.117078   1.018176     5.03   0.000     3.121256      7.1129
      50-59  |   12.20018   1.022541    11.93   0.000      10.1958    14.20456
      60-69  |   16.85887   .8155092    20.67   0.000     15.26031    18.45742
        70+  |   22.50889   1.130959    19.90   0.000     20.29199    24.72579
             |
        sex2 |
     Female  |  -12.60132   .8402299   -15.00   0.000    -14.24833    -10.9543
             |
agegrp2#sex2 |
      30-39 #|
     Female  |   4.140156    1.31031     3.16   0.002     1.571695    6.708617
      40-49 #|
     Female  |   8.644866   1.412067     6.12   0.000     5.876941    11.41279
      50-59 #|
     Female  |   11.83134   1.406641     8.41   0.000     9.074051    14.58863
      60-69 #|
     Female  |     14.093   1.130882    12.46   0.000     11.87625    16.30975
 70+#Female  |   15.86608   1.542296    10.29   0.000     12.84288    18.88928
             |
       _cons |   123.8862   .6052954   204.67   0.000     122.6997    125.0727
------------------------------------------------------------------------------

. 
. // e() results
. ereturn list

scalars:
                  e(N) =  10351
               e(df_m) =  11
               e(df_r) =  10339
                  e(F) =  312.8766723590079
                 e(r2) =  .2497447540773591
               e(rmse) =  20.22085415724865
                e(mss) =  1407229.279971525
                e(rss) =  4227440.746112916
               e(r2_a) =  .2489465330013219
                 e(ll) =  -45803.93060947768
               e(ll_0) =  -47291.06810807489
               e(rank) =  12

macros:
            e(cmdline) : "regress bpsystol agegrp2##sex2"
              e(title) : "Linear regression"
          e(marginsok) : "XB default"
                e(vce) : "ols"
             e(depvar) : "bpsystol"
                e(cmd) : "regress"
         e(properties) : "b V"
            e(predict) : "regres_p"
              e(model) : "ols"
          e(estat_cmd) : "regress_estat"

matrices:
                  e(b) :  1 x 21
                  e(V) :  21 x 21

functions:
             e(sample)   

.


steret.keys()

dict_keys(['e(N)', 'e(df_m)', 'e(df_r)', 'e(F)', 'e(r2)', 'e(rmse)', 'e(mss)', 'e(rss)', 'e(r2_a)', 'e(ll)', 'e(ll_0)', 'e(rank)', 'e(cmdline)', 'e(title)', 'e(marginsprop)', 'e(marginsok)', 'e(vce)', 'e(depvar)', 'e(cmd)', 'e(properties)', 'e(predict)', 'e(model)', 'e(estat_cmd)', 'e(b)', 'e(V)'])


steret['e(b)']

array([[  0.        ,   0.79561746,   5.11707797,  12.20017802,
         16.85886868,  22.50888857,   0.        , -12.601317  ,
          0.        ,   0.        ,   0.        ,   4.14015609,
          0.        ,   8.6448661 ,   0.        ,  11.83133884,
          0.        ,  14.09300146,   0.        ,  15.86607901,
        123.88620072]])


%%stata 
// load the iris data from the Stata Press site, list its variables,
// and show the value label that maps the iris codes to species names
use http://www.stata-press.com/data/r17/iris, clear
describe
label list species

. use http://www.stata-press.com/data/r17/iris, clear
(Iris data)

. describe

Contains data from http://www.stata-press.com/data/r17/iris.dta
 Observations:           150                  Iris data
    Variables:             5                  18 Jan 2020 13:23
                                              (_dta has notes)
-------------------------------------------------------------------------------
Variable      Storage   Display    Value
    name         type    format    label      Variable label
-------------------------------------------------------------------------------
iris            byte    %10.0g     species    Iris species
seplen          double  %4.1f                 Sepal length in cm
sepwid          double  %4.1f                 Sepal width in cm
petlen          double  %4.1f                 Petal length in cm
petwid          double  %4.1f                 Petal width in cm
-------------------------------------------------------------------------------
Sorted by: 

. label list species
species:
           1 setosa
           2 versicolor
           3 virginica

.


%%stata -fouta training,test
// -fouta training,test exports the Stata frames training and test to Python
// arrays of the same names once the cell has run
// Split the original dataset into training and test
// dataset which contains 80% and 20% of observations respectively
splitsample, generate(svar, replace) split(0.8 0.2) show rseed(16)

// create two frames holding the two datasets
frame put iris seplen sepwid petlen petwid if svar==1, into(training)
frame put iris seplen sepwid petlen petwid if svar==2, into(test)

. // Split the original dataset into training and test
. // dataset which contains 80% and 20% of observations respectively
. splitsample, generate(svar, replace) split(0.8 0.2) show rseed(16)

       svar |      Freq.     Percent        Cum.
------------+-----------------------------------
          1 |        120       80.00       80.00
          2 |         30       20.00      100.00
------------+-----------------------------------
      Total |        150      100.00

. 
. // create two frames holding the two datasets
. frame put iris seplen sepwid petlen petwid if svar==1, into(training)

. frame put iris seplen sepwid petlen petwid if svar==2, into(test)

.


# Split each exported array into label and features: column 0 is taken as the
# target and the remaining columns as predictors.  This presumably follows the
# variable order given to `frame put` (iris first) -- confirm.
y_train, X_train = training[:, 0], training[:, 1:]
y_test, X_test = test[:, 0], test[:, 1:]


from sklearn.ensemble import RandomForestClassifier

# Random forest with tree depth capped at 3; random_state pins the seed so the
# fit is reproducible.
clf = RandomForestClassifier(max_depth=3, random_state=17)
# Kept as the cell's last expression so the fitted estimator is displayed.
clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=3, random_state=17)


from sklearn import metrics  # NOTE(review): not used in the visible cells

# Predicted class labels and per-class probabilities for the held-out test set.
y_pred = clf.predict(X_test)
y_pred_prob = clf.predict_proba(X_test)


from sfi import Frame

# Attach to the existing Stata frame named 'test'.
fr = Frame.connect('test')
# Add one byte variable for the predicted class and three float variables for
# the per-class probabilities.
fr.addVarByte('irispr')
fr.addVarFloat('pr1')
fr.addVarFloat('pr2')
fr.addVarFloat('pr3')

# Write the predictions into the new variables.  The None second argument
# presumably selects all observations -- confirm against the sfi Frame docs.
fr.store('irispr', None, y_pred)
fr.store('pr1 pr2 pr3', None, y_pred_prob)


%%stata
// switch to the test frame, attach the species value label to the predictions,
// then cross-tabulate actual vs. predicted class and list the misclassified rows
frame change test
label values irispr species
label variable irispr predicted
tabulate iris irispr, row
list iris irispr pr1 pr2 pr3 if iris!=irispr

. frame change test

. label values irispr species

. label variable irispr predicted

. tabulate iris irispr, row

+----------------+
| Key            |
|----------------|
|   frequency    |
| row percentage |
+----------------+

      Iris |            predicted
   species |    setosa  versicolo  virginica |     Total
-----------+---------------------------------+----------
    setosa |        11          0          0 |        11 
           |    100.00       0.00       0.00 |    100.00 
-----------+---------------------------------+----------
versicolor |         0          9          3 |        12 
           |      0.00      75.00      25.00 |    100.00 
-----------+---------------------------------+----------
 virginica |         0          1          6 |         7 
           |      0.00      14.29      85.71 |    100.00 
-----------+---------------------------------+----------
     Total |        11         10          9 |        30 
           |     36.67      33.33      30.00 |    100.00 

. list iris irispr pr1 pr2 pr3 if iris!=irispr

     +----------------------------------------------------------+
     |       iris       irispr        pr1        pr2        pr3 |
     |----------------------------------------------------------|
 16. | versicolor    virginica          0   .1658101   .8341899 |
 18. | versicolor    virginica          0   .0193115   .9806885 |
 19. | versicolor    virginica   .0006667   .2863244   .7130089 |
 26. |  virginica   versicolor   .0015674    .593027   .4054056 |
     +----------------------------------------------------------+

.


%%mata?


%%mata
/* create an NxN identity matrix */
/* n: dimension of the square matrix; returns an n x n real matrix */
/* NOTE(review): Mata's built-in I(n) also returns an n x n identity matrix */
real matrix id(real scalar n)
{
    real scalar i
    real matrix res

    /* start from an n x n matrix of zeros, then set the diagonal to 1 */
    res = J(n, n, 0)
    for (i=1; i<=n; i++) {
        res[i,i] = 1
    }
    return(res)
}

/* build the 3 x 3 identity matrix and display it */
B = id(3)
B

. mata
------------------------------------------------- mata (type end to exit) -----
: /* create an NxN identity matrix */
: real matrix id(real scalar n)
> {
>     real scalar i
>     real matrix res
> 
>     res = J(n, n, 0)
>     for (i=1; i<=n; i++) {
>         res[i,i] = 1
>     }
>     return(res)
> }

: 
: B = id(3)

: B
[symmetric]
       1   2   3
    +-------------+
  1 |  1          |
  2 |  0   1      |
  3 |  0   0   1  |
    +-------------+

: end
-------------------------------------------------------------------------------

.


%pystata?


%pystata set graph_format png


%%stata
// reload the auto data and draw a histogram of the repair record rep78
sysuse auto, clear
histogram rep78

. sysuse auto, clear
(1978 automobile data)

. histogram rep78
(bin=8, start=1, width=.5)

.


# Read the sales series from a CSV file in the working directory and preview
# the first rows.
turksales = pd.read_csv('turksales.csv')
turksales.head()


from pystata import stata
# Load the DataFrame into Stata as the current dataset; force=True lets it
# replace whatever dataset is currently in memory.
stata.pdataframe_to_data(turksales, force=True)


# Convert the string quarter t (e.g. 1990q1) into a Stata quarterly date,
# give it a quarterly display format, and declare the data a quarterly
# time series.
stata.run('''
generate qdate = quarterly(t, "YQ")
format qdate %tq
tsset qdate, quarterly
''')

. 
. generate qdate = quarterly(t, "YQ")

. format qdate %tq

. tsset qdate, quarterly

Time variable: qdate, 1990q1 to 1999q4
        Delta: 1 quarter

.


# Fit an ARIMA(3,1,0) model to sales and save the predictions in sales_pred.
# The y option requests predictions of sales itself rather than of the
# differenced series D.sales.
stata.run('''
arima sales, arima(3, 1, 0)
predict sales_pred, y
''')

. 
. arima sales, arima(3, 1, 0)

(setting optimization to BHHH)
Iteration 0:   log likelihood = -95.623818  
Iteration 1:   log likelihood =  -83.15803  
Iteration 2:   log likelihood = -78.411534  
Iteration 3:   log likelihood = -77.140891  
Iteration 4:   log likelihood = -75.668157  
(switching optimization to BFGS)
Iteration 5:   log likelihood = -75.617809  
Iteration 6:   log likelihood = -75.542463  
Iteration 7:   log likelihood = -75.536259  
Iteration 8:   log likelihood = -75.535664  
Iteration 9:   log likelihood =  -75.53558  
Iteration 10:  log likelihood = -75.535576  

ARIMA regression

Sample: 1990q2 thru 1999q4                      Number of obs     =         39
                                                Wald chi2(3)      =     125.92
Log likelihood = -75.53558                      Prob > chi2       =     0.0000

------------------------------------------------------------------------------
             |                 OPG
     D.sales | Coefficient  std. err.      z    P>|z|     [95% conf. interval]
-------------+----------------------------------------------------------------
sales        |
       _cons |   .3138001   .0780853     4.02   0.000     .1607558    .4668444
-------------+----------------------------------------------------------------
ARMA         |
          ar |
         L1. |  -.8190625    .087633    -9.35   0.000      -.99082    -.647305
         L2. |  -.8174304   .1006356    -8.12   0.000    -1.014673   -.6201882
         L3. |  -.7738968   .0902345    -8.58   0.000    -.9507531   -.5970404
-------------+----------------------------------------------------------------
      /sigma |   1.608579    .264229     6.09   0.000       1.0907    2.126458
------------------------------------------------------------------------------
Note: The test of the variance against zero is one sided, and the two-sided
      confidence interval is truncated at zero.

. predict sales_pred, y
(1 missing value generated)

.


import numpy as np

# Bring t, sales and the ARIMA prediction back from Stata as a DataFrame
# (Stata missing values become NaN) and index the rows by quarter.
stpred = stata.pdataframe_from_data(
    't sales sales_pred', missingval=np.nan
).set_index("t")
stpred.head()


import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Use seaborn's dark-grid theme and a 10x8 inch figure.
sns.set_theme(style="darkgrid")
fig, ax = plt.subplots(figsize=(10, 8))

# Draw one line per column of stpred (sales and sales_pred) against its index.
lp = sns.lineplot(ax=ax, data=stpred)
# Thin the x-axis labels: major ticks only at multiples of 6.
lp.xaxis.set_major_locator(ticker.MultipleLocator(6))

plt.show()

	sampl	strata	psu	region	smsa	location	houssiz	sex	race	age	...	region4	smsa1	smsa2	smsa3	rural	loglead	agegrp	highlead	bmi	highbp
0	1400	1	1	S	2	1	4	Male	White	54	...	0	0	1	0	0	NaN	50-59	NaN	20.495686	0
1	1401	1	1	S	2	1	6	Female	White	41	...	0	0	1	0	0	2.564949	40-49	lead<25	21.022337	0
2	1402	1	1	S	1	1	6	Female	Other	21	...	0	1	0	0	0	NaN	20-29	NaN	24.973860	0
3	1404	1	1	S	2	1	9	Female	White	63	...	0	0	1	0	0	NaN	60-69	NaN	35.728722	1
4	1405	1	1	S	1	1	3	Female	White	64	...	0	1	0	0	0	2.995732	60-69	lead<25	27.923803	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
10346	48760	32	2	MW	4	48	5	Female	White	35	...	0	0	0	1	1	NaN	30-39	NaN	20.355173	0
10347	48763	32	2	MW	4	48	2	Female	White	33	...	0	0	0	1	1	1.945910	30-39	lead<25	41.645557	1
10348	48764	32	2	MW	4	48	1	Female	White	60	...	0	0	0	1	0	NaN	60-69	NaN	35.626114	0
10349	48768	32	2	MW	4	48	1	Female	White	29	...	0	0	0	1	0	NaN	20-29	NaN	19.204464	0
10350	48770	32	2	MW	4	48	1	Male	White	31	...	0	0	0	1	1	NaN	30-39	NaN	19.635565	0

	t	sales
0	1990q1	100.000000
1	1990q2	97.846031
2	1990q3	98.840286
3	1990q4	100.827500
4	1991q1	98.909805

	sales	sales_pred
t
1990q1	100.000000	NaN
1990q2	97.846031	100.313797
1990q3	98.840286	98.946854
1990q4	100.827500	99.967163
1991q1	98.909805	101.124245

Call Stata from Python

Zhao Xu

Principal Software Engineer
StataCorp LLC

July 16, 2021

Outline

Introduction

How it works

Configuration and initialization

Call Stata using magic commands

The stata magic

%%stata cell magic

%stata line magic

Arguments

Load dataset from Python

Push Stata dataset into Python

The mata magic

The pystata magic

Call Stata using API functions

Summary

Additional resources

Call Stata from Python

Zhao Xu

Principal Software Engineer
StataCorp LLC

July 16, 2021

Outline

Introduction

How it works

Configuration and initialization

Call Stata using magic commands

The stata magic

%%stata cell magic

%stata line magic

Arguments

Load dataset from Python

Push Stata dataset into Python

The mata magic

The pystata magic

Call Stata using API functions

Summary

Additional resources

Principal Software Engineer
StataCorp LLC