Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add bit64 support #590

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ Imports:
methods,
png,
rappdirs,
bit64,
utils,
rlang,
withr
Expand Down
7 changes: 7 additions & 0 deletions src/libpython.h
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,12 @@ LIBPYTHON_EXTERN PyObject* (*PyInt_FromLong)(long);
LIBPYTHON_EXTERN long (*PyInt_AsLong)(PyObject *);
LIBPYTHON_EXTERN PyObject* (*PyLong_FromLong)(long);
LIBPYTHON_EXTERN long (*PyLong_AsLong)(PyObject *);
LIBPYTHON_EXTERN PyObject* (*PyLong_FromUnsignedLong)(long);
LIBPYTHON_EXTERN unsigned long (*PyLong_AsUnsignedLong)(PyObject *);
LIBPYTHON_EXTERN long (*PyLong_AsLongAndOverflow)(PyObject *, int*);
LIBPYTHON_EXTERN PyObject* (*PyInt_FromUnsignedLong)(long);
LIBPYTHON_EXTERN unsigned long (*PyInt_AsUnsignedLong)(PyObject *);
LIBPYTHON_EXTERN long (*PyInt_AsLongAndOverflow)(PyObject *, int*);

LIBPYTHON_EXTERN PyObject* (*PyBool_FromLong)(long);

Expand Down Expand Up @@ -440,6 +446,7 @@ typedef struct tagPyArrayObject {

typedef unsigned char npy_bool;
typedef long npy_long;
typedef unsigned long npy_ulong;
typedef double npy_double;
typedef struct { double real, imag; } npy_cdouble;
typedef npy_cdouble npy_complex128;
Expand Down
172 changes: 138 additions & 34 deletions src/python.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,27 @@ std::wstring s_python_v3;
std::string s_pythonhome;
std::wstring s_pythonhome_v3;

const std::string CONFIG_LONG_AS_BIT64="reticulate.long_as_bit64";
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should these be static const? (since they should have internal linkage)

const std::string CONFIG_ULONG_AS_BIT64="reticulate.ulong_as_bit64";

template<typename T>
T getConfig(std::string config, T defValue) {
Environment base( "package:base" ) ;
Function getOption = base["getOption"];
SEXP s = getOption(config, defValue);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe we could use Rf_GetOption directly here.

if(s == NULL) {
return defValue;
}
return as<T>(s);
}

bool convertLongToBit64() {
return getConfig<bool>(CONFIG_LONG_AS_BIT64, false);
}

bool convertULongToBit64() {
return getConfig<bool>(CONFIG_ULONG_AS_BIT64, false);
}


// helper to convert std::string to std::wstring
Expand Down Expand Up @@ -294,12 +315,19 @@ int narrow_array_typenum(int typenum) {
case NPY_INT:
typenum = NPY_LONG;
break;
// double
case NPY_UINT:
case NPY_ULONG:
case NPY_ULONGLONG:

case NPY_LONG:
case NPY_LONGLONG:
typenum = convertLongToBit64() ? NPY_LONG : NPY_DOUBLE;
break;

case NPY_ULONG:
case NPY_ULONGLONG:
typenum = convertULongToBit64() ? NPY_ULONG : NPY_DOUBLE;
break;

// double
case NPY_UINT:
case NPY_HALF:
case NPY_FLOAT:
case NPY_DOUBLE:
Expand Down Expand Up @@ -328,12 +356,24 @@ int narrow_array_typenum(int typenum) {
return typenum;
}

int typenum(PyArrayObject* array) {
return PyArray_TYPE(array);
}

int typenum(PyArray_Descr* descr) {
return descr->type_num;
}

int typenum(int typeenum) {
return typeenum;
}

int narrow_array_typenum(PyArrayObject* array) {
return narrow_array_typenum(PyArray_TYPE(array));
return narrow_array_typenum(typenum(array));
}

int narrow_array_typenum(PyArray_Descr* descr) {
return narrow_array_typenum(descr->type_num);
return narrow_array_typenum(typenum(descr->type_num));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
return narrow_array_typenum(typenum(descr->type_num));
return narrow_array_typenum(typenum(descr));

}

bool is_numpy_str(PyObject* x) {
Expand Down Expand Up @@ -911,10 +951,8 @@ bool py_is_callable(PyObjectRef x) {
return py_is_callable(x.get());
}


// convert a python object to an R object
SEXP py_to_r(PyObject* x, bool convert) {

// NULL for Python None
if (py_is_none(x))
return R_NilValue;
Expand All @@ -928,9 +966,18 @@ SEXP py_to_r(PyObject* x, bool convert) {
return LogicalVector::create(x == Py_True);

// integer
else if (scalarType == INTSXP)
return IntegerVector::create(PyInt_AsLong(x));

else if (scalarType == INTSXP) {
long val = PyLong_AsLong(x);
if((val > std::numeric_limits<int>::max() || val < std::numeric_limits<int>::min()) && convertLongToBit64()) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I worry a bit that this could be confusing if values were converted to integer64 only "sometimes". That is, I'm not sure if the runtime type should depend on the runtime value. Would it make sense to always make this conversion if the user has opted in?

Rcpp::NumericVector vec(1);
std::memcpy(&(vec[0]), &(val), sizeof(double));
vec.attr("class") = "integer64";
return vec;
}
else{
return IntegerVector::create(val);
}
}
// double
else if (scalarType == REALSXP)
return NumericVector::create(PyFloat_AsDouble(x));
Expand Down Expand Up @@ -963,9 +1010,23 @@ SEXP py_to_r(PyObject* x, bool convert) {
return vec;
} else if (scalarType == INTSXP) {
Rcpp::IntegerVector vec(len);
for (Py_ssize_t i = 0; i<len; i++)
vec[i] = PyInt_AsLong(PyList_GetItem(x, i));
for (Py_ssize_t i = 0; i<len; i++) {
long num = PyLong_AsLong(PyList_GetItem(x, i));
if((num > std::numeric_limits<int>::max() || num < std::numeric_limits<int>::min()) && convertLongToBit64()) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same concern here as above.

//We need to start over an interpret as 64 bit int
Rcpp::NumericVector nVec(len);
long long* res_ptr = (long long*) dataptr(nVec);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that using long long* explicitly requires us to use C++11. This is okay but I think we need to update SystemRequirements to indicate that.

for (Py_ssize_t j = 0; j<len; j++) {
res_ptr[j] = PyInt_AsLong(PyList_GetItem(x, j));;
}
nVec.attr("class") = "integer64";
return nVec;
} else {
vec[i] = num;
}
}
return vec;

} else if (scalarType == CPLXSXP) {
Rcpp::ComplexVector vec(len);
for (Py_ssize_t i = 0; i<len; i++) {
Expand Down Expand Up @@ -1055,6 +1116,7 @@ SEXP py_to_r(PyObject* x, bool convert) {
}

// determine the target type of the array
int oriType = typenum(array);
int typenum = narrow_array_typenum(array);

// cast it to a fortran array (PyArray_CastToType steals the descr)
Expand All @@ -1079,10 +1141,32 @@ SEXP py_to_r(PyObject* x, bool convert) {
}

case NPY_LONG: {
npy_long* pData = (npy_long*)PyArray_DATA(array);
rArray = Rf_allocArray(INTSXP, dimsVector);
for (int i=0; i<len; i++)
INTEGER(rArray)[i] = pData[i];
if((oriType == NPY_LONG || oriType == NPY_LONGLONG) && convertLongToBit64()) {
Rcpp::NumericVector nVec(len);
//Inspired by https://gallery.rcpp.org/articles/creating-integer64-and-nanotime-vectors/
npy_ulong* pData = (npy_ulong*)PyArray_DATA(array);
std::memcpy(&(nVec[0]), &(pData[0]), len * sizeof(double));
nVec.attr("class") = "integer64";
nVec.attr("dim") = dimsVector;
rArray = nVec;
} else {
npy_long* pData = (npy_long*)PyArray_DATA(array);
rArray = Rf_allocArray(INTSXP, dimsVector);
for (int i=0; i<len; i++) {
INTEGER(rArray)[i] = pData[i];
}
}

break;
}
case NPY_ULONG: {
npy_ulong* pData = (npy_ulong*)PyArray_DATA(array);
Rcpp::NumericVector nVec(len);
//Inspired by https://gallery.rcpp.org/articles/creating-integer64-and-nanotime-vectors/
std::memcpy(&(nVec[0]), &(pData[0]), len * sizeof(double));
nVec.attr("class") = CharacterVector::create("integer64", "np.ulong");
nVec.attr("dim") = dimsVector;
rArray = nVec;
break;
}

Expand Down Expand Up @@ -1342,6 +1426,7 @@ SEXP py_to_r(PyObject* x, bool convert) {

}


/* stretchy list, modified from R sources
CAR of the list points to the last cons-cell
CDR points to the first.
Expand All @@ -1359,6 +1444,7 @@ void GrowList(SEXP args_list, SEXP tag, SEXP dflt) {
SEXP tmp = PROTECT(Rf_cons(dflt, R_NilValue));
SET_TAG(tmp, tag);


SETCDR(CAR(args_list), tmp); // set cdr on the last cons-cell
SETCAR(args_list, tmp); // update pointer to last cons cell
UNPROTECT(2);
Expand Down Expand Up @@ -1477,6 +1563,7 @@ SEXP py_get_formals(PyObjectRef callable)
return CDR(r_args);
}


bool is_convertible_to_numpy(RObject x) {

if (!haveNumPy())
Expand Down Expand Up @@ -1517,7 +1604,7 @@ PyObject* r_to_py_numpy(RObject x, bool convert) {
typenum = NPY_INT;
data = &(INTEGER(sexp)[0]);
} else if (type == REALSXP) {
typenum = NPY_DOUBLE;
typenum = x.inherits("integer64") ? (x.inherits("np.ulong") ? NPY_ULONG : NPY_LONG) : NPY_DOUBLE;
data = &(REAL(sexp)[0]);
} else if (type == LGLSXP) {
typenum = NPY_BOOL;
Expand Down Expand Up @@ -1698,25 +1785,39 @@ PyObject* r_to_py_cpp(RObject x, bool convert) {
}

// numeric (pass length 1 vectors as scalars, otherwise pass list)
if (type == REALSXP) {

// handle scalars
} else if (type == REALSXP) {
bool isBit64 = x.inherits("integer64");
bool isUnsigned = x.inherits("np.ulong");
bool isInt64 = isBit64 && !isUnsigned;
bool isUint64 = isBit64 && isUnsigned;
if (LENGTH(sexp) == 1) {
double value = REAL(sexp)[0];
return PyFloat_FromDouble(value);
}

PyObjectPtr list(PyList_New(LENGTH(sexp)));
for (R_xlen_t i = 0; i<LENGTH(sexp); i++) {
double value = REAL(sexp)[i];
// NOTE: reference to added value is "stolen" by the list
int res = PyList_SetItem(list, i, PyFloat_FromDouble(value));
if (res != 0)
throw PythonException(py_fetch_error());
if(isInt64) {
return PyInt_FromLong(reinterpret_cast<long&>(value));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it really safe to reinterpret a value as a reference? I've usually seen type punning done as e.g.

*(long*)(&value)

This is still technically undefined behavior, though. My understanding is that the only standards-compliant way of performing type punning is with memcpy(). See: https://stackoverflow.com/a/17790026/1342082

(In C++20, we will have bitcast: https://en.cppreference.com/w/cpp/numeric/bit_cast)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See https://godbolt.org/z/VDCcVF for an example of a helper function that could be used for casting here.

} else if(isUint64) {
return PyLong_FromUnsignedLong(reinterpret_cast<unsigned long&>(value));
} else {
return PyFloat_FromDouble(value);
}
} else {
PyObjectPtr list(PyList_New(LENGTH(sexp)));
for (R_xlen_t i = 0; i<LENGTH(sexp); i++) {
double value = REAL(sexp)[i];
// NOTE: reference to added value is "stolen" by the list
PyObject* obj;
if(isInt64) {
obj = PyInt_FromLong(reinterpret_cast<long&>(value));
} else if(isUint64) {
obj = PyLong_FromUnsignedLong(reinterpret_cast<unsigned long&>(value));
} else {
obj = PyFloat_FromDouble(value);
}
int res = PyList_SetItem(list, i, obj);
if (res != 0)
throw PythonException(py_fetch_error());
}
return list.detach();
}

return list.detach();

}

// complex (pass length 1 vectors as scalars, otherwise pass list)
Expand Down Expand Up @@ -2703,6 +2804,7 @@ SEXP py_dict_get_item(PyObjectRef dict, RObject key) {
return py_ref(Py_None, false);
}


Py_IncRef(item);
return py_ref(item, dict.convert());

Expand Down Expand Up @@ -2765,6 +2867,7 @@ CharacterVector py_dict_get_keys_as_str(PyObjectRef dict) {
std::vector<std::string> keys;

PyObjectPtr it(PyObject_GetIter(py_keys));

if (it.is_null())
throw PythonException(py_fetch_error());

Expand Down Expand Up @@ -3397,4 +3500,5 @@ PyObjectRef py_capsule(SEXP x) {
ensure_python_initialized();

return py_ref(py_capsule_new(x), false);

}
32 changes: 30 additions & 2 deletions tests/testthat/test-python-numpy.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,38 @@ test_that("Character arrays are handled correctly", {
expect_equal(a1, py_to_r(r_to_py(a1)))
})


test_that("Long integer types are converted to bit64", {
skip_if_no_numpy()
np <- import("numpy", convert = FALSE)
dtypes <- c(np$int64, np$long, np$uint64, np$longlong, np$ulonglong)
require(bit64)
#First run with the default mode
lapply(dtypes, function(dtype) {
a1 <- np$array(c(as.integer64("12345"), as.integer64("1567447722123456786")), dtype = dtype)
expect_equal(class(as.vector(py_to_r(a1))), "numeric")
})

#Now with the options set
options(reticulate.long_as_bit64=TRUE)
options(reticulate.ulong_as_bit64=TRUE)
lapply(dtypes, function(dtype) {
a1 <- np$array(c(as.integer64("12345"), as.integer64("1567447722123456786")), dtype = dtype)
res = c('integer64')
if(dtype == np$uint64 || dtype == np$ulonglong) {
res = c(res, 'np.ulong')
}
expect_setequal(class(py_to_r(a1)), res)
})
options(reticulate.long_as_bit64=F)
options(reticulate.ulong_as_bit64=F)
})

test_that("Long integer types are converted to R numeric", {
skip_if_no_numpy()
np <- import("numpy", convert = FALSE)
dtypes <- c(np$int64, np$int32,
np$uint64, np$uint32)
dtypes <- c(np$uint32)

lapply(dtypes, function(dtype) {
a1 <- np$array(c(1L:30L), dtype = dtype)
a1 <- as.vector(py_to_r(a1))
Expand Down Expand Up @@ -86,3 +113,4 @@ test_that("boolean matrices are converted appropriately", {
A <- matrix(TRUE, nrow = 2, ncol = 2)
expect_equal(A, py_to_r(r_to_py(A)))
})

18 changes: 18 additions & 0 deletions tests/testthat/test-python-pandas.R
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,23 @@ test_that("NaT is converted to NA", {

expect_equal(py_to_r(before), py_to_r(after))


})

test_that("Large ints are handled correctly", {
skip_if_no_pandas()
require(bit64)

options(reticulate.long_as_bit64=TRUE)
options(reticulate.ulong_as_bit64=TRUE)
A <- data.frame(val=c(as.integer64("1567447722123456785"), as.integer64("1567447722123456786")))
expect_equal(A, py_to_r(r_to_py(A)))

options(reticulate.long_as_bit64=F)
options(reticulate.ulong_as_bit64=F)
A <- data.frame(val=c(as.integer64("1567447722123456785"), as.integer64("1567447722123456786")))
expect_equal(A, py_to_r(r_to_py(A)))

})

test_that("pandas NAs are converted to R NAs", {
Expand Down Expand Up @@ -218,4 +235,5 @@ df = pd.DataFrame({"FCT": pd.Categorical(["No", "Yes"]),

expect_identical(p_df, r_df)


})