Skip to content

Commit

Permalink
feat: Supporting big endian architecture. (#310)
Browse files Browse the repository at this point in the history
* Supporting big endian architecture.

`byteswap` before saving and `byteswap` on read.

* Specify platform ?

* Revert "Specify platform ?"

This reverts commit 8d82bbd.

* GH

* With cache ?

* Switch driver for cache ?

* Push

* Using internal registry

* Fix build push.

* Add platform ?

* PlatformS
  • Loading branch information
Narsil authored Aug 7, 2023
1 parent a01ff65 commit a4d2dda
Show file tree
Hide file tree
Showing 8 changed files with 116 additions and 18 deletions.
3 changes: 3 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Rust build artifacts for the core crate — never needed in the image context.
safetensors/target
# Rust build artifacts for the Python bindings crate.
bindings/python/target
# The s390x test Dockerfile itself; excluding it keeps the build context lean
# and avoids cache invalidation when only the Dockerfile changes.
Dockerfile.s390x.test
48 changes: 48 additions & 0 deletions .github/workflows/python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,51 @@ jobs:
cargo test
pip install .[testing]
pytest -sv tests/
test_s390x_big_endian:
runs-on: ubuntu-latest
name: Test bigendian - S390X
steps:
- uses: actions/checkout@v2
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
- uses: tailscale/github-action@v1
with:
authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
- name: Set short sha
id: vars
run: echo "GITHUB_SHA_SHORT=$(git rev-parse --short HEAD)" >> $GITHUB_ENV
- name: Docker meta
id: meta
uses: docker/metadata-action@v4
with:
# list of Docker images to use as base name for tags
images: |
registry.internal.huggingface.tech/safetensors/s390x
# generate Docker tags based on the following events/attributes
tags: |
type=schedule
type=ref,event=branch
type=ref,event=pr
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
type=semver,pattern={{major}}
type=sha
- name: Login to Registry
uses: docker/login-action@v2
with:
registry: registry.internal.huggingface.tech
username: ${{ secrets.REGISTRY_USERNAME }}
password: ${{ secrets.REGISTRY_PASSWORD }}
- name: Test big endian
uses: docker/build-push-action@v4
with:
push: true
platforms: linux/s390x
file: Dockerfile.s390x.test
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=registry,ref=registry.internal.huggingface.tech/safetensors/s390x:cache,mode=max
cache-to: type=registry,ref=registry.internal.huggingface.tech/safetensors/s390x:cache,mode=max
14 changes: 14 additions & 0 deletions Dockerfile.s390x.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Test image for safetensors on a big-endian architecture (s390x).
# Runs the PyTorch and simple test suites under emulation.
FROM s390x/python
# Miniconda: an s390x build of PyTorch is distributed via conda, not pip.
# Install then delete the installer in the same layer to keep the image small.
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py311_23.5.2-0-Linux-s390x.sh \
    && bash Miniconda3-py311_23.5.2-0-Linux-s390x.sh -b \
    && rm -f Miniconda3-py311_23.5.2-0-Linux-s390x.sh
# Rust toolchain is required to compile the native extension.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y
RUN /root/miniconda3/bin/conda install pytorch cpuonly -c pytorch -y
WORKDIR /safetensors/
RUN /root/miniconda3/bin/pip install -U pip pytest
COPY . .
# bash is needed so `source` works in the RUN below (default /bin/sh lacks it).
SHELL ["/bin/bash", "-c"]
WORKDIR /safetensors/bindings/python/
RUN source /root/.cargo/env && /root/miniconda3/bin/pip install -e .
# Run the big-endian-sensitive tests at build time; a failing test fails the build.
RUN /root/miniconda3/bin/pytest -sv tests/test_pt_* tests/test_simple.py
# Exec form: runs bash directly (PID 1) so signals are delivered to it,
# instead of being wrapped in an extra `/bin/sh -c` shell (shell form).
ENTRYPOINT ["/bin/bash"]
4 changes: 2 additions & 2 deletions bindings/python/py_src/safetensors/numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def save(tensor_dict: Dict[str, np.ndarray], metadata: Optional[Dict[str, str]]
"""
for tensor in tensor_dict.values():
if not _is_little_endian(tensor):
raise ValueError("Safetensor format only accepts little endian")
tensor.byteswap(inplace=True)
flattened = {k: {"dtype": v.dtype.name, "shape": v.shape, "data": v.tobytes()} for k, v in tensor_dict.items()}
serialized = serialize(flattened, metadata=metadata)
result = bytes(serialized)
Expand Down Expand Up @@ -72,7 +72,7 @@ def save_file(
"""
for tensor in tensor_dict.values():
if not _is_little_endian(tensor):
raise ValueError("Safetensor format only accepts little endian")
tensor.byteswap(inplace=True)
flattened = {k: {"dtype": v.dtype.name, "shape": v.shape, "data": v.tobytes()} for k, v in tensor_dict.items()}
serialize_file(flattened, filename, metadata=metadata)

Expand Down
5 changes: 2 additions & 3 deletions bindings/python/py_src/safetensors/torch.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,13 +418,12 @@ def _tobytes(tensor: torch.Tensor, name: str) -> bytes:
return b""
newptr = ctypes.cast(ptr, ctypes.POINTER(ctypes.c_ubyte))
data = np.ctypeslib.as_array(newptr, (total_bytes,)) # no internal copy

if sys.byteorder == "big":
data.byteswap(inplace=True)
return data.tobytes()


def _flatten(tensors: Dict[str, torch.Tensor]) -> Dict[str, Dict[str, Any]]:
if sys.byteorder == "big":
raise ValueError("Big endian is not supported, serialization need to be in little endian")
if not isinstance(tensors, dict):
raise ValueError(f"Expected a dict of [str, torch.Tensor] but received {type(tensors)}")

Expand Down
35 changes: 29 additions & 6 deletions bindings/python/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -376,12 +376,35 @@ impl Open {
} else {
(intern!(py, "size"), intern!(py, "ByteStorage"))
};
let kwargs =
[(intern!(py, "shared"), shared), (size_name, size)].into_py_dict(py);
let storage = module
.getattr(storage_name)?
.getattr(intern!(py, "from_file"))?
.call((py_filename,), Some(kwargs))?;

let sys = PyModule::import(py, intern!(py, "sys"))?;
let byteorder: String = sys.getattr(intern!(py, "byteorder"))?.extract()?;

let storage = if byteorder == "big" {
let torch_uint8: PyObject = get_pydtype(module, Dtype::U8)?;
let kwargs = [
(intern!(py, "dtype"), torch_uint8),
(intern!(py, "byte_order"), "big".into_py(py)),
]
.into_py_dict(py);
let builtins = PyModule::import(py, intern!(py, "builtins"))?;
let py_buffer = builtins
.getattr(intern!(py, "open"))?
.call1((py_filename, intern!(py, "rb")))?
.getattr(intern!(py, "read"))?
.call0()?;
module
.getattr(storage_name)?
.getattr(intern!(py, "from_buffer"))?
.call((py_buffer,), Some(kwargs))?
} else {
let kwargs =
[(intern!(py, "shared"), shared), (size_name, size)].into_py_dict(py);
module
.getattr(storage_name)?
.getattr(intern!(py, "from_file"))?
.call((py_filename,), Some(kwargs))?
};

let untyped: &PyAny = match storage.getattr(intern!(py, "untyped")) {
Ok(untyped) => untyped,
Expand Down
13 changes: 9 additions & 4 deletions bindings/python/tests/test_pt_comparison.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import unittest
import sys

import torch

Expand Down Expand Up @@ -211,14 +212,18 @@ def test_deserialization_slice(self):
self.assertEqual(_slice.get_dtype(), "F32")
tensor = _slice[:, :, 1:2]

self.assertTrue(torch.equal(tensor, torch.Tensor([[[1.0], [4.0]]])))
self.assertTrue(torch.equal(tensor, self.tensor[:, :, 1:2]))

buffer = tensor.numpy()
if sys.byteorder == "big":
buffer.byteswap(inplace=True)
buffer = buffer.tobytes()
self.assertEqual(
tensor.numpy().tobytes(),
buffer,
b"\x00\x00\x80?\x00\x00\x80@",
)

self.assertTrue(torch.equal(tensor, torch.Tensor([[[1.0], [4.0]]])))
self.assertTrue(torch.equal(tensor, self.tensor[:, :, 1:2]))

def test_deserialization_metadata(self):
with safe_open(self.local, framework="pt") as f:
metadata = f.metadata()
Expand Down
12 changes: 9 additions & 3 deletions bindings/python/tests/test_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,15 @@ def test_serialization_metadata(self):

def test_serialization_no_big_endian(self):
# Big endian tensor
data = np.zeros((2, 2), dtype=">u4")
with self.assertRaises(ValueError):
save({"test1": data})
data = np.zeros((2, 2), dtype=">i4")
out1 = save({"test1": data}, metadata={"framework": "pt"})
self.assertEqual(
out1,
b'`\x00\x00\x00\x00\x00\x00\x00{"__metadata__":{"framework":"pt"},"test1":{"dtype":"I32","shape":[2,2],"data_offsets":[0,16]}}'
b" \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
)
self.assertEqual(out1[8:].index(b"\x00") + 8, 104)
self.assertEqual((out1[8:].index(b"\x00") + 8) % 8, 0)

def test_accept_path(self):
tensors = {
Expand Down

0 comments on commit a4d2dda

Please sign in to comment.