danieldk committed
Commit: a77838d
1 Parent(s): c7e38f0
Files changed (37):
  1. .gitattributes +1 -0
  2. build/torch24-cxx11-cu118-x86_64-linux/quantization/__init__.py +44 -0
  3. build/torch24-cxx11-cu118-x86_64-linux/quantization/_ops.py +3 -0
  4. build/torch24-cxx11-cu118-x86_64-linux/quantization/_quantization_0_0_1.abi3.so +3 -0
  5. build/torch24-cxx11-cu121-x86_64-linux/quantization/__init__.py +44 -0
  6. build/torch24-cxx11-cu121-x86_64-linux/quantization/_ops.py +3 -0
  7. build/torch24-cxx11-cu121-x86_64-linux/quantization/_quantization_0_0_1.abi3.so +3 -0
  8. build/torch24-cxx11-cu124-x86_64-linux/quantization/__init__.py +44 -0
  9. build/torch24-cxx11-cu124-x86_64-linux/quantization/_ops.py +3 -0
  10. build/torch24-cxx11-cu124-x86_64-linux/quantization/_quantization_0_0_1.abi3.so +3 -0
  11. build/torch24-cxx98-cu118-x86_64-linux/quantization/__init__.py +44 -0
  12. build/torch24-cxx98-cu118-x86_64-linux/quantization/_ops.py +3 -0
  13. build/torch24-cxx98-cu118-x86_64-linux/quantization/_quantization_0_0_1.abi3.so +3 -0
  14. build/torch24-cxx98-cu121-x86_64-linux/quantization/__init__.py +44 -0
  15. build/torch24-cxx98-cu121-x86_64-linux/quantization/_ops.py +3 -0
  16. build/torch24-cxx98-cu121-x86_64-linux/quantization/_quantization_0_0_1.abi3.so +3 -0
  17. build/torch24-cxx98-cu124-x86_64-linux/quantization/__init__.py +44 -0
  18. build/torch24-cxx98-cu124-x86_64-linux/quantization/_ops.py +3 -0
  19. build/torch24-cxx98-cu124-x86_64-linux/quantization/_quantization_0_0_1.abi3.so +3 -0
  20. build/torch25-cxx11-cu118-x86_64-linux/quantization/__init__.py +44 -0
  21. build/torch25-cxx11-cu118-x86_64-linux/quantization/_ops.py +3 -0
  22. build/torch25-cxx11-cu118-x86_64-linux/quantization/_quantization_0_0_1.abi3.so +3 -0
  23. build/torch25-cxx11-cu121-x86_64-linux/quantization/__init__.py +44 -0
  24. build/torch25-cxx11-cu121-x86_64-linux/quantization/_ops.py +3 -0
  25. build/torch25-cxx11-cu121-x86_64-linux/quantization/_quantization_0_0_1.abi3.so +3 -0
  26. build/torch25-cxx11-cu124-x86_64-linux/quantization/__init__.py +44 -0
  27. build/torch25-cxx11-cu124-x86_64-linux/quantization/_ops.py +3 -0
  28. build/torch25-cxx11-cu124-x86_64-linux/quantization/_quantization_0_0_1.abi3.so +3 -0
  29. build/torch25-cxx98-cu118-x86_64-linux/quantization/__init__.py +44 -0
  30. build/torch25-cxx98-cu118-x86_64-linux/quantization/_ops.py +3 -0
  31. build/torch25-cxx98-cu118-x86_64-linux/quantization/_quantization_0_0_1.abi3.so +3 -0
  32. build/torch25-cxx98-cu121-x86_64-linux/quantization/__init__.py +44 -0
  33. build/torch25-cxx98-cu121-x86_64-linux/quantization/_ops.py +3 -0
  34. build/torch25-cxx98-cu121-x86_64-linux/quantization/_quantization_0_0_1.abi3.so +3 -0
  35. build/torch25-cxx98-cu124-x86_64-linux/quantization/__init__.py +44 -0
  36. build/torch25-cxx98-cu124-x86_64-linux/quantization/_ops.py +3 -0
  37. build/torch25-cxx98-cu124-x86_64-linux/quantization/_quantization_0_0_1.abi3.so +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.so filter=lfs diff=lfs merge=lfs -text
build/torch24-cxx11-cu118-x86_64-linux/quantization/__init__.py ADDED
@@ -0,0 +1,44 @@
+ from typing import Optional
+
+ import torch
+
+ try:
+     from ._ops import ops
+ except ImportError as e:
+     # Fallback for local development.
+     try:
+         import _quantization
+         ops = torch.ops._quantization
+     except ImportError:
+         raise e
+
+ def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
+     return ops.cutlass_scaled_mm_supports_fp8(cuda_device_capability)
+
+ def cutlass_scaled_mm(a: torch.Tensor,
+                       b: torch.Tensor,
+                       scale_a: torch.Tensor,
+                       scale_b: torch.Tensor,
+                       out_dtype: torch.dtype,
+                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+     assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+     assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
+     assert bias is None or bias.shape[0] == b.shape[
+         1] and bias.dtype == out_dtype
+
+     m = a.shape[0]
+     n = b.shape[1]
+
+     #if current_platform.is_rocm():
+     #    triton_scaled_mm_module = importlib.import_module(
+     #        "vllm.model_executor.layers.quantization.compressed_tensors."
+     #        "triton_scaled_mm")
+     #    triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
+     #    return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+
+     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
+
+     ops.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)
+
+     return out
+
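Each build variant ships this same `__init__.py`, so a single usage sketch covers all of them. The sketch below is illustrative only: the `quantization` import path assumes one of the `build/*` directories is on `sys.path`, and the FP8 dtype, per-tensor scale shapes, and column-major layout of `b` are assumptions about how the CUTLASS kernel is typically called, not something this commit documents.

import torch
from quantization import cutlass_scaled_mm, cutlass_scaled_mm_supports_fp8

# Assumption: the capability is encoded as major * 10 + minor (e.g. 90 for Hopper).
major, minor = torch.cuda.get_device_capability()
if cutlass_scaled_mm_supports_fp8(major * 10 + minor):
    m, k, n = 32, 64, 128  # n and k chosen as multiples of 16 to satisfy the asserts
    a = torch.randn(m, k, device="cuda").to(torch.float8_e4m3fn)
    # Assumption: b is passed column-major, as is common for these CUTLASS GEMMs.
    b = torch.randn(n, k, device="cuda").to(torch.float8_e4m3fn).t()
    scale_a = torch.tensor([1.0], device="cuda")  # assumed per-tensor float32 scales
    scale_b = torch.tensor([1.0], device="cuda")
    out = cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=torch.float16)
    print(out.shape)  # torch.Size([32, 128])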
build/torch24-cxx11-cu118-x86_64-linux/quantization/_ops.py ADDED
@@ -0,0 +1,3 @@
+ import torch
+ from . import _quantization_0_0_1
+ ops = torch.ops._quantization_0_0_1
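The `_ops.py` shim simply loads the versioned native extension: importing `_quantization_0_0_1` dlopens the bundled `.abi3.so`, whose initializers register the custom ops with PyTorch, and the ops then resolve through the `torch.ops._quantization_0_0_1` namespace. A minimal check, assuming the package is importable as `quantization` from the corresponding build directory:

import torch
from quantization import _ops  # loads _quantization_0_0_1.abi3.so as a side effect

# torch.ops caches namespaces, so the handle in _ops is the same object
# PyTorch hands out for the registered library name.
assert _ops.ops is torch.ops._quantization_0_0_1
assert hasattr(_ops.ops, "cutlass_scaled_mm")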
build/torch24-cxx11-cu118-x86_64-linux/quantization/_quantization_0_0_1.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c9343c97509a78e62cf1f87abbf3bc426f8f85e0c95694b3b2b80740d3cbf280
+ size 30943736
build/torch24-cxx11-cu121-x86_64-linux/quantization/__init__.py ADDED
@@ -0,0 +1,44 @@
+ from typing import Optional
+
+ import torch
+
+ try:
+     from ._ops import ops
+ except ImportError as e:
+     # Fallback for local development.
+     try:
+         import _quantization
+         ops = torch.ops._quantization
+     except ImportError:
+         raise e
+
+ def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
+     return ops.cutlass_scaled_mm_supports_fp8(cuda_device_capability)
+
+ def cutlass_scaled_mm(a: torch.Tensor,
+                       b: torch.Tensor,
+                       scale_a: torch.Tensor,
+                       scale_b: torch.Tensor,
+                       out_dtype: torch.dtype,
+                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+     assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+     assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
+     assert bias is None or bias.shape[0] == b.shape[
+         1] and bias.dtype == out_dtype
+
+     m = a.shape[0]
+     n = b.shape[1]
+
+     #if current_platform.is_rocm():
+     #    triton_scaled_mm_module = importlib.import_module(
+     #        "vllm.model_executor.layers.quantization.compressed_tensors."
+     #        "triton_scaled_mm")
+     #    triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
+     #    return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+
+     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
+
+     ops.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)
+
+     return out
+
build/torch24-cxx11-cu121-x86_64-linux/quantization/_ops.py ADDED
@@ -0,0 +1,3 @@
+ import torch
+ from . import _quantization_0_0_1
+ ops = torch.ops._quantization_0_0_1
build/torch24-cxx11-cu121-x86_64-linux/quantization/_quantization_0_0_1.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:626d8e62d5801cdca8869b45e4f79893de3ee6637b86f2647e2b1d1bb1452020
+ size 36253328
build/torch24-cxx11-cu124-x86_64-linux/quantization/__init__.py ADDED
@@ -0,0 +1,44 @@
+ from typing import Optional
+
+ import torch
+
+ try:
+     from ._ops import ops
+ except ImportError as e:
+     # Fallback for local development.
+     try:
+         import _quantization
+         ops = torch.ops._quantization
+     except ImportError:
+         raise e
+
+ def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
+     return ops.cutlass_scaled_mm_supports_fp8(cuda_device_capability)
+
+ def cutlass_scaled_mm(a: torch.Tensor,
+                       b: torch.Tensor,
+                       scale_a: torch.Tensor,
+                       scale_b: torch.Tensor,
+                       out_dtype: torch.dtype,
+                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+     assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+     assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
+     assert bias is None or bias.shape[0] == b.shape[
+         1] and bias.dtype == out_dtype
+
+     m = a.shape[0]
+     n = b.shape[1]
+
+     #if current_platform.is_rocm():
+     #    triton_scaled_mm_module = importlib.import_module(
+     #        "vllm.model_executor.layers.quantization.compressed_tensors."
+     #        "triton_scaled_mm")
+     #    triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
+     #    return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+
+     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
+
+     ops.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)
+
+     return out
+
build/torch24-cxx11-cu124-x86_64-linux/quantization/_ops.py ADDED
@@ -0,0 +1,3 @@
+ import torch
+ from . import _quantization_0_0_1
+ ops = torch.ops._quantization_0_0_1
build/torch24-cxx11-cu124-x86_64-linux/quantization/_quantization_0_0_1.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b1e9f1b5beb9f5de558dcfcf2d61ecc4e207a723ba810605d70d9aa31e65df5
+ size 37028144
build/torch24-cxx98-cu118-x86_64-linux/quantization/__init__.py ADDED
@@ -0,0 +1,44 @@
+ from typing import Optional
+
+ import torch
+
+ try:
+     from ._ops import ops
+ except ImportError as e:
+     # Fallback for local development.
+     try:
+         import _quantization
+         ops = torch.ops._quantization
+     except ImportError:
+         raise e
+
+ def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
+     return ops.cutlass_scaled_mm_supports_fp8(cuda_device_capability)
+
+ def cutlass_scaled_mm(a: torch.Tensor,
+                       b: torch.Tensor,
+                       scale_a: torch.Tensor,
+                       scale_b: torch.Tensor,
+                       out_dtype: torch.dtype,
+                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+     assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+     assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
+     assert bias is None or bias.shape[0] == b.shape[
+         1] and bias.dtype == out_dtype
+
+     m = a.shape[0]
+     n = b.shape[1]
+
+     #if current_platform.is_rocm():
+     #    triton_scaled_mm_module = importlib.import_module(
+     #        "vllm.model_executor.layers.quantization.compressed_tensors."
+     #        "triton_scaled_mm")
+     #    triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
+     #    return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+
+     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
+
+     ops.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)
+
+     return out
+
build/torch24-cxx98-cu118-x86_64-linux/quantization/_ops.py ADDED
@@ -0,0 +1,3 @@
+ import torch
+ from . import _quantization_0_0_1
+ ops = torch.ops._quantization_0_0_1
build/torch24-cxx98-cu118-x86_64-linux/quantization/_quantization_0_0_1.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6d5fa1dba02499bec6e2225c83e460ae0d8e7ca396763d2851f7e000be88e675
+ size 30940256
build/torch24-cxx98-cu121-x86_64-linux/quantization/__init__.py ADDED
@@ -0,0 +1,44 @@
+ from typing import Optional
+
+ import torch
+
+ try:
+     from ._ops import ops
+ except ImportError as e:
+     # Fallback for local development.
+     try:
+         import _quantization
+         ops = torch.ops._quantization
+     except ImportError:
+         raise e
+
+ def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
+     return ops.cutlass_scaled_mm_supports_fp8(cuda_device_capability)
+
+ def cutlass_scaled_mm(a: torch.Tensor,
+                       b: torch.Tensor,
+                       scale_a: torch.Tensor,
+                       scale_b: torch.Tensor,
+                       out_dtype: torch.dtype,
+                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+     assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+     assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
+     assert bias is None or bias.shape[0] == b.shape[
+         1] and bias.dtype == out_dtype
+
+     m = a.shape[0]
+     n = b.shape[1]
+
+     #if current_platform.is_rocm():
+     #    triton_scaled_mm_module = importlib.import_module(
+     #        "vllm.model_executor.layers.quantization.compressed_tensors."
+     #        "triton_scaled_mm")
+     #    triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
+     #    return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+
+     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
+
+     ops.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)
+
+     return out
+
build/torch24-cxx98-cu121-x86_64-linux/quantization/_ops.py ADDED
@@ -0,0 +1,3 @@
+ import torch
+ from . import _quantization_0_0_1
+ ops = torch.ops._quantization_0_0_1
build/torch24-cxx98-cu121-x86_64-linux/quantization/_quantization_0_0_1.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1bd3556b97cd85c7282ded2057e59207575b92107e0a7e17fcffa82d27f42d26
+ size 36256840
build/torch24-cxx98-cu124-x86_64-linux/quantization/__init__.py ADDED
@@ -0,0 +1,44 @@
+ from typing import Optional
+
+ import torch
+
+ try:
+     from ._ops import ops
+ except ImportError as e:
+     # Fallback for local development.
+     try:
+         import _quantization
+         ops = torch.ops._quantization
+     except ImportError:
+         raise e
+
+ def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
+     return ops.cutlass_scaled_mm_supports_fp8(cuda_device_capability)
+
+ def cutlass_scaled_mm(a: torch.Tensor,
+                       b: torch.Tensor,
+                       scale_a: torch.Tensor,
+                       scale_b: torch.Tensor,
+                       out_dtype: torch.dtype,
+                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+     assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+     assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
+     assert bias is None or bias.shape[0] == b.shape[
+         1] and bias.dtype == out_dtype
+
+     m = a.shape[0]
+     n = b.shape[1]
+
+     #if current_platform.is_rocm():
+     #    triton_scaled_mm_module = importlib.import_module(
+     #        "vllm.model_executor.layers.quantization.compressed_tensors."
+     #        "triton_scaled_mm")
+     #    triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
+     #    return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+
+     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
+
+     ops.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)
+
+     return out
+
build/torch24-cxx98-cu124-x86_64-linux/quantization/_ops.py ADDED
@@ -0,0 +1,3 @@
+ import torch
+ from . import _quantization_0_0_1
+ ops = torch.ops._quantization_0_0_1
build/torch24-cxx98-cu124-x86_64-linux/quantization/_quantization_0_0_1.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f7aa8e42070dfb49e64cfa204bbb81ab2750c2cfafacac6f8d4ae303285a735a
+ size 37027640
build/torch25-cxx11-cu118-x86_64-linux/quantization/__init__.py ADDED
@@ -0,0 +1,44 @@
+ from typing import Optional
+
+ import torch
+
+ try:
+     from ._ops import ops
+ except ImportError as e:
+     # Fallback for local development.
+     try:
+         import _quantization
+         ops = torch.ops._quantization
+     except ImportError:
+         raise e
+
+ def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
+     return ops.cutlass_scaled_mm_supports_fp8(cuda_device_capability)
+
+ def cutlass_scaled_mm(a: torch.Tensor,
+                       b: torch.Tensor,
+                       scale_a: torch.Tensor,
+                       scale_b: torch.Tensor,
+                       out_dtype: torch.dtype,
+                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+     assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+     assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
+     assert bias is None or bias.shape[0] == b.shape[
+         1] and bias.dtype == out_dtype
+
+     m = a.shape[0]
+     n = b.shape[1]
+
+     #if current_platform.is_rocm():
+     #    triton_scaled_mm_module = importlib.import_module(
+     #        "vllm.model_executor.layers.quantization.compressed_tensors."
+     #        "triton_scaled_mm")
+     #    triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
+     #    return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+
+     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
+
+     ops.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)
+
+     return out
+
build/torch25-cxx11-cu118-x86_64-linux/quantization/_ops.py ADDED
@@ -0,0 +1,3 @@
+ import torch
+ from . import _quantization_0_0_1
+ ops = torch.ops._quantization_0_0_1
build/torch25-cxx11-cu118-x86_64-linux/quantization/_quantization_0_0_1.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2a9874fefa371528c54a12851771476c3baf9356536b88c960d7ec3dcf293469
+ size 30943736
build/torch25-cxx11-cu121-x86_64-linux/quantization/__init__.py ADDED
@@ -0,0 +1,44 @@
+ from typing import Optional
+
+ import torch
+
+ try:
+     from ._ops import ops
+ except ImportError as e:
+     # Fallback for local development.
+     try:
+         import _quantization
+         ops = torch.ops._quantization
+     except ImportError:
+         raise e
+
+ def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
+     return ops.cutlass_scaled_mm_supports_fp8(cuda_device_capability)
+
+ def cutlass_scaled_mm(a: torch.Tensor,
+                       b: torch.Tensor,
+                       scale_a: torch.Tensor,
+                       scale_b: torch.Tensor,
+                       out_dtype: torch.dtype,
+                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+     assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+     assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
+     assert bias is None or bias.shape[0] == b.shape[
+         1] and bias.dtype == out_dtype
+
+     m = a.shape[0]
+     n = b.shape[1]
+
+     #if current_platform.is_rocm():
+     #    triton_scaled_mm_module = importlib.import_module(
+     #        "vllm.model_executor.layers.quantization.compressed_tensors."
+     #        "triton_scaled_mm")
+     #    triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
+     #    return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+
+     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
+
+     ops.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)
+
+     return out
+
build/torch25-cxx11-cu121-x86_64-linux/quantization/_ops.py ADDED
@@ -0,0 +1,3 @@
+ import torch
+ from . import _quantization_0_0_1
+ ops = torch.ops._quantization_0_0_1
build/torch25-cxx11-cu121-x86_64-linux/quantization/_quantization_0_0_1.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:554ac8e33120544e28fb91abab1f03b29e6665256c5acfec69137072575b7945
+ size 36253328
build/torch25-cxx11-cu124-x86_64-linux/quantization/__init__.py ADDED
@@ -0,0 +1,44 @@
+ from typing import Optional
+
+ import torch
+
+ try:
+     from ._ops import ops
+ except ImportError as e:
+     # Fallback for local development.
+     try:
+         import _quantization
+         ops = torch.ops._quantization
+     except ImportError:
+         raise e
+
+ def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
+     return ops.cutlass_scaled_mm_supports_fp8(cuda_device_capability)
+
+ def cutlass_scaled_mm(a: torch.Tensor,
+                       b: torch.Tensor,
+                       scale_a: torch.Tensor,
+                       scale_b: torch.Tensor,
+                       out_dtype: torch.dtype,
+                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+     assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+     assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
+     assert bias is None or bias.shape[0] == b.shape[
+         1] and bias.dtype == out_dtype
+
+     m = a.shape[0]
+     n = b.shape[1]
+
+     #if current_platform.is_rocm():
+     #    triton_scaled_mm_module = importlib.import_module(
+     #        "vllm.model_executor.layers.quantization.compressed_tensors."
+     #        "triton_scaled_mm")
+     #    triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
+     #    return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+
+     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
+
+     ops.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)
+
+     return out
+
build/torch25-cxx11-cu124-x86_64-linux/quantization/_ops.py ADDED
@@ -0,0 +1,3 @@
+ import torch
+ from . import _quantization_0_0_1
+ ops = torch.ops._quantization_0_0_1
build/torch25-cxx11-cu124-x86_64-linux/quantization/_quantization_0_0_1.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:373d2248e3ffc236b6b86521dabe2604609e8646107eb3c2d74a9378490a8878
+ size 37028144
build/torch25-cxx98-cu118-x86_64-linux/quantization/__init__.py ADDED
@@ -0,0 +1,44 @@
+ from typing import Optional
+
+ import torch
+
+ try:
+     from ._ops import ops
+ except ImportError as e:
+     # Fallback for local development.
+     try:
+         import _quantization
+         ops = torch.ops._quantization
+     except ImportError:
+         raise e
+
+ def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
+     return ops.cutlass_scaled_mm_supports_fp8(cuda_device_capability)
+
+ def cutlass_scaled_mm(a: torch.Tensor,
+                       b: torch.Tensor,
+                       scale_a: torch.Tensor,
+                       scale_b: torch.Tensor,
+                       out_dtype: torch.dtype,
+                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+     assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+     assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
+     assert bias is None or bias.shape[0] == b.shape[
+         1] and bias.dtype == out_dtype
+
+     m = a.shape[0]
+     n = b.shape[1]
+
+     #if current_platform.is_rocm():
+     #    triton_scaled_mm_module = importlib.import_module(
+     #        "vllm.model_executor.layers.quantization.compressed_tensors."
+     #        "triton_scaled_mm")
+     #    triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
+     #    return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+
+     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
+
+     ops.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)
+
+     return out
+
build/torch25-cxx98-cu118-x86_64-linux/quantization/_ops.py ADDED
@@ -0,0 +1,3 @@
+ import torch
+ from . import _quantization_0_0_1
+ ops = torch.ops._quantization_0_0_1
build/torch25-cxx98-cu118-x86_64-linux/quantization/_quantization_0_0_1.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7bf3ca9934fb5cf098acbd3e41eab1004f358b49c7c459b01649753dca258d97
+ size 30940256
build/torch25-cxx98-cu121-x86_64-linux/quantization/__init__.py ADDED
@@ -0,0 +1,44 @@
+ from typing import Optional
+
+ import torch
+
+ try:
+     from ._ops import ops
+ except ImportError as e:
+     # Fallback for local development.
+     try:
+         import _quantization
+         ops = torch.ops._quantization
+     except ImportError:
+         raise e
+
+ def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
+     return ops.cutlass_scaled_mm_supports_fp8(cuda_device_capability)
+
+ def cutlass_scaled_mm(a: torch.Tensor,
+                       b: torch.Tensor,
+                       scale_a: torch.Tensor,
+                       scale_b: torch.Tensor,
+                       out_dtype: torch.dtype,
+                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+     assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+     assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
+     assert bias is None or bias.shape[0] == b.shape[
+         1] and bias.dtype == out_dtype
+
+     m = a.shape[0]
+     n = b.shape[1]
+
+     #if current_platform.is_rocm():
+     #    triton_scaled_mm_module = importlib.import_module(
+     #        "vllm.model_executor.layers.quantization.compressed_tensors."
+     #        "triton_scaled_mm")
+     #    triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
+     #    return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+
+     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
+
+     ops.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)
+
+     return out
+
build/torch25-cxx98-cu121-x86_64-linux/quantization/_ops.py ADDED
@@ -0,0 +1,3 @@
+ import torch
+ from . import _quantization_0_0_1
+ ops = torch.ops._quantization_0_0_1
build/torch25-cxx98-cu121-x86_64-linux/quantization/_quantization_0_0_1.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5582e29a80ea71e618ccdeb9ad49456d3ebf130ee4c3001b1904e73094460455
+ size 36256840
build/torch25-cxx98-cu124-x86_64-linux/quantization/__init__.py ADDED
@@ -0,0 +1,44 @@
+ from typing import Optional
+
+ import torch
+
+ try:
+     from ._ops import ops
+ except ImportError as e:
+     # Fallback for local development.
+     try:
+         import _quantization
+         ops = torch.ops._quantization
+     except ImportError:
+         raise e
+
+ def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
+     return ops.cutlass_scaled_mm_supports_fp8(cuda_device_capability)
+
+ def cutlass_scaled_mm(a: torch.Tensor,
+                       b: torch.Tensor,
+                       scale_a: torch.Tensor,
+                       scale_b: torch.Tensor,
+                       out_dtype: torch.dtype,
+                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+     assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+     assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
+     assert bias is None or bias.shape[0] == b.shape[
+         1] and bias.dtype == out_dtype
+
+     m = a.shape[0]
+     n = b.shape[1]
+
+     #if current_platform.is_rocm():
+     #    triton_scaled_mm_module = importlib.import_module(
+     #        "vllm.model_executor.layers.quantization.compressed_tensors."
+     #        "triton_scaled_mm")
+     #    triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
+     #    return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
+
+     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
+
+     ops.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)
+
+     return out
+
build/torch25-cxx98-cu124-x86_64-linux/quantization/_ops.py ADDED
@@ -0,0 +1,3 @@
+ import torch
+ from . import _quantization_0_0_1
+ ops = torch.ops._quantization_0_0_1
build/torch25-cxx98-cu124-x86_64-linux/quantization/_quantization_0_0_1.abi3.so ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0d16013abe29e9d3914a19078bf6195bf6a9402222fe48dd75e5a3827e081f49
+ size 37027640