Build
- .gitattributes +1 -0
- build/torch24-cxx11-cu118-x86_64-linux/quantization/__init__.py +44 -0
- build/torch24-cxx11-cu118-x86_64-linux/quantization/_ops.py +3 -0
- build/torch24-cxx11-cu118-x86_64-linux/quantization/_quantization_0_0_1.abi3.so +3 -0
- build/torch24-cxx11-cu121-x86_64-linux/quantization/__init__.py +44 -0
- build/torch24-cxx11-cu121-x86_64-linux/quantization/_ops.py +3 -0
- build/torch24-cxx11-cu121-x86_64-linux/quantization/_quantization_0_0_1.abi3.so +3 -0
- build/torch24-cxx11-cu124-x86_64-linux/quantization/__init__.py +44 -0
- build/torch24-cxx11-cu124-x86_64-linux/quantization/_ops.py +3 -0
- build/torch24-cxx11-cu124-x86_64-linux/quantization/_quantization_0_0_1.abi3.so +3 -0
- build/torch24-cxx98-cu118-x86_64-linux/quantization/__init__.py +44 -0
- build/torch24-cxx98-cu118-x86_64-linux/quantization/_ops.py +3 -0
- build/torch24-cxx98-cu118-x86_64-linux/quantization/_quantization_0_0_1.abi3.so +3 -0
- build/torch24-cxx98-cu121-x86_64-linux/quantization/__init__.py +44 -0
- build/torch24-cxx98-cu121-x86_64-linux/quantization/_ops.py +3 -0
- build/torch24-cxx98-cu121-x86_64-linux/quantization/_quantization_0_0_1.abi3.so +3 -0
- build/torch24-cxx98-cu124-x86_64-linux/quantization/__init__.py +44 -0
- build/torch24-cxx98-cu124-x86_64-linux/quantization/_ops.py +3 -0
- build/torch24-cxx98-cu124-x86_64-linux/quantization/_quantization_0_0_1.abi3.so +3 -0
- build/torch25-cxx11-cu118-x86_64-linux/quantization/__init__.py +44 -0
- build/torch25-cxx11-cu118-x86_64-linux/quantization/_ops.py +3 -0
- build/torch25-cxx11-cu118-x86_64-linux/quantization/_quantization_0_0_1.abi3.so +3 -0
- build/torch25-cxx11-cu121-x86_64-linux/quantization/__init__.py +44 -0
- build/torch25-cxx11-cu121-x86_64-linux/quantization/_ops.py +3 -0
- build/torch25-cxx11-cu121-x86_64-linux/quantization/_quantization_0_0_1.abi3.so +3 -0
- build/torch25-cxx11-cu124-x86_64-linux/quantization/__init__.py +44 -0
- build/torch25-cxx11-cu124-x86_64-linux/quantization/_ops.py +3 -0
- build/torch25-cxx11-cu124-x86_64-linux/quantization/_quantization_0_0_1.abi3.so +3 -0
- build/torch25-cxx98-cu118-x86_64-linux/quantization/__init__.py +44 -0
- build/torch25-cxx98-cu118-x86_64-linux/quantization/_ops.py +3 -0
- build/torch25-cxx98-cu118-x86_64-linux/quantization/_quantization_0_0_1.abi3.so +3 -0
- build/torch25-cxx98-cu121-x86_64-linux/quantization/__init__.py +44 -0
- build/torch25-cxx98-cu121-x86_64-linux/quantization/_ops.py +3 -0
- build/torch25-cxx98-cu121-x86_64-linux/quantization/_quantization_0_0_1.abi3.so +3 -0
- build/torch25-cxx98-cu124-x86_64-linux/quantization/__init__.py +44 -0
- build/torch25-cxx98-cu124-x86_64-linux/quantization/_ops.py +3 -0
- build/torch25-cxx98-cu124-x86_64-linux/quantization/_quantization_0_0_1.abi3.so +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.so filter=lfs diff=lfs merge=lfs -text
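The added line is the rule that `git lfs track "*.so"` writes into .gitattributes: any *.so committed from now on is stored through Git LFS as a small pointer file instead of a raw binary, which is why the _quantization_0_0_1.abi3.so entries below show only three lines of pointer text.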
build/torch24-cxx11-cu118-x86_64-linux/quantization/__init__.py
ADDED
@@ -0,0 +1,44 @@
from typing import Optional

import torch

try:
    from ._ops import ops
except ImportError as e:
    # Fallback for local development.
    try:
        import _quantization
        ops = torch.ops._quantization
    except ImportError:
        raise e


def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool:
    return ops.cutlass_scaled_mm_supports_fp8(cuda_device_capability)


def cutlass_scaled_mm(a: torch.Tensor,
                      b: torch.Tensor,
                      scale_a: torch.Tensor,
                      scale_b: torch.Tensor,
                      out_dtype: torch.dtype,
                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
    assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
    assert bias is None or bias.shape[0] == b.shape[
        1] and bias.dtype == out_dtype

    m = a.shape[0]
    n = b.shape[1]

    #if current_platform.is_rocm():
    #    triton_scaled_mm_module = importlib.import_module(
    #        "vllm.model_executor.layers.quantization.compressed_tensors."
    #        "triton_scaled_mm")
    #    triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
    #    return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)

    out = torch.empty((m, n), dtype=out_dtype, device=a.device)

    ops.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)

    return out
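For context, a minimal usage sketch of the wrapper defined above (not part of the commit; the import path, the per-tensor scales, and the column-major layout of b are assumptions about the caller's setup):

# Hypothetical example; assumes a CUDA device whose capability passes the FP8 check.
import torch
from quantization import cutlass_scaled_mm, cutlass_scaled_mm_supports_fp8

major, minor = torch.cuda.get_device_capability()
if cutlass_scaled_mm_supports_fp8(major * 10 + minor):
    m, k, n = 32, 64, 128                       # k and n multiples of 16, as the asserts require
    a = torch.randn(m, k, device="cuda").to(torch.float8_e4m3fn)
    b = torch.randn(n, k, device="cuda").to(torch.float8_e4m3fn).t()  # (k, n), column-major
    scale_a = torch.tensor(1.0, device="cuda")  # per-tensor scale for a
    scale_b = torch.tensor(1.0, device="cuda")  # per-tensor scale for b
    out = cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=torch.bfloat16)
    print(out.shape)                            # torch.Size([32, 128])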
build/torch24-cxx11-cu118-x86_64-linux/quantization/_ops.py
ADDED
@@ -0,0 +1,3 @@
import torch
from . import _quantization_0_0_1
ops = torch.ops._quantization_0_0_1
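These three lines are the glue between the Python wrapper and the compiled kernels: importing _quantization_0_0_1 loads the adjacent .abi3.so, whose registrations populate the torch.ops._quantization_0_0_1 namespace that __init__.py then uses as ops. A quick sanity check of that wiring (an assumed snippet, not part of the commit):

import torch
from quantization._ops import ops
# The namespace object exposed by _ops is the one the shared library registered into.
assert ops is torch.ops._quantization_0_0_1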
build/torch24-cxx11-cu118-x86_64-linux/quantization/_quantization_0_0_1.abi3.so
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c9343c97509a78e62cf1f87abbf3bc426f8f85e0c95694b3b2b80740d3cbf280
size 30943736
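As with every .abi3.so in this commit, only a Git LFS pointer is stored in the repository: the three lines record the LFS spec version, the SHA-256 of the actual shared library, and its size in bytes (about 31 MB here); the binary itself lives in LFS storage.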
build/torch24-cxx11-cu121-x86_64-linux/quantization/__init__.py
ADDED
@@ -0,0 +1,44 @@
(File contents identical to build/torch24-cxx11-cu118-x86_64-linux/quantization/__init__.py above.)
build/torch24-cxx11-cu121-x86_64-linux/quantization/_ops.py
ADDED
@@ -0,0 +1,3 @@
import torch
from . import _quantization_0_0_1
ops = torch.ops._quantization_0_0_1
build/torch24-cxx11-cu121-x86_64-linux/quantization/_quantization_0_0_1.abi3.so
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:626d8e62d5801cdca8869b45e4f79893de3ee6637b86f2647e2b1d1bb1452020
size 36253328
build/torch24-cxx11-cu124-x86_64-linux/quantization/__init__.py
ADDED
@@ -0,0 +1,44 @@
(File contents identical to build/torch24-cxx11-cu118-x86_64-linux/quantization/__init__.py above.)
build/torch24-cxx11-cu124-x86_64-linux/quantization/_ops.py
ADDED
@@ -0,0 +1,3 @@
import torch
from . import _quantization_0_0_1
ops = torch.ops._quantization_0_0_1
build/torch24-cxx11-cu124-x86_64-linux/quantization/_quantization_0_0_1.abi3.so
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1b1e9f1b5beb9f5de558dcfcf2d61ecc4e207a723ba810605d70d9aa31e65df5
size 37028144
build/torch24-cxx98-cu118-x86_64-linux/quantization/__init__.py
ADDED
@@ -0,0 +1,44 @@
(File contents identical to build/torch24-cxx11-cu118-x86_64-linux/quantization/__init__.py above.)
build/torch24-cxx98-cu118-x86_64-linux/quantization/_ops.py
ADDED
@@ -0,0 +1,3 @@
import torch
from . import _quantization_0_0_1
ops = torch.ops._quantization_0_0_1
build/torch24-cxx98-cu118-x86_64-linux/quantization/_quantization_0_0_1.abi3.so
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6d5fa1dba02499bec6e2225c83e460ae0d8e7ca396763d2851f7e000be88e675
size 30940256
build/torch24-cxx98-cu121-x86_64-linux/quantization/__init__.py
ADDED
@@ -0,0 +1,44 @@
(File contents identical to build/torch24-cxx11-cu118-x86_64-linux/quantization/__init__.py above.)
build/torch24-cxx98-cu121-x86_64-linux/quantization/_ops.py
ADDED
@@ -0,0 +1,3 @@
import torch
from . import _quantization_0_0_1
ops = torch.ops._quantization_0_0_1
build/torch24-cxx98-cu121-x86_64-linux/quantization/_quantization_0_0_1.abi3.so
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1bd3556b97cd85c7282ded2057e59207575b92107e0a7e17fcffa82d27f42d26
size 36256840
build/torch24-cxx98-cu124-x86_64-linux/quantization/__init__.py
ADDED
@@ -0,0 +1,44 @@
(File contents identical to build/torch24-cxx11-cu118-x86_64-linux/quantization/__init__.py above.)
build/torch24-cxx98-cu124-x86_64-linux/quantization/_ops.py
ADDED
@@ -0,0 +1,3 @@
import torch
from . import _quantization_0_0_1
ops = torch.ops._quantization_0_0_1
build/torch24-cxx98-cu124-x86_64-linux/quantization/_quantization_0_0_1.abi3.so
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f7aa8e42070dfb49e64cfa204bbb81ab2750c2cfafacac6f8d4ae303285a735a
size 37027640
build/torch25-cxx11-cu118-x86_64-linux/quantization/__init__.py
ADDED
@@ -0,0 +1,44 @@
(File contents identical to build/torch24-cxx11-cu118-x86_64-linux/quantization/__init__.py above.)
build/torch25-cxx11-cu118-x86_64-linux/quantization/_ops.py
ADDED
@@ -0,0 +1,3 @@
import torch
from . import _quantization_0_0_1
ops = torch.ops._quantization_0_0_1
build/torch25-cxx11-cu118-x86_64-linux/quantization/_quantization_0_0_1.abi3.so
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2a9874fefa371528c54a12851771476c3baf9356536b88c960d7ec3dcf293469
size 30943736
build/torch25-cxx11-cu121-x86_64-linux/quantization/__init__.py
ADDED
@@ -0,0 +1,44 @@
(File contents identical to build/torch24-cxx11-cu118-x86_64-linux/quantization/__init__.py above.)
build/torch25-cxx11-cu121-x86_64-linux/quantization/_ops.py
ADDED
@@ -0,0 +1,3 @@
import torch
from . import _quantization_0_0_1
ops = torch.ops._quantization_0_0_1
build/torch25-cxx11-cu121-x86_64-linux/quantization/_quantization_0_0_1.abi3.so
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:554ac8e33120544e28fb91abab1f03b29e6665256c5acfec69137072575b7945
size 36253328
build/torch25-cxx11-cu124-x86_64-linux/quantization/__init__.py
ADDED
@@ -0,0 +1,44 @@
(File contents identical to build/torch24-cxx11-cu118-x86_64-linux/quantization/__init__.py above.)
build/torch25-cxx11-cu124-x86_64-linux/quantization/_ops.py
ADDED
@@ -0,0 +1,3 @@
import torch
from . import _quantization_0_0_1
ops = torch.ops._quantization_0_0_1
build/torch25-cxx11-cu124-x86_64-linux/quantization/_quantization_0_0_1.abi3.so
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:373d2248e3ffc236b6b86521dabe2604609e8646107eb3c2d74a9378490a8878
size 37028144
build/torch25-cxx98-cu118-x86_64-linux/quantization/__init__.py
ADDED
@@ -0,0 +1,44 @@
(File contents identical to build/torch24-cxx11-cu118-x86_64-linux/quantization/__init__.py above.)
build/torch25-cxx98-cu118-x86_64-linux/quantization/_ops.py
ADDED
@@ -0,0 +1,3 @@
import torch
from . import _quantization_0_0_1
ops = torch.ops._quantization_0_0_1
build/torch25-cxx98-cu118-x86_64-linux/quantization/_quantization_0_0_1.abi3.so
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7bf3ca9934fb5cf098acbd3e41eab1004f358b49c7c459b01649753dca258d97
size 30940256
build/torch25-cxx98-cu121-x86_64-linux/quantization/__init__.py
ADDED
@@ -0,0 +1,44 @@
(File contents identical to build/torch24-cxx11-cu118-x86_64-linux/quantization/__init__.py above.)
build/torch25-cxx98-cu121-x86_64-linux/quantization/_ops.py
ADDED
@@ -0,0 +1,3 @@
import torch
from . import _quantization_0_0_1
ops = torch.ops._quantization_0_0_1
build/torch25-cxx98-cu121-x86_64-linux/quantization/_quantization_0_0_1.abi3.so
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5582e29a80ea71e618ccdeb9ad49456d3ebf130ee4c3001b1904e73094460455
size 36256840
build/torch25-cxx98-cu124-x86_64-linux/quantization/__init__.py
ADDED
@@ -0,0 +1,44 @@
(File contents identical to build/torch24-cxx11-cu118-x86_64-linux/quantization/__init__.py above.)
build/torch25-cxx98-cu124-x86_64-linux/quantization/_ops.py
ADDED
@@ -0,0 +1,3 @@
import torch
from . import _quantization_0_0_1
ops = torch.ops._quantization_0_0_1
build/torch25-cxx98-cu124-x86_64-linux/quantization/_quantization_0_0_1.abi3.so
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0d16013abe29e9d3914a19078bf6195bf6a9402222fe48dd75e5a3827e081f49
size 37027640