diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md index f281979d..0fa979ae 100644 --- a/doc/excuter/op-mem-cuda/list.md +++ b/doc/excuter/op-mem-cuda/list.md @@ -4,98 +4,99 @@ ### arg -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| vecset | none | vecset(vector value)->(vector name) | [3 4 5]->shape | vecset(vector value)->(vector name) | -| argset | none | argset(var value)->(var name) | argvalue->argname | argset(var value)->(var name) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| vecset | none | [3 4 5]->shape | vecset(vector value)->(vector name) | +| argset | none | argvalue->argname | argset(var value)->(var name) | ### tensorlife -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| renametensor | none | renametensor(var new_name)->(tensor t) | rename(newname)->T1 | renametensor(var new_name)->(tensor t) | -| newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(vector shape)->(tensor tensor1) | -| newtensor | none | newtensor(var shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(var shape)->(tensor tensor1) | -| deltensor | none | deltensor()->(tensor t) | del->T1 | deltensor()->(tensor t) | -| copytensor | none | copytensor(tensor src)->(tensor dst) | T2.data = T1.data | copytensor(tensor src)->(tensor dst) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| renametensor | none | rename(newname)->T1 | renametensor(var new_name)->(tensor t) | +| newtensor | none | T1 = zeros(shape) | newtensor(vector shape)->(tensor tensor1) | +| newtensor | none | T1 = zeros(shape) | newtensor(var shape)->(tensor tensor1) | +| deltensor | none | del->T1 | deltensor()->(tensor t) | +| 
copytensor | none | T2.data = T1.data | copytensor(tensor src)->(tensor dst) | ### io -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| loadtensordata | none | loadtensordata(var path)->(tensor t) | loadtensordata(path)->tensor | loadtensordata(var path)->(tensor t) | -| save | none | save(tensor t, var path)->() | save(T1,path) | save(tensor t, var path)->() | -| print | miaobyte | print(tensor t)->() | print(T1) | print(tensor t)->() | -| print | miaobyte | print(tensor t, var format)->() | print(T1) | print(tensor t, var format)->() | -| load | none | load(var path)->() | load(path) | load(var path)->() | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| loadtensordata | none | loadtensordata(path)->tensor | loadtensordata(var path)->(tensor t) | +| save | none | save(T1,path) | save(tensor t, var path)->() | +| print | miaobyte | print(T1) | print(tensor t)->() | +| print | miaobyte | print(T1) | print(tensor t, var format)->() | +| load | none | load(path) | load(var path)->() | ### matmul -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| matmul | cublas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| matmul | cublas | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | ### init -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| normal | miaobyte | normal(var mean, var stddev, var seed)->(tensor t) | normal(mean,stddev,seed)->T1 | normal(var mean, var stddev, var seed)->(tensor t) | -| uniform | miaobyte | uniform(var low, var high, var seed)->(tensor t) | 
uniform(low,high,seed)->T1 | uniform(var low, var high, var seed)->(tensor t) | -| arange | miaobyte | arange(var start, var step)->(tensor t) | arange(start,step)->T1 | arange(var start, var step)->(tensor t) | -| constant | miaobyte | constant(var value)->(tensor t) | constant(value)->T1 | constant(var value)->(tensor t) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| normal | miaobyte | normal(mean,stddev,seed)->T1 | normal(var mean, var stddev, var seed)->(tensor t) | +| uniform | miaobyte | uniform(low,high,seed)->T1 | uniform(var low, var high, var seed)->(tensor t) | +| arange | miaobyte | arange(start,step)->T1 | arange(var start, var step)->(tensor t) | +| constant | miaobyte | constant(value)->T1 | constant(var value)->(tensor t) | ### elementwise -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| switch | miaobyte | switch(listtensor tensors, tensor cases)->(tensor result) | C=switch(tensors,cases) | switch(listtensor tensors, tensor cases)->(tensor result) | -| greaterscalar | miaobyte | greaterscalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1, scalar) | greaterscalar(tensor A, var scalar)->(tensor mask) | -| equalscalar | miaobyte | equalscalar(tensor A, var scalar, var epsilon)->(tensor mask) | mask=compare(T1, scalar) | equalscalar(tensor A, var scalar, var epsilon)->(tensor mask) | -| min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1, T2) | min(tensor A, tensor B)->(tensor C) | -| maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1, scalar) | maxscalar(tensor A, var scalar)->(tensor C) | -| tan | miaobyte | tan(tensor A)->(tensor C) | T3=tan(T1) | tan(tensor A)->(tensor C) | -| sin | miaobyte | sin(tensor A)->(tensor C) | T3=sin(T1) | sin(tensor A)->(tensor C) | -| divscalar | miaobyte | divscalar(tensor A, var scalar)->(tensor C) | 
T3=scalar/T1 | divscalar(tensor A, var scalar)->(tensor C) | -| log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | -| addscalar | miaobyte | addscalar(tensor A, var b)->(tensor C) | T3=T1+scalar | addscalar(tensor A, var b)->(tensor C) | -| greater | miaobyte | greater(tensor A, tensor B)->(tensor mask) | mask=compare(T1, T2) | greater(tensor A, tensor B)->(tensor mask) | -| lessscalar | miaobyte | lessscalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1, scalar) | lessscalar(tensor A, var scalar)->(tensor mask) | -| cos | miaobyte | cos(tensor A)->(tensor C) | T3=cos(T1) | cos(tensor A)->(tensor C) | -| less | miaobyte | less(tensor A, tensor B)->(tensor mask) | mask=compare(T1, T2) | less(tensor A, tensor B)->(tensor mask) | -| powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=pow(T1, scalar) | powscalar(tensor A, var scalar)->(tensor C) | -| minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1, scalar) | minscalar(tensor A, var scalar)->(tensor C) | -| rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | -| rpowscalar | miaobyte | rpowscalar(var scalar, tensor A)->(tensor C) | T3=pow(scalar, T1) | rpowscalar(var scalar, tensor A)->(tensor C) | -| add | cublas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | -| add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | -| sub | miaobyte | sub(tensor A, tensor B)->(tensor C) | T3=T1-T2 | sub(tensor A, tensor B)->(tensor C) | -| sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | -| subscalar | miaobyte | subscalar(tensor A, var b)->(tensor C) | T3=T1-scalar | subscalar(tensor A, var b)->(tensor C) | -| exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | -| mul | miaobyte | mul(tensor A, 
tensor B)->(tensor C) | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) | -| equal | miaobyte | equal(tensor A, tensor B, var epsilon)->(tensor mask) | mask=compare(T1, T2) | equal(tensor A, tensor B, var epsilon)->(tensor mask) | -| mulscalar | miaobyte | mulscalar(tensor A, var b)->(tensor C) | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | -| div | miaobyte | div(tensor A, tensor B)->(tensor C) | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | -| invert | miaobyte | invert(tensor A)->(tensor C) | T3=~T1 | invert(tensor A)->(tensor C) | -| max | miaobyte | max(tensor A, tensor B)->(tensor C) | T3=max(T1, T2) | max(tensor A, tensor B)->(tensor C) | -| pow | miaobyte | pow(tensor A, tensor B)->(tensor C) | T3=pow(T1, T2) | pow(tensor A, tensor B)->(tensor C) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| switch | miaobyte | C=switch(tensors,cases) | switch(listtensor tensors, tensor cases)->(tensor result) | +| greaterscalar | miaobyte | mask=compare(T1, scalar) | greaterscalar(tensor A, var scalar)->(tensor mask) | +| equalscalar | miaobyte | mask=compare(T1, scalar) | equalscalar(tensor A, var scalar, var epsilon)->(tensor mask) | +| min | miaobyte | T3=min(T1, T2) | min(tensor A, tensor B)->(tensor C) | +| maxscalar | miaobyte | T3=max(T1, scalar) | maxscalar(tensor A, var scalar)->(tensor C) | +| tan | miaobyte | T3=tan(T1) | tan(tensor A)->(tensor C) | +| sin | miaobyte | T3=sin(T1) | sin(tensor A)->(tensor C) | +| divscalar | miaobyte | T3=scalar/T1 | divscalar(tensor A, var scalar)->(tensor C) | +| log | miaobyte | T3=log(T1) | log(tensor A)->(tensor C) | +| addscalar | miaobyte | T3=T1+scalar | addscalar(tensor A, var b)->(tensor C) | +| greater | miaobyte | mask=compare(T1, T2) | greater(tensor A, tensor B)->(tensor mask) | +| lessscalar | miaobyte | mask=compare(T1, scalar) | lessscalar(tensor A, var scalar)->(tensor mask) | +| cos | miaobyte | T3=cos(T1) | cos(tensor 
A)->(tensor C) | +| less | miaobyte | mask=compare(T1, T2) | less(tensor A, tensor B)->(tensor mask) | +| powscalar | miaobyte | T3=pow(T1, scalar) | powscalar(tensor A, var scalar)->(tensor C) | +| minscalar | miaobyte | T3=min(T1, scalar) | minscalar(tensor A, var scalar)->(tensor C) | +| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | +| rpowscalar | miaobyte | T3=pow(scalar, T1) | rpowscalar(var scalar, tensor A)->(tensor C) | +| todtype | none | T3(dtypeA)->T1(dtypeB) | todtype(tensor a)->(tensor b) | +| add | cublas | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | +| add | miaobyte | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | +| sub | miaobyte | T3=T1-T2 | sub(tensor A, tensor B)->(tensor C) | +| sqrt | miaobyte | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | +| subscalar | miaobyte | T3=T1-scalar | subscalar(tensor A, var b)->(tensor C) | +| exp | miaobyte | T3=exp(T1) | exp(tensor A)->(tensor C) | +| mul | miaobyte | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) | +| equal | miaobyte | mask=compare(T1, T2) | equal(tensor A, tensor B, var epsilon)->(tensor mask) | +| mulscalar | miaobyte | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | +| div | miaobyte | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | +| invert | miaobyte | T3=~T1 | invert(tensor A)->(tensor C) | +| max | miaobyte | T3=max(T1, T2) | max(tensor A, tensor B)->(tensor C) | +| pow | miaobyte | T3=pow(T1, T2) | pow(tensor A, tensor B)->(tensor C) | ### reduce -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| prod | miaobyte | prod(tensor A, vector dims, var keepdims)->(tensor B) | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor A, vector dims, var keepdims)->(tensor B) | -| reducemax | miaobyte | reducemax(tensor A, vector dims, var keepdims)->(tensor B) | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor A, vector dims, var 
keepdims)->(tensor B) | -| sum | miaobyte | sum(tensor A, vector dims, var keepdims)->(tensor B) | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor A, vector dims, var keepdims)->(tensor B) | -| reducemin | miaobyte | reducemin(tensor A, vector dims, var keepdims)->(tensor B) | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor A, vector dims, var keepdims)->(tensor B) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| prod | miaobyte | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor A, vector dims, var keepdims)->(tensor B) | +| reducemax | miaobyte | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor A, vector dims, var keepdims)->(tensor B) | +| sum | miaobyte | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor A, vector dims, var keepdims)->(tensor B) | +| reducemin | miaobyte | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor A, vector dims, var keepdims)->(tensor B) | ### changeshape -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| indexselect | miaobyte | indexselect(tensor A, tensor indices, var axis)->(tensor B) | T2 = T1.indexselect(index=[1,2], axis=1) | indexselect(tensor A, tensor indices, var axis)->(tensor B) | -| broadcastTo | miaobyte | broadcastTo(tensor A, vector new_shape)->(tensor B) | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor A, vector new_shape)->(tensor B) | -| concat | miaobyte | concat(listtensor tensors, var dim)->(tensor result) | Tresult = concat([T1, T2...], axis=3) | concat(listtensor tensors, var dim)->(tensor result) | -| transpose | miaobyte | transpose(tensor A, vector dim_order)->(tensor C) | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor A, vector dim_order)->(tensor C) | -| reshape | miaobyte | reshape(tensor A, vector shape)->(tensor B) | T1.reshape(shape)->T2 | reshape(tensor A, vector 
shape)->(tensor B) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| indexselect | miaobyte | T2 = T1.indexselect(index=[1,2], axis=1) | indexselect(tensor A, tensor indices, var axis)->(tensor B) | +| broadcastTo | miaobyte | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor A, vector new_shape)->(tensor B) | +| concat | miaobyte | Tresult = concat([T1, T2...], axis=3) | concat(listtensor tensors, var dim)->(tensor result) | +| transpose | miaobyte | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor A, vector dim_order)->(tensor C) | +| reshape | miaobyte | T1.reshape(shape)->T2 | reshape(tensor A, vector shape)->(tensor B) | diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md index 8bcfcdb0..7d676562 100644 --- a/doc/excuter/op-mem-ompsimd/list.md +++ b/doc/excuter/op-mem-ompsimd/list.md @@ -4,96 +4,97 @@ ### arg -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| vecset | none | vecset(vector value)->(vector name) | [3 4 5]->shape | vecset(vector value)->(vector name) | -| argset | none | argset(var value)->(var name) | argvalue->argname | argset(var value)->(var name) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| vecset | none | [3 4 5]->shape | vecset(vector value)->(vector name) | +| argset | none | argvalue->argname | argset(var value)->(var name) | ### tensorlife -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| renametensor | none | renametensor(var new_name)->(tensor t) | rename(newname)->T1 | renametensor(var new_name)->(tensor t) | -| newtensor | none | newtensor(vector shape)->(tensor t) | T1 =Tensor(shape=[...]) | newtensor(vector shape)->(tensor t) | -| newtensor | none | newtensor(var 
shape)->(tensor t) | T1 =Tensor(shape=[...]) | newtensor(var shape)->(tensor t) | -| deltensor | none | deltensor()->(tensor t) | del->T1 | deltensor()->(tensor t) | -| copytensor | none | copytensor(tensor src)->(tensor dst) | T1.data->T2.data | copytensor(tensor src)->(tensor dst) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| renametensor | none | rename(newname)->T1 | renametensor(var new_name)->(tensor t) | +| newtensor | none | T1 =Tensor(shape=[...]) | newtensor(vector shape)->(tensor t) | +| newtensor | none | T1 =Tensor(shape=[...]) | newtensor(var shape)->(tensor t) | +| deltensor | none | del->T1 | deltensor()->(tensor t) | +| copytensor | none | T1.data->T2.data | copytensor(tensor src)->(tensor dst) | ### io -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| loadtensordata | none | loadtensordata(var path)->(tensor t) | loadtensordata(path)->tensor.data | loadtensordata(var path)->(tensor t) | -| save | none | save(tensor t, var path)->() | save(T1,path) | save(tensor t, var path)->() | -| print | miaobyte | print(tensor t)->() | print(T1) | print(tensor t)->() | -| print | miaobyte | print(tensor t, var format)->() | print(T1) | print(tensor t, var format)->() | -| load | none | load(var path)->() | mem.load(path) | load(var path)->() | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| loadtensordata | none | loadtensordata(path)->tensor.data | loadtensordata(var path)->(tensor t) | +| save | none | save(T1,path) | save(tensor t, var path)->() | +| print | miaobyte | print(T1) | print(tensor t)->() | +| print | miaobyte | print(T1) | print(tensor t, var format)->() | +| load | none | mem.load(path) | load(var path)->() | ### matmul -| Operation | Author | Func Def | Math Formula | IR Instruction | 
-|-----------|--------|------------|--------------|----------------| -| matmul | cblas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | -| matmul | miaobyte | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| matmul | cblas | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | +| matmul | miaobyte | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | ### init -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| normal | miaobyte | normal(var mean, var std, var seed)->(tensor t) | normal(mean,stddev,seed)->T1 | normal(var mean, var std, var seed)->(tensor t) | -| uniform | miaobyte | uniform(var low, var high, var seed)->(tensor t) | uniform(low,high,seed)->T1 | uniform(var low, var high, var seed)->(tensor t) | -| arange | miaobyte | arange(var start, var step)->(tensor t) | arange(start,step)->T1 | arange(var start, var step)->(tensor t) | -| constant | miaobyte | constant(var value)->(tensor t) | constant(value)->T1 | constant(var value)->(tensor t) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| normal | miaobyte | normal(mean,stddev,seed)->T1 | normal(var mean, var std, var seed)->(tensor t) | +| uniform | miaobyte | uniform(low,high,seed)->T1 | uniform(var low, var high, var seed)->(tensor t) | +| arange | miaobyte | arange(start,step)->T1 | arange(var start, var step)->(tensor t) | +| constant | miaobyte | constant(value)->T1 | constant(var value)->(tensor t) | ### elementwise -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| switch | miaobyte | switch(listtensor tensors, tensor cases)->(tensor C) | 
C=switch([tensors],case) | switch(listtensor tensors, tensor cases)->(tensor C) | -| greaterscalar | miaobyte | greaterscalar(tensor A, var scalar)->(tensor mask) | mask=greater(T1,scalar) | greaterscalar(tensor A, var scalar)->(tensor mask) | -| equalscalar | miaobyte | equalscalar(tensor A, var scalar)->(tensor mask) | mask=equal(T1,scalar) | equalscalar(tensor A, var scalar)->(tensor mask) | -| min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1,T2) | min(tensor A, tensor B)->(tensor C) | -| maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1,scalar) | maxscalar(tensor A, var scalar)->(tensor C) | -| divscalar | miaobyte | divscalar(tensor A, var scalar)->(tensor C) | T3=T1/scalar | divscalar(tensor A, var scalar)->(tensor C) | -| log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | -| addscalar | miaobyte | addscalar(tensor a, var scalar)->(tensor c) | T3=T1+scalar | addscalar(tensor a, var scalar)->(tensor c) | -| greater | miaobyte | greater(tensor A, tensor B)->(tensor mask) | mask=greater(T1,T2) | greater(tensor A, tensor B)->(tensor mask) | -| lessscalar | miaobyte | lessscalar(tensor A, var scalar)->(tensor mask) | mask=less(T1,scalar) | lessscalar(tensor A, var scalar)->(tensor mask) | -| less | miaobyte | less(tensor A, tensor B)->(tensor mask) | mask=less(T1,T2) | less(tensor A, tensor B)->(tensor mask) | -| powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=T1^scalar | powscalar(tensor A, var scalar)->(tensor C) | -| minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1,scalar) | minscalar(tensor A, var scalar)->(tensor C) | -| rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | -| rpowscalar | miaobyte | rpowscalar(var scalar, tensor A)->(tensor C) | T3=scalar^T1 | rpowscalar(var scalar, tensor A)->(tensor C) | -| add | cblas | add(tensor a, tensor 
b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | -| add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | -| sub | miaobyte | sub(tensor a, tensor b)->(tensor c) | T3=T1-T2 | sub(tensor a, tensor b)->(tensor c) | -| sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | -| subscalar | miaobyte | subscalar(tensor a, var scalar)->(tensor c) | T3=T1-scalar | subscalar(tensor a, var scalar)->(tensor c) | -| exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | -| mul | miaobyte | mul(tensor A, tensor B)->(tensor C) | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) | -| equal | miaobyte | equal(tensor A, tensor B)->(tensor mask) | mask=equal(T1,T2) | equal(tensor A, tensor B)->(tensor mask) | -| mulscalar | miaobyte | mulscalar(tensor A, var b)->(tensor C) | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | -| div | miaobyte | div(tensor A, tensor B)->(tensor C) | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | -| invert | miaobyte | invert(tensor A)->(tensor C) | T3=~T1 | invert(tensor A)->(tensor C) | -| max | miaobyte | max(tensor A, tensor B)->(tensor C) | T3=max(T1,T2) | max(tensor A, tensor B)->(tensor C) | -| pow | miaobyte | pow(tensor A, tensor B)->(tensor C) | T3=T1^T2 | pow(tensor A, tensor B)->(tensor C) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| switch | miaobyte | C=switch([tensors],case) | switch(listtensor tensors, tensor cases)->(tensor C) | +| greaterscalar | miaobyte | mask=greater(T1,scalar) | greaterscalar(tensor A, var scalar)->(tensor mask) | +| equalscalar | miaobyte | mask=equal(T1,scalar) | equalscalar(tensor A, var scalar)->(tensor mask) | +| min | miaobyte | T3=min(T1,T2) | min(tensor A, tensor B)->(tensor C) | +| maxscalar | miaobyte | T3=max(T1,scalar) | maxscalar(tensor A, var scalar)->(tensor C) | +| divscalar | miaobyte | 
T3=T1/scalar | divscalar(tensor A, var scalar)->(tensor C) | +| log | miaobyte | T3=log(T1) | log(tensor A)->(tensor C) | +| addscalar | miaobyte | T3=T1+scalar | addscalar(tensor a, var scalar)->(tensor c) | +| greater | miaobyte | mask=greater(T1,T2) | greater(tensor A, tensor B)->(tensor mask) | +| lessscalar | miaobyte | mask=less(T1,scalar) | lessscalar(tensor A, var scalar)->(tensor mask) | +| less | miaobyte | mask=less(T1,T2) | less(tensor A, tensor B)->(tensor mask) | +| powscalar | miaobyte | T3=T1^scalar | powscalar(tensor A, var scalar)->(tensor C) | +| minscalar | miaobyte | T3=min(T1,scalar) | minscalar(tensor A, var scalar)->(tensor C) | +| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | +| rpowscalar | miaobyte | T3=scalar^T1 | rpowscalar(var scalar, tensor A)->(tensor C) | +| todtype | none | T3(dtypeA)->T1(dtypeB) | todtype(tensor A)->(tensor C) | +| add | cblas | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | +| add | miaobyte | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | +| sub | miaobyte | T3=T1-T2 | sub(tensor a, tensor b)->(tensor c) | +| sqrt | miaobyte | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | +| subscalar | miaobyte | T3=T1-scalar | subscalar(tensor a, var scalar)->(tensor c) | +| exp | miaobyte | T3=exp(T1) | exp(tensor A)->(tensor C) | +| mul | miaobyte | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) | +| equal | miaobyte | mask=equal(T1,T2) | equal(tensor A, tensor B)->(tensor mask) | +| mulscalar | miaobyte | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | +| div | miaobyte | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | +| invert | miaobyte | T3=~T1 | invert(tensor A)->(tensor C) | +| max | miaobyte | T3=max(T1,T2) | max(tensor A, tensor B)->(tensor C) | +| pow | miaobyte | T3=T1^T2 | pow(tensor A, tensor B)->(tensor C) | ### reduce -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| prod | 
miaobyte | prod(tensor A, vector axis, var keepdims)->(tensor B) | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor A, vector axis, var keepdims)->(tensor B) | -| reducemax | miaobyte | reducemax(tensor A, vector axis, var keepdims)->(tensor B) | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor A, vector axis, var keepdims)->(tensor B) | -| sum | miaobyte | sum(tensor A, vector axis, var keepdims)->(tensor B) | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor A, vector axis, var keepdims)->(tensor B) | -| reducemin | miaobyte | reducemin(tensor A, vector axis, var keepdims)->(tensor B) | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor A, vector axis, var keepdims)->(tensor B) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| prod | miaobyte | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor A, vector axis, var keepdims)->(tensor B) | +| reducemax | miaobyte | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor A, vector axis, var keepdims)->(tensor B) | +| sum | miaobyte | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor A, vector axis, var keepdims)->(tensor B) | +| reducemin | miaobyte | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor A, vector axis, var keepdims)->(tensor B) | ### changeshape -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| indexselect | miaobyte | indexselect(tensor A, tensor index, var axis)->(tensor B) | T2 = T1.indexselect(index=T3, axis=3) | indexselect(tensor A, tensor index, var axis)->(tensor B) | -| broadcastTo | miaobyte | broadcastTo(tensor A, vector new_shape)->(tensor B) | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor A, vector new_shape)->(tensor B) | -| concat | miaobyte | concat(listtensor tensors, var dim)->(tensor result) | Tresult = concat([T1, T2...], axis=3) | concat(listtensor 
tensors, var dim)->(tensor result) | -| transpose | miaobyte | transpose(tensor A, vector dim_order)->(tensor C) | T1.transpose(dimorder=[1,0])->T2 | transpose(tensor A, vector dim_order)->(tensor C) | -| reshape | miaobyte | reshape(tensor A, vector shape)->(tensor B) | T1.reshape(shape)->T2 | reshape(tensor A, vector shape)->(tensor B) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| indexselect | miaobyte | T2 = T1.indexselect(index=T3, axis=3) | indexselect(tensor A, tensor index, var axis)->(tensor B) | +| broadcastTo | miaobyte | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor A, vector new_shape)->(tensor B) | +| concat | miaobyte | Tresult = concat([T1, T2...], axis=3) | concat(listtensor tensors, var dim)->(tensor result) | +| transpose | miaobyte | T1.transpose(dimorder=[1,0])->T2 | transpose(tensor A, vector dim_order)->(tensor C) | +| reshape | miaobyte | T1.reshape(shape)->T2 | reshape(tensor A, vector shape)->(tensor B) | diff --git a/excuter/cpp-common/src/deepx/shape.hpp b/excuter/cpp-common/src/deepx/shape.hpp index b314e891..482142cd 100644 --- a/excuter/cpp-common/src/deepx/shape.hpp +++ b/excuter/cpp-common/src/deepx/shape.hpp @@ -66,7 +66,7 @@ namespace deepx // rangeParallel 支持omp,但omp内无需线程local变量 void rangeParallel(int dimCount, std::function &indices)> func) const; - void rangeParallel(int dimCount, std::function func) const; + void rangeElementwiseParallel( std::function func) const; void rangeParallel(int dimCount, std::function &indices)> func) const; // 支持omp,但omp内需要线程local变量 diff --git a/excuter/cpp-common/src/deepx/shape_range.cpp b/excuter/cpp-common/src/deepx/shape_range.cpp index 1f7fad54..45d12320 100644 --- a/excuter/cpp-common/src/deepx/shape_range.cpp +++ b/excuter/cpp-common/src/deepx/shape_range.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include "deepx/shape.hpp" @@ -113,18 +114,24 @@ namespace deepx } } } - void 
Shape::rangeParallel(int dimCount, std::function func) const - { - dimCount = checkdim(dimCount, dim()); - int stride = checkStride(dimCount, shape); - - // 计算总循环次数 - int total = size / stride; - -#pragma omp parallel for - for (int idx = 0; idx < total; idx++) + void Shape::rangeElementwiseParallel(std::function func) const + { + int num_threads = std::thread::hardware_concurrency(); + int alignblock=size/num_threads; + const int minblock=256; + if (alignblock size) { + end = size; + } + func(idx,end); } } diff --git a/excuter/cpp-common/src/deepx/tensor.hpp b/excuter/cpp-common/src/deepx/tensor.hpp index d49f12f8..9e46b222 100644 --- a/excuter/cpp-common/src/deepx/tensor.hpp +++ b/excuter/cpp-common/src/deepx/tensor.hpp @@ -26,6 +26,7 @@ namespace deepx DeleteFn deleter; // 释放内存 using CopyFn = void (*)(T *, T *, int); + //copyer(src, dest, size) CopyFn copyer; // 拷贝内存 using SaveFn = void (*)(T *,size_t,const std::string &); diff --git a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp index ca44fd13..6e3b2072 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp @@ -6,6 +6,10 @@ namespace deepx::tensorfunc { + //todtype + template + void todtype(const Tensor &input, Tensor &output); + template struct addDispatcher { diff --git a/excuter/cpp-common/src/deepx/tf/tffactory.cpp b/excuter/cpp-common/src/deepx/tf/tffactory.cpp index 48dd52bb..c57adf51 100644 --- a/excuter/cpp-common/src/deepx/tf/tffactory.cpp +++ b/excuter/cpp-common/src/deepx/tf/tffactory.cpp @@ -103,15 +103,14 @@ namespace deepx::tf // 为每个tftype生成一个表格 for (const auto &[tftype, tfs] : tf_by_type) { ss << "### " << tftype << "\n\n"; - ss << "| Operation | Author | Func Def | Math Formula | IR Instruction |\n"; - ss << "|-----------|--------|------------|--------------|----------------|\n"; + ss << "| Operation | Author | Math Formula | IR Instruction |\n"; + ss 
<< "|-----------|--------|--------------|----------------|\n"; for (const auto &tf : tfs) { ss << "| " << tf->name << " | "; ss << (tf->metadata.author.empty() ? " none " : tf->metadata.author) << " | "; - ss << tf->to_string(false, true) << " | "; ss << tf->math_formula() << " | "; - ss << tf->to_string(false, true) << " |\n"; + ss << stdutil::escape_markdown(tf->to_string(false, true)) << " |\n"; } ss << "\n"; diff --git a/excuter/cpp-common/src/stdutil/string.cpp b/excuter/cpp-common/src/stdutil/string.cpp index d254d349..3af914b0 100644 --- a/excuter/cpp-common/src/stdutil/string.cpp +++ b/excuter/cpp-common/src/stdutil/string.cpp @@ -8,9 +8,49 @@ namespace stdutil str.erase(str.find_last_not_of(" ") + 1); } - void trim(string &str,const string &chars) + void trim(string &str, const string &chars) { str.erase(0, str.find_first_not_of(chars)); str.erase(str.find_last_not_of(chars) + 1); } + + string escape_markdown(const string &str) + { + std::string result; + for (char c : str) + { + switch (c) + { + case '\\': + result += "\\\\"; + break; + case '\"': + result += "\\\""; + break; + case '\'': + result += "\\\'"; + break; + case '\n': + result += "\\n"; + break; + case '\t': + result += "\\t"; + break; + case '\r': + result += "\\r"; + break; + case '\b': + result += "\\b"; + break; + case '\f': + result += "\\f"; + break; + default: + // 普通字符直接添加 + result += c; + } + } + return result; + } + } // namespace stdutil \ No newline at end of file diff --git a/excuter/cpp-common/src/stdutil/string.hpp b/excuter/cpp-common/src/stdutil/string.hpp index 1b353ded..76c9c0c7 100644 --- a/excuter/cpp-common/src/stdutil/string.hpp +++ b/excuter/cpp-common/src/stdutil/string.hpp @@ -10,6 +10,7 @@ namespace stdutil void trimspace(string &str); void trim(string &str,const string &chars=" \t\n\r\f\v"); + string escape_markdown(const string &str); } diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp index c98748e2..4aeaa5ad 100644 --- 
a/excuter/op-mem-cuda/src/client/tfs.cpp +++ b/excuter/op-mem-cuda/src/client/tfs.cpp @@ -164,6 +164,18 @@ namespace deepx::tf // elementwise void register_elementwise(TfFactory &tffactory) { + //todtype + tffactory.add_tf(std::make_shared(vector( + { + Param("a", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("b", DataCategory::Tensor, Precision::Any), + }))); + + + // add tffactory.add_tf(std::make_shared>(vector( { Param("a", DataCategory::Tensor, Precision::Any), diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu index bc97ba5f..5ef7ea81 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu @@ -369,33 +369,38 @@ namespace deepx::tensorfunc // indexselect template __host__ __device__ void fromIndexselectIndices( - const int *output_indices,const int outputDim, // 输出张量的索引 - const GatherAxisT *indices,const int *indicesStrides,const int indicesDim, //indices是tensor - int *index_indices, - const int gatherAxis, // gather操作的轴 - int *input_indices,const int inputDim){ + const int *output_indices, const int outputDim, // 输出张量的索引 + const GatherAxisT *index, const int *indexStrides, const int indexDim, // index是tensor + int *index_indices, + const int gatherAxis, // gather操作的轴 + int *input_indices, const int inputDim) + { for (int i = 0; i < gatherAxis; ++i) { input_indices[i] = output_indices[i]; } - for (int i = gatherAxis; i < gatherAxis + indicesDim; ++i) + for (int i = gatherAxis; i < gatherAxis + indexDim; ++i) { index_indices[i - gatherAxis] = output_indices[i]; } // 使用indices张量中对应位置的值来替换gatherAxis维度的索引 - int indices_idx = linearAt(indicesStrides, indicesDim, index_indices); - input_indices[gatherAxis] = indices[indices_idx]; - for (int i = gatherAxis +indicesDim; i < outputDim; ++i) + int index_idx = linearAt(indexStrides, indexDim, index_indices); + 
input_indices[gatherAxis] = index[index_idx]; + // for (int i = gatherAxis +indicesDim; i < outputDim; ++i) + // { + // input_indices[gatherAxis+1+i] = output_indices[i]; + // } + for (int i = 0; i < outputDim - (gatherAxis + indexDim); ++i) { - input_indices[gatherAxis+1+i] = output_indices[i]; + input_indices[gatherAxis + 1 + i] = output_indices[gatherAxis + indexDim + i]; } } template __global__ void indexselect_kernel( const T *input, const int *inputStrides, const int inputDim, - const GatherAxisT *indices, const int *indicesStrides, const int indicesDim, + const GatherAxisT *index, const int *indexStrides, const int indexDim, const int gatherAxis, T *output, const int *outputStrides, const int outputDim, const int outputlen) { @@ -410,11 +415,11 @@ namespace deepx::tensorfunc // 输入索引 int index_indices[DIM]; int input_indices[DIM]; - fromIndexselectIndices(output_indices,outputDim, - indices, indicesStrides, indicesDim, - index_indices, - gatherAxis, - input_indices, inputDim); + fromIndexselectIndices(output_indices, outputDim, + index, indexStrides, indexDim, + index_indices, + gatherAxis, + input_indices, inputDim); int inputIdx = linearAt(inputStrides, inputDim, input_indices); int outputIdx = linearAt(outputStrides, outputDim, output_indices); output[outputIdx] = input[inputIdx]; @@ -424,7 +429,7 @@ namespace deepx::tensorfunc template void launch_indexselect( const T *input, const int *inputStrides, const int inputDim, - const GatherAxisT *indices, const int *indicesStrides, const int indicesDim, + const GatherAxisT *index, const int *indexStrides, const int indexDim, const int gatherAxis, T *output, const int *outputStrides, const int outputDim, const int outputlen) { @@ -432,7 +437,7 @@ namespace deepx::tensorfunc auto [numBlocks, blockSize] = BestDims(outputlen); // indices - cudaVector indicesStrides_d(indicesStrides, indicesDim, cudaMemcpyHostToDevice); + cudaVector indexStrides_d(indexStrides, indexDim, cudaMemcpyHostToDevice); // input cudaVector 
inputStrides_d(inputStrides, inputDim, cudaMemcpyHostToDevice); @@ -440,46 +445,46 @@ namespace deepx::tensorfunc // output cudaVector outputStrides_d(outputStrides, outputDim, cudaMemcpyHostToDevice); - //TODO 这里可能会导致寄存器浪费,但是,搞太多模板T,模板实例化不好搞 - int dim=std::max(inputDim,indicesDim); - dim=std::max(dim,outputDim); + // TODO 这里可能会导致寄存器浪费,但是,搞太多模板T,模板实例化不好搞 + int dim = std::max(inputDim, indexDim); + dim = std::max(dim, outputDim); switch (dim) { case 1: - indexselect_kernel<1, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); + indexselect_kernel<1, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, index, indexStrides_d.data, indexDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); break; case 2: - indexselect_kernel<2, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); + indexselect_kernel<2, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, index, indexStrides_d.data, indexDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); break; case 3: - indexselect_kernel<3, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); + indexselect_kernel<3, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, index, indexStrides_d.data, indexDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); break; case 4: - indexselect_kernel<4, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); + indexselect_kernel<4, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, index, indexStrides_d.data, indexDim, gatherAxis, output, outputStrides_d.data, outputDim, 
outputlen); break; case 5: - indexselect_kernel<5, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); + indexselect_kernel<5, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, index, indexStrides_d.data, indexDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); break; case 6: - indexselect_kernel<6, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); + indexselect_kernel<6, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, index, indexStrides_d.data, indexDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); break; case 7: - indexselect_kernel<7, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); + indexselect_kernel<7, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, index, indexStrides_d.data, indexDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); break; case 8: - indexselect_kernel<8, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); + indexselect_kernel<8, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, index, indexStrides_d.data, indexDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); break; case 9: - indexselect_kernel<9, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); + indexselect_kernel<9, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, index, indexStrides_d.data, indexDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); break; case 10: - 
indexselect_kernel<10, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); + indexselect_kernel<10, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, index, indexStrides_d.data, indexDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); break; case 11: - indexselect_kernel<11, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); + indexselect_kernel<11, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, index, indexStrides_d.data, indexDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); break; case 12: - indexselect_kernel<12, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); + indexselect_kernel<12, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, index, indexStrides_d.data, indexDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); break; default: throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM)); @@ -491,71 +496,70 @@ namespace deepx::tensorfunc } } template void launch_indexselect(const double *input, const int *inputStrides, const int inputDim, - const int64_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - double *output, const int *outputStrides, const int outputDim, const int outputlen); + const int64_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + double *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect(const float *input, const int *inputStrides, const int inputDim, - const int64_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - float *output, const int 
*outputStrides, const int outputDim, const int outputlen); + const int64_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + float *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect(const nv_bfloat16 *input, const int *inputStrides, const int inputDim, - const int64_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - nv_bfloat16 *output, const int *outputStrides, const int outputDim, const int outputlen); + const int64_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + nv_bfloat16 *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect<__half, int64_t>(const __half *input, const int *inputStrides, const int inputDim, - const int64_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - __half *output, const int *outputStrides, const int outputDim, const int outputlen); + const int64_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + __half *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect(const int64_t *input, const int *inputStrides, const int inputDim, - const int64_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - int64_t *output, const int *outputStrides, const int outputDim, const int outputlen); + const int64_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + int64_t *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect(const int32_t *input, const int *inputStrides, const int inputDim, - const int64_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - int32_t *output, const int *outputStrides, const int outputDim, const int outputlen); + const int64_t *index, const int 
*indexStrides, const int indexDim, + const int gatherAxis, + int32_t *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect(const int16_t *input, const int *inputStrides, const int inputDim, - const int64_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - int16_t *output, const int *outputStrides, const int outputDim, const int outputlen); + const int64_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + int16_t *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect(const int8_t *input, const int *inputStrides, const int inputDim, - const int64_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - int8_t *output, const int *outputStrides, const int outputDim, const int outputlen); + const int64_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + int8_t *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect(const double *input, const int *inputStrides, const int inputDim, - const int32_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - double *output, const int *outputStrides, const int outputDim, const int outputlen); + const int32_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + double *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect(const float *input, const int *inputStrides, const int inputDim, - const int32_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - float *output, const int *outputStrides, const int outputDim, const int outputlen); + const int32_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + float *output, const int *outputStrides, const int outputDim, const int 
outputlen); template void launch_indexselect(const nv_bfloat16 *input, const int *inputStrides, const int inputDim, - const int32_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - nv_bfloat16 *output, const int *outputStrides, const int outputDim, const int outputlen); + const int32_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + nv_bfloat16 *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect<__half, int32_t>(const __half *input, const int *inputStrides, const int inputDim, - const int32_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - __half *output, const int *outputStrides, const int outputDim, const int outputlen); + const int32_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + __half *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect(const int64_t *input, const int *inputStrides, const int inputDim, - const int32_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - int64_t *output, const int *outputStrides, const int outputDim, const int outputlen); + const int32_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + int64_t *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect(const int32_t *input, const int *inputStrides, const int inputDim, - const int32_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - int32_t *output, const int *outputStrides, const int outputDim, const int outputlen); + const int32_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + int32_t *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect(const int16_t *input, const int *inputStrides, const int 
inputDim, - const int32_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - int16_t *output, const int *outputStrides, const int outputDim, const int outputlen); + const int32_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + int16_t *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect(const int8_t *input, const int *inputStrides, const int inputDim, - const int32_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - int8_t *output, const int *outputStrides, const int outputDim, const int outputlen); + const int32_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + int8_t *output, const int *outputStrides, const int outputDim, const int outputlen); } - #endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp index 6fce29d0..d9c9f3c1 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp @@ -80,7 +80,6 @@ namespace deepx::tensorfunc return {size, host_data}; } - } #endif diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.cuh index 14764266..ff7117f3 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.cuh @@ -9,208 +9,236 @@ namespace deepx::tensorfunc { - //sqrt + // sqrt template - __device__ __forceinline__ void deepx_sqrt(const T *a, T *out); + __device__ __forceinline__ void deepx_sqrt(const T *a, T *out); template <> - __device__ __forceinline__ void deepx_sqrt(const double *a, double *out) + __device__ __forceinline__ void deepx_sqrt(const double *a, double *out) { *out = sqrt(*a); } template <> - __device__ __forceinline__ void 
deepx_sqrt(const float *a, float *out) + __device__ __forceinline__ void deepx_sqrt(const float *a, float *out) { *out = sqrtf(*a); } template <> - __device__ __forceinline__ void deepx_sqrt(const half *a, half *out) + __device__ __forceinline__ void deepx_sqrt(const half *a, half *out) { *out = hsqrt(*a); } template <> - __device__ __forceinline__ void deepx_sqrt(const nv_bfloat16 *a, nv_bfloat16 *out) + __device__ __forceinline__ void deepx_sqrt(const nv_bfloat16 *a, nv_bfloat16 *out) { *out = hsqrt(*a); } - - //pow + + // pow template - __device__ __forceinline__ void deepx_pow(const T *a, const T *b, T *out); + __device__ __forceinline__ void deepx_pow(const T *a, const T *b, T *out); template <> - __device__ __forceinline__ void deepx_pow(const double *a, const double *b, double *out) + __device__ __forceinline__ void deepx_pow(const double *a, const double *b, double *out) { *out = pow(*a, *b); } template <> - __device__ __forceinline__ void deepx_pow(const float *a, const float *b, float *out) + __device__ __forceinline__ void deepx_pow(const float *a, const float *b, float *out) { *out = powf(*a, *b); } - //log + // log template - __device__ __forceinline__ void deepx_log(const T *a, T *out); + __device__ __forceinline__ void deepx_log(const T *a, T *out); template <> - __device__ __forceinline__ void deepx_log(const double *a, double *out) + __device__ __forceinline__ void deepx_log(const double *a, double *out) { *out = log(*a); } template <> - __device__ __forceinline__ void deepx_log(const float *a, float *out) + __device__ __forceinline__ void deepx_log(const float *a, float *out) { *out = logf(*a); } template <> - __device__ __forceinline__ void deepx_log(const half *a, half *out) + __device__ __forceinline__ void deepx_log(const half *a, half *out) { *out = hlog(*a); - } + } template <> - __device__ __forceinline__ void deepx_log(const nv_bfloat16 *a, nv_bfloat16 *out) + __device__ __forceinline__ void deepx_log(const nv_bfloat16 *a, nv_bfloat16 
*out) { *out = hlog(*a); } - //exp + // exp template - __device__ __forceinline__ void deepx_exp(const T *a, T *out); + __device__ __forceinline__ void deepx_exp(const T *a, T *out); template <> - __device__ __forceinline__ void deepx_exp(const double *a, double *out) + __device__ __forceinline__ void deepx_exp(const double *a, double *out) { *out = exp(*a); } template <> - __device__ __forceinline__ void deepx_exp(const float *a, float *out) + __device__ __forceinline__ void deepx_exp(const float *a, float *out) { *out = expf(*a); } template <> - __device__ __forceinline__ void deepx_exp(const half *a, half *out) + __device__ __forceinline__ void deepx_exp(const half *a, half *out) { *out = hexp(*a); } template <> - __device__ __forceinline__ void deepx_exp(const nv_bfloat16 *a, nv_bfloat16 *out) + __device__ __forceinline__ void deepx_exp(const nv_bfloat16 *a, nv_bfloat16 *out) { *out = hexp(*a); } - + // max template - __device__ __forceinline__ void deepx_max(const T *a, const T *b, T *out); + __device__ __forceinline__ void deepx_max(const T *a, const T *b, T *out); template <> - __device__ __forceinline__ void deepx_max(const double *a, const double *b, double *out) + __device__ __forceinline__ void deepx_max(const double *a, const double *b, double *out) { *out = fmax(*a, *b); } template <> - __device__ __forceinline__ void deepx_max(const float *a, const float *b, float *out) + __device__ __forceinline__ void deepx_max(const float *a, const float *b, float *out) { *out = fmaxf(*a, *b); } template <> - __device__ __forceinline__ void deepx_max(const half *a, const half *b, half *out) + __device__ __forceinline__ void deepx_max(const half *a, const half *b, half *out) { *out = __hmax(*a, *b); } template <> - __device__ __forceinline__ void deepx_max(const nv_bfloat16 *a, const nv_bfloat16 *b, nv_bfloat16 *out) + __device__ __forceinline__ void deepx_max(const nv_bfloat16 *a, const nv_bfloat16 *b, nv_bfloat16 *out) { *out = __hmax(*a, *b); } template <> - 
__device__ __forceinline__ void deepx_max(const int64_t *a, const int64_t *b, int64_t *out) + __device__ __forceinline__ void deepx_max(const int64_t *a, const int64_t *b, int64_t *out) { *out = *a > *b ? *a : *b; } template <> - __device__ __forceinline__ void deepx_max(const int32_t *a, const int32_t *b, int32_t *out) + __device__ __forceinline__ void deepx_max(const int32_t *a, const int32_t *b, int32_t *out) { *out = *a > *b ? *a : *b; } template <> - __device__ __forceinline__ void deepx_max(const int16_t *a, const int16_t *b, int16_t *out) + __device__ __forceinline__ void deepx_max(const int16_t *a, const int16_t *b, int16_t *out) { *out = *a > *b ? *a : *b; } template <> - __device__ __forceinline__ void deepx_max(const int8_t *a, const int8_t *b, int8_t *out) + __device__ __forceinline__ void deepx_max(const int8_t *a, const int8_t *b, int8_t *out) { *out = *a > *b ? *a : *b; } // min template - __device__ __forceinline__ void deepx_min(const T *a, const T *b, T *out); + __device__ __forceinline__ void deepx_min(const T *a, const T *b, T *out); template <> - __device__ __forceinline__ void deepx_min(const double *a, const double *b, double *out) + __device__ __forceinline__ void deepx_min(const double *a, const double *b, double *out) { *out = fmin(*a, *b); } template <> - __device__ __forceinline__ void deepx_min(const float *a, const float *b, float *out) + __device__ __forceinline__ void deepx_min(const float *a, const float *b, float *out) { *out = fminf(*a, *b); } template <> - __device__ __forceinline__ void deepx_min(const half *a, const half *b, half *out) + __device__ __forceinline__ void deepx_min(const half *a, const half *b, half *out) { *out = __hmin(*a, *b); } template <> - __device__ __forceinline__ void deepx_min(const nv_bfloat16 *a, const nv_bfloat16 *b, nv_bfloat16 *out) + __device__ __forceinline__ void deepx_min(const nv_bfloat16 *a, const nv_bfloat16 *b, nv_bfloat16 *out) { *out = __hmin(*a, *b); } template <> - __device__ 
__forceinline__ void deepx_min(const int64_t *a, const int64_t *b, int64_t *out) + __device__ __forceinline__ void deepx_min(const int64_t *a, const int64_t *b, int64_t *out) { *out = *a < *b ? *a : *b; } template <> - __device__ __forceinline__ void deepx_min(const int32_t *a, const int32_t *b, int32_t *out) + __device__ __forceinline__ void deepx_min(const int32_t *a, const int32_t *b, int32_t *out) { *out = *a < *b ? *a : *b; } template <> - __device__ __forceinline__ void deepx_min(const int16_t *a, const int16_t *b, int16_t *out) + __device__ __forceinline__ void deepx_min(const int16_t *a, const int16_t *b, int16_t *out) { *out = *a < *b ? *a : *b; } template <> - __device__ __forceinline__ void deepx_min(const int8_t *a, const int8_t *b, int8_t *out) + __device__ __forceinline__ void deepx_min(const int8_t *a, const int8_t *b, int8_t *out) { *out = *a < *b ? *a : *b; } - - + //todtype + template + __device__ __forceinline__ Dtype deepx_todtype(const T &a) + { + return static_cast(a); + } + //float<->half + template <> + __device__ __forceinline__ half deepx_todtype(const float &src) + { + return __float2half(src); + } + + template <> + __device__ __forceinline__ float deepx_todtype(const half &src) + { + return __half2float(src); + } + //float<->bfloat16 + template <> + __device__ __forceinline__ nv_bfloat16 deepx_todtype(const float &src) + { + return __float2bfloat16(src); + } + template <> + __device__ __forceinline__ float deepx_todtype(const nv_bfloat16 &src) + { + return __bfloat162float(src); + } + } #endif // DEEPX_TENSORFUNC_CUDA_MATH_CUH \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu index abe6f223..772865f3 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu @@ -5,9 +5,96 @@ #include #include 
"deepx/tensorfunc/cuda.hpp" #include "deepx/tensorfunc/authors.hpp" +#include "deepx/tensorfunc/cuda_math.cuh" namespace deepx::tensorfunc { + + //todtype + + template + __global__ void todtype_kernel(const T* A, Dtype* C,const int size){ + int stride = blockDim.x * gridDim.x; + for(int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride){ + C[idx] = deepx_todtype(A[idx]); + } + } + + template + void launch_todtype(const T* a, Dtype* c,const int size){ + auto [numBlocks, blockSize] = BestDims(size); + todtype_kernel<<>>(a, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error("Failed to launch todtype kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_todtype(const double *a, float *c, const int size); + template void launch_todtype(const double *a, half *c, const int size); + template void launch_todtype(const double *a, nv_bfloat16 *c, const int size); + template void launch_todtype(const double *a, int64_t *c, const int size); + template void launch_todtype(const double *a, int32_t *c, const int size); + template void launch_todtype(const double *a, int16_t *c, const int size); + template void launch_todtype(const double *a, int8_t *c, const int size); + + template void launch_todtype(const float *a, double *c, const int size); + template void launch_todtype(const float *a, half *c, const int size); + template void launch_todtype(const float *a, nv_bfloat16 *c, const int size); + template void launch_todtype(const float *a, int64_t *c, const int size); + template void launch_todtype(const float *a, int32_t *c, const int size); + template void launch_todtype(const float *a, int16_t *c, const int size); + template void launch_todtype(const float *a, int8_t *c, const int size); + + template void launch_todtype(const nv_bfloat16 *a, double *c, const int size); + template void launch_todtype(const nv_bfloat16 *a, float *c, const int size); + template void 
launch_todtype(const nv_bfloat16 *a, half *c, const int size); + template void launch_todtype(const nv_bfloat16 *a, int64_t *c, const int size); + template void launch_todtype(const nv_bfloat16 *a, int32_t *c, const int size); + template void launch_todtype(const nv_bfloat16 *a, int16_t *c, const int size); + template void launch_todtype(const nv_bfloat16 *a, int8_t *c, const int size); + + template void launch_todtype(const half *a, double *c, const int size); + template void launch_todtype(const half *a, float *c, const int size); + template void launch_todtype(const half *a, nv_bfloat16 *c, const int size); + template void launch_todtype(const half *a, int64_t *c, const int size); + template void launch_todtype(const half *a, int32_t *c, const int size); + template void launch_todtype(const half *a, int16_t *c, const int size); + template void launch_todtype(const half *a, int8_t *c, const int size); + + template void launch_todtype(const int64_t *a, double *c, const int size); + template void launch_todtype(const int64_t *a, float *c, const int size); + template void launch_todtype(const int64_t *a, half *c, const int size); + template void launch_todtype(const int64_t *a, nv_bfloat16 *c, const int size); + template void launch_todtype(const int64_t *a, int32_t *c, const int size); + template void launch_todtype(const int64_t *a, int16_t *c, const int size); + template void launch_todtype(const int64_t *a, int8_t *c, const int size); + + template void launch_todtype(const int32_t *a, double *c, const int size); + template void launch_todtype(const int32_t *a, float *c, const int size); + template void launch_todtype(const int32_t *a, half *c, const int size); + template void launch_todtype(const int32_t *a, nv_bfloat16 *c, const int size); + template void launch_todtype(const int32_t *a, int64_t *c, const int size); + template void launch_todtype(const int32_t *a, int16_t *c, const int size); + template void launch_todtype(const int32_t *a, int8_t *c, const int 
size); + + template void launch_todtype(const int16_t *a, double *c, const int size); + template void launch_todtype(const int16_t *a, float *c, const int size); + template void launch_todtype(const int16_t *a, half *c, const int size); + template void launch_todtype(const int16_t *a, nv_bfloat16 *c, const int size); + template void launch_todtype(const int16_t *a, int64_t *c, const int size); + template void launch_todtype(const int16_t *a, int32_t *c, const int size); + template void launch_todtype(const int16_t *a, int8_t *c, const int size); + + template void launch_todtype(const int8_t *a, double *c, const int size); + template void launch_todtype(const int8_t *a, float *c, const int size); + template void launch_todtype(const int8_t *a, half *c, const int size); + template void launch_todtype(const int8_t *a, nv_bfloat16 *c, const int size); + template void launch_todtype(const int8_t *a, int64_t *c, const int size); + template void launch_todtype(const int8_t *a, int32_t *c, const int size); + template void launch_todtype(const int8_t *a, int16_t *c, const int size); + // add template __global__ void add_kernel(const T *A, const T *B, T *C, const int size) diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh index 604421c4..4100f38d 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh @@ -7,6 +7,15 @@ namespace deepx::tensorfunc { + + //todtype + template + __global__ void todtype_kernel(const T* A, Dtype* C,const int size); + + template + void launch_todtype(const T* a, Dtype* c,const int size); + + //add template __global__ void add_kernel(const T* A, const T* B, T* C,const int size); diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp 
index 82cb4cbf..b7d3a680 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp @@ -9,10 +9,17 @@ #include "stdutil/error.hpp" namespace deepx::tensorfunc -{ - // CUDA kernel函数声明 - - +{ + //todtype + template + void todtype(const Tensor &input, Tensor &output){ + if (input.shape.size != output.shape.size || input.shape.size != output.shape.size) { + throw TensorShapeError("todtype"); + } + launch_todtype(input.data, output.data, input.shape.size); + }; + + //add template struct addDispatcher { diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp index 97e4b17e..709a899c 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp @@ -9,6 +9,359 @@ namespace deepx::tf { + + // todtype + class Todtype : public TF + { + public: + Todtype(const vector &args, const vector &returns) + { + this->name = "todtype"; + this->tftype = "elementwise"; + this->args = args; + this->returns = returns; + } + + string math_formula() const override + { + return "T3(dtypeA)->T1(dtypeB)"; + } + shared_ptr clone() const override + { + return make_shared(*this); + } + int run(shared_ptr mem, string &error) override + { + if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error)) + { + return 1; + } + auto a_shape = mem->gettensor(this->args[0].textvalue).get()->shape; + auto c_shape = mem->gettensor(this->returns[0].textvalue).get()->shape; + if (a_shape.size != c_shape.size) + { + error = "Shape mismatch: " + to_string(a_shape.size) + " != " + to_string(c_shape.size); + return 1; + } + Precision a_type = a_shape.dtype; + Precision c_type = c_shape.dtype; + switch (a_type) + { + case Precision::Float64: + { + switch (c_type) + { + case Precision::Float64: + { + auto a = mem->gettensor(this->args[0].textvalue); + auto b 
= mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data, b->data, a->shape.size); + break; + } + case Precision::Float32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + break; + } + case Precision::Float32: + { + switch (c_type) + { + case Precision::Float64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + { + auto a = mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data,b->data, a->shape.size); + break; + } + case Precision::Float16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), 
*mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + } + break; + case Precision::Float16: + { + switch (c_type) + { + case Precision::Float64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + { + auto a = mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data, b->data, a->shape.size); + break; + } + + case Precision::BFloat16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + } + break; + case Precision::BFloat16: + { + switch (c_type) + { 
+ case Precision::Float64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + { + auto a = mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data, b->data, a->shape.size); + break; + } + case Precision::Int64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + } + break; + case Precision::Int64: + { + switch (c_type) + { + case Precision::Float64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + { + auto a = 
mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data, b->data, a->shape.size); + break; + } + case Precision::Int32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + } + break; + case Precision::Int32: + { + switch (c_type) + { + case Precision::Float64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + { + auto a = mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data, b->data, a->shape.size); + break; + } + case Precision::Int16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + 
precision_str(c_type); + return 1; + } + } + break; + case Precision::Int16: + { + switch (c_type) + { + case Precision::Float64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + { + auto a = mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data, b->data, a->shape.size); + break; + } + case Precision::Int8: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + } + break; + case Precision::Int8: + { + switch (c_type) + { + case Precision::Float64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), 
*mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + { + auto a = mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data, b->data, a->shape.size); + break; + } + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + } + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + return 0; + }; + + }; + + // add template class Add : public TF { @@ -16,7 +369,7 @@ namespace deepx::tf Add(const vector &args, const vector &returns) { this->name = "add"; - this->metadata.author = Author::name(); + this->metadata.author = Author::name(); this->tftype = "elementwise"; this->args = args; this->returns = returns; @@ -32,7 +385,7 @@ namespace deepx::tf } int run(shared_ptr mem, string &error) override { - if(!checktensors({this->args[0].textvalue,this->args[1].textvalue,this->returns[0].textvalue},mem, error)) + if (!checktensors({this->args[0].textvalue, this->args[1].textvalue, this->returns[0].textvalue}, mem, error)) { return 1; } @@ -90,7 +443,7 @@ namespace deepx::tf this->args = args; this->returns = returns; } - + string math_formula() const override { return "T3=T1+scalar"; @@ -100,8 +453,8 @@ namespace deepx::tf return make_shared>(*this); } int run(shared_ptr mem, string &error) override - { - if(!checktensors({this->args[0].textvalue,this->returns[0].textvalue},mem, error)) + { + if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error)) { return 1; } @@ -158,7 
+511,7 @@ namespace deepx::tf this->args = args; this->returns = returns; } - + string math_formula() const override { return "T3=T1-T2"; @@ -168,8 +521,8 @@ namespace deepx::tf return make_shared>(*this); } int run(shared_ptr mem, string &error) override - { - if(!checktensors({this->args[0].textvalue,this->args[1].textvalue,this->returns[0].textvalue},mem, error)) + { + if (!checktensors({this->args[0].textvalue, this->args[1].textvalue, this->returns[0].textvalue}, mem, error)) { return 1; } @@ -222,12 +575,12 @@ namespace deepx::tf SubScalar(const vector &args, const vector &returns) { this->name = "subscalar"; - this->metadata.author=Author::name(); + this->metadata.author = Author::name(); this->tftype = "elementwise"; this->args = args; this->returns = returns; } - + string math_formula() const override { return "T3=T1-scalar"; @@ -238,7 +591,7 @@ namespace deepx::tf } int run(shared_ptr mem, string &error) override { - if(!checktensors({this->args[0].textvalue,this->returns[0].textvalue},mem, error)) + if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error)) { return 1; } @@ -290,12 +643,12 @@ namespace deepx::tf Mul(const vector &args, const vector &returns) { this->name = "mul"; - this->metadata.author=Author::name(); + this->metadata.author = Author::name(); this->tftype = "elementwise"; this->args = args; this->returns = returns; } - + string math_formula() const override { return "T3=T1*T2"; @@ -305,8 +658,8 @@ namespace deepx::tf return make_shared>(*this); } int run(shared_ptr mem, string &error) override - { - if(!checktensors({this->args[0].textvalue,this->args[1].textvalue,this->returns[0].textvalue},mem, error)) + { + if (!checktensors({this->args[0].textvalue, this->args[1].textvalue, this->returns[0].textvalue}, mem, error)) { return 1; } @@ -359,12 +712,12 @@ namespace deepx::tf MulScalar(const vector &args, const vector &returns) { this->name = "mulscalar"; - this->metadata.author=Author::name(); + 
this->metadata.author = Author::name(); this->tftype = "elementwise"; this->args = args; this->returns = returns; } - + string math_formula() const override { return "T3=T1*scalar"; @@ -374,8 +727,8 @@ namespace deepx::tf return make_shared>(*this); } int run(shared_ptr mem, string &error) override - { - if(!checktensors({this->args[0].textvalue,this->returns[0].textvalue},mem, error)) + { + if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error)) { return 1; } @@ -427,12 +780,12 @@ namespace deepx::tf Div(const vector &args, const vector &returns) { this->name = "div"; - this->metadata.author=Author::name(); + this->metadata.author = Author::name(); this->tftype = "elementwise"; this->args = args; this->returns = returns; } - + string math_formula() const override { return "T3=T1/T2"; @@ -442,8 +795,8 @@ namespace deepx::tf return make_shared>(*this); } int run(shared_ptr mem, string &error) override - { - if(!checktensors({this->args[0].textvalue,this->args[1].textvalue,this->returns[0].textvalue},mem, error)) + { + if (!checktensors({this->args[0].textvalue, this->args[1].textvalue, this->returns[0].textvalue}, mem, error)) { return 1; } @@ -496,12 +849,12 @@ namespace deepx::tf DivScalar(const vector &args, const vector &returns) { this->name = "divscalar"; - this->metadata.author=Author::name(); + this->metadata.author = Author::name(); this->tftype = "elementwise"; this->args = args; this->returns = returns; } - + string math_formula() const override { return "T3=scalar/T1"; @@ -511,8 +864,8 @@ namespace deepx::tf return make_shared>(*this); } int run(shared_ptr mem, string &error) override - { - if(!checktensors({this->args[0].textvalue,this->returns[0].textvalue},mem, error)) + { + if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error)) { return 1; } @@ -564,12 +917,12 @@ namespace deepx::tf RDivScalar(const vector &args, const vector &returns) { this->name = "rdivscalar"; - 
this->metadata.author=Author::name(); + this->metadata.author = Author::name(); this->tftype = "elementwise"; this->args = args; this->returns = returns; } - + string math_formula() const override { return "T3=scalar/T1"; @@ -580,7 +933,7 @@ namespace deepx::tf } int run(shared_ptr mem, string &error) override { - if(!checktensors({this->args[1].textvalue,this->returns[0].textvalue},mem, error)) + if (!checktensors({this->args[1].textvalue, this->returns[0].textvalue}, mem, error)) { return 1; } @@ -633,7 +986,7 @@ namespace deepx::tf Invert(const vector &args, const vector &returns) { this->name = "invert"; - this->metadata.author=Author::name(); + this->metadata.author = Author::name(); this->tftype = "elementwise"; this->args = args; this->returns = returns; @@ -648,7 +1001,7 @@ namespace deepx::tf } int run(shared_ptr mem, string &error) override { - if(!checktensors({this->args[0].textvalue,this->returns[0].textvalue},mem, error)) + if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error)) { return 1; } @@ -680,7 +1033,6 @@ namespace deepx::tf return 0; } }; - }; #endif // DEEPX_TF_ELEMENTWISE_BASIC_HPP diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp index 5080a673..d2893cfc 100644 --- a/excuter/op-mem-ompsimd/src/client/tfs.cpp +++ b/excuter/op-mem-ompsimd/src/client/tfs.cpp @@ -174,7 +174,19 @@ namespace deepx::tf // elementwise void register_elementwise(TfFactory &tffactory) - { + { + // todtype + tffactory.add_tf(std::make_shared(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + + + // add author=miaobyte tffactory.add_tf(std::make_shared>(vector( { Param("a", DataCategory::Tensor, Precision::Any), diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp index 5acf3bd7..f8f0302f 100644 --- 
a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp @@ -18,15 +18,15 @@ namespace deepx::tensorfunc { if (A.shape == B.shape && A.shape == C.shape) { - C.shape.rangeParallel(C.shape.dim() - 1, [&A, &B, &C, &scalar_op, &simd_op](int i) + C.shape.rangeElementwiseParallel([&A, &B, &C, &scalar_op, &simd_op](int i,int i_end) { - int shape_last = C.shape[-1]; + const ScalableTag tag; const size_t lanes = Lanes(tag); size_t j = 0; // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag, A.data + i + j)) + while (j < i_end && !IsAligned(tag, A.data + i + j)) { T c; scalar_op(A.data[i + j], B.data[i + j], c); @@ -35,14 +35,14 @@ namespace deepx::tensorfunc } // 2. 处理中间对齐部分 - size_t aligned_end = shape_last - (shape_last % lanes); + size_t aligned_end = i_end - (i_end % lanes); for (; j + lanes <= aligned_end; j += lanes) { simd_op(A.data + i + j, B.data + i + j, C.data + i + j, lanes); } // 3. 处理尾部剩余元素 - for (; j < shape_last; j++) + for (; j < i_end; j++) { T c; scalar_op(A.data[i + j], B.data[i + j], c); @@ -62,15 +62,14 @@ namespace deepx::tensorfunc { if (A.shape == C.shape) { - C.shape.rangeParallel(C.shape.dim() - 1, [&A, &b, &C, &scalar_op, &simd_op](int i) + C.shape.rangeElementwiseParallel([&A, &b, &C, &scalar_op, &simd_op](int i,int i_end) { - int shape_last = C.shape[-1]; const ScalableTag tag; const size_t lanes = Lanes(tag); size_t j = 0; // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag, A.data + i + j)) + while (j < i_end && !IsAligned(tag, A.data + i + j)) { T c; scalar_op(A.data[i + j], b, c); @@ -79,14 +78,14 @@ namespace deepx::tensorfunc } // 2. 处理中间对齐部分 - size_t aligned_end = shape_last - (shape_last % lanes); + size_t aligned_end = i_end - (i_end % lanes); for (; j + lanes <= aligned_end; j += lanes) { simd_op(A.data + i + j, b, C.data + i + j, lanes); } // 3. 
处理尾部剩余元素 - for (; j < shape_last; j++) + for (; j < i_end; j++) { T c; scalar_op(A.data[i + j], b, c); @@ -99,7 +98,21 @@ namespace deepx::tensorfunc } } - // 通用实现 + //todtype + template + static void todtype(const Tensor &A, Tensor &C) + { + C.shape.rangeElementwiseParallel([&A, &C](int i,int i_end) + { + for (int j = 0; j < i_end; j++) + { + C.data[i + j] = static_cast(A.data[i + j]); + } + }); + } + + + // add template struct addDispatcher { @@ -292,9 +305,9 @@ namespace deepx::tensorfunc { if (A.shape == C.shape) { - A.shape.rangeParallel(A.shape.dim()-1, [&A, &C](int idx) + A.shape.rangeElementwiseParallel([&A, &C](int idx,int idx_end) { - for (int j=0;j tag; const size_t lanes = Lanes(tag); size_t j=0; // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag,input.data + i + j)) { + while (j < i_end && !IsAligned(tag,input.data + i + j)) { output.data[i+j] = std::sqrt(input.data[i+j]); ++j; } // 2. 处理中间对齐部分 - size_t aligned_end=shape_last-(shape_last%lanes); + size_t aligned_end=i_end-(i_end%lanes); for (; j+lanes<=aligned_end; j += lanes ) { auto vec = Load(tag, input.data + i + j); @@ -337,7 +349,7 @@ namespace deepx::tensorfunc } // 3. 处理尾部剩余元素 - for (;j tag; const size_t lanes = Lanes(tag); size_t j=0; // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag,input.data + i + j)) { + while (j < i_end && !IsAligned(tag,input.data + i + j)) { output.data[i+j] = std::sin(input.data[i+j]); ++j; } // 2. 处理中间对齐部分 - size_t aligned_end=shape_last-(shape_last%lanes); + size_t aligned_end=i_end-(i_end%lanes); for (; j+lanes<=aligned_end; j += lanes ) { auto vec = Load(tag, input.data + i + j); @@ -504,7 +517,7 @@ namespace deepx::tensorfunc } // 3. 处理尾部剩余元素 - for (;j tag; const size_t lanes = Lanes(tag); size_t j=0; // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag,input.data + i + j)) { + while (j < i_end && !IsAligned(tag,input.data + i + j)) { output.data[i+j] = std::cos(input.data[i+j]); ++j; } // 2. 
处理中间对齐部分 - size_t aligned_end=shape_last-(shape_last%lanes); + size_t aligned_end=i_end-(i_end%lanes); for (; j+lanes<=aligned_end; j += lanes ) { auto vec = Load(tag, input.data + i + j); @@ -547,7 +559,7 @@ namespace deepx::tensorfunc } // 3. 处理尾部剩余元素 - for (;j tag; const size_t lanes = Lanes(tag); size_t j=0; // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag,input.data + i + j)) { + while (j < i_end && !IsAligned(tag,input.data + i + j)) { output.data[i+j] = std::tan(input.data[i+j]); ++j; } // 2. 处理中间对齐部分 - size_t aligned_end=shape_last-(shape_last%lanes); + size_t aligned_end=i_end-(i_end%lanes); for (; j+lanes<=aligned_end; j += lanes ) { auto vec = Load(tag, input.data + i + j); @@ -590,7 +601,7 @@ namespace deepx::tensorfunc } // 3. 处理尾部剩余元素 - for (;j tag; const size_t lanes = Lanes(tag); size_t j=0; // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag,A.data + idx+j)) { - C.data[idx+j]=std::max(A.data[idx+j],B.data[idx+j]); + while (j < i_end && !IsAligned(tag,A.data + i + j)) { + C.data[i+j]=std::max(A.data[i+j],B.data[i+j]); ++j; } // 2. 处理中间对齐部分 - size_t aligned_end=shape_last-(shape_last%lanes); + size_t aligned_end=i_end-(i_end%lanes); for (; j+lanes<=aligned_end; j += lanes ) { - auto vec1 = Load(tag, A.data + idx+j); // 加载数组1的向量 - auto vec2 = Load(tag, B.data + idx+j); // 加载数组2的向量 + auto vec1 = Load(tag, A.data + i + j); // 加载数组1的向量 + auto vec2 = Load(tag, B.data + i + j); // 加载数组2的向量 auto vec_result = Max(vec1, vec2); // 向量比较 - Store(vec_result, tag, C.data + idx+j); // 存储结果向量 + Store(vec_result, tag, C.data + i + j); // 存储结果向量 } // 3. 处理尾部剩余元素 - for (;j tag; const size_t lanes = Lanes(tag); size_t j=0; // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag,A.data + idx+j)) { - C.data[idx+j]=std::max(A.data[idx+j],b); + while (j < i_end && !IsAligned(tag,A.data + i + j)) { + C.data[i+j]=std::max(A.data[i+j],b); ++j; } // 2. 
处理中间对齐部分 - size_t aligned_end=shape_last-(shape_last%lanes); + size_t aligned_end=i_end-(i_end%lanes); for (; j+lanes<=aligned_end; j += lanes ) { - auto vec1 = Load(tag, A.data + idx+j); // 加载数组1的向量 + auto vec1 = Load(tag, A.data + i + j); // 加载数组1的向量 auto vec2=Set(tag,b); auto vec_result = Max(vec1, vec2); // 向量比较 - Store(vec_result, tag, C.data + idx+j); // 存储结果向量 + Store(vec_result, tag, C.data + i + j); // 存储结果向量 } // 3. 处理尾部剩余元素 - for (;j tag; const size_t lanes = Lanes(tag); size_t j=0; // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag,A.data + idx+j)) { - C.data[idx+j]=std::min(A.data[idx+j],B.data[idx+j]); + while (j < i_end && !IsAligned(tag,A.data + i + j)) { + C.data[i+j]=std::min(A.data[i+j],B.data[i+j]); ++j; } // 2. 处理中间对齐部分 - size_t aligned_end=shape_last-(shape_last%lanes); + size_t aligned_end=i_end-(i_end%lanes); for (; j+lanes<=aligned_end; j += lanes ) { - auto vec1 = Load(tag, A.data + idx+j); // 加载数组1的向量 - auto vec2 = Load(tag, B.data + idx+j); // 加载数组2的向量 + auto vec1 = Load(tag, A.data + i + j); // 加载数组1的向量 + auto vec2 = Load(tag, B.data + i + j); // 加载数组2的向量 auto vec_result = Min(vec1, vec2); // 向量比较 - Store(vec_result, tag, C.data + idx+j); // 存储结果向量 + Store(vec_result, tag, C.data + i + j); // 存储结果向量 } // 3. 处理尾部剩余元素 - for (;j tag; const size_t lanes = Lanes(tag); size_t j=0; // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag,A.data + idx+j)) { - C.data[idx+j]=std::min(A.data[idx+j],b); + while (j < i_end && !IsAligned(tag,A.data + i + j)) { + C.data[i+j]=std::min(A.data[i+j],b); ++j; } // 2. 处理中间对齐部分 - size_t aligned_end=shape_last-(shape_last%lanes); + size_t aligned_end=i_end-(i_end%lanes); for (; j+lanes<=aligned_end; j += lanes ) { - auto vec1 = Load(tag, A.data + idx+j); // 加载数组1的向量 + auto vec1 = Load(tag, A.data + i + j); // 加载数组1的向量 auto vec2=Set(tag,b); auto vec_result = Min(vec1, vec2); // 向量比较 - Store(vec_result, tag, C.data + idx+j); // 存储结果向量 + Store(vec_result, tag, C.data + i + j); // 存储结果向量 } // 3. 
处理尾部剩余元素 - for (;jB.data[idx+i]; + mask.data[i+j]=A.data[i+j]>B.data[i+j]; } }); } @@ -908,11 +915,11 @@ namespace deepx::tensorfunc { if (A.shape == mask.shape) { - A.shape.rangeParallel(A.shape.dim()-1, [&A, &mask, &scalar](int idx) + A.shape.rangeElementwiseParallel([&A, &mask, &scalar](int i,int i_end) { - for (int i = 0; i < A.shape[-1]; i++) + for (int j = 0; j < i_end; j++) { - mask.data[idx+i]=A.data[idx+i]>scalar; + mask.data[i+j]=A.data[i+j]>scalar; } }); } @@ -931,12 +938,12 @@ namespace deepx::tensorfunc { if (cases.shape == C.shape) { - C.shape.rangeParallel(C.shape.dim()-1, [&tensors, &cases, &C](int idx) + C.shape.rangeElementwiseParallel([&tensors, &cases, &C](int i,int i_end) { - for (int i = 0; i < C.shape[-1]; i++) + for (int j = 0; j < i_end; j++) { - int which_tensor=cases.data[idx]; - C.data[idx+i]=tensors[which_tensor]->data[idx]; + int which_tensor=cases.data[i]; + C.data[i+j]=tensors[which_tensor]->data[i]; } }); } diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp index 1f754006..58768784 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp @@ -10,7 +10,247 @@ #include "deepx/tensorfunc/elementwise_cblas.hpp" namespace deepx::tf { + // todtype + class Todtype : public TF + { + public: + Todtype(const vector &args, const vector &returns) + { + this->name = "todtype"; + this->tftype = "elementwise"; + this->args = args; + this->returns = returns; + } + + string math_formula() const override + { + return "T3(dtypeA)->T1(dtypeB)"; + } + shared_ptr clone() const override + { + return make_shared(*this); + } + int run(shared_ptr mem, string &error) override + { + if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error)) + { + return 1; + } + auto a_shape = mem->gettensor(this->args[0].textvalue).get()->shape; + auto c_shape = mem->gettensor(this->returns[0].textvalue).get()->shape; 
+ if (a_shape.size != c_shape.size) + { + error = "Shape mismatch: " + to_string(a_shape.size) + " != " + to_string(c_shape.size); + return 1; + } + Precision a_type = a_shape.dtype; + Precision c_type = c_shape.dtype; + switch (a_type) + { + case Precision::Float64: + { + switch (c_type) + { + case Precision::Float64: + { + auto a = mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data, b->data, a->shape.size); + break; + } + case Precision::Float32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + break; + } + case Precision::Float32: + { + switch (c_type) + { + case Precision::Float64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + { + auto a = mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data,b->data, a->shape.size); + break; + } + case Precision::Int64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case 
Precision::Int16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + } + + break; + case Precision::Int64: + { + switch (c_type) + { + case Precision::Float64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + { + auto a = mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data, b->data, a->shape.size); + break; + } + case Precision::Int32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + } + break; + case Precision::Int32: + { + switch (c_type) + { + case Precision::Float64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + { + auto a = 
mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data, b->data, a->shape.size); + break; + } + case Precision::Int16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + } + break; + case Precision::Int16: + { + switch (c_type) + { + case Precision::Float64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + { + auto a = mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data, b->data, a->shape.size); + break; + } + case Precision::Int8: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + } + break; + case Precision::Int8: + { + switch (c_type) + { + case Precision::Float64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + 
tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + { + auto a = mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data, b->data, a->shape.size); + break; + } + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + } + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + return 0; + }; + + }; + + // add template class Add : public TF { diff --git a/front/py/deepx/nn/functional/__init__.py b/front/py/deepx/nn/functional/__init__.py index 4a17d8ca..051f8d68 100644 --- a/front/py/deepx/nn/functional/__init__.py +++ b/front/py/deepx/nn/functional/__init__.py @@ -10,26 +10,28 @@ from .authormap import defaultauthor from .reduce import mean - from .activite import * - from .elementwise import * from .normalization import * +from .changeshape import * __all__ = [ #leaffunc "newtensor","rnewtensor","printtensor","load", #life "printtensor","save",#io - "constant","constant_","full","zeros","ones","uniform","uniform_","arange","arange_","kaiming_uniform","kaiming_uniform_","calculate_fan_in_and_fan_out", - "add","sub","mul","div","sqrt","pow","exp","log", + "constant","constant_","full","zeros","ones","uniform","uniform_","arange","arange_","kaiming_uniform","kaiming_uniform_", + "add","sub","mul","div","sqrt","pow","exp","log","invert","todtype","dropout", "matmul", "reducemax","reducemin","sum","prod", "reshape","permute","transpose","concat","broadcastTo","indexselect", #functional - "relu","sigmoid","swish", + "relu","sigmoid","swish","silu", "mean", "rsqrt", "softmax", 
+ "squeeze","unsqueeze", + #other + "calculate_fan_in_and_fan_out", ] \ No newline at end of file diff --git a/front/py/deepx/nn/functional/activite.py b/front/py/deepx/nn/functional/activite.py index 8510a052..6effbb9c 100644 --- a/front/py/deepx/nn/functional/activite.py +++ b/front/py/deepx/nn/functional/activite.py @@ -1,33 +1,18 @@ from deepx.tensor import Tensor from deepx.nn.functional import newtensor - +from .leaffunc_elementwise import exp # 数学公式:relu(x) = max(0, x) def relu(t: Tensor)->Tensor: from .leaffunc_elementwise import max as max_func - outtensor=t - if t.name!=None: - outtensor=newtensor(t.shape, dtype=t.dtype) - else:#inplace操作 - pass + outtensor=newtensor(t.shape, dtype=t.dtype) return max_func(t,0,outtensor) # 数学公式:σ(x) = 1 / (1 + exp(-x)) def sigmoid(t: Tensor)->Tensor: - outtensor=t - if t.name is not None: - outtensor=newtensor(t.shape, dtype=t.dtype) - t.mul(-1,out=outtensor) - outtensor.exp_() - outtensor.add_(1) - outtensor.rdiv_(1) - return outtensor + return 1 / (exp(t*-1)+1) # 数学公式:swish(x) = x * σ(βx) def swish(x: Tensor,beta: float = 1.0) -> Tensor: - outtensor=x - if x.name is not None: - outtensor=newtensor(x.shape, dtype=x.dtype) - x.mul(beta,out=outtensor) - outtensor=sigmoid(outtensor) - outtensor.mul_(x) - return outtensor + return x*sigmoid(x*beta) + +silu=swish \ No newline at end of file diff --git a/front/py/deepx/nn/functional/authormap.py b/front/py/deepx/nn/functional/authormap.py index 120db25b..5c6b492b 100644 --- a/front/py/deepx/nn/functional/authormap.py +++ b/front/py/deepx/nn/functional/authormap.py @@ -29,6 +29,7 @@ 'powscalar':'miaobyte', 'rpowscalar':'miaobyte', 'sqrt':'miaobyte', + 'dropout':'miaobyte', #changeshape 'reshape':'miaobyte', 'transpose':'miaobyte', diff --git a/front/py/deepx/nn/functional/changeshape.py b/front/py/deepx/nn/functional/changeshape.py new file mode 100644 index 00000000..db963f85 --- /dev/null +++ b/front/py/deepx/nn/functional/changeshape.py @@ -0,0 +1,18 @@ +from deepx import 
Tensor +from .leaffunc_changeshape import reshape + +def squeeze(t:Tensor,dim:int)->Tensor: + assert isinstance(dim,int) + assert isinstance(t,Tensor) + dim=dim%t.ndim + newshape=list(t.shape) + newshape.pop(dim) + return reshape(t,tuple(newshape)) + +def unsqueeze(t:Tensor,dim:int)->Tensor: + assert isinstance(dim,int) + assert isinstance(t,Tensor) + dim=dim%t.ndim + newshape=list(t.shape) + newshape.insert(dim,1) + return reshape(t,tuple(newshape)) \ No newline at end of file diff --git a/front/py/deepx/nn/functional/elementwise.py b/front/py/deepx/nn/functional/elementwise.py index d6a4e6ea..76c22aca 100644 --- a/front/py/deepx/nn/functional/elementwise.py +++ b/front/py/deepx/nn/functional/elementwise.py @@ -2,11 +2,7 @@ from deepx.nn.functional import newtensor def rsqrt(input:Tensor)->Tensor: - from .leaffunc_elementwise import sqrt,div - outtensor=input - if input.name is not None: - outtensor=newtensor(input.shape, dtype=input.dtype) - sqrt(input,out= outtensor) - return div(1,outtensor,outtensor) + from .leaffunc_elementwise import sqrt + return 1/sqrt(input) diff --git a/front/py/deepx/nn/functional/leaffunc.py b/front/py/deepx/nn/functional/leaffunc.py index 566e2d48..89cbde68 100644 --- a/front/py/deepx/nn/functional/leaffunc.py +++ b/front/py/deepx/nn/functional/leaffunc.py @@ -25,15 +25,15 @@ def op_func( newshape = Shape.broadcast_shape(a.shape, b.shape) an = a.broadcastTo(newshape) bn = b.broadcastTo(newshape) - if isinstance(out,str): + if isinstance(out,str) or out is None: outtensor=newtensor(newshape,dtype=a.dtype,name=out) else: - if isinstance(out,str): + if isinstance(out,str) or out is None: outtensor=newtensor(a.shape,dtype=a.dtype,name=out) rtf_func = getattr(rtf_module, f'rtf_{op_name}') rtf_func(an, bn, outtensor, defaultauthor[op_name]) else: - if isinstance(out,str): + if isinstance(out,str) or out is None: outtensor=newtensor(a.shape,dtype=a.dtype,name=out) rtf_func = getattr(rtf_module, f'rtf_{op_name}scalar') rtf_func(a, b, 
outtensor, defaultauthor[f'{op_name}scalar']) @@ -47,7 +47,7 @@ def op_func( a:Tensor, out:Union[Tensor,str]=None)->Tensor: outtensor=out - if isinstance(out,str): + if isinstance(out,str) or out is None: outtensor=newtensor(a.shape,dtype=a.dtype,name=out) rtf_module = importlib.import_module('deepx.nn.functional.rtf_elementwise') rtf_func = getattr(rtf_module, f'rtf_{op_name}') @@ -69,7 +69,7 @@ def op_func( if dim is None: dim=tuple(range(a.ndim)) result=out - if isinstance(out,str): + if isinstance(out,str) or out is None: resultshape=Shape.reduceshape(a.shape,dim,keepdim) result=newtensor(resultshape, dtype=a.dtype,name=out) rtf_module = importlib.import_module('deepx.nn.functional.rtf_reduce') diff --git a/front/py/deepx/nn/functional/leaffunc_changeshape.py b/front/py/deepx/nn/functional/leaffunc_changeshape.py index c9501755..a830307a 100644 --- a/front/py/deepx/nn/functional/leaffunc_changeshape.py +++ b/front/py/deepx/nn/functional/leaffunc_changeshape.py @@ -10,7 +10,7 @@ def reshape(t:Tensor,shape:tuple[int,...],out:Union[Tensor,str]='')->Tensor: assert isinstance(i,int) and i>0 outtensor=out - if isinstance(out,str): + if isinstance(out,str) or out is None: outshape=shape outtensor=newtensor(outshape,dtype=t.dtype,name=out) else: @@ -32,7 +32,7 @@ def permute(t:Tensor, raise ValueError(f"shape参数不合法,当前输入维度数:{len(dimorder)},张量维度数:{t.ndim}") dimorder = [d % t.ndim for d in dimorder] outtensor=out - if isinstance(out,str): + if isinstance(out,str) or out is None: outshape = [t.shape[dim] for dim in dimorder] outtensor=newtensor(outshape,dtype=t.dtype,name=out) @@ -47,9 +47,14 @@ def transpose(t:Tensor,out:Union[Tensor,str]='')->Tensor: -def concat(tensors:Union[list[Tensor],tuple[Tensor]],dim:int,out:Union[Tensor,str]='')->Tensor: +def concat(tensors:Union[list[Tensor],tuple[Tensor,...]],dim:int,out:Union[Tensor,str]='')->Tensor: + assert isinstance(dim,int) + assert isinstance(tensors,list) or isinstance(tensors,tuple) + for t in tensors: + assert 
isinstance(t,Tensor) + outtensor=out - if isinstance(out,str): + if isinstance(out,str) or out is None: outshape=list(tensors[0].shape) outshape[dim]=sum(t.shape[dim] for t in tensors) outtensor=newtensor(outshape,dtype=tensors[0].dtype,name=out) @@ -68,7 +73,7 @@ def broadcastTo(t:Tensor,new_shape:tuple[int,...],out:Union[Tensor,str]='',requi if bshape!=tuple(new_shape): raise ValueError(f"广播失败:{t.shape} 无法广播为 {new_shape} ") outtensor=out - if isinstance(out,str): + if isinstance(out,str) or out is None: outshape=new_shape outtensor=newtensor(outshape,dtype=t.dtype,name=out) from .rtf_changeshape import rtf_broadcastTo @@ -80,7 +85,7 @@ def indexselect(input:Tensor,indices:Tensor,gatheraxis:int,out:Union[Tensor,str] assert gatheraxis>=0 and gatheraxisTensor: # outtensor=None -# if isinstance(out,str): +# if isinstance(out,str) or out is None: # outtensor=Tensor(shape=shape, dtype=t.dtype, device=t.device) # outtensor.addtograph(out) # else: diff --git a/front/py/deepx/nn/functional/leaffunc_elementwise.py b/front/py/deepx/nn/functional/leaffunc_elementwise.py index 6aa54077..77bda9b2 100644 --- a/front/py/deepx/nn/functional/leaffunc_elementwise.py +++ b/front/py/deepx/nn/functional/leaffunc_elementwise.py @@ -28,7 +28,7 @@ def rdiv( b: Tensor, out:Union[Tensor,str]=None)->Tensor: outtensor=out - if isinstance(out,str): + if isinstance(out,str) or out is None: outtensor=newtensor(b.shape,dtype=b.dtype,name=out) from .rtf_elementwise import rtf_rdivscalar rtf_rdivscalar(a,b,outtensor,defaultauthor['rdivscalar']) @@ -41,7 +41,7 @@ def rdiv( pow=create_A_B_tf_C('pow') def rpow(a:Number,b:Tensor,out:Union[Tensor,str]=None)->Tensor: outtensor=out - if isinstance(out,str): + if isinstance(out,str) or out is None: outtensor=newtensor(b.shape,dtype=b.dtype,name=out) from .rtf_elementwise import rtf_rpowscalar rtf_rpowscalar(a,b,outtensor,defaultauthor['rpowscalar']) @@ -53,4 +53,26 @@ def rpow(a:Number,b:Tensor,out:Union[Tensor,str]=None)->Tensor: log=create_A_tf_C('log') 
#invert
-invert=create_A_tf_C('invert')
\ No newline at end of file
+invert=create_A_tf_C('invert')
+
+#todtype
+def todtype(t:Tensor,dest:Tensor):
+    assert isinstance(t,Tensor)
+    assert isinstance(dest,Tensor)
+    assert t.shape==dest.shape
+
+    from .rtf_elementwise import rtf_todtype
+    rtf_todtype(t,dest)
+
+#dropout
+def dropout(a:Tensor, p:float, out:Union[Tensor,str]='')->Tensor:
+    assert isinstance(a,Tensor)
+    outtensor=out
+    if isinstance(out,str) or out is None:
+        outtensor=newtensor(a.shape,dtype=a.dtype,name=out)
+    assert a.shape==outtensor.shape
+
+    from .rtf_elementwise import rtf_dropout
+    rtf_dropout(a,p,outtensor,defaultauthor['dropout'])
+    return outtensor
+
diff --git a/front/py/deepx/nn/functional/leaffunc_life.py b/front/py/deepx/nn/functional/leaffunc_life.py
index 8921f8e3..78d438f1 100644
--- a/front/py/deepx/nn/functional/leaffunc_life.py
+++ b/front/py/deepx/nn/functional/leaffunc_life.py
@@ -1,24 +1,28 @@
 from deepx.tensor import Tensor
 from typing import Union
+
+def newtensor(shape:tuple[int,...],dtype:str='float32',name:str=None):
+    assert isinstance(shape,tuple)
+    for i in shape:
+        assert isinstance(i,int)
+    assert isinstance(dtype,str)
+    assert isinstance(name,str) or name is None
 
-def parse_shape(shape:Union[tuple,list])->tuple[int, ...]:
-    if len(shape) == 1 and isinstance(shape[0], (tuple, list)):
-        shape = shape[0]
-    return tuple(int(dim) for dim in shape)
-
-def newtensor(*shape,dtype:str='float32',name:str=None):
-    s=parse_shape(shape)
-    t=Tensor(shape=s,dtype=dtype,name=name)
+    t=Tensor(shape=shape,dtype=dtype,name=name)
     from .rtf_life import rtf_newtensor
     rtf_newtensor(t)
     return t
+
 def rnewtensor(t:Tensor):
     from .rtf_life import rtf_newtensor
     rtf_newtensor(t)
     return t
+
 def copytensor(t:Tensor,out:Tensor):
     from .rtf_life import rtf_copytensor
     rtf_copytensor(t,out)
+
+
 def deltensor(t:Tensor):
     from .rtf_life import rtf_deltensor
     rtf_deltensor(t)
diff --git a/front/py/deepx/nn/functional/leaffunc_matmul.py
b/front/py/deepx/nn/functional/leaffunc_matmul.py
index 8cad3127..acdcefd4 100644
--- a/front/py/deepx/nn/functional/leaffunc_matmul.py
+++ b/front/py/deepx/nn/functional/leaffunc_matmul.py
@@ -6,7 +6,7 @@
 def matmul(a:Tensor,b:Tensor,out:Union[Tensor,str]='',bench:tuple[int,int]=None)->Tensor:
     outtensor=out
-    if isinstance(out,str):
+    if isinstance(out,str) or out is None:
         outshape=Shape.matmul(a.shape,b.shape)
         outtensor=newtensor(outshape,dtype=a.dtype,name=out)
     from .rtf_matmul import rtf_matmul
diff --git a/front/py/deepx/nn/functional/normalization.py b/front/py/deepx/nn/functional/normalization.py
index 8a9d6030..69b37079 100644
--- a/front/py/deepx/nn/functional/normalization.py
+++ b/front/py/deepx/nn/functional/normalization.py
@@ -1,19 +1,18 @@
 from deepx import Tensor
 # 数学公式:softmax(x_i) = e^{x_i} / sum(e^{x_j})
-def softmax(t: Tensor,dim:int=-1)->Tensor:
-    
+def softmax(t: Tensor,dim:list[int]=[-1])->Tensor:
+    assert isinstance(dim,list)
+    # normalize axes without mutating the shared default list
+    dim=[d%t.ndim for d in dim]
     # 数值稳定性处理:减去最大值防止指数爆炸
     if dim is not None:
-        reducemax_t = t.reducemax(dim=[dim], keepdim=True)  # 保持维度用于广播
+        t_reducemax = t.reducemax(dim=tuple(dim), keepdim=True)  # 保持维度用于广播
     else:
-        reducemax_t = t.reducemax(keepdim=True)
-    t_subed=t.clone()
-    t_subed.sub_(reducemax_t)
+        t_reducemax= t.reducemax(keepdim=True)
+
+    t=t-t_reducemax
     
-    # 实现公式:exp(t_subed) / sum(exp(t_subed))
-    exp_t = t_subed.exp()
-    expt_sum=exp_t.sum(dim=[dim], keepdim=True)
-    # 处理输出张量(参考sigmoid的实现模式)
-    exp_t.div(expt_sum,out=t_subed)
-    return t_subed
\ No newline at end of file
+    t_exp = t.exp()
+    t_exp_sum=t_exp.sum(dim=tuple(dim), keepdim=True)
+    return t_exp/t_exp_sum
\ No newline at end of file
diff --git a/front/py/deepx/nn/functional/rtf_elementwise.py b/front/py/deepx/nn/functional/rtf_elementwise.py
index 3b7df4a6..dc48ff46 100644
--- a/front/py/deepx/nn/functional/rtf_elementwise.py
+++ b/front/py/deepx/nn/functional/rtf_elementwise.py
@@ -3,6 +3,7 @@
 from deepx.scheduler import send
 from .rtf
import A_B_op_C,A_scalar_op_C,A_op_C + def rtf_add(a:Tensor, b:Tensor, out:Tensor, author='miaobyte')->Tensor: A_B_op_C("add",a,b,out,author) return out @@ -107,4 +108,18 @@ def rtf_minscalar(a:Tensor, b:float, out:Tensor, author='miaobyte')->Tensor: def rtf_invert(a:Tensor, out:Tensor, author='miaobyte')->Tensor: A_op_C("invert",a,out,author) + return out + +def rtf_todtype(t:Tensor,dest:Tensor): + assert isinstance(t,Tensor) + assert isinstance(dest,Tensor) + assert t.shape==dest.shape + + args=[Param.tensor(t)] + returns=[Param.tensor(dest)] + ir=DeepxIR("todtype", args, returns,'') + send(ir) + +def rtf_dropout(a:Tensor, p:float, out:Tensor, author='miaobyte')->Tensor: + A_B_op_C("dropout",a,p,out,author) return out \ No newline at end of file diff --git a/front/py/deepx/nn/functional/rtf_life.py b/front/py/deepx/nn/functional/rtf_life.py index 21547b99..60a2371d 100644 --- a/front/py/deepx/nn/functional/rtf_life.py +++ b/front/py/deepx/nn/functional/rtf_life.py @@ -3,6 +3,7 @@ from deepx.scheduler import send def rtf_newtensor(t:Tensor): + assert isinstance(t,Tensor) args=[Param.vector(t.shape,'int32')] returns=[Param.tensor(t)] ir=DeepxIR("newtensor", args, returns,'') @@ -10,12 +11,20 @@ def rtf_newtensor(t:Tensor): def rtf_copytensor(t:Tensor,out:Tensor): + assert isinstance(t,Tensor) + assert isinstance(out,Tensor) + assert t.shape==out.shape + assert t.dtype==out.dtype + args=[Param.tensor(t)] returns=[Param.tensor(out)] ir=DeepxIR("copytensor", args, returns,'') send(ir) + + def rtf_deltensor(t:Tensor): + assert isinstance(t,Tensor) args=[] returns=[Param.tensor(t)] ir=DeepxIR("deltensor", args, returns,'') diff --git a/front/py/deepx/nn/modules/activation.py b/front/py/deepx/nn/modules/activation.py index 5f093f85..51b93f81 100644 --- a/front/py/deepx/nn/modules/activation.py +++ b/front/py/deepx/nn/modules/activation.py @@ -1,39 +1,14 @@ from typing import Union from deepx import Tensor,ones -import deepx.nn.functional as F from .module import Module 
-#copy from pytorch -class ReLU(Module): - __constants__ = ["inplace"] - inplace: bool - def __init__(self, inplace: bool = False): - super().__init__() - self.inplace = inplace - - def forward(self, input: Tensor) -> Tensor: - return F.relu(input, inplace=self.inplace) - - def extra_repr(self) -> str: - inplace_str = "inplace=True" if self.inplace else "" - return inplace_str - -class Sigmoid(Module): - def __init__(self): - super().__init__() - - def forward(self, input: Tensor) -> Tensor: - return F.sigmoid(input) - -class Swish(Module): +class Glu(Module): def __init__(self): super().__init__() + self.W = ones(shape=(1,1),name=self.full_name+"_W") + self.V = ones(shape=(1,1),name=self.full_name+"_V") - def forward(self, input: Tensor) -> Tensor: - return F.swish(input) - - class Swiglu(Module): def __init__(self): super().__init__() diff --git a/front/py/deepx/scheduler/client/udpconn.py b/front/py/deepx/scheduler/client/udpconn.py index a25b0963..6a12c26a 100644 --- a/front/py/deepx/scheduler/client/udpconn.py +++ b/front/py/deepx/scheduler/client/udpconn.py @@ -3,7 +3,7 @@ import select class UDPConn: - def __init__(self, endpoint: str = "localhost:8080"): + def __init__(self, endpoint: str = "localhost:9090"): # 解析endpoint self._host, port_str = endpoint.split(':') self._port = int(port_str) diff --git a/front/py/deepx/tensor/changeshape.py b/front/py/deepx/tensor/changeshape.py index 462fc9d3..e00a70d4 100644 --- a/front/py/deepx/tensor/changeshape.py +++ b/front/py/deepx/tensor/changeshape.py @@ -42,12 +42,6 @@ def transpose_(self): transpose_func(self,self) return self -@tensor_method -def broadcastshape(self,other:Tensor)->tuple[int,...]: - from deepx.nn.functional import broadcastshape as broadcastshape_func - result=broadcastshape_func(self.shape,other.shape) - return result - @tensor_method def broadcastTo(self,shape:tuple[int,...],out:Union[Tensor,str]='')->Tensor: from deepx.nn.functional import broadcastTo as broadcastTo_func @@ -62,7 +56,17 @@ def 
indexselect(self,index:Tensor,axis:int=0,out:Union[Tensor,str]='')->Tensor: result=indexselect_func(self,index,gatheraxis,out) return result +@tensor_method +def squeeze(self,dim:int)->Tensor: + from deepx.nn.functional import squeeze as squeeze_func + result=squeeze_func(self,dim) + return result +@tensor_method +def unsqueeze(self,dim:int)->Tensor: + from deepx.nn.functional import unsqueeze as unsqueeze_func + result=unsqueeze_func(self,dim) + return result # @tensor_method # def expand(self,shape:tuple)->Tensor: diff --git a/front/py/deepx/tensor/elementwise.py b/front/py/deepx/tensor/elementwise.py index 33ff1b97..9cd78e62 100644 --- a/front/py/deepx/tensor/elementwise.py +++ b/front/py/deepx/tensor/elementwise.py @@ -163,3 +163,8 @@ def invert(self,out:Union[Tensor,str]='')->Tensor: return invert_func(self,out) +@tensor_method +def dropout(self,p:float,out:Union[Tensor,str]=''): + from deepx.nn.functional import dropout as dropout_func + return dropout_func(self,p,out) + diff --git a/front/py/deepx/tensor/tensor.py b/front/py/deepx/tensor/tensor.py index 661bd4de..11144d7a 100644 --- a/front/py/deepx/tensor/tensor.py +++ b/front/py/deepx/tensor/tensor.py @@ -1,4 +1,7 @@ from typing import Optional,Union,TypeAlias + +from triton.language.semantic import equal + from .shape import Shape @@ -36,6 +39,8 @@ def __init__(self,shape:tuple[int,...],dtype:str='float32',name:str=None): raise ValueError("Invalid shape") def copy_to(self,t:'Tensor'): + assert isinstance(t,Tensor) + assert t.name != self._name from deepx.nn.functional import copytensor copytensor(self,t) @@ -44,7 +49,12 @@ def clone(self,name:str=None): t=newtensor(self.shape,dtype=self.dtype,name=name) copytensor(self,t) return t - + def to(self,dtype:str,name:str=None): + assert isinstance(dtype,str) and dtype != '' + from deepx.nn.functional import todtype as todtype_func,newtensor + dest=newtensor(self.shape,dtype=dtype,name=name) + todtype_func(self,dest) + return dest # name @property def 
name(self):
@@ -104,13 +114,17 @@ def dtype(self):
     #elementwise
     def __add__(self, other:Union[Number,'Tensor']):
         return self.add(other)
-    
+    def __radd__(self, other:Union[Number,'Tensor']):
+        return self.add(other)
     def __sub__(self, other:Union[Number,'Tensor']):
         return self.sub(other)
+    def __rsub__(self, other:Union[Number,'Tensor']):
+        return self.mul(-1).add(other)
     def __mul__(self, other:Union[Number,'Tensor']):
         return self.mul(other)
-    
+    def __rmul__(self, other:Union[Number,'Tensor']):
+        return self.mul(other)
     def __truediv__(self, other:Union[Number,'Tensor']):
         return self.div(other)
@@ -126,9 +140,10 @@ def __rpow__(self, other:Union[Number,'Tensor']):
     def __invert__(self):
         return self.invert()
     #矩阵乘法
-    def __matmul__(self, other:Union[Number,'Tensor']):
+    def __matmul__(self, other:'Tensor'):
         return self.matmul(other)
-    
+    def __rmatmul__(self, other:'Tensor'):
+        return other.matmul(self)
     #gather
     def __getitem__(self, index:'Tensor'):
         return self.indexselect(index)
diff --git a/front/py/deepx/transformer/encoder.py b/front/py/deepx/transformer/encoder.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/front/py/deepx/transformer/modeling_rope_utils.py b/front/py/deepx/transformer/modeling_rope_utils.py
index 0e6dd1ed..0554590c 100644
--- a/front/py/deepx/transformer/modeling_rope_utils.py
+++ b/front/py/deepx/transformer/modeling_rope_utils.py
@@ -2,282 +2,44 @@
 import math
 from deepx import arange,Tensor
 
-def _compute_default_rope_parameters(
-    base: float = 10000.0,
-    head_dim: int = 0,
-    partial_rotary_factor: float = 1.0,
-) -> Tuple[Tensor, float]:
-    attention_factor = 1.0  # 在这种类型的RoPE中未使用
-    dim = head_dim*partial_rotary_factor
+def _compute_default_rope_parameters(config:dict={
+    "base":10000.0,
+    "head_dim":0,
+    "partial_rotary_factor":1.0,
+}) -> Tuple[Tensor, float]:
+    dim = config["head_dim"]* config["partial_rotary_factor"]
     # 计算逆频率
-    inv_freq = 1.0 / (base ** (arange(0, dim, 2, dtype='float64')/ dim))
-    return inv_freq, attention_factor
-
-# def
_compute_linear_scaling_rope_parameters( -# config: Optional[PretrainedConfig] = None, -# device: Optional["torch.device"] = None, -# seq_len: Optional[int] = None, -# **rope_kwargs, -# ) -> Tuple["torch.Tensor", float]: -# """ -# Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev -# Args: -# config ([`~transformers.PretrainedConfig`]): -# The model configuration. -# device (`torch.device`): -# The device to use for initialization of the inverse frequencies. -# seq_len (`int`, *optional*): -# The current sequence length. Unused for this type of RoPE. -# rope_kwargs (`Dict`, *optional*): -# BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. -# Returns: -# Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the -# post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). -# """ -# if config is not None and len(rope_kwargs) > 0: -# raise ValueError( -# "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in " -# f"`_compute_linear_scaling_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}" -# ) -# if len(rope_kwargs) > 0: -# factor = rope_kwargs["factor"] -# elif config is not None: -# factor = config.rope_scaling["factor"] - -# # Gets the default RoPE parameters -# inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs) - -# # Then applies linear scaling to the frequencies. -# # NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`, so -# # applying scaling to the inverse frequencies is equivalent. 
-# inv_freq /= factor -# return inv_freq, attention_factor - - -# def _compute_dynamic_ntk_parameters( -# config: Optional[PretrainedConfig] = None, -# device: Optional["torch.device"] = None, -# seq_len: Optional[int] = None, -# **rope_kwargs, -# ) -> Tuple["torch.Tensor", float]: -# """ -# Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla -# Args: -# config ([`~transformers.PretrainedConfig`]): -# The model configuration. -# device (`torch.device`): -# The device to use for initialization of the inverse frequencies. -# seq_len (`int`, *optional*): -# The current sequence length, used to update the dynamic RoPE at inference time. -# rope_kwargs (`Dict`, *optional*): -# BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. -# Returns: -# Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the -# post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
-# """ -# # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling -# if config is not None and len(rope_kwargs) > 0: -# raise ValueError( -# "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in " -# f"`_compute_dynamic_ntk_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}" -# ) -# if len(rope_kwargs) > 0: -# base = rope_kwargs["base"] -# dim = rope_kwargs["dim"] -# max_position_embeddings = rope_kwargs["max_position_embeddings"] -# factor = rope_kwargs["factor"] -# elif config is not None: -# base = config.rope_theta -# partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 -# head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) -# dim = int(head_dim * partial_rotary_factor) -# max_position_embeddings = config.max_position_embeddings -# factor = config.rope_scaling["factor"] - -# attention_factor = 1.0 # Unused in this type of RoPE - -# # seq_len: default to max_position_embeddings, e.g. at init time -# seq_len = seq_len if seq_len is not None and seq_len > max_position_embeddings else max_position_embeddings - -# # Compute the inverse frequencies -# base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2)) -# inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim)) -# return inv_freq, attention_factor - - -# def _compute_yarn_parameters( -# config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs -# ) -> Tuple["torch.Tensor", float]: -# """ -# Computes the inverse frequencies with NTK scaling. Please refer to the -# [original paper](https://arxiv.org/abs/2309.00071) -# Args: -# config ([`~transformers.PretrainedConfig`]): -# The model configuration. -# device (`torch.device`): -# The device to use for initialization of the inverse frequencies. 
-# seq_len (`int`, *optional*): -# The current sequence length. Unused for this type of RoPE. -# rope_kwargs (`Dict`, *optional*): -# BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. -# Returns: -# Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the -# post-processing scaling factor applied to the computed cos/sin. -# """ -# # No need to keep BC with yarn, unreleased when this new pattern was created. -# if len(rope_kwargs) > 0: -# raise ValueError( -# f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_yarn_parameters`, got {rope_kwargs}" -# ) - -# base = config.rope_theta -# partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 -# head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) -# dim = int(head_dim * partial_rotary_factor) -# max_position_embeddings = config.max_position_embeddings -# factor = config.rope_scaling["factor"] - -# # Sets the attention factor as suggested in the paper -# attention_factor = config.rope_scaling.get("attention_factor") -# if attention_factor is None: -# attention_factor = 0.1 * math.log(factor) + 1.0 - -# # Optional config options -# # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly) -# beta_fast = config.rope_scaling.get("beta_fast") or 32 -# beta_slow = config.rope_scaling.get("beta_slow") or 1 - -# # Compute the inverse frequencies -# def find_correction_dim(num_rotations, dim, base, max_position_embeddings): -# """Inverse dimension formula to find the dimension based on the number of rotations""" -# return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base)) - -# def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings): -# """Find dimension range bounds based on rotations""" -# low = math.floor(find_correction_dim(low_rot, dim, base, 
max_position_embeddings)) -# high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings)) -# return max(low, 0), min(high, dim - 1) - -# def linear_ramp_factor(min, max, dim): -# if min == max: -# max += 0.001 # Prevent singularity - -# linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) -# ramp_func = torch.clamp(linear_func, 0, 1) -# return ramp_func - -# # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs -# # to expand the possible context length. In other words, interpolation = apply scaling factor. -# pos_freqs = base ** (torch.arange(0, dim, 2).float().to(device) / dim) -# inv_freq_extrapolation = 1.0 / pos_freqs -# inv_freq_interpolation = 1.0 / (factor * pos_freqs) - -# low, high = find_correction_range(beta_fast, beta_slow, dim, base, max_position_embeddings) - -# # Get n-dimensional rotational scaling corrected for extrapolation -# inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).float().to(device) -# inv_freq = ( -# inv_freq_interpolation * (1 - inv_freq_extrapolation_factor) -# + inv_freq_extrapolation * inv_freq_extrapolation_factor -# ) - -# return inv_freq, attention_factor - - -# def _compute_longrope_parameters( -# config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs -# ) -> Tuple["torch.Tensor", float]: -# """ -# Computes the inverse frequencies with LongRoPE scaling. Please refer to the -# [original implementation](https://github.com/microsoft/LongRoPE) -# Args: -# config ([`~transformers.PretrainedConfig`]): -# The model configuration. -# device (`torch.device`): -# The device to use for initialization of the inverse frequencies. -# seq_len (`int`, *optional*): -# The current sequence length. -# rope_kwargs (`Dict`, *optional*): -# BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. 
-# Returns: -# Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the -# post-processing scaling factor applied to the computed cos/sin. -# """ -# # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling -# # No need to keep BC with longrope, unreleased when this new pattern was created. -# if len(rope_kwargs) > 0: -# raise ValueError( -# "Unexpected arguments: `**rope_kwargs` should be unset in `_compute_longrope_parameters`, got " -# f"{rope_kwargs}" -# ) - -# base = config.rope_theta -# partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 -# head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) -# dim = int(head_dim * partial_rotary_factor) -# long_factor = config.rope_scaling["long_factor"] -# short_factor = config.rope_scaling["short_factor"] -# factor = config.rope_scaling.get("factor") -# attention_factor = config.rope_scaling.get("attention_factor") - -# # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a -# # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two -# # values to compute the default attention scaling factor, instead of using `factor`. 
-# if hasattr(config, "original_max_position_embeddings"): -# original_max_position_embeddings = config.original_max_position_embeddings -# factor = config.max_position_embeddings / config.original_max_position_embeddings -# else: -# original_max_position_embeddings = config.max_position_embeddings - -# # Sets the attention factor as suggested in the paper -# if attention_factor is None: -# if factor <= 1.0: -# attention_factor = 1.0 -# else: -# attention_factor = math.sqrt(1 + math.log(factor) / math.log(original_max_position_embeddings)) - -# # Compute the inverse frequencies -- scaled based on the target sequence length -# if seq_len and seq_len > original_max_position_embeddings: -# ext_factors = torch.tensor(long_factor, dtype=torch.float32, device=device) -# else: -# ext_factors = torch.tensor(short_factor, dtype=torch.float32, device=device) -# inv_freq_shape = torch.arange(0, dim, 2, dtype=torch.int64, device=device).float() / dim -# inv_freq = 1.0 / (ext_factors * base**inv_freq_shape) - -# return inv_freq, attention_factor - - -def _compute_llama3_parameters( base: float = 10000.0, - head_dim: int = 0, - partial_rotary_factor: float = 1.0, - factor:float=8, - low_freq_factor:float=1, - high_freq_factor:float=4, - old_context_len:int=8192, - seq_len: Optional[int] = None -) -> Tuple[Tensor, float]: + inv_freq = 1.0 / (config.base ** (arange(0, dim, 2, dtype='float64')/ dim)) + return inv_freq, 1.0 + +def _compute_llama3_parameters(config:dict={ + "base":10000.0, + "head_dim":0, + "partial_rotary_factor":1.0, + "factor":8, + "low_freq_factor":1, + "high_freq_factor":4, + "old_context_len":8192, + "seq_len":None +}) -> Tuple[Tensor, float]: # Gets the default RoPE parameters - inv_freq, attention_factor = _compute_default_rope_parameters(base, head_dim, partial_rotary_factor) + inv_freq, attention_factor = _compute_default_rope_parameters(config) - low_freq_wavelen = old_context_len / low_freq_factor - high_freq_wavelen = old_context_len / high_freq_factor + 
low_freq_wavelen = config.old_context_len / config.low_freq_factor + high_freq_wavelen = config.old_context_len / config.high_freq_factor wavelen = 2 * math.pi / inv_freq # wavelen < high_freq_wavelen: do nothing # wavelen > low_freq_wavelen: divide by factor - inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq) + inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / config.factor, inv_freq) # otherwise: interpolate between the two, using a smooth factor - smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) - smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama + smooth_factor = (config.old_context_len / wavelen - config.low_freq_factor) / (config.high_freq_factor - config.low_freq_factor) + smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / config.factor + smooth_factor * inv_freq_llama is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen) inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama) return inv_freq_llama, attention_factor - - -# This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters -# from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE -# parameterizations, as long as the callable has the same signature. 
+ ROPE_INIT_FUNCTIONS = { "default": _compute_default_rope_parameters, # "linear": _compute_linear_scaling_rope_parameters, diff --git a/front/py/deepx/transformer/models/llama/attention.py b/front/py/deepx/transformer/models/llama/attention.py new file mode 100644 index 00000000..edb98f48 --- /dev/null +++ b/front/py/deepx/transformer/models/llama/attention.py @@ -0,0 +1,103 @@ +from typing import Optional,Tuple +from deepx.nn.modules import Module,Linear +from deepx import Tensor,repeat_kv,matmul,softmax,concat,arange + + + +def rotate_half(x:Tensor): + index_front=arange(0,x.shape[-1]//2,dtype="int32") + index_back=arange(x.shape[-1]//2,x.shape[-1],dtype="int32") + x1 = x.index_select(dim=-1,index=index_front) + x2 = x.index_select(dim=-1,index=index_back) + return concat((-x2, x1), dim=-1) + +def apply_rotary_pos_emb(q:Tensor, k:Tensor, cos:Tensor, sin:Tensor, unsqueeze_dim:int=1): + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py +# 经简化,去掉了分布式配置,去掉attention的配置。交给IR自动替换flashattention,后续的组件自动处理 + +def eager_attention_forward( + module: Module, + query: Tensor, + key: Tensor, + value: Tensor, + attention_mask: Optional[Tensor], + scaling: float, + dropout: float = 0.0, +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = softmax(attn_weights, dim=-1, dtype=query.dtype) + attn_weights = dropout(attn_weights, p=dropout, training=module.training) + attn_output = matmul(attn_weights, value_states) + attn_output = 
attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + +class LlamaAttention(Module): + def __init__(self, config:dict, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.scaling = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + self.is_causal = True + + self.q_proj = Linear( + config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias + ) + self.k_proj = Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.v_proj = Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.o_proj = Linear( + config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias + ) + + def forward( + self, + hidden_states: Tensor, + position_embeddings: Tuple[Tensor, Tensor], + attention_mask: Optional[Tensor] + ) -> Tuple[Tensor, Optional[Tensor], Optional[Tuple[Tensor]]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1) + attn_output = self.o_proj(attn_output) + return 
attn_output, attn_weights \ No newline at end of file diff --git a/front/py/deepx/transformer/models/llama/embedding.py b/front/py/deepx/transformer/models/llama/embedding.py new file mode 100644 index 00000000..8bac5baa --- /dev/null +++ b/front/py/deepx/transformer/models/llama/embedding.py @@ -0,0 +1,62 @@ +from deepx.nn.modules import Module +from deepx import Tensor,concat +from deepx.transformer.modeling_rope_utils import ROPE_INIT_FUNCTIONS + +# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py +class LlamaRotaryEmbedding(Module): + def __init__(self,config:dict): + super().__init__() + # 最大序列长度 + self.max_seq_len_cached = config["max_position_embeddings"] + # 原始最大序列长度 + self.original_max_seq_len = config["max_position_embeddings"] + # 旋转类型 + self.rope_type=config["rope_scaling"]["type"] + # 旋转初始化函数 + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + # 旋转初始化函数 + inv_freq, self.attention_scaling = self.rope_init_fn(self.config) + # 注册缓存 + self.register_buffer("inv_freq", inv_freq, persistent=False) + # 原始旋转频率 + self.original_inv_freq = self.inv_freq + + # def _dynamic_frequency_update(self, position_ids, device): + # """ + # dynamic RoPE layers should recompute `inv_freq` in the following situations: + # 1 - growing beyond the cached sequence length (allow scaling) + # 2 - the current sequence length is in the original scale (avoid losing precision with small sequences) + # """ + # seq_len = torch.max(position_ids) + 1 + # if seq_len > self.max_seq_len_cached: # growth + # inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len) + # self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation + # self.max_seq_len_cached = seq_len + + # if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset + # # This .to() is needed if the model has been moved to a device after being initialized 
(because + # # the buffer is automatically moved, but not the original copy) + # self.original_inv_freq = self.original_inv_freq.to(device) + # self.register_buffer("inv_freq", self.original_inv_freq, persistent=False) + # self.max_seq_len_cached = self.original_max_seq_len + + def forward(self, x, position_ids): + # 扩展旋转频率 + inv_freq_expanded = self.inv_freq.unsqueeze(dim=0).unsqueeze(dim=2).float() + broadcast_shape=(position_ids.shape[0], self.inv_freq.shape[0], 1) + inv_freq_expanded = inv_freq_expanded.broadcast_to(broadcast_shape) + + # 使用torch.unsqueeze和type转换替代索引操作 + position_ids_expanded = position_ids.unsqueeze(dim=1).to(dtype=x.dtype) + # 计算频率 + freqs = (inv_freq_expanded @ position_ids_expanded).transpose(1, 2) + # 拼接频率 + emb = concat((freqs, freqs), dim=-1) + # 计算余弦和正弦 + cos = emb.cos() + sin = emb.sin() + # 应用缩放因子 + cos = cos * self.attention_scaling + sin = sin * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) diff --git a/front/py/deepx/transformer/models/llama/mlp.py b/front/py/deepx/transformer/models/llama/mlp.py new file mode 100644 index 00000000..a35ea3be --- /dev/null +++ b/front/py/deepx/transformer/models/llama/mlp.py @@ -0,0 +1,26 @@ +from deepx.nn.functional import swish as swish_fn +from deepx.nn.modules import Module,Linear + +ACT2FN={ + "silu":swish_fn, +} + +class LlamaMLP(Module): + def __init__(self, config:dict): + super().__init__() + # 输入层大小 + self.hidden_size = config.hidden_size + # 中间层大小 + self.intermediate_size = config["intermediate_size"] + #门控投影层 + self.gate_proj = Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + #上投影层 + self.up_proj = Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + #下投影层 + self.down_proj = Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) + #激活函数 + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return 
down_proj \ No newline at end of file diff --git a/front/py/deepx/transformer/models/llama/modeling_llama.py b/front/py/deepx/transformer/models/llama/modeling_llama.py index c3b07fd3..cc9cfc8b 100644 --- a/front/py/deepx/transformer/models/llama/modeling_llama.py +++ b/front/py/deepx/transformer/models/llama/modeling_llama.py @@ -1,102 +1,294 @@ -from deepx.nn.modules import Module,Linear -from deepx import Tensor,ones,rsqrt,concat +from typing import Optional,Tuple +from deepx.nn.modules import Module,Linear,Embedding +from deepx import Tensor from deepx.transformer.modeling_rope_utils import ROPE_INIT_FUNCTIONS -# RMSNorm -# copy from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py -# 数学公式 -class LlamaRMSNorm(Module): - def __init__(self, hidden_size, eps=1e-6): - """ - LlamaRMSNorm is equivalent to T5LayerNorm - """ +from deepx.transformer.models.llama.attention import LlamaAttention +from deepx.transformer.models.llama.mlp import LlamaMLP +from deepx.transformer.models.llama.normalization import LlamaRMSNorm +from deepx.transformer.models.llama.embedding import LlamaRotaryEmbedding + + +class LlamaDecoderLayer(Module): + def __init__(self, config:dict, layer_idx: int): super().__init__() - self.weight=ones(hidden_size) - self.register_parameter("weight",self.weight) - self.variance_epsilon = eps - def forward(self, hidden_states:Tensor): - variance = hidden_states.pow(2).mean((-1,), keepdim=True) - hidden_states = hidden_states * rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states + self.hidden_size = config.hidden_size + + self.self_attn = LlamaAttention(config=config, layer_idx=layer_idx) + + self.mlp = LlamaMLP(config) + self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: Tensor, + attention_mask: Optional[Tensor] = 
None, + position_ids: Optional[Tensor] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + position_embeddings: Optional[Tuple[Tensor, Tensor]] = None, # necessary, but kept here for BC + ) -> Tuple[Tensor, Optional[Tuple[Tensor, Tensor]]]: + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + use_cache=use_cache, + position_embeddings=position_embeddings, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + if output_attentions: + outputs += (self_attn_weights,) + + return outputs - def extra_repr(self): - return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" - +class LlamaModel(Module): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`LlamaDecoderLayer`] + + Args: + config: LlamaConfig + """ -class LlamaRotaryEmbedding(Module): - def __init__(self,rope_type:str="default",max_seq_len:int=1024,device=None): + def __init__(self, config:dict): super().__init__() - # 最大序列长度 - self.max_seq_len_cached = max_seq_len - # 原始最大序列长度 - self.original_max_seq_len = max_seq_len - # 旋转类型 - self.rope_type=rope_type - # 旋转初始化函数 - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - # 旋转初始化函数 - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - #TODO - # 注册缓存 - self.register_buffer("inv_freq", inv_freq, persistent=False) - # 原始旋转频率 - self.original_inv_freq = self.inv_freq - - # def _dynamic_frequency_update(self, position_ids, device): - # """ - # dynamic RoPE layers should recompute `inv_freq` in the following situations: - # 1 - growing beyond the cached sequence length (allow scaling) - # 2 - the current sequence length is in the original scale (avoid losing precision with small sequences) - # """ - # seq_len = torch.max(position_ids) + 1 - # if seq_len > self.max_seq_len_cached: # growth - # inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len) - # self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation - # self.max_seq_len_cached = seq_len - - # if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset - # # This .to() is needed if the model has been moved to a device after being initialized (because - # # the buffer is automatically moved, but not the original copy) - # self.original_inv_freq = self.original_inv_freq.to(device) - # self.register_buffer("inv_freq", self.original_inv_freq, persistent=False) - # self.max_seq_len_cached = self.original_max_seq_len - - def forward(self, x, position_ids): - # TODO - # if "dynamic" in self.rope_type: - # self._dynamic_frequency_update(position_ids, device=x.device) - - # Core RoPE block - 
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) - position_ids_expanded = position_ids[:, None, :].float() + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = ModuleList( + [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = LlamaRotaryEmbedding(config=config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = concat((freqs, freqs), dim=-1) - cos = emb.cos() - sin = emb.sin() + def forward( + self, + input_ids: Tensor = None, + attention_mask: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict - # Advanced RoPE types (e.g. 
yarn) apply a post-processing scaling factor, equivalent to scaling attention - cos = cos * self.attention_scaling - sin = sin * self.attention_scaling + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False -class LlamaMLP(Module): - def __init__(self, config): - super().__init__() - self.config = config - # 输入层大小 - self.hidden_size = config.hidden_size - # 中间层大小 - self.intermediate_size = config.intermediate_size - #门控投影层 - self.gate_proj = Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) - #上投影层 - self.up_proj = Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) - #下投影层 - self.down_proj = Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) - #激活函数 - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - return down_proj \ No newline at end of file + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache() + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + + hidden_states = inputs_embeds + + # create position embeddings to be shared across the decoder layers + position_embeddings = 
self.rotary_emb(hidden_states, position_ids) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + output_attentions=output_attentions, + use_cache=use_cache, + position_embeddings=position_embeddings, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + output = BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values if use_cache else None, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + return output if return_dict else output.to_tuple() + + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and (attention_mask == 0.0).any(): + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. 
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. 
+ # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
+ causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask diff --git a/front/py/deepx/transformer/models/llama/normalization.py b/front/py/deepx/transformer/models/llama/normalization.py new file mode 100644 index 00000000..867f3db4 --- /dev/null +++ b/front/py/deepx/transformer/models/llama/normalization.py @@ -0,0 +1,22 @@ +from deepx.nn.modules import Module +from deepx import Tensor,ones,rsqrt +# RMSNorm +# copy from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py +# 数学公式 +class LlamaRMSNorm(Module): + def __init__(self, hidden_size:int, eps:float=1e-6): + """ + LlamaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight=ones((hidden_size,)) + self.register_parameter("weight",self.weight) + self.variance_epsilon = eps + def forward(self, hidden_states:Tensor): + variance = hidden_states.pow(2).mean((-1,), keepdim=True) + hidden_states = hidden_states * rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + \ No newline at end of file diff --git a/front/py/examples/1_tensor/1_copy.py 
b/front/py/examples/1_tensor/1_copy.py index 244d4bfa..96651626 100644 --- a/front/py/examples/1_tensor/1_copy.py +++ b/front/py/examples/1_tensor/1_copy.py @@ -2,8 +2,8 @@ def copytest(): from deepx.nn.functional import newtensor - t1= newtensor(1, 2, 3,name='t1') - t2= newtensor(1, 2, 3,name='t2') + t1= newtensor((1, 2, 3),name='t1') + t2= newtensor((1, 2, 3),name='t2') t1.print() t1.copy_to(t2) t2.print() diff --git a/front/py/examples/1_tensor/1_print.py b/front/py/examples/1_tensor/1_print.py index 7a5a205e..5072e1e0 100644 --- a/front/py/examples/1_tensor/1_print.py +++ b/front/py/examples/1_tensor/1_print.py @@ -6,7 +6,7 @@ def newtensor(): from deepx.nn.functional import newtensor - t=newtensor(1,2,3,name='t') + t=newtensor((1,2,3),name='t') t.print() if __name__ == "__main__": diff --git a/front/py/examples/2_ir/1_init_zeroones.py b/front/py/examples/2_ir/1_init_zeroones.py index ca788569..e286086e 100644 --- a/front/py/examples/2_ir/1_init_zeroones.py +++ b/front/py/examples/2_ir/1_init_zeroones.py @@ -23,15 +23,15 @@ import deepx print() -t1 = deepx.zeros([3,4,5],dtype='float32') -t2 = deepx.ones([3,4,5],dtype='float32') -t4=deepx.full([3,4,5],value=0.5) +t1 = deepx.zeros((3,4,5),dtype='float32') +t2 = deepx.ones((3,4,5),dtype='float32') +t4=deepx.full((3,4,5),value=0.5) t4.print() -t5=deepx.uniform(3,4,5,low=0,high=1) +t5=deepx.uniform((3,4,5),low=0,high=1) t5.print() -t6=deepx.kaiming_uniform(3,4,5,dtype='float32') +t6=deepx.kaiming_uniform((3,4,5),dtype='float32') t6.print() -t7=deepx.zeros(3,4,5,dtype='float32') +t7=deepx.zeros((3,4,5),dtype='float32') t7.normal_(mean=0,stddev=0.02) t7.print("%.6f") diff --git a/front/py/examples/2_ir/2_elementwise_add.py b/front/py/examples/2_ir/2_elementwise_add.py index dca1fa93..d0b016bf 100644 --- a/front/py/examples/2_ir/2_elementwise_add.py +++ b/front/py/examples/2_ir/2_elementwise_add.py @@ -14,7 +14,7 @@ print() -t1 = full(2,3,4, value=10,dtype="float32") +t1 = full((2,3,4), value=10,dtype="float32") t2 = 
t1.clone() t3 = t1+t2 t3.add_(0.5) diff --git a/front/py/examples/2_ir/2_elementwise_compare.py b/front/py/examples/2_ir/2_elementwise_compare.py index 7f010870..6c66670c 100644 --- a/front/py/examples/2_ir/2_elementwise_compare.py +++ b/front/py/examples/2_ir/2_elementwise_compare.py @@ -17,10 +17,10 @@ print() -t1 = full(2,3,4, value=10,dtype="int8") +t1 = full((2,3,4), value=10,dtype="int8") t2 = ~t1 t2.print() -t3 = full(2,3,4, value=2,dtype="int64") +t3 = full((2,3,4), value=2,dtype="int64") t4 = ~t3 t4.print() \ No newline at end of file diff --git a/front/py/examples/2_ir/2_elementwise_dropout.py b/front/py/examples/2_ir/2_elementwise_dropout.py new file mode 100644 index 00000000..6ac49a56 --- /dev/null +++ b/front/py/examples/2_ir/2_elementwise_dropout.py @@ -0,0 +1,21 @@ +############-------PyTorch-------################ + +print() +import torch +torch_t1 = torch.arange(24, dtype=torch.int32).reshape(2,3,4) +torch_t2 = torch_t1.dropout(p=0.5) +print(torch_t2) + + + + +############-------DEEPX-------################ + +from deepx import Tensor,arange + +print() + +t1 = arange(start=0,end=24 ,dtype="int32").reshape_(2,3,4) +t2 = t1.dropout(p=0.5) +t2.print() + \ No newline at end of file diff --git a/front/py/examples/2_ir/2_elementwise_sqrtlog.py b/front/py/examples/2_ir/2_elementwise_sqrtlog.py index 705219ed..ed5ca24a 100644 --- a/front/py/examples/2_ir/2_elementwise_sqrtlog.py +++ b/front/py/examples/2_ir/2_elementwise_sqrtlog.py @@ -1,5 +1,6 @@ ############-------PyTorch-------################ +print() import torch torch_t1 = torch.arange(3*4*5, dtype=torch.float32) diff --git a/front/py/examples/2_ir/3_matmul.py b/front/py/examples/2_ir/3_matmul.py index 205b27c0..413a2613 100644 --- a/front/py/examples/2_ir/3_matmul.py +++ b/front/py/examples/2_ir/3_matmul.py @@ -33,7 +33,7 @@ t1 = load(npy_path+'t1') t2 = load(npy_path+'t2') -t3= zeros(1024,1024,dtype='float32',name="t3") +t3= zeros((1024,1024),dtype='float32',name="t3") from deepx.nn.functional 
import defaultauthor defaultauthor['matmul']='miaobyte' # warmup diff --git a/front/py/examples/2_ir/4_changeshape_broadcast.py b/front/py/examples/2_ir/4_changeshape_broadcast.py index 84199680..15ceb34f 100644 --- a/front/py/examples/2_ir/4_changeshape_broadcast.py +++ b/front/py/examples/2_ir/4_changeshape_broadcast.py @@ -1,6 +1,6 @@ #######====PYTORCH======######## - +print() import torch a=torch.arange(4*2*3).reshape(4,2,3) b=torch.arange(2*1).reshape(2,1) diff --git a/front/py/examples/2_ir/4_changeshape_concat.py b/front/py/examples/2_ir/4_changeshape_concat.py index cda373fb..ada8e0b0 100644 --- a/front/py/examples/2_ir/4_changeshape_concat.py +++ b/front/py/examples/2_ir/4_changeshape_concat.py @@ -1,5 +1,6 @@ ############-------PyTorch-------################ +print() import torch torch_t1 = torch.ones(3, 4,5, dtype=torch.float32) torch_t2 = torch.ones(3, 4,5, dtype=torch.float32) @@ -13,7 +14,6 @@ from deepx import Tensor,zeros, ones, concat -print() t1 = ones([3,4,5],dtype='float32',name='t1') t2=ones([3,4,5],dtype='float32',name='t2') diff --git a/front/py/examples/2_ir/4_changeshape_gather.py b/front/py/examples/2_ir/4_changeshape_gather.py index df12e5f2..5a7b6be4 100644 --- a/front/py/examples/2_ir/4_changeshape_gather.py +++ b/front/py/examples/2_ir/4_changeshape_gather.py @@ -1,27 +1,26 @@ ############-------PyTorch-------################ -import numpy as np +import os print() -indices_np = np.array([[0, 1, 2], [0, 1, 2]]) - -print(indices_np) - +dir=os.path.expanduser('~/model/deepxmodel/functional/') import torch torch_t = torch.arange(10*5, dtype=torch.float32).reshape(10,5) -torch_indices = torch.tensor(indices_np) -torch_t2 = torch.index_select(torch_t, 1,torch_indices) +index=[0, 1, 2,0, 1, 2] +torch_index = torch.tensor(index,dtype=torch.int32) + +from deepxutil.torch import save_torch +save_torch(torch_index,dir+'gatherindex') + +torch_t2 = torch.index_select(torch_t, 1,torch_index) print(torch_t2.shape) print(torch_t2) 
############-------DEEPX-------################ -from deepx import Tensor,arange,Shape,load -from deepxutil.numpy import save_numpy - -save_numpy(indices_np,'/home/lipeng/model/deepxmodel/tester/testindices') +from deepx import arange ,load t = arange(start=0,end=10*5,dtype='float32',name='t').reshape_((10,5)) -indices = load('/home/lipeng/model/deepxmodel/tester/testindices') +indices = load(dir+'gatherindex') indices.print() t2 = t.indexselect(indices,axis=1) t2.print() diff --git a/front/py/examples/2_ir/4_changeshape_reshape.py b/front/py/examples/2_ir/4_changeshape_reshape.py index a8dde5f5..aec8153d 100644 --- a/front/py/examples/2_ir/4_changeshape_reshape.py +++ b/front/py/examples/2_ir/4_changeshape_reshape.py @@ -1,4 +1,4 @@ - +print() ############-------PyTorch-------################ import torch @@ -13,7 +13,7 @@ ############-------DEEPX-------################ from deepx import Tensor,zeros, ones, full, arange -print() + t1 = ones((3,4),dtype='float32',name='t1') t1.print() t2=t1.reshape((3,2,2)) diff --git a/front/py/examples/2_ir/5_reduce_prod.py b/front/py/examples/2_ir/5_reduce_prod.py index a6f0eb63..60676f51 100644 --- a/front/py/examples/2_ir/5_reduce_prod.py +++ b/front/py/examples/2_ir/5_reduce_prod.py @@ -3,15 +3,10 @@ import torch torch_t = torch.arange(0,60).reshape(3,4,5) print(torch_t) -torch_s = torch.sum(torch_t, dim=[0, 2]) -print(torch_s) -# torch_p=torch.prod(torch_t,dim=1) -# print(torch_p) -torch_t1 = torch.ones(4, 5, 6,dtype=torch.float) -print(torch_t1) -torch_t2 = torch.sum(torch_t1, dim=[0, 1]) -print(torch_t2) +torch_p=torch.prod(torch_t,dim=1) +print(torch_p) + ############-------DEEPX-------################ @@ -20,14 +15,7 @@ from deepx.nn.functional import sum,prod t=arange(0,60,name='t').reshape_((3,4,5)) - t.print() -s=sum(t,dim=(0,2),out="s") -s.print() -# p=prod(t,dim=(1,),out="p") -# p.print() - -t1=ones((4,5,6),name="t1") -t1.print() -t2=sum(t1,dim=(0,1),out='t2') -t2.print() + +p=prod(t,dim=(1,),out="p") +p.print() 
diff --git a/front/py/examples/2_ir/6_tensorlife_to.py b/front/py/examples/2_ir/6_tensorlife_to.py new file mode 100644 index 00000000..03f16276 --- /dev/null +++ b/front/py/examples/2_ir/6_tensorlife_to.py @@ -0,0 +1,20 @@ +############-------PyTorch-------################ + +print() +import torch +torch_t1 = torch.full((2,3,4, ), 10, dtype=torch.float32) +torch_t2 = torch_t1.to(dtype=torch.bfloat16) +print(torch_t2) +torch_t3 = torch_t2.to(dtype=torch.float32) +print(torch_t3) + +############-------DEEPX-------################ + +from deepx import full + + +t1 = full((2,3,4), value=10,dtype="float32") +t2 = t1.to(dtype="bfloat16") +t2.print() +t3 = t2.to(dtype="float32") +t3.print() \ No newline at end of file diff --git a/front/py/examples/3_functional/1_dropout.py b/front/py/examples/3_functional/1_dropout.py new file mode 100644 index 00000000..7aabaf7e --- /dev/null +++ b/front/py/examples/3_functional/1_dropout.py @@ -0,0 +1,9 @@ +############-------PyTorch-------################ +print() + +import torch +import torch.nn.functional as F +torch_t = torch.empty(10, 10).uniform_(-1, 1) +torch_relu_t = F.dropout(torch_t) +print(torch_t) +print(torch_relu_t) diff --git a/front/py/examples/3_functional/1_relu.py b/front/py/examples/3_functional/1_relu.py deleted file mode 100644 index 9cd1737e..00000000 --- a/front/py/examples/3_functional/1_relu.py +++ /dev/null @@ -1,27 +0,0 @@ -############-------PyTorch-------################ - -import torch -import torch.nn.functional as F -torch_t = torch.empty(10, 10).uniform_(-1, 1) -torch_relu_t = F.relu(torch_t) -print(torch_t) -print(torch_relu_t) - -############-------DEEPX-------################ - -from deepx import Tensor,ones -from deepx.nn.functional import relu,uniform - - -t=uniform(10,10,low=-1,high=1,name='t') - -print(t) -relu_t=relu(t) -print(relu_t) - -# 当tensor.name为str时,说明其是中间变量,执行inplace操作 -t2=uniform(10,10,low=-1,high=1) -t2.print() -relu_t2=relu(t2) -relu_t2.print() - diff --git 
a/front/py/examples/3_functional/1_rsqrt.py b/front/py/examples/3_functional/1_rsqrt.py deleted file mode 100644 index aa4926a6..00000000 --- a/front/py/examples/3_functional/1_rsqrt.py +++ /dev/null @@ -1,18 +0,0 @@ -############-------PyTorch-------################ - -import torch -import torch.nn.functional as F -torch_t = torch.arange(0, 24).reshape(2, 3, 4) -torch_rsqrt_t = torch.rsqrt(torch_t) -print(torch_t) -print(torch_rsqrt_t) - -############-------DEEPX-------################ - -from deepx import arange -from deepx.nn.functional import rsqrt - -t=arange(2,3,4,name='t') -t.print() -rsqrt_t=rsqrt(t) -rsqrt_t.print() diff --git a/front/py/examples/3_functional/1_swiglu.py b/front/py/examples/3_functional/1_swiglu.py deleted file mode 100644 index 74120526..00000000 --- a/front/py/examples/3_functional/1_swiglu.py +++ /dev/null @@ -1,34 +0,0 @@ -############-------PyTorch-------################ -import torch -import torch.nn.functional as F - -# 使用arange创建连续数据,确保最后一维是偶数以便分割 -x_torch = torch.arange(48, dtype=torch.float32).reshape(3, 4, 4) / 10.0 - 3.0 -print("PyTorch tensor:") -print(x_torch) - -# SwiGLU实现:将tensor在最后一维分成两半 -x1, x2 = torch.split(x_torch, x_torch.size(-1) // 2, dim=-1) -out_torch = F.silu(x1) * x2 # SwiGLU: swish(x1) * x2 -print("\nPyTorch swiglu result:") -print(out_torch) - -############-------DEEPX-------################ -from deepx import arange,swish,swiglu - -# 使用相同的初始化方式 -x = arange(0,48,1,name="x").reshape_(3,4,4) -x.div_(10.0) -x.sub_(3.0) - -print("\nDEEPX tensor:") -print(x) - -out = swiglu(x,out="out") -print("\nDEEPX swiglu result:") -print(out) - -import os -script_name = os.path.splitext(os.path.basename( os.path.abspath(__file__)))[0] # 获取不带后缀的脚本名 -str=out.graph.to_dot() -str.render(script_name+".dot", format='svg') \ No newline at end of file diff --git a/front/py/examples/3_functional/activite_relu.py b/front/py/examples/3_functional/activite_relu.py new file mode 100644 index 00000000..7a899c74 --- /dev/null +++ 
b/front/py/examples/3_functional/activite_relu.py @@ -0,0 +1,25 @@ +############-------PyTorch-------################ +print() + +import torch +import torch.nn.functional as F +torch_t = torch.empty(10, 10).uniform_(-1, 1) +torch_relu_t = F.relu(torch_t) +print(torch_t) +print(torch_relu_t) + +import os +dir=os.path.expanduser('~/model/deepxmodel/functional/') +from deepxutil.torch import save_torch +save_torch(torch_t,dir+'uniformed') + +############-------DEEPX-------################ + +from deepx import relu,load + + +t=load(dir+'uniformed') +t.print() +relu_t=relu(t) +relu_t.print() + diff --git a/front/py/examples/3_functional/1_sigmoid.py b/front/py/examples/3_functional/activite_sigmoid.py similarity index 70% rename from front/py/examples/3_functional/1_sigmoid.py rename to front/py/examples/3_functional/activite_sigmoid.py index dbdfd614..8859fe4b 100644 --- a/front/py/examples/3_functional/1_sigmoid.py +++ b/front/py/examples/3_functional/activite_sigmoid.py @@ -6,18 +6,21 @@ print("PyTorch tensor:") print(x_torch) +import os +dir=os.path.expanduser('~/model/deepxmodel/functional/') +from deepxutil.torch import save_torch +save_torch(x_torch,dir+'sigmoided') + out_torch = torch.sigmoid(x_torch) print("\nPyTorch sigmoid result:") print(out_torch) ############-------DEEPX-------################ -from deepx import Tensor,ones,zeros,arange +from deepx import Tensor,ones,zeros,arange,load from deepx import sigmoid # 使用相同的初始化方式 -x = arange(3,4,5,name="x") -x.div_(10.0) -x.sub_(3.0) +x = load(dir+'sigmoided') print("\nDEEPX tensor:") x.print() diff --git a/front/py/examples/3_functional/1_swish.py b/front/py/examples/3_functional/activite_swish.py similarity index 65% rename from front/py/examples/3_functional/1_swish.py rename to front/py/examples/3_functional/activite_swish.py index f4e8c7c3..ca9f431e 100644 --- a/front/py/examples/3_functional/1_swish.py +++ b/front/py/examples/3_functional/activite_swish.py @@ -1,23 +1,25 @@ 
############-------PyTorch-------################ import torch -import torch.nn.functional as F # 使用arange创建连续数据 x_torch = torch.arange(60, dtype=torch.float32).reshape(3, 4, 5) / 10.0 - 3.0 print("PyTorch tensor:") print(x_torch) -out_torch = F.silu(x_torch) # silu 就是 swish 激活函数 +import os +dir=os.path.expanduser('~/model/deepxmodel/functional/') +from deepxutil.torch import save_torch +save_torch(x_torch,dir+'swish') + +out_torch = torch.nn.functional.silu(x_torch) print("\nPyTorch swish result:") print(out_torch) ############-------DEEPX-------################ -from deepx import arange,swish +from deepx import load, swish # 使用相同的初始化方式 -x = arange(3,4,5,name="x") -x.div_(10.0) -x.sub_(3.0) +x = load(dir+'swish') print("\nDEEPX tensor:") x.print() diff --git a/front/py/examples/3_functional/elementwise_rsqrt.py b/front/py/examples/3_functional/elementwise_rsqrt.py new file mode 100644 index 00000000..cf50e40d --- /dev/null +++ b/front/py/examples/3_functional/elementwise_rsqrt.py @@ -0,0 +1,21 @@ +############-------PyTorch-------################ + +import torch +torch_t = torch.arange(0, 24,dtype=torch.float).reshape(2, 3, 4) +torch_rsqrt_t = torch.rsqrt(torch_t) +print(torch_t) +print(torch_rsqrt_t) + +import os +dir = os.path.expanduser('~/model/deepxmodel/functional/') +from deepxutil.torch import save_torch +save_torch(torch_t, dir + 'aranged') + +############-------DEEPX-------################ + +from deepx import rsqrt,load + +t=load(dir+'aranged') +t.print() +rsqrt_t=rsqrt(t) +rsqrt_t.print() diff --git a/front/py/examples/3_functional/1_softmax.py b/front/py/examples/3_functional/normalization_softmax.py similarity index 62% rename from front/py/examples/3_functional/1_softmax.py rename to front/py/examples/3_functional/normalization_softmax.py index f3b78b35..3758daa0 100644 --- a/front/py/examples/3_functional/1_softmax.py +++ b/front/py/examples/3_functional/normalization_softmax.py @@ -6,22 +6,24 @@ print("PyTorch tensor:") print(x_torch) -out_torch = 
torch.softmax(x_torch,-1) +out_torch = torch.softmax(x_torch,-2) print("\nPyTorch sigmoid result:") print(out_torch) +import os +dir = os.path.expanduser('~/model/deepxmodel/functional/') +from deepxutil.torch import save_torch +save_torch(x_torch, dir + 'forsoftmax') + ############-------DEEPX-------################ -from deepx import Tensor,ones,zeros,arange -from deepx import softmax +from deepx import softmax,load # 使用相同的初始化方式 -x = arange(3,4,5,name="x") -x.div_(10.0) -x.sub_(3.0) +x=load(dir+'forsoftmax') print("\nDEEPX tensor:") x.print() -out=softmax(x,-1) +out=softmax(x,[-2]) print("\nDEEPX sigmoid result:") out.print() diff --git a/front/py/examples/3_functional/1_mean.py b/front/py/examples/3_functional/reduce_mean.py similarity index 77% rename from front/py/examples/3_functional/1_mean.py rename to front/py/examples/3_functional/reduce_mean.py index 64511555..e6d6d368 100644 --- a/front/py/examples/3_functional/1_mean.py +++ b/front/py/examples/3_functional/reduce_mean.py @@ -1,5 +1,5 @@ ############-------PyTorch-------################ - +print() import torch torch_t3 = torch.arange(0, 120,dtype=torch.float).reshape(4, 5, 6) @@ -8,12 +8,9 @@ print(torch_t3_mean) ############-------DEEPX-------################ +from deepx.nn.functional import mean,arange -from deepx import arange -from deepx.nn.functional import mean - - -t3=arange(4,5,6,name="t3") +t3 = arange(0, 120).reshape((4, 5, 6)) t3.print() t3_mean=mean(t3,dim=(0,1)) diff --git a/front/py/examples/3_module/1_embedding.py b/front/py/examples/3_module/1_embedding.py index 3204fc00..ff5c1f17 100644 --- a/front/py/examples/3_module/1_embedding.py +++ b/front/py/examples/3_module/1_embedding.py @@ -28,13 +28,14 @@ def tokenize_text(text, tokenizer): torch_input = tokenize_text(text, tokenizer) from deepxutil.torch import save_torch save_torch(torch_input,dir+'input') +print(torch_input.shape) print(torch_input) # 创建网络 torch_net = nn.Embedding(tokenizer.vocab_size, 4096) 
save_torch(torch_net.weight,dir+'weight') # 前向传播 torch_output = torch_net(torch_input) -print() + print(torch_output.shape) print(torch_output) diff --git a/front/py/examples/3_module/1_swiglu.py b/front/py/examples/3_module/1_swiglu.py deleted file mode 100644 index 0e350582..00000000 --- a/front/py/examples/3_module/1_swiglu.py +++ /dev/null @@ -1,43 +0,0 @@ -hidden_size = 8 -eps = 1e-6 -dir='/home/lipeng/model/deepxmodel/llama/' - - - -############### PyTorch 实现部分 ############### -import torch -# 使用小规模数据以便打印完整结果 -pt_input = torch.arange(48, dtype=torch.float32).reshape(2, 3, hidden_size) / 10.0 - 2.0 -print("PyTorch 输入:") -print(pt_input) - -from transformers.models.llama.modeling_llama import LlamaRMSNorm as TransformersLlamaRMSNorm -from deepxutil.torch import save_torch -save_torch(pt_input,dir+'rmsnorm_input') -# 使用transformers库中的官方LlamaRMSNorm实现 -pt_norm = TransformersLlamaRMSNorm(hidden_size, eps=eps) -# 设置权重为固定值0.5 -with torch.no_grad(): - pt_norm.weight.fill_(0.5) -# 前向传播 -pt_output = pt_norm(pt_input) - - -print("\nPyTorch RMSNorm 结果:") -print(pt_output.shape) -print(pt_output) - - -############### DeepX 实现部分 ############### -from deepx import constant_,load -from deepx.transformer.models.llama.modeling_llama import LlamaRMSNorm - -input=load(dir+'rmsnorm_input') - -# DeepX计算流程 -norm = LlamaRMSNorm(hidden_size=hidden_size, eps=eps) -# 设置相同的权重 -constant_(norm.weight, 0.5) -# 前向计算 -output = norm(input) -output.print() diff --git a/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py b/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py index 0e350582..f0160a27 100644 --- a/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py +++ b/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py @@ -30,7 +30,7 @@ ############### DeepX 实现部分 ############### from deepx import constant_,load -from deepx.transformer.models.llama.modeling_llama import LlamaRMSNorm +from deepx.transformer.models.llama.normalization import LlamaRMSNorm 
input=load(dir+'rmsnorm_input') diff --git a/todo/modeling_llama.py b/todo/modeling_llama.py deleted file mode 100644 index 8cbb1262..00000000 --- a/todo/modeling_llama.py +++ /dev/null @@ -1,1155 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from typing import Callable, List, Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn - -from ...activations import ACT2FN -from ...cache_utils import Cache, DynamicCache, StaticCache -from ...generation import GenerationMixin -from ...modeling_attn_mask_utils import AttentionMaskConverter -from ...modeling_flash_attention_utils import FlashAttentionKwargs -from ...modeling_outputs import ( - BaseModelOutputWithPast, - CausalLMOutputWithPast, - QuestionAnsweringModelOutput, - SequenceClassifierOutputWithPast, - TokenClassifierOutput, -) -from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS -from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel -from ...processing_utils import Unpack -from ...pytorch_utils import ALL_LAYERNORM_LAYERS -from ...utils import ( - LossKwargs, - add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - logging, - replace_return_docstrings, -) -from .configuration_llama import LlamaConfig - - -logger = logging.get_logger(__name__) - -_CHECKPOINT_FOR_DOC = "meta-llama/Llama-2-7b-hf" -_CONFIG_FOR_DOC = "LlamaConfig" - - -class LlamaRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - LlamaRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - def extra_repr(self): - return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" - - -ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm) - - -class LlamaRotaryEmbedding(nn.Module): - def __init__(self, config: LlamaConfig, device=None): - super().__init__() - # BC: "rope_type" was 
originally "type" - if hasattr(config, "rope_scaling") and config.rope_scaling is not None: - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - def _dynamic_frequency_update(self, position_ids, device): - """ - dynamic RoPE layers should recompute `inv_freq` in the following situations: - 1 - growing beyond the cached sequence length (allow scaling) - 2 - the current sequence length is in the original scale (avoid losing precision with small sequences) - """ - seq_len = torch.max(position_ids) + 1 - if seq_len > self.max_seq_len_cached: # growth - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len) - self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation - self.max_seq_len_cached = seq_len - - if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset - # This .to() is needed if the model has been moved to a device after being initialized (because - # the buffer is automatically moved, but not the original copy) - self.original_inv_freq = self.original_inv_freq.to(device) - self.register_buffer("inv_freq", self.original_inv_freq, persistent=False) - self.max_seq_len_cached = self.original_max_seq_len - - @torch.no_grad() - def forward(self, x, position_ids): - if "dynamic" in self.rope_type: - self._dynamic_frequency_update(position_ids, device=x.device) - - # Core RoPE block - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) - 
position_ids_expanded = position_ids[:, None, :].float() - # Force float32 (see https://github.com/huggingface/transformers/pull/29285) - device_type = x.device.type - device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() - sin = emb.sin() - - # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention - cos = cos * self.attention_scaling - sin = sin * self.attention_scaling - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`, *optional*): - Deprecated and unused. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. 
Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. - """ - cos = cos.unsqueeze(unsqueeze_dim) - sin = sin.unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class LlamaMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - return down_proj - - -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -def eager_attention_forward( - module: nn.Module, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attention_mask: Optional[torch.Tensor], - scaling: float, - dropout: float = 0.0, - **kwargs, -): - key_states = repeat_kv(key, module.num_key_value_groups) - value_states = repeat_kv(value, module.num_key_value_groups) - - attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling - if attention_mask is not None: - causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] - attn_weights = attn_weights + causal_mask - - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) - attn_output = torch.matmul(attn_weights, value_states) - attn_output = attn_output.transpose(1, 2).contiguous() - - return attn_output, attn_weights - - -class LlamaAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config: LlamaConfig, layer_idx: int): - super().__init__() - self.config = config - self.layer_idx = layer_idx - self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) - self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads - self.scaling = self.head_dim**-0.5 - self.attention_dropout = config.attention_dropout - self.is_causal = True - - self.q_proj = nn.Linear( - config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias - ) - 
self.k_proj = nn.Linear( - config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias - ) - self.v_proj = nn.Linear( - config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias - ) - self.o_proj = nn.Linear( - config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias - ) - - def forward( - self, - hidden_states: torch.Tensor, - position_embeddings: Tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], - past_key_value: Optional[Cache] = None, - cache_position: Optional[torch.LongTensor] = None, - **kwargs: Unpack[FlashAttentionKwargs], - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - input_shape = hidden_states.shape[:-1] - hidden_shape = (*input_shape, -1, self.head_dim) - - query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) - key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) - value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - - cos, sin = position_embeddings - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) - - if past_key_value is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - attention_interface: Callable = eager_attention_forward - if self.config._attn_implementation != "eager": - if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False): - logger.warning_once( - "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " - 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- ) - else: - attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] - - attn_output, attn_weights = attention_interface( - self, - query_states, - key_states, - value_states, - attention_mask, - dropout=0.0 if not self.training else self.attention_dropout, - scaling=self.scaling, - **kwargs, - ) - - attn_output = attn_output.reshape(*input_shape, -1).contiguous() - attn_output = self.o_proj(attn_output) - return attn_output, attn_weights - - -class LlamaDecoderLayer(nn.Module): - def __init__(self, config: LlamaConfig, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - - self.self_attn = LlamaAttention(config=config, layer_idx=layer_idx) - - self.mlp = LlamaMLP(config) - self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC - **kwargs: Unpack[FlashAttentionKwargs], - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - position_embeddings=position_embeddings, - **kwargs, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = 
hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - if output_attentions: - outputs += (self_attn_weights,) - - return outputs - - -LLAMA_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`LlamaConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaPreTrainedModel(PreTrainedModel): - config_class = LlamaConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["LlamaDecoderLayer"] - _skip_keys_device_placement = ["past_key_values"] - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_flex_attn = True - _supports_cache_class = True - _supports_quantized_cache = True - _supports_static_cache = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -LLAMA_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `input_ids` have to be input (see - `past_key_values`). 
- - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance, see our - [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. 
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): - Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, - this tensor is not affected by padding. It is used to update the cache in the correct position and to infer - the complete sequence length. -""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaModel(LlamaPreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`LlamaDecoderLayer`] - - Args: - config: LlamaConfig - """ - - def __init__(self, config: LlamaConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList( - [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.rotary_emb = LlamaRotaryEmbedding(config=config) - self.gradient_checkpointing = False - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Cache] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - **flash_attn_kwargs: Unpack[FlashAttentionKwargs], - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - 
- if self.gradient_checkpointing and self.training and use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." - ) - use_cache = False - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if use_cache and past_key_values is None: - past_key_values = DynamicCache() - - if cache_position is None: - past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 - cache_position = torch.arange( - past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device - ) - - if position_ids is None: - position_ids = cache_position.unsqueeze(0) - - causal_mask = self._update_causal_mask( - attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions - ) - - hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - - for decoder_layer in self.layers[: self.config.num_hidden_layers]: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - causal_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - cache_position, - position_embeddings, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=causal_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - position_embeddings=position_embeddings, - **flash_attn_kwargs, - ) - - hidden_states = layer_outputs[0] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = 
self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - output = BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=past_key_values if use_cache else None, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - return output if return_dict else output.to_tuple() - - def _update_causal_mask( - self, - attention_mask: torch.Tensor, - input_tensor: torch.Tensor, - cache_position: torch.Tensor, - past_key_values: Cache, - output_attentions: bool, - ): - if self.config._attn_implementation == "flash_attention_2": - if attention_mask is not None and (attention_mask == 0.0).any(): - return attention_mask - return None - - # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in - # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail - # to infer the attention mask. - past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 - using_static_cache = isinstance(past_key_values, StaticCache) - - # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward - if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: - if AttentionMaskConverter._ignore_causal_mask_sdpa( - attention_mask, - inputs_embeds=input_tensor, - past_key_values_length=past_seen_tokens, - is_training=self.training, - ): - return None - - dtype, device = input_tensor.dtype, input_tensor.device - sequence_length = input_tensor.shape[1] - if using_static_cache: - target_length = past_key_values.get_max_cache_shape() - else: - target_length = ( - attention_mask.shape[-1] - if isinstance(attention_mask, torch.Tensor) - else past_seen_tokens + sequence_length + 1 - ) - - # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). 
- causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=target_length, - dtype=dtype, - device=device, - cache_position=cache_position, - batch_size=input_tensor.shape[0], - ) - - if ( - self.config._attn_implementation == "sdpa" - and attention_mask is not None - and attention_mask.device.type == "cuda" - and not output_attentions - ): - # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when - # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. - # Details: https://github.com/pytorch/pytorch/issues/110213 - min_dtype = torch.finfo(dtype).min - causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) - - return causal_mask - - @staticmethod - def _prepare_4d_causal_attention_mask_with_cache_position( - attention_mask: torch.Tensor, - sequence_length: int, - target_length: int, - dtype: torch.dtype, - device: torch.device, - cache_position: torch.Tensor, - batch_size: int, - **kwargs, - ): - """ - Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape - `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. - - Args: - attention_mask (`torch.Tensor`): - A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape - `(batch_size, 1, query_length, key_value_length)`. - sequence_length (`int`): - The sequence length being processed. - target_length (`int`): - The target length: when generating with static cache, the mask should be as long as the static cache, - to account for the 0 padding, the part of the cache that is not filled yet. - dtype (`torch.dtype`): - The dtype to use for the 4D attention mask. - device (`torch.device`): - The device to plcae the 4D attention mask on. 
- cache_position (`torch.Tensor`): - Indices depicting the position of the input sequence tokens in the sequence. - batch_size (`torch.Tensor`): - Batch size. - """ - if attention_mask is not None and attention_mask.dim() == 4: - # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. - causal_mask = attention_mask - else: - min_dtype = torch.finfo(dtype).min - causal_mask = torch.full( - (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device - ) - if sequence_length != 1: - causal_mask = torch.triu(causal_mask, diagonal=1) - causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) - causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) - if attention_mask is not None: - causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit - mask_length = attention_mask.shape[-1] - padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] - padding_mask = padding_mask == 0 - causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( - padding_mask, min_dtype - ) - - return causal_mask - - -class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ... 
- - -class LlamaForCausalLM(LlamaPreTrainedModel, GenerationMixin): - _tied_weights_keys = ["lm_head.weight"] - _tp_plan = {"lm_head": "colwise_rep"} - - def __init__(self, config): - super().__init__(config) - self.model = LlamaModel(config) - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - num_logits_to_keep: int = 0, - **kwargs: Unpack[KwargsForCausalLM], - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). 
Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - num_logits_to_keep (`int`, *optional*): - Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that - token can save memory, which becomes pretty significant for long sequences or large vocabulary size. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, LlamaForCausalLM - - >>> model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") - >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
- ```""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - cache_position=cache_position, - **kwargs, - ) - - hidden_states = outputs[0] - # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) - - loss = None - if labels is not None: - loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - The LLaMa Model transformer with a sequence classification head on top (linear layer). - - [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. 
Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). - """, - LLAMA_START_DOCSTRING, -) -class LlamaForSequenceClassification(LlamaPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = LlamaModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) - - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - -@add_start_docstrings( - """ -The Llama Model transformer with a span classification head on top for extractive question-answering tasks like -SQuAD (a linear layer on top of the hidden-states output to compute 
`span start logits` and `span end logits`). - """, - LLAMA_START_DOCSTRING, -) -class LlamaForQuestionAnswering(LlamaPreTrainedModel): - base_model_prefix = "transformer" - - # Copied from transformers.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->Llama - def __init__(self, config): - super().__init__(config) - self.transformer = LlamaModel(config) - self.qa_outputs = nn.Linear(config.hidden_size, 2) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.transformer.embed_tokens - - def set_input_embeddings(self, value): - self.transformer.embed_tokens = value - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - start_positions: Optional[torch.LongTensor] = None, - end_positions: Optional[torch.LongTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **kwargs, - ) -> Union[Tuple, QuestionAnsweringModelOutput]: - r""" - start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence - are not taken into account for computing the loss. - end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). 
Position outside of the sequence - are not taken into account for computing the loss. - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.transformer( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1).contiguous() - end_logits = end_logits.squeeze(-1).contiguous() - - loss = None - if start_positions is not None and end_positions is not None: - loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs) - - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((loss,) + output) if loss is not None else output - - return QuestionAnsweringModelOutput( - loss=loss, - start_logits=start_logits, - end_logits=end_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - The Llama Model transformer with a token classification head on top (a linear layer on top of the hidden-states - output) e.g. for Named-Entity-Recognition (NER) tasks. 
- """, - LLAMA_START_DOCSTRING, -) -class LlamaForTokenClassification(LlamaPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = LlamaModel(config) - if getattr(config, "classifier_dropout", None) is not None: - classifier_dropout = config.classifier_dropout - elif getattr(config, "hidden_dropout", None) is not None: - classifier_dropout = config.hidden_dropout - else: - classifier_dropout = 0.1 - self.dropout = nn.Dropout(classifier_dropout) - self.score = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=TokenClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, TokenClassifierOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output) - logits = self.score(sequence_output) - - loss = None - if labels is not None: - loss = self.loss_function(logits, labels, self.config) - - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -__all__ = [ - "LlamaForCausalLM", - "LlamaModel", - "LlamaPreTrainedModel", - "LlamaForSequenceClassification", - "LlamaForQuestionAnswering", - "LlamaForTokenClassification", -]