diff --git a/doc/excuter/op-mem-cuda/list.md b/doc/excuter/op-mem-cuda/list.md index f281979d..0fa979ae 100644 --- a/doc/excuter/op-mem-cuda/list.md +++ b/doc/excuter/op-mem-cuda/list.md @@ -4,98 +4,99 @@ ### arg -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| vecset | none | vecset(vector value)->(vector name) | [3 4 5]->shape | vecset(vector value)->(vector name) | -| argset | none | argset(var value)->(var name) | argvalue->argname | argset(var value)->(var name) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| vecset | none | [3 4 5]->shape | vecset(vector value)->(vector name) | +| argset | none | argvalue->argname | argset(var value)->(var name) | ### tensorlife -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| renametensor | none | renametensor(var new_name)->(tensor t) | rename(newname)->T1 | renametensor(var new_name)->(tensor t) | -| newtensor | none | newtensor(vector shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(vector shape)->(tensor tensor1) | -| newtensor | none | newtensor(var shape)->(tensor tensor1) | T1 = zeros(shape) | newtensor(var shape)->(tensor tensor1) | -| deltensor | none | deltensor()->(tensor t) | del->T1 | deltensor()->(tensor t) | -| copytensor | none | copytensor(tensor src)->(tensor dst) | T2.data = T1.data | copytensor(tensor src)->(tensor dst) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| renametensor | none | rename(newname)->T1 | renametensor(var new_name)->(tensor t) | +| newtensor | none | T1 = zeros(shape) | newtensor(vector shape)->(tensor tensor1) | +| newtensor | none | T1 = zeros(shape) | newtensor(var shape)->(tensor tensor1) | +| deltensor | none | del->T1 | deltensor()->(tensor t) | +| 
copytensor | none | T2.data = T1.data | copytensor(tensor src)->(tensor dst) | ### io -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| loadtensordata | none | loadtensordata(var path)->(tensor t) | loadtensordata(path)->tensor | loadtensordata(var path)->(tensor t) | -| save | none | save(tensor t, var path)->() | save(T1,path) | save(tensor t, var path)->() | -| print | miaobyte | print(tensor t)->() | print(T1) | print(tensor t)->() | -| print | miaobyte | print(tensor t, var format)->() | print(T1) | print(tensor t, var format)->() | -| load | none | load(var path)->() | load(path) | load(var path)->() | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| loadtensordata | none | loadtensordata(path)->tensor | loadtensordata(var path)->(tensor t) | +| save | none | save(T1,path) | save(tensor t, var path)->() | +| print | miaobyte | print(T1) | print(tensor t)->() | +| print | miaobyte | print(T1) | print(tensor t, var format)->() | +| load | none | load(path) | load(var path)->() | ### matmul -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| matmul | cublas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| matmul | cublas | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | ### init -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| normal | miaobyte | normal(var mean, var stddev, var seed)->(tensor t) | normal(mean,stddev,seed)->T1 | normal(var mean, var stddev, var seed)->(tensor t) | -| uniform | miaobyte | uniform(var low, var high, var seed)->(tensor t) | 
uniform(low,high,seed)->T1 | uniform(var low, var high, var seed)->(tensor t) | -| arange | miaobyte | arange(var start, var step)->(tensor t) | arange(start,step)->T1 | arange(var start, var step)->(tensor t) | -| constant | miaobyte | constant(var value)->(tensor t) | constant(value)->T1 | constant(var value)->(tensor t) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| normal | miaobyte | normal(mean,stddev,seed)->T1 | normal(var mean, var stddev, var seed)->(tensor t) | +| uniform | miaobyte | uniform(low,high,seed)->T1 | uniform(var low, var high, var seed)->(tensor t) | +| arange | miaobyte | arange(start,step)->T1 | arange(var start, var step)->(tensor t) | +| constant | miaobyte | constant(value)->T1 | constant(var value)->(tensor t) | ### elementwise -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| switch | miaobyte | switch(listtensor tensors, tensor cases)->(tensor result) | C=switch(tensors,cases) | switch(listtensor tensors, tensor cases)->(tensor result) | -| greaterscalar | miaobyte | greaterscalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1, scalar) | greaterscalar(tensor A, var scalar)->(tensor mask) | -| equalscalar | miaobyte | equalscalar(tensor A, var scalar, var epsilon)->(tensor mask) | mask=compare(T1, scalar) | equalscalar(tensor A, var scalar, var epsilon)->(tensor mask) | -| min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1, T2) | min(tensor A, tensor B)->(tensor C) | -| maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1, scalar) | maxscalar(tensor A, var scalar)->(tensor C) | -| tan | miaobyte | tan(tensor A)->(tensor C) | T3=tan(T1) | tan(tensor A)->(tensor C) | -| sin | miaobyte | sin(tensor A)->(tensor C) | T3=sin(T1) | sin(tensor A)->(tensor C) | -| divscalar | miaobyte | divscalar(tensor A, var scalar)->(tensor C) | 
T3=scalar/T1 | divscalar(tensor A, var scalar)->(tensor C) | -| log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | -| addscalar | miaobyte | addscalar(tensor A, var b)->(tensor C) | T3=T1+scalar | addscalar(tensor A, var b)->(tensor C) | -| greater | miaobyte | greater(tensor A, tensor B)->(tensor mask) | mask=compare(T1, T2) | greater(tensor A, tensor B)->(tensor mask) | -| lessscalar | miaobyte | lessscalar(tensor A, var scalar)->(tensor mask) | mask=compare(T1, scalar) | lessscalar(tensor A, var scalar)->(tensor mask) | -| cos | miaobyte | cos(tensor A)->(tensor C) | T3=cos(T1) | cos(tensor A)->(tensor C) | -| less | miaobyte | less(tensor A, tensor B)->(tensor mask) | mask=compare(T1, T2) | less(tensor A, tensor B)->(tensor mask) | -| powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=pow(T1, scalar) | powscalar(tensor A, var scalar)->(tensor C) | -| minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1, scalar) | minscalar(tensor A, var scalar)->(tensor C) | -| rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | -| rpowscalar | miaobyte | rpowscalar(var scalar, tensor A)->(tensor C) | T3=pow(scalar, T1) | rpowscalar(var scalar, tensor A)->(tensor C) | -| add | cublas | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | -| add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | -| sub | miaobyte | sub(tensor A, tensor B)->(tensor C) | T3=T1-T2 | sub(tensor A, tensor B)->(tensor C) | -| sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | -| subscalar | miaobyte | subscalar(tensor A, var b)->(tensor C) | T3=T1-scalar | subscalar(tensor A, var b)->(tensor C) | -| exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | -| mul | miaobyte | mul(tensor A, 
tensor B)->(tensor C) | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) | -| equal | miaobyte | equal(tensor A, tensor B, var epsilon)->(tensor mask) | mask=compare(T1, T2) | equal(tensor A, tensor B, var epsilon)->(tensor mask) | -| mulscalar | miaobyte | mulscalar(tensor A, var b)->(tensor C) | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | -| div | miaobyte | div(tensor A, tensor B)->(tensor C) | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | -| invert | miaobyte | invert(tensor A)->(tensor C) | T3=~T1 | invert(tensor A)->(tensor C) | -| max | miaobyte | max(tensor A, tensor B)->(tensor C) | T3=max(T1, T2) | max(tensor A, tensor B)->(tensor C) | -| pow | miaobyte | pow(tensor A, tensor B)->(tensor C) | T3=pow(T1, T2) | pow(tensor A, tensor B)->(tensor C) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| switch | miaobyte | C=switch(tensors,cases) | switch(listtensor tensors, tensor cases)->(tensor result) | +| greaterscalar | miaobyte | mask=compare(T1, scalar) | greaterscalar(tensor A, var scalar)->(tensor mask) | +| equalscalar | miaobyte | mask=compare(T1, scalar) | equalscalar(tensor A, var scalar, var epsilon)->(tensor mask) | +| min | miaobyte | T3=min(T1, T2) | min(tensor A, tensor B)->(tensor C) | +| maxscalar | miaobyte | T3=max(T1, scalar) | maxscalar(tensor A, var scalar)->(tensor C) | +| tan | miaobyte | T3=tan(T1) | tan(tensor A)->(tensor C) | +| sin | miaobyte | T3=sin(T1) | sin(tensor A)->(tensor C) | +| divscalar | miaobyte | T3=scalar/T1 | divscalar(tensor A, var scalar)->(tensor C) | +| log | miaobyte | T3=log(T1) | log(tensor A)->(tensor C) | +| addscalar | miaobyte | T3=T1+scalar | addscalar(tensor A, var b)->(tensor C) | +| greater | miaobyte | mask=compare(T1, T2) | greater(tensor A, tensor B)->(tensor mask) | +| lessscalar | miaobyte | mask=compare(T1, scalar) | lessscalar(tensor A, var scalar)->(tensor mask) | +| cos | miaobyte | T3=cos(T1) | cos(tensor 
A)->(tensor C) | +| less | miaobyte | mask=compare(T1, T2) | less(tensor A, tensor B)->(tensor mask) | +| powscalar | miaobyte | T3=pow(T1, scalar) | powscalar(tensor A, var scalar)->(tensor C) | +| minscalar | miaobyte | T3=min(T1, scalar) | minscalar(tensor A, var scalar)->(tensor C) | +| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | +| rpowscalar | miaobyte | T3=pow(scalar, T1) | rpowscalar(var scalar, tensor A)->(tensor C) | +| todtype | none | T3(dtypeA)->T1(dtypeB) | todtype(tensor a)->(tensor b) | +| add | cublas | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | +| add | miaobyte | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | +| sub | miaobyte | T3=T1-T2 | sub(tensor A, tensor B)->(tensor C) | +| sqrt | miaobyte | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | +| subscalar | miaobyte | T3=T1-scalar | subscalar(tensor A, var b)->(tensor C) | +| exp | miaobyte | T3=exp(T1) | exp(tensor A)->(tensor C) | +| mul | miaobyte | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) | +| equal | miaobyte | mask=compare(T1, T2) | equal(tensor A, tensor B, var epsilon)->(tensor mask) | +| mulscalar | miaobyte | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | +| div | miaobyte | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | +| invert | miaobyte | T3=~T1 | invert(tensor A)->(tensor C) | +| max | miaobyte | T3=max(T1, T2) | max(tensor A, tensor B)->(tensor C) | +| pow | miaobyte | T3=pow(T1, T2) | pow(tensor A, tensor B)->(tensor C) | ### reduce -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| prod | miaobyte | prod(tensor A, vector dims, var keepdims)->(tensor B) | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor A, vector dims, var keepdims)->(tensor B) | -| reducemax | miaobyte | reducemax(tensor A, vector dims, var keepdims)->(tensor B) | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor A, vector dims, var 
keepdims)->(tensor B) | -| sum | miaobyte | sum(tensor A, vector dims, var keepdims)->(tensor B) | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor A, vector dims, var keepdims)->(tensor B) | -| reducemin | miaobyte | reducemin(tensor A, vector dims, var keepdims)->(tensor B) | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor A, vector dims, var keepdims)->(tensor B) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| prod | miaobyte | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor A, vector dims, var keepdims)->(tensor B) | +| reducemax | miaobyte | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor A, vector dims, var keepdims)->(tensor B) | +| sum | miaobyte | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor A, vector dims, var keepdims)->(tensor B) | +| reducemin | miaobyte | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor A, vector dims, var keepdims)->(tensor B) | ### changeshape -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| indexselect | miaobyte | indexselect(tensor A, tensor indices, var axis)->(tensor B) | T2 = T1.indexselect(index=[1,2], axis=1) | indexselect(tensor A, tensor indices, var axis)->(tensor B) | -| broadcastTo | miaobyte | broadcastTo(tensor A, vector new_shape)->(tensor B) | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor A, vector new_shape)->(tensor B) | -| concat | miaobyte | concat(listtensor tensors, var dim)->(tensor result) | Tresult = concat([T1, T2...], axis=3) | concat(listtensor tensors, var dim)->(tensor result) | -| transpose | miaobyte | transpose(tensor A, vector dim_order)->(tensor C) | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor A, vector dim_order)->(tensor C) | -| reshape | miaobyte | reshape(tensor A, vector shape)->(tensor B) | T1.reshape(shape)->T2 | reshape(tensor A, vector 
shape)->(tensor B) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| indexselect | miaobyte | T2 = T1.indexselect(index=[1,2], axis=1) | indexselect(tensor A, tensor indices, var axis)->(tensor B) | +| broadcastTo | miaobyte | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor A, vector new_shape)->(tensor B) | +| concat | miaobyte | Tresult = concat([T1, T2...], axis=3) | concat(listtensor tensors, var dim)->(tensor result) | +| transpose | miaobyte | T2 = T1.transpose(dimorder=[1,0]) | transpose(tensor A, vector dim_order)->(tensor C) | +| reshape | miaobyte | T1.reshape(shape)->T2 | reshape(tensor A, vector shape)->(tensor B) | diff --git a/doc/excuter/op-mem-ompsimd/list.md b/doc/excuter/op-mem-ompsimd/list.md index 8bcfcdb0..7d676562 100644 --- a/doc/excuter/op-mem-ompsimd/list.md +++ b/doc/excuter/op-mem-ompsimd/list.md @@ -4,96 +4,97 @@ ### arg -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| vecset | none | vecset(vector value)->(vector name) | [3 4 5]->shape | vecset(vector value)->(vector name) | -| argset | none | argset(var value)->(var name) | argvalue->argname | argset(var value)->(var name) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| vecset | none | [3 4 5]->shape | vecset(vector value)->(vector name) | +| argset | none | argvalue->argname | argset(var value)->(var name) | ### tensorlife -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| renametensor | none | renametensor(var new_name)->(tensor t) | rename(newname)->T1 | renametensor(var new_name)->(tensor t) | -| newtensor | none | newtensor(vector shape)->(tensor t) | T1 =Tensor(shape=[...]) | newtensor(vector shape)->(tensor t) | -| newtensor | none | newtensor(var 
shape)->(tensor t) | T1 =Tensor(shape=[...]) | newtensor(var shape)->(tensor t) | -| deltensor | none | deltensor()->(tensor t) | del->T1 | deltensor()->(tensor t) | -| copytensor | none | copytensor(tensor src)->(tensor dst) | T1.data->T2.data | copytensor(tensor src)->(tensor dst) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| renametensor | none | rename(newname)->T1 | renametensor(var new_name)->(tensor t) | +| newtensor | none | T1 =Tensor(shape=[...]) | newtensor(vector shape)->(tensor t) | +| newtensor | none | T1 =Tensor(shape=[...]) | newtensor(var shape)->(tensor t) | +| deltensor | none | del->T1 | deltensor()->(tensor t) | +| copytensor | none | T1.data->T2.data | copytensor(tensor src)->(tensor dst) | ### io -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| loadtensordata | none | loadtensordata(var path)->(tensor t) | loadtensordata(path)->tensor.data | loadtensordata(var path)->(tensor t) | -| save | none | save(tensor t, var path)->() | save(T1,path) | save(tensor t, var path)->() | -| print | miaobyte | print(tensor t)->() | print(T1) | print(tensor t)->() | -| print | miaobyte | print(tensor t, var format)->() | print(T1) | print(tensor t, var format)->() | -| load | none | load(var path)->() | mem.load(path) | load(var path)->() | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| loadtensordata | none | loadtensordata(path)->tensor.data | loadtensordata(var path)->(tensor t) | +| save | none | save(T1,path) | save(tensor t, var path)->() | +| print | miaobyte | print(T1) | print(tensor t)->() | +| print | miaobyte | print(T1) | print(tensor t, var format)->() | +| load | none | mem.load(path) | load(var path)->() | ### matmul -| Operation | Author | Func Def | Math Formula | IR Instruction | 
-|-----------|--------|------------|--------------|----------------| -| matmul | cblas | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | -| matmul | miaobyte | matmul(tensor A, tensor B)->(tensor C) | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| matmul | cblas | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | +| matmul | miaobyte | T3=T1 @ T2 | matmul(tensor A, tensor B)->(tensor C) | ### init -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| normal | miaobyte | normal(var mean, var std, var seed)->(tensor t) | normal(mean,stddev,seed)->T1 | normal(var mean, var std, var seed)->(tensor t) | -| uniform | miaobyte | uniform(var low, var high, var seed)->(tensor t) | uniform(low,high,seed)->T1 | uniform(var low, var high, var seed)->(tensor t) | -| arange | miaobyte | arange(var start, var step)->(tensor t) | arange(start,step)->T1 | arange(var start, var step)->(tensor t) | -| constant | miaobyte | constant(var value)->(tensor t) | constant(value)->T1 | constant(var value)->(tensor t) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| normal | miaobyte | normal(mean,stddev,seed)->T1 | normal(var mean, var std, var seed)->(tensor t) | +| uniform | miaobyte | uniform(low,high,seed)->T1 | uniform(var low, var high, var seed)->(tensor t) | +| arange | miaobyte | arange(start,step)->T1 | arange(var start, var step)->(tensor t) | +| constant | miaobyte | constant(value)->T1 | constant(var value)->(tensor t) | ### elementwise -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| switch | miaobyte | switch(listtensor tensors, tensor cases)->(tensor C) | 
C=switch([tensors],case) | switch(listtensor tensors, tensor cases)->(tensor C) | -| greaterscalar | miaobyte | greaterscalar(tensor A, var scalar)->(tensor mask) | mask=greater(T1,scalar) | greaterscalar(tensor A, var scalar)->(tensor mask) | -| equalscalar | miaobyte | equalscalar(tensor A, var scalar)->(tensor mask) | mask=equal(T1,scalar) | equalscalar(tensor A, var scalar)->(tensor mask) | -| min | miaobyte | min(tensor A, tensor B)->(tensor C) | T3=min(T1,T2) | min(tensor A, tensor B)->(tensor C) | -| maxscalar | miaobyte | maxscalar(tensor A, var scalar)->(tensor C) | T3=max(T1,scalar) | maxscalar(tensor A, var scalar)->(tensor C) | -| divscalar | miaobyte | divscalar(tensor A, var scalar)->(tensor C) | T3=T1/scalar | divscalar(tensor A, var scalar)->(tensor C) | -| log | miaobyte | log(tensor A)->(tensor C) | T3=log(T1) | log(tensor A)->(tensor C) | -| addscalar | miaobyte | addscalar(tensor a, var scalar)->(tensor c) | T3=T1+scalar | addscalar(tensor a, var scalar)->(tensor c) | -| greater | miaobyte | greater(tensor A, tensor B)->(tensor mask) | mask=greater(T1,T2) | greater(tensor A, tensor B)->(tensor mask) | -| lessscalar | miaobyte | lessscalar(tensor A, var scalar)->(tensor mask) | mask=less(T1,scalar) | lessscalar(tensor A, var scalar)->(tensor mask) | -| less | miaobyte | less(tensor A, tensor B)->(tensor mask) | mask=less(T1,T2) | less(tensor A, tensor B)->(tensor mask) | -| powscalar | miaobyte | powscalar(tensor A, var scalar)->(tensor C) | T3=T1^scalar | powscalar(tensor A, var scalar)->(tensor C) | -| minscalar | miaobyte | minscalar(tensor A, var scalar)->(tensor C) | T3=min(T1,scalar) | minscalar(tensor A, var scalar)->(tensor C) | -| rdivscalar | miaobyte | rdivscalar(var scalar, tensor A)->(tensor C) | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | -| rpowscalar | miaobyte | rpowscalar(var scalar, tensor A)->(tensor C) | T3=scalar^T1 | rpowscalar(var scalar, tensor A)->(tensor C) | -| add | cblas | add(tensor a, tensor 
b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | -| add | miaobyte | add(tensor a, tensor b)->(tensor c) | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | -| sub | miaobyte | sub(tensor a, tensor b)->(tensor c) | T3=T1-T2 | sub(tensor a, tensor b)->(tensor c) | -| sqrt | miaobyte | sqrt(tensor A)->(tensor C) | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | -| subscalar | miaobyte | subscalar(tensor a, var scalar)->(tensor c) | T3=T1-scalar | subscalar(tensor a, var scalar)->(tensor c) | -| exp | miaobyte | exp(tensor A)->(tensor C) | T3=exp(T1) | exp(tensor A)->(tensor C) | -| mul | miaobyte | mul(tensor A, tensor B)->(tensor C) | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) | -| equal | miaobyte | equal(tensor A, tensor B)->(tensor mask) | mask=equal(T1,T2) | equal(tensor A, tensor B)->(tensor mask) | -| mulscalar | miaobyte | mulscalar(tensor A, var b)->(tensor C) | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | -| div | miaobyte | div(tensor A, tensor B)->(tensor C) | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | -| invert | miaobyte | invert(tensor A)->(tensor C) | T3=~T1 | invert(tensor A)->(tensor C) | -| max | miaobyte | max(tensor A, tensor B)->(tensor C) | T3=max(T1,T2) | max(tensor A, tensor B)->(tensor C) | -| pow | miaobyte | pow(tensor A, tensor B)->(tensor C) | T3=T1^T2 | pow(tensor A, tensor B)->(tensor C) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| switch | miaobyte | C=switch([tensors],case) | switch(listtensor tensors, tensor cases)->(tensor C) | +| greaterscalar | miaobyte | mask=greater(T1,scalar) | greaterscalar(tensor A, var scalar)->(tensor mask) | +| equalscalar | miaobyte | mask=equal(T1,scalar) | equalscalar(tensor A, var scalar)->(tensor mask) | +| min | miaobyte | T3=min(T1,T2) | min(tensor A, tensor B)->(tensor C) | +| maxscalar | miaobyte | T3=max(T1,scalar) | maxscalar(tensor A, var scalar)->(tensor C) | +| divscalar | miaobyte | 
T3=T1/scalar | divscalar(tensor A, var scalar)->(tensor C) | +| log | miaobyte | T3=log(T1) | log(tensor A)->(tensor C) | +| addscalar | miaobyte | T3=T1+scalar | addscalar(tensor a, var scalar)->(tensor c) | +| greater | miaobyte | mask=greater(T1,T2) | greater(tensor A, tensor B)->(tensor mask) | +| lessscalar | miaobyte | mask=less(T1,scalar) | lessscalar(tensor A, var scalar)->(tensor mask) | +| less | miaobyte | mask=less(T1,T2) | less(tensor A, tensor B)->(tensor mask) | +| powscalar | miaobyte | T3=T1^scalar | powscalar(tensor A, var scalar)->(tensor C) | +| minscalar | miaobyte | T3=min(T1,scalar) | minscalar(tensor A, var scalar)->(tensor C) | +| rdivscalar | miaobyte | T3=scalar/T1 | rdivscalar(var scalar, tensor A)->(tensor C) | +| rpowscalar | miaobyte | T3=scalar^T1 | rpowscalar(var scalar, tensor A)->(tensor C) | +| todtype | none | T3(dtypeA)->T1(dtypeB) | todtype(tensor A)->(tensor C) | +| add | cblas | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | +| add | miaobyte | T3=T1+T2 | add(tensor a, tensor b)->(tensor c) | +| sub | miaobyte | T3=T1-T2 | sub(tensor a, tensor b)->(tensor c) | +| sqrt | miaobyte | T3=sqrt(T1) | sqrt(tensor A)->(tensor C) | +| subscalar | miaobyte | T3=T1-scalar | subscalar(tensor a, var scalar)->(tensor c) | +| exp | miaobyte | T3=exp(T1) | exp(tensor A)->(tensor C) | +| mul | miaobyte | T3=T1*T2 | mul(tensor A, tensor B)->(tensor C) | +| equal | miaobyte | mask=equal(T1,T2) | equal(tensor A, tensor B)->(tensor mask) | +| mulscalar | miaobyte | T3=T1*scalar | mulscalar(tensor A, var b)->(tensor C) | +| div | miaobyte | T3=T1/T2 | div(tensor A, tensor B)->(tensor C) | +| invert | miaobyte | T3=~T1 | invert(tensor A)->(tensor C) | +| max | miaobyte | T3=max(T1,T2) | max(tensor A, tensor B)->(tensor C) | +| pow | miaobyte | T3=T1^T2 | pow(tensor A, tensor B)->(tensor C) | ### reduce -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| prod | 
miaobyte | prod(tensor A, vector axis, var keepdims)->(tensor B) | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor A, vector axis, var keepdims)->(tensor B) | -| reducemax | miaobyte | reducemax(tensor A, vector axis, var keepdims)->(tensor B) | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor A, vector axis, var keepdims)->(tensor B) | -| sum | miaobyte | sum(tensor A, vector axis, var keepdims)->(tensor B) | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor A, vector axis, var keepdims)->(tensor B) | -| reducemin | miaobyte | reducemin(tensor A, vector axis, var keepdims)->(tensor B) | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor A, vector axis, var keepdims)->(tensor B) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| prod | miaobyte | B = prod(A, axis=[1 2], keepdims=false) | prod(tensor A, vector axis, var keepdims)->(tensor B) | +| reducemax | miaobyte | B = reducemax(A, axis=[1 2], keepdims=false) | reducemax(tensor A, vector axis, var keepdims)->(tensor B) | +| sum | miaobyte | B = sum(A, axis=[1 2], keepdims=false) | sum(tensor A, vector axis, var keepdims)->(tensor B) | +| reducemin | miaobyte | B = reducemin(A, axis=[1 2], keepdims=false) | reducemin(tensor A, vector axis, var keepdims)->(tensor B) | ### changeshape -| Operation | Author | Func Def | Math Formula | IR Instruction | -|-----------|--------|------------|--------------|----------------| -| indexselect | miaobyte | indexselect(tensor A, tensor index, var axis)->(tensor B) | T2 = T1.indexselect(index=T3, axis=3) | indexselect(tensor A, tensor index, var axis)->(tensor B) | -| broadcastTo | miaobyte | broadcastTo(tensor A, vector new_shape)->(tensor B) | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor A, vector new_shape)->(tensor B) | -| concat | miaobyte | concat(listtensor tensors, var dim)->(tensor result) | Tresult = concat([T1, T2...], axis=3) | concat(listtensor 
tensors, var dim)->(tensor result) | -| transpose | miaobyte | transpose(tensor A, vector dim_order)->(tensor C) | T1.transpose(dimorder=[1,0])->T2 | transpose(tensor A, vector dim_order)->(tensor C) | -| reshape | miaobyte | reshape(tensor A, vector shape)->(tensor B) | T1.reshape(shape)->T2 | reshape(tensor A, vector shape)->(tensor B) | +| Operation | Author | Math Formula | IR Instruction | +|-----------|--------|--------------|----------------| +| indexselect | miaobyte | T2 = T1.indexselect(index=T3, axis=3) | indexselect(tensor A, tensor index, var axis)->(tensor B) | +| broadcastTo | miaobyte | T2 = T1.broadcastTo(new_shape=[4,3,2]) | broadcastTo(tensor A, vector new_shape)->(tensor B) | +| concat | miaobyte | Tresult = concat([T1, T2...], axis=3) | concat(listtensor tensors, var dim)->(tensor result) | +| transpose | miaobyte | T1.transpose(dimorder=[1,0])->T2 | transpose(tensor A, vector dim_order)->(tensor C) | +| reshape | miaobyte | T1.reshape(shape)->T2 | reshape(tensor A, vector shape)->(tensor B) | diff --git a/excuter/cpp-common/src/deepx/shape.hpp b/excuter/cpp-common/src/deepx/shape.hpp index b314e891..482142cd 100644 --- a/excuter/cpp-common/src/deepx/shape.hpp +++ b/excuter/cpp-common/src/deepx/shape.hpp @@ -66,7 +66,7 @@ namespace deepx // rangeParallel 支持omp,但omp内无需线程local变量 void rangeParallel(int dimCount, std::function &indices)> func) const; - void rangeParallel(int dimCount, std::function func) const; + void rangeElementwiseParallel( std::function func) const; void rangeParallel(int dimCount, std::function &indices)> func) const; // 支持omp,但omp内需要线程local变量 diff --git a/excuter/cpp-common/src/deepx/shape_range.cpp b/excuter/cpp-common/src/deepx/shape_range.cpp index 1f7fad54..45d12320 100644 --- a/excuter/cpp-common/src/deepx/shape_range.cpp +++ b/excuter/cpp-common/src/deepx/shape_range.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include "deepx/shape.hpp" @@ -113,18 +114,24 @@ namespace deepx } } } - void 
Shape::rangeParallel(int dimCount, std::function func) const - { - dimCount = checkdim(dimCount, dim()); - int stride = checkStride(dimCount, shape); - - // 计算总循环次数 - int total = size / stride; - -#pragma omp parallel for - for (int idx = 0; idx < total; idx++) + void Shape::rangeElementwiseParallel(std::function func) const + { + int num_threads = std::thread::hardware_concurrency(); + int alignblock=size/num_threads; + const int minblock=256; + if (alignblock size) { + end = size; + } + func(idx,end); } } diff --git a/excuter/cpp-common/src/deepx/tensor.hpp b/excuter/cpp-common/src/deepx/tensor.hpp index d49f12f8..9e46b222 100644 --- a/excuter/cpp-common/src/deepx/tensor.hpp +++ b/excuter/cpp-common/src/deepx/tensor.hpp @@ -26,6 +26,7 @@ namespace deepx DeleteFn deleter; // 释放内存 using CopyFn = void (*)(T *, T *, int); + //copyer(src, dest, size) CopyFn copyer; // 拷贝内存 using SaveFn = void (*)(T *,size_t,const std::string &); diff --git a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp index ca44fd13..6e3b2072 100644 --- a/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp +++ b/excuter/cpp-common/src/deepx/tensorfunc/elementwise.hpp @@ -6,6 +6,10 @@ namespace deepx::tensorfunc { + //todtype + template + void todtype(const Tensor &input, Tensor &output); + template struct addDispatcher { diff --git a/excuter/cpp-common/src/deepx/tf/tffactory.cpp b/excuter/cpp-common/src/deepx/tf/tffactory.cpp index 48dd52bb..c57adf51 100644 --- a/excuter/cpp-common/src/deepx/tf/tffactory.cpp +++ b/excuter/cpp-common/src/deepx/tf/tffactory.cpp @@ -103,15 +103,14 @@ namespace deepx::tf // 为每个tftype生成一个表格 for (const auto &[tftype, tfs] : tf_by_type) { ss << "### " << tftype << "\n\n"; - ss << "| Operation | Author | Func Def | Math Formula | IR Instruction |\n"; - ss << "|-----------|--------|------------|--------------|----------------|\n"; + ss << "| Operation | Author | Math Formula | IR Instruction |\n"; + ss 
<< "|-----------|--------|--------------|----------------|\n"; for (const auto &tf : tfs) { ss << "| " << tf->name << " | "; ss << (tf->metadata.author.empty() ? " none " : tf->metadata.author) << " | "; - ss << tf->to_string(false, true) << " | "; ss << tf->math_formula() << " | "; - ss << tf->to_string(false, true) << " |\n"; + ss << stdutil::escape_markdown(tf->to_string(false, true)) << " |\n"; } ss << "\n"; diff --git a/excuter/cpp-common/src/stdutil/string.cpp b/excuter/cpp-common/src/stdutil/string.cpp index d254d349..3af914b0 100644 --- a/excuter/cpp-common/src/stdutil/string.cpp +++ b/excuter/cpp-common/src/stdutil/string.cpp @@ -8,9 +8,49 @@ namespace stdutil str.erase(str.find_last_not_of(" ") + 1); } - void trim(string &str,const string &chars) + void trim(string &str, const string &chars) { str.erase(0, str.find_first_not_of(chars)); str.erase(str.find_last_not_of(chars) + 1); } + + string escape_markdown(const string &str) + { + std::string result; + for (char c : str) + { + switch (c) + { + case '\\': + result += "\\\\"; + break; + case '\"': + result += "\\\""; + break; + case '\'': + result += "\\\'"; + break; + case '\n': + result += "\\n"; + break; + case '\t': + result += "\\t"; + break; + case '\r': + result += "\\r"; + break; + case '\b': + result += "\\b"; + break; + case '\f': + result += "\\f"; + break; + default: + // 普通字符直接添加 + result += c; + } + } + return result; + } + } // namespace stdutil \ No newline at end of file diff --git a/excuter/cpp-common/src/stdutil/string.hpp b/excuter/cpp-common/src/stdutil/string.hpp index 1b353ded..76c9c0c7 100644 --- a/excuter/cpp-common/src/stdutil/string.hpp +++ b/excuter/cpp-common/src/stdutil/string.hpp @@ -10,6 +10,7 @@ namespace stdutil void trimspace(string &str); void trim(string &str,const string &chars=" \t\n\r\f\v"); + string escape_markdown(const string &str); } diff --git a/excuter/op-mem-cuda/src/client/tfs.cpp b/excuter/op-mem-cuda/src/client/tfs.cpp index c98748e2..4aeaa5ad 100644 --- 
a/excuter/op-mem-cuda/src/client/tfs.cpp +++ b/excuter/op-mem-cuda/src/client/tfs.cpp @@ -164,6 +164,18 @@ namespace deepx::tf // elementwise void register_elementwise(TfFactory &tffactory) { + //todtype + tffactory.add_tf(std::make_shared(vector( + { + Param("a", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("b", DataCategory::Tensor, Precision::Any), + }))); + + + // add tffactory.add_tf(std::make_shared>(vector( { Param("a", DataCategory::Tensor, Precision::Any), diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu index bc97ba5f..5ef7ea81 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/changeshape_miaobyte.cu @@ -369,33 +369,38 @@ namespace deepx::tensorfunc // indexselect template __host__ __device__ void fromIndexselectIndices( - const int *output_indices,const int outputDim, // 输出张量的索引 - const GatherAxisT *indices,const int *indicesStrides,const int indicesDim, //indices是tensor - int *index_indices, - const int gatherAxis, // gather操作的轴 - int *input_indices,const int inputDim){ + const int *output_indices, const int outputDim, // 输出张量的索引 + const GatherAxisT *index, const int *indexStrides, const int indexDim, // index是tensor + int *index_indices, + const int gatherAxis, // gather操作的轴 + int *input_indices, const int inputDim) + { for (int i = 0; i < gatherAxis; ++i) { input_indices[i] = output_indices[i]; } - for (int i = gatherAxis; i < gatherAxis + indicesDim; ++i) + for (int i = gatherAxis; i < gatherAxis + indexDim; ++i) { index_indices[i - gatherAxis] = output_indices[i]; } // 使用indices张量中对应位置的值来替换gatherAxis维度的索引 - int indices_idx = linearAt(indicesStrides, indicesDim, index_indices); - input_indices[gatherAxis] = indices[indices_idx]; - for (int i = gatherAxis +indicesDim; i < outputDim; ++i) + int index_idx = linearAt(indexStrides, indexDim, index_indices); + 
input_indices[gatherAxis] = index[index_idx]; + // for (int i = gatherAxis +indicesDim; i < outputDim; ++i) + // { + // input_indices[gatherAxis+1+i] = output_indices[i]; + // } + for (int i = 0; i < outputDim - (gatherAxis + indexDim); ++i) { - input_indices[gatherAxis+1+i] = output_indices[i]; + input_indices[gatherAxis + 1 + i] = output_indices[gatherAxis + indexDim + i]; } } template __global__ void indexselect_kernel( const T *input, const int *inputStrides, const int inputDim, - const GatherAxisT *indices, const int *indicesStrides, const int indicesDim, + const GatherAxisT *index, const int *indexStrides, const int indexDim, const int gatherAxis, T *output, const int *outputStrides, const int outputDim, const int outputlen) { @@ -410,11 +415,11 @@ namespace deepx::tensorfunc // 输入索引 int index_indices[DIM]; int input_indices[DIM]; - fromIndexselectIndices(output_indices,outputDim, - indices, indicesStrides, indicesDim, - index_indices, - gatherAxis, - input_indices, inputDim); + fromIndexselectIndices(output_indices, outputDim, + index, indexStrides, indexDim, + index_indices, + gatherAxis, + input_indices, inputDim); int inputIdx = linearAt(inputStrides, inputDim, input_indices); int outputIdx = linearAt(outputStrides, outputDim, output_indices); output[outputIdx] = input[inputIdx]; @@ -424,7 +429,7 @@ namespace deepx::tensorfunc template void launch_indexselect( const T *input, const int *inputStrides, const int inputDim, - const GatherAxisT *indices, const int *indicesStrides, const int indicesDim, + const GatherAxisT *index, const int *indexStrides, const int indexDim, const int gatherAxis, T *output, const int *outputStrides, const int outputDim, const int outputlen) { @@ -432,7 +437,7 @@ namespace deepx::tensorfunc auto [numBlocks, blockSize] = BestDims(outputlen); // indices - cudaVector indicesStrides_d(indicesStrides, indicesDim, cudaMemcpyHostToDevice); + cudaVector indexStrides_d(indexStrides, indexDim, cudaMemcpyHostToDevice); // input cudaVector 
inputStrides_d(inputStrides, inputDim, cudaMemcpyHostToDevice); @@ -440,46 +445,46 @@ namespace deepx::tensorfunc // output cudaVector outputStrides_d(outputStrides, outputDim, cudaMemcpyHostToDevice); - //TODO 这里可能会导致寄存器浪费,但是,搞太多模板T,模板实例化不好搞 - int dim=std::max(inputDim,indicesDim); - dim=std::max(dim,outputDim); + // TODO 这里可能会导致寄存器浪费,但是,搞太多模板T,模板实例化不好搞 + int dim = std::max(inputDim, indexDim); + dim = std::max(dim, outputDim); switch (dim) { case 1: - indexselect_kernel<1, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); + indexselect_kernel<1, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, index, indexStrides_d.data, indexDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); break; case 2: - indexselect_kernel<2, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); + indexselect_kernel<2, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, index, indexStrides_d.data, indexDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); break; case 3: - indexselect_kernel<3, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); + indexselect_kernel<3, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, index, indexStrides_d.data, indexDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); break; case 4: - indexselect_kernel<4, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); + indexselect_kernel<4, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, index, indexStrides_d.data, indexDim, gatherAxis, output, outputStrides_d.data, outputDim, 
outputlen); break; case 5: - indexselect_kernel<5, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); + indexselect_kernel<5, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, index, indexStrides_d.data, indexDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); break; case 6: - indexselect_kernel<6, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); + indexselect_kernel<6, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, index, indexStrides_d.data, indexDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); break; case 7: - indexselect_kernel<7, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); + indexselect_kernel<7, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, index, indexStrides_d.data, indexDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); break; case 8: - indexselect_kernel<8, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); + indexselect_kernel<8, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, index, indexStrides_d.data, indexDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); break; case 9: - indexselect_kernel<9, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); + indexselect_kernel<9, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, index, indexStrides_d.data, indexDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); break; case 10: - 
indexselect_kernel<10, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); + indexselect_kernel<10, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, index, indexStrides_d.data, indexDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); break; case 11: - indexselect_kernel<11, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); + indexselect_kernel<11, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, index, indexStrides_d.data, indexDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); break; case 12: - indexselect_kernel<12, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, indices, indicesStrides_d.data, indicesDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); + indexselect_kernel<12, T, GatherAxisT><<>>(input, inputStrides_d.data, inputDim, index, indexStrides_d.data, indexDim, gatherAxis, output, outputStrides_d.data, outputDim, outputlen); break; default: throw std::runtime_error("dimension large than " + std::to_string(MAX_DIM)); @@ -491,71 +496,70 @@ namespace deepx::tensorfunc } } template void launch_indexselect(const double *input, const int *inputStrides, const int inputDim, - const int64_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - double *output, const int *outputStrides, const int outputDim, const int outputlen); + const int64_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + double *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect(const float *input, const int *inputStrides, const int inputDim, - const int64_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - float *output, const int 
*outputStrides, const int outputDim, const int outputlen); + const int64_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + float *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect(const nv_bfloat16 *input, const int *inputStrides, const int inputDim, - const int64_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - nv_bfloat16 *output, const int *outputStrides, const int outputDim, const int outputlen); + const int64_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + nv_bfloat16 *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect<__half, int64_t>(const __half *input, const int *inputStrides, const int inputDim, - const int64_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - __half *output, const int *outputStrides, const int outputDim, const int outputlen); + const int64_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + __half *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect(const int64_t *input, const int *inputStrides, const int inputDim, - const int64_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - int64_t *output, const int *outputStrides, const int outputDim, const int outputlen); + const int64_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + int64_t *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect(const int32_t *input, const int *inputStrides, const int inputDim, - const int64_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - int32_t *output, const int *outputStrides, const int outputDim, const int outputlen); + const int64_t *index, const int 
*indexStrides, const int indexDim, + const int gatherAxis, + int32_t *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect(const int16_t *input, const int *inputStrides, const int inputDim, - const int64_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - int16_t *output, const int *outputStrides, const int outputDim, const int outputlen); + const int64_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + int16_t *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect(const int8_t *input, const int *inputStrides, const int inputDim, - const int64_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - int8_t *output, const int *outputStrides, const int outputDim, const int outputlen); + const int64_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + int8_t *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect(const double *input, const int *inputStrides, const int inputDim, - const int32_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - double *output, const int *outputStrides, const int outputDim, const int outputlen); + const int32_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + double *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect(const float *input, const int *inputStrides, const int inputDim, - const int32_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - float *output, const int *outputStrides, const int outputDim, const int outputlen); + const int32_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + float *output, const int *outputStrides, const int outputDim, const int 
outputlen); template void launch_indexselect(const nv_bfloat16 *input, const int *inputStrides, const int inputDim, - const int32_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - nv_bfloat16 *output, const int *outputStrides, const int outputDim, const int outputlen); + const int32_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + nv_bfloat16 *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect<__half, int32_t>(const __half *input, const int *inputStrides, const int inputDim, - const int32_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - __half *output, const int *outputStrides, const int outputDim, const int outputlen); + const int32_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + __half *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect(const int64_t *input, const int *inputStrides, const int inputDim, - const int32_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - int64_t *output, const int *outputStrides, const int outputDim, const int outputlen); + const int32_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + int64_t *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect(const int32_t *input, const int *inputStrides, const int inputDim, - const int32_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - int32_t *output, const int *outputStrides, const int outputDim, const int outputlen); + const int32_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + int32_t *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect(const int16_t *input, const int *inputStrides, const int 
inputDim, - const int32_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - int16_t *output, const int *outputStrides, const int outputDim, const int outputlen); + const int32_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + int16_t *output, const int *outputStrides, const int outputDim, const int outputlen); template void launch_indexselect(const int8_t *input, const int *inputStrides, const int inputDim, - const int32_t *indices, const int *indicesStrides, const int indicesDim, - const int gatherAxis, - int8_t *output, const int *outputStrides, const int outputDim, const int outputlen); + const int32_t *index, const int *indexStrides, const int indexDim, + const int gatherAxis, + int8_t *output, const int *outputStrides, const int outputDim, const int outputlen); } - #endif // DEEPX_TENSORFUNC_CHANGESHAPE_MIAOBYTE_HPP \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp index 6fce29d0..d9c9f3c1 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda.hpp @@ -80,7 +80,6 @@ namespace deepx::tensorfunc return {size, host_data}; } - } #endif diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.cuh index 14764266..ff7117f3 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/cuda_math.cuh @@ -9,208 +9,236 @@ namespace deepx::tensorfunc { - //sqrt + // sqrt template - __device__ __forceinline__ void deepx_sqrt(const T *a, T *out); + __device__ __forceinline__ void deepx_sqrt(const T *a, T *out); template <> - __device__ __forceinline__ void deepx_sqrt(const double *a, double *out) + __device__ __forceinline__ void deepx_sqrt(const double *a, double *out) { *out = sqrt(*a); } template <> - __device__ __forceinline__ void 
deepx_sqrt(const float *a, float *out) + __device__ __forceinline__ void deepx_sqrt(const float *a, float *out) { *out = sqrtf(*a); } template <> - __device__ __forceinline__ void deepx_sqrt(const half *a, half *out) + __device__ __forceinline__ void deepx_sqrt(const half *a, half *out) { *out = hsqrt(*a); } template <> - __device__ __forceinline__ void deepx_sqrt(const nv_bfloat16 *a, nv_bfloat16 *out) + __device__ __forceinline__ void deepx_sqrt(const nv_bfloat16 *a, nv_bfloat16 *out) { *out = hsqrt(*a); } - - //pow + + // pow template - __device__ __forceinline__ void deepx_pow(const T *a, const T *b, T *out); + __device__ __forceinline__ void deepx_pow(const T *a, const T *b, T *out); template <> - __device__ __forceinline__ void deepx_pow(const double *a, const double *b, double *out) + __device__ __forceinline__ void deepx_pow(const double *a, const double *b, double *out) { *out = pow(*a, *b); } template <> - __device__ __forceinline__ void deepx_pow(const float *a, const float *b, float *out) + __device__ __forceinline__ void deepx_pow(const float *a, const float *b, float *out) { *out = powf(*a, *b); } - //log + // log template - __device__ __forceinline__ void deepx_log(const T *a, T *out); + __device__ __forceinline__ void deepx_log(const T *a, T *out); template <> - __device__ __forceinline__ void deepx_log(const double *a, double *out) + __device__ __forceinline__ void deepx_log(const double *a, double *out) { *out = log(*a); } template <> - __device__ __forceinline__ void deepx_log(const float *a, float *out) + __device__ __forceinline__ void deepx_log(const float *a, float *out) { *out = logf(*a); } template <> - __device__ __forceinline__ void deepx_log(const half *a, half *out) + __device__ __forceinline__ void deepx_log(const half *a, half *out) { *out = hlog(*a); - } + } template <> - __device__ __forceinline__ void deepx_log(const nv_bfloat16 *a, nv_bfloat16 *out) + __device__ __forceinline__ void deepx_log(const nv_bfloat16 *a, nv_bfloat16 
*out) { *out = hlog(*a); } - //exp + // exp template - __device__ __forceinline__ void deepx_exp(const T *a, T *out); + __device__ __forceinline__ void deepx_exp(const T *a, T *out); template <> - __device__ __forceinline__ void deepx_exp(const double *a, double *out) + __device__ __forceinline__ void deepx_exp(const double *a, double *out) { *out = exp(*a); } template <> - __device__ __forceinline__ void deepx_exp(const float *a, float *out) + __device__ __forceinline__ void deepx_exp(const float *a, float *out) { *out = expf(*a); } template <> - __device__ __forceinline__ void deepx_exp(const half *a, half *out) + __device__ __forceinline__ void deepx_exp(const half *a, half *out) { *out = hexp(*a); } template <> - __device__ __forceinline__ void deepx_exp(const nv_bfloat16 *a, nv_bfloat16 *out) + __device__ __forceinline__ void deepx_exp(const nv_bfloat16 *a, nv_bfloat16 *out) { *out = hexp(*a); } - + // max template - __device__ __forceinline__ void deepx_max(const T *a, const T *b, T *out); + __device__ __forceinline__ void deepx_max(const T *a, const T *b, T *out); template <> - __device__ __forceinline__ void deepx_max(const double *a, const double *b, double *out) + __device__ __forceinline__ void deepx_max(const double *a, const double *b, double *out) { *out = fmax(*a, *b); } template <> - __device__ __forceinline__ void deepx_max(const float *a, const float *b, float *out) + __device__ __forceinline__ void deepx_max(const float *a, const float *b, float *out) { *out = fmaxf(*a, *b); } template <> - __device__ __forceinline__ void deepx_max(const half *a, const half *b, half *out) + __device__ __forceinline__ void deepx_max(const half *a, const half *b, half *out) { *out = __hmax(*a, *b); } template <> - __device__ __forceinline__ void deepx_max(const nv_bfloat16 *a, const nv_bfloat16 *b, nv_bfloat16 *out) + __device__ __forceinline__ void deepx_max(const nv_bfloat16 *a, const nv_bfloat16 *b, nv_bfloat16 *out) { *out = __hmax(*a, *b); } template <> - 
__device__ __forceinline__ void deepx_max(const int64_t *a, const int64_t *b, int64_t *out) + __device__ __forceinline__ void deepx_max(const int64_t *a, const int64_t *b, int64_t *out) { *out = *a > *b ? *a : *b; } template <> - __device__ __forceinline__ void deepx_max(const int32_t *a, const int32_t *b, int32_t *out) + __device__ __forceinline__ void deepx_max(const int32_t *a, const int32_t *b, int32_t *out) { *out = *a > *b ? *a : *b; } template <> - __device__ __forceinline__ void deepx_max(const int16_t *a, const int16_t *b, int16_t *out) + __device__ __forceinline__ void deepx_max(const int16_t *a, const int16_t *b, int16_t *out) { *out = *a > *b ? *a : *b; } template <> - __device__ __forceinline__ void deepx_max(const int8_t *a, const int8_t *b, int8_t *out) + __device__ __forceinline__ void deepx_max(const int8_t *a, const int8_t *b, int8_t *out) { *out = *a > *b ? *a : *b; } // min template - __device__ __forceinline__ void deepx_min(const T *a, const T *b, T *out); + __device__ __forceinline__ void deepx_min(const T *a, const T *b, T *out); template <> - __device__ __forceinline__ void deepx_min(const double *a, const double *b, double *out) + __device__ __forceinline__ void deepx_min(const double *a, const double *b, double *out) { *out = fmin(*a, *b); } template <> - __device__ __forceinline__ void deepx_min(const float *a, const float *b, float *out) + __device__ __forceinline__ void deepx_min(const float *a, const float *b, float *out) { *out = fminf(*a, *b); } template <> - __device__ __forceinline__ void deepx_min(const half *a, const half *b, half *out) + __device__ __forceinline__ void deepx_min(const half *a, const half *b, half *out) { *out = __hmin(*a, *b); } template <> - __device__ __forceinline__ void deepx_min(const nv_bfloat16 *a, const nv_bfloat16 *b, nv_bfloat16 *out) + __device__ __forceinline__ void deepx_min(const nv_bfloat16 *a, const nv_bfloat16 *b, nv_bfloat16 *out) { *out = __hmin(*a, *b); } template <> - __device__ 
__forceinline__ void deepx_min(const int64_t *a, const int64_t *b, int64_t *out) + __device__ __forceinline__ void deepx_min(const int64_t *a, const int64_t *b, int64_t *out) { *out = *a < *b ? *a : *b; } template <> - __device__ __forceinline__ void deepx_min(const int32_t *a, const int32_t *b, int32_t *out) + __device__ __forceinline__ void deepx_min(const int32_t *a, const int32_t *b, int32_t *out) { *out = *a < *b ? *a : *b; } template <> - __device__ __forceinline__ void deepx_min(const int16_t *a, const int16_t *b, int16_t *out) + __device__ __forceinline__ void deepx_min(const int16_t *a, const int16_t *b, int16_t *out) { *out = *a < *b ? *a : *b; } template <> - __device__ __forceinline__ void deepx_min(const int8_t *a, const int8_t *b, int8_t *out) + __device__ __forceinline__ void deepx_min(const int8_t *a, const int8_t *b, int8_t *out) { *out = *a < *b ? *a : *b; } - - + //todtype + template + __device__ __forceinline__ Dtype deepx_todtype(const T &a) + { + return static_cast(a); + } + //float<->half + template <> + __device__ __forceinline__ half deepx_todtype(const float &src) + { + return __float2half(src); + } + + template <> + __device__ __forceinline__ float deepx_todtype(const half &src) + { + return __half2float(src); + } + //float<->bfloat16 + template <> + __device__ __forceinline__ nv_bfloat16 deepx_todtype(const float &src) + { + return __float2bfloat16(src); + } + template <> + __device__ __forceinline__ float deepx_todtype(const nv_bfloat16 &src) + { + return __bfloat162float(src); + } + } #endif // DEEPX_TENSORFUNC_CUDA_MATH_CUH \ No newline at end of file diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu index abe6f223..772865f3 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cu @@ -5,9 +5,96 @@ #include #include 
"deepx/tensorfunc/cuda.hpp" #include "deepx/tensorfunc/authors.hpp" +#include "deepx/tensorfunc/cuda_math.cuh" namespace deepx::tensorfunc { + + //todtype + + template + __global__ void todtype_kernel(const T* A, Dtype* C,const int size){ + int stride = blockDim.x * gridDim.x; + for(int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride){ + C[idx] = deepx_todtype(A[idx]); + } + } + + template + void launch_todtype(const T* a, Dtype* c,const int size){ + auto [numBlocks, blockSize] = BestDims(size); + todtype_kernel<<>>(a, c, size); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + throw std::runtime_error("Failed to launch todtype kernel: " + + std::string(cudaGetErrorString(err))); + } + } + template void launch_todtype(const double *a, float *c, const int size); + template void launch_todtype(const double *a, half *c, const int size); + template void launch_todtype(const double *a, nv_bfloat16 *c, const int size); + template void launch_todtype(const double *a, int64_t *c, const int size); + template void launch_todtype(const double *a, int32_t *c, const int size); + template void launch_todtype(const double *a, int16_t *c, const int size); + template void launch_todtype(const double *a, int8_t *c, const int size); + + template void launch_todtype(const float *a, double *c, const int size); + template void launch_todtype(const float *a, half *c, const int size); + template void launch_todtype(const float *a, nv_bfloat16 *c, const int size); + template void launch_todtype(const float *a, int64_t *c, const int size); + template void launch_todtype(const float *a, int32_t *c, const int size); + template void launch_todtype(const float *a, int16_t *c, const int size); + template void launch_todtype(const float *a, int8_t *c, const int size); + + template void launch_todtype(const nv_bfloat16 *a, double *c, const int size); + template void launch_todtype(const nv_bfloat16 *a, float *c, const int size); + template void 
launch_todtype(const nv_bfloat16 *a, half *c, const int size); + template void launch_todtype(const nv_bfloat16 *a, int64_t *c, const int size); + template void launch_todtype(const nv_bfloat16 *a, int32_t *c, const int size); + template void launch_todtype(const nv_bfloat16 *a, int16_t *c, const int size); + template void launch_todtype(const nv_bfloat16 *a, int8_t *c, const int size); + + template void launch_todtype(const half *a, double *c, const int size); + template void launch_todtype(const half *a, float *c, const int size); + template void launch_todtype(const half *a, nv_bfloat16 *c, const int size); + template void launch_todtype(const half *a, int64_t *c, const int size); + template void launch_todtype(const half *a, int32_t *c, const int size); + template void launch_todtype(const half *a, int16_t *c, const int size); + template void launch_todtype(const half *a, int8_t *c, const int size); + + template void launch_todtype(const int64_t *a, double *c, const int size); + template void launch_todtype(const int64_t *a, float *c, const int size); + template void launch_todtype(const int64_t *a, half *c, const int size); + template void launch_todtype(const int64_t *a, nv_bfloat16 *c, const int size); + template void launch_todtype(const int64_t *a, int32_t *c, const int size); + template void launch_todtype(const int64_t *a, int16_t *c, const int size); + template void launch_todtype(const int64_t *a, int8_t *c, const int size); + + template void launch_todtype(const int32_t *a, double *c, const int size); + template void launch_todtype(const int32_t *a, float *c, const int size); + template void launch_todtype(const int32_t *a, half *c, const int size); + template void launch_todtype(const int32_t *a, nv_bfloat16 *c, const int size); + template void launch_todtype(const int32_t *a, int64_t *c, const int size); + template void launch_todtype(const int32_t *a, int16_t *c, const int size); + template void launch_todtype(const int32_t *a, int8_t *c, const int 
size); + + template void launch_todtype(const int16_t *a, double *c, const int size); + template void launch_todtype(const int16_t *a, float *c, const int size); + template void launch_todtype(const int16_t *a, half *c, const int size); + template void launch_todtype(const int16_t *a, nv_bfloat16 *c, const int size); + template void launch_todtype(const int16_t *a, int64_t *c, const int size); + template void launch_todtype(const int16_t *a, int32_t *c, const int size); + template void launch_todtype(const int16_t *a, int8_t *c, const int size); + + template void launch_todtype(const int8_t *a, double *c, const int size); + template void launch_todtype(const int8_t *a, float *c, const int size); + template void launch_todtype(const int8_t *a, half *c, const int size); + template void launch_todtype(const int8_t *a, nv_bfloat16 *c, const int size); + template void launch_todtype(const int8_t *a, int64_t *c, const int size); + template void launch_todtype(const int8_t *a, int32_t *c, const int size); + template void launch_todtype(const int8_t *a, int16_t *c, const int size); + // add template __global__ void add_kernel(const T *A, const T *B, T *C, const int size) diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh index 604421c4..4100f38d 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.cuh @@ -7,6 +7,15 @@ namespace deepx::tensorfunc { + + //todtype + template + __global__ void todtype_kernel(const T* A, Dtype* C,const int size); + + template + void launch_todtype(const T* a, Dtype* c,const int size); + + //add template __global__ void add_kernel(const T* A, const T* B, T* C,const int size); diff --git a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp 
index 82cb4cbf..b7d3a680 100644 --- a/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp +++ b/excuter/op-mem-cuda/src/deepx/tensorfunc/elementwise_miaobyte_basic.hpp @@ -9,10 +9,17 @@ #include "stdutil/error.hpp" namespace deepx::tensorfunc -{ - // CUDA kernel函数声明 - - +{ + //todtype + template + void todtype(const Tensor &input, Tensor &output){ + if (input.shape.size != output.shape.size || input.shape.size != output.shape.size) { + throw TensorShapeError("todtype"); + } + launch_todtype(input.data, output.data, input.shape.size); + }; + + //add template struct addDispatcher { diff --git a/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp b/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp index 97e4b17e..709a899c 100644 --- a/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp +++ b/excuter/op-mem-cuda/src/deepx/tf/elementwise_basic.hpp @@ -9,6 +9,359 @@ namespace deepx::tf { + + // todtype + class Todtype : public TF + { + public: + Todtype(const vector &args, const vector &returns) + { + this->name = "todtype"; + this->tftype = "elementwise"; + this->args = args; + this->returns = returns; + } + + string math_formula() const override + { + return "T3(dtypeA)->T1(dtypeB)"; + } + shared_ptr clone() const override + { + return make_shared(*this); + } + int run(shared_ptr mem, string &error) override + { + if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error)) + { + return 1; + } + auto a_shape = mem->gettensor(this->args[0].textvalue).get()->shape; + auto c_shape = mem->gettensor(this->returns[0].textvalue).get()->shape; + if (a_shape.size != c_shape.size) + { + error = "Shape mismatch: " + to_string(a_shape.size) + " != " + to_string(c_shape.size); + return 1; + } + Precision a_type = a_shape.dtype; + Precision c_type = c_shape.dtype; + switch (a_type) + { + case Precision::Float64: + { + switch (c_type) + { + case Precision::Float64: + { + auto a = mem->gettensor(this->args[0].textvalue); + auto b 
= mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data, b->data, a->shape.size); + break; + } + case Precision::Float32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + break; + } + case Precision::Float32: + { + switch (c_type) + { + case Precision::Float64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + { + auto a = mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data,b->data, a->shape.size); + break; + } + case Precision::Float16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), 
*mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + } + break; + case Precision::Float16: + { + switch (c_type) + { + case Precision::Float64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + { + auto a = mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data, b->data, a->shape.size); + break; + } + + case Precision::BFloat16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + } + break; + case Precision::BFloat16: + { + switch (c_type) + { 
+ case Precision::Float64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + { + auto a = mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data, b->data, a->shape.size); + break; + } + case Precision::Int64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + } + break; + case Precision::Int64: + { + switch (c_type) + { + case Precision::Float64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + { + auto a = 
mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data, b->data, a->shape.size); + break; + } + case Precision::Int32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + } + break; + case Precision::Int32: + { + switch (c_type) + { + case Precision::Float64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + { + auto a = mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data, b->data, a->shape.size); + break; + } + case Precision::Int16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + 
precision_str(c_type); + return 1; + } + } + break; + case Precision::Int16: + { + switch (c_type) + { + case Precision::Float64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + { + auto a = mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data, b->data, a->shape.size); + break; + } + case Precision::Int8: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + } + break; + case Precision::Int8: + { + switch (c_type) + { + case Precision::Float64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::BFloat16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), 
*mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + { + auto a = mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data, b->data, a->shape.size); + break; + } + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + } + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + return 0; + }; + + }; + + // add template class Add : public TF { @@ -16,7 +369,7 @@ namespace deepx::tf Add(const vector &args, const vector &returns) { this->name = "add"; - this->metadata.author = Author::name(); + this->metadata.author = Author::name(); this->tftype = "elementwise"; this->args = args; this->returns = returns; @@ -32,7 +385,7 @@ namespace deepx::tf } int run(shared_ptr mem, string &error) override { - if(!checktensors({this->args[0].textvalue,this->args[1].textvalue,this->returns[0].textvalue},mem, error)) + if (!checktensors({this->args[0].textvalue, this->args[1].textvalue, this->returns[0].textvalue}, mem, error)) { return 1; } @@ -90,7 +443,7 @@ namespace deepx::tf this->args = args; this->returns = returns; } - + string math_formula() const override { return "T3=T1+scalar"; @@ -100,8 +453,8 @@ namespace deepx::tf return make_shared>(*this); } int run(shared_ptr mem, string &error) override - { - if(!checktensors({this->args[0].textvalue,this->returns[0].textvalue},mem, error)) + { + if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error)) { return 1; } @@ -158,7 
+511,7 @@ namespace deepx::tf this->args = args; this->returns = returns; } - + string math_formula() const override { return "T3=T1-T2"; @@ -168,8 +521,8 @@ namespace deepx::tf return make_shared>(*this); } int run(shared_ptr mem, string &error) override - { - if(!checktensors({this->args[0].textvalue,this->args[1].textvalue,this->returns[0].textvalue},mem, error)) + { + if (!checktensors({this->args[0].textvalue, this->args[1].textvalue, this->returns[0].textvalue}, mem, error)) { return 1; } @@ -222,12 +575,12 @@ namespace deepx::tf SubScalar(const vector &args, const vector &returns) { this->name = "subscalar"; - this->metadata.author=Author::name(); + this->metadata.author = Author::name(); this->tftype = "elementwise"; this->args = args; this->returns = returns; } - + string math_formula() const override { return "T3=T1-scalar"; @@ -238,7 +591,7 @@ namespace deepx::tf } int run(shared_ptr mem, string &error) override { - if(!checktensors({this->args[0].textvalue,this->returns[0].textvalue},mem, error)) + if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error)) { return 1; } @@ -290,12 +643,12 @@ namespace deepx::tf Mul(const vector &args, const vector &returns) { this->name = "mul"; - this->metadata.author=Author::name(); + this->metadata.author = Author::name(); this->tftype = "elementwise"; this->args = args; this->returns = returns; } - + string math_formula() const override { return "T3=T1*T2"; @@ -305,8 +658,8 @@ namespace deepx::tf return make_shared>(*this); } int run(shared_ptr mem, string &error) override - { - if(!checktensors({this->args[0].textvalue,this->args[1].textvalue,this->returns[0].textvalue},mem, error)) + { + if (!checktensors({this->args[0].textvalue, this->args[1].textvalue, this->returns[0].textvalue}, mem, error)) { return 1; } @@ -359,12 +712,12 @@ namespace deepx::tf MulScalar(const vector &args, const vector &returns) { this->name = "mulscalar"; - this->metadata.author=Author::name(); + 
this->metadata.author = Author::name(); this->tftype = "elementwise"; this->args = args; this->returns = returns; } - + string math_formula() const override { return "T3=T1*scalar"; @@ -374,8 +727,8 @@ namespace deepx::tf return make_shared>(*this); } int run(shared_ptr mem, string &error) override - { - if(!checktensors({this->args[0].textvalue,this->returns[0].textvalue},mem, error)) + { + if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error)) { return 1; } @@ -427,12 +780,12 @@ namespace deepx::tf Div(const vector &args, const vector &returns) { this->name = "div"; - this->metadata.author=Author::name(); + this->metadata.author = Author::name(); this->tftype = "elementwise"; this->args = args; this->returns = returns; } - + string math_formula() const override { return "T3=T1/T2"; @@ -442,8 +795,8 @@ namespace deepx::tf return make_shared>(*this); } int run(shared_ptr mem, string &error) override - { - if(!checktensors({this->args[0].textvalue,this->args[1].textvalue,this->returns[0].textvalue},mem, error)) + { + if (!checktensors({this->args[0].textvalue, this->args[1].textvalue, this->returns[0].textvalue}, mem, error)) { return 1; } @@ -496,12 +849,12 @@ namespace deepx::tf DivScalar(const vector &args, const vector &returns) { this->name = "divscalar"; - this->metadata.author=Author::name(); + this->metadata.author = Author::name(); this->tftype = "elementwise"; this->args = args; this->returns = returns; } - + string math_formula() const override { return "T3=scalar/T1"; @@ -511,8 +864,8 @@ namespace deepx::tf return make_shared>(*this); } int run(shared_ptr mem, string &error) override - { - if(!checktensors({this->args[0].textvalue,this->returns[0].textvalue},mem, error)) + { + if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error)) { return 1; } @@ -564,12 +917,12 @@ namespace deepx::tf RDivScalar(const vector &args, const vector &returns) { this->name = "rdivscalar"; - 
this->metadata.author=Author::name(); + this->metadata.author = Author::name(); this->tftype = "elementwise"; this->args = args; this->returns = returns; } - + string math_formula() const override { return "T3=scalar/T1"; @@ -580,7 +933,7 @@ namespace deepx::tf } int run(shared_ptr mem, string &error) override { - if(!checktensors({this->args[1].textvalue,this->returns[0].textvalue},mem, error)) + if (!checktensors({this->args[1].textvalue, this->returns[0].textvalue}, mem, error)) { return 1; } @@ -633,7 +986,7 @@ namespace deepx::tf Invert(const vector &args, const vector &returns) { this->name = "invert"; - this->metadata.author=Author::name(); + this->metadata.author = Author::name(); this->tftype = "elementwise"; this->args = args; this->returns = returns; @@ -648,7 +1001,7 @@ namespace deepx::tf } int run(shared_ptr mem, string &error) override { - if(!checktensors({this->args[0].textvalue,this->returns[0].textvalue},mem, error)) + if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error)) { return 1; } @@ -680,7 +1033,6 @@ namespace deepx::tf return 0; } }; - }; #endif // DEEPX_TF_ELEMENTWISE_BASIC_HPP diff --git a/excuter/op-mem-ompsimd/src/client/tfs.cpp b/excuter/op-mem-ompsimd/src/client/tfs.cpp index 5080a673..d2893cfc 100644 --- a/excuter/op-mem-ompsimd/src/client/tfs.cpp +++ b/excuter/op-mem-ompsimd/src/client/tfs.cpp @@ -174,7 +174,19 @@ namespace deepx::tf // elementwise void register_elementwise(TfFactory &tffactory) - { + { + // todtype + tffactory.add_tf(std::make_shared(vector( + { + Param("A", DataCategory::Tensor, Precision::Any), + }), + vector( + { + Param("C", DataCategory::Tensor, Precision::Any), + }))); + + + // add author=miaobyte tffactory.add_tf(std::make_shared>(vector( { Param("a", DataCategory::Tensor, Precision::Any), diff --git a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp index 5acf3bd7..f8f0302f 100644 --- 
a/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tensorfunc/elementwise_miaobyte.hpp @@ -18,15 +18,15 @@ namespace deepx::tensorfunc { if (A.shape == B.shape && A.shape == C.shape) { - C.shape.rangeParallel(C.shape.dim() - 1, [&A, &B, &C, &scalar_op, &simd_op](int i) + C.shape.rangeElementwiseParallel([&A, &B, &C, &scalar_op, &simd_op](int i,int i_end) { - int shape_last = C.shape[-1]; + const ScalableTag tag; const size_t lanes = Lanes(tag); size_t j = 0; // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag, A.data + i + j)) + while (j < i_end && !IsAligned(tag, A.data + i + j)) { T c; scalar_op(A.data[i + j], B.data[i + j], c); @@ -35,14 +35,14 @@ namespace deepx::tensorfunc } // 2. 处理中间对齐部分 - size_t aligned_end = shape_last - (shape_last % lanes); + size_t aligned_end = i_end - (i_end % lanes); for (; j + lanes <= aligned_end; j += lanes) { simd_op(A.data + i + j, B.data + i + j, C.data + i + j, lanes); } // 3. 处理尾部剩余元素 - for (; j < shape_last; j++) + for (; j < i_end; j++) { T c; scalar_op(A.data[i + j], B.data[i + j], c); @@ -62,15 +62,14 @@ namespace deepx::tensorfunc { if (A.shape == C.shape) { - C.shape.rangeParallel(C.shape.dim() - 1, [&A, &b, &C, &scalar_op, &simd_op](int i) + C.shape.rangeElementwiseParallel([&A, &b, &C, &scalar_op, &simd_op](int i,int i_end) { - int shape_last = C.shape[-1]; const ScalableTag tag; const size_t lanes = Lanes(tag); size_t j = 0; // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag, A.data + i + j)) + while (j < i_end && !IsAligned(tag, A.data + i + j)) { T c; scalar_op(A.data[i + j], b, c); @@ -79,14 +78,14 @@ namespace deepx::tensorfunc } // 2. 处理中间对齐部分 - size_t aligned_end = shape_last - (shape_last % lanes); + size_t aligned_end = i_end - (i_end % lanes); for (; j + lanes <= aligned_end; j += lanes) { simd_op(A.data + i + j, b, C.data + i + j, lanes); } // 3. 
处理尾部剩余元素 - for (; j < shape_last; j++) + for (; j < i_end; j++) { T c; scalar_op(A.data[i + j], b, c); @@ -99,7 +98,21 @@ namespace deepx::tensorfunc } } - // 通用实现 + //todtype + template + static void todtype(const Tensor &A, Tensor &C) + { + C.shape.rangeElementwiseParallel([&A, &C](int i,int i_end) + { + for (int j = 0; j < i_end; j++) + { + C.data[i + j] = static_cast(A.data[i + j]); + } + }); + } + + + // add template struct addDispatcher { @@ -292,9 +305,9 @@ namespace deepx::tensorfunc { if (A.shape == C.shape) { - A.shape.rangeParallel(A.shape.dim()-1, [&A, &C](int idx) + A.shape.rangeElementwiseParallel([&A, &C](int idx,int idx_end) { - for (int j=0;j tag; const size_t lanes = Lanes(tag); size_t j=0; // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag,input.data + i + j)) { + while (j < i_end && !IsAligned(tag,input.data + i + j)) { output.data[i+j] = std::sqrt(input.data[i+j]); ++j; } // 2. 处理中间对齐部分 - size_t aligned_end=shape_last-(shape_last%lanes); + size_t aligned_end=i_end-(i_end%lanes); for (; j+lanes<=aligned_end; j += lanes ) { auto vec = Load(tag, input.data + i + j); @@ -337,7 +349,7 @@ namespace deepx::tensorfunc } // 3. 处理尾部剩余元素 - for (;j tag; const size_t lanes = Lanes(tag); size_t j=0; // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag,input.data + i + j)) { + while (j < i_end && !IsAligned(tag,input.data + i + j)) { output.data[i+j] = std::sin(input.data[i+j]); ++j; } // 2. 处理中间对齐部分 - size_t aligned_end=shape_last-(shape_last%lanes); + size_t aligned_end=i_end-(i_end%lanes); for (; j+lanes<=aligned_end; j += lanes ) { auto vec = Load(tag, input.data + i + j); @@ -504,7 +517,7 @@ namespace deepx::tensorfunc } // 3. 处理尾部剩余元素 - for (;j tag; const size_t lanes = Lanes(tag); size_t j=0; // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag,input.data + i + j)) { + while (j < i_end && !IsAligned(tag,input.data + i + j)) { output.data[i+j] = std::cos(input.data[i+j]); ++j; } // 2. 
处理中间对齐部分 - size_t aligned_end=shape_last-(shape_last%lanes); + size_t aligned_end=i_end-(i_end%lanes); for (; j+lanes<=aligned_end; j += lanes ) { auto vec = Load(tag, input.data + i + j); @@ -547,7 +559,7 @@ namespace deepx::tensorfunc } // 3. 处理尾部剩余元素 - for (;j tag; const size_t lanes = Lanes(tag); size_t j=0; // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag,input.data + i + j)) { + while (j < i_end && !IsAligned(tag,input.data + i + j)) { output.data[i+j] = std::tan(input.data[i+j]); ++j; } // 2. 处理中间对齐部分 - size_t aligned_end=shape_last-(shape_last%lanes); + size_t aligned_end=i_end-(i_end%lanes); for (; j+lanes<=aligned_end; j += lanes ) { auto vec = Load(tag, input.data + i + j); @@ -590,7 +601,7 @@ namespace deepx::tensorfunc } // 3. 处理尾部剩余元素 - for (;j tag; const size_t lanes = Lanes(tag); size_t j=0; // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag,A.data + idx+j)) { - C.data[idx+j]=std::max(A.data[idx+j],B.data[idx+j]); + while (j < i_end && !IsAligned(tag,A.data + i + j)) { + C.data[i+j]=std::max(A.data[i+j],B.data[i+j]); ++j; } // 2. 处理中间对齐部分 - size_t aligned_end=shape_last-(shape_last%lanes); + size_t aligned_end=i_end-(i_end%lanes); for (; j+lanes<=aligned_end; j += lanes ) { - auto vec1 = Load(tag, A.data + idx+j); // 加载数组1的向量 - auto vec2 = Load(tag, B.data + idx+j); // 加载数组2的向量 + auto vec1 = Load(tag, A.data + i + j); // 加载数组1的向量 + auto vec2 = Load(tag, B.data + i + j); // 加载数组2的向量 auto vec_result = Max(vec1, vec2); // 向量比较 - Store(vec_result, tag, C.data + idx+j); // 存储结果向量 + Store(vec_result, tag, C.data + i + j); // 存储结果向量 } // 3. 处理尾部剩余元素 - for (;j tag; const size_t lanes = Lanes(tag); size_t j=0; // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag,A.data + idx+j)) { - C.data[idx+j]=std::max(A.data[idx+j],b); + while (j < i_end && !IsAligned(tag,A.data + i + j)) { + C.data[i+j]=std::max(A.data[i+j],b); ++j; } // 2. 
处理中间对齐部分 - size_t aligned_end=shape_last-(shape_last%lanes); + size_t aligned_end=i_end-(i_end%lanes); for (; j+lanes<=aligned_end; j += lanes ) { - auto vec1 = Load(tag, A.data + idx+j); // 加载数组1的向量 + auto vec1 = Load(tag, A.data + i + j); // 加载数组1的向量 auto vec2=Set(tag,b); auto vec_result = Max(vec1, vec2); // 向量比较 - Store(vec_result, tag, C.data + idx+j); // 存储结果向量 + Store(vec_result, tag, C.data + i + j); // 存储结果向量 } // 3. 处理尾部剩余元素 - for (;j tag; const size_t lanes = Lanes(tag); size_t j=0; // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag,A.data + idx+j)) { - C.data[idx+j]=std::min(A.data[idx+j],B.data[idx+j]); + while (j < i_end && !IsAligned(tag,A.data + i + j)) { + C.data[i+j]=std::min(A.data[i+j],B.data[i+j]); ++j; } // 2. 处理中间对齐部分 - size_t aligned_end=shape_last-(shape_last%lanes); + size_t aligned_end=i_end-(i_end%lanes); for (; j+lanes<=aligned_end; j += lanes ) { - auto vec1 = Load(tag, A.data + idx+j); // 加载数组1的向量 - auto vec2 = Load(tag, B.data + idx+j); // 加载数组2的向量 + auto vec1 = Load(tag, A.data + i + j); // 加载数组1的向量 + auto vec2 = Load(tag, B.data + i + j); // 加载数组2的向量 auto vec_result = Min(vec1, vec2); // 向量比较 - Store(vec_result, tag, C.data + idx+j); // 存储结果向量 + Store(vec_result, tag, C.data + i + j); // 存储结果向量 } // 3. 处理尾部剩余元素 - for (;j tag; const size_t lanes = Lanes(tag); size_t j=0; // 1. 处理前置未对齐部分 - while (j < shape_last && !IsAligned(tag,A.data + idx+j)) { - C.data[idx+j]=std::min(A.data[idx+j],b); + while (j < i_end && !IsAligned(tag,A.data + i + j)) { + C.data[i+j]=std::min(A.data[i+j],b); ++j; } // 2. 处理中间对齐部分 - size_t aligned_end=shape_last-(shape_last%lanes); + size_t aligned_end=i_end-(i_end%lanes); for (; j+lanes<=aligned_end; j += lanes ) { - auto vec1 = Load(tag, A.data + idx+j); // 加载数组1的向量 + auto vec1 = Load(tag, A.data + i + j); // 加载数组1的向量 auto vec2=Set(tag,b); auto vec_result = Min(vec1, vec2); // 向量比较 - Store(vec_result, tag, C.data + idx+j); // 存储结果向量 + Store(vec_result, tag, C.data + i + j); // 存储结果向量 } // 3. 
处理尾部剩余元素 - for (;jB.data[idx+i]; + mask.data[i+j]=A.data[i+j]>B.data[i+j]; } }); } @@ -908,11 +915,11 @@ namespace deepx::tensorfunc { if (A.shape == mask.shape) { - A.shape.rangeParallel(A.shape.dim()-1, [&A, &mask, &scalar](int idx) + A.shape.rangeElementwiseParallel([&A, &mask, &scalar](int i,int i_end) { - for (int i = 0; i < A.shape[-1]; i++) + for (int j = 0; j < i_end; j++) { - mask.data[idx+i]=A.data[idx+i]>scalar; + mask.data[i+j]=A.data[i+j]>scalar; } }); } @@ -931,12 +938,12 @@ namespace deepx::tensorfunc { if (cases.shape == C.shape) { - C.shape.rangeParallel(C.shape.dim()-1, [&tensors, &cases, &C](int idx) + C.shape.rangeElementwiseParallel([&tensors, &cases, &C](int i,int i_end) { - for (int i = 0; i < C.shape[-1]; i++) + for (int j = 0; j < i_end; j++) { - int which_tensor=cases.data[idx]; - C.data[idx+i]=tensors[which_tensor]->data[idx]; + int which_tensor=cases.data[i]; + C.data[i+j]=tensors[which_tensor]->data[i]; } }); } diff --git a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp index 1f754006..58768784 100644 --- a/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp +++ b/excuter/op-mem-ompsimd/src/deepx/tf/elementwise.hpp @@ -10,7 +10,247 @@ #include "deepx/tensorfunc/elementwise_cblas.hpp" namespace deepx::tf { + // todtype + class Todtype : public TF + { + public: + Todtype(const vector &args, const vector &returns) + { + this->name = "todtype"; + this->tftype = "elementwise"; + this->args = args; + this->returns = returns; + } + + string math_formula() const override + { + return "T3(dtypeA)->T1(dtypeB)"; + } + shared_ptr clone() const override + { + return make_shared(*this); + } + int run(shared_ptr mem, string &error) override + { + if (!checktensors({this->args[0].textvalue, this->returns[0].textvalue}, mem, error)) + { + return 1; + } + auto a_shape = mem->gettensor(this->args[0].textvalue).get()->shape; + auto c_shape = mem->gettensor(this->returns[0].textvalue).get()->shape; 
+ if (a_shape.size != c_shape.size) + { + error = "Shape mismatch: " + to_string(a_shape.size) + " != " + to_string(c_shape.size); + return 1; + } + Precision a_type = a_shape.dtype; + Precision c_type = c_shape.dtype; + switch (a_type) + { + case Precision::Float64: + { + switch (c_type) + { + case Precision::Float64: + { + auto a = mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data, b->data, a->shape.size); + break; + } + case Precision::Float32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + break; + } + case Precision::Float32: + { + switch (c_type) + { + case Precision::Float64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + { + auto a = mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data,b->data, a->shape.size); + break; + } + case Precision::Int64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case 
Precision::Int16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + } + + break; + case Precision::Int64: + { + switch (c_type) + { + case Precision::Float64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + { + auto a = mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data, b->data, a->shape.size); + break; + } + case Precision::Int32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + } + break; + case Precision::Int32: + { + switch (c_type) + { + case Precision::Float64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + { + auto a = 
mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data, b->data, a->shape.size); + break; + } + case Precision::Int16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + } + break; + case Precision::Int16: + { + switch (c_type) + { + case Precision::Float64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + { + auto a = mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data, b->data, a->shape.size); + break; + } + case Precision::Int8: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + } + break; + case Precision::Int8: + { + switch (c_type) + { + case Precision::Float64: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Float32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int64: + 
tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int32: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int16: + tensorfunc::todtype(*mem->gettensor(this->args[0].textvalue), *mem->gettensor(this->returns[0].textvalue)); + break; + case Precision::Int8: + { + auto a = mem->gettensor(this->args[0].textvalue); + auto b = mem->gettensor(this->returns[0].textvalue); + b->copyer(a->data, b->data, a->shape.size); + break; + } + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + } + break; + default: + error = "Unsupported dtype: " + precision_str(c_type); + return 1; + } + return 0; + }; + + }; + + // add template class Add : public TF { diff --git a/front/py/deepx/nn/functional/__init__.py b/front/py/deepx/nn/functional/__init__.py index 4a17d8ca..051f8d68 100644 --- a/front/py/deepx/nn/functional/__init__.py +++ b/front/py/deepx/nn/functional/__init__.py @@ -10,26 +10,28 @@ from .authormap import defaultauthor from .reduce import mean - from .activite import * - from .elementwise import * from .normalization import * +from .changeshape import * __all__ = [ #leaffunc "newtensor","rnewtensor","printtensor","load", #life "printtensor","save",#io - "constant","constant_","full","zeros","ones","uniform","uniform_","arange","arange_","kaiming_uniform","kaiming_uniform_","calculate_fan_in_and_fan_out", - "add","sub","mul","div","sqrt","pow","exp","log", + "constant","constant_","full","zeros","ones","uniform","uniform_","arange","arange_","kaiming_uniform","kaiming_uniform_", + "add","sub","mul","div","sqrt","pow","exp","log","invert","todtype","dropout", "matmul", "reducemax","reducemin","sum","prod", "reshape","permute","transpose","concat","broadcastTo","indexselect", #functional - "relu","sigmoid","swish", + "relu","sigmoid","swish","silu", "mean", "rsqrt", "softmax", 
+ "squeeze","unsqueeze", + #other + "calculate_fan_in_and_fan_out", ] \ No newline at end of file diff --git a/front/py/deepx/nn/functional/activite.py b/front/py/deepx/nn/functional/activite.py index 8510a052..6effbb9c 100644 --- a/front/py/deepx/nn/functional/activite.py +++ b/front/py/deepx/nn/functional/activite.py @@ -1,33 +1,18 @@ from deepx.tensor import Tensor from deepx.nn.functional import newtensor - +from .leaffunc_elementwise import exp # 数学公式:relu(x) = max(0, x) def relu(t: Tensor)->Tensor: from .leaffunc_elementwise import max as max_func - outtensor=t - if t.name!=None: - outtensor=newtensor(t.shape, dtype=t.dtype) - else:#inplace操作 - pass + outtensor=newtensor(t.shape, dtype=t.dtype) return max_func(t,0,outtensor) # 数学公式:σ(x) = 1 / (1 + exp(-x)) def sigmoid(t: Tensor)->Tensor: - outtensor=t - if t.name is not None: - outtensor=newtensor(t.shape, dtype=t.dtype) - t.mul(-1,out=outtensor) - outtensor.exp_() - outtensor.add_(1) - outtensor.rdiv_(1) - return outtensor + return 1 / (exp(t*-1)+1) # 数学公式:swish(x) = x * σ(βx) def swish(x: Tensor,beta: float = 1.0) -> Tensor: - outtensor=x - if x.name is not None: - outtensor=newtensor(x.shape, dtype=x.dtype) - x.mul(beta,out=outtensor) - outtensor=sigmoid(outtensor) - outtensor.mul_(x) - return outtensor + return x*sigmoid(x*beta) + +silu=swish \ No newline at end of file diff --git a/front/py/deepx/nn/functional/authormap.py b/front/py/deepx/nn/functional/authormap.py index 120db25b..5c6b492b 100644 --- a/front/py/deepx/nn/functional/authormap.py +++ b/front/py/deepx/nn/functional/authormap.py @@ -29,6 +29,7 @@ 'powscalar':'miaobyte', 'rpowscalar':'miaobyte', 'sqrt':'miaobyte', + 'dropout':'miaobyte', #changeshape 'reshape':'miaobyte', 'transpose':'miaobyte', diff --git a/front/py/deepx/nn/functional/changeshape.py b/front/py/deepx/nn/functional/changeshape.py new file mode 100644 index 00000000..db963f85 --- /dev/null +++ b/front/py/deepx/nn/functional/changeshape.py @@ -0,0 +1,18 @@ +from deepx import 
Tensor +from .leaffunc_changeshape import reshape + +def squeeze(t:Tensor,dim:int)->Tensor: + assert isinstance(dim,int) + assert isinstance(t,Tensor) + dim=dim%t.ndim + newshape=list(t.shape) + newshape.pop(dim) + return reshape(t,tuple(newshape)) + +def unsqueeze(t:Tensor,dim:int)->Tensor: + assert isinstance(dim,int) + assert isinstance(t,Tensor) + dim=dim%t.ndim + newshape=list(t.shape) + newshape.insert(dim,1) + return reshape(t,tuple(newshape)) \ No newline at end of file diff --git a/front/py/deepx/nn/functional/elementwise.py b/front/py/deepx/nn/functional/elementwise.py index d6a4e6ea..76c22aca 100644 --- a/front/py/deepx/nn/functional/elementwise.py +++ b/front/py/deepx/nn/functional/elementwise.py @@ -2,11 +2,7 @@ from deepx.nn.functional import newtensor def rsqrt(input:Tensor)->Tensor: - from .leaffunc_elementwise import sqrt,div - outtensor=input - if input.name is not None: - outtensor=newtensor(input.shape, dtype=input.dtype) - sqrt(input,out= outtensor) - return div(1,outtensor,outtensor) + from .leaffunc_elementwise import sqrt + return 1/sqrt(input) diff --git a/front/py/deepx/nn/functional/leaffunc.py b/front/py/deepx/nn/functional/leaffunc.py index 566e2d48..89cbde68 100644 --- a/front/py/deepx/nn/functional/leaffunc.py +++ b/front/py/deepx/nn/functional/leaffunc.py @@ -25,15 +25,15 @@ def op_func( newshape = Shape.broadcast_shape(a.shape, b.shape) an = a.broadcastTo(newshape) bn = b.broadcastTo(newshape) - if isinstance(out,str): + if isinstance(out,str) or out is None: outtensor=newtensor(newshape,dtype=a.dtype,name=out) else: - if isinstance(out,str): + if isinstance(out,str) or out is None: outtensor=newtensor(a.shape,dtype=a.dtype,name=out) rtf_func = getattr(rtf_module, f'rtf_{op_name}') rtf_func(an, bn, outtensor, defaultauthor[op_name]) else: - if isinstance(out,str): + if isinstance(out,str) or out is None: outtensor=newtensor(a.shape,dtype=a.dtype,name=out) rtf_func = getattr(rtf_module, f'rtf_{op_name}scalar') rtf_func(a, b, 
outtensor, defaultauthor[f'{op_name}scalar']) @@ -47,7 +47,7 @@ def op_func( a:Tensor, out:Union[Tensor,str]=None)->Tensor: outtensor=out - if isinstance(out,str): + if isinstance(out,str) or out is None: outtensor=newtensor(a.shape,dtype=a.dtype,name=out) rtf_module = importlib.import_module('deepx.nn.functional.rtf_elementwise') rtf_func = getattr(rtf_module, f'rtf_{op_name}') @@ -69,7 +69,7 @@ def op_func( if dim is None: dim=tuple(range(a.ndim)) result=out - if isinstance(out,str): + if isinstance(out,str) or out is None: resultshape=Shape.reduceshape(a.shape,dim,keepdim) result=newtensor(resultshape, dtype=a.dtype,name=out) rtf_module = importlib.import_module('deepx.nn.functional.rtf_reduce') diff --git a/front/py/deepx/nn/functional/leaffunc_changeshape.py b/front/py/deepx/nn/functional/leaffunc_changeshape.py index c9501755..a830307a 100644 --- a/front/py/deepx/nn/functional/leaffunc_changeshape.py +++ b/front/py/deepx/nn/functional/leaffunc_changeshape.py @@ -10,7 +10,7 @@ def reshape(t:Tensor,shape:tuple[int,...],out:Union[Tensor,str]='')->Tensor: assert isinstance(i,int) and i>0 outtensor=out - if isinstance(out,str): + if isinstance(out,str) or out is None: outshape=shape outtensor=newtensor(outshape,dtype=t.dtype,name=out) else: @@ -32,7 +32,7 @@ def permute(t:Tensor, raise ValueError(f"shape参数不合法,当前输入维度数:{len(dimorder)},张量维度数:{t.ndim}") dimorder = [d % t.ndim for d in dimorder] outtensor=out - if isinstance(out,str): + if isinstance(out,str) or out is None: outshape = [t.shape[dim] for dim in dimorder] outtensor=newtensor(outshape,dtype=t.dtype,name=out) @@ -47,9 +47,14 @@ def transpose(t:Tensor,out:Union[Tensor,str]='')->Tensor: -def concat(tensors:Union[list[Tensor],tuple[Tensor]],dim:int,out:Union[Tensor,str]='')->Tensor: +def concat(tensors:Union[list[Tensor],tuple[Tensor,...]],dim:int,out:Union[Tensor,str]='')->Tensor: + assert isinstance(dim,int) + assert isinstance(tensors,list) or isinstance(tensors,tuple) + for t in tensors: + assert 
isinstance(t,Tensor) + outtensor=out - if isinstance(out,str): + if isinstance(out,str) or out is None: outshape=list(tensors[0].shape) outshape[dim]=sum(t.shape[dim] for t in tensors) outtensor=newtensor(outshape,dtype=tensors[0].dtype,name=out) @@ -68,7 +73,7 @@ def broadcastTo(t:Tensor,new_shape:tuple[int,...],out:Union[Tensor,str]='',requi if bshape!=tuple(new_shape): raise ValueError(f"广播失败:{t.shape} 无法广播为 {new_shape} ") outtensor=out - if isinstance(out,str): + if isinstance(out,str) or out is None: outshape=new_shape outtensor=newtensor(outshape,dtype=t.dtype,name=out) from .rtf_changeshape import rtf_broadcastTo @@ -80,7 +85,7 @@ def indexselect(input:Tensor,indices:Tensor,gatheraxis:int,out:Union[Tensor,str] assert gatheraxis>=0 and gatheraxisTensor: # outtensor=None -# if isinstance(out,str): +# if isinstance(out,str) or out is None: # outtensor=Tensor(shape=shape, dtype=t.dtype, device=t.device) # outtensor.addtograph(out) # else: diff --git a/front/py/deepx/nn/functional/leaffunc_elementwise.py b/front/py/deepx/nn/functional/leaffunc_elementwise.py index 6aa54077..77bda9b2 100644 --- a/front/py/deepx/nn/functional/leaffunc_elementwise.py +++ b/front/py/deepx/nn/functional/leaffunc_elementwise.py @@ -28,7 +28,7 @@ def rdiv( b: Tensor, out:Union[Tensor,str]=None)->Tensor: outtensor=out - if isinstance(out,str): + if isinstance(out,str) or out is None: outtensor=newtensor(b.shape,dtype=b.dtype,name=out) from .rtf_elementwise import rtf_rdivscalar rtf_rdivscalar(a,b,outtensor,defaultauthor['rdivscalar']) @@ -41,7 +41,7 @@ def rdiv( pow=create_A_B_tf_C('pow') def rpow(a:Number,b:Tensor,out:Union[Tensor,str]=None)->Tensor: outtensor=out - if isinstance(out,str): + if isinstance(out,str) or out is None: outtensor=newtensor(b.shape,dtype=b.dtype,name=out) from .rtf_elementwise import rtf_rpowscalar rtf_rpowscalar(a,b,outtensor,defaultauthor['rpowscalar']) @@ -53,4 +53,26 @@ def rpow(a:Number,b:Tensor,out:Union[Tensor,str]=None)->Tensor: log=create_A_tf_C('log') 
#invert
-invert=create_A_tf_C('invert')
\ No newline at end of file
+invert=create_A_tf_C('invert')
+
+#todtype
+def todtype(t:Tensor,dest:Tensor):
+    assert isinstance(t,Tensor)
+    assert isinstance(dest,Tensor)
+    assert t.shape==dest.shape
+
+    from .rtf_elementwise import rtf_todtype
+    rtf_todtype(t,dest)
+
+#dropout
+def dropout(a:Tensor, p:float, out:Union[Tensor,str]='')->Tensor:
+    assert isinstance(a,Tensor)
+    outtensor=out
+    if isinstance(out,str) or out is None:
+        outtensor=newtensor(a.shape,dtype=a.dtype,name=out)
+    assert a.shape==outtensor.shape
+
+    from .rtf_elementwise import rtf_dropout
+    rtf_dropout(a,p,outtensor,defaultauthor['dropout'])
+    return outtensor
+
diff --git a/front/py/deepx/nn/functional/leaffunc_life.py b/front/py/deepx/nn/functional/leaffunc_life.py
index 8921f8e3..78d438f1 100644
--- a/front/py/deepx/nn/functional/leaffunc_life.py
+++ b/front/py/deepx/nn/functional/leaffunc_life.py
@@ -1,24 +1,28 @@
 from deepx.tensor import Tensor
 from typing import Union
+
+def newtensor(shape:tuple[int,...],dtype:str='float32',name:str=None):
+    assert isinstance(shape,tuple)
+    for i in shape:
+        assert isinstance(i,int)
+    assert isinstance(dtype,str)
+    assert isinstance(name,str) or name is None
 
-def parse_shape(shape:Union[tuple,list])->tuple[int, ...]:
-    if len(shape) == 1 and isinstance(shape[0], (tuple, list)):
-        shape = shape[0]
-    return tuple(int(dim) for dim in shape)
-
-def newtensor(*shape,dtype:str='float32',name:str=None):
-    s=parse_shape(shape)
-    t=Tensor(shape=s,dtype=dtype,name=name)
+    t=Tensor(shape=shape,dtype=dtype,name=name)
     from .rtf_life import rtf_newtensor
     rtf_newtensor(t)
     return t
+
 def rnewtensor(t:Tensor):
     from .rtf_life import rtf_newtensor
     rtf_newtensor(t)
     return t
+
 def copytensor(t:Tensor,out:Tensor):
     from .rtf_life import rtf_copytensor
     rtf_copytensor(t,out)
+
+
 def deltensor(t:Tensor):
     from .rtf_life import rtf_deltensor
     rtf_deltensor(t)
diff --git a/front/py/deepx/nn/functional/leaffunc_matmul.py
b/front/py/deepx/nn/functional/leaffunc_matmul.py
index 8cad3127..acdcefd4 100644
--- a/front/py/deepx/nn/functional/leaffunc_matmul.py
+++ b/front/py/deepx/nn/functional/leaffunc_matmul.py
@@ -6,7 +6,7 @@
 def matmul(a:Tensor,b:Tensor,out:Union[Tensor,str]='',bench:tuple[int,int]=None)->Tensor:
     outtensor=out
-    if isinstance(out,str):
+    if isinstance(out,str) or out is None:
         outshape=Shape.matmul(a.shape,b.shape)
         outtensor=newtensor(outshape,dtype=a.dtype,name=out)
     from .rtf_matmul import rtf_matmul
diff --git a/front/py/deepx/nn/functional/normalization.py b/front/py/deepx/nn/functional/normalization.py
index 8a9d6030..69b37079 100644
--- a/front/py/deepx/nn/functional/normalization.py
+++ b/front/py/deepx/nn/functional/normalization.py
@@ -1,19 +1,18 @@
 from deepx import Tensor
 # 数学公式:softmax(x_i) = e^{x_i} / sum(e^{x_j})
-def softmax(t: Tensor,dim:int=-1)->Tensor:
-    
+def softmax(t: Tensor,dim:list[int]=[-1])->Tensor:
+    assert isinstance(dim,list)
+    # normalize axes without mutating the shared default list
+    dim=[d%t.ndim for d in dim]
     # 数值稳定性处理:减去最大值防止指数爆炸
     if dim is not None:
-        reducemax_t = t.reducemax(dim=[dim], keepdim=True)  # 保持维度用于广播
+        t_reducemax = t.reducemax(dim=tuple(dim), keepdim=True)  # 保持维度用于广播
     else:
-        reducemax_t = t.reducemax(keepdim=True)
-    t_subed=t.clone()
-    t_subed.sub_(reducemax_t)
+        t_reducemax= t.reducemax(keepdim=True)
+
+    t=t-t_reducemax
     
-    # 实现公式:exp(t_subed) / sum(exp(t_subed))
-    exp_t = t_subed.exp()
-    expt_sum=exp_t.sum(dim=[dim], keepdim=True)
-    # 处理输出张量(参考sigmoid的实现模式)
-    exp_t.div(expt_sum,out=t_subed)
-    return t_subed
\ No newline at end of file
+    t_exp = t.exp()
+    t_exp_sum=t_exp.sum(dim=tuple(dim), keepdim=True)
+    return t_exp/t_exp_sum
\ No newline at end of file
diff --git a/front/py/deepx/nn/functional/rtf_elementwise.py b/front/py/deepx/nn/functional/rtf_elementwise.py
index 3b7df4a6..dc48ff46 100644
--- a/front/py/deepx/nn/functional/rtf_elementwise.py
+++ b/front/py/deepx/nn/functional/rtf_elementwise.py
@@ -3,6 +3,7 @@
 from deepx.scheduler import send
 from .rtf
import A_B_op_C,A_scalar_op_C,A_op_C + def rtf_add(a:Tensor, b:Tensor, out:Tensor, author='miaobyte')->Tensor: A_B_op_C("add",a,b,out,author) return out @@ -107,4 +108,18 @@ def rtf_minscalar(a:Tensor, b:float, out:Tensor, author='miaobyte')->Tensor: def rtf_invert(a:Tensor, out:Tensor, author='miaobyte')->Tensor: A_op_C("invert",a,out,author) + return out + +def rtf_todtype(t:Tensor,dest:Tensor): + assert isinstance(t,Tensor) + assert isinstance(dest,Tensor) + assert t.shape==dest.shape + + args=[Param.tensor(t)] + returns=[Param.tensor(dest)] + ir=DeepxIR("todtype", args, returns,'') + send(ir) + +def rtf_dropout(a:Tensor, p:float, out:Tensor, author='miaobyte')->Tensor: + A_B_op_C("dropout",a,p,out,author) return out \ No newline at end of file diff --git a/front/py/deepx/nn/functional/rtf_life.py b/front/py/deepx/nn/functional/rtf_life.py index 21547b99..60a2371d 100644 --- a/front/py/deepx/nn/functional/rtf_life.py +++ b/front/py/deepx/nn/functional/rtf_life.py @@ -3,6 +3,7 @@ from deepx.scheduler import send def rtf_newtensor(t:Tensor): + assert isinstance(t,Tensor) args=[Param.vector(t.shape,'int32')] returns=[Param.tensor(t)] ir=DeepxIR("newtensor", args, returns,'') @@ -10,12 +11,20 @@ def rtf_newtensor(t:Tensor): def rtf_copytensor(t:Tensor,out:Tensor): + assert isinstance(t,Tensor) + assert isinstance(out,Tensor) + assert t.shape==out.shape + assert t.dtype==out.dtype + args=[Param.tensor(t)] returns=[Param.tensor(out)] ir=DeepxIR("copytensor", args, returns,'') send(ir) + + def rtf_deltensor(t:Tensor): + assert isinstance(t,Tensor) args=[] returns=[Param.tensor(t)] ir=DeepxIR("deltensor", args, returns,'') diff --git a/front/py/deepx/nn/modules/activation.py b/front/py/deepx/nn/modules/activation.py index 5f093f85..51b93f81 100644 --- a/front/py/deepx/nn/modules/activation.py +++ b/front/py/deepx/nn/modules/activation.py @@ -1,39 +1,14 @@ from typing import Union from deepx import Tensor,ones -import deepx.nn.functional as F from .module import Module 
-#copy from pytorch -class ReLU(Module): - __constants__ = ["inplace"] - inplace: bool - def __init__(self, inplace: bool = False): - super().__init__() - self.inplace = inplace - - def forward(self, input: Tensor) -> Tensor: - return F.relu(input, inplace=self.inplace) - - def extra_repr(self) -> str: - inplace_str = "inplace=True" if self.inplace else "" - return inplace_str - -class Sigmoid(Module): - def __init__(self): - super().__init__() - - def forward(self, input: Tensor) -> Tensor: - return F.sigmoid(input) - -class Swish(Module): +class Glu(Module): def __init__(self): super().__init__() + self.W = ones(shape=(1,1),name=self.full_name+"_W") + self.V = ones(shape=(1,1),name=self.full_name+"_V") - def forward(self, input: Tensor) -> Tensor: - return F.swish(input) - - class Swiglu(Module): def __init__(self): super().__init__() diff --git a/front/py/deepx/scheduler/client/udpconn.py b/front/py/deepx/scheduler/client/udpconn.py index a25b0963..6a12c26a 100644 --- a/front/py/deepx/scheduler/client/udpconn.py +++ b/front/py/deepx/scheduler/client/udpconn.py @@ -3,7 +3,7 @@ import select class UDPConn: - def __init__(self, endpoint: str = "localhost:8080"): + def __init__(self, endpoint: str = "localhost:9090"): # 解析endpoint self._host, port_str = endpoint.split(':') self._port = int(port_str) diff --git a/front/py/deepx/tensor/changeshape.py b/front/py/deepx/tensor/changeshape.py index 462fc9d3..e00a70d4 100644 --- a/front/py/deepx/tensor/changeshape.py +++ b/front/py/deepx/tensor/changeshape.py @@ -42,12 +42,6 @@ def transpose_(self): transpose_func(self,self) return self -@tensor_method -def broadcastshape(self,other:Tensor)->tuple[int,...]: - from deepx.nn.functional import broadcastshape as broadcastshape_func - result=broadcastshape_func(self.shape,other.shape) - return result - @tensor_method def broadcastTo(self,shape:tuple[int,...],out:Union[Tensor,str]='')->Tensor: from deepx.nn.functional import broadcastTo as broadcastTo_func @@ -62,7 +56,17 @@ def 
indexselect(self,index:Tensor,axis:int=0,out:Union[Tensor,str]='')->Tensor: result=indexselect_func(self,index,gatheraxis,out) return result +@tensor_method +def squeeze(self,dim:int)->Tensor: + from deepx.nn.functional import squeeze as squeeze_func + result=squeeze_func(self,dim) + return result +@tensor_method +def unsqueeze(self,dim:int)->Tensor: + from deepx.nn.functional import unsqueeze as unsqueeze_func + result=unsqueeze_func(self,dim) + return result # @tensor_method # def expand(self,shape:tuple)->Tensor: diff --git a/front/py/deepx/tensor/elementwise.py b/front/py/deepx/tensor/elementwise.py index 33ff1b97..9cd78e62 100644 --- a/front/py/deepx/tensor/elementwise.py +++ b/front/py/deepx/tensor/elementwise.py @@ -163,3 +163,8 @@ def invert(self,out:Union[Tensor,str]='')->Tensor: return invert_func(self,out) +@tensor_method +def dropout(self,p:float,out:Union[Tensor,str]=''): + from deepx.nn.functional import dropout as dropout_func + return dropout_func(self,p,out) + diff --git a/front/py/deepx/tensor/tensor.py b/front/py/deepx/tensor/tensor.py index 661bd4de..11144d7a 100644 --- a/front/py/deepx/tensor/tensor.py +++ b/front/py/deepx/tensor/tensor.py @@ -1,4 +1,7 @@ from typing import Optional,Union,TypeAlias + +from triton.language.semantic import equal + from .shape import Shape @@ -36,6 +39,8 @@ def __init__(self,shape:tuple[int,...],dtype:str='float32',name:str=None): raise ValueError("Invalid shape") def copy_to(self,t:'Tensor'): + assert isinstance(t,Tensor) + assert t.name != self._name from deepx.nn.functional import copytensor copytensor(self,t) @@ -44,7 +49,12 @@ def clone(self,name:str=None): t=newtensor(self.shape,dtype=self.dtype,name=name) copytensor(self,t) return t - + def to(self,dtype:str,name:str=None): + assert isinstance(dtype,str) and dtype != '' + from deepx.nn.functional import todtype as todtype_func,newtensor + dest=newtensor(self.shape,dtype=dtype,name=name) + todtype_func(self,dest) + return dest # name @property def 
name(self):
@@ -104,13 +114,17 @@ def dtype(self):
     #elementwise
     def __add__(self, other:Union[Number,'Tensor']):
         return self.add(other)
-    
+    def __radd__(self, other:Union[Number,'Tensor']):
+        return self.add(other)
     def __sub__(self, other:Union[Number,'Tensor']):
         return self.sub(other)
+    def __rsub__(self, other:Union[Number,'Tensor']):
+        return self.mul(-1).add(other)
     def __mul__(self, other:Union[Number,'Tensor']):
         return self.mul(other)
-    
+    def __rmul__(self, other:Union[Number,'Tensor']):
+        return self.mul(other)
     def __truediv__(self, other:Union[Number,'Tensor']):
         return self.div(other)
@@ -126,9 +140,10 @@ def __rpow__(self, other:Union[Number,'Tensor']):
     def __invert__(self):
         return self.invert()
     #矩阵乘法
-    def __matmul__(self, other:Union[Number,'Tensor']):
+    def __matmul__(self, other:'Tensor'):
         return self.matmul(other)
-    
+    def __rmatmul__(self, other:'Tensor'):
+        return other.matmul(self)
     #gather
     def __getitem__(self, index:'Tensor'):
         return self.indexselect(index)
diff --git a/front/py/deepx/transformer/encoder.py b/front/py/deepx/transformer/encoder.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/front/py/deepx/transformer/modeling_rope_utils.py b/front/py/deepx/transformer/modeling_rope_utils.py
index 0e6dd1ed..0554590c 100644
--- a/front/py/deepx/transformer/modeling_rope_utils.py
+++ b/front/py/deepx/transformer/modeling_rope_utils.py
@@ -2,282 +2,44 @@
 import math
 from deepx import arange,Tensor
 
-def _compute_default_rope_parameters(
-    base: float = 10000.0,
-    head_dim: int = 0,
-    partial_rotary_factor: float = 1.0,
-) -> Tuple[Tensor, float]:
-    attention_factor = 1.0  # 在这种类型的RoPE中未使用
-    dim = head_dim*partial_rotary_factor
+def _compute_default_rope_parameters(config:dict={
+    "base":10000.0,
+    "head_dim":0,
+    "partial_rotary_factor":1.0,
+}) -> Tuple[Tensor, float]:
+    dim = config["head_dim"]* config["partial_rotary_factor"]
     # 计算逆频率
-    inv_freq = 1.0 / (base ** (arange(0, dim, 2, dtype='float64')/ dim))
-    return inv_freq, attention_factor
-
-# def
_compute_linear_scaling_rope_parameters( -# config: Optional[PretrainedConfig] = None, -# device: Optional["torch.device"] = None, -# seq_len: Optional[int] = None, -# **rope_kwargs, -# ) -> Tuple["torch.Tensor", float]: -# """ -# Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev -# Args: -# config ([`~transformers.PretrainedConfig`]): -# The model configuration. -# device (`torch.device`): -# The device to use for initialization of the inverse frequencies. -# seq_len (`int`, *optional*): -# The current sequence length. Unused for this type of RoPE. -# rope_kwargs (`Dict`, *optional*): -# BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. -# Returns: -# Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the -# post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). -# """ -# if config is not None and len(rope_kwargs) > 0: -# raise ValueError( -# "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in " -# f"`_compute_linear_scaling_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}" -# ) -# if len(rope_kwargs) > 0: -# factor = rope_kwargs["factor"] -# elif config is not None: -# factor = config.rope_scaling["factor"] - -# # Gets the default RoPE parameters -# inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs) - -# # Then applies linear scaling to the frequencies. -# # NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`, so -# # applying scaling to the inverse frequencies is equivalent. 
-# inv_freq /= factor -# return inv_freq, attention_factor - - -# def _compute_dynamic_ntk_parameters( -# config: Optional[PretrainedConfig] = None, -# device: Optional["torch.device"] = None, -# seq_len: Optional[int] = None, -# **rope_kwargs, -# ) -> Tuple["torch.Tensor", float]: -# """ -# Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla -# Args: -# config ([`~transformers.PretrainedConfig`]): -# The model configuration. -# device (`torch.device`): -# The device to use for initialization of the inverse frequencies. -# seq_len (`int`, *optional*): -# The current sequence length, used to update the dynamic RoPE at inference time. -# rope_kwargs (`Dict`, *optional*): -# BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. -# Returns: -# Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the -# post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
-# """ -# # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling -# if config is not None and len(rope_kwargs) > 0: -# raise ValueError( -# "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in " -# f"`_compute_dynamic_ntk_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}" -# ) -# if len(rope_kwargs) > 0: -# base = rope_kwargs["base"] -# dim = rope_kwargs["dim"] -# max_position_embeddings = rope_kwargs["max_position_embeddings"] -# factor = rope_kwargs["factor"] -# elif config is not None: -# base = config.rope_theta -# partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 -# head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) -# dim = int(head_dim * partial_rotary_factor) -# max_position_embeddings = config.max_position_embeddings -# factor = config.rope_scaling["factor"] - -# attention_factor = 1.0 # Unused in this type of RoPE - -# # seq_len: default to max_position_embeddings, e.g. at init time -# seq_len = seq_len if seq_len is not None and seq_len > max_position_embeddings else max_position_embeddings - -# # Compute the inverse frequencies -# base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2)) -# inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim)) -# return inv_freq, attention_factor - - -# def _compute_yarn_parameters( -# config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs -# ) -> Tuple["torch.Tensor", float]: -# """ -# Computes the inverse frequencies with NTK scaling. Please refer to the -# [original paper](https://arxiv.org/abs/2309.00071) -# Args: -# config ([`~transformers.PretrainedConfig`]): -# The model configuration. -# device (`torch.device`): -# The device to use for initialization of the inverse frequencies. 
-# seq_len (`int`, *optional*): -# The current sequence length. Unused for this type of RoPE. -# rope_kwargs (`Dict`, *optional*): -# BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. -# Returns: -# Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the -# post-processing scaling factor applied to the computed cos/sin. -# """ -# # No need to keep BC with yarn, unreleased when this new pattern was created. -# if len(rope_kwargs) > 0: -# raise ValueError( -# f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_yarn_parameters`, got {rope_kwargs}" -# ) - -# base = config.rope_theta -# partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 -# head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) -# dim = int(head_dim * partial_rotary_factor) -# max_position_embeddings = config.max_position_embeddings -# factor = config.rope_scaling["factor"] - -# # Sets the attention factor as suggested in the paper -# attention_factor = config.rope_scaling.get("attention_factor") -# if attention_factor is None: -# attention_factor = 0.1 * math.log(factor) + 1.0 - -# # Optional config options -# # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly) -# beta_fast = config.rope_scaling.get("beta_fast") or 32 -# beta_slow = config.rope_scaling.get("beta_slow") or 1 - -# # Compute the inverse frequencies -# def find_correction_dim(num_rotations, dim, base, max_position_embeddings): -# """Inverse dimension formula to find the dimension based on the number of rotations""" -# return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base)) - -# def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings): -# """Find dimension range bounds based on rotations""" -# low = math.floor(find_correction_dim(low_rot, dim, base, 
max_position_embeddings)) -# high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings)) -# return max(low, 0), min(high, dim - 1) - -# def linear_ramp_factor(min, max, dim): -# if min == max: -# max += 0.001 # Prevent singularity - -# linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) -# ramp_func = torch.clamp(linear_func, 0, 1) -# return ramp_func - -# # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs -# # to expand the possible context length. In other words, interpolation = apply scaling factor. -# pos_freqs = base ** (torch.arange(0, dim, 2).float().to(device) / dim) -# inv_freq_extrapolation = 1.0 / pos_freqs -# inv_freq_interpolation = 1.0 / (factor * pos_freqs) - -# low, high = find_correction_range(beta_fast, beta_slow, dim, base, max_position_embeddings) - -# # Get n-dimensional rotational scaling corrected for extrapolation -# inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).float().to(device) -# inv_freq = ( -# inv_freq_interpolation * (1 - inv_freq_extrapolation_factor) -# + inv_freq_extrapolation * inv_freq_extrapolation_factor -# ) - -# return inv_freq, attention_factor - - -# def _compute_longrope_parameters( -# config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs -# ) -> Tuple["torch.Tensor", float]: -# """ -# Computes the inverse frequencies with LongRoPE scaling. Please refer to the -# [original implementation](https://github.com/microsoft/LongRoPE) -# Args: -# config ([`~transformers.PretrainedConfig`]): -# The model configuration. -# device (`torch.device`): -# The device to use for initialization of the inverse frequencies. -# seq_len (`int`, *optional*): -# The current sequence length. -# rope_kwargs (`Dict`, *optional*): -# BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. 
-# Returns: -# Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the -# post-processing scaling factor applied to the computed cos/sin. -# """ -# # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling -# # No need to keep BC with longrope, unreleased when this new pattern was created. -# if len(rope_kwargs) > 0: -# raise ValueError( -# "Unexpected arguments: `**rope_kwargs` should be unset in `_compute_longrope_parameters`, got " -# f"{rope_kwargs}" -# ) - -# base = config.rope_theta -# partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 -# head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) -# dim = int(head_dim * partial_rotary_factor) -# long_factor = config.rope_scaling["long_factor"] -# short_factor = config.rope_scaling["short_factor"] -# factor = config.rope_scaling.get("factor") -# attention_factor = config.rope_scaling.get("attention_factor") - -# # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a -# # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two -# # values to compute the default attention scaling factor, instead of using `factor`. 
-# if hasattr(config, "original_max_position_embeddings"): -# original_max_position_embeddings = config.original_max_position_embeddings -# factor = config.max_position_embeddings / config.original_max_position_embeddings -# else: -# original_max_position_embeddings = config.max_position_embeddings - -# # Sets the attention factor as suggested in the paper -# if attention_factor is None: -# if factor <= 1.0: -# attention_factor = 1.0 -# else: -# attention_factor = math.sqrt(1 + math.log(factor) / math.log(original_max_position_embeddings)) - -# # Compute the inverse frequencies -- scaled based on the target sequence length -# if seq_len and seq_len > original_max_position_embeddings: -# ext_factors = torch.tensor(long_factor, dtype=torch.float32, device=device) -# else: -# ext_factors = torch.tensor(short_factor, dtype=torch.float32, device=device) -# inv_freq_shape = torch.arange(0, dim, 2, dtype=torch.int64, device=device).float() / dim -# inv_freq = 1.0 / (ext_factors * base**inv_freq_shape) - -# return inv_freq, attention_factor - - -def _compute_llama3_parameters( base: float = 10000.0, - head_dim: int = 0, - partial_rotary_factor: float = 1.0, - factor:float=8, - low_freq_factor:float=1, - high_freq_factor:float=4, - old_context_len:int=8192, - seq_len: Optional[int] = None -) -> Tuple[Tensor, float]: + inv_freq = 1.0 / (config.base ** (arange(0, dim, 2, dtype='float64')/ dim)) + return inv_freq, 1.0 + +def _compute_llama3_parameters(config:dict={ + "base":10000.0, + "head_dim":0, + "partial_rotary_factor":1.0, + "factor":8, + "low_freq_factor":1, + "high_freq_factor":4, + "old_context_len":8192, + "seq_len":None +}) -> Tuple[Tensor, float]: # Gets the default RoPE parameters - inv_freq, attention_factor = _compute_default_rope_parameters(base, head_dim, partial_rotary_factor) + inv_freq, attention_factor = _compute_default_rope_parameters(config) - low_freq_wavelen = old_context_len / low_freq_factor - high_freq_wavelen = old_context_len / high_freq_factor + 
low_freq_wavelen = config.old_context_len / config.low_freq_factor + high_freq_wavelen = config.old_context_len / config.high_freq_factor wavelen = 2 * math.pi / inv_freq # wavelen < high_freq_wavelen: do nothing # wavelen > low_freq_wavelen: divide by factor - inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq) + inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / config.factor, inv_freq) # otherwise: interpolate between the two, using a smooth factor - smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) - smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama + smooth_factor = (config.old_context_len / wavelen - config.low_freq_factor) / (config.high_freq_factor - config.low_freq_factor) + smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / config.factor + smooth_factor * inv_freq_llama is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen) inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama) return inv_freq_llama, attention_factor - - -# This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters -# from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE -# parameterizations, as long as the callable has the same signature. 
+ ROPE_INIT_FUNCTIONS = { "default": _compute_default_rope_parameters, # "linear": _compute_linear_scaling_rope_parameters, diff --git a/front/py/deepx/transformer/models/llama/attention.py b/front/py/deepx/transformer/models/llama/attention.py new file mode 100644 index 00000000..edb98f48 --- /dev/null +++ b/front/py/deepx/transformer/models/llama/attention.py @@ -0,0 +1,103 @@ +from typing import Optional,Tuple +from deepx.nn.modules import Module,Linear +from deepx import Tensor,repeat_kv,matmul,softmax,concat,arange + + + +def rotate_half(x:Tensor): + index_front=arange(0,x.shape[-1]//2,dtype="int32") + index_back=arange(x.shape[-1]//2,x.shape[-1],dtype="int32") + x1 = x.index_select(dim=-1,index=index_front) + x2 = x.index_select(dim=-1,index=index_back) + return concat((-x2, x1), dim=-1) + +def apply_rotary_pos_emb(q:Tensor, k:Tensor, cos:Tensor, sin:Tensor, unsqueeze_dim:int=1): + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py +# 经简化,去掉了分布式配置,去掉attention的配置。交给IR自动替换flashattention,后续的组件自动处理 + +def eager_attention_forward( + module: Module, + query: Tensor, + key: Tensor, + value: Tensor, + attention_mask: Optional[Tensor], + scaling: float, + dropout: float = 0.0, +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = softmax(attn_weights, dim=-1, dtype=query.dtype) + attn_weights = dropout(attn_weights, p=dropout, training=module.training) + attn_output = matmul(attn_weights, value_states) + attn_output = 
attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + +class LlamaAttention(Module): + def __init__(self, config:dict, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.scaling = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + self.is_causal = True + + self.q_proj = Linear( + config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias + ) + self.k_proj = Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.v_proj = Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.o_proj = Linear( + config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias + ) + + def forward( + self, + hidden_states: Tensor, + position_embeddings: Tuple[Tensor, Tensor], + attention_mask: Optional[Tensor] + ) -> Tuple[Tensor, Optional[Tensor], Optional[Tuple[Tensor]]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1) + attn_output = self.o_proj(attn_output) + return 
attn_output, attn_weights \ No newline at end of file diff --git a/front/py/deepx/transformer/models/llama/embedding.py b/front/py/deepx/transformer/models/llama/embedding.py new file mode 100644 index 00000000..8bac5baa --- /dev/null +++ b/front/py/deepx/transformer/models/llama/embedding.py @@ -0,0 +1,62 @@ +from deepx.nn.modules import Module +from deepx import Tensor,concat +from deepx.transformer.modeling_rope_utils import ROPE_INIT_FUNCTIONS + +# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py +class LlamaRotaryEmbedding(Module): + def __init__(self,config:dict): + super().__init__() + # 最大序列长度 + self.max_seq_len_cached = config["max_position_embeddings"] + # 原始最大序列长度 + self.original_max_seq_len = config["max_position_embeddings"] + # 旋转类型 + self.rope_type=config["rope_scaling"]["type"] + # 旋转初始化函数 + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + # 旋转初始化函数 + inv_freq, self.attention_scaling = self.rope_init_fn(self.config) + # 注册缓存 + self.register_buffer("inv_freq", inv_freq, persistent=False) + # 原始旋转频率 + self.original_inv_freq = self.inv_freq + + # def _dynamic_frequency_update(self, position_ids, device): + # """ + # dynamic RoPE layers should recompute `inv_freq` in the following situations: + # 1 - growing beyond the cached sequence length (allow scaling) + # 2 - the current sequence length is in the original scale (avoid losing precision with small sequences) + # """ + # seq_len = torch.max(position_ids) + 1 + # if seq_len > self.max_seq_len_cached: # growth + # inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len) + # self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation + # self.max_seq_len_cached = seq_len + + # if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset + # # This .to() is needed if the model has been moved to a device after being initialized 
(because + # # the buffer is automatically moved, but not the original copy) + # self.original_inv_freq = self.original_inv_freq.to(device) + # self.register_buffer("inv_freq", self.original_inv_freq, persistent=False) + # self.max_seq_len_cached = self.original_max_seq_len + + def forward(self, x, position_ids): + # 扩展旋转频率 + inv_freq_expanded = self.inv_freq.unsqueeze(dim=0).unsqueeze(dim=2).float() + broadcast_shape=(position_ids.shape[0], self.inv_freq.shape[0], 1) + inv_freq_expanded = inv_freq_expanded.broadcast_to(broadcast_shape) + + # 使用torch.unsqueeze和type转换替代索引操作 + position_ids_expanded = position_ids.unsqueeze(dim=1).to(dtype=x.dtype) + # 计算频率 + freqs = (inv_freq_expanded @ position_ids_expanded).transpose(1, 2) + # 拼接频率 + emb = concat((freqs, freqs), dim=-1) + # 计算余弦和正弦 + cos = emb.cos() + sin = emb.sin() + # 应用缩放因子 + cos = cos * self.attention_scaling + sin = sin * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) diff --git a/front/py/deepx/transformer/models/llama/mlp.py b/front/py/deepx/transformer/models/llama/mlp.py new file mode 100644 index 00000000..a35ea3be --- /dev/null +++ b/front/py/deepx/transformer/models/llama/mlp.py @@ -0,0 +1,26 @@ +from deepx.nn.functional import swish as swish_fn +from deepx.nn.modules import Module,Linear + +ACT2FN={ + "silu":swish_fn, +} + +class LlamaMLP(Module): + def __init__(self, config:dict): + super().__init__() + # 输入层大小 + self.hidden_size = config.hidden_size + # 中间层大小 + self.intermediate_size = config["intermediate_size"] + #门控投影层 + self.gate_proj = Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + #上投影层 + self.up_proj = Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + #下投影层 + self.down_proj = Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) + #激活函数 + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return 
down_proj \ No newline at end of file diff --git a/front/py/deepx/transformer/models/llama/modeling_llama.py b/front/py/deepx/transformer/models/llama/modeling_llama.py index c3b07fd3..cc9cfc8b 100644 --- a/front/py/deepx/transformer/models/llama/modeling_llama.py +++ b/front/py/deepx/transformer/models/llama/modeling_llama.py @@ -1,102 +1,294 @@ -from deepx.nn.modules import Module,Linear -from deepx import Tensor,ones,rsqrt,concat +from typing import Optional,Tuple +from deepx.nn.modules import Module,Linear,Embedding +from deepx import Tensor from deepx.transformer.modeling_rope_utils import ROPE_INIT_FUNCTIONS -# RMSNorm -# copy from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py -# 数学公式 -class LlamaRMSNorm(Module): - def __init__(self, hidden_size, eps=1e-6): - """ - LlamaRMSNorm is equivalent to T5LayerNorm - """ +from deepx.transformer.models.llama.attention import LlamaAttention +from deepx.transformer.models.llama.mlp import LlamaMLP +from deepx.transformer.models.llama.normalization import LlamaRMSNorm +from deepx.transformer.models.llama.embedding import LlamaRotaryEmbedding + + +class LlamaDecoderLayer(Module): + def __init__(self, config:dict, layer_idx: int): super().__init__() - self.weight=ones(hidden_size) - self.register_parameter("weight",self.weight) - self.variance_epsilon = eps - def forward(self, hidden_states:Tensor): - variance = hidden_states.pow(2).mean((-1,), keepdim=True) - hidden_states = hidden_states * rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states + self.hidden_size = config.hidden_size + + self.self_attn = LlamaAttention(config=config, layer_idx=layer_idx) + + self.mlp = LlamaMLP(config) + self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: Tensor, + attention_mask: Optional[Tensor] = 
None, + position_ids: Optional[Tensor] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + position_embeddings: Optional[Tuple[Tensor, Tensor]] = None, # necessary, but kept here for BC + ) -> Tuple[Tensor, Optional[Tuple[Tensor, Tensor]]]: + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + use_cache=use_cache, + position_embeddings=position_embeddings, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + if output_attentions: + outputs += (self_attn_weights,) + + return outputs - def extra_repr(self): - return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" - +class LlamaModel(Module): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`LlamaDecoderLayer`] + + Args: + config: LlamaConfig + """ -class LlamaRotaryEmbedding(Module): - def __init__(self,rope_type:str="default",max_seq_len:int=1024,device=None): + def __init__(self, config:dict): super().__init__() - # 最大序列长度 - self.max_seq_len_cached = max_seq_len - # 原始最大序列长度 - self.original_max_seq_len = max_seq_len - # 旋转类型 - self.rope_type=rope_type - # 旋转初始化函数 - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - # 旋转初始化函数 - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - #TODO - # 注册缓存 - self.register_buffer("inv_freq", inv_freq, persistent=False) - # 原始旋转频率 - self.original_inv_freq = self.inv_freq - - # def _dynamic_frequency_update(self, position_ids, device): - # """ - # dynamic RoPE layers should recompute `inv_freq` in the following situations: - # 1 - growing beyond the cached sequence length (allow scaling) - # 2 - the current sequence length is in the original scale (avoid losing precision with small sequences) - # """ - # seq_len = torch.max(position_ids) + 1 - # if seq_len > self.max_seq_len_cached: # growth - # inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len) - # self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation - # self.max_seq_len_cached = seq_len - - # if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset - # # This .to() is needed if the model has been moved to a device after being initialized (because - # # the buffer is automatically moved, but not the original copy) - # self.original_inv_freq = self.original_inv_freq.to(device) - # self.register_buffer("inv_freq", self.original_inv_freq, persistent=False) - # self.max_seq_len_cached = self.original_max_seq_len - - def forward(self, x, position_ids): - # TODO - # if "dynamic" in self.rope_type: - # self._dynamic_frequency_update(position_ids, device=x.device) - - # Core RoPE block - 
inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) - position_ids_expanded = position_ids[:, None, :].float() + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = ModuleList( + [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = LlamaRotaryEmbedding(config=config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = concat((freqs, freqs), dim=-1) - cos = emb.cos() - sin = emb.sin() + def forward( + self, + input_ids: Tensor = None, + attention_mask: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict - # Advanced RoPE types (e.g. 
yarn) apply a post-processing scaling factor, equivalent to scaling attention - cos = cos * self.attention_scaling - sin = sin * self.attention_scaling + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False -class LlamaMLP(Module): - def __init__(self, config): - super().__init__() - self.config = config - # 输入层大小 - self.hidden_size = config.hidden_size - # 中间层大小 - self.intermediate_size = config.intermediate_size - #门控投影层 - self.gate_proj = Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) - #上投影层 - self.up_proj = Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) - #下投影层 - self.down_proj = Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) - #激活函数 - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - return down_proj \ No newline at end of file + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache() + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + + hidden_states = inputs_embeds + + # create position embeddings to be shared across the decoder layers + position_embeddings = 
self.rotary_emb(hidden_states, position_ids) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + output_attentions=output_attentions, + use_cache=use_cache, + position_embeddings=position_embeddings, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + output = BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values if use_cache else None, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + return output if return_dict else output.to_tuple() + + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and (attention_mask == 0.0).any(): + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. 
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. 
+ # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
+ causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask diff --git a/front/py/deepx/transformer/models/llama/normalization.py b/front/py/deepx/transformer/models/llama/normalization.py new file mode 100644 index 00000000..867f3db4 --- /dev/null +++ b/front/py/deepx/transformer/models/llama/normalization.py @@ -0,0 +1,22 @@ +from deepx.nn.modules import Module +from deepx import Tensor,ones,rsqrt +# RMSNorm +# copy from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py +# 数学公式 +class LlamaRMSNorm(Module): + def __init__(self, hidden_size:int, eps:float=1e-6): + """ + LlamaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight=ones((hidden_size,)) + self.register_parameter("weight",self.weight) + self.variance_epsilon = eps + def forward(self, hidden_states:Tensor): + variance = hidden_states.pow(2).mean((-1,), keepdim=True) + hidden_states = hidden_states * rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + \ No newline at end of file diff --git a/front/py/examples/1_tensor/1_copy.py 
b/front/py/examples/1_tensor/1_copy.py index 244d4bfa..96651626 100644 --- a/front/py/examples/1_tensor/1_copy.py +++ b/front/py/examples/1_tensor/1_copy.py @@ -2,8 +2,8 @@ def copytest(): from deepx.nn.functional import newtensor - t1= newtensor(1, 2, 3,name='t1') - t2= newtensor(1, 2, 3,name='t2') + t1= newtensor((1, 2, 3),name='t1') + t2= newtensor((1, 2, 3),name='t2') t1.print() t1.copy_to(t2) t2.print() diff --git a/front/py/examples/1_tensor/1_print.py b/front/py/examples/1_tensor/1_print.py index 7a5a205e..5072e1e0 100644 --- a/front/py/examples/1_tensor/1_print.py +++ b/front/py/examples/1_tensor/1_print.py @@ -6,7 +6,7 @@ def newtensor(): from deepx.nn.functional import newtensor - t=newtensor(1,2,3,name='t') + t=newtensor((1,2,3),name='t') t.print() if __name__ == "__main__": diff --git a/front/py/examples/2_ir/1_init_zeroones.py b/front/py/examples/2_ir/1_init_zeroones.py index ca788569..e286086e 100644 --- a/front/py/examples/2_ir/1_init_zeroones.py +++ b/front/py/examples/2_ir/1_init_zeroones.py @@ -23,15 +23,15 @@ import deepx print() -t1 = deepx.zeros([3,4,5],dtype='float32') -t2 = deepx.ones([3,4,5],dtype='float32') -t4=deepx.full([3,4,5],value=0.5) +t1 = deepx.zeros((3,4,5),dtype='float32') +t2 = deepx.ones((3,4,5),dtype='float32') +t4=deepx.full((3,4,5),value=0.5) t4.print() -t5=deepx.uniform(3,4,5,low=0,high=1) +t5=deepx.uniform((3,4,5),low=0,high=1) t5.print() -t6=deepx.kaiming_uniform(3,4,5,dtype='float32') +t6=deepx.kaiming_uniform((3,4,5),dtype='float32') t6.print() -t7=deepx.zeros(3,4,5,dtype='float32') +t7=deepx.zeros((3,4,5),dtype='float32') t7.normal_(mean=0,stddev=0.02) t7.print("%.6f") diff --git a/front/py/examples/2_ir/2_elementwise_add.py b/front/py/examples/2_ir/2_elementwise_add.py index dca1fa93..d0b016bf 100644 --- a/front/py/examples/2_ir/2_elementwise_add.py +++ b/front/py/examples/2_ir/2_elementwise_add.py @@ -14,7 +14,7 @@ print() -t1 = full(2,3,4, value=10,dtype="float32") +t1 = full((2,3,4), value=10,dtype="float32") t2 = 
t1.clone() t3 = t1+t2 t3.add_(0.5) diff --git a/front/py/examples/2_ir/2_elementwise_compare.py b/front/py/examples/2_ir/2_elementwise_compare.py index 7f010870..6c66670c 100644 --- a/front/py/examples/2_ir/2_elementwise_compare.py +++ b/front/py/examples/2_ir/2_elementwise_compare.py @@ -17,10 +17,10 @@ print() -t1 = full(2,3,4, value=10,dtype="int8") +t1 = full((2,3,4), value=10,dtype="int8") t2 = ~t1 t2.print() -t3 = full(2,3,4, value=2,dtype="int64") +t3 = full((2,3,4), value=2,dtype="int64") t4 = ~t3 t4.print() \ No newline at end of file diff --git a/front/py/examples/2_ir/2_elementwise_dropout.py b/front/py/examples/2_ir/2_elementwise_dropout.py new file mode 100644 index 00000000..6ac49a56 --- /dev/null +++ b/front/py/examples/2_ir/2_elementwise_dropout.py @@ -0,0 +1,21 @@ +############-------PyTorch-------################ + +print() +import torch +torch_t1 = torch.arange(24, dtype=torch.int32).reshape(2,3,4) +torch_t2 = torch_t1.dropout(p=0.5) +print(torch_t2) + + + + +############-------DEEPX-------################ + +from deepx import Tensor,arange + +print() + +t1 = arange(start=0,end=24 ,dtype="int32").reshape_(2,3,4) +t2 = t1.dropout(p=0.5) +t2.print() + \ No newline at end of file diff --git a/front/py/examples/2_ir/2_elementwise_sqrtlog.py b/front/py/examples/2_ir/2_elementwise_sqrtlog.py index 705219ed..ed5ca24a 100644 --- a/front/py/examples/2_ir/2_elementwise_sqrtlog.py +++ b/front/py/examples/2_ir/2_elementwise_sqrtlog.py @@ -1,5 +1,6 @@ ############-------PyTorch-------################ +print() import torch torch_t1 = torch.arange(3*4*5, dtype=torch.float32) diff --git a/front/py/examples/2_ir/3_matmul.py b/front/py/examples/2_ir/3_matmul.py index 205b27c0..413a2613 100644 --- a/front/py/examples/2_ir/3_matmul.py +++ b/front/py/examples/2_ir/3_matmul.py @@ -33,7 +33,7 @@ t1 = load(npy_path+'t1') t2 = load(npy_path+'t2') -t3= zeros(1024,1024,dtype='float32',name="t3") +t3= zeros((1024,1024),dtype='float32',name="t3") from deepx.nn.functional 
import defaultauthor defaultauthor['matmul']='miaobyte' # warmup diff --git a/front/py/examples/2_ir/4_changeshape_broadcast.py b/front/py/examples/2_ir/4_changeshape_broadcast.py index 84199680..15ceb34f 100644 --- a/front/py/examples/2_ir/4_changeshape_broadcast.py +++ b/front/py/examples/2_ir/4_changeshape_broadcast.py @@ -1,6 +1,6 @@ #######====PYTORCH======######## - +print() import torch a=torch.arange(4*2*3).reshape(4,2,3) b=torch.arange(2*1).reshape(2,1) diff --git a/front/py/examples/2_ir/4_changeshape_concat.py b/front/py/examples/2_ir/4_changeshape_concat.py index cda373fb..ada8e0b0 100644 --- a/front/py/examples/2_ir/4_changeshape_concat.py +++ b/front/py/examples/2_ir/4_changeshape_concat.py @@ -1,5 +1,6 @@ ############-------PyTorch-------################ +print() import torch torch_t1 = torch.ones(3, 4,5, dtype=torch.float32) torch_t2 = torch.ones(3, 4,5, dtype=torch.float32) @@ -13,7 +14,6 @@ from deepx import Tensor,zeros, ones, concat -print() t1 = ones([3,4,5],dtype='float32',name='t1') t2=ones([3,4,5],dtype='float32',name='t2') diff --git a/front/py/examples/2_ir/4_changeshape_gather.py b/front/py/examples/2_ir/4_changeshape_gather.py index df12e5f2..5a7b6be4 100644 --- a/front/py/examples/2_ir/4_changeshape_gather.py +++ b/front/py/examples/2_ir/4_changeshape_gather.py @@ -1,27 +1,26 @@ ############-------PyTorch-------################ -import numpy as np +import os print() -indices_np = np.array([[0, 1, 2], [0, 1, 2]]) - -print(indices_np) - +dir=os.path.expanduser('~/model/deepxmodel/functional/') import torch torch_t = torch.arange(10*5, dtype=torch.float32).reshape(10,5) -torch_indices = torch.tensor(indices_np) -torch_t2 = torch.index_select(torch_t, 1,torch_indices) +index=[0, 1, 2,0, 1, 2] +torch_index = torch.tensor(index,dtype=torch.int32) + +from deepxutil.torch import save_torch +save_torch(torch_index,dir+'gatherindex') + +torch_t2 = torch.index_select(torch_t, 1,torch_index) print(torch_t2.shape) print(torch_t2) 
############-------DEEPX-------################ -from deepx import Tensor,arange,Shape,load -from deepxutil.numpy import save_numpy - -save_numpy(indices_np,'/home/lipeng/model/deepxmodel/tester/testindices') +from deepx import arange ,load t = arange(start=0,end=10*5,dtype='float32',name='t').reshape_((10,5)) -indices = load('/home/lipeng/model/deepxmodel/tester/testindices') +indices = load(dir+'gatherindex') indices.print() t2 = t.indexselect(indices,axis=1) t2.print() diff --git a/front/py/examples/2_ir/4_changeshape_reshape.py b/front/py/examples/2_ir/4_changeshape_reshape.py index a8dde5f5..aec8153d 100644 --- a/front/py/examples/2_ir/4_changeshape_reshape.py +++ b/front/py/examples/2_ir/4_changeshape_reshape.py @@ -1,4 +1,4 @@ - +print() ############-------PyTorch-------################ import torch @@ -13,7 +13,7 @@ ############-------DEEPX-------################ from deepx import Tensor,zeros, ones, full, arange -print() + t1 = ones((3,4),dtype='float32',name='t1') t1.print() t2=t1.reshape((3,2,2)) diff --git a/front/py/examples/2_ir/5_reduce_prod.py b/front/py/examples/2_ir/5_reduce_prod.py index a6f0eb63..60676f51 100644 --- a/front/py/examples/2_ir/5_reduce_prod.py +++ b/front/py/examples/2_ir/5_reduce_prod.py @@ -3,15 +3,10 @@ import torch torch_t = torch.arange(0,60).reshape(3,4,5) print(torch_t) -torch_s = torch.sum(torch_t, dim=[0, 2]) -print(torch_s) -# torch_p=torch.prod(torch_t,dim=1) -# print(torch_p) -torch_t1 = torch.ones(4, 5, 6,dtype=torch.float) -print(torch_t1) -torch_t2 = torch.sum(torch_t1, dim=[0, 1]) -print(torch_t2) +torch_p=torch.prod(torch_t,dim=1) +print(torch_p) + ############-------DEEPX-------################ @@ -20,14 +15,7 @@ from deepx.nn.functional import sum,prod t=arange(0,60,name='t').reshape_((3,4,5)) - t.print() -s=sum(t,dim=(0,2),out="s") -s.print() -# p=prod(t,dim=(1,),out="p") -# p.print() - -t1=ones((4,5,6),name="t1") -t1.print() -t2=sum(t1,dim=(0,1),out='t2') -t2.print() + +p=prod(t,dim=(1,),out="p") +p.print() 
diff --git a/front/py/examples/2_ir/6_tensorlife_to.py b/front/py/examples/2_ir/6_tensorlife_to.py new file mode 100644 index 00000000..03f16276 --- /dev/null +++ b/front/py/examples/2_ir/6_tensorlife_to.py @@ -0,0 +1,20 @@ +############-------PyTorch-------################ + +print() +import torch +torch_t1 = torch.full((2,3,4, ), 10, dtype=torch.float32) +torch_t2 = torch_t1.to(dtype=torch.bfloat16) +print(torch_t2) +torch_t3 = torch_t2.to(dtype=torch.float32) +print(torch_t3) + +############-------DEEPX-------################ + +from deepx import full + + +t1 = full((2,3,4), value=10,dtype="float32") +t2 = t1.to(dtype="bfloat16") +t2.print() +t3 = t2.to(dtype="float32") +t3.print() \ No newline at end of file diff --git a/front/py/examples/3_functional/1_dropout.py b/front/py/examples/3_functional/1_dropout.py new file mode 100644 index 00000000..7aabaf7e --- /dev/null +++ b/front/py/examples/3_functional/1_dropout.py @@ -0,0 +1,9 @@ +############-------PyTorch-------################ +print() + +import torch +import torch.nn.functional as F +torch_t = torch.empty(10, 10).uniform_(-1, 1) +torch_relu_t = F.dropout(torch_t) +print(torch_t) +print(torch_relu_t) diff --git a/front/py/examples/3_functional/1_relu.py b/front/py/examples/3_functional/1_relu.py deleted file mode 100644 index 9cd1737e..00000000 --- a/front/py/examples/3_functional/1_relu.py +++ /dev/null @@ -1,27 +0,0 @@ -############-------PyTorch-------################ - -import torch -import torch.nn.functional as F -torch_t = torch.empty(10, 10).uniform_(-1, 1) -torch_relu_t = F.relu(torch_t) -print(torch_t) -print(torch_relu_t) - -############-------DEEPX-------################ - -from deepx import Tensor,ones -from deepx.nn.functional import relu,uniform - - -t=uniform(10,10,low=-1,high=1,name='t') - -print(t) -relu_t=relu(t) -print(relu_t) - -# 当tensor.name为str时,说明其是中间变量,执行inplace操作 -t2=uniform(10,10,low=-1,high=1) -t2.print() -relu_t2=relu(t2) -relu_t2.print() - diff --git 
a/front/py/examples/3_functional/1_rsqrt.py b/front/py/examples/3_functional/1_rsqrt.py deleted file mode 100644 index aa4926a6..00000000 --- a/front/py/examples/3_functional/1_rsqrt.py +++ /dev/null @@ -1,18 +0,0 @@ -############-------PyTorch-------################ - -import torch -import torch.nn.functional as F -torch_t = torch.arange(0, 24).reshape(2, 3, 4) -torch_rsqrt_t = torch.rsqrt(torch_t) -print(torch_t) -print(torch_rsqrt_t) - -############-------DEEPX-------################ - -from deepx import arange -from deepx.nn.functional import rsqrt - -t=arange(2,3,4,name='t') -t.print() -rsqrt_t=rsqrt(t) -rsqrt_t.print() diff --git a/front/py/examples/3_functional/1_swiglu.py b/front/py/examples/3_functional/1_swiglu.py deleted file mode 100644 index 74120526..00000000 --- a/front/py/examples/3_functional/1_swiglu.py +++ /dev/null @@ -1,34 +0,0 @@ -############-------PyTorch-------################ -import torch -import torch.nn.functional as F - -# 使用arange创建连续数据,确保最后一维是偶数以便分割 -x_torch = torch.arange(48, dtype=torch.float32).reshape(3, 4, 4) / 10.0 - 3.0 -print("PyTorch tensor:") -print(x_torch) - -# SwiGLU实现:将tensor在最后一维分成两半 -x1, x2 = torch.split(x_torch, x_torch.size(-1) // 2, dim=-1) -out_torch = F.silu(x1) * x2 # SwiGLU: swish(x1) * x2 -print("\nPyTorch swiglu result:") -print(out_torch) - -############-------DEEPX-------################ -from deepx import arange,swish,swiglu - -# 使用相同的初始化方式 -x = arange(0,48,1,name="x").reshape_(3,4,4) -x.div_(10.0) -x.sub_(3.0) - -print("\nDEEPX tensor:") -print(x) - -out = swiglu(x,out="out") -print("\nDEEPX swiglu result:") -print(out) - -import os -script_name = os.path.splitext(os.path.basename( os.path.abspath(__file__)))[0] # 获取不带后缀的脚本名 -str=out.graph.to_dot() -str.render(script_name+".dot", format='svg') \ No newline at end of file diff --git a/front/py/examples/3_functional/activite_relu.py b/front/py/examples/3_functional/activite_relu.py new file mode 100644 index 00000000..7a899c74 --- /dev/null +++ 
b/front/py/examples/3_functional/activite_relu.py @@ -0,0 +1,25 @@ +############-------PyTorch-------################ +print() + +import torch +import torch.nn.functional as F +torch_t = torch.empty(10, 10).uniform_(-1, 1) +torch_relu_t = F.relu(torch_t) +print(torch_t) +print(torch_relu_t) + +import os +dir=os.path.expanduser('~/model/deepxmodel/functional/') +from deepxutil.torch import save_torch +save_torch(torch_t,dir+'uniformed') + +############-------DEEPX-------################ + +from deepx import relu,load + + +t=load(dir+'uniformed') +t.print() +relu_t=relu(t) +relu_t.print() + diff --git a/front/py/examples/3_functional/1_sigmoid.py b/front/py/examples/3_functional/activite_sigmoid.py similarity index 70% rename from front/py/examples/3_functional/1_sigmoid.py rename to front/py/examples/3_functional/activite_sigmoid.py index dbdfd614..8859fe4b 100644 --- a/front/py/examples/3_functional/1_sigmoid.py +++ b/front/py/examples/3_functional/activite_sigmoid.py @@ -6,18 +6,21 @@ print("PyTorch tensor:") print(x_torch) +import os +dir=os.path.expanduser('~/model/deepxmodel/functional/') +from deepxutil.torch import save_torch +save_torch(x_torch,dir+'sigmoided') + out_torch = torch.sigmoid(x_torch) print("\nPyTorch sigmoid result:") print(out_torch) ############-------DEEPX-------################ -from deepx import Tensor,ones,zeros,arange +from deepx import Tensor,ones,zeros,arange,load from deepx import sigmoid # 使用相同的初始化方式 -x = arange(3,4,5,name="x") -x.div_(10.0) -x.sub_(3.0) +x = load(dir+'sigmoided') print("\nDEEPX tensor:") x.print() diff --git a/front/py/examples/3_functional/1_swish.py b/front/py/examples/3_functional/activite_swish.py similarity index 65% rename from front/py/examples/3_functional/1_swish.py rename to front/py/examples/3_functional/activite_swish.py index f4e8c7c3..ca9f431e 100644 --- a/front/py/examples/3_functional/1_swish.py +++ b/front/py/examples/3_functional/activite_swish.py @@ -1,23 +1,25 @@ 
############-------PyTorch-------################ import torch -import torch.nn.functional as F # 使用arange创建连续数据 x_torch = torch.arange(60, dtype=torch.float32).reshape(3, 4, 5) / 10.0 - 3.0 print("PyTorch tensor:") print(x_torch) -out_torch = F.silu(x_torch) # silu 就是 swish 激活函数 +import os +dir=os.path.expanduser('~/model/deepxmodel/functional/') +from deepxutil.torch import save_torch +save_torch(x_torch,dir+'swish') + +out_torch = torch.nn.functional.silu(x_torch) print("\nPyTorch swish result:") print(out_torch) ############-------DEEPX-------################ -from deepx import arange,swish +from deepx import load, swish # 使用相同的初始化方式 -x = arange(3,4,5,name="x") -x.div_(10.0) -x.sub_(3.0) +x = load(dir+'swish') print("\nDEEPX tensor:") x.print() diff --git a/front/py/examples/3_functional/elementwise_rsqrt.py b/front/py/examples/3_functional/elementwise_rsqrt.py new file mode 100644 index 00000000..cf50e40d --- /dev/null +++ b/front/py/examples/3_functional/elementwise_rsqrt.py @@ -0,0 +1,21 @@ +############-------PyTorch-------################ + +import torch +torch_t = torch.arange(0, 24,dtype=torch.float).reshape(2, 3, 4) +torch_rsqrt_t = torch.rsqrt(torch_t) +print(torch_t) +print(torch_rsqrt_t) + +import os +dir = os.path.expanduser('~/model/deepxmodel/functional/') +from deepxutil.torch import save_torch +save_torch(torch_t, dir + 'aranged') + +############-------DEEPX-------################ + +from deepx import rsqrt,load + +t=load(dir+'aranged') +t.print() +rsqrt_t=rsqrt(t) +rsqrt_t.print() diff --git a/front/py/examples/3_functional/1_softmax.py b/front/py/examples/3_functional/normalization_softmax.py similarity index 62% rename from front/py/examples/3_functional/1_softmax.py rename to front/py/examples/3_functional/normalization_softmax.py index f3b78b35..3758daa0 100644 --- a/front/py/examples/3_functional/1_softmax.py +++ b/front/py/examples/3_functional/normalization_softmax.py @@ -6,22 +6,24 @@ print("PyTorch tensor:") print(x_torch) -out_torch = 
torch.softmax(x_torch,-1) +out_torch = torch.softmax(x_torch,-2) print("\nPyTorch sigmoid result:") print(out_torch) +import os +dir = os.path.expanduser('~/model/deepxmodel/functional/') +from deepxutil.torch import save_torch +save_torch(x_torch, dir + 'forsoftmax') + ############-------DEEPX-------################ -from deepx import Tensor,ones,zeros,arange -from deepx import softmax +from deepx import softmax,load # 使用相同的初始化方式 -x = arange(3,4,5,name="x") -x.div_(10.0) -x.sub_(3.0) +x=load(dir+'forsoftmax') print("\nDEEPX tensor:") x.print() -out=softmax(x,-1) +out=softmax(x,[-2]) print("\nDEEPX sigmoid result:") out.print() diff --git a/front/py/examples/3_functional/1_mean.py b/front/py/examples/3_functional/reduce_mean.py similarity index 77% rename from front/py/examples/3_functional/1_mean.py rename to front/py/examples/3_functional/reduce_mean.py index 64511555..e6d6d368 100644 --- a/front/py/examples/3_functional/1_mean.py +++ b/front/py/examples/3_functional/reduce_mean.py @@ -1,5 +1,5 @@ ############-------PyTorch-------################ - +print() import torch torch_t3 = torch.arange(0, 120,dtype=torch.float).reshape(4, 5, 6) @@ -8,12 +8,9 @@ print(torch_t3_mean) ############-------DEEPX-------################ +from deepx.nn.functional import mean,arange -from deepx import arange -from deepx.nn.functional import mean - - -t3=arange(4,5,6,name="t3") +t3 = arange(0, 120).reshape((4, 5, 6)) t3.print() t3_mean=mean(t3,dim=(0,1)) diff --git a/front/py/examples/3_module/1_embedding.py b/front/py/examples/3_module/1_embedding.py index 3204fc00..ff5c1f17 100644 --- a/front/py/examples/3_module/1_embedding.py +++ b/front/py/examples/3_module/1_embedding.py @@ -28,13 +28,14 @@ def tokenize_text(text, tokenizer): torch_input = tokenize_text(text, tokenizer) from deepxutil.torch import save_torch save_torch(torch_input,dir+'input') +print(torch_input.shape) print(torch_input) # 创建网络 torch_net = nn.Embedding(tokenizer.vocab_size, 4096) 
save_torch(torch_net.weight,dir+'weight') # 前向传播 torch_output = torch_net(torch_input) -print() + print(torch_output.shape) print(torch_output) diff --git a/front/py/examples/3_module/1_swiglu.py b/front/py/examples/3_module/1_swiglu.py deleted file mode 100644 index 0e350582..00000000 --- a/front/py/examples/3_module/1_swiglu.py +++ /dev/null @@ -1,43 +0,0 @@ -hidden_size = 8 -eps = 1e-6 -dir='/home/lipeng/model/deepxmodel/llama/' - - - -############### PyTorch 实现部分 ############### -import torch -# 使用小规模数据以便打印完整结果 -pt_input = torch.arange(48, dtype=torch.float32).reshape(2, 3, hidden_size) / 10.0 - 2.0 -print("PyTorch 输入:") -print(pt_input) - -from transformers.models.llama.modeling_llama import LlamaRMSNorm as TransformersLlamaRMSNorm -from deepxutil.torch import save_torch -save_torch(pt_input,dir+'rmsnorm_input') -# 使用transformers库中的官方LlamaRMSNorm实现 -pt_norm = TransformersLlamaRMSNorm(hidden_size, eps=eps) -# 设置权重为固定值0.5 -with torch.no_grad(): - pt_norm.weight.fill_(0.5) -# 前向传播 -pt_output = pt_norm(pt_input) - - -print("\nPyTorch RMSNorm 结果:") -print(pt_output.shape) -print(pt_output) - - -############### DeepX 实现部分 ############### -from deepx import constant_,load -from deepx.transformer.models.llama.modeling_llama import LlamaRMSNorm - -input=load(dir+'rmsnorm_input') - -# DeepX计算流程 -norm = LlamaRMSNorm(hidden_size=hidden_size, eps=eps) -# 设置相同的权重 -constant_(norm.weight, 0.5) -# 前向计算 -output = norm(input) -output.print() diff --git a/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py b/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py index 0e350582..f0160a27 100644 --- a/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py +++ b/front/py/examples/4_transformer/llama/1_llamarmsnorm_torch.py @@ -30,7 +30,7 @@ ############### DeepX 实现部分 ############### from deepx import constant_,load -from deepx.transformer.models.llama.modeling_llama import LlamaRMSNorm +from deepx.transformer.models.llama.normalization import LlamaRMSNorm 
input=load(dir+'rmsnorm_input') diff --git a/todo/modeling_llama.py b/todo/modeling_llama.py deleted file mode 100644 index 8cbb1262..00000000 --- a/todo/modeling_llama.py +++ /dev/null @@ -1,1155 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from typing import Callable, List, Optional, Tuple, Union - -import torch -import torch.utils.checkpoint -from torch import nn - -from ...activations import ACT2FN -from ...cache_utils import Cache, DynamicCache, StaticCache -from ...generation import GenerationMixin -from ...modeling_attn_mask_utils import AttentionMaskConverter -from ...modeling_flash_attention_utils import FlashAttentionKwargs -from ...modeling_outputs import ( - BaseModelOutputWithPast, - CausalLMOutputWithPast, - QuestionAnsweringModelOutput, - SequenceClassifierOutputWithPast, - TokenClassifierOutput, -) -from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS -from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel -from ...processing_utils import Unpack -from ...pytorch_utils import ALL_LAYERNORM_LAYERS -from ...utils import ( - LossKwargs, - add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - logging, - replace_return_docstrings, -) -from .configuration_llama import LlamaConfig - - -logger = logging.get_logger(__name__) - -_CHECKPOINT_FOR_DOC = "meta-llama/Llama-2-7b-hf" -_CONFIG_FOR_DOC = "LlamaConfig" - - -class LlamaRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - LlamaRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - def extra_repr(self): - return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" - - -ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm) - - -class LlamaRotaryEmbedding(nn.Module): - def __init__(self, config: LlamaConfig, device=None): - super().__init__() - # BC: "rope_type" was 
originally "type" - if hasattr(config, "rope_scaling") and config.rope_scaling is not None: - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - def _dynamic_frequency_update(self, position_ids, device): - """ - dynamic RoPE layers should recompute `inv_freq` in the following situations: - 1 - growing beyond the cached sequence length (allow scaling) - 2 - the current sequence length is in the original scale (avoid losing precision with small sequences) - """ - seq_len = torch.max(position_ids) + 1 - if seq_len > self.max_seq_len_cached: # growth - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len) - self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation - self.max_seq_len_cached = seq_len - - if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset - # This .to() is needed if the model has been moved to a device after being initialized (because - # the buffer is automatically moved, but not the original copy) - self.original_inv_freq = self.original_inv_freq.to(device) - self.register_buffer("inv_freq", self.original_inv_freq, persistent=False) - self.max_seq_len_cached = self.original_max_seq_len - - @torch.no_grad() - def forward(self, x, position_ids): - if "dynamic" in self.rope_type: - self._dynamic_frequency_update(position_ids, device=x.device) - - # Core RoPE block - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) - 
position_ids_expanded = position_ids[:, None, :].float() - # Force float32 (see https://github.com/huggingface/transformers/pull/29285) - device_type = x.device.type - device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() - sin = emb.sin() - - # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention - cos = cos * self.attention_scaling - sin = sin * self.attention_scaling - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`, *optional*): - Deprecated and unused. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. 
Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. - """ - cos = cos.unsqueeze(unsqueeze_dim) - sin = sin.unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class LlamaMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - return down_proj - - -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -def eager_attention_forward( - module: nn.Module, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attention_mask: Optional[torch.Tensor], - scaling: float, - dropout: float = 0.0, - **kwargs, -): - key_states = repeat_kv(key, module.num_key_value_groups) - value_states = repeat_kv(value, module.num_key_value_groups) - - attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling - if attention_mask is not None: - causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] - attn_weights = attn_weights + causal_mask - - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) - attn_output = torch.matmul(attn_weights, value_states) - attn_output = attn_output.transpose(1, 2).contiguous() - - return attn_output, attn_weights - - -class LlamaAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config: LlamaConfig, layer_idx: int): - super().__init__() - self.config = config - self.layer_idx = layer_idx - self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) - self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads - self.scaling = self.head_dim**-0.5 - self.attention_dropout = config.attention_dropout - self.is_causal = True - - self.q_proj = nn.Linear( - config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias - ) - 
self.k_proj = nn.Linear( - config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias - ) - self.v_proj = nn.Linear( - config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias - ) - self.o_proj = nn.Linear( - config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias - ) - - def forward( - self, - hidden_states: torch.Tensor, - position_embeddings: Tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], - past_key_value: Optional[Cache] = None, - cache_position: Optional[torch.LongTensor] = None, - **kwargs: Unpack[FlashAttentionKwargs], - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - input_shape = hidden_states.shape[:-1] - hidden_shape = (*input_shape, -1, self.head_dim) - - query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) - key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) - value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) - - cos, sin = position_embeddings - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) - - if past_key_value is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - attention_interface: Callable = eager_attention_forward - if self.config._attn_implementation != "eager": - if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False): - logger.warning_once( - "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " - 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- ) - else: - attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] - - attn_output, attn_weights = attention_interface( - self, - query_states, - key_states, - value_states, - attention_mask, - dropout=0.0 if not self.training else self.attention_dropout, - scaling=self.scaling, - **kwargs, - ) - - attn_output = attn_output.reshape(*input_shape, -1).contiguous() - attn_output = self.o_proj(attn_output) - return attn_output, attn_weights - - -class LlamaDecoderLayer(nn.Module): - def __init__(self, config: LlamaConfig, layer_idx: int): - super().__init__() - self.hidden_size = config.hidden_size - - self.self_attn = LlamaAttention(config=config, layer_idx=layer_idx) - - self.mlp = LlamaMLP(config) - self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC - **kwargs: Unpack[FlashAttentionKwargs], - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - position_embeddings=position_embeddings, - **kwargs, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = 
hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - if output_attentions: - outputs += (self_attn_weights,) - - return outputs - - -LLAMA_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. - - Parameters: - config ([`LlamaConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
-""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaPreTrainedModel(PreTrainedModel): - config_class = LlamaConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["LlamaDecoderLayer"] - _skip_keys_device_placement = ["past_key_values"] - _supports_flash_attn_2 = True - _supports_sdpa = True - _supports_flex_attn = True - _supports_cache_class = True - _supports_quantized_cache = True - _supports_static_cache = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -LLAMA_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `input_ids` have to be input (see - `past_key_values`). 
- - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): - Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` - returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. - - Two formats are allowed: - - a [`~cache_utils.Cache`] instance, see our - [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); - - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of - shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy - cache format. - - The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the - legacy cache format will be returned. - - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. 
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): - Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, - this tensor is not affected by padding. It is used to update the cache in the correct position and to infer - the complete sequence length. -""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaModel(LlamaPreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`LlamaDecoderLayer`] - - Args: - config: LlamaConfig - """ - - def __init__(self, config: LlamaConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList( - [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] - ) - self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.rotary_emb = LlamaRotaryEmbedding(config=config) - self.gradient_checkpointing = False - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Cache] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - **flash_attn_kwargs: Unpack[FlashAttentionKwargs], - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if (input_ids is None) ^ (inputs_embeds is not None): - raise ValueError("You must specify exactly one of input_ids or inputs_embeds") - 
- if self.gradient_checkpointing and self.training and use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." - ) - use_cache = False - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if use_cache and past_key_values is None: - past_key_values = DynamicCache() - - if cache_position is None: - past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 - cache_position = torch.arange( - past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device - ) - - if position_ids is None: - position_ids = cache_position.unsqueeze(0) - - causal_mask = self._update_causal_mask( - attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions - ) - - hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - - for decoder_layer in self.layers[: self.config.num_hidden_layers]: - if output_hidden_states: - all_hidden_states += (hidden_states,) - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - causal_mask, - position_ids, - past_key_values, - output_attentions, - use_cache, - cache_position, - position_embeddings, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=causal_mask, - position_ids=position_ids, - past_key_value=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - position_embeddings=position_embeddings, - **flash_attn_kwargs, - ) - - hidden_states = layer_outputs[0] - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = 
self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - output = BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=past_key_values if use_cache else None, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - return output if return_dict else output.to_tuple() - - def _update_causal_mask( - self, - attention_mask: torch.Tensor, - input_tensor: torch.Tensor, - cache_position: torch.Tensor, - past_key_values: Cache, - output_attentions: bool, - ): - if self.config._attn_implementation == "flash_attention_2": - if attention_mask is not None and (attention_mask == 0.0).any(): - return attention_mask - return None - - # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in - # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail - # to infer the attention mask. - past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 - using_static_cache = isinstance(past_key_values, StaticCache) - - # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward - if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: - if AttentionMaskConverter._ignore_causal_mask_sdpa( - attention_mask, - inputs_embeds=input_tensor, - past_key_values_length=past_seen_tokens, - is_training=self.training, - ): - return None - - dtype, device = input_tensor.dtype, input_tensor.device - sequence_length = input_tensor.shape[1] - if using_static_cache: - target_length = past_key_values.get_max_cache_shape() - else: - target_length = ( - attention_mask.shape[-1] - if isinstance(attention_mask, torch.Tensor) - else past_seen_tokens + sequence_length + 1 - ) - - # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). 
- causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( - attention_mask, - sequence_length=sequence_length, - target_length=target_length, - dtype=dtype, - device=device, - cache_position=cache_position, - batch_size=input_tensor.shape[0], - ) - - if ( - self.config._attn_implementation == "sdpa" - and attention_mask is not None - and attention_mask.device.type == "cuda" - and not output_attentions - ): - # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when - # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. - # Details: https://github.com/pytorch/pytorch/issues/110213 - min_dtype = torch.finfo(dtype).min - causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) - - return causal_mask - - @staticmethod - def _prepare_4d_causal_attention_mask_with_cache_position( - attention_mask: torch.Tensor, - sequence_length: int, - target_length: int, - dtype: torch.dtype, - device: torch.device, - cache_position: torch.Tensor, - batch_size: int, - **kwargs, - ): - """ - Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape - `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. - - Args: - attention_mask (`torch.Tensor`): - A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape - `(batch_size, 1, query_length, key_value_length)`. - sequence_length (`int`): - The sequence length being processed. - target_length (`int`): - The target length: when generating with static cache, the mask should be as long as the static cache, - to account for the 0 padding, the part of the cache that is not filled yet. - dtype (`torch.dtype`): - The dtype to use for the 4D attention mask. - device (`torch.device`): - The device to plcae the 4D attention mask on. 
- cache_position (`torch.Tensor`): - Indices depicting the position of the input sequence tokens in the sequence. - batch_size (`torch.Tensor`): - Batch size. - """ - if attention_mask is not None and attention_mask.dim() == 4: - # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. - causal_mask = attention_mask - else: - min_dtype = torch.finfo(dtype).min - causal_mask = torch.full( - (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device - ) - if sequence_length != 1: - causal_mask = torch.triu(causal_mask, diagonal=1) - causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) - causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) - if attention_mask is not None: - causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit - mask_length = attention_mask.shape[-1] - padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] - padding_mask = padding_mask == 0 - causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( - padding_mask, min_dtype - ) - - return causal_mask - - -class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ... 
- - -class LlamaForCausalLM(LlamaPreTrainedModel, GenerationMixin): - _tied_weights_keys = ["lm_head.weight"] - _tp_plan = {"lm_head": "colwise_rep"} - - def __init__(self, config): - super().__init__(config) - self.model = LlamaModel(config) - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - cache_position: Optional[torch.LongTensor] = None, - num_logits_to_keep: int = 0, - **kwargs: Unpack[KwargsForCausalLM], - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). 
Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - num_logits_to_keep (`int`, *optional*): - Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all - `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that - token can save memory, which becomes pretty significant for long sequences or large vocabulary size. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, LlamaForCausalLM - - >>> model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") - >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
- ```""" - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - cache_position=cache_position, - **kwargs, - ) - - hidden_states = outputs[0] - # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) - - loss = None - if labels is not None: - loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - The LLaMa Model transformer with a sequence classification head on top (linear layer). - - [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. 
Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). - """, - LLAMA_START_DOCSTRING, -) -class LlamaForSequenceClassification(LlamaPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = LlamaModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility - sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 - sequence_lengths = sequence_lengths % input_ids.shape[-1] - sequence_lengths = sequence_lengths.to(logits.device) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) - - if not return_dict: - output = (pooled_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - -@add_start_docstrings( - """ -The Llama Model transformer with a span classification head on top for extractive question-answering tasks like -SQuAD (a linear layer on top of the hidden-states output to compute 
`span start logits` and `span end logits`). - """, - LLAMA_START_DOCSTRING, -) -class LlamaForQuestionAnswering(LlamaPreTrainedModel): - base_model_prefix = "transformer" - - # Copied from transformers.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->Llama - def __init__(self, config): - super().__init__(config) - self.transformer = LlamaModel(config) - self.qa_outputs = nn.Linear(config.hidden_size, 2) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.transformer.embed_tokens - - def set_input_embeddings(self, value): - self.transformer.embed_tokens = value - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - start_positions: Optional[torch.LongTensor] = None, - end_positions: Optional[torch.LongTensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - **kwargs, - ) -> Union[Tuple, QuestionAnsweringModelOutput]: - r""" - start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence - are not taken into account for computing the loss. - end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). 
Position outside of the sequence - are not taken into account for computing the loss. - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.transformer( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - sequence_output = outputs[0] - - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1).contiguous() - end_logits = end_logits.squeeze(-1).contiguous() - - loss = None - if start_positions is not None and end_positions is not None: - loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs) - - if not return_dict: - output = (start_logits, end_logits) + outputs[2:] - return ((loss,) + output) if loss is not None else output - - return QuestionAnsweringModelOutput( - loss=loss, - start_logits=start_logits, - end_logits=end_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """ - The Llama Model transformer with a token classification head on top (a linear layer on top of the hidden-states - output) e.g. for Named-Entity-Recognition (NER) tasks. 
- """, - LLAMA_START_DOCSTRING, -) -class LlamaForTokenClassification(LlamaPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = LlamaModel(config) - if getattr(config, "classifier_dropout", None) is not None: - classifier_dropout = config.classifier_dropout - elif getattr(config, "hidden_dropout", None) is not None: - classifier_dropout = config.hidden_dropout - else: - classifier_dropout = 0.1 - self.dropout = nn.Dropout(classifier_dropout) - self.score = nn.Linear(config.hidden_size, config.num_labels) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - @add_code_sample_docstrings( - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=TokenClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, TokenClassifierOutput]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = outputs[0] - sequence_output = self.dropout(sequence_output) - logits = self.score(sequence_output) - - loss = None - if labels is not None: - loss = self.loss_function(logits, labels, self.config) - - if not return_dict: - output = (logits,) + outputs[2:] - return ((loss,) + output) if loss is not None else output - - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -__all__ = [ - "LlamaForCausalLM", - "LlamaModel", - "LlamaPreTrainedModel", - "LlamaForSequenceClassification", - "LlamaForQuestionAnswering", - "LlamaForTokenClassification", -]