// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package access
import (
"bytes"
"context"
"fmt"
"io"
"net/http"
"runtime"
"sort"
"sync/atomic"
"time"
"github.com/hashicorp/consul/api"
"gopkg.in/natefinch/lumberjack.v2"
errcode "github.com/cubefs/cubefs/blobstore/common/errors"
"github.com/cubefs/cubefs/blobstore/common/proto"
"github.com/cubefs/cubefs/blobstore/common/resourcepool"
"github.com/cubefs/cubefs/blobstore/common/rpc"
"github.com/cubefs/cubefs/blobstore/common/trace"
"github.com/cubefs/cubefs/blobstore/util/defaulter"
"github.com/cubefs/cubefs/blobstore/util/log"
"github.com/cubefs/cubefs/blobstore/util/retry"
"github.com/cubefs/cubefs/blobstore/util/task"
)
const (
defaultMaxSizePutOnce int64 = 1 << 28 // 256MB
defaultMaxPartRetry int = 3
defaultMaxHostRetry int = 3
defaultPartConcurrence int = 4
defaultServiceInterval int = 3600 // one hour.
defaultServiceName = "access"
)
// RPCConnectMode self-defined rpc client connection config setting
type RPCConnectMode uint8
// timeout: [short - - - - - - - - -> long]
// quick --> general --> default --> slow --> nolimit
// speed: 40MB --> 20MB --> 10MB --> 4MB --> nolimit
const (
DefaultConnMode RPCConnectMode = iota
QuickConnMode
GeneralConnMode
SlowConnMode
NoLimitConnMode
)
func (mode RPCConnectMode) getConfig(speed float64, timeout, baseTimeout int64) rpc.Config {
getSpeed := func(defaultVal float64) float64 {
if speed > 0 {
return speed
}
return defaultVal
}
getBaseTimeout := func(defaultVal int64) int64 {
if baseTimeout > 0 {
return baseTimeout
}
return defaultVal
}
getTimeout := func(speed float64) int64 {
if timeout > 0 {
return timeout
}
return 5 * (1 << 30) * 1e3 / int64(speed*(1<<20))
}
config := rpc.Config{
// the whole request and response timeout
ClientTimeoutMs: getTimeout(getSpeed(10)),
BodyBandwidthMBPs: getSpeed(10),
BodyBaseTimeoutMs: getBaseTimeout(30 * 1000),
Tc: rpc.TransportConfig{
// dial timeout
DialTimeoutMs: 5 * 1000,
// response header timeout after sending the request
ResponseHeaderTimeoutMs: 5 * 1000,
// IdleConnTimeout is the maximum amount of time an idle
// (keep-alive) connection will remain idle before closing
// itself. Zero means no limit.
IdleConnTimeoutMs: 30 * 1000,
MaxIdleConns: 0,
MaxConnsPerHost: 2048,
MaxIdleConnsPerHost: 1024,
DisableCompression: true,
},
}
switch mode {
case QuickConnMode:
config.ClientTimeoutMs = getTimeout(getSpeed(40))
config.BodyBandwidthMBPs = getSpeed(40)
config.BodyBaseTimeoutMs = getBaseTimeout(3 * 1000)
config.Tc.DialTimeoutMs = 2 * 1000
config.Tc.ResponseHeaderTimeoutMs = 2 * 1000
config.Tc.IdleConnTimeoutMs = 10 * 1000
case GeneralConnMode:
config.ClientTimeoutMs = getTimeout(getSpeed(20))
config.BodyBandwidthMBPs = getSpeed(20)
config.BodyBaseTimeoutMs = getBaseTimeout(10 * 1000)
config.Tc.DialTimeoutMs = 3 * 1000
config.Tc.ResponseHeaderTimeoutMs = 3 * 1000
config.Tc.IdleConnTimeoutMs = 30 * 1000
case SlowConnMode:
config.ClientTimeoutMs = getTimeout(getSpeed(4))
config.BodyBandwidthMBPs = getSpeed(4)
config.BodyBaseTimeoutMs = getBaseTimeout(120 * 1000)
config.Tc.DialTimeoutMs = 10 * 1000
config.Tc.ResponseHeaderTimeoutMs = 10 * 1000
config.Tc.IdleConnTimeoutMs = 60 * 1000
case NoLimitConnMode:
config.ClientTimeoutMs = 0
config.BodyBandwidthMBPs = getSpeed(0)
config.BodyBaseTimeoutMs = getBaseTimeout(0)
config.Tc.DialTimeoutMs = 0
config.Tc.ResponseHeaderTimeoutMs = 0
config.Tc.IdleConnTimeoutMs = 600 * 1000
default:
}
return config
}
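// A worked instance of the timeout formula above (illustrative only, derived
// from getConfig): with QuickConnMode and no overrides the default speed is 40 MB/s, so
//
//	ClientTimeoutMs   = 5 * (1<<30) * 1e3 / (40 * (1<<20)) = 128000 // ms, i.e. a 5 GiB budget at 40 MB/s
//	BodyBandwidthMBPs = 40
//	BodyBaseTimeoutMs = 3000
//
// while DefaultConnMode with its default 10 MB/s resolves to ClientTimeoutMs = 512000.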
// Config access client config
type Config struct {
// ConnMode rpc connection timeout setting
ConnMode RPCConnectMode
// ClientTimeoutMs the whole request and response timeout
ClientTimeoutMs int64
// BodyBandwidthMBPs body bandwidth in MB/s, used to derive the body read
// timeout for both request and response:
// timeout = ContentLength/BodyBandwidthMBPs + BodyBaseTimeoutMs
BodyBandwidthMBPs float64
// BodyBaseTimeoutMs base timeout for read body
BodyBaseTimeoutMs int64
// Consul is consul config for discovering service
Consul ConsulConfig
// ServiceIntervalS is the interval in seconds for refreshing service discovery
ServiceIntervalS int
// PriorityAddrs preferred addresses of the access service, tried first when retrying
PriorityAddrs []string
// MaxSizePutOnce max size allowed for the single-shot put object interface
MaxSizePutOnce int64
// MaxPartRetry max retry times when putting one part, 0 means forever
MaxPartRetry int
// MaxHostRetry max number of access service hosts to retry
MaxHostRetry int
// PartConcurrence concurrence of put parts
PartConcurrence int
// rpc selector config
// FailRetryIntervalS failure retry interval in seconds, default -1;
// if FailRetryIntervalS < 0, removal of failed hosts is disabled.
FailRetryIntervalS int
// Within MaxFailsPeriodS, if the number of failures is greater than or equal to MaxFails,
// the host is considered disconnected.
MaxFailsPeriodS int
// HostTryTimes Number of host failure retries
HostTryTimes int
// RPCConfig user-defined rpc config
// All connections will use this config if it is not nil
// ConnMode is ignored when RPCConfig is set
RPCConfig *rpc.Config
// LogLevel client output logging level.
LogLevel log.Level
// Logger redirects all client logging to this logger if set.
// It is an io.WriteCloser that writes to the specified filename.
// YOU should CLOSE it once you no longer use the client.
Logger *Logger
}
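// An illustrative minimal Config (a sketch only; the consul address, priority
// address and log file path below are placeholders, not real endpoints):
//
//	cfg := access.Config{
//		ConnMode:        access.GeneralConnMode,
//		Consul:          access.ConsulConfig{Address: "127.0.0.1:8500"},
//		PriorityAddrs:   []string{"http://127.0.0.1:9500"},
//		PartConcurrence: 4,
//		Logger:          &access.Logger{Filename: "/tmp/access-client.log"},
//	}
//
// Unset fields fall back to the defaults applied in New.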
// ConsulConfig alias of consul api.Config
// Fixup: client and sdk use the same config type
type ConsulConfig = api.Config
// Logger alias of lumberjack Logger
// See more at: https://github.com/natefinch/lumberjack
type Logger = lumberjack.Logger
// client access rpc client
type client struct {
config Config
rpcClient atomic.Value
stop chan struct{}
}
// API access api for s3
// To trace the request id, prefer a ctx built with WithRequestID(ctx, rid).
type API interface {
// Put object once if size is not greater than MaxSizePutOnce, otherwise put blobs one by one.
// returns a location and a map of the hash sums you requested.
//
// If the PutArgs body is a *bytes.Buffer, *bytes.Reader, or *strings.Reader,
// GetBody is populated and the single-shot Put request can be retried.
Put(ctx context.Context, args *PutArgs) (location Location, hashSumMap HashSumMap, err error)
// Get object, range is supported.
Get(ctx context.Context, args *GetArgs) (body io.ReadCloser, err error)
// Delete all blobs in these locations.
// returns the locations that have not yet been deleted when error is not nil.
Delete(ctx context.Context, args *DeleteArgs) (failedLocations []Location, err error)
}
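// An illustrative end-to-end sketch of the API (error handling elided; cfg is
// a Config as above and the payload is a placeholder):
//
//	cli, err := access.New(cfg)
//	ctx := access.WithRequestID(context.Background(), "example-rid")
//
//	data := []byte("hello blobstore")
//	loc, hashes, err := cli.Put(ctx, &access.PutArgs{
//		Size:   int64(len(data)),
//		Hashes: access.HashAlgCRC32 | access.HashAlgMD5,
//		Body:   bytes.NewReader(data),
//	})
//	fmt.Println(hashes.All())
//
//	body, err := cli.Get(ctx, &access.GetArgs{Location: loc, ReadSize: loc.Size})
//	defer body.Close()
//
//	_, err = cli.Delete(ctx, &access.DeleteArgs{Locations: []access.Location{loc}})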
var _ API = (*client)(nil)
type noopBody struct{}
var _ io.ReadCloser = (*noopBody)(nil)
func (rc noopBody) Read(p []byte) (n int, err error) { return 0, io.EOF }
func (rc noopBody) Close() error { return nil }
var memPool *resourcepool.MemPool
func init() {
memPool = resourcepool.NewMemPool(map[int]int{
1 << 12: -1,
1 << 14: -1,
1 << 18: -1,
1 << 20: -1,
1 << 22: -1,
1 << 23: -1,
1 << 24: -1,
})
}
// New returns an access API
func New(cfg Config) (API, error) {
defaulter.LessOrEqual(&cfg.MaxSizePutOnce, defaultMaxSizePutOnce)
defaulter.Less(&cfg.MaxPartRetry, defaultMaxPartRetry)
defaulter.LessOrEqual(&cfg.MaxHostRetry, defaultMaxHostRetry)
defaulter.LessOrEqual(&cfg.PartConcurrence, defaultPartConcurrence)
if cfg.ServiceIntervalS < 300 { // at least 5 minutes
cfg.ServiceIntervalS = defaultServiceInterval
}
log.SetOutputLevel(cfg.LogLevel)
if cfg.Logger != nil {
log.SetOutput(cfg.Logger)
}
c := &client{
config: cfg,
stop: make(chan struct{}),
}
runtime.SetFinalizer(c, func(c *client) {
rpcClient, ok := c.rpcClient.Load().(rpc.Client)
if ok {
rpcClient.Close()
}
close(c.stop)
})
if cfg.Consul.Address == "" {
if len(cfg.PriorityAddrs) < 1 {
return nil, errcode.ErrAccessServiceDiscovery
}
c.rpcClient.Store(getClient(&cfg, cfg.PriorityAddrs))
return c, nil
}
consulConfig := cfg.Consul
consulClient, err := api.NewClient(&consulConfig)
if err != nil {
return nil, errcode.ErrAccessServiceDiscovery
}
first := true
serviceName := defaultServiceName
hostGetter := func() ([]string, error) {
if first && len(cfg.PriorityAddrs) > 0 {
hosts := make([]string, len(cfg.PriorityAddrs))
copy(hosts, cfg.PriorityAddrs[:])
first = false
return hosts, nil
}
services, _, err := consulClient.Health().Service(serviceName, "", true, nil)
if err != nil {
return nil, err
}
hosts := make([]string, 0, len(services))
for _, s := range services {
address := s.Service.Address
if address == "" {
address = s.Node.Address
}
hosts = append(hosts, fmt.Sprintf("http://%s:%d", address, s.Service.Port))
}
if len(hosts) == 0 {
return nil, fmt.Errorf("unavailable service")
}
return hosts, nil
}
hosts, err := hostGetter()
if err != nil {
log.Errorf("get hosts from consul failed: %v", err)
return nil, errcode.ErrAccessServiceDiscovery
}
c.rpcClient.Store(getClient(&cfg, hosts))
ticker := time.NewTicker(time.Duration(cfg.ServiceIntervalS) * time.Second)
go func() {
for {
old := hosts
select {
case <-ticker.C:
hosts, err = hostGetter()
if err != nil {
log.Warnf("update hosts from consul failed: %v", err)
continue
}
if isUpdated(old, hosts) {
oldClient, ok := c.rpcClient.Load().(rpc.Client)
if ok && oldClient != nil {
oldClient.Close()
}
c.rpcClient.Store(getClient(&cfg, hosts))
}
case <-c.stop:
ticker.Stop()
return
}
}
}()
return c, nil
}
func isUpdated(a, b []string) bool {
if len(a) != len(b) {
return true
}
sort.Slice(a, func(i, j int) bool { return a[i] < a[j] })
sort.Slice(b, func(i, j int) bool { return b[i] < b[j] })
for i := 0; i < len(a); i++ {
if a[i] != b[i] {
return true
}
}
return false
}
func getClient(cfg *Config, hosts []string) rpc.Client {
lbConfig := &rpc.LbConfig{
Hosts: hosts,
FailRetryIntervalS: cfg.FailRetryIntervalS,
MaxFailsPeriodS: cfg.MaxFailsPeriodS,
HostTryTimes: cfg.HostTryTimes,
RequestTryTimes: cfg.MaxHostRetry,
ShouldRetry: shouldRetry,
}
if cfg.RPCConfig == nil {
rpcConfig := cfg.ConnMode.getConfig(cfg.BodyBandwidthMBPs,
cfg.ClientTimeoutMs, cfg.BodyBaseTimeoutMs)
lbConfig.Config = rpcConfig
return rpc.NewLbClient(lbConfig, nil)
}
lbConfig.Config = *cfg.RPCConfig
return rpc.NewLbClient(lbConfig, nil)
}
func (c *client) Put(ctx context.Context, args *PutArgs) (location Location, hashSumMap HashSumMap, err error) {
if args.Size == 0 {
hashSumMap := args.Hashes.ToHashSumMap()
for alg := range hashSumMap {
hashSumMap[alg] = alg.ToHasher().Sum(nil)
}
return Location{Blobs: make([]SliceInfo, 0)}, hashSumMap, nil
}
ctx = withReqidContext(ctx)
if args.Size <= c.config.MaxSizePutOnce {
return c.putObject(ctx, args)
}
return c.putParts(ctx, args)
}
func (c *client) putObject(ctx context.Context, args *PutArgs) (location Location, hashSumMap HashSumMap, err error) {
rpcClient := c.rpcClient.Load().(rpc.Client)
urlStr := fmt.Sprintf("/put?size=%d&hashes=%d", args.Size, args.Hashes)
req, err := http.NewRequest(http.MethodPut, urlStr, args.Body)
if err != nil {
return
}
resp := &PutResp{}
if err = rpcClient.DoWith(ctx, req, resp, rpc.WithCrcEncode()); err == nil {
location = resp.Location
hashSumMap = resp.HashSumMap
}
return
}
type blobPart struct {
cid proto.ClusterID
vid proto.Vid
bid proto.BlobID
size int
token string
buf []byte
}
func (c *client) putPartsBatch(ctx context.Context, parts []blobPart) error {
rpcClient := c.rpcClient.Load().(rpc.Client)
tasks := make([]func() error, 0, len(parts))
for _, part := range parts {
part := part
tasks = append(tasks, func() error {
urlStr := fmt.Sprintf("/putat?clusterid=%d&volumeid=%d&blobid=%d&size=%d&hashes=%d&token=%s",
part.cid, part.vid, part.bid, part.size, 0, part.token)
req, err := http.NewRequest(http.MethodPut, urlStr, bytes.NewReader(part.buf))
if err != nil {
return err
}
resp := &PutAtResp{}
return rpcClient.DoWith(ctx, req, resp, rpc.WithCrcEncode())
})
}
if err := task.Run(context.Background(), tasks...); err != nil {
for _, part := range parts {
part := part
// asynchronously delete blob
go func() {
urlStr := fmt.Sprintf("/deleteblob?clusterid=%d&volumeid=%d&blobid=%d&size=%d&token=%s",
part.cid, part.vid, part.bid, part.size, part.token)
req, err := http.NewRequest(http.MethodDelete, urlStr, nil)
if err != nil {
return
}
rpcClient.DoWith(ctx, req, nil)
}()
}
return err
}
return nil
}
func (c *client) readerPipeline(span trace.Span, reqBody io.Reader,
closeCh <-chan struct{}, size, blobSize int) <-chan []byte {
ch := make(chan []byte, c.config.PartConcurrence-1)
go func() {
for size > 0 {
toread := blobSize
if toread > size {
toread = size
}
buf, _ := memPool.Alloc(toread)
buf = buf[:toread]
_, err := io.ReadFull(reqBody, buf)
if err != nil {
span.Error("read buffer from request", err)
memPool.Put(buf)
close(ch)
return
}
select {
case <-closeCh:
memPool.Put(buf)
close(ch)
return
case ch <- buf:
}
size -= toread
}
close(ch)
}()
return ch
}
func (c *client) putParts(ctx context.Context, args *PutArgs) (Location, HashSumMap, error) {
span := trace.SpanFromContextSafe(ctx)
rpcClient := c.rpcClient.Load().(rpc.Client)
hashSumMap := args.Hashes.ToHashSumMap()
hasherMap := make(HasherMap, len(hashSumMap))
for alg := range hashSumMap {
hasherMap[alg] = alg.ToHasher()
}
reqBody := args.Body
if len(hasherMap) > 0 {
reqBody = io.TeeReader(args.Body, hasherMap.ToWriter())
}
var (
loc Location
tokens []string
)
signArgs := SignArgs{}
success := false
defer func() {
if success {
return
}
locations := signArgs.Locations[:]
if len(locations) > 1 {
signArgs.Location = loc.Copy()
signResp := &SignResp{}
if err := rpcClient.PostWith(ctx, "/sign", signResp, signArgs); err == nil {
locations = []Location{signResp.Location.Copy()}
}
}
if len(locations) > 0 {
if _, err := c.Delete(ctx, &DeleteArgs{Locations: locations}); err != nil {
span.Warnf("clean location '%+v' failed %s", locations, err.Error())
}
}
}()
// alloc
allocResp := &AllocResp{}
if err := rpcClient.PostWith(ctx, "/alloc", allocResp, AllocArgs{Size: uint64(args.Size)}); err != nil {
return allocResp.Location, nil, err
}
loc = allocResp.Location
tokens = allocResp.Tokens
signArgs.Locations = append(signArgs.Locations, loc.Copy())
// buffer pipeline
closeCh := make(chan struct{})
bufferPipe := c.readerPipeline(span, reqBody, closeCh, int(loc.Size), int(loc.BlobSize))
defer func() {
close(closeCh)
// wait for the pipeline to close if an error occurred
for buf := range bufferPipe {
if len(buf) > 0 {
memPool.Put(buf)
}
}
}()
releaseBuffer := func(parts []blobPart) {
for _, part := range parts {
memPool.Put(part.buf)
}
}
currBlobIdx := 0
currBlobCount := uint32(0)
remainSize := loc.Size
restPartsLoc := loc
readSize := 0
for readSize < int(loc.Size) {
parts := make([]blobPart, 0, c.config.PartConcurrence)
// wait for at least one blob
buf, ok := <-bufferPipe
if !ok && readSize < int(loc.Size) {
return Location{}, nil, errcode.ErrAccessReadRequestBody
}
readSize += len(buf)
parts = append(parts, blobPart{size: len(buf), buf: buf})
more := true
for more && len(parts) < c.config.PartConcurrence {
select {
case buf, ok := <-bufferPipe:
if !ok {
if readSize < int(loc.Size) {
releaseBuffer(parts)
return Location{}, nil, errcode.ErrAccessReadRequestBody
}
more = false
} else {
readSize += len(buf)
parts = append(parts, blobPart{size: len(buf), buf: buf})
}
default:
more = false
}
}
tryTimes := c.config.MaxPartRetry
for {
if len(loc.Blobs) > MaxLocationBlobs {
releaseBuffer(parts)
return Location{}, nil, errcode.ErrUnexpected
}
// feed new params
currIdx := currBlobIdx
currCount := currBlobCount
for i := range parts {
token := tokens[currIdx]
if restPartsLoc.Size > uint64(loc.BlobSize) && parts[i].size < int(loc.BlobSize) {
token = tokens[currIdx+1]
}
parts[i].token = token
parts[i].cid = loc.ClusterID
parts[i].vid = loc.Blobs[currIdx].Vid
parts[i].bid = loc.Blobs[currIdx].MinBid + proto.BlobID(currCount)
currCount++
if loc.Blobs[currIdx].Count == currCount {
currIdx++
currCount = 0
}
}
err := c.putPartsBatch(ctx, parts)
if err == nil {
for _, part := range parts {
remainSize -= uint64(part.size)
currBlobCount++
// next blobs
if loc.Blobs[currBlobIdx].Count == currBlobCount {
currBlobIdx++
currBlobCount = 0
}
}
break
}
span.Warn("putat parts", err)
if tryTimes > 0 { // has retry setting
if tryTimes == 1 {
releaseBuffer(parts)
span.Error("exceed the max retry limit", c.config.MaxPartRetry)
return Location{}, nil, errcode.ErrUnexpected
}
tryTimes--
}
var restPartsResp *AllocResp
// alloc the rest parts
err = retry.Timed(3, 10).RuptOn(func() (bool, error) {
resp := &AllocResp{}
if err := rpcClient.PostWith(ctx, "/alloc", resp, AllocArgs{
Size: remainSize,
BlobSize: loc.BlobSize,
CodeMode: loc.CodeMode,
AssignClusterID: loc.ClusterID,
}); err != nil {
return true, err
}
if len(resp.Location.Blobs) > 0 {
if newVid := resp.Location.Blobs[0].Vid; newVid == loc.Blobs[currBlobIdx].Vid {
return false, fmt.Errorf("alloc the same vid %d", newVid)
}
}
restPartsResp = resp
return true, nil
})
if err != nil {
releaseBuffer(parts)
span.Error("alloc another parts to put", err)
return Location{}, nil, errcode.ErrUnexpected
}
restPartsLoc = restPartsResp.Location
signArgs.Locations = append(signArgs.Locations, restPartsLoc.Copy())
if currBlobCount > 0 {
loc.Blobs[currBlobIdx].Count = currBlobCount
currBlobIdx++
}
loc.Blobs = append(loc.Blobs[:currBlobIdx], restPartsLoc.Blobs...)
tokens = append(tokens[:currBlobIdx], restPartsResp.Tokens...)
currBlobCount = 0
}
releaseBuffer(parts)
}
if len(signArgs.Locations) > 1 {
signArgs.Location = loc.Copy()
// sign
signResp := &SignResp{}
if err := rpcClient.PostWith(ctx, "/sign", signResp, signArgs); err != nil {
span.Error("sign location with crc", err)
return Location{}, nil, errcode.ErrUnexpected
}
loc = signResp.Location
}
for alg, hasher := range hasherMap {
hashSumMap[alg] = hasher.Sum(nil)
}
success = true
return loc, hashSumMap, nil
}
func (c *client) Get(ctx context.Context, args *GetArgs) (body io.ReadCloser, err error) {
if !args.IsValid() {
return nil, errcode.ErrIllegalArguments
}
rpcClient := c.rpcClient.Load().(rpc.Client)
ctx = withReqidContext(ctx)
if args.Location.Size == 0 || args.ReadSize == 0 {
return noopBody{}, nil
}
resp, err := rpcClient.Post(ctx, "/get", args)
if err != nil {
return nil, err
}
if resp.StatusCode >= 400 {
return nil, rpc.NewError(resp.StatusCode, "StatusCode", fmt.Errorf("code: %d", resp.StatusCode))
}
return resp.Body, nil
}
func (c *client) Delete(ctx context.Context, args *DeleteArgs) ([]Location, error) {
if !args.IsValid() {
if args == nil {
return nil, errcode.ErrIllegalArguments
}
return args.Locations, errcode.ErrIllegalArguments
}
rpcClient := c.rpcClient.Load().(rpc.Client)
ctx = withReqidContext(ctx)
locations := make([]Location, 0, len(args.Locations))
for _, loc := range args.Locations {
if loc.Size > 0 {
locations = append(locations, loc.Copy())
}
}
if len(locations) == 0 {
return nil, nil
}
if err := retry.Timed(3, 10).On(func() error {
// access responds 2xx even if there are failed locations
deleteResp := &DeleteResp{}
if err := rpcClient.PostWith(ctx, "/delete", deleteResp,
DeleteArgs{Locations: locations}); err != nil && rpc.DetectStatusCode(err) != http.StatusIMUsed {
return err
}
if len(deleteResp.FailedLocations) > 0 {
locations = deleteResp.FailedLocations[:]
return errcode.ErrUnexpected
}
return nil
}); err != nil {
return locations, err
}
return nil, nil
}
func shouldRetry(code int, err error) bool {
if err != nil {
if httpErr, ok := err.(rpc.HTTPError); ok {
// 500 needs a retry on the next host
return httpErr.StatusCode() == http.StatusInternalServerError
}
return true
}
if code/100 != 4 && code/100 != 2 {
return true
}
return false
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package access
import (
"context"
"github.com/cubefs/cubefs/blobstore/common/trace"
)
type ctxKey uint8
const (
_operationName = "access_client"
)
const (
_ ctxKey = iota
reqidKey
)
// WithRequestID traces the request id through the full life of the request.
// The second parameter rid may be one of the types below:
// a string,
// an interface { String() string },
// an interface { TraceID() string },
// an interface { RequestID() string },
func WithRequestID(ctx context.Context, rid interface{}) context.Context {
return context.WithValue(ctx, reqidKey, rid)
}
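// An illustrative sketch (the request id value, the client cli and the
// location loc are placeholders):
//
//	ctx := access.WithRequestID(context.Background(), "req-2022-0001")
//	body, err := cli.Get(ctx, &access.GetArgs{Location: loc, ReadSize: loc.Size})
//
// Any of the accepted rid types behaves the same, e.g. a span that implements
// TraceID() string.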
func reqidFromContext(ctx context.Context) (string, bool) {
val := ctx.Value(reqidKey)
if val == nil {
return "", false
}
if rid, ok := val.(string); ok {
return rid, true
}
if rid, ok := val.(interface{ String() string }); ok {
return rid.String(), true
}
if rid, ok := val.(interface{ TraceID() string }); ok {
return rid.TraceID(), true
}
if rid, ok := val.(interface{ RequestID() string }); ok {
return rid.RequestID(), true
}
return "", false
}
func withReqidContext(ctx context.Context) context.Context {
if rid, ok := reqidFromContext(ctx); ok {
_, ctx := trace.StartSpanFromContextWithTraceID(ctx, _operationName, rid)
return ctx
}
if span := trace.SpanFromContext(ctx); span != nil {
return ctx
}
_, ctx = trace.StartSpanFromContext(ctx, _operationName)
return ctx
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package access
import (
"crypto/md5"
"crypto/sha1"
"crypto/sha256"
"encoding/base64"
"encoding/binary"
"encoding/hex"
"fmt"
"hash"
"hash/crc32"
"io"
"github.com/cubefs/cubefs/blobstore/common/codemode"
"github.com/cubefs/cubefs/blobstore/common/proto"
)
// HashAlgorithm hash.Hash algorithm when uploading data
type HashAlgorithm uint8
// defined hash algorithm
const (
HashAlgDummy HashAlgorithm = 1 << iota
HashAlgCRC32 // crc32 with IEEE
HashAlgMD5 // md5
HashAlgSHA1 // sha1
HashAlgSHA256 // sha256
)
const (
// HashSize dummy hash size
HashSize = 0
// MaxLocationBlobs max blobs length in Location
MaxLocationBlobs int = 4
// MaxDeleteLocations max locations of delete request
MaxDeleteLocations int = 1024
// MaxBlobSize max blob size for allocation
MaxBlobSize uint32 = 1 << 25 // 32MB
)
type dummyHash struct{}
var _ hash.Hash = (*dummyHash)(nil)
// implements hash.Hash
func (d dummyHash) Write(p []byte) (n int, err error) { return len(p), nil }
func (d dummyHash) Sum(b []byte) []byte { return []byte{} }
func (d dummyHash) Reset() { _ = struct{}{} }
func (d dummyHash) Size() int { return 0 }
func (d dummyHash) BlockSize() int { return 0 }
// ToHasher returns a new hash.Hash computing checksum
// the value of algorithm should be one of HashAlg*
func (alg HashAlgorithm) ToHasher() hash.Hash {
switch alg {
case HashAlgCRC32:
return crc32.NewIEEE()
case HashAlgMD5:
return md5.New()
case HashAlgSHA1:
return sha1.New()
case HashAlgSHA256:
return sha256.New()
default:
return dummyHash{}
}
}
// ToHashSumMap returns a new HashSumMap, decoded from the rpc url argument
func (alg HashAlgorithm) ToHashSumMap() HashSumMap {
h := make(HashSumMap)
for _, a := range []HashAlgorithm{
HashAlgDummy,
HashAlgCRC32,
HashAlgMD5,
HashAlgSHA1,
HashAlgSHA256,
} {
if alg&a == a {
h[a] = nil
}
}
return h
}
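// For example, following the bit definitions above:
//
//	alg := HashAlgCRC32 | HashAlgMD5 // 2 + 4 = 6
//	sums := alg.ToHashSumMap()       // keys HashAlgCRC32 and HashAlgMD5, values nil until filled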
// HasherMap map hasher of HashAlgorithm
type HasherMap map[HashAlgorithm]hash.Hash
// ToHashAlgorithm returns HashAlgorithm
func (h HasherMap) ToHashAlgorithm() HashAlgorithm {
alg := HashAlgorithm(0)
for k := range h {
alg |= k
}
return alg
}
// ToWriter returns an io.Writer that writes to every hasher
func (h HasherMap) ToWriter() io.Writer {
writers := make([]io.Writer, 0, len(h))
for _, hasher := range h {
writers = append(writers, hasher)
}
return io.MultiWriter(writers...)
}
// HashSumMap saves checksums in rpc calls
type HashSumMap map[HashAlgorithm][]byte
// GetSum get checksum value and ok via HashAlgorithm
//
// HashAlgDummy returns nil, bool
// HashAlgCRC32 returns uint32, bool
// HashAlgMD5 returns string(32), bool
// HashAlgSHA1 returns string(40), bool
// HashAlgSHA256 returns string(64), bool
func (h HashSumMap) GetSum(key HashAlgorithm) (interface{}, bool) {
b, ok := h[key]
if !ok {
return nil, false
}
switch key {
case HashAlgCRC32:
if len(b) != crc32.Size {
return nil, false
}
return uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24, true
case HashAlgMD5:
if len(b) != md5.Size {
return nil, false
}
return hex.EncodeToString(b[:]), true
case HashAlgSHA1:
if len(b) != sha1.Size {
return nil, false
}
return hex.EncodeToString(b[:]), true
case HashAlgSHA256:
if len(b) != sha256.Size {
return nil, false
}
return hex.EncodeToString(b[:]), true
default:
if len(b) != HashSize {
return nil, false
}
return nil, true
}
}
// GetSumVal get checksum only value via HashAlgorithm
func (h HashSumMap) GetSumVal(key HashAlgorithm) interface{} {
val, _ := h.GetSum(key)
return val
}
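// An illustrative read-out of a filled HashSumMap h, with the value types
// listed in the GetSum documentation above:
//
//	if crc, ok := h.GetSum(HashAlgCRC32); ok {
//		_ = crc.(uint32)
//	}
//	md5Hex, _ := h.GetSumVal(HashAlgMD5).(string) // 32 hex chars, or "" when absent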
// ToHashAlgorithm returns HashAlgorithm, encoded as the rpc url argument
func (h HashSumMap) ToHashAlgorithm() HashAlgorithm {
alg := HashAlgorithm(0)
for k := range h {
alg |= k
}
return alg
}
// All returns readable checksum
func (h HashSumMap) All() map[string]interface{} {
m := make(map[string]interface{})
for a, name := range map[HashAlgorithm]string{
HashAlgCRC32: "crc32",
HashAlgMD5: "md5",
HashAlgSHA1: "sha1",
HashAlgSHA256: "sha256",
} {
if val, ok := h.GetSum(a); ok {
m[name] = val
}
}
return m
}
// Location file location, 4 + 1 + 8 + 4 + 4 + len*16 bytes
//
// | ClusterID(4) | CodeMode(1) |
// | Size(8)                    |
// | BlobSize(4)  | Crc(4)      |
// | len*SliceInfo(16)          |
//
// ClusterID which cluster the file is in
// CodeMode is the ec encode mode, see definitions in "common/lib/codemode"
// Size is the file size
// BlobSize is the size of every blob except the last one, whose size = Size mod BlobSize (when the remainder is non-zero)
// Crc is the checksum; change anything in the location and the crc will mismatch
// Blobs all blob information
type Location struct {
_ [0]byte
ClusterID proto.ClusterID `json:"cluster_id"`
CodeMode codemode.CodeMode `json:"code_mode"`
Size uint64 `json:"size"`
BlobSize uint32 `json:"blob_size"`
Crc uint32 `json:"crc"`
Blobs []SliceInfo `json:"blobs"`
}
// SliceInfo blobs info, 8 + 4 + 4 bytes
//
// MinBid is the first blob id
// Vid is the volume all blobs are in
// Count is the number of consecutive blob ids; count=1 means just one blob
//
// blob ids = [MinBid, MinBid+count)
type SliceInfo struct {
_ [0]byte
MinBid proto.BlobID `json:"min_bid"`
Vid proto.Vid `json:"vid"`
Count uint32 `json:"count"`
}
// Blob is one piece of data in a location
//
// Bid is the blob id
// Vid is the volume the blob is in
// Size is the real size of the blob
type Blob struct {
Bid proto.BlobID
Vid proto.Vid
Size uint32
}
// Copy returns a deep copy of the Location
func (loc *Location) Copy() Location {
dst := Location{
ClusterID: loc.ClusterID,
CodeMode: loc.CodeMode,
Size: loc.Size,
BlobSize: loc.BlobSize,
Crc: loc.Crc,
Blobs: make([]SliceInfo, len(loc.Blobs)),
}
copy(dst.Blobs, loc.Blobs)
return dst
}
// Encode transfers the Location to a byte slice
// Returns a newly allocated buffer
//
// (n) means max-n bytes
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// | field | crc | clusterid | codemode | size | blobsize |
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// | n-bytes | 4 | uvarint(5) | 1 | uvarint(10) | uvarint(5) |
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// 25 + (5){len(blobs)} + len(Blobs) * 20
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// | blobs   | minbid | vid | count | each next blob |
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// | n-bytes | (10)   | (5) | (5)   | (20)           |
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
func (loc *Location) Encode() []byte {
if loc == nil {
return nil
}
n := 25 + 5 + len(loc.Blobs)*20
buf := make([]byte, n)
n = loc.Encode2(buf)
return buf[:n]
}
// Encode2 encodes the Location into buf; the buf is allocated and reused by the caller
// Returns the number of bytes written
// If the buffer is too small, Encode2 will panic
func (loc *Location) Encode2(buf []byte) int {
if loc == nil {
return 0
}
n := 0
binary.BigEndian.PutUint32(buf[n:], loc.Crc)
n += 4
n += binary.PutUvarint(buf[n:], uint64(loc.ClusterID))
buf[n] = byte(loc.CodeMode)
n++
n += binary.PutUvarint(buf[n:], uint64(loc.Size))
n += binary.PutUvarint(buf[n:], uint64(loc.BlobSize))
n += binary.PutUvarint(buf[n:], uint64(len(loc.Blobs)))
for _, blob := range loc.Blobs {
n += binary.PutUvarint(buf[n:], uint64(blob.MinBid))
n += binary.PutUvarint(buf[n:], uint64(blob.Vid))
n += binary.PutUvarint(buf[n:], uint64(blob.Count))
}
return n
}
// Decode parse location from buf
// Returns the number of bytes read
// Error is not nil when parsing failed
func (loc *Location) Decode(buf []byte) (int, error) {
if loc == nil {
return 0, fmt.Errorf("location receiver is nil")
}
location, n, err := DecodeLocation(buf)
if err != nil {
return n, err
}
*loc = location
return n, nil
}
// ToString transfer location to hex string
func (loc *Location) ToString() string {
return loc.HexString()
}
// HexString transfer location to hex string
func (loc *Location) HexString() string {
return hex.EncodeToString(loc.Encode())
}
// Base64String transfer location to base64 string
func (loc *Location) Base64String() string {
return base64.StdEncoding.EncodeToString(loc.Encode())
}
// Spread location blobs to slice
func (loc *Location) Spread() []Blob {
count := 0
for _, blob := range loc.Blobs {
count += int(blob.Count)
}
blobs := make([]Blob, 0, count)
for _, blob := range loc.Blobs {
for offset := uint32(0); offset < blob.Count; offset++ {
blobs = append(blobs, Blob{
Bid: blob.MinBid + proto.BlobID(offset),
Vid: blob.Vid,
Size: loc.BlobSize,
})
}
}
if len(blobs) > 0 && loc.BlobSize > 0 {
if lastSize := loc.Size % uint64(loc.BlobSize); lastSize > 0 {
blobs[len(blobs)-1].Size = uint32(lastSize)
}
}
return blobs
}
// DecodeLocation parse location from buf
// Returns Location and the number of bytes read
// Error is not nil when parsing failed
func DecodeLocation(buf []byte) (Location, int, error) {
var (
loc Location
n int
val uint64
nn int
)
next := func() (uint64, int) {
val, nn := binary.Uvarint(buf)
if nn <= 0 {
return 0, nn
}
n += nn
buf = buf[nn:]
return val, nn
}
if len(buf) < 4 {
return loc, n, fmt.Errorf("bytes crc %d", len(buf))
}
loc.Crc = binary.BigEndian.Uint32(buf)
n += 4
buf = buf[4:]
if val, nn = next(); nn <= 0 {
return loc, n, fmt.Errorf("bytes cluster_id %d", nn)
}
loc.ClusterID = proto.ClusterID(val)
if len(buf) < 1 {
return loc, n, fmt.Errorf("bytes codemode %d", len(buf))
}
loc.CodeMode = codemode.CodeMode(buf[0])
n++
buf = buf[1:]
if val, nn = next(); nn <= 0 {
return loc, n, fmt.Errorf("bytes size %d", nn)
}
loc.Size = val
if val, nn = next(); nn <= 0 {
return loc, n, fmt.Errorf("bytes blob_size %d", nn)
}
loc.BlobSize = uint32(val)
if val, nn = next(); nn <= 0 {
return loc, n, fmt.Errorf("bytes length blobs %d", nn)
}
length := int(val)
if length > 0 {
loc.Blobs = make([]SliceInfo, 0, length)
}
for index := 0; index < length; index++ {
var blob SliceInfo
if val, nn = next(); nn <= 0 {
return loc, n, fmt.Errorf("bytes %dth-blob min_bid %d", index, nn)
}
blob.MinBid = proto.BlobID(val)
if val, nn = next(); nn <= 0 {
return loc, n, fmt.Errorf("bytes %dth-blob vid %d", index, nn)
}
blob.Vid = proto.Vid(val)
if val, nn = next(); nn <= 0 {
return loc, n, fmt.Errorf("bytes %dth-blob count %d", index, nn)
}
blob.Count = uint32(val)
loc.Blobs = append(loc.Blobs, blob)
}
return loc, n, nil
}
// DecodeLocationFrom decode location from hex string
func DecodeLocationFrom(s string) (Location, error) {
return DecodeLocationFromHex(s)
}
// DecodeLocationFromHex decode location from hex string
func DecodeLocationFromHex(s string) (Location, error) {
var loc Location
src, err := hex.DecodeString(s)
if err != nil {
return loc, err
}
_, err = loc.Decode(src)
if err != nil {
return loc, err
}
return loc, nil
}
// DecodeLocationFromBase64 decode location from base64 string
func DecodeLocationFromBase64(s string) (Location, error) {
var loc Location
src, err := base64.StdEncoding.DecodeString(s)
if err != nil {
return loc, err
}
_, err = loc.Decode(src)
if err != nil {
return loc, err
}
return loc, nil
}
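// A round-trip sketch over the helpers above (loc is any valid Location):
//
//	buf := loc.Encode()                   // compact binary form
//	s := loc.HexString()                  // or loc.Base64String()
//	loc2, err := DecodeLocationFromHex(s) // loc2 should equal loc field by field when err is nil
//	loc3, n, err := DecodeLocation(buf)   // n is the number of bytes consumed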
// PutArgs for service /put
// Hashes says how to calculate the checksums,
// e.g. HashAlgCRC32 | HashAlgMD5 equals 2 + 4 = 6
type PutArgs struct {
Size int64 `json:"size"`
Hashes HashAlgorithm `json:"hashes,omitempty"`
Body io.Reader `json:"-"`
}
// IsValid reports whether the put args are valid
func (args *PutArgs) IsValid() bool {
if args == nil {
return false
}
return args.Size > 0
}
// PutResp put response result
type PutResp struct {
Location Location `json:"location"`
HashSumMap HashSumMap `json:"hashsum"`
}
// PutAtArgs for service /putat
type PutAtArgs struct {
ClusterID proto.ClusterID `json:"clusterid"`
Vid proto.Vid `json:"volumeid"`
BlobID proto.BlobID `json:"blobid"`
Size int64 `json:"size"`
Hashes HashAlgorithm `json:"hashes,omitempty"`
Token string `json:"token"`
Body io.Reader `json:"-"`
}
// IsValid reports whether the putat args are valid
func (args *PutAtArgs) IsValid() bool {
if args == nil {
return false
}
return args.ClusterID > proto.ClusterID(0) &&
args.Vid > proto.Vid(0) &&
args.BlobID > proto.BlobID(0) &&
args.Size > 0
}
// PutAtResp putat response result
type PutAtResp struct {
HashSumMap HashSumMap `json:"hashsum"`
}
// AllocArgs for service /alloc
type AllocArgs struct {
Size uint64 `json:"size"`
BlobSize uint32 `json:"blob_size"`
AssignClusterID proto.ClusterID `json:"assign_cluster_id"`
CodeMode codemode.CodeMode `json:"code_mode"`
}
// IsValid reports whether the alloc args are valid
func (args *AllocArgs) IsValid() bool {
if args == nil {
return false
}
if args.AssignClusterID > 0 {
return args.Size > 0 && args.BlobSize > 0 && args.BlobSize <= MaxBlobSize &&
args.CodeMode.IsValid()
}
return args.Size > 0 && args.BlobSize <= MaxBlobSize
}
// AllocResp alloc response result with tokens
// if size mod blobsize == 0, the number of tokens equals the number of location blobs,
// otherwise there is one additional token for uploading the last blob
type AllocResp struct {
Location Location `json:"location"`
Tokens []string `json:"tokens"`
}
// GetArgs for service /get
type GetArgs struct {
Location Location `json:"location"`
Offset uint64 `json:"offset"`
ReadSize uint64 `json:"read_size"`
}
// IsValid reports whether the get args are valid
func (args *GetArgs) IsValid() bool {
if args == nil {
return false
}
return args.Offset <= args.Location.Size &&
args.ReadSize <= args.Location.Size &&
args.Offset+args.ReadSize <= args.Location.Size
}
// DeleteArgs for service /delete
type DeleteArgs struct {
Locations []Location `json:"locations"`
}
// IsValid reports whether the delete args are valid
func (args *DeleteArgs) IsValid() bool {
if args == nil {
return false
}
return len(args.Locations) > 0 && len(args.Locations) <= MaxDeleteLocations
}
// DeleteResp delete response with failed locations
type DeleteResp struct {
FailedLocations []Location `json:"failed_locations,omitempty"`
}
// DeleteBlobArgs for service /deleteblob
type DeleteBlobArgs struct {
ClusterID proto.ClusterID `json:"clusterid"`
Vid proto.Vid `json:"volumeid"`
BlobID proto.BlobID `json:"blobid"`
Size int64 `json:"size"`
Token string `json:"token"`
}
// IsValid reports whether the delete blob args are valid
func (args *DeleteBlobArgs) IsValid() bool {
if args == nil {
return false
}
return args.ClusterID > proto.ClusterID(0) &&
args.Vid > proto.Vid(0) &&
args.BlobID > proto.BlobID(0) &&
args.Size > 0
}
// SignArgs for service /sign
// Locations are the signed locations obtained from /alloc
// Location is the merged location to be signed by the caller
type SignArgs struct {
Locations []Location `json:"locations"`
Location Location `json:"location"`
}
// IsValid reports whether the sign args are valid
func (args *SignArgs) IsValid() bool {
if args == nil {
return false
}
return len(args.Locations) > 0
}
// SignResp sign response location with crc
type SignResp struct {
Location Location `json:"location"`
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package codemode
import "fmt"
type (
// CodeMode EC encode and decode mode
CodeMode uint8
CodeModeName string
)
// pre-defined mode
const (
EC15P12 CodeMode = 1
EC6P6 CodeMode = 2
EC16P20L2 CodeMode = 3
EC6P10L2 CodeMode = 4
EC6P3L3 CodeMode = 5
EC6P6Align0 CodeMode = 6
EC6P6Align512 CodeMode = 7
EC4P4L2 CodeMode = 8
EC12P4 CodeMode = 9
EC16P4 CodeMode = 10
EC3P3 CodeMode = 11
EC10P4 CodeMode = 12
EC6P3 CodeMode = 13
EC12P9 CodeMode = 14
// for test
EC6P6L9 CodeMode = 200
EC6P8L10 CodeMode = 201
)
// Note: Don't modify it unless you know very well how codemode works.
const (
// align size per shard
alignSize0B = 0 // 0B
alignSize512B = 512 // 512B
alignSize2KB = 2048 // 2KB
)
// Each tactic is fixed and paired with one codemode.
// Add a new codemode if you want other features.
var constCodeModeTactic = map[CodeMode]Tactic{
// three az
EC15P12: {N: 15, M: 12, L: 0, AZCount: 3, PutQuorum: 24, GetQuorum: 0, MinShardSize: alignSize2KB},
EC6P6: {N: 6, M: 6, L: 0, AZCount: 3, PutQuorum: 11, GetQuorum: 0, MinShardSize: alignSize2KB},
EC12P9: {N: 12, M: 9, L: 0, AZCount: 3, PutQuorum: 20, GetQuorum: 0, MinShardSize: alignSize2KB},
// two az
EC16P20L2: {N: 16, M: 20, L: 2, AZCount: 2, PutQuorum: 34, GetQuorum: 0, MinShardSize: alignSize2KB},
EC6P10L2: {N: 6, M: 10, L: 2, AZCount: 2, PutQuorum: 14, GetQuorum: 0, MinShardSize: alignSize2KB},
// single az
EC12P4: {N: 12, M: 4, L: 0, AZCount: 1, PutQuorum: 15, GetQuorum: 0, MinShardSize: alignSize2KB},
EC16P4: {N: 16, M: 4, L: 0, AZCount: 1, PutQuorum: 19, GetQuorum: 0, MinShardSize: alignSize2KB},
EC3P3: {N: 3, M: 3, L: 0, AZCount: 1, PutQuorum: 5, GetQuorum: 0, MinShardSize: alignSize2KB},
EC10P4: {N: 10, M: 4, L: 0, AZCount: 1, PutQuorum: 13, GetQuorum: 0, MinShardSize: alignSize2KB},
EC6P3: {N: 6, M: 3, L: 0, AZCount: 1, PutQuorum: 8, GetQuorum: 0, MinShardSize: alignSize2KB},
// for env test
EC6P3L3: {N: 6, M: 3, L: 3, AZCount: 3, PutQuorum: 9, GetQuorum: 0, MinShardSize: alignSize2KB},
EC6P6Align0: {N: 6, M: 6, L: 0, AZCount: 3, PutQuorum: 11, GetQuorum: 0, MinShardSize: alignSize0B},
EC6P6Align512: {N: 6, M: 6, L: 0, AZCount: 3, PutQuorum: 11, GetQuorum: 0, MinShardSize: alignSize512B},
EC4P4L2: {N: 4, M: 4, L: 2, AZCount: 2, PutQuorum: 6, GetQuorum: 0, MinShardSize: alignSize2KB},
EC6P6L9: {N: 6, M: 6, L: 9, AZCount: 3, PutQuorum: 11, GetQuorum: 0, MinShardSize: alignSize2KB},
EC6P8L10: {N: 6, M: 8, L: 10, AZCount: 2, PutQuorum: 13, GetQuorum: 0, MinShardSize: alignSize0B},
}
var constName2CodeMode = map[CodeModeName]CodeMode{
"EC15P12": EC15P12,
"EC6P6": EC6P6,
"EC16P20L2": EC16P20L2,
"EC6P10L2": EC6P10L2,
"EC6P3L3": EC6P3L3,
"EC6P6Align0": EC6P6Align0,
"EC6P6Align512": EC6P6Align512,
"EC4P4L2": EC4P4L2,
"EC12P4": EC12P4,
"EC16P4": EC16P4,
"EC3P3": EC3P3,
"EC10P4": EC10P4,
"EC6P3": EC6P3,
"EC6P6L9": EC6P6L9,
"EC6P8L10": EC6P8L10,
"EC12P9": EC12P9,
}
var constCodeMode2Name = map[CodeMode]CodeModeName{
EC15P12: "EC15P12",
EC6P6: "EC6P6",
EC16P20L2: "EC16P20L2",
EC6P10L2: "EC6P10L2",
EC6P3L3: "EC6P3L3",
EC6P6Align0: "EC6P6Align0",
EC6P6Align512: "EC6P6Align512",
EC4P4L2: "EC4P4L2",
EC12P4: "EC12P4",
EC16P4: "EC16P4",
EC3P3: "EC3P3",
EC10P4: "EC10P4",
EC6P3: "EC6P3",
EC6P6L9: "EC6P6L9",
EC6P8L10: "EC6P8L10",
EC12P9: "EC12P9",
}
// vol layout, e.g. EC6P10L2
// |----N------|--------M----------------|--L--|
// |0,1,2,3,4,5|6,7,8,9,10,11,12,13,14,15|16,17|
// global stripe:[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15], n=6 m=10
// two local stripes:
// local stripe1:[0,1,2, 6, 7, 8, 9,10, 16] n=8 m=1
// local stripe2:[3,4,5, 11,12,13,14,15, 17] n=8 m=1
// Tactic constant strategy of one CodeMode
type Tactic struct {
N int
M int
// local parity count
L int
// the count of AZs; access uses this to split data shards and parity shards
AZCount int
// PutQuorum write quorum,
// MUST make sure that the ec data is recoverable if one AZ goes down
// Local shards SHOULD be ignored here
// (N + M) / AZCount + N <= PutQuorum <= M + N
PutQuorum int
// get quorum config
GetQuorum int
// MinShardSize min size per shard; data is filled into shards 0-N continuously,
// padded with zero bytes if the data size is less than MinShardSize*N
//
// if the length of data is less than MinShardSize*N, the size of each shard = MinShardSize
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// | data | align zero bytes |
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// | 0 | 1 | 2 | .... | N |
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
//
// if the length of data is more than MinShardSize*N, the size of each shard = ceil(len(data)/N)
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// | data |padding|
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// | 0 | 1 | 2 | .... | N |
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
MinShardSize int
}
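// A worked instance of the PutQuorum bound above, using the EC6P6 tactic from
// constCodeModeTactic (N=6, M=6, AZCount=3):
//
//	(N + M) / AZCount + N = (6+6)/3 + 6 = 10 <= PutQuorum(11) <= N + M = 12
//
// which is exactly the range asserted in init below.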
func init() {
// assert all codemode
for _, pair := range []struct {
Mode CodeMode
Size int
}{
{Mode: EC15P12, Size: alignSize2KB},
{Mode: EC6P6, Size: alignSize2KB},
{Mode: EC12P9, Size: alignSize2KB},
{Mode: EC16P20L2, Size: alignSize2KB},
{Mode: EC6P10L2, Size: alignSize2KB},
{Mode: EC6P3L3, Size: alignSize2KB},
{Mode: EC6P6Align0, Size: alignSize0B},
{Mode: EC6P6Align512, Size: alignSize512B},
} {
tactic := pair.Mode.Tactic()
if !tactic.IsValid() {
panic(fmt.Sprintf("Invalid codemode:%d Tactic:%+v", pair.Mode, tactic))
}
min := tactic.N + (tactic.N+tactic.M)/tactic.AZCount
max := tactic.N + tactic.M
if tactic.PutQuorum < min || tactic.PutQuorum > max {
panic(fmt.Sprintf("Invalid codemode:%d PutQuorum:%d([%d,%d])", pair.Mode,
tactic.PutQuorum, min, max))
}
if tactic.MinShardSize != pair.Size {
panic(fmt.Sprintf("Invalid codemode:%d MinShardSize:%d(%d)", pair.Mode,
tactic.MinShardSize, pair.Size))
}
}
}
// T returns a pointer to a copy of the Tactic, used like:
// EC6P6.T().AllLocalStripe()
func (c CodeMode) T() *Tactic {
tactic := c.Tactic()
return &tactic
}
// Tactic returns its constant tactic
func (c CodeMode) Tactic() Tactic {
if tactic, ok := constCodeModeTactic[c]; ok {
return tactic
}
panic(fmt.Sprintf("Invalid codemode:%d", c))
}
// GetShardNum returns the total number of shards.
func (c CodeMode) GetShardNum() int {
tactic := c.Tactic()
return tactic.L + tactic.M + tactic.N
}
// Name turns the CodeMode into its CodeModeName
func (c CodeMode) Name() CodeModeName {
if name, ok := constCodeMode2Name[c]; ok {
return name
}
panic(fmt.Sprintf("codemode: %d is invalid", c))
}
// String turns the CodeMode into a string
func (c CodeMode) String() string {
if name, ok := constCodeMode2Name[c]; ok {
return string(name)
}
return ""
}
// IsValid checks whether the CodeMode is valid
func (c CodeMode) IsValid() bool {
if _, ok := constCodeMode2Name[c]; ok {
return ok
}
return false
}
// GetCodeMode gets the code mode by name
func (cn CodeModeName) GetCodeMode() CodeMode {
if code, ok := constName2CodeMode[cn]; ok {
return code
}
panic(fmt.Sprintf("codemode: %s is invalid", cn))
}
// IsValid checks whether the CodeModeName is valid
func (cn CodeModeName) IsValid() bool {
if _, ok := constName2CodeMode[cn]; ok {
return ok
}
return false
}
// Tactic gets the tactic by code mode name
func (cn CodeModeName) Tactic() Tactic {
return cn.GetCodeMode().Tactic()
}
// IsValid reports whether the ec tactic is valid
func (c *Tactic) IsValid() bool {
return c.N > 0 && c.M > 0 && c.L >= 0 && c.AZCount > 0 &&
c.PutQuorum > 0 && c.GetQuorum >= 0 && c.MinShardSize >= 0 &&
c.N%c.AZCount == 0 && c.M%c.AZCount == 0 && c.L%c.AZCount == 0
}
// GetECLayoutByAZ ec layout by AZ
func (c *Tactic) GetECLayoutByAZ() (azStripes [][]int) {
azStripes = make([][]int, c.AZCount)
n, m, l := c.N/c.AZCount, c.M/c.AZCount, c.L/c.AZCount
for idx := range azStripes {
stripe := make([]int, 0, n+m+l)
for i := 0; i < n; i++ {
stripe = append(stripe, idx*n+i)
}
for i := 0; i < m; i++ {
stripe = append(stripe, c.N+idx*m+i)
}
for i := 0; i < l; i++ {
stripe = append(stripe, c.N+c.M+idx*l+i)
}
azStripes[idx] = stripe
}
return azStripes
}
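// For example, EC6P10L2 (N=6, M=10, L=2, AZCount=2) produces the two local
// stripes drawn in the layout comment above:
//
//	EC6P10L2.T().GetECLayoutByAZ()
//	// => [[0 1 2 6 7 8 9 10 16] [3 4 5 11 12 13 14 15 17]]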
// GlobalStripe returns the initial global stripe indexes with n and m
func (c *Tactic) GlobalStripe() (indexes []int, n, m int) {
indexes = make([]int, c.N+c.M)
for i := 0; i < c.N+c.M; i++ {
indexes[i] = i
}
return indexes, c.N, c.M
}
// AllLocalStripe returns all local stripes
func (c *Tactic) AllLocalStripe() (stripes [][]int, n, m int) {
if c.L == 0 {
return
}
n, m, l := c.N/c.AZCount, c.M/c.AZCount, c.L/c.AZCount
return c.GetECLayoutByAZ(), n + m, l
}
// LocalStripe gets the local stripe containing the shard at index
func (c *Tactic) LocalStripe(index int) (localStripe []int, n, m int) {
if c.L == 0 {
return nil, 0, 0
}
n, m, l := c.N/c.AZCount, c.M/c.AZCount, c.L/c.AZCount
var azIdx int
if index < c.N {
azIdx = index / n
} else if index < c.N+c.M {
azIdx = (index - c.N) / m
} else if index < c.N+c.M+c.L {
azIdx = (index - c.N - c.M) / l
} else {
return nil, 0, 0
}
return c.LocalStripeInAZ(azIdx)
}
// LocalStripeInAZ gets the local stripe of the az at azIndex
func (c *Tactic) LocalStripeInAZ(azIndex int) (localStripe []int, n, m int) {
if c.L == 0 {
return nil, 0, 0
}
n, m, l := c.N/c.AZCount, c.M/c.AZCount, c.L/c.AZCount
azStripes := c.GetECLayoutByAZ()
if azIndex < 0 || azIndex >= len(azStripes) {
return nil, 0, 0
}
return azStripes[azIndex][:], n + m, l
}
// GetAllCodeModes gets all the available CodeModes
func GetAllCodeModes() []CodeMode {
return []CodeMode{
EC15P12,
EC6P6,
EC16P20L2,
EC6P10L2,
EC6P3L3,
EC6P6Align0,
EC6P6Align512,
EC4P4L2,
EC12P4,
EC16P4,
EC3P3,
EC10P4,
EC6P3,
EC6P6L9,
EC6P8L10,
}
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package crc32block
import (
"encoding/binary"
"hash/crc32"
)
const (
defaultCrc32BlockSize = 64 * 1024
)
var gBlockSize int64 = defaultCrc32BlockSize
type blockUnit []byte
func (b blockUnit) length() int {
return len(b)
}
func (b blockUnit) payload() int {
return len(b) - crc32Len
}
func (b blockUnit) check() (err error) {
payloadCrc := crc32.ChecksumIEEE(b[crc32Len:])
if binary.LittleEndian.Uint32(b) != payloadCrc {
return ErrMismatchedCrc
}
return nil
}
func (b blockUnit) writeCrc() {
payloadCrc := crc32.ChecksumIEEE(b[crc32Len:])
binary.LittleEndian.PutUint32(b, payloadCrc)
}
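// An illustrative use of the block helpers above (payload is a placeholder
// byte slice; a block is crc(4) followed by the payload):
//
//	b := make(blockUnit, crc32Len+len(payload))
//	copy(b[crc32Len:], payload)
//	b.writeCrc()     // stamps the little-endian IEEE crc32 of the payload
//	err := b.check() // nil as long as the block is untouched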
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package crc32block
import (
"bufio"
"io"
"io/ioutil"
)
/*
journal record:
|-----block-----|------block-----|---block---|
block:
|--crc--|------payload -----|
*/
type Decoder struct {
from io.ReaderAt // read from
off int64 // offset. readonly
limit int64 // size limit, readonly
bufSize int64 // for speed
block blockUnit // block buffer
}
type decoderReader struct {
reader io.Reader //
block []byte //
i, j int // block[i:j]
err error //
}
type rangeReader struct {
r io.Reader
limit int64
skip int64
skiped bool
}
type blockReader struct {
reader io.Reader //
block blockUnit //
i, j int // block[i:j] is unread portion of the current block's payload.
remain int64 //
err error //
}
func (br *blockReader) Read(p []byte) (n int, err error) {
if br.err != nil {
return 0, br.err
}
for br.i == br.j {
if br.remain == 0 {
return n, io.EOF
}
br.err = br.nextBlock()
if br.err != nil {
return 0, br.err
}
}
n = copy(p, br.block[br.i:br.j])
br.i += n
br.remain -= int64(n)
return n, nil
}
func (br *blockReader) nextBlock() (err error) {
blockLen := int64(len(br.block))
blockPayloadLen := int64(blockLen - crc32Len)
want := blockLen
if br.remain < blockPayloadLen {
want = br.remain + crc32Len
}
_, err = io.ReadFull(br.reader, br.block[:want])
if err != nil {
br.err = err
return br.err
}
if err = blockUnit(br.block[:want]).check(); err != nil {
return err
}
br.i = crc32Len
br.j = int(want)
return nil
}
func (r *rangeReader) Read(p []byte) (n int, err error) {
if !r.skiped {
_, err := io.CopyN(ioutil.Discard, r.r, r.skip)
if err != nil {
return 0, err
}
r.skiped = true
r.r = io.LimitReader(r.r, r.limit)
}
return r.r.Read(p)
}
func (dec *Decoder) Reader(from, to int64) (r io.Reader, err error) {
blockLen := int64(dec.block.length())
blockPayloadLen := int64(dec.block.payload())
blockOff := (from / blockPayloadLen) * blockLen
encodedSize := EncodeSize(dec.limit, blockLen) - blockOff
// raw reader
r = io.NewSectionReader(dec.from, dec.off+blockOff, encodedSize)
// buffer
r = bufio.NewReaderSize(r, int(dec.bufSize))
// decode reader
r = NewBlockReader(r, DecodeSize(encodedSize, blockLen), dec.block)
// range reader
r = &rangeReader{
r: r,
limit: to - from,
skip: from % blockPayloadLen,
}
return r, nil
}
func (r *decoderReader) Read(b []byte) (n int, err error) {
if r.err != nil {
return 0, r.err
}
for len(b) > 0 {
if r.i == r.j {
if r.err = r.nextBlock(); r.err != nil {
if n > 0 {
return n, nil
}
return n, r.err
}
}
readn := copy(b, r.block[r.i:r.j])
r.i += readn
b = b[readn:]
n += readn
}
return
}
func (r *decoderReader) nextBlock() (err error) {
n, err := readFullOrToEnd(r.reader, r.block)
if err != nil {
return
}
if n <= crc32Len {
return ErrMismatchedCrc
}
if err = blockUnit(r.block[:n]).check(); err != nil {
return ErrMismatchedCrc
}
r.i, r.j = crc32Len, n
return nil
}
func NewBlockReader(r io.Reader, limit int64, block []byte) *blockReader {
if block == nil || !isValidBlockLen(int64(len(block))) {
panic(ErrInvalidBlock)
}
return &blockReader{reader: r, remain: limit, block: block}
}
// NewDecoderReader returns io.Reader
//
// Deprecated: no buffer reuse; use NewBodyDecoder instead.
func NewDecoderReader(in io.Reader) io.Reader {
chunk := make([]byte, defaultCrc32BlockSize)
return &decoderReader{block: chunk, err: nil, reader: in}
}
func NewDecoderWithBlock(r io.ReaderAt, off int64, size int64, block []byte, bufferSize int64) (dec *Decoder, err error) {
if block == nil || !isValidBlockLen(int64(len(block))) {
return nil, ErrInvalidBlock
}
return &Decoder{from: r, off: off, block: block, limit: size, bufSize: bufferSize}, nil
}
func NewDecoder(r io.ReaderAt, off int64, size int64) (dec *Decoder, err error) {
block := make([]byte, defaultCrc32BlockSize)
return NewDecoderWithBlock(r, off, size, block, defaultCrc32BlockSize)
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package crc32block
import (
"io"
)
type ReaderError struct {
error
}
type WriterError struct {
error
}
type Encoder struct {
block blockUnit // block buffer
}
type limitEncoderReader struct {
reader io.Reader
block blockUnit
remain int64
i, j int
err error
}
type encoderReader struct {
reader io.Reader //
block blockUnit //
i, j int // block[i:j]
err error //
}
func (enc *Encoder) Encode(from io.Reader, limitSize int64, to io.Writer) (n int64, err error) {
if !isValidBlockLen(int64(enc.block.length())) {
panic(ErrInvalidBlock)
}
encSize := EncodeSize(limitSize, int64(enc.block.length()))
reader := &limitEncoderReader{reader: from, block: enc.block, remain: limitSize}
return io.CopyN(to, reader, encSize)
}
func (r *limitEncoderReader) Read(b []byte) (n int, err error) {
if r.err != nil {
return 0, r.err
}
for len(b) > 0 {
if r.i == r.j {
if r.remain == 0 {
return n, io.EOF
}
if r.err = r.nextBlock(); r.err != nil {
if n > 0 {
return n, nil
}
return n, r.err
}
}
readn := copy(b, r.block[r.i:r.j])
r.i += readn
b = b[readn:]
n += readn
}
return
}
func (r *limitEncoderReader) nextBlock() (err error) {
blockPayloadLen := r.block.payload()
needn := blockPayloadLen
if r.remain < int64(blockPayloadLen) {
needn = int(r.remain)
}
block := blockUnit(r.block[:crc32Len+needn])
n, err := io.ReadFull(r.reader, block[crc32Len:])
if err != nil {
return ReaderError{err}
}
r.i = 0
r.j = crc32Len + n
blockUnit(r.block[r.i:r.j]).writeCrc()
r.remain -= int64(block.payload())
return nil
}
func (r *encoderReader) Read(b []byte) (n int, err error) {
if r.err != nil {
return 0, r.err
}
for len(b) > 0 {
if r.i == r.j {
if r.err = r.nextBlock(); r.err != nil {
if n > 0 {
return n, nil
}
return n, r.err
}
}
readn := copy(b, r.block[r.i:r.j])
r.i += readn
b = b[readn:]
n += readn
}
return
}
func (r *encoderReader) nextBlock() (err error) {
n, err := readFullOrToEnd(r.reader, r.block[crc32Len:])
if err != nil {
return err
}
r.i = 0
r.j = crc32Len + n
blockUnit(r.block[r.i:r.j]).writeCrc()
return nil
}
func NewEncoder(block []byte) (enc *Encoder, err error) {
if block != nil && !isValidBlockLen(int64(len(block))) {
return nil, ErrInvalidBlock
}
if block == nil {
block = make([]byte, defaultCrc32BlockSize)
}
return &Encoder{block: block}, nil
}
// NewEncoderReader returns io.Reader
//
// Deprecated: no buffer reuse; use NewBodyEncoder instead.
func NewEncoderReader(r io.Reader) io.Reader {
block := make([]byte, defaultCrc32BlockSize)
return &encoderReader{block: block, reader: r}
}
func NewLimitEncoderReader(r io.Reader, limitSize int64) (enc *limitEncoderReader) {
block := make([]byte, defaultCrc32BlockSize)
enc = &limitEncoderReader{reader: r, block: block, remain: limitSize}
return
}
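// A round-trip sketch with the Encoder here and the Decoder defined earlier in
// this package (data is a placeholder byte slice, error handling elided):
//
//	enc, _ := NewEncoder(nil) // default 64KB block
//	var sealed bytes.Buffer
//	_, err := enc.Encode(bytes.NewReader(data), int64(len(data)), &sealed)
//
//	dec, _ := NewDecoder(bytes.NewReader(sealed.Bytes()), 0, int64(len(data)))
//	r, _ := dec.Reader(0, int64(len(data))) // any sub-range [from, to) works as well
//	out, _ := ioutil.ReadAll(r)             // out should equal data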
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package crc32block
import (
"io"
"sync"
"github.com/cubefs/cubefs/blobstore/util/bytespool"
)
// RequestBody implements an http request's body.
// It is always an io.ReadCloser.
//
// For client requests, the HTTP Client's Transport is
// responsible for calling the Close method. Be sure to call
// the Close method yourself if you control the body's life-cycle.
//
// For server requests, the Server will close the request body.
// The ServeHTTP Handler does not need to.
//
// The Body must allow Read to be called concurrently with Close.
// In particular, calling Close should unblock a Read waiting
// for input.
type RequestBody interface {
io.ReadCloser
// CodeSize returns the encoded whole-body size when encoding,
// or the original body size when decoding.
CodeSize(int64) int64
}
type requestBody struct {
encode bool
offset int
err error
block blockUnit
rc io.ReadCloser
blockLock chan struct{} // safely free the block
closeCh chan struct{}
closeOnce sync.Once
}
func (r *requestBody) Read(p []byte) (n int, err error) {
if r.err != nil {
return 0, r.err
}
for len(p) > 0 {
if r.offset < 0 || r.offset == r.block.length() {
if r.err = r.nextBlock(); r.err != nil {
if n > 0 {
return n, nil
}
return n, r.err
}
}
read := copy(p, r.block[r.offset:])
r.offset += read
p = p[read:]
n += read
}
return n, nil
}
func (r *requestBody) nextBlock() error {
var (
n int
err error
block blockUnit
)
if r.encode {
block = r.block[crc32Len:]
} else {
block = r.block
}
readCh := make(chan struct{})
go func() {
if _, ok := <-r.blockLock; !ok {
// closed
return
}
n, err = readFullOrToEnd(r.rc, block)
close(readCh)
r.blockLock <- struct{}{}
}()
select {
case <-r.closeCh:
return ErrReadOnClosed
case <-readCh:
}
if err != nil {
return err
}
if r.encode {
r.offset = 0
r.block = r.block[:crc32Len+n]
r.block.writeCrc()
return nil
}
if n <= crc32Len {
return ErrMismatchedCrc
}
r.offset = crc32Len
r.block = r.block[:n]
if err = r.block.check(); err != nil {
return ErrMismatchedCrc
}
return nil
}
func (r *requestBody) Close() error {
r.closeOnce.Do(func() {
block := r.block
r.block = nil
close(r.closeCh)
go func(buf []byte) {
<-r.blockLock
close(r.blockLock)
bytespool.Free(buf)
}(block)
})
return r.rc.Close()
}
func (r *requestBody) CodeSize(size int64) int64 {
if r.encode {
return EncodeSize(size, int64(r.block.length()))
}
return DecodeSize(size, int64(r.block.length()))
}
type codeSizeBody struct {
encode bool
blockLength int64
}
func (c *codeSizeBody) Read(p []byte) (n int, err error) { return 0, io.EOF }
func (c *codeSizeBody) Close() error { return nil }
func (c *codeSizeBody) CodeSize(size int64) int64 {
if c.encode {
return EncodeSize(size, c.blockLength)
}
return DecodeSize(size, c.blockLength)
}
// TODO: use resourcepool's chan-pool if the block size is greater than 64K.
func newRequestBody(rc io.ReadCloser, encode bool) RequestBody {
if rc == nil {
return &codeSizeBody{
encode: encode,
blockLength: gBlockSize,
}
}
lock := make(chan struct{}, 1)
lock <- struct{}{}
return &requestBody{
encode: encode,
block: bytespool.Alloc(int(gBlockSize)),
offset: -1,
rc: rc,
blockLock: lock,
closeCh: make(chan struct{}),
}
}
// NewBodyEncoder returns an encoder with crc32.
//
// If rc == nil, the encoder is only usable for CodeSize,
// and you need not Close it at all.
func NewBodyEncoder(rc io.ReadCloser) RequestBody {
return newRequestBody(rc, true)
}
// NewBodyDecoder returns a decoder with crc32.
//
// If rc == nil, the decoder is only usable for CodeSize,
// and you need not Close it at all.
func NewBodyDecoder(rc io.ReadCloser) RequestBody {
return newRequestBody(rc, false)
}
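// Example (illustrative sketch of pairing the body encoder with an HTTP
// request; data and url are placeholders for a []byte payload and a string):
//
//	body := NewBodyEncoder(io.NopCloser(bytes.NewReader(data)))
//	defer body.Close()
//	req, err := http.NewRequest(http.MethodPut, url, body)
//	if err != nil {
//		// handle err
//	}
//	req.ContentLength = body.CodeSize(int64(len(data)))
//	// the receiving side wraps req.Body with NewBodyDecoder to verify the crc32 blocks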
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package crc32block
import (
"errors"
"io"
)
const (
crc32Len = 4
baseBlockBit = 12
baseBlockLen = (1 << baseBlockBit)
)
var (
ErrInvalidBlock = errors.New("crc32block: invalid block buffer")
ErrMismatchedCrc = errors.New("crc32block: mismatched checksum")
ErrReadOnClosed = errors.New("crc32block: read on closed")
)
func isValidBlockLen(blockLen int64) bool {
return blockLen > 0 && blockLen%baseBlockLen == 0
}
func blockPayload(blockLen int64) int64 {
return blockLen - crc32Len
}
// SetBlockSize sets the default block size
func SetBlockSize(blockSize int64) {
if !isValidBlockLen(blockSize) {
panic(ErrInvalidBlock)
}
gBlockSize = blockSize
}
func EncodeSize(size int64, blockLen int64) int64 {
if !isValidBlockLen(blockLen) {
panic(ErrInvalidBlock)
}
payload := blockPayload(blockLen)
blockCnt := (size + (payload - 1)) / payload
return size + 4*blockCnt
}
func DecodeSize(totalSize int64, blockLen int64) int64 {
if !isValidBlockLen(blockLen) {
panic(ErrInvalidBlock)
}
blockCnt := (totalSize + (blockLen - 1)) / blockLen
return totalSize - 4*blockCnt
}
func EncodeSizeWithDefualtBlock(size int64) int64 {
return EncodeSize(size, defaultCrc32BlockSize)
}
func DecodeSizeWithDefualtBlock(size int64) int64 {
return DecodeSize(size, defaultCrc32BlockSize)
}
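// Worked example of the size math above, with an explicit blockLen of 4096
// (payload = 4096 - crc32Len = 4092):
//
//	EncodeSize(10000, 4096) // ceil(10000/4092) = 3 blocks -> 10000 + 3*4 = 10012
//	DecodeSize(10012, 4096) // ceil(10012/4096) = 3 blocks -> 10012 - 3*4 = 10000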
func readFullOrToEnd(r io.Reader, buffer []byte) (n int, err error) {
nn, size := 0, len(buffer)
for n < size && err == nil {
nn, err = r.Read(buffer[n:])
n += nn
if n != 0 && err == io.EOF {
return n, nil
}
}
return n, err
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package errors
import (
"errors"
"net/http"
"github.com/cubefs/cubefs/blobstore/common/rpc"
)
var (
// 2xx
ErrExist = newError(http.StatusCreated, "Data Already Exist")
// 4xx
ErrIllegalArguments = newError(http.StatusBadRequest, "Illegal Arguments")
ErrNotFound = newError(http.StatusNotFound, "Not Found")
ErrRequestTimeout = newError(http.StatusRequestTimeout, "Request Timeout")
ErrRequestedRangeNotSatisfiable = newError(http.StatusRequestedRangeNotSatisfiable, "Request Range Not Satisfiable")
ErrRequestNotAllow = newError(http.StatusBadRequest, "Request Not Allow")
ErrReaderError = newError(499, "Reader Error")
// 5xx ErrUnexpected - unexpected error, requires manual intervention.
ErrUnexpected = newError(http.StatusInternalServerError, "Unexpected Error")
)
func newError(status int, msg string) *rpc.Error {
return rpc.NewError(status, "", errors.New(msg))
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package errors
import (
"net/http"
"github.com/cubefs/cubefs/blobstore/common/rpc"
)
// access 550-599
// blobnode 600-699
// scheduler 700-799
// proxy 800-899
// clusterMgr 900-999
// Error is the http status code for all applications
type Error int
var _ rpc.HTTPError = Error(0)
// Error implements error and rpc.HTTPError
func (e Error) Error() string {
return errCodeMap[int(e)]
}
// StatusCode implements rpc.HTTPError
func (e Error) StatusCode() int {
return int(e)
}
// ErrorCode implements rpc.HTTPError
func (e Error) ErrorCode() string {
return ""
}
var errCodeMap = map[int]string{
// access
CodeAccessReadRequestBody: "access read request body",
CodeAccessUnexpect: "access unexpected error",
CodeAccessServiceDiscovery: "access client service discovery disconnect",
CodeAccessLimited: "access limited",
CodeAccessExceedSize: "access exceed object size",
// clustermgr
CodeCMUnexpect: "cm: unexpected error",
CodeLockNotAllow: "lock volume not allow",
CodeUnlockNotAllow: "unlock volume not allow",
CodeVolumeNotExist: "volume not exist",
CodeRaftPropose: "raft propose error",
CodeNoLeader: "no leader",
CodeRaftReadIndex: "raft read index error",
CodeDuplicatedMemberInfo: "duplicated member info",
CodeCMDiskNotFound: "disk not found",
CodeInvalidDiskStatus: "invalid status",
CodeChangeDiskStatusNotAllow: "not allow to change status back",
CodeConcurrentAllocVolumeUnit: "alloc volume unit concurrently",
CodeNoAvailableVolume: "no available volume",
CodeAllocVolumeInvalidParams: "alloc volume request params is invalid",
CodeOldVuidNotMatch: "update volume unit, old vuid not match",
CodeNewVuidNotMatch: "update volume unit, new vuid not match",
CodeNewDiskIDNotMatch: "update volume unit, new diskID not match",
CodeConfigArgument: "config argument marshal error",
CodeInvalidClusterID: "request params error, invalid clusterID",
CodeInvalidIDC: "request params error,invalid idc",
CodeVolumeUnitNotExist: "volume unit not exist",
CodeDiskAbnormalOrNotReadOnly: "disk is abnormal or not readonly, can't add into dropping list",
CodeStatChunkFailed: "stat blob node chunk failed",
CodeInvalidCodeMode: "request alloc volume codeMode not invalid",
CodeRetainVolumeNotAlloc: "retain volume is not alloc",
CodeDroppedDiskHasVolumeUnit: "dropped disk still has volume unit remain, migrate them firstly",
CodeNotSupportIdle: "list volume v2 not support idle status",
CodeDiskIsDropping: "dropping disk not allow change state or set readonly",
CodeRejectDeleteSystemConfig: "reject delete system config",
CodeRegisterServiceInvalidParams: "register service params is invalid",
// scheduler
CodeNotingTodo: "nothing to do",
// proxy
CodeNoAvaliableVolume: "this codemode has no avaliable volume",
CodeAllocBidFromCm: "alloc bid from clustermgr error",
CodeClusterIDNotMatch: "clusterId not match",
// blobnode
CodeInvalidParam: "blobnode: invalid params",
CodeAlreadyExist: "blobnode: entry already exist",
CodeOutOfLimit: "blobnode: out of limit",
CodeInternal: "blobnode: internal error",
CodeOverload: "blobnode: service is overload",
CodePathNotExist: "blobnode: path is not exist",
CodePathNotEmpty: "blobnode: path is not empty",
CodePathFindOnline: "blobnode: path find online disk",
CodeDiskNotFound: "disk not found",
CodeDiskBroken: "disk is broken",
CodeInvalidDiskId: "disk id is invalid",
CodeDiskNoSpace: "disk no space",
CodeVuidNotFound: "vuid not found",
CodeVUIDReadonly: "vuid readonly",
CodeVUIDRelease: "vuid released",
CodeVuidNotMatch: "vuid not match",
CodeChunkNotReadonly: "chunk must readonly",
CodeChunkNotNormal: "chunk must normal",
CodeChunkNoSpace: "chunk no space",
CodeChunkCompacting: "chunk is compacting",
CodeInvalidChunkId: "chunk id is invalid",
CodeTooManyChunks: "too many chunks",
CodeChunkInuse: "chunk in use",
CodeSizeOverBurst: "request size over limit burst",
CodeBidNotFound: "bid not found",
CodeShardSizeTooLarge: "shard size too large",
CodeShardNotMarkDelete: "shard must mark delete",
CodeShardMarkDeleted: "shard already mark delete",
CodeShardInvalidOffset: "shard offset is invalid",
CodeShardInvalidBid: "shard key bid is invalid",
CodeShardListExceedLimit: "shard list exceed the limit",
CodeDestReplicaBad: "dest replica is bad can not repair",
CodeOrphanShard: "shard is an orphan",
CodeIllegalTask: "illegal task",
CodeRequestLimited: "request limited",
}
// HTTPError make rpc.HTTPError
func HTTPError(statusCode int, errCode string, err error) error {
return rpc.NewError(statusCode, errCode, err)
}
// Error2HTTPError transfer error to rpc.HTTPError
func Error2HTTPError(err error) error {
if err == nil {
return nil
}
if e, ok := err.(rpc.HTTPError); ok {
return e
}
if code, ok := err.(Error); ok {
return code
}
return rpc.NewError(http.StatusInternalServerError, "ServerError", err)
}
// DetectCode detect code
func DetectCode(err error) int {
if err == nil {
return http.StatusOK
}
if code, ok := err.(Error); ok {
return int(code)
}
if httpErr, ok := err.(rpc.HTTPError); ok {
return httpErr.StatusCode()
}
return http.StatusInternalServerError
}
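// Example (illustrative sketch; 618 is a made-up status code, any value of
// type Error behaves the same way):
//
//	var err error = Error(618)
//	DetectCode(err)                 // 618
//	DetectCode(nil)                 // http.StatusOK
//	DetectCode(io.ErrUnexpectedEOF) // http.StatusInternalServerError
//	Error2HTTPError(err)            // returns err itself, it already implements rpc.HTTPError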
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"encoding/binary"
"fmt"
"strconv"
"strings"
)
// basic types for all modules
type (
DiskID uint32
BlobID uint64
Vid uint32
ClusterID uint32
)
func (id DiskID) Encode() []byte {
key := make([]byte, 4)
binary.BigEndian.PutUint32(key, uint32(id))
return key
}
func (id *DiskID) Decode(b []byte) DiskID {
key := binary.BigEndian.Uint32(b)
*id = DiskID(key)
return *id
}
func (id DiskID) ToString() string {
return strconv.FormatUint(uint64(id), 10)
}
func (vid Vid) ToString() string {
return strconv.FormatUint(uint64(vid), 10)
}
func (id ClusterID) ToString() string {
return strconv.FormatUint(uint64(id), 10)
}
const seqToken = ";"
// EncodeToken encode host and vid to a string token.
func EncodeToken(host string, vid Vid) (token string) {
return fmt.Sprintf("%s%s%s", host, seqToken, strconv.FormatUint(uint64(vid), 10))
}
// DecodeToken decode host and vid from the token.
func DecodeToken(token string) (host string, vid Vid, err error) {
parts := strings.SplitN(token, seqToken, 2)
if len(parts) != 2 {
err = fmt.Errorf("invalid token %s", token)
return
}
host = parts[0]
vidU32, err := strconv.ParseUint(parts[1], 10, 32)
if err != nil {
return
}
vid = Vid(vidU32)
return
}
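// Example (round trip of the token helpers; the host value is a placeholder):
//
//	token := EncodeToken("127.0.0.1:9500", Vid(12)) // "127.0.0.1:9500;12"
//	host, vid, err := DecodeToken(token)            // host == "127.0.0.1:9500", vid == Vid(12), err == nil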
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"math"
)
// service names
const (
ServiceNameBlobNode = "BLOBNODE"
ServiceNameProxy = "PROXY"
ServiceNameScheduler = "SCHEDULER"
)
type DiskStatus uint8
// disk status
const (
DiskStatusNormal = DiskStatus(iota + 1) // 1
DiskStatusBroken // 2
DiskStatusRepairing // 3
DiskStatusRepaired // 4
DiskStatusDropped // 5
DiskStatusMax // 6
)
func (status DiskStatus) IsValid() bool {
return status >= DiskStatusNormal && status < DiskStatusMax
}
func (status DiskStatus) String() string {
switch status {
case DiskStatusNormal:
return "normal"
case DiskStatusBroken:
return "broken"
case DiskStatusRepairing:
return "repairing"
case DiskStatusRepaired:
return "repaired"
case DiskStatusDropped:
return "dropped"
default:
return "unknown"
}
}
const (
InvalidDiskID = DiskID(0)
InValidBlobID = BlobID(0)
InvalidCrc32 = uint32(0)
InvalidVid = Vid(0)
InvalidVuid = Vuid(0)
)
const (
MaxBlobID = BlobID(math.MaxUint64)
)
// volume status
type VolumeStatus uint8
func (status VolumeStatus) IsValid() bool {
return status > volumeStatusMin && status < volumeStatusMax
}
func (status VolumeStatus) String() string {
switch status {
case VolumeStatusIdle:
return "idle"
case VolumeStatusActive:
return "active"
case VolumeStatusLock:
return "lock"
case VolumeStatusUnlocking:
return "unlocking"
default:
return "unknown"
}
}
const (
volumeStatusMin = VolumeStatus(iota)
VolumeStatusIdle
VolumeStatusActive
VolumeStatusLock
VolumeStatusUnlocking
volumeStatusMax
)
// system config keys, deletion is not allowed
const (
CodeModeConfigKey = "code_mode"
VolumeReserveSizeKey = "volume_reserve_size"
VolumeChunkSizeKey = "volume_chunk_size"
)
func IsSysConfigKey(key string) bool {
switch key {
case VolumeChunkSizeKey, VolumeReserveSizeKey, CodeModeConfigKey:
return true
default:
return false
}
}
type TaskSwitch string
const (
TaskSwitchDataInspect TaskSwitch = "data_inspect"
)
func (t TaskSwitch) Valid() bool {
switch t {
case TaskSwitchDataInspect:
return true
default:
return false
}
}
func (t TaskSwitch) String() string {
return string(t)
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"github.com/cubefs/cubefs/blobstore/util/errors"
)
var ErrInvalidMsg = errors.New("msg is invalid")
type DeleteStage byte
const (
InitStage DeleteStage = iota
DeleteStageMarkDelete
DeleteStageDelete
)
type BlobDeleteStage struct {
Stages map[uint8]DeleteStage `json:"stages"`
}
func (s *BlobDeleteStage) SetStage(vuidIdx uint8, stage DeleteStage) {
if s.Stages == nil {
s.Stages = make(map[uint8]DeleteStage)
}
s.Stages[vuidIdx] = stage
}
func (s *BlobDeleteStage) Stage(vuid Vuid) (DeleteStage, bool) {
stage, exist := s.Stages[vuid.Index()]
return stage, exist
}
func (s *BlobDeleteStage) Copy() BlobDeleteStage {
myCopy := BlobDeleteStage{}
myCopy.Stages = make(map[uint8]DeleteStage)
for k, v := range s.Stages {
myCopy.Stages[k] = v
}
return myCopy
}
type DeleteMsg struct {
ClusterID ClusterID `json:"cluster_id"`
Bid BlobID `json:"bid"`
Vid Vid `json:"vid"`
Retry int `json:"retry"`
Time int64 `json:"time"`
ReqId string `json:"req_id"`
BlobDelStages BlobDeleteStage `json:"blob_del_stages"`
}
func (msg *DeleteMsg) IsValid() bool {
if msg.Bid == InValidBlobID {
return false
}
if msg.Vid == InvalidVid {
return false
}
return true
}
func (msg *DeleteMsg) SetDeleteStage(stage BlobDeleteStage) {
for idx, s := range stage.Stages {
msg.BlobDelStages.SetStage(idx, s)
}
}
type ShardRepairMsg struct {
ClusterID ClusterID `json:"cluster_id"`
Bid BlobID `json:"bid"`
Vid Vid `json:"vid"`
BadIdx []uint8 `json:"bad_idx"`
Retry int `json:"retry"`
Reason string `json:"reason"`
ReqId string `json:"req_id"`
}
func (msg *ShardRepairMsg) IsValid() bool {
if msg.Bid == InValidBlobID {
return false
}
if msg.Vid == InvalidVid {
return false
}
if len(msg.BadIdx) == 0 {
return false
}
return true
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"sync"
"github.com/cubefs/cubefs/blobstore/common/codemode"
"github.com/cubefs/cubefs/blobstore/util/errors"
)
var (
ErrTaskPaused = errors.New("task has paused")
ErrTaskEmpty = errors.New("no task to run")
)
const (
// TaskRenewalPeriodS + RenewalTimeoutS < TaskLeaseExpiredS
TaskRenewalPeriodS = 5 // worker alive tasks renewal period
RenewalTimeoutS = 1 // timeout of worker task renewal
TaskLeaseExpiredS = 10 // task lease duration in scheduler
)
type TaskType string
const (
TaskTypeDiskRepair TaskType = "disk_repair"
TaskTypeBalance TaskType = "balance"
TaskTypeDiskDrop TaskType = "disk_drop"
TaskTypeManualMigrate TaskType = "manual_migrate"
TaskTypeVolumeInspect TaskType = "volume_inspect"
TaskTypeShardRepair TaskType = "shard_repair"
TaskTypeBlobDelete TaskType = "blob_delete"
)
func (t TaskType) Valid() bool {
switch t {
case TaskTypeDiskRepair, TaskTypeBalance, TaskTypeDiskDrop, TaskTypeManualMigrate,
TaskTypeVolumeInspect, TaskTypeShardRepair, TaskTypeBlobDelete:
return true
default:
return false
}
}
func (t TaskType) String() string {
return string(t)
}
type VunitLocation struct {
Vuid Vuid `json:"vuid" bson:"vuid"`
Host string `json:"host" bson:"host"`
DiskID DiskID `json:"disk_id" bson:"disk_id"`
}
// CheckVunitLocations checks that the volume unit locations of a task are complete.
func CheckVunitLocations(locations []VunitLocation) bool {
if len(locations) == 0 {
return false
}
for _, l := range locations {
if l.Vuid == InvalidVuid || l.Host == "" || l.DiskID == InvalidDiskID {
return false
}
}
return true
}
type MigrateState uint8
const (
MigrateStateInited MigrateState = iota + 1
MigrateStatePrepared
MigrateStateWorkCompleted
MigrateStateFinished
MigrateStateFinishedInAdvance
)
type MigrateTask struct {
TaskID string `json:"task_id"` // task id
TaskType TaskType `json:"task_type"` // task type
State MigrateState `json:"state"` // task state
SourceIDC string `json:"source_idc"` // source idc
SourceDiskID DiskID `json:"source_disk_id"` // source disk id
SourceVuid Vuid `json:"source_vuid"` // source volume unit id
Sources []VunitLocation `json:"sources"` // source volume units location
CodeMode codemode.CodeMode `json:"code_mode"` // codemode
Destination VunitLocation `json:"destination"` // destination volume unit location
Ctime string `json:"ctime"` // create time
MTime string `json:"mtime"` // modify time
FinishAdvanceReason string `json:"finish_advance_reason"`
// the migrate task tries direct chunk download first; if that fails, it recovers the chunk by EC repair
ForbiddenDirectDownload bool `json:"forbidden_direct_download"`
WorkerRedoCnt uint8 `json:"worker_redo_cnt"` // worker redo task count
}
func (t *MigrateTask) Vid() Vid {
return t.SourceVuid.Vid()
}
func (t *MigrateTask) GetSources() []VunitLocation {
return t.Sources
}
func (t *MigrateTask) GetDestination() VunitLocation {
return t.Destination
}
func (t *MigrateTask) SetDestination(dest VunitLocation) {
t.Destination = dest
}
func (t *MigrateTask) DestinationDiskID() DiskID {
return t.Destination.DiskID
}
func (t *MigrateTask) GetSourceDiskID() DiskID {
return t.SourceDiskID
}
func (t *MigrateTask) Running() bool {
return t.State == MigrateStatePrepared || t.State == MigrateStateWorkCompleted
}
func (t *MigrateTask) Copy() *MigrateTask {
task := &MigrateTask{}
*task = *t
dst := make([]VunitLocation, len(t.Sources))
copy(dst, t.Sources)
task.Sources = dst
return task
}
func (t *MigrateTask) IsValid() bool {
return t.TaskType.Valid() && t.CodeMode.IsValid() &&
CheckVunitLocations(t.Sources) &&
CheckVunitLocations([]VunitLocation{t.Destination})
}
type VolumeInspectCheckPoint struct {
StartVid Vid `json:"start_vid"` // min vid in current batch volumes
Ctime string `json:"ctime"`
}
type VolumeInspectTask struct {
TaskID string `json:"task_id"`
Mode codemode.CodeMode `json:"mode"`
Replicas []VunitLocation `json:"replicas"`
}
func (t *VolumeInspectTask) IsValid() bool {
return t.Mode.IsValid() && CheckVunitLocations(t.Replicas)
}
type MissedShard struct {
Vuid Vuid `json:"vuid"`
Bid BlobID `json:"bid"`
}
type VolumeInspectRet struct {
TaskID string `json:"task_id"`
InspectErrStr string `json:"inspect_err_str"` // empty when the inspect run succeeded
MissedShards []*MissedShard `json:"missed_shards"`
}
func (inspect *VolumeInspectRet) Err() error {
if len(inspect.InspectErrStr) == 0 {
return nil
}
return errors.New(inspect.InspectErrStr)
}
type ShardRepairTask struct {
Bid BlobID `json:"bid"`
CodeMode codemode.CodeMode `json:"code_mode"`
Sources []VunitLocation `json:"sources"`
BadIdxs []uint8 `json:"bad_idxs"` // TODO: BadIdxes
Reason string `json:"reason"`
}
func (task *ShardRepairTask) IsValid() bool {
return task.CodeMode.IsValid() && CheckVunitLocations(task.Sources)
}
// TaskStatistics thread-unsafe task statistics.
type TaskStatistics struct {
DoneSize uint64 `json:"done_size"`
DoneCount uint64 `json:"done_count"`
TotalSize uint64 `json:"total_size"`
TotalCount uint64 `json:"total_count"`
Progress uint64 `json:"progress"`
}
// TaskProgress migrate task running progress.
type TaskProgress interface {
Total(size, count uint64) // reset total size and count.
Do(size, count uint64) // update progress.
Done() TaskStatistics // returns newest statistics.
}
// NewTaskProgress returns thread-safe task progress.
func NewTaskProgress() TaskProgress {
return &taskProgress{}
}
type taskProgress struct {
mu sync.Mutex
st TaskStatistics
}
func (p *taskProgress) Total(size, count uint64) {
p.mu.Lock()
st := &p.st
st.TotalSize = size
st.TotalCount = count
if st.TotalSize == 0 {
st.Progress = 100
} else {
st.Progress = (st.DoneSize * 100) / st.TotalSize
}
p.mu.Unlock()
}
func (p *taskProgress) Do(size, count uint64) {
p.mu.Lock()
st := &p.st
st.DoneSize += size
st.DoneCount += count
if st.TotalSize == 0 {
st.Progress = 100
} else {
st.Progress = (st.DoneSize * 100) / st.TotalSize
}
p.mu.Unlock()
}
func (p *taskProgress) Done() TaskStatistics {
p.mu.Lock()
st := p.st
p.mu.Unlock()
return st
}
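// Example (illustrative sketch of TaskProgress; the sizes are arbitrary):
//
//	progress := NewTaskProgress()
//	progress.Total(1<<20, 4) // 1MiB in 4 shards
//	progress.Do(1<<19, 2)    // half of the bytes done
//	st := progress.Done()    // st.Progress == 50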
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"errors"
"strconv"
)
type (
Vuid uint64
VuidPrefix uint64
)
const (
MinEpoch = 1
MaxEpoch = 16777215
MinIndex = 0
MaxIndex = 255
)
func (vu Vuid) IsValid() bool {
return vu > InvalidVuid && IsValidEpoch(vu.Epoch()) && IsValidIndex(vu.Index())
}
func NewVuid(vid Vid, idx uint8, epoch uint32) (Vuid, error) {
if !IsValidEpoch(epoch) {
err := errors.New("fail to new vuid,Epoch is overflow")
return 0, err
}
u64 := uint64(vid)<<32 + uint64(idx)<<24 + uint64(epoch)
return Vuid(u64), nil
}
func EncodeVuidPrefix(vid Vid, idx uint8) VuidPrefix {
u64 := uint64(vid)<<32 + uint64(idx)<<24
return VuidPrefix(u64)
}
func EncodeVuid(v VuidPrefix, epoch uint32) Vuid {
u64 := uint64(v) + uint64(epoch)
return Vuid(u64)
}
func (v Vuid) Vid() Vid {
return Vid(v & 0xffffffff00000000 >> 32)
}
func (v Vuid) ToString() string {
return strconv.FormatUint(uint64(v), 10)
}
func (v Vuid) Index() uint8 {
return uint8(v & 0xff000000 >> 24)
}
func (v Vuid) Epoch() uint32 {
return uint32(v & 0xffffff)
}
func (v Vuid) VuidPrefix() VuidPrefix {
vuidPre := uint64(v) - uint64(v.Epoch())
return VuidPrefix(vuidPre)
}
func (v VuidPrefix) Vid() Vid {
return Vid(v & 0xffffffff00000000 >> 32)
}
func (v VuidPrefix) Index() uint8 {
return uint8(v & 0xff000000 >> 24)
}
func IsValidEpoch(epoch uint32) bool {
return epoch <= MaxEpoch && epoch >= MinEpoch
}
func IsValidIndex(index uint8) bool {
return index <= MaxIndex && index >= MinIndex
}
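// Example (vuid bit layout: vid in the high 32 bits, index in the next 8 bits,
// epoch in the low 24 bits; the values are arbitrary):
//
//	vuid, _ := NewVuid(Vid(7), 2, 1)
//	// vuid.Vid() == Vid(7), vuid.Index() == 2, vuid.Epoch() == 1
//	// EncodeVuid(vuid.VuidPrefix(), vuid.Epoch()) == vuid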
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package resourcepool
import (
"runtime"
"sync"
"sync/atomic"
"time"
)
// Buffers cached in the chan pool will not be released by runtime.GC().
// The pool's chan length changes dynamically by EMA (Exponential Moving Average);
// redundant buffers are dropped from the chan and then released by GC.
const maxMemorySize = 1 << 32 // 4G
// A SliceHeader is 24 bytes, and a buffered channel allocates
// its memory in one call; see `makechan` at:
// https://github.com/golang/go/blob/master/src/runtime/chan.go
//
// Limit max channel memory to 96MB (24 * (1<<22)),
// which could be reduced to 32MB by using type *[]byte.
const maxChanSize = 1 << 22 // 4m
var releaseInterval int64 = int64(time.Minute) * 2
// SetReleaseInterval sets the release interval duration
func SetReleaseInterval(duration time.Duration) {
if duration > time.Millisecond*100 {
atomic.StoreInt64(&releaseInterval, int64(duration))
}
}
type chPool struct {
chBuffer chan []byte
newBuffer func() []byte
capacity int
concurrence int32
closeCh chan struct{}
closeOnce sync.Once
}
// NewChanPool returns a Pool with the given capacity; there is no limit if capacity is negative
func NewChanPool(newFunc func() []byte, capacity int) Pool {
chCap := capacity
if chCap < 0 {
buf := newFunc()
chCap = maxMemorySize / len(buf)
}
if chCap > maxChanSize {
chCap = maxChanSize
}
pool := &chPool{
chBuffer: make(chan []byte, chCap),
newBuffer: newFunc,
capacity: capacity,
closeCh: make(chan struct{}),
}
runtime.SetFinalizer(pool, func(p *chPool) {
p.closeOnce.Do(func() {
close(p.closeCh)
})
})
go pool.loopRelease()
return pool
}
// loopRelease releases redundant buffers in the chan.
// It samples the EMA of concurrence several times per release interval
// and releases the redundant buffers once per release interval.
//
// It reserves a 30% redundancy over capacity:
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// | length of buffer chan | concurrence |
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// | redundant | capacity | reserved |
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// | to release | buffers keep in memory |
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
func (p *chPool) loopRelease() {
const emaRound = 5
ticker := time.NewTicker(time.Duration(atomic.LoadInt64(&releaseInterval)) / emaRound)
defer ticker.Stop()
var (
turn int
capacity int32
)
for {
select {
case <-p.closeCh:
for {
select {
case <-p.chBuffer:
default:
return
}
}
case <-ticker.C:
nowConc := atomic.LoadInt32(&p.concurrence)
capacity = ema(nowConc, capacity)
if turn = (turn + 1) % emaRound; turn != 0 {
continue
}
capa := capacity * 13 / 10
redundant := len(p.chBuffer) + int(nowConc-capa)
if redundant <= 0 {
continue
}
has := true
for ii := 0; has && ii < redundant; ii++ {
select {
case <-p.chBuffer:
default:
has = false
}
}
}
}
}
func (p *chPool) Get() (interface{}, error) {
atomic.AddInt32(&p.concurrence, 1)
select {
case buf := <-p.chBuffer:
return buf, nil
default:
return p.newBuffer(), nil
}
}
func (p *chPool) Put(x interface{}) {
buf, ok := x.([]byte)
if !ok {
return
}
select {
case p.chBuffer <- buf:
default:
}
atomic.AddInt32(&p.concurrence, -1)
}
func (p *chPool) Cap() int {
return p.capacity
}
func (p *chPool) Len() int {
return int(atomic.LoadInt32(&p.concurrence))
}
func (p *chPool) Idle() int {
return len(p.chBuffer)
}
func ema(val, lastVal int32) int32 {
return (val*2 + lastVal*8) / 10
}
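// Example (illustrative sketch of the chan pool; the buffer size and
// capacity are arbitrary):
//
//	pool := NewChanPool(func() []byte { return make([]byte, 1<<14) }, 1024)
//	x, _ := pool.Get() // chPool.Get never returns an error
//	buf := x.([]byte)
//	// ... use buf ...
//	pool.Put(buf)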
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package resourcepool
import (
"errors"
"sort"
)
// ErrNoSuitableSizeClass no suitable pool of size
var ErrNoSuitableSizeClass = errors.New("no suitable size class")
// zeroing bytes performs best with a 16KB chunk, see the benchmark:
// BenchmarkZero/4MB-16KB-4 13338 88378 ns/op
// BenchmarkZero/8MB-16KB-4 6670 183987 ns/op
// BenchmarkZero/16MB-16KB-4 1926 590422 ns/op
const zeroLen = 1 << 14
var zero = make([]byte, zeroLen)
// MemPool reused buffer pool
type MemPool struct {
pool []Pool
poolSize []int
}
// Status memory pools status
type Status []PoolStatus
// PoolStatus status of the pool
type PoolStatus struct {
Size int `json:"size"`
Capacity int `json:"capacity"`
Running int `json:"running"`
Idle int `json:"idle"`
}
// NewMemPool returns a MemPool backed by chan pools
func NewMemPool(sizeClasses map[int]int) *MemPool {
return NewMemPoolWith(sizeClasses, func(size, capacity int) Pool {
return NewChanPool(func() []byte {
return make([]byte, size)
}, capacity)
})
}
// NewMemPoolWith returns a MemPool with the given size classes and a self-defined pool constructor
func NewMemPoolWith(sizeClasses map[int]int, newPool func(size, capacity int) Pool) *MemPool {
pool := make([]Pool, 0, len(sizeClasses))
poolSize := make([]int, 0, len(sizeClasses))
for sizeClass := range sizeClasses {
if sizeClass > 0 {
poolSize = append(poolSize, sizeClass)
}
}
sort.Ints(poolSize)
for _, sizeClass := range poolSize {
pool = append(pool, newPool(sizeClass, sizeClasses[sizeClass]))
}
return &MemPool{
pool: pool,
poolSize: poolSize,
}
}
// Get returns a suitable buffer
func (p *MemPool) Get(size int) ([]byte, error) {
for idx, ps := range p.poolSize {
if size <= ps {
buf, err := p.pool[idx].Get()
if err != nil {
return nil, err
}
buff := buf.([]byte)
return buff[:size], nil
}
}
return nil, ErrNoSuitableSizeClass
}
// Alloc returns a buffer, making a new one if the size exceeds all size classes
func (p *MemPool) Alloc(size int) ([]byte, error) {
buf, err := p.Get(size)
if err == ErrNoSuitableSizeClass {
return make([]byte, size), nil
}
return buf, err
}
// Put adds b back to the pool, resized appropriately to its size class
func (p *MemPool) Put(b []byte) error {
sizeClass := cap(b)
b = b[0:sizeClass]
for ii := len(p.poolSize) - 1; ii >= 0; ii-- {
if sizeClass >= p.poolSize[ii] {
b = b[0:p.poolSize[ii]]
p.pool[ii].Put(b)
return nil
}
}
return ErrNoSuitableSizeClass
}
// Zero fills the buffer b with zero bytes
func (p *MemPool) Zero(b []byte) {
Zero(b)
}
// Status returns status of memory pool
func (p *MemPool) Status() Status {
st := make(Status, len(p.poolSize))
for idx, size := range p.poolSize {
pool := p.pool[idx]
st[idx] = PoolStatus{
Size: size,
Capacity: pool.Cap(),
Running: pool.Len(),
Idle: pool.Idle(),
}
}
return st
}
// Zero fills the buffer b with zero bytes
func Zero(b []byte) {
for len(b) > 0 {
n := copy(b, zero)
b = b[n:]
}
}
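// Example (illustrative sketch of MemPool; the size classes are arbitrary):
//
//	mp := NewMemPool(map[int]int{1 << 12: 1024, 1 << 16: 128})
//	buf, err := mp.Alloc(5000) // served by the 1<<16 class, len(buf) == 5000
//	if err != nil {
//		// handle err
//	}
//	// ... use buf ...
//	mp.Zero(buf)
//	_ = mp.Put(buf) // cap(buf) decides which class it returns to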
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package resourcepool
// sync.Pool cache will be released by runtime.GC()
// see sync/pool.go: runtime_registerPoolCleanup(poolCleanup)
import (
"errors"
"sync"
"sync/atomic"
)
// ErrPoolLimit pool elements exceed its capacity
var ErrPoolLimit = errors.New("resource pool limit")
// Pool is a resource pool built on sync.Pool with a capacity limit.
// Resources are released when no longer used;
// there is no limit if capacity is negative.
type Pool interface {
// Get returns nil and an error if the pool's capacity is exceeded
Get() (interface{}, error)
Put(x interface{})
Cap() int
Len() int
// Idle returns the number of cached idle objects in the pool.
Idle() int
}
// the sync.Pool-based pool's Idle returns -1 if there is no limit
type pool struct {
sp sync.Pool
capacity int32
current int32
}
// NewPool returns a Pool with the given capacity; there is no limit if capacity is negative
func NewPool(newFunc func() interface{}, capacity int) Pool {
return &pool{
sp: sync.Pool{New: newFunc},
capacity: int32(capacity),
current: int32(0),
}
}
func (p *pool) Get() (interface{}, error) {
current := atomic.AddInt32(&p.current, 1)
if p.capacity >= 0 && current > p.capacity {
atomic.AddInt32(&p.current, -1)
return nil, ErrPoolLimit
}
return p.sp.Get(), nil
}
func (p *pool) Put(x interface{}) {
p.sp.Put(x)
atomic.AddInt32(&p.current, -1)
}
func (p *pool) Cap() int {
return int(p.capacity)
}
func (p *pool) Len() int {
return int(atomic.LoadInt32(&p.current))
}
func (p *pool) Idle() int {
if p.capacity < 0 {
return -1
}
return p.Cap() - p.Len()
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import (
"encoding/base64"
"encoding/json"
"fmt"
"io"
"reflect"
"strconv"
"strings"
"github.com/cubefs/cubefs/blobstore/util/bytespool"
"github.com/cubefs/cubefs/blobstore/util/log"
)
type (
parserKey struct {
PkgPath string
Name string
FieldName string
}
parserVal struct {
Name string
Opt struct {
Ignore bool // "-"
Omitempty bool // ",omitempty"
Base64 bool // ",base64"
}
}
)
var registeredParsers map[parserKey]parserVal
func init() {
registeredParsers = make(map[parserKey]parserVal)
}
// RegisterArgsParser registers the argument struct whose fields need to be
// parsed from uri, query, form, or postform.
// The tags are matched in the given order.
// NOTE: the function is thread-unsafe.
func RegisterArgsParser(args interface{}, tags ...string) {
if args == nil {
return
}
if _, ok := args.(Parser); ok {
return
}
typ := reflect.TypeOf(args)
val := reflect.ValueOf(args)
if typ.Kind() != reflect.Ptr {
log.Panicf("args(%s) must be pointer", typ.Name())
}
typ = typ.Elem()
if typ.Kind() != reflect.Struct {
log.Panicf("args(%s) reference must be struct", typ.Name())
}
val = val.Elem()
t := val.Type()
for i := 0; i < val.NumField(); i++ {
ft := t.Field(i)
pVal := parserVal{
Name: strings.ToLower(ft.Name),
}
for _, tag := range tags {
tagStr := ft.Tag.Get(tag)
if tagStr != "" {
ts := strings.Split(tagStr, ",")
if ts[0] == "-" {
pVal.Opt.Ignore = true
break
}
if ts[0] != "" {
pVal.Name = ts[0]
}
for _, t := range ts[1:] {
switch t {
case "omitempty":
pVal.Opt.Omitempty = true
case "base64":
pVal.Opt.Base64 = true
default:
}
}
break
}
}
pKey := parserKey{
PkgPath: typ.PkgPath(),
Name: typ.Name(),
FieldName: ft.Name,
}
registeredParsers[pKey] = pVal
log.Infof("register args field:%+v val:%+v", pKey, pVal)
}
}
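// Example (illustrative sketch; listArgs is a hypothetical argument struct,
// registered here with its json tags):
//
//	type listArgs struct {
//		Vid    uint32 `json:"vid"`
//		Marker string `json:"marker,omitempty"`
//	}
//
//	func init() {
//		RegisterArgsParser(&listArgs{}, "json")
//	}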
func parseArgs(c *Context, args interface{}, opts ...ServerOption) error {
if args == nil {
return nil
}
opt := c.opts
if len(opts) > 0 {
opt = c.opts.copy()
for _, o := range opts {
o.apply(opt)
}
}
if opt.argsBody {
size, err := c.RequestLength()
if err != nil {
return err
}
if arg, ok := args.(UnmarshalerFrom); ok {
return arg.UnmarshalFrom(io.LimitReader(c.Request.Body, int64(size)))
}
buf := bytespool.Alloc(size)
defer bytespool.Free(buf)
if _, err = io.ReadFull(c.Request.Body, buf); err != nil {
return err
}
if arg, ok := args.(Unmarshaler); ok {
return arg.Unmarshal(buf)
}
return json.Unmarshal(buf, args)
}
if !opt.hasArgs() {
return nil
}
getter := func(fKey string) string {
if opt.argsURI {
if val := c.Param.ByName(fKey); val != "" {
return val
}
}
if opt.argsQuery {
if val := c.Request.URL.Query().Get(fKey); val != "" {
return val
}
}
if opt.argsForm {
if val := c.Request.Form.Get(fKey); val != "" {
return val
}
}
if opt.argsPostForm {
if val := c.Request.PostForm.Get(fKey); val != "" {
return val
}
}
return ""
}
if arg, ok := args.(Parser); ok {
return arg.Parse(getter)
}
typ := reflect.TypeOf(args)
val := reflect.ValueOf(args)
if typ.Kind() != reflect.Ptr {
return fmt.Errorf("args(%s) must be pointer", typ.Name())
}
typ = typ.Elem()
if typ.Kind() != reflect.Struct {
return fmt.Errorf("args(%s) reference must be struct", typ.Name())
}
val = val.Elem()
t := val.Type()
for i := 0; i < val.NumField(); i++ {
ft := t.Field(i)
pVal, ok := registeredParsers[parserKey{
PkgPath: typ.PkgPath(),
Name: typ.Name(),
FieldName: ft.Name,
}]
if !ok {
pVal = parserVal{Name: strings.ToLower(ft.Name)}
}
if pVal.Opt.Ignore {
continue
}
fVal := getter(pVal.Name)
if fVal == "" {
if pVal.Opt.Omitempty {
continue
}
return fmt.Errorf("args(%s) field(%s) do not omit", typ.Name(), ft.Name)
}
if pVal.Opt.Base64 {
switch len(fVal) & 3 {
case 2:
fVal += "=="
case 3:
fVal += "="
default:
}
b, err := base64.URLEncoding.DecodeString(fVal)
if err != nil {
return fmt.Errorf("args(%s) field(%s) invalid base64(%s)", typ.Name(), ft.Name, fVal)
}
fVal = string(b)
}
fv := val.Field(i)
if err := parseValue(fv, fVal); err != nil {
return err
}
}
return nil
}
func parseValue(val reflect.Value, str string) (err error) {
var (
bv bool
iv int64
uv uint64
fv float64
)
RETRY:
switch val.Kind() {
case reflect.Bool:
bv, err = strconv.ParseBool(str)
val.SetBool(bv)
case reflect.Int:
iv, err = strconv.ParseInt(str, 10, 0)
val.SetInt(iv)
case reflect.Int8:
iv, err = strconv.ParseInt(str, 10, 8)
val.SetInt(iv)
case reflect.Int16:
iv, err = strconv.ParseInt(str, 10, 16)
val.SetInt(iv)
case reflect.Int32:
iv, err = strconv.ParseInt(str, 10, 32)
val.SetInt(iv)
case reflect.Int64:
iv, err = strconv.ParseInt(str, 10, 64)
val.SetInt(iv)
case reflect.Uint:
uv, err = strconv.ParseUint(str, 10, 0)
val.SetUint(uv)
case reflect.Uint8:
uv, err = strconv.ParseUint(str, 10, 8)
val.SetUint(uv)
case reflect.Uint16:
uv, err = strconv.ParseUint(str, 10, 16)
val.SetUint(uv)
case reflect.Uint32:
uv, err = strconv.ParseUint(str, 10, 32)
val.SetUint(uv)
case reflect.Uint64:
uv, err = strconv.ParseUint(str, 10, 64)
val.SetUint(uv)
case reflect.Float32:
fv, err = strconv.ParseFloat(str, 32)
val.SetFloat(fv)
case reflect.Float64:
fv, err = strconv.ParseFloat(str, 64)
val.SetFloat(fv)
case reflect.String:
val.SetString(str)
case reflect.Uintptr:
uv, err = strconv.ParseUint(str, 10, 64)
val.SetUint(uv)
case reflect.Ptr:
elem := reflect.New(val.Type().Elem())
val.Set(elem)
val = elem.Elem()
goto RETRY
case reflect.Slice:
if val.Type().Elem().Kind() == reflect.Uint8 {
val.SetBytes([]byte(str))
} else {
return fmt.Errorf("unsupported type(%s) of slice", val.Type().Elem().Kind().String())
}
default:
return fmt.Errorf("unsupported type(%s)", val.Kind().String())
}
return
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package auth
import (
"bytes"
"crypto/md5"
"encoding/base64"
"encoding/binary"
"errors"
"net/http"
)
const (
// an md5 digest is 16 bytes
TokenKeyLenth = 16
// #nosec G101
TokenHeaderKey = "BLOB-STORE-AUTH-TOKEN"
)
var errMismatchToken = errors.New("mismatch token")
type Config struct {
EnableAuth bool `json:"enable_auth"`
Secret string `json:"secret"`
}
// simply use the timestamp as a parameter of the token calculation
type authInfo struct {
timestamp int64
token []byte
// other auth content
others []byte
}
func encodeAuthInfo(info *authInfo) (ret string, err error) {
w := bytes.NewBuffer([]byte{})
if err = binary.Write(w, binary.LittleEndian, &info.timestamp); err != nil {
return
}
if err = binary.Write(w, binary.LittleEndian, &info.token); err != nil {
return
}
return base64.URLEncoding.EncodeToString(w.Bytes()), nil
}
func decodeAuthInfo(encodeStr string) (info *authInfo, err error) {
info = new(authInfo)
b, err := base64.URLEncoding.DecodeString(encodeStr)
if err != nil {
return
}
info.token = make([]byte, TokenKeyLenth)
r := bytes.NewBuffer(b)
if err = binary.Read(r, binary.LittleEndian, &info.timestamp); err != nil {
return
}
if err = binary.Read(r, binary.LittleEndian, &info.token); err != nil {
return
}
return
}
// calculate auth token with params and secret
func calculate(info *authInfo, secret []byte) (err error) {
hash := md5.New()
b := make([]byte, 8)
binary.LittleEndian.PutUint64(b, uint64(info.timestamp))
hash.Write(info.others)
hash.Write(b)
hash.Write(secret)
info.token = hash.Sum(nil)
return
}
// verify auth token with params and secret
func verify(info *authInfo, secret []byte) (err error) {
checkAuthInfo := &authInfo{timestamp: info.timestamp, others: info.others}
calculate(checkAuthInfo, secret)
if !bytes.Equal(checkAuthInfo.token, info.token) {
return errMismatchToken
}
return
}
func genEncodeStr(req *http.Request) []byte {
calStr := req.URL.Path + req.URL.RawQuery
return []byte(calStr)
}
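// Example (in-package sketch of the token round trip; secret and req are
// placeholders for the shared secret and the incoming *http.Request):
//
//	info := &authInfo{timestamp: time.Now().Unix(), others: genEncodeStr(req)}
//	_ = calculate(info, secret) // fills info.token
//	err := verify(info, secret) // nil when the token matches, errMismatchToken otherwise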
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package auth
import (
"net/http"
)
type AuthHandler struct {
Secret []byte
}
func NewAuthHandler(cfg *Config) *AuthHandler {
if cfg.EnableAuth {
if cfg.Secret == "" {
panic("auth secret can not be nil")
}
return &AuthHandler{
Secret: []byte(cfg.Secret),
}
}
return nil
}
func (self *AuthHandler) Handler(w http.ResponseWriter, req *http.Request, f func(http.ResponseWriter, *http.Request)) {
token := req.Header.Get(TokenHeaderKey)
if token == "" {
w.WriteHeader(http.StatusForbidden)
return
}
info, err := decodeAuthInfo(token)
if err != nil {
w.WriteHeader(http.StatusForbidden)
return
}
info.others = genEncodeStr(req)
err = verify(info, self.Secret)
if err != nil && err == errMismatchToken {
w.WriteHeader(http.StatusForbidden)
return
}
f(w, req)
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package auth
import (
"net/http"
"time"
)
type AuthTransport struct {
Secret []byte
Tr http.RoundTripper
}
func NewAuthTransport(tr http.RoundTripper, cfg *Config) http.RoundTripper {
if cfg.EnableAuth {
if cfg.Secret == "" {
panic("auth secret can not be nil")
}
return &AuthTransport{
Secret: []byte(cfg.Secret),
Tr: tr,
}
}
return nil
}
// RoundTrip attaches a simple auth token to the request
func (self *AuthTransport) RoundTrip(req *http.Request) (resp *http.Response, err error) {
now := time.Now().Unix()
info := &authInfo{timestamp: now, others: genEncodeStr(req)}
err = calculate(info, self.Secret)
if err != nil {
return self.Tr.RoundTrip(req)
}
token, err := encodeAuthInfo(info)
if err != nil {
return self.Tr.RoundTrip(req)
}
req.Header.Set(TokenHeaderKey, token)
return self.Tr.RoundTrip(req)
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import (
"net/http"
"github.com/cubefs/cubefs/blobstore/common/crc32block"
)
type crcDecoder struct{}
var _ ProgressHandler = (*crcDecoder)(nil)
func (*crcDecoder) Handler(w http.ResponseWriter, req *http.Request, f func(http.ResponseWriter, *http.Request)) {
if req.Header.Get(HeaderCrcEncoded) != "" && w.Header().Get(HeaderAckCrcEncoded) == "" {
if size := req.ContentLength; size > 0 && req.Body != nil {
decoder := crc32block.NewBodyDecoder(req.Body)
req.ContentLength = decoder.CodeSize(size)
req.Body = decoder
}
w.Header().Set(HeaderAckCrcEncoded, "1")
}
f(w, req)
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import (
"net"
"net/http"
"time"
"github.com/cubefs/cubefs/blobstore/common/rpc/auth"
)
// TransportConfig http transport config
type TransportConfig struct {
// DialTimeoutMs dial timeout in milliseconds
DialTimeoutMs int64 `json:"dial_timeout_ms"`
// ResponseHeaderTimeoutMs response header timeout after send the request
ResponseHeaderTimeoutMs int64 `json:"response_header_timeout_ms"`
MaxConnsPerHost int `json:"max_conns_per_host"`
MaxIdleConns int `json:"max_idle_conns"`
MaxIdleConnsPerHost int `json:"max_idle_conns_per_host"`
// IdleConnTimeout is the maximum amount of time an idle
// (keep-alive) connection will remain idle before closing
// itself. Zero means no limit.
IdleConnTimeoutMs int64 `json:"idle_conn_timeout_ms"`
// DisableCompression, if true, prevents the Transport from
// requesting compression with an "Accept-Encoding: gzip"
DisableCompression bool `json:"disable_compression"`
// auth config
Auth auth.Config `json:"auth"`
}
// Default returns the default transport config if nothing besides Auth is set.
// The Auth config is excluded from the check and carried over.
func (tc TransportConfig) Default() TransportConfig {
noAuth := tc
noAuth.Auth = auth.Config{}
none := TransportConfig{}
if noAuth == none {
return TransportConfig{
MaxConnsPerHost: 10,
MaxIdleConns: 1000,
MaxIdleConnsPerHost: 10,
IdleConnTimeoutMs: 10 * 1000,
Auth: tc.Auth,
}
}
return tc
}
// NewTransport returns http transport
func NewTransport(cfg *TransportConfig) http.RoundTripper {
tr := &http.Transport{
Proxy: http.ProxyFromEnvironment,
MaxConnsPerHost: cfg.MaxConnsPerHost,
MaxIdleConns: cfg.MaxIdleConns,
MaxIdleConnsPerHost: cfg.MaxIdleConnsPerHost,
IdleConnTimeout: time.Duration(cfg.IdleConnTimeoutMs) * time.Millisecond,
ResponseHeaderTimeout: time.Duration(cfg.ResponseHeaderTimeoutMs) * time.Millisecond,
DisableCompression: cfg.DisableCompression,
WriteBufferSize: 1 << 16,
ReadBufferSize: 1 << 16,
}
tr.DialContext = (&net.Dialer{
Timeout: time.Duration(cfg.DialTimeoutMs) * time.Millisecond,
KeepAlive: 30 * time.Second,
}).DialContext
if cfg.Auth.EnableAuth {
authTr := auth.NewAuthTransport(tr, &cfg.Auth)
if authTr != nil {
return authTr
}
}
return tr
}
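// Example (illustrative sketch of building an http.Client from the transport
// config; the timeout values are arbitrary):
//
//	tc := TransportConfig{DialTimeoutMs: 3000, ResponseHeaderTimeoutMs: 3000, IdleConnTimeoutMs: 30000}
//	cli := &http.Client{Transport: NewTransport(&tc)}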
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import (
"bufio"
"fmt"
"io"
"math"
"net"
"net/http"
"strconv"
"strings"
"sync"
"github.com/julienschmidt/httprouter"
)
const (
abortIndex int8 = math.MaxInt8 >> 1
)
var jsonNull = [4]byte{'n', 'u', 'l', 'l'}
// Context handler context with http variables
type Context struct {
opts *serverOptions
Param httprouter.Params
Request *http.Request
Writer http.ResponseWriter
// pass key/value in whole request
mu sync.RWMutex
Meta map[string]interface{}
wroteHeader bool
// interceptors control
index int8
handlers []HandlerFunc
}
// ArgsBody args in body
func (c *Context) ArgsBody(args interface{}) error {
return c.ParseArgs(args, OptArgsBody())
}
// ArgsURI args in uri
func (c *Context) ArgsURI(args interface{}) error {
return c.ParseArgs(args, OptArgsURI())
}
// ArgsQuery args in query
func (c *Context) ArgsQuery(args interface{}) error {
return c.ParseArgs(args, OptArgsQuery())
}
// ArgsForm args in form
func (c *Context) ArgsForm(args interface{}) error {
return c.ParseArgs(args, OptArgsForm())
}
// ArgsPostForm args in post form
func (c *Context) ArgsPostForm(args interface{}) error {
return c.ParseArgs(args, OptArgsPostForm())
}
// ParseArgs reflect param to args
func (c *Context) ParseArgs(args interface{}, opts ...ServerOption) error {
if err := parseArgs(c, args, opts...); err != nil {
return NewError(http.StatusBadRequest, "Argument", err)
}
return nil
}
// RequestLength reads the request body length
func (c *Context) RequestLength() (int, error) {
cl := c.Request.ContentLength
if cl < 0 {
return 0, fmt.Errorf("Unknown content length in request")
}
return int(cl), nil
}
// Next should be used only inside interceptor.
// It executes the pending handlers inside the calling handler.
func (c *Context) Next() {
c.index++
for c.index < int8(len(c.handlers)) {
c.handlers[c.index](c)
c.index++
}
}
// IsAborted return aborted or not
func (c *Context) IsAborted() bool {
return c.index >= abortIndex
}
// Abort the next handlers
func (c *Context) Abort() {
c.index = abortIndex
}
// AbortWithStatus abort with status
func (c *Context) AbortWithStatus(statusCode int) {
c.RespondStatus(statusCode)
c.Abort()
}
// AbortWithStatusJSON abort with status and response data
func (c *Context) AbortWithStatusJSON(statusCode int, obj interface{}) {
c.RespondStatusData(statusCode, obj)
c.Abort()
}
// AbortWithError abort with error
func (c *Context) AbortWithError(err error) {
c.RespondError(err)
c.Abort()
}
// Respond response 200, and Content-Length: 0
func (c *Context) Respond() {
c.Writer.Header().Set(HeaderContentLength, "0")
c.RespondStatus(http.StatusOK)
}
// RespondStatus response status code
func (c *Context) RespondStatus(statusCode int) {
c.Writer.WriteHeader(statusCode)
c.wroteHeader = true
}
// RespondError response error
func (c *Context) RespondError(err error) {
httpErr := Error2HTTPError(err)
if httpErr == nil {
c.Respond()
return
}
c.RespondStatusData(httpErr.StatusCode(), errorResponse{
Error: httpErr.Error(),
Code: httpErr.ErrorCode(),
})
}
// RespondJSON response json
func (c *Context) RespondJSON(obj interface{}) {
c.RespondStatusData(http.StatusOK, obj)
}
// RespondStatusData response data with code
func (c *Context) RespondStatusData(statusCode int, obj interface{}) {
body, err := marshalObj(obj)
if err != nil {
c.RespondError(err)
return
}
c.RespondWithReader(statusCode, body.ContentLength, body.ContentType, body.Body, nil)
}
// RespondWith response with code, content-type, bytes
func (c *Context) RespondWith(statusCode int, contentType string, body []byte) {
c.Writer.Header().Set(HeaderContentType, contentType)
c.Writer.Header().Set(HeaderContentLength, strconv.Itoa(len(body)))
c.Writer.WriteHeader(statusCode)
c.wroteHeader = true
c.Writer.Write(body)
}
// RespondWithReader response with code, content-length, content-type, an io.Reader and extra headers
func (c *Context) RespondWithReader(statusCode int, contentLength int, contentType string,
body io.Reader, extraHeaders map[string]string) {
c.Writer.Header().Set(HeaderContentType, contentType)
c.Writer.Header().Set(HeaderContentLength, strconv.Itoa(contentLength))
for key, val := range extraHeaders {
c.Writer.Header().Set(key, val)
}
c.Writer.WriteHeader(statusCode)
c.wroteHeader = true
io.CopyN(c.Writer, body, int64(contentLength))
}
// Stream sends a streaming response and returns a boolean
// indicating whether the client disconnected in the middle of the stream.
func (c *Context) Stream(step func(w io.Writer) bool) bool {
w := c.Writer
clientGone := c.Request.Context().Done()
for {
select {
case <-clientGone:
return true
default:
keepOpen := step(w)
c.Flush()
if !keepOpen {
return false
}
}
}
}
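// exampleStream is a minimal sketch (not part of the original code) of a streaming
// handler built on Context.Stream: it writes the given chunks one by one and stops
// early if the client disconnects; the chunks parameter is a hypothetical input.
func exampleStream(c *Context, chunks [][]byte) {
	i := 0
	c.Stream(func(w io.Writer) bool {
		if i >= len(chunks) {
			return false // nothing left, close the stream
		}
		w.Write(chunks[i])
		i++
		return i < len(chunks) // keep the stream open while chunks remain
	})
}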
// Set is used to store a new key/value pair exclusively for this context.
func (c *Context) Set(key string, val interface{}) {
c.mu.Lock()
if c.Meta == nil {
c.Meta = make(map[string]interface{})
}
c.Meta[key] = val
c.mu.Unlock()
}
// Get returns the value for the given key.
// If the value does not exist it returns (nil, false).
func (c *Context) Get(key string) (val interface{}, exists bool) {
c.mu.RLock()
val, exists = c.Meta[key]
c.mu.RUnlock()
return
}
// RemoteIP parses the IP from Request.RemoteAddr, returns the net.IP (without the port).
func (c *Context) RemoteIP() (net.IP, bool) {
ip, _, err := net.SplitHostPort(strings.TrimSpace(c.Request.RemoteAddr))
if err != nil {
return nil, false
}
remoteIP := net.ParseIP(ip)
if remoteIP == nil {
return nil, false
}
return remoteIP, true
}
// Hijack implements the http.Hijacker interface.
func (c *Context) Hijack() (net.Conn, *bufio.ReadWriter, error) {
c.wroteHeader = true
return c.Writer.(http.Hijacker).Hijack()
}
// Flush implements the http.Flusher interface.
func (c *Context) Flush() {
c.Writer.(http.Flusher).Flush()
}
// Pusher returns the http.Pusher if the underlying writer supports it, otherwise nil.
func (c *Context) Pusher() http.Pusher {
if pusher, ok := c.Writer.(http.Pusher); ok {
return pusher
}
return nil
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import (
"context"
"encoding/json"
"errors"
"net/http"
"strconv"
"syscall"
)
type (
// Error implements HTTPError
Error struct {
Status int // http status code
Code string // error code
Err error // error
}
// errorResponse response error with json
// internal type between server and client
errorResponse struct {
Error string `json:"error"`
Code string `json:"code,omitempty"`
}
statusCoder interface {
StatusCode() int
}
errorCoder interface {
ErrorCode() string
}
)
var _ HTTPError = &Error{}
// NewError returns a new Error with http status code, error code and raw error
func NewError(statusCode int, errCode string, err error) *Error {
return &Error{
Status: statusCode,
Code: errCode,
Err: err,
}
}
// StatusCode returns http status code
func (e *Error) StatusCode() int {
return e.Status
}
// ErrorCode returns special defined code
func (e *Error) ErrorCode() string {
return e.Code
}
// Error implements error
func (e *Error) Error() string {
if e.Err == nil {
return ""
}
return e.Err.Error()
}
// Unwrap returns the wrapped error for errors.Is(), errors.As() and errors.Unwrap()
func (e *Error) Unwrap() error {
return e.Err
}
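// exampleNotFound is a minimal sketch (not part of the original code) showing how a
// handler-side error can carry an HTTP status and a custom error code; the code string
// "NotFound" and the message are assumptions for illustration.
func exampleNotFound() error {
	return NewError(http.StatusNotFound, "NotFound", errors.New("blob not found"))
}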
// DetectStatusCode returns http status code
func DetectStatusCode(err error) int {
if err == nil {
return http.StatusOK
}
var st statusCoder
if errors.As(err, &st) {
return st.StatusCode()
}
switch err {
case syscall.EINVAL:
return http.StatusBadRequest
case context.Canceled:
return 499 // non-standard status: client closed request
default:
return http.StatusInternalServerError
}
}
// DetectErrorCode returns error code
func DetectErrorCode(err error) string {
if err == nil {
return ""
}
var ec errorCoder
if errors.As(err, &ec) {
return ec.ErrorCode()
}
switch err {
case syscall.EINVAL:
return "BadRequest"
case context.Canceled:
return "Canceled"
default:
return "InternalServerError"
}
}
// DetectError returns status code, error code, error
func DetectError(err error) (int, string, error) {
return DetectStatusCode(err), DetectErrorCode(err), errors.Unwrap(err)
}
// Error2HTTPError returns an interface HTTPError from an error
func Error2HTTPError(err error) HTTPError {
if err == nil {
return nil
}
if httpErr, ok := err.(HTTPError); ok {
return httpErr
}
status, code, _ := DetectError(err)
return NewError(status, code, err)
}
// ReplyErr directly replies an error with the response writer
func ReplyErr(w http.ResponseWriter, code int, err string) {
msg, _ := json.Marshal(NewError(code, "", errors.New(err)))
h := w.Header()
h.Set("Content-Length", strconv.Itoa(len(msg)))
h.Set("Content-Type", MIMEJSON)
w.WriteHeader(code)
w.Write(msg)
}
// ReplyWith directly replies a body with the response writer
func ReplyWith(w http.ResponseWriter, code int, bodyType string, msg []byte) {
h := w.Header()
h.Set("Content-Length", strconv.Itoa(len(msg)))
h.Set("Content-Type", bodyType)
w.WriteHeader(code)
w.Write(msg)
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import (
"context"
"errors"
"fmt"
"net/http"
urllib "net/url"
"strings"
"github.com/cubefs/cubefs/blobstore/common/trace"
)
var errNoHost = errors.New("no host available")
// LbConfig load balance config
type LbConfig struct {
// hosts
Hosts []string `json:"hosts"`
// backup hosts
BackupHosts []string `json:"backup_hosts"`
// HostTryTimes is the number of retries on failed hosts; keep HostTryTimes < RequestTryTimes
// to avoid requesting an unavailable host all the time.
HostTryTimes int `json:"host_try_times"`
// FailRetryIntervalS is the retry interval after a host failure; the default is -1,
// and if FailRetryIntervalS < 0, failed hosts are never removed.
FailRetryIntervalS int `json:"fail_retry_interval_s"`
// Within MaxFailsPeriodS, if the number of failures is greater than or equal
// to MaxFails, the host is considered disconnected.
MaxFailsPeriodS int `json:"max_fails_period_s"`
// RequestTryTimes is the maximum number of attempts for a single request.
RequestTryTimes int `json:"try_times"`
// should retry function
ShouldRetry func(code int, err error) bool `json:"-"`
// config for simple client
Config
}
type lbClient struct {
requestTryTimes int
// clientMap maps each host to its simple client
clientMap map[string]Client
sel Selector
cfg *LbConfig
}
var _ Client = (*lbClient)(nil)
// NewLbClient returns a lb client
func NewLbClient(cfg *LbConfig, sel Selector) Client {
if cfg == nil {
cfg = &LbConfig{}
}
cfg.Config.Tc = cfg.Config.Tc.Default()
if cfg.HostTryTimes == 0 {
cfg.HostTryTimes = (len(cfg.Hosts) + len(cfg.BackupHosts)) * 2
}
if cfg.MaxFailsPeriodS == 0 {
cfg.MaxFailsPeriodS = 1
}
if cfg.RequestTryTimes == 0 {
cfg.RequestTryTimes = cfg.HostTryTimes + 1
}
if cfg.ShouldRetry == nil {
cfg.ShouldRetry = defaultShouldRetry
}
if cfg.HostTryTimes > cfg.RequestTryTimes {
cfg.HostTryTimes = cfg.RequestTryTimes - 1
}
if cfg.FailRetryIntervalS == 0 {
cfg.FailRetryIntervalS = -1
}
if sel == nil {
sel = newSelector(cfg)
}
cl := &lbClient{sel: sel, cfg: cfg}
cl.clientMap = make(map[string]Client)
for _, host := range cfg.Hosts {
cl.clientMap[host] = NewClient(&cfg.Config)
}
for _, host := range cfg.BackupHosts {
cl.clientMap[host] = NewClient(&cfg.Config)
}
cl.requestTryTimes = cfg.RequestTryTimes
return cl
}
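// exampleNewLbClient is a minimal sketch (not part of the original code) of building a
// load-balanced client; the host addresses and the "/status" endpoint are assumptions,
// and the nil selector falls back to the built-in selector.
func exampleNewLbClient() {
	cli := NewLbClient(&LbConfig{
		Hosts:       []string{"http://10.0.0.1:9500", "http://10.0.0.2:9500"},
		BackupHosts: []string{"http://10.0.0.3:9500"},
	}, nil)
	defer cli.Close()

	var ret struct {
		Status string `json:"status"`
	}
	// GetWith retries across hosts according to RequestTryTimes and ShouldRetry.
	_ = cli.GetWith(context.Background(), "/status", &ret)
}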
var defaultShouldRetry = func(code int, err error) bool {
if err != nil || (code/100 != 4 && code/100 != 2) {
return true
}
return false
}
func (c *lbClient) Do(ctx context.Context, req *http.Request) (*http.Response, error) {
return c.doCtx(ctx, req)
}
func (c *lbClient) Form(ctx context.Context, method, url string, form map[string][]string) (resp *http.Response, err error) {
body := urllib.Values(form).Encode()
req, err := http.NewRequest(method, url, strings.NewReader(body))
if err != nil {
return
}
return c.Do(ctx, req)
}
func (c *lbClient) Put(ctx context.Context, url string, params interface{}) (resp *http.Response, err error) {
body, err := marshalObj(params)
if err != nil {
return
}
request, err := http.NewRequest(http.MethodPut, url, body.Body)
if err != nil {
return
}
request.Header.Set(HeaderContentType, body.ContentType)
return c.Do(ctx, request)
}
func (c *lbClient) Post(ctx context.Context, url string, params interface{}) (resp *http.Response, err error) {
body, err := marshalObj(params)
if err != nil {
return nil, err
}
request, err := http.NewRequest(http.MethodPost, url, body.Body)
if err != nil {
return nil, err
}
request.Header.Set(HeaderContentType, body.ContentType)
return c.Do(ctx, request)
}
func (c *lbClient) DoWith(ctx context.Context, req *http.Request, ret interface{}, opts ...Option) error {
for _, opt := range opts {
opt(req)
}
resp, err := c.Do(ctx, req)
if err != nil {
return err
}
defer resp.Body.Close()
err = serverCrcEncodeCheck(ctx, req, resp)
if err != nil {
return err
}
return ParseData(resp, ret)
}
func (c *lbClient) GetWith(ctx context.Context, url string, ret interface{}) error {
resp, err := c.Get(ctx, url)
if err != nil {
return err
}
return parseData(resp, ret)
}
func (c *lbClient) PutWith(ctx context.Context, url string, ret interface{}, params interface{}, opts ...Option) (err error) {
body, err := marshalObj(params)
if err != nil {
return
}
request, err := http.NewRequest(http.MethodPut, url, body.Body)
if err != nil {
return
}
request.Header.Set(HeaderContentType, body.ContentType)
for _, opt := range opts {
opt(request)
}
resp, err := c.Do(ctx, request)
if err != nil {
return
}
defer resp.Body.Close()
err = serverCrcEncodeCheck(ctx, request, resp)
if err != nil {
return err
}
return ParseData(resp, ret)
}
func (c *lbClient) PostWith(ctx context.Context, url string, ret interface{}, params interface{}, opts ...Option) error {
body, err := marshalObj(params)
if err != nil {
return err
}
request, err := http.NewRequest(http.MethodPost, url, body.Body)
if err != nil {
return err
}
request.Header.Set(HeaderContentType, body.ContentType)
for _, opt := range opts {
opt(request)
}
resp, err := c.Do(ctx, request)
if err != nil {
return err
}
defer resp.Body.Close()
// check that the server acknowledged the crc-encoded body
err = serverCrcEncodeCheck(ctx, request, resp)
if err != nil {
return err
}
return ParseData(resp, ret)
}
func (c *lbClient) Head(ctx context.Context, url string) (resp *http.Response, err error) {
req, err := http.NewRequest(http.MethodHead, url, nil)
if err != nil {
return
}
return c.Do(ctx, req)
}
func (c *lbClient) Get(ctx context.Context, url string) (resp *http.Response, err error) {
req, err := http.NewRequest(http.MethodGet, url, nil)
if err != nil {
return
}
return c.Do(ctx, req)
}
func (c *lbClient) Delete(ctx context.Context, url string) (resp *http.Response, err error) {
req, err := http.NewRequest(http.MethodDelete, url, nil)
if err != nil {
return
}
return c.Do(ctx, req)
}
func (c *lbClient) doCtx(ctx context.Context, r *http.Request) (resp *http.Response, err error) {
reqURI := r.URL.RequestURI()
span := trace.SpanFromContextSafe(ctx)
span.Debug("lb.doCtx: start", reqURI)
var (
hosts []string
tryTimes = c.requestTryTimes
index = 0
)
for i := 0; i < tryTimes; i++ {
// close failed body
if resp != nil && resp.Body != nil {
resp.Body.Close()
resp = nil
}
select {
case <-ctx.Done():
return nil, ctx.Err()
default:
}
// get the available hosts
if index == len(hosts) || hosts == nil {
hosts = c.sel.GetAvailableHosts()
if len(hosts) < 1 {
err = errNoHost
span.Errorf("lb.doCtx: get host failed: %s", err.Error())
return
}
index = 0
}
host := hosts[index]
// get the real url
r.URL, err = urllib.Parse(host + reqURI)
if err != nil {
span.Errorf("lb.doCtx: parse %s error", host+reqURI)
return
}
r.Host = r.URL.Host
resp, err = c.clientMap[host].Do(ctx, r)
if i == tryTimes-1 {
span.Warnf("lb.doCtx: the last host of request, try times: %d, err: %v, host: %s",
i+1, err, host)
return
}
code := 0
if resp != nil {
code = resp.StatusCode
}
logInfo := fmt.Sprintf("try times: %d, code: %d, err: %v, host: %s", i+1, code, err, host)
if c.cfg.ShouldRetry(code, err) {
span.Info("lb.doCtx: retry host,", logInfo)
index++
c.sel.SetFail(host)
if r.Body == nil {
continue
}
if r.GetBody != nil {
var _err error
r.Body, _err = r.GetBody()
if _err != nil {
span.Warnf("lb.doCtx: retry failed, try times: %d, code: %d, err: %v, host: %s",
i+1, code, _err, host)
return
}
continue
}
span.Warn("lb.doCtx: request not support retry,", logInfo)
return
}
span.Debug("lb.doCtx: the last host of request,", logInfo)
return
}
return
}
func (c *lbClient) Close() {
c.sel.Close()
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import "net/http"
// MiddlewareHandler middleware above rpc server default router.
// Progress handlers run in the given order.
func MiddlewareHandler(phs ...ProgressHandler) http.Handler {
DefaultRouter.hasMiddleware = true
phs = append(DefaultRouter.headMiddlewares, phs...)
return buildHTTPHandler(DefaultRouter.ServeHTTP, phs...)
}
// MiddlewareHandlerFunc middleware func above rpc server default router.
// Progress handlers run in the given order.
func MiddlewareHandlerFunc(phs ...ProgressHandler) http.HandlerFunc {
DefaultRouter.hasMiddleware = true
phs = append(DefaultRouter.headMiddlewares, phs...)
return buildHTTPHandler(DefaultRouter.ServeHTTP, phs...)
}
// MiddlewareHandlerWith middleware above rpc server router
// Progress handlers run in the given order.
func MiddlewareHandlerWith(r *Router, phs ...ProgressHandler) http.Handler {
r.hasMiddleware = true
phs = append(r.headMiddlewares, phs...)
return buildHTTPHandler(r.ServeHTTP, phs...)
}
// MiddlewareHandlerFuncWith middleware func above rpc server router
// Progress handlers run in the given order.
func MiddlewareHandlerFuncWith(r *Router, phs ...ProgressHandler) http.HandlerFunc {
r.hasMiddleware = true
phs = append(r.headMiddlewares, phs...)
return buildHTTPHandler(r.ServeHTTP, phs...)
}
func buildHTTPHandler(h http.HandlerFunc, phs ...ProgressHandler) http.HandlerFunc {
if len(phs) == 0 {
return h
}
last := len(phs) - 1
return buildHTTPHandler(func(w http.ResponseWriter, req *http.Request) {
phs[last].Handler(w, req, h)
}, phs[:last]...)
}
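// exampleHeaderMiddleware is a minimal sketch (not part of the original code) of a
// ProgressHandler: it sets a response header and then hands the request to the next
// handler; the header name is an assumption.
type exampleHeaderMiddleware struct{}

func (exampleHeaderMiddleware) Handler(w http.ResponseWriter, req *http.Request,
	next func(http.ResponseWriter, *http.Request)) {
	w.Header().Set("X-Example-Middleware", "1")
	next(w, req)
}

// exampleUseMiddleware wires the middleware above a standalone router.
func exampleUseMiddleware() http.Handler {
	return MiddlewareHandlerWith(New(), exampleHeaderMiddleware{})
}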
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import (
"bytes"
"crypto/md5"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"net/http"
"os"
"path"
"runtime"
"strings"
"github.com/cubefs/cubefs/blobstore/util/version"
)
// headers
const (
HeaderContentType = "Content-Type"
HeaderContentLength = "Content-Length"
HeaderContentRange = "Content-Range"
HeaderContentMD5 = "Content-MD5"
HeaderUA = "User-Agent"
// trace
HeaderTraceLog = "Trace-Log"
HeaderTraceTags = "Trace-Tags"
// crc checker
HeaderCrcEncoded = "X-Crc-Encoded"
HeaderAckCrcEncoded = "X-Ack-Crc-Encoded"
)
// mime
const (
MIMEStream = "application/octet-stream"
MIMEJSON = "application/json"
MIMEXML = "application/xml"
MIMEPlain = "text/plain"
MIMEPOSTForm = "application/x-www-form-urlencoded"
MIMEMultipartPOSTForm = "multipart/form-data"
MIMEYAML = "application/x-yaml"
)
// encoding
const (
GzipEncodingType = "gzip"
)
// UserAgent user agent
var UserAgent = "Golang blobstore/rpc package"
type (
// ValueGetter fills an argument's field from url values or http params.
ValueGetter func(string) string
// Parser is the interface implemented by argument types
// that can parse themselves from url.Values.
Parser interface {
Parse(ValueGetter) error
}
// priority of marshaler and unmarshaler (default is json).
// - - - - - - - - - - - - - - - - - - - - - - - -
// |        |  marshaler   |  unmarshaler    |
// | higher |              |                 |
// |   ^    | MarshalerTo  | UnmarshalerFrom |
// |   |    | Marshaler    | Unmarshaler     |
// |   |    | JSON Marshal | JSON Unmarshal  |
// | lower  |              |                 |
// - - - - - - - - - - - - - - - - - - - - - - - -
//
// Actions on RPC.
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// |      APP      |   Client    |    TCP    |   Server    |
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// | Request Type  | marshaler   | - - - - > | unmarshaler |
// |               |             |           |      |      |
// |               |             |           |      v      |
// | Response Type | unmarshaler | < - - - - | marshaler   |
// - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
// Marshaler is the interface implemented by types that
// can marshal themselves into bytes, second parameter
// is content type.
Marshaler interface {
Marshal() ([]byte, string, error)
}
// MarshalerTo is the interface implemented by types that
// can marshal themselves into writer, the first parameter
// is content type. (Not Recommended).
// The underlying writer is a *bytes.Buffer.
// Context.RespondWithReader is better than MarshalerTo on Server Side.
MarshalerTo interface {
MarshalTo(responseBody io.Writer) (string, error)
}
// Unmarshaler is the interface implemented by types
// that can unmarshal themselves from bytes.
Unmarshaler interface {
Unmarshal([]byte) error
}
// UnmarshalerFrom is the interface implemented by types
// that can unmarshal themselves from body reader.
// The body underlying implementation is a *io.LimitedReader.
UnmarshalerFrom interface {
UnmarshalFrom(requestBody io.Reader) error
}
// HTTPError interface of error with http status code
HTTPError interface {
// StatusCode http status code
StatusCode() int
// ErrorCode special defined code
ErrorCode() string
// Error detail message
Error() string
}
)
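// exampleRawBody is a minimal sketch (not part of the original code) of a request type
// implementing Marshaler, so marshalObj sends pre-encoded bytes with a custom content type.
type exampleRawBody struct{ data []byte }

// Marshal returns the raw payload and marks it as an octet stream.
func (b exampleRawBody) Marshal() ([]byte, string, error) {
	return b.data, MIMEStream, nil
}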
// ProgressHandler http progress handler
type ProgressHandler interface {
Handler(http.ResponseWriter, *http.Request, func(http.ResponseWriter, *http.Request))
}
// NoneBody means no body for request or response.
var NoneBody Marshaler = noneBody{}
type noneBody struct{}
func (noneBody) Marshal() ([]byte, string, error) {
return []byte{}, "", nil
}
type marshalledBody struct {
ContentLength int
ContentType string
Body io.Reader
}
func marshalObj(obj interface{}) (*marshalledBody, error) {
var (
buffer []byte
ct string = MIMEJSON
err error
)
if obj == nil {
buffer = jsonNull[:]
} else if o, ok := obj.(MarshalerTo); ok {
w := bytes.NewBuffer(nil)
ct, err = o.MarshalTo(w)
if err != nil {
return nil, err
}
return &marshalledBody{
ContentLength: w.Len(),
ContentType: ct,
Body: w,
}, nil
} else if o, ok := obj.(Marshaler); ok {
buffer, ct, err = o.Marshal()
} else {
buffer, err = json.Marshal(obj)
}
if err != nil {
return nil, err
}
return &marshalledBody{
ContentLength: len(buffer),
ContentType: ct,
Body: bytes.NewReader(buffer),
}, nil
}
func programVersion() string {
sp := strings.Fields(strings.TrimSpace(version.Version()))
if len(sp) == 0 || sp[0] == "develop" {
data, err := ioutil.ReadFile(os.Args[0])
if err != nil {
return "_"
}
return fmt.Sprintf("%x", md5.Sum(data))[:10]
}
if len(sp) > 10 {
return sp[0][:10]
}
return sp[0]
}
func init() {
hostname, _ := os.Hostname()
ua := fmt.Sprintf("%s/%s (%s/%s; %s) %s/%s",
path.Base(os.Args[0]),
programVersion(),
runtime.GOOS,
runtime.GOARCH,
runtime.Version(),
hostname,
fmt.Sprint(os.Getpid()),
)
if UserAgent != ua {
UserAgent = ua
}
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import (
"bytes"
"fmt"
"net"
"net/http"
"os"
"runtime"
"strings"
"github.com/cubefs/cubefs/blobstore/util/log"
)
// defaultRecovery logs the panic info, then re-panics to the next panic handler unless the connection is a broken pipe
func defaultRecovery(w http.ResponseWriter, req *http.Request, err interface{}) {
var brokenPipe bool
if ne, ok := err.(*net.OpError); ok {
if se, ok := ne.Err.(*os.SyscallError); ok {
if strings.Contains(strings.ToLower(se.Error()), "broken pipe") ||
strings.Contains(strings.ToLower(se.Error()), "connection reset by peer") {
brokenPipe = true
}
}
}
stack := stack(3)
if brokenPipe {
log.Warnf("handle panic: %s on broken pipe\n%s", err, stack)
} else {
log.Errorf("handle panic: %s\n%s", err, stack)
panic(err)
}
}
func stack(skip int) []byte {
buf := new(bytes.Buffer)
for i := skip; ; i++ {
pc, file, line, ok := runtime.Caller(i)
if !ok {
break
}
fmt.Fprintf(buf, "%s:%d (0x%x:%s)\n", file, line, pc, funcname(pc))
}
return buf.Bytes()
}
// funcname returns the name of the function at the given program counter
func funcname(pc uintptr) []byte {
fn := runtime.FuncForPC(pc)
if fn == nil {
return []byte("???")
}
name := []byte(fn.Name())
if last := bytes.LastIndex(name, []byte("/")); last >= 0 {
name = name[last+1:]
}
if first := bytes.Index(name, []byte(".")); first >= 0 {
name = name[first+1:]
}
return name
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import (
"net/http"
"reflect"
"runtime"
"github.com/julienschmidt/httprouter"
"github.com/cubefs/cubefs/blobstore/util/log"
)
type (
// Router is a router with interceptors.
// Middleware within this Router is called an `interceptor`
// (it is middleware for the http server, just named differently).
//
// headMiddlewares are the middlewares that run first of all.
// Running order is:
// headMiddlewares --> middlewares --> interceptors --> handler
//
// example:
// router := New()
// router.Use(interceptor1, interceptor2)
// router.Handle(http.MethodGet, "/get/:name", handlerGet)
// router.Handle(http.MethodPut, "/put/:name", handlerPut)
Router struct {
Router *httprouter.Router // router
hasMiddleware bool // true if the router is wrapped by Middleware*
headMiddlewares []ProgressHandler // middlewares that run first of all
headHandler http.HandlerFunc // handler to run when there is no middleware
interceptors []HandlerFunc // interceptors after middlewares
}
)
// ServeHTTP makes the router implement the http.Handler interface.
func (r *Router) ServeHTTP(w http.ResponseWriter, req *http.Request) {
if !r.hasMiddleware {
r.headHandler(w, req)
return
}
r.Router.ServeHTTP(w, req)
}
// DefaultRouter default router for server
var DefaultRouter *Router
func init() {
initDefaultRouter()
}
func initDefaultRouter() {
DefaultRouter = New()
DefaultRouter.Router.PanicHandler = defaultRecovery
}
// New is an alias of httprouter.New.
// It returns a Router that you control yourself.
func New() *Router {
r := &Router{
Router: httprouter.New(),
hasMiddleware: false,
headMiddlewares: []ProgressHandler{&crcDecoder{}},
}
r.headHandler = buildHTTPHandler(r.Router.ServeHTTP, r.headMiddlewares...)
return r
}
// Use attaches a global interceptor to the router.
// You should Use interceptors before registering handlers.
// Interceptors run in the order they were registered.
func (r *Router) Use(interceptors ...HandlerFunc) {
if len(r.interceptors)+len(interceptors) >= int(abortIndex) {
panic("too many registered handlers")
}
r.interceptors = append(r.interceptors, interceptors...)
}
// Handle registers a new request handle with the given path and method.
//
// For HEAD, GET, POST, PUT, PATCH and DELETE requests the respective shortcut
// functions can be used.
func (r *Router) Handle(method, path string, handler HandlerFunc, opts ...ServerOption) {
// Notice: in Go, after `sliceA := append(sliceB, item)`,
// sliceA may share the underlying array with sliceB if sliceB has enough capacity,
// so we make a new slice here.
handlers := make([]HandlerFunc, 0, len(r.interceptors)+1)
handlers = append(handlers, r.interceptors...)
handlers = append(handlers, handler)
if len(handlers) >= int(abortIndex) {
panic("too many registered handlers")
}
r.Router.Handle(method, path, makeHandler(handlers, opts...))
opt := new(serverOptions)
for _, o := range opts {
o.apply(opt)
}
icnames := make([]string, 0, len(r.interceptors))
for _, ic := range r.interceptors {
icnames = append(icnames, runtime.FuncForPC(reflect.ValueOf(ic).Pointer()).Name())
}
name := runtime.FuncForPC(reflect.ValueOf(handler).Pointer()).Name()
log.Infof("register handler method:%s, path:%s, interceptors:%s, handler:%s, opts:%+v",
method, path, icnames, name, opt)
}
// Use attaches a global interceptor to the default router.
// You should Use interceptors before registering handlers.
// Interceptors run in the order they were registered.
func Use(interceptors ...HandlerFunc) {
DefaultRouter.interceptors = append(DefaultRouter.interceptors, interceptors...)
}
// HEAD is a shortcut for Handle(http.MethodHead, path, handle)
func HEAD(path string, handler HandlerFunc, opts ...ServerOption) {
Handle(http.MethodHead, path, handler, opts...)
}
// GET is a shortcut for Handle(http.MethodGet, path, handle)
func GET(path string, handler HandlerFunc, opts ...ServerOption) {
Handle(http.MethodGet, path, handler, opts...)
}
// POST is a shortcut for Handle(http.MethodPost, path, handle)
func POST(path string, handler HandlerFunc, opts ...ServerOption) {
Handle(http.MethodPost, path, handler, opts...)
}
// PUT is a shortcut for Handle(http.MethodPut, path, handle)
func PUT(path string, handler HandlerFunc, opts ...ServerOption) {
Handle(http.MethodPut, path, handler, opts...)
}
// DELETE is a shortcut for Handle(http.MethodDelete, path, handle)
func DELETE(path string, handler HandlerFunc, opts ...ServerOption) {
Handle(http.MethodDelete, path, handler, opts...)
}
// OPTIONS is a shortcut for Handle(http.MethodOptions, path, handle)
func OPTIONS(path string, handler HandlerFunc, opts ...ServerOption) {
Handle(http.MethodOptions, path, handler, opts...)
}
// PATCH is a shortcut for Handle(http.MethodPatch, path, handle)
func PATCH(path string, handler HandlerFunc, opts ...ServerOption) {
Handle(http.MethodPatch, path, handler, opts...)
}
// Handle registers a new request handle with the given path and method.
//
// For HEAD, GET, POST, PUT, PATCH and DELETE requests the respective shortcut
// functions can be used.
func Handle(method, path string, handler HandlerFunc, opts ...ServerOption) {
DefaultRouter.Handle(method, path, handler, opts...)
}
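// exampleRegister is a minimal sketch (not part of the original code) of registering a
// handler on the default router; the "/ping/:name" path is an assumption.
func exampleRegister() {
	GET("/ping/:name", func(c *Context) {
		// Param is filled by httprouter from the ":name" segment.
		c.RespondJSON(map[string]string{"name": c.Param.ByName("name")})
	})
}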
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import (
"context"
"math/rand"
"sync"
"time"
)
// hostItem Host information
type hostItem struct {
rawHost string
// The last time the host failed
lastFailedTime int64
// The number of times the host can retry
retryTimes int
// Whether the host is a backup host
isBackup bool
sync.RWMutex
}
type Selector interface {
// GetAvailableHosts get the available hosts
GetAvailableHosts() []string
// SetFail marks the host as failed
SetFail(string)
// Close stops the background goroutine (if any) that re-enables broken hosts
Close()
}
// allocate hostItem to request
type selector struct {
// normal hosts
hosts []*hostItem
// broken hosts
crackHosts map[*hostItem]interface{}
// backup hosts
backupHost []*hostItem
hostMap map[string]*hostItem
// the number of failures a host may accumulate before it is marked as broken (initialized from HostTryTimes)
hostTryTimes int
// retry interval after host failure, after this time, the host can be remarked as available
failRetryIntervalS int
// the time window for counting failures; if exceeded, retryTimes is reset and lastFailedTime is set to the current time
maxFailsPeriodS int
sync.RWMutex
cancelDetectionGoroutine context.CancelFunc
}
func newSelector(cfg *LbConfig) Selector {
ctx, cancelFunc := context.WithCancel(context.Background())
rand.Seed(time.Now().UnixNano())
s := &selector{
hosts: initHost(cfg.Hosts, cfg, false),
backupHost: initHost(cfg.BackupHosts, cfg, true),
hostTryTimes: cfg.HostTryTimes,
failRetryIntervalS: cfg.FailRetryIntervalS,
crackHosts: map[*hostItem]interface{}{},
hostMap: map[string]*hostItem{},
cancelDetectionGoroutine: cancelFunc,
maxFailsPeriodS: cfg.MaxFailsPeriodS,
}
s.initHostMap()
if cfg.FailRetryIntervalS < 0 {
return s
}
go func() {
s.detectAvailableHostInBack()
ticker := time.NewTicker(time.Duration(s.failRetryIntervalS) * time.Second)
defer ticker.Stop()
for {
select {
case <-ticker.C:
s.detectAvailableHostInBack()
case <-ctx.Done():
return
}
}
}()
return s
}
// GetAvailableHosts return available hosts from hosts and backupHost
func (s *selector) GetAvailableHosts() (hosts []string) {
s.RLock()
hostLen := len(s.hosts)
length := len(s.hosts) + len(s.backupHost)
hosts = make([]string, length)
for index, host := range s.hosts {
hosts[index] = host.rawHost
}
for index, host := range s.backupHost {
hosts[index+hostLen] = host.rawHost
}
s.RUnlock()
randomShuffle(hosts, hostLen)
return
}
// SetFail decreases the retryTimes of the hostItem and disables the host when retries run out
func (s *selector) SetFail(host string) {
if s.failRetryIntervalS < 0 {
return
}
item := s.hostMap[host]
item.Lock()
now := time.Now().Unix()
// init last failed time
if item.lastFailedTime == 0 {
item.lastFailedTime = now
}
// update last failed time
if now-item.lastFailedTime >= int64(s.maxFailsPeriodS) {
item.retryTimes = s.hostTryTimes
item.lastFailedTime = now
}
item.retryTimes -= 1
if item.retryTimes > 0 {
item.Unlock()
return
}
item.Unlock()
s.disableHost(item)
}
// detectAvailableHostInBack re-enables recovered hosts from crackHosts
func (s *selector) detectAvailableHostInBack() {
var cache []*hostItem
s.RLock()
for key := range s.crackHosts {
cache = append(cache, key)
}
s.RUnlock()
for _, hItem := range cache {
hItem.Lock()
now := time.Now().Unix()
if now-hItem.lastFailedTime >= int64(s.failRetryIntervalS) {
hItem.retryTimes = s.hostTryTimes
hItem.lastFailedTime = 0
hItem.Unlock()
s.enableHost(hItem)
continue
}
hItem.Unlock()
}
}
func initHost(hosts []string, cfg *LbConfig, isBackup bool) (hs []*hostItem) {
for _, host := range hosts {
hs = append(hs, &hostItem{
retryTimes: cfg.HostTryTimes,
rawHost: host,
isBackup: isBackup,
})
}
return
}
func (s *selector) initHostMap() {
for _, item := range s.hosts {
s.hostMap[item.rawHost] = item
}
for _, item := range s.backupHost {
s.hostMap[item.rawHost] = item
}
}
// shuffle the hosts for load balancing; normal hosts and backup hosts are shuffled separately
func randomShuffle(hosts []string, length int) {
for i := length; i > 0; i-- {
lastIdx := i - 1
idx := rand.Intn(i)
hosts[lastIdx], hosts[idx] = hosts[idx], hosts[lastIdx]
}
for i := len(hosts); i > length; i-- {
lastIdx := i - 1
idx := rand.Intn(i-length) + length
hosts[lastIdx], hosts[idx] = hosts[idx], hosts[lastIdx]
}
}
// add unavailable host from hosts or backupHost into crackHosts
func (s *selector) disableHost(item *hostItem) {
s.Lock()
defer s.Unlock()
s.crackHosts[item] = struct{}{}
index := 0
var temp *[]*hostItem
if item.isBackup {
temp = &s.backupHost
} else {
temp = &s.hosts
}
for ; index < len(*temp); index++ {
if item == (*temp)[index] {
if index == len(*temp)-1 {
*temp = (*temp)[:index]
return
}
*temp = append((*temp)[:index], (*temp)[index+1:]...)
return
}
}
}
// enableHost add available host from crackHosts into backupHost or hosts
func (s *selector) enableHost(hItem *hostItem) {
s.Lock()
defer s.Unlock()
delete(s.crackHosts, hItem)
if hItem.isBackup {
s.backupHost = append(s.backupHost, hItem)
return
}
s.hosts = append(s.hosts, hItem)
}
func (s *selector) Close() {
s.cancelDetectionGoroutine()
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import (
"net/http"
"github.com/julienschmidt/httprouter"
)
type (
// HandlerFunc defines the handler of app function
HandlerFunc func(*Context)
// ServerOption server option applier
// Order: if args are in the body, other options are ignored,
// else uri > query > form > postform
ServerOption interface {
apply(*serverOptions)
}
serverOptions struct {
argsBody bool
argsURI bool
argsQuery bool
argsForm bool
argsPostForm bool
metaCapacity int
}
funcServerOption struct {
f func(*serverOptions)
}
)
func (so *serverOptions) copy() *serverOptions {
return &serverOptions{
argsBody: so.argsBody,
argsURI: so.argsURI,
argsQuery: so.argsQuery,
argsForm: so.argsForm,
argsPostForm: so.argsPostForm,
metaCapacity: so.metaCapacity,
}
}
func (so *serverOptions) hasArgs() bool {
return so.argsBody || so.argsURI || so.argsQuery || so.argsForm || so.argsPostForm
}
func (fo *funcServerOption) apply(f *serverOptions) {
fo.f(f)
}
func newFuncServerOption(f func(*serverOptions)) *funcServerOption {
return &funcServerOption{
f: f,
}
}
// OptArgsBody argument in request body
func OptArgsBody() ServerOption {
return newFuncServerOption(func(o *serverOptions) {
o.argsBody = true
})
}
// OptArgsURI argument in uri
func OptArgsURI() ServerOption {
return newFuncServerOption(func(o *serverOptions) {
o.argsURI = true
})
}
// OptArgsQuery argument in query string
func OptArgsQuery() ServerOption {
return newFuncServerOption(func(o *serverOptions) {
o.argsQuery = true
})
}
// OptArgsForm argument in form
func OptArgsForm() ServerOption {
return newFuncServerOption(func(o *serverOptions) {
o.argsForm = true
})
}
// OptArgsPostForm argument in post form
func OptArgsPostForm() ServerOption {
return newFuncServerOption(func(o *serverOptions) {
o.argsPostForm = true
})
}
// OptMetaCapacity initial meta capacity
func OptMetaCapacity(capacity int) ServerOption {
return newFuncServerOption(func(o *serverOptions) {
if capacity >= 0 {
o.metaCapacity = capacity
}
})
}
// makeHandler make handle of httprouter
func makeHandler(handlers []HandlerFunc, opts ...ServerOption) httprouter.Handle {
opt := new(serverOptions)
for _, o := range opts {
o.apply(opt)
}
return func(w http.ResponseWriter, r *http.Request, ps httprouter.Params) {
c := &Context{
opts: opt,
Param: ps,
Request: r,
Writer: w,
Meta: make(map[string]interface{}, opt.metaCapacity),
index: -1,
handlers: handlers,
}
c.Next()
if !c.wroteHeader {
c.RespondStatus(http.StatusOK)
}
}
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package rpc
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
urllib "net/url"
"strings"
"time"
"github.com/cubefs/cubefs/blobstore/common/crc32block"
"github.com/cubefs/cubefs/blobstore/common/trace"
"github.com/cubefs/cubefs/blobstore/util/bytespool"
"github.com/cubefs/cubefs/blobstore/util/errors"
)
// Config simple client config
type Config struct {
// the whole request and response timeout
ClientTimeoutMs int64 `json:"client_timeout_ms"`
// bandwidth in MB/s for reading the body
BodyBandwidthMBPs float64 `json:"body_bandwidth_mbps"`
// base timeout for read body
BodyBaseTimeoutMs int64 `json:"body_base_timeout_ms"`
// transport config
Tc TransportConfig `json:"transport_config"`
}
// ErrBodyReadTimeout timeout error
var ErrBodyReadTimeout = errors.New("read body timeout")
// Option client options
type Option func(req *http.Request)
// WithCrcEncode request with crc32 encode
func WithCrcEncode() Option {
return func(req *http.Request) {
req.Header.Set(HeaderCrcEncoded, "1")
// the crc32block encoder does not support a nil reader
if req.ContentLength > 0 && req.Body != nil {
encoder := crc32block.NewBodyEncoder(req.Body)
req.Body = encoder
if bodyGetter := req.GetBody; bodyGetter != nil {
req.GetBody = func() (io.ReadCloser, error) {
body, err := bodyGetter()
return crc32block.NewBodyEncoder(body), err
}
}
req.ContentLength = encoder.CodeSize(req.ContentLength)
}
}
}
// Client implements the rpc client with http
type Client interface {
// Method*** handle response by yourself
Do(ctx context.Context, req *http.Request) (*http.Response, error)
Head(ctx context.Context, url string) (*http.Response, error)
Get(ctx context.Context, url string) (*http.Response, error)
Delete(ctx context.Context, url string) (*http.Response, error)
Form(ctx context.Context, method, url string, form map[string][]string) (*http.Response, error)
Put(ctx context.Context, url string, params interface{}) (*http.Response, error)
Post(ctx context.Context, url string, params interface{}) (*http.Response, error)
// ***With means parse result in client
DoWith(ctx context.Context, req *http.Request, ret interface{}, opts ...Option) error
GetWith(ctx context.Context, url string, ret interface{}) error
PutWith(ctx context.Context, url string, ret interface{}, params interface{}, opts ...Option) error
PostWith(ctx context.Context, url string, ret interface{}, params interface{}, opts ...Option) error
// Close background goroutines in lb client
Close()
}
type client struct {
client *http.Client
bandwidthBPMs int64 // bytes per millisecond, used for reading the body
bodyBaseTimeoutMs int64 // base timeout in ms for reading the body
}
// NewClient returns a rpc client
func NewClient(cfg *Config) Client {
if cfg == nil {
cfg = &Config{}
}
cfg.Tc = cfg.Tc.Default()
if cfg.BodyBaseTimeoutMs == 0 {
cfg.BodyBaseTimeoutMs = 30 * 1e3
}
return &client{
client: &http.Client{
Transport: NewTransport(&cfg.Tc),
Timeout: time.Duration(cfg.ClientTimeoutMs) * time.Millisecond,
},
bandwidthBPMs: int64(cfg.BodyBandwidthMBPs * (1 << 20) / 1e3),
bodyBaseTimeoutMs: cfg.BodyBaseTimeoutMs,
}
}
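// exampleSimpleClient is a minimal sketch (not part of the original code) of a simple
// client call; the address, path, argument and result types are assumptions.
func exampleSimpleClient() error {
	cli := NewClient(&Config{
		ClientTimeoutMs:   10 * 1000,
		BodyBandwidthMBPs: 10,
	})
	defer cli.Close()

	args := struct {
		Name string `json:"name"`
	}{Name: "demo"}
	var ret struct {
		ID uint64 `json:"id"`
	}
	// PostWith marshals args as JSON, sends the request with a crc-encoded body,
	// and decodes the JSON response into ret.
	return cli.PostWith(context.Background(), "http://127.0.0.1:9500/create", &ret, args, WithCrcEncode())
}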
func (c *client) Form(ctx context.Context, method, url string, form map[string][]string) (resp *http.Response, err error) {
body := urllib.Values(form).Encode()
request, err := http.NewRequest(method, url, strings.NewReader(body))
if err != nil {
return
}
return c.Do(ctx, request)
}
func (c *client) Put(ctx context.Context, url string, params interface{}) (resp *http.Response, err error) {
body, err := marshalObj(params)
if err != nil {
return
}
request, err := http.NewRequest(http.MethodPut, url, body.Body)
if err != nil {
return
}
request.Header.Set(HeaderContentType, body.ContentType)
return c.Do(ctx, request)
}
func (c *client) Post(ctx context.Context, url string, params interface{}) (resp *http.Response, err error) {
body, err := marshalObj(params)
if err != nil {
return nil, err
}
request, err := http.NewRequest(http.MethodPost, url, body.Body)
if err != nil {
return nil, err
}
request.Header.Set(HeaderContentType, body.ContentType)
return c.Do(ctx, request)
}
func (c *client) DoWith(ctx context.Context, req *http.Request, ret interface{}, opts ...Option) error {
for _, opt := range opts {
opt(req)
}
resp, err := c.Do(ctx, req)
if err != nil {
return err
}
defer resp.Body.Close()
err = serverCrcEncodeCheck(ctx, req, resp)
if err != nil {
return err
}
return ParseData(resp, ret)
}
func (c *client) GetWith(ctx context.Context, url string, ret interface{}) error {
resp, err := c.Get(ctx, url)
if err != nil {
return err
}
return parseData(resp, ret)
}
func (c *client) PutWith(ctx context.Context, url string, ret interface{}, params interface{}, opts ...Option) (err error) {
body, err := marshalObj(params)
if err != nil {
return
}
request, err := http.NewRequest(http.MethodPut, url, body.Body)
if err != nil {
return
}
request.Header.Set(HeaderContentType, body.ContentType)
for _, opt := range opts {
opt(request)
}
resp, err := c.Do(ctx, request)
if err != nil {
return
}
defer resp.Body.Close()
err = serverCrcEncodeCheck(ctx, request, resp)
if err != nil {
return err
}
return ParseData(resp, ret)
}
func (c *client) PostWith(ctx context.Context, url string, ret interface{}, params interface{}, opts ...Option) error {
body, err := marshalObj(params)
if err != nil {
return err
}
request, err := http.NewRequest(http.MethodPost, url, body.Body)
if err != nil {
return err
}
request.Header.Set(HeaderContentType, body.ContentType)
for _, opt := range opts {
opt(request)
}
resp, err := c.Do(ctx, request)
if err != nil {
return err
}
defer resp.Body.Close()
err = serverCrcEncodeCheck(ctx, request, resp)
if err != nil {
return err
}
return ParseData(resp, ret)
}
func (c *client) Head(ctx context.Context, url string) (resp *http.Response, err error) {
req, err := http.NewRequest(http.MethodHead, url, nil)
if err != nil {
return
}
return c.Do(ctx, req)
}
func (c *client) Get(ctx context.Context, url string) (resp *http.Response, err error) {
req, err := http.NewRequest(http.MethodGet, url, nil)
if err != nil {
return
}
return c.Do(ctx, req)
}
func (c *client) Delete(ctx context.Context, url string) (resp *http.Response, err error) {
req, err := http.NewRequest(http.MethodDelete, url, nil)
if err != nil {
return
}
return c.Do(ctx, req)
}
func (c *client) Do(ctx context.Context, req *http.Request) (*http.Response, error) {
if req.Header.Get(HeaderUA) == "" {
req.Header.Set(HeaderUA, UserAgent)
}
span := trace.SpanFromContextSafe(ctx)
err := trace.InjectWithHTTPHeader(ctx, req)
if err != nil {
span.Errorf("inject failed, %v", err)
}
resp, err := c.doWithCtx(ctx, req)
if err != nil {
return resp, err
}
header := resp.Header
traceLog := header[HeaderTraceLog]
if len(traceLog) > 0 {
span.AppendRPCTrackLog([]string{strings.Join(traceLog, ";")})
}
return resp, err
}
func (c *client) Close() {
// Do nothing to close.
}
func (c *client) doWithCtx(ctx context.Context, req *http.Request) (resp *http.Response, err error) {
span := trace.SpanFromContextSafe(ctx)
req = req.WithContext(ctx)
if c.bandwidthBPMs > 0 && req.Body != nil {
t := req.ContentLength/c.bandwidthBPMs + c.bodyBaseTimeoutMs
req.Body = &timeoutReadCloser{timeoutMs: t, body: req.Body}
}
resp, err = c.client.Do(req)
if err != nil {
span.Warnf("do request to %s failed, error: %s", req.URL, err.Error())
return
}
if c.bandwidthBPMs > 0 {
t := resp.ContentLength/c.bandwidthBPMs + c.bodyBaseTimeoutMs
resp.Body = &timeoutReadCloser{timeoutMs: t, body: resp.Body}
}
return
}
// parseData closes the response body inside this package.
func parseData(resp *http.Response, data interface{}) (err error) {
defer resp.Body.Close()
return ParseData(resp, data)
}
// ParseData parse response with data, close response body by yourself.
func ParseData(resp *http.Response, data interface{}) (err error) {
if resp.StatusCode/100 == 2 {
size := resp.ContentLength
if data != nil && size != 0 {
if d, ok := data.(UnmarshalerFrom); ok {
return d.UnmarshalFrom(io.LimitReader(resp.Body, size))
}
if d, ok := data.(Unmarshaler); ok {
buf := bytespool.Alloc(int(size))
defer bytespool.Free(buf)
if _, err = io.ReadFull(resp.Body, buf); err != nil {
return NewError(resp.StatusCode, "ReadResponse", err)
}
return d.Unmarshal(buf)
}
if err := json.NewDecoder(resp.Body).Decode(data); err != nil {
return NewError(resp.StatusCode, "JSONDecode", err)
}
}
if resp.StatusCode == 200 {
return nil
}
return NewError(resp.StatusCode, "", err)
}
return ParseResponseErr(resp)
}
// ParseResponseErr parse error of response
func ParseResponseErr(resp *http.Response) (err error) {
// wrap the error with HTTPError when the StatusCode is not 2XX
if resp.StatusCode > 299 && resp.ContentLength != 0 {
errR := &errorResponse{}
if err := json.NewDecoder(resp.Body).Decode(errR); err != nil {
return NewError(resp.StatusCode, resp.Status, nil)
}
err = NewError(resp.StatusCode, errR.Code, errors.New(errR.Error))
return
}
return NewError(resp.StatusCode, resp.Status, nil)
}
type timeoutReadCloser struct {
body io.ReadCloser
timeoutMs int64
}
func (tr *timeoutReadCloser) Close() (err error) {
return tr.body.Close()
}
func (tr *timeoutReadCloser) Read(p []byte) (n int, err error) {
readOk := make(chan struct{})
if tr.timeoutMs > 0 {
startTime := time.Now().UnixNano() / 1e6
after := time.After(time.Millisecond * time.Duration(tr.timeoutMs))
go func() {
n, err = tr.body.Read(p)
close(readOk)
}()
select {
case <-readOk:
// subtract the time actually spent on this read
tr.timeoutMs = tr.timeoutMs - (time.Now().UnixNano()/1e6 - startTime)
return
case <-after:
tr.body.Close()
return 0, ErrBodyReadTimeout
}
}
tr.body.Close()
return 0, ErrBodyReadTimeout
}
func serverCrcEncodeCheck(ctx context.Context, request *http.Request, resp *http.Response) (err error) {
// check that the server acknowledged the crc-encoded body and log an error if not
if request.Header.Get(HeaderCrcEncoded) != "" && resp.Header.Get(HeaderAckCrcEncoded) == "" {
msg := fmt.Sprintf("server do not ack that body has been crc encoded, url:%v", request.URL)
trace.SpanFromContextSafe(ctx).Error(msg)
return NewError(http.StatusNotImplemented, "resp.Status", errors.New(msg))
}
return nil
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package trace
import (
"strconv"
"strings"
"github.com/opentracing/opentracing-go"
)
const (
prefixTracer = "blobstore-tracer-"
tracerFieldCount = 2
)
// Propagation uses these keys.
// Define your own keys by setting these variables before your application starts.
var (
RequestIDKey = "X-Reqid"
PrefixBaggage = "blobstore-baggage-"
FieldKeyTraceID = prefixTracer + "traceid"
FieldKeySpanID = prefixTracer + "spanid"
)
var (
// ErrUnsupportedFormat is the alias of opentracing.ErrUnsupportedFormat.
ErrUnsupportedFormat = opentracing.ErrUnsupportedFormat
// ErrSpanContextNotFound is the alias of opentracing.ErrSpanContextNotFound.
ErrSpanContextNotFound = opentracing.ErrSpanContextNotFound
// ErrInvalidSpanContext is the alias of opentracing.ErrInvalidSpanContext.
ErrInvalidSpanContext = opentracing.ErrInvalidSpanContext
// ErrInvalidCarrier is the alias of opentracing.ErrInvalidCarrier.
ErrInvalidCarrier = opentracing.ErrInvalidCarrier
// ErrSpanContextCorrupted is the alias of opentracing.ErrSpanContextCorrupted.
ErrSpanContextCorrupted = opentracing.ErrSpanContextCorrupted
)
const (
// Binary is the alias of opentracing.Binary.
Binary = opentracing.Binary
// TextMap is the alias of opentracing.TextMap.
TextMap = opentracing.TextMap
// HTTPHeaders is the alias of opentracing.HTTPHeaders.
HTTPHeaders = opentracing.HTTPHeaders
)
// TextMapCarrier is the alias of opentracing.TextMapCarrier.
type TextMapCarrier = opentracing.TextMapCarrier
// HTTPHeadersCarrier is the alias of opentracing.HTTPHeadersCarrier.
type HTTPHeadersCarrier = opentracing.HTTPHeadersCarrier
// TextMapPropagator is a combined Injector and Extractor for TextMap format.
type TextMapPropagator struct{}
var defaultTexMapPropagator = TextMapPropagator{}
// Inject implements Injector of TextMapPropagator
func (t *TextMapPropagator) Inject(sc *SpanContext, carrier interface{}) error {
writer, ok := carrier.(opentracing.TextMapWriter)
if !ok {
return ErrInvalidCarrier
}
writer.Set(FieldKeyTraceID, sc.traceID)
writer.Set(FieldKeySpanID, sc.spanID.String())
sc.ForeachBaggageItems(func(k string, v []string) bool {
if k != internalTrackLogKey { // internal baggage will not inject
writer.Set(PrefixBaggage+k, strings.Join(v, ","))
}
return true
})
return nil
}
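// exampleInject is a minimal sketch (not part of the original code): it injects a span
// context into a plain opentracing.TextMapCarrier, which is how the trace id and span id
// travel across process boundaries.
func exampleInject(sc *SpanContext) (opentracing.TextMapCarrier, error) {
	carrier := opentracing.TextMapCarrier{}
	err := defaultTexMapPropagator.Inject(sc, carrier)
	return carrier, err
}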
// Extract implements Extractor of TextMapPropagator.
func (t *TextMapPropagator) Extract(carrier interface{}) (opentracing.SpanContext, error) {
reader, ok := carrier.(opentracing.TextMapReader)
if !ok {
return nil, ErrInvalidCarrier
}
var (
traceID string
spanID ID
baggage = make(map[string][]string)
fieldCount int
err error
)
err = reader.ForeachKey(func(key, val string) error {
switch strings.ToLower(key) {
case FieldKeyTraceID:
traceID = val
fieldCount++
case FieldKeySpanID:
id, err := strconv.ParseUint(val, 16, 64)
if err != nil {
return ErrSpanContextCorrupted
}
spanID = ID(id)
fieldCount++
default:
lowerKey := strings.ToLower(key)
if strings.HasPrefix(lowerKey, PrefixBaggage) {
k := strings.TrimPrefix(lowerKey, PrefixBaggage)
baggage[k] = append(baggage[k], val)
}
}
return nil
})
if err != nil {
return nil, err
}
if fieldCount == 0 {
return nil, ErrSpanContextNotFound
}
if fieldCount < tracerFieldCount {
return nil, ErrSpanContextCorrupted
}
return &SpanContext{
traceID: traceID,
spanID: spanID,
baggage: baggage,
}, nil
}
// GetTraceIDKey returns http header name of traceid
func GetTraceIDKey() string {
return FieldKeyTraceID
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package trace
import (
"fmt"
"os"
"strconv"
"strings"
"sync"
"time"
"github.com/opentracing/opentracing-go"
ptlog "github.com/opentracing/opentracing-go/log"
"github.com/cubefs/cubefs/blobstore/util/log"
)
const (
maxErrorLen = 32
)
// Span extends opentracing.Span
type Span interface {
opentracing.Span
// OperationName allows retrieving current operation name.
OperationName() string
// WithOperation returns a span whose operation name recursively extends the current one.
WithOperation(operation string) Span
// Tags returns tags for span
Tags() Tags
// Logs returns micro logs for span
Logs() []opentracing.LogRecord
// String returns traceID:spanID.
String() string
// TraceID returns traceID
TraceID() string
// AppendRPCTrackLog appends RPC track logs to baggage with default key fieldTrackLogKey.
AppendRPCTrackLog(logs []string)
// AppendTrackLog records cost time with startTime (duration=time.Since(startTime)) for a calling to a module and
// appends to baggage with default key fieldTrackLogKey.
AppendTrackLog(module string, startTime time.Time, err error, opts ...SpanOption)
// AppendTrackLogWithDuration records cost time with duration for a calling to a module and
// appends to baggage with default key fieldTrackLogKey.
AppendTrackLogWithDuration(module string, duration time.Duration, err error, opts ...SpanOption)
// AppendTrackLogWithFunc records cost time for the function calling to a module and
// appends to baggage with default key fieldTrackLogKey.
AppendTrackLogWithFunc(module string, fn func() error, opts ...SpanOption)
// TrackLog returns track log, calls BaggageItem with default key fieldTrackLogKey.
TrackLog() []string
// BaseLogger defines interface of application log apis.
log.BaseLogger
}
// spanImpl implements Span
type spanImpl struct {
operationName string
tracer *Tracer
context *SpanContext
startTime time.Time
duration time.Duration
tags Tags
logs []opentracing.LogRecord
// rootSpan, if true, indicates that this span is the root of the (sub)tree
// of spans and parentID is empty.
rootSpan bool
// references for this span
references []opentracing.SpanReference
rw sync.RWMutex
}
// Finish implements opentracing.Span API
func (s *spanImpl) Finish() {
s.FinishWithOptions(opentracing.FinishOptions{})
}
// FinishWithOptions implements opentracing.Span API
func (s *spanImpl) FinishWithOptions(opts opentracing.FinishOptions) {
finishTime := opts.FinishTime
if finishTime.IsZero() {
finishTime = time.Now()
}
s.duration = finishTime.Sub(s.startTime)
s.rw.Lock()
defer s.rw.Unlock()
s.logs = append(s.logs, opts.LogRecords...)
for _, ld := range opts.BulkLogData {
s.logs = append(s.logs, ld.ToLogRecord())
}
// TODO report span
}
// Context implements opentracing.Span API
func (s *spanImpl) Context() opentracing.SpanContext {
s.rw.RLock()
defer s.rw.RUnlock()
return s.context
}
// OperationName returns operationName for span
func (s *spanImpl) OperationName() string {
s.rw.RLock()
defer s.rw.RUnlock()
return s.operationName
}
// SetOperationName implements opentracing.Span API
func (s *spanImpl) SetOperationName(operationName string) opentracing.Span {
s.rw.Lock()
defer s.rw.Unlock()
s.operationName = operationName
return s
}
func (s *spanImpl) WithOperation(operation string) Span {
op := s.OperationName()
if len(op) > 0 {
if len(operation) > 0 {
op = fmt.Sprintf("%s:%s", op, operation)
}
} else {
op = operation
}
return &operationSpan{
Span: s,
operation: op,
}
}
// LogFields implements opentracing.Span API
func (s *spanImpl) LogFields(fields ...ptlog.Field) {
s.rw.Lock()
defer s.rw.Unlock()
s.logs = append(s.logs, opentracing.LogRecord{
Fields: fields,
Timestamp: time.Now(),
})
}
// LogKV implements opentracing.Span API
func (s *spanImpl) LogKV(keyValues ...interface{}) {
fields, err := ptlog.InterleavedKVToFields(keyValues...)
if err != nil {
s.LogFields(ptlog.Error(err), ptlog.String("function", "LogKV"))
return
}
s.LogFields(fields...)
}
// SetBaggageItem implements opentracing.Span API
func (s *spanImpl) SetBaggageItem(key, value string) opentracing.Span {
for _, ref := range s.references {
spanCtx, ok := ref.ReferencedContext.(*SpanContext)
if !ok {
continue
}
spanCtx.setBaggageItem(key, []string{value})
}
s.context.setBaggageItem(key, []string{value})
return s
}
// BaggageItem implements opentracing.Span API
func (s *spanImpl) BaggageItem(key string) string {
return strings.Join(s.context.baggageItem(key), ",")
}
// Tracer implements opentracing.Span API
func (s *spanImpl) Tracer() opentracing.Tracer {
return s.tracer
}
// SetTag implements opentracing.Span API
func (s *spanImpl) SetTag(key string, value interface{}) opentracing.Span {
s.rw.Lock()
defer s.rw.Unlock()
if s.tags == nil {
s.tags = Tags{}
}
s.tags[key] = value
return s
}
// Deprecated: use LogFields or LogKV (not implements)
func (s *spanImpl) LogEvent(event string) {
// Deprecated: kept empty to satisfy the opentracing.Span interface; use LogFields or LogKV.
}
// Deprecated: use LogFields or LogKV (not implements)
func (s *spanImpl) LogEventWithPayload(event string, payload interface{}) {
// Deprecated: kept empty to satisfy the opentracing.Span interface; use LogFields or LogKV.
}
// Deprecated: use LogFields or LogKV (not implements)
func (s *spanImpl) Log(data opentracing.LogData) {
// Deprecated: kept empty to satisfy the opentracing.Span interface; use LogFields or LogKV.
}
// Tags returns tags for span
func (s *spanImpl) Tags() Tags {
s.rw.RLock()
defer s.rw.RUnlock()
// copy
tags := make(map[string]interface{}, len(s.tags))
for key, value := range s.tags {
tags[key] = value
}
return tags
}
// Logs returns micro logs for span
func (s *spanImpl) Logs() []opentracing.LogRecord {
s.rw.RLock()
defer s.rw.RUnlock()
return s.logs
}
// AppendTrackLog records the cost time with startTime (duration = time.Since(startTime)) for a call to a module and
// appends it to baggage with the default key fieldTrackLogKey.
func (s *spanImpl) AppendTrackLog(module string, startTime time.Time, err error, opts ...SpanOption) {
s.AppendTrackLogWithDuration(module, time.Since(startTime), err, opts...)
}
// AppendTrackLogWithDuration records the cost time with duration for a call to a module and
// appends it to baggage with the default key fieldTrackLogKey.
func (s *spanImpl) AppendTrackLogWithDuration(module string, duration time.Duration, err error, opts ...SpanOption) {
spanOpt := &spanOptions{duration: durationMs, errorLength: maxErrorLen} // compatibility
for _, opt := range opts {
opt(spanOpt)
}
if spanOpt.duration == durationAny {
module += ":" + duration.String()
} else if dur := spanOpt.duration.Value(duration); dur > 0 {
module += ":" + strconv.FormatInt(dur, 10)
if spanOpt.durationUnit {
module += spanOpt.duration.Unit(duration)
}
}
if err != nil {
msg := err.Error()
errLen := spanOpt.errorLength
if len(msg) > errLen {
msg = msg[:errLen]
}
module += "/" + msg
}
s.track(module)
}
// AppendTrackLogWithFunc records the cost time of calling fn for a module.
func (s *spanImpl) AppendTrackLogWithFunc(module string, fn func() error, opts ...SpanOption) {
startTime := time.Now()
err := fn()
s.AppendTrackLog(module, startTime, err, opts...)
}
// AppendRPCTrackLog appends RPC track logs to baggage with default key fieldTrackLogKey.
func (s *spanImpl) AppendRPCTrackLog(logs []string) {
for _, trackLog := range logs {
s.track(trackLog)
}
}
// TrackLog returns the track logs stored in baggage under the default key fieldTrackLogKey.
func (s *spanImpl) TrackLog() []string {
return s.context.trackLogs()
}
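// A minimal usage sketch of the track-log API (illustrative only; callBlobnode and callProxy are hypothetical):
//
//	span := trace.SpanFromContextSafe(ctx)
//	startTime := time.Now()
//	err := callBlobnode(ctx)
//	span.AppendTrackLog("blobnode", startTime, err)
//	// or wrap the call directly:
//	span.AppendTrackLogWithFunc("proxy", func() error { return callProxy(ctx) })
//	span.Info("tracks:", span.TrackLog())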
func (s *spanImpl) track(value string) {
maxTracks := s.tracer.options.maxInternalTrack
for _, ref := range s.references {
spanCtx, ok := ref.ReferencedContext.(*SpanContext)
if !ok {
continue
}
spanCtx.append(maxTracks, value)
}
s.context.append(maxTracks, value)
}
// String returns traceID:spanID.
func (s *spanImpl) String() string {
return fmt.Sprintf("%s:%s", s.context.traceID, s.context.spanID)
}
// TraceID returns the traceID.
func (s *spanImpl) TraceID() string {
return s.context.traceID
}
// -------------------------------------------------------------------
const (
defaultCalldepth = 3
)
func (s *spanImpl) output(lvl log.Level, v []interface{}) {
if log.DefaultLogger.GetOutputLevel() > lvl {
return
}
log.DefaultLogger.Output(s.String(), lvl, defaultCalldepth, v...)
}
func (s *spanImpl) outputf(lvl log.Level, format string, v []interface{}) {
if log.DefaultLogger.GetOutputLevel() > lvl {
return
}
log.DefaultLogger.Outputf(s.String(), lvl, defaultCalldepth, format, v...)
}
func (s *spanImpl) Println(v ...interface{}) { s.output(log.Linfo, v) }
func (s *spanImpl) Printf(format string, v ...interface{}) { s.outputf(log.Linfo, format, v) }
func (s *spanImpl) Debug(v ...interface{}) { s.output(log.Ldebug, v) }
func (s *spanImpl) Debugf(format string, v ...interface{}) { s.outputf(log.Ldebug, format, v) }
func (s *spanImpl) Info(v ...interface{}) { s.output(log.Linfo, v) }
func (s *spanImpl) Infof(format string, v ...interface{}) { s.outputf(log.Linfo, format, v) }
func (s *spanImpl) Warn(v ...interface{}) { s.output(log.Lwarn, v) }
func (s *spanImpl) Warnf(format string, v ...interface{}) { s.outputf(log.Lwarn, format, v) }
func (s *spanImpl) Error(v ...interface{}) { s.output(log.Lerror, v) }
func (s *spanImpl) Errorf(format string, v ...interface{}) { s.outputf(log.Lerror, format, v) }
func (s *spanImpl) Panic(v ...interface{}) {
str := fmt.Sprintln(v...)
s.output(log.Lpanic, v)
panic(s.String() + " -> " + str)
}
func (s *spanImpl) Panicf(format string, v ...interface{}) {
str := fmt.Sprintf(format, v...)
s.outputf(log.Lpanic, format, v)
panic(s.String() + " -> " + str)
}
func (s *spanImpl) Fatal(v ...interface{}) {
s.output(log.Lfatal, v)
os.Exit(1)
}
func (s *spanImpl) Fatalf(format string, v ...interface{}) {
s.outputf(log.Lfatal, format, v)
os.Exit(1)
}
// -------------------------------------------------------------------
type operationSpan struct {
Span
operation string
}
func (s *operationSpan) OperationName() string {
return s.operation
}
func (s *operationSpan) SetOperationName(operation string) opentracing.Span {
s.operation = operation
return s
}
func (s *operationSpan) WithOperation(operation string) Span {
op := s.OperationName()
if len(op) > 0 {
if len(operation) > 0 {
op = fmt.Sprintf("%s:%s", op, operation)
}
} else {
op = operation
}
return &operationSpan{
Span: s,
operation: op,
}
}
func (s *operationSpan) String() string {
span := s.Span
next := true
for next {
switch x := span.(type) {
case *operationSpan:
span = x.Span
default:
next = false
}
}
if op := s.OperationName(); op != "" {
return fmt.Sprintf("%s:%s", span.String(), op)
}
return span.String()
}
func (s *operationSpan) output(lvl log.Level, v []interface{}) {
if log.DefaultLogger.GetOutputLevel() > lvl {
return
}
log.DefaultLogger.Output(s.String(), lvl, defaultCalldepth, v...)
}
func (s *operationSpan) outputf(lvl log.Level, format string, v []interface{}) {
if log.DefaultLogger.GetOutputLevel() > lvl {
return
}
log.DefaultLogger.Outputf(s.String(), lvl, defaultCalldepth, format, v...)
}
func (s *operationSpan) Println(v ...interface{}) { s.output(log.Linfo, v) }
func (s *operationSpan) Printf(format string, v ...interface{}) { s.outputf(log.Linfo, format, v) }
func (s *operationSpan) Debug(v ...interface{}) { s.output(log.Ldebug, v) }
func (s *operationSpan) Debugf(format string, v ...interface{}) { s.outputf(log.Ldebug, format, v) }
func (s *operationSpan) Info(v ...interface{}) { s.output(log.Linfo, v) }
func (s *operationSpan) Infof(format string, v ...interface{}) { s.outputf(log.Linfo, format, v) }
func (s *operationSpan) Warn(v ...interface{}) { s.output(log.Lwarn, v) }
func (s *operationSpan) Warnf(format string, v ...interface{}) { s.outputf(log.Lwarn, format, v) }
func (s *operationSpan) Error(v ...interface{}) { s.output(log.Lerror, v) }
func (s *operationSpan) Errorf(format string, v ...interface{}) { s.outputf(log.Lerror, format, v) }
func (s *operationSpan) Panic(v ...interface{}) {
str := fmt.Sprintln(v...)
s.output(log.Lpanic, v)
panic(s.String() + " -> " + str)
}
func (s *operationSpan) Panicf(format string, v ...interface{}) {
str := fmt.Sprintf(format, v...)
s.outputf(log.Lpanic, format, v)
panic(s.String() + " -> " + str)
}
func (s *operationSpan) Fatal(v ...interface{}) {
s.output(log.Lfatal, v)
os.Exit(1)
}
func (s *operationSpan) Fatalf(format string, v ...interface{}) {
s.outputf(log.Lfatal, format, v)
os.Exit(1)
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package trace
import (
"fmt"
"hash/maphash"
"sync"
)
const (
internalTrackLogKey = "internal-baggage-key-tracklog"
)
// ID is the identifier type used for spanID or traceID.
type ID uint64
func (id ID) String() string {
return fmt.Sprintf("%016x", uint64(id))
}
// RandomID generates an ID for traceID or spanID.
func RandomID() ID {
return ID(new(maphash.Hash).Sum64())
}
// SpanContext implements opentracing.SpanContext
type SpanContext struct {
// traceID represents globally unique ID of the trace.
traceID string
// spanID represents span ID that must be unique within its trace.
spanID ID
// parentID refers to the ID of the parent span.
// Should be 0 if the current span is a root span.
parentID ID
// Distributed Context baggage.
baggage map[string][]string
sync.RWMutex
}
// ForeachBaggageItem implements opentracing.SpanContext API
func (s *SpanContext) ForeachBaggageItem(handler func(k, v string) bool) {
panic("not implemented")
}
// ForeachBaggageItems calls the handler function for each baggage key/values pair.
func (s *SpanContext) ForeachBaggageItems(handler func(k string, v []string) bool) {
s.Lock()
defer s.Unlock()
for k, v := range s.baggage {
if !handler(k, v) {
break
}
}
}
func (s *SpanContext) setBaggageItem(key string, value []string) {
s.Lock()
defer s.Unlock()
if s.baggage == nil {
s.baggage = map[string][]string{key: value}
return
}
s.baggage[key] = value
}
func (s *SpanContext) trackLogs() []string {
return s.baggageItemDeepCopy(internalTrackLogKey)
}
func (s *SpanContext) append(maxTracks int, value string) {
s.Lock()
if s.baggage == nil {
s.baggage = make(map[string][]string)
}
if len(s.baggage[internalTrackLogKey]) < maxTracks {
s.baggage[internalTrackLogKey] = append(s.baggage[internalTrackLogKey], value)
}
s.Unlock()
}
func (s *SpanContext) baggageItem(key string) []string {
s.RLock()
defer s.RUnlock()
return s.baggage[key]
}
func (s *SpanContext) baggageItemDeepCopy(key string) (item []string) {
s.RLock()
item = append(item, s.baggage[key]...)
s.RUnlock()
return
}
// IsValid returns true if SpanContext is valid
func (s *SpanContext) IsValid() bool {
return s.traceID != "" && s.spanID != 0
}
// IsEmpty returns true if the span context is empty.
func (s *SpanContext) IsEmpty() bool {
return !s.IsValid() && len(s.baggage) == 0
}
// Copyright 2024 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package trace
import (
"time"
)
// SpanOption is a function that sets some option on the span.
type SpanOption func(*spanOptions)
type spanOptionDuration uint8
const (
durationAny spanOptionDuration = iota
durationNone
durationNs
durationUs
durationMs // default
durationSecond
durationMinute
durationHour
)
func (d spanOptionDuration) Value(duration time.Duration) int64 {
var v int64
switch d {
case durationNone:
case durationNs:
v = duration.Nanoseconds()
case durationUs:
v = duration.Microseconds()
case durationMs:
v = duration.Milliseconds()
case durationSecond:
v = int64(duration / time.Second)
case durationMinute:
v = int64(duration / time.Minute)
case durationHour:
v = int64(duration / time.Hour)
}
return v
}
func (d spanOptionDuration) Unit(duration time.Duration) string {
switch d {
case durationNs:
return "ns"
case durationUs:
return "us"
case durationMs:
return "ms"
case durationSecond:
return "s"
case durationMinute:
return "m"
case durationHour:
return "h"
}
return ""
}
type spanOptions struct {
duration spanOptionDuration
durationUnit bool
errorLength int
}
func OptSpanDurationAny() SpanOption { return func(o *spanOptions) { o.duration = durationAny } }
func OptSpanDurationNone() SpanOption { return func(o *spanOptions) { o.duration = durationNone } }
func OptSpanDurationNs() SpanOption { return func(o *spanOptions) { o.duration = durationNs } }
func OptSpanDurationUs() SpanOption { return func(o *spanOptions) { o.duration = durationUs } }
func OptSpanDurationMs() SpanOption { return func(o *spanOptions) { o.duration = durationMs } }
func OptSpanDurationSecond() SpanOption { return func(o *spanOptions) { o.duration = durationSecond } }
func OptSpanDurationMinute() SpanOption { return func(o *spanOptions) { o.duration = durationMinute } }
func OptSpanDurationHour() SpanOption { return func(o *spanOptions) { o.duration = durationHour } }
func OptSpanDurationUnit() SpanOption { return func(o *spanOptions) { o.durationUnit = true } }
func OptSpanErrorLength(l int) SpanOption {
return func(o *spanOptions) {
if l >= 0 {
o.errorLength = l
}
}
}
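// A minimal sketch of combining span options (illustrative only; doWrite and ctx are hypothetical):
//
//	startTime := time.Now()
//	err := doWrite(ctx)
//	span.AppendTrackLog("put", startTime, err,
//		trace.OptSpanDurationUs(),    // report the duration in microseconds
//		trace.OptSpanDurationUnit(),  // append the unit suffix, e.g. "us"
//		trace.OptSpanErrorLength(16)) // truncate error messages to 16 bytes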
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package trace
import (
"context"
"encoding/json"
"fmt"
"net/http"
"os"
"path"
"time"
"github.com/opentracing/opentracing-go"
"github.com/opentracing/opentracing-go/ext"
)
const (
defaultRootSpanName = ""
defaultMaxLogsPerSpan = 50
defaultInternalTrack = 64
)
// ChildOf is the alias of opentracing.ChildOf
var ChildOf = opentracing.ChildOf
// FollowsFrom is the alias of opentracing.FollowsFrom
var FollowsFrom = opentracing.FollowsFrom
// StartTime is an alias of opentracing.StartTime.
type StartTime = opentracing.StartTime
// Tags is an extension of opentracing.Tags.
type Tags opentracing.Tags
// Apply satisfies the StartSpanOption interface.
func (t Tags) Apply(options *opentracing.StartSpanOptions) {
if options.Tags == nil {
options.Tags = make(opentracing.Tags)
}
for k, v := range t {
options.Tags[k] = v
}
}
// ToSlice converts the tags to a slice of "key:value" strings.
func (t Tags) ToSlice() (ret []string) {
for k := range t {
ret = append(ret, k+":"+fmt.Sprint(t[k]))
}
return
}
// Marshal marshals the tracer tags to JSON.
func (t Tags) Marshal() (ret []byte, err error) {
ret, err = json.Marshal(t)
return
}
// Tag is an alias of opentracing.Tag.
type Tag = opentracing.Tag
// Options holds tracer options.
type Options struct {
maxLogsPerSpan int
maxInternalTrack int
}
// Tracer implements opentracing.Tracer
type Tracer struct {
serviceName string
options Options
}
// init sets default global tracer
func init() {
tracer := NewTracer(path.Base(os.Args[0]))
SetGlobalTracer(tracer)
}
// NewTracer creates a tracer with serviceName
func NewTracer(serviceName string, opts ...TracerOption) *Tracer {
t := &Tracer{
serviceName: serviceName,
}
for _, option := range opts {
option(t)
}
if t.options.maxLogsPerSpan <= 0 {
t.options.maxLogsPerSpan = defaultMaxLogsPerSpan
}
if t.options.maxInternalTrack <= 0 {
t.options.maxInternalTrack = defaultInternalTrack
}
return t
}
// StartSpan implements the StartSpan() method of opentracing.Tracer.
// It creates, starts, and returns a new Span with the given `operationName`,
// incorporating the given StartSpanOption `opts`.
func (t *Tracer) StartSpan(operationName string, options ...opentracing.StartSpanOption) opentracing.Span {
sso := opentracing.StartSpanOptions{}
for _, o := range options {
o.Apply(&sso)
}
return t.startSpanWithOptions(operationName, sso)
}
func (t *Tracer) startSpanWithOptions(operationName string, opts opentracing.StartSpanOptions) Span {
startTime := opts.StartTime
if startTime.IsZero() {
startTime = time.Now()
}
var (
hasParent bool
parent *SpanContext
references []opentracing.SpanReference
ctx = &SpanContext{}
)
for _, reference := range opts.References {
spanCtx, ok := reference.ReferencedContext.(*SpanContext)
if !ok {
continue
}
if spanCtx == nil || spanCtx.IsEmpty() {
continue
}
if spanCtx.IsValid() {
references = append(references, reference)
}
if !hasParent {
parent = spanCtx
hasParent = reference.Type == opentracing.ChildOfRef
}
}
if !hasParent && parent != nil && !parent.IsEmpty() {
hasParent = true
}
if !hasParent || (parent != nil && !parent.IsValid()) {
ctx.traceID = RandomID().String()
ctx.spanID = RandomID()
ctx.parentID = 0
} else {
ctx.traceID = parent.traceID
ctx.spanID = RandomID()
ctx.parentID = parent.spanID
}
if hasParent {
// copy baggage items
parent.ForeachBaggageItems(func(k string, v []string) bool {
ctx.setBaggageItem(k, v)
return true
})
}
tags := opts.Tags
span := &spanImpl{
operationName: operationName,
startTime: startTime,
tags: tags,
context: ctx,
tracer: t,
references: references,
duration: 0,
}
span.rootSpan = ctx.parentID == 0
return span
}
// Inject implements Inject() method of opentracing.Tracer
func (t *Tracer) Inject(sc opentracing.SpanContext, format interface{}, carrier interface{}) error {
s, ok := sc.(*SpanContext)
if !ok {
return opentracing.ErrInvalidSpanContext
}
switch format {
case TextMap, HTTPHeaders:
return defaultTexMapPropagator.Inject(s, carrier)
default:
return ErrUnsupportedFormat
}
}
// Extract implements Extract() method of opentracing.Tracer
func (t *Tracer) Extract(format interface{}, carrier interface{}) (opentracing.SpanContext, error) {
switch format {
case TextMap, HTTPHeaders:
return defaultTexMapPropagator.Extract(carrier)
default:
return nil, ErrUnsupportedFormat
}
}
// Close releases all resources
func (t *Tracer) Close() error {
// TODO report span
return nil
}
// StartSpanFromContext starts and returns a Span with `operationName`, using
// any Span found within `ctx` as a ChildOfRef. If no such parent could be
// found, StartSpanFromContext creates a root (parentless) Span.
func StartSpanFromContext(ctx context.Context, operationName string, opts ...opentracing.StartSpanOption) (Span, context.Context) {
span, ctx := opentracing.StartSpanFromContext(ctx, operationName, opts...)
return span.(Span), ctx
}
// StartSpanFromContextWithTraceID starts and returns a new span with `operationName` and traceID.
func StartSpanFromContextWithTraceID(ctx context.Context, operationName string, traceID string, opts ...opentracing.StartSpanOption) (Span, context.Context) {
span, ctx := opentracing.StartSpanFromContext(ctx, operationName, opts...)
s := span.(*spanImpl)
s.context.traceID = traceID
return s, ctx
}
// StartSpanFromHTTPHeaderSafe starts and returns a Span with `operationName`, extracted from the http.Request.
func StartSpanFromHTTPHeaderSafe(r *http.Request, operationName string) (Span, context.Context) {
spanCtx, _ := Extract(HTTPHeaders, HTTPHeadersCarrier(r.Header))
traceID := r.Header.Get(RequestIDKey)
if traceID == "" {
return StartSpanFromContext(r.Context(), operationName, ext.RPCServerOption(spanCtx))
}
return StartSpanFromContextWithTraceID(r.Context(), operationName, traceID, ext.RPCServerOption(spanCtx))
}
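// A minimal server-side sketch (illustrative only; the "/put" route and handlePut are hypothetical):
//
//	http.HandleFunc("/put", func(w http.ResponseWriter, r *http.Request) {
//		span, ctx := trace.StartSpanFromHTTPHeaderSafe(r, "put")
//		defer span.Finish()
//		handlePut(ctx, w, r)
//	})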
// ContextWithSpan returns a new `context.Context` that holds a reference to
// the span. If span is nil, a new context without an active span is returned.
func ContextWithSpan(ctx context.Context, span Span) context.Context {
return opentracing.ContextWithSpan(ctx, span)
}
// SpanFromContext returns the `Span` previously associated with `ctx`, or
// `nil` if no such `Span` could be found.
func SpanFromContext(ctx context.Context) Span {
span := opentracing.SpanFromContext(ctx)
s, ok := span.(Span)
if !ok {
return nil
}
return s
}
// SpanFromContextSafe returns the `Span` previously associated with `ctx`, or
// creates a root Span with the default name.
func SpanFromContextSafe(ctx context.Context) Span {
span := opentracing.SpanFromContext(ctx)
s, ok := span.(Span)
if !ok || s == nil {
return opentracing.GlobalTracer().StartSpan(defaultRootSpanName).(Span)
}
return s
}
// SetGlobalTracer sets the [singleton] opentracing.Tracer returned by
// GlobalTracer(). Those who use GlobalTracer (rather than directly manage an
// opentracing.Tracer instance) should call SetGlobalTracer as early as
// possible in main(), prior to calling the `StartSpan` global func below.
// Prior to calling `SetGlobalTracer`, any Spans started via the `StartSpan`
// (etc) globals are noops.
func SetGlobalTracer(tracer *Tracer) {
opentracing.SetGlobalTracer(tracer)
}
// CloseGlobalTracer closes global tracer gracefully.
func CloseGlobalTracer() {
tracer, ok := opentracing.GlobalTracer().(*Tracer)
if !ok {
return
}
tracer.Close()
}
// GlobalTracer returns the global singleton `Tracer` implementation.
func GlobalTracer() *Tracer {
t := opentracing.GlobalTracer()
return t.(*Tracer)
}
// Extract returns a SpanContext instance given `format` and `carrier`.
func Extract(format interface{}, carrier interface{}) (opentracing.SpanContext, error) {
return GlobalTracer().Extract(format, carrier)
}
// InjectWithHTTPHeader takes the SpanContext found in `ctx` and injects it into
// the request headers for propagation within `HTTPHeadersCarrier` and `HTTPHeaders`.
func InjectWithHTTPHeader(ctx context.Context, r *http.Request) error {
span := SpanFromContextSafe(ctx)
ext.SpanKindRPCClient.Set(span)
ext.HTTPMethod.Set(span, r.Method)
return span.Tracer().Inject(span.Context(), HTTPHeaders, HTTPHeadersCarrier(r.Header))
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package trace
// TracerOption is a function that sets some option on the tracer
type TracerOption func(tracer *Tracer)
// TracerOptions is a factory for all available TracerOption values.
var TracerOptions tracerOptions
type tracerOptions struct{}
func (tracerOptions) MaxLogsPerSpan(maxLogsPerSpan int) TracerOption {
return func(tracer *Tracer) {
tracer.options.maxLogsPerSpan = maxLogsPerSpan
}
}
func (tracerOptions) MaxInternalTrackLog(internalTrack int) TracerOption {
return func(tracer *Tracer) {
tracer.options.maxInternalTrack = internalTrack
}
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package bytespool
import "sync"
func newBytes(size int) func() interface{} {
return func() interface{} {
return make([]byte, size)
}
}
const (
zeroSize int = 1 << 14 // 16K
// 1K - 2K - 4K - 8K - 16K - 32K - 64K
numPools = 7
sizeStep = 2
startSize int = 1 << 10 // 1K
maxSize int = 1 << 16 // 64K
)
var (
zero = make([]byte, zeroSize)
pools [numPools]sync.Pool
poolSize [numPools]int
)
func init() {
size := startSize
for ii := 0; ii < numPools; ii++ {
pools[ii] = sync.Pool{
New: newBytes(size),
}
poolSize[ii] = size
size *= sizeStep
}
}
// GetPool returns a sync.Pool that generates byte slices of at least the given size.
// It returns nil if no suitable pool exists.
func GetPool(size int) *sync.Pool {
for idx, psize := range poolSize {
if size <= psize {
return &pools[idx]
}
}
return nil
}
// Alloc returns a byte slice of the given size.
// It makes a new slice if the size exceeds the largest pool.
func Alloc(size int) []byte {
if pool := GetPool(size); pool != nil {
b := pool.Get().([]byte)
return b[:size]
}
return make([]byte, size)
}
// Free puts the byte slice back into a suitable pool.
// The slice is discarded if it exceeds the largest pool size.
func Free(b []byte) {
size := cap(b)
if size > maxSize {
return
}
b = b[0:size]
for ii := numPools - 1; ii >= 0; ii-- {
if size >= poolSize[ii] {
pools[ii].Put(b) // nolint: staticcheck
return
}
}
}
// Zero sets all bytes of the slice b to zero.
func Zero(b []byte) {
for len(b) > 0 {
n := copy(b, zero)
b = b[n:]
}
}
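// A minimal usage sketch of the pool (illustrative only; reader is hypothetical):
//
//	buf := bytespool.Alloc(4096) // served from the 4K pool
//	defer bytespool.Free(buf)    // returned to the pool; oversized slices are dropped
//	n, err := reader.Read(buf)
//	_, _ = n, err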
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package defaulter
// Set a basic type's value to its default.
// The epsilon for floats is 1e-9.
import (
"fmt"
"math"
"reflect"
)
// Empty sets string value to default if it's empty.
func Empty(valPointer *string, defaultVal string) {
if *valPointer == "" {
*valPointer = defaultVal
}
}
// Equal sets the basic value to the default if it equals zero.
func Equal(valPointer interface{}, defaultVal interface{}) {
setDefault(valPointer, defaultVal, equalZero)
}
// Less sets basic value to default if it is less than zero.
func Less(valPointer interface{}, defaultVal interface{}) {
setDefault(valPointer, defaultVal, lessZero)
}
// LessOrEqual sets basic value to default if it is not greater than zero.
func LessOrEqual(valPointer interface{}, defaultVal interface{}) {
setDefault(valPointer, defaultVal, lessOrEqualZero)
}
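// A minimal usage sketch (illustrative only; the config struct and its field values are hypothetical):
//
//	type config struct {
//		Name    string
//		Retries int
//		Ratio   float64
//	}
//	conf := config{}
//	defaulter.Empty(&conf.Name, "default-name")
//	defaulter.LessOrEqual(&conf.Retries, 3)
//	defaulter.Less(&conf.Ratio, 0.5)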
func setDefault(valPointer interface{}, defaultVal interface{},
cmp func(reflect.Value, reflect.Kind) bool) {
typ := reflect.TypeOf(valPointer)
if typ.Kind() != reflect.Ptr {
panic(typ.Name() + " must be pointer")
}
typ = typ.Elem()
val := reflect.ValueOf(valPointer).Elem()
dTyp, dVal := parseDefault(defaultVal)
if typ.Kind() != dTyp.Kind() {
panic(fmt.Sprintf("not the same type %s != %s", typ.Kind().String(), dTyp.Kind().String()))
}
if cmp(val, typ.Kind()) {
val.Set(dVal)
}
}
func parseDefault(defaultVal interface{}) (reflect.Type, reflect.Value) {
typ, val := reflect.TypeOf(defaultVal), reflect.ValueOf(defaultVal)
if typ.Kind() == reflect.Ptr {
typ, val = typ.Elem(), val.Elem()
}
return typ, val
}
func equalZero(val reflect.Value, typ reflect.Kind) bool {
switch typ {
case reflect.Bool:
return !val.Bool()
case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
return val.Int() == 0
case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
return val.Uint() == 0
case reflect.Float32, reflect.Float64:
return math.Float64bits(val.Float()) == 0
default:
panic("equal zero unsupported type " + typ.String())
}
}
func lessZero(val reflect.Value, typ reflect.Kind) bool {
switch typ {
case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
return val.Int() < 0
case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
return false
case reflect.Float32, reflect.Float64:
return val.Float() < -1e-9
default:
panic("less zero unsupported type " + typ.String())
}
}
func lessOrEqualZero(val reflect.Value, typ reflect.Kind) bool {
switch typ {
case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
return val.Int() <= 0
case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
return val.Uint() == 0
case reflect.Float32, reflect.Float64:
return val.Float() < 1e-9
default:
panic("less or equal zero unsupported type " + typ.String())
}
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package errors
import (
"fmt"
"runtime"
"strconv"
"strings"
)
const prefix = " --> "
// Cause returns the cause of this error
func Cause(err error) error {
if e, ok := err.(interface{ Cause() error }); ok {
if diag := e.Cause(); diag != nil {
return diag
}
}
return err
}
// Detail returns the detail of the error, with the prefix sign added at the beginning.
func Detail(err error) string {
if err == nil {
return ""
}
if e, ok := err.(interface{ Details() string }); ok {
return e.Details()
}
builder := strings.Builder{}
builder.WriteString(prefix)
builder.WriteString(err.Error())
return builder.String()
}
// Error is an error with detail.
type Error struct {
Err error
Why error
File string
Line int
Cmd []interface{}
}
// Base returns a runtime.Caller(1) detail error based on the error.
func Base(err error, cmd ...interface{}) *Error {
_, file, line, ok := runtime.Caller(1)
if !ok {
file = "???"
}
return &Error{Err: Cause(err), Why: err, File: file, Line: line, Cmd: cmd}
}
// Info is an alias of Base. Deprecated.
func Info(err error, cmd ...interface{}) *Error {
_, file, line, ok := runtime.Caller(1)
if !ok {
file = "???"
}
return &Error{Err: Cause(err), Why: err, File: file, Line: line, Cmd: cmd}
}
// BaseEx returns a runtime.Caller(skip) detail error based on the error.
// File and line tracing may have problems with go1.9,
// see related issue: https://github.com/golang/go/issues/22916
func BaseEx(skip int, err error, cmd ...interface{}) *Error {
oldErr := err
if e, ok := err.(*Error); ok {
err = e.Err
}
_, file, line, ok := runtime.Caller(skip)
if !ok {
file = "???"
}
return &Error{Err: Cause(err), Why: oldErr, File: file, Line: line, Cmd: cmd}
}
// InfoEx is an alias of BaseEx. Deprecated.
func InfoEx(skip int, err error, cmd ...interface{}) *Error {
return BaseEx(skip, err, cmd...)
}
// Cause returns the cause of this error
func (r *Error) Cause() error {
return r.Err
}
// Unwrap returns the Why error.
func (r *Error) Unwrap() error {
return r.Why
}
// Error returns base error Error()
func (r *Error) Error() string {
if r.Err != nil {
return r.Err.Error()
}
return ""
}
// Details returns detail message of the error
func (r *Error) Details() string {
builder := strings.Builder{}
builder.WriteString(prefix)
builder.WriteString(r.File)
builder.WriteByte(':')
builder.WriteString(strconv.Itoa(r.Line))
builder.WriteByte(' ')
builder.WriteString(r.Error())
if len(r.Cmd) > 0 {
builder.WriteString(" ~ ")
builder.WriteString(stringJoin(r.Cmd...))
}
if r.Why != nil && r.Why != r.Err {
builder.WriteString(Detail(r.Why))
}
return builder.String()
}
// Detail sets the why error and returns the Error.
func (r *Error) Detail(err error) *Error {
r.Why = err
return r
}
func stringJoin(v ...interface{}) string {
builder := strings.Builder{}
for idx, value := range v {
if idx > 0 {
builder.WriteByte(' ')
}
builder.WriteString(fmt.Sprintf("%+v", value))
}
return builder.String()
}
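// A minimal usage sketch of the detail error (illustrative only; openFile and path are hypothetical):
//
//	if err := openFile(path); err != nil {
//		e := errors.Base(err, "open file", path)
//		log.Error(errors.Detail(e)) // prints " --> file.go:<line> <err> ~ open file <path>"
//	}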
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package errors
import (
"errors"
"fmt"
)
// New alias of errors.New
func New(msg string) error {
return errors.New(msg)
}
// Newf alias of fmt.Errorf
func Newf(format string, a ...interface{}) error {
return fmt.Errorf(format, a...)
}
// Newx returns an error built from multiple messages.
func Newx(v ...interface{}) error {
return errors.New(stringJoin(v...))
}
// As alias of errors.As
func As(err error, target interface{}) bool {
return errors.As(err, target)
}
// Is alias of errors.Is
func Is(err, target error) bool {
return errors.Is(err, target)
}
// Unwrap alias of errors.Unwrap
func Unwrap(err error) error {
return errors.Unwrap(err)
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package log
import (
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"strconv"
"strings"
)
// defines log level
const (
Ldebug Level = iota
Linfo
Lwarn
Lerror
Lpanic
Lfatal
maxLevel
)
// Level type log level
type Level int
// UnmarshalJSON deserializes the log level from JSON.
// It tries a numeric value first for compatibility, then falls back to a string name.
func (l *Level) UnmarshalJSON(data []byte) error {
if lvl, err := strconv.Atoi(string(data)); err == nil {
if lvl < 0 || lvl >= int(maxLevel) {
return fmt.Errorf("invalid log level: %s", string(data))
}
*l = Level(lvl)
return nil
}
var lvlName string
json.Unmarshal(data, &lvlName)
lvl, exist := levelMapping[strings.ToLower(lvlName)]
if !exist {
return fmt.Errorf("invalid log level: %s", string(data))
}
*l = lvl
return nil
}
// UnmarshalYAML deserializes the log level from YAML.
func (l *Level) UnmarshalYAML(unmarshal func(interface{}) error) error {
var lvlName string
unmarshal(&lvlName)
lvl, exist := levelMapping[strings.ToLower(lvlName)]
if !exist {
return fmt.Errorf("invalid log level: %s", lvlName)
}
*l = lvl
return nil
}
var levelMapping = map[string]Level{
"debug": Ldebug,
"info": Linfo,
"warn": Lwarn,
"error": Lerror,
"panic": Lpanic,
"fatal": Lfatal,
}
var levelToStrings = []string{
"[DEBUG]",
"[INFO]",
"[WARN]",
"[ERROR]",
"[PANIC]",
"[FATAL]",
}
// DefaultLogger is the default logger, initialized with os.Stderr.
var DefaultLogger Logger
// BaseLogger defines the interface of application logging APIs.
type BaseLogger interface {
Printf(format string, v ...interface{})
Println(v ...interface{})
Debugf(format string, v ...interface{})
Debug(v ...interface{})
Infof(format string, v ...interface{})
Info(v ...interface{})
Warnf(format string, v ...interface{})
Warn(v ...interface{})
Errorf(format string, v ...interface{})
Error(v ...interface{})
Fatalf(format string, v ...interface{})
Fatal(v ...interface{})
Panicf(format string, v ...interface{})
Panic(v ...interface{})
}
// Logger is the interface that a logger implementation should satisfy.
type Logger interface {
BaseLogger
// atomically control log level
GetOutputLevel() Level
SetOutputLevel(logLevel Level)
SetOutput(w io.Writer)
Output(id string, lvl Level, calldepth int, a ...interface{}) error
Outputf(id string, lvl Level, calldepth int, format string, a ...interface{}) error
// these two functions implement the raft Logger interface
Warningf(format string, v ...interface{})
Warning(v ...interface{})
}
func init() {
DefaultLogger = New(os.Stderr, 3)
}
// ChangeDefaultLevelHandler returns the path and HTTP handler of the default log level modification API.
func ChangeDefaultLevelHandler() (string, http.HandlerFunc) {
return "/log/level", func(w http.ResponseWriter, r *http.Request) {
switch r.Method {
case http.MethodGet:
level := DefaultLogger.GetOutputLevel()
w.Write([]byte(fmt.Sprintf("{\"level\": \"%s\"}", levelToStrings[level])))
case http.MethodPost:
if err := r.ParseForm(); err != nil {
w.WriteHeader(http.StatusBadRequest)
return
}
var level Level
lvlName := r.FormValue("level")
if lvl, ok := levelMapping[lvlName]; ok {
level = lvl
} else if err := level.UnmarshalJSON([]byte(lvlName)); err != nil {
w.WriteHeader(http.StatusBadRequest)
return
}
DefaultLogger.SetOutputLevel(Level(level))
default:
w.WriteHeader(http.StatusMethodNotAllowed)
}
}
}
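// A minimal registration sketch (illustrative only):
//
//	path, handler := log.ChangeDefaultLevelHandler()
//	http.HandleFunc(path, handler)
//	// GET  /log/level               -> {"level": "[INFO]"}
//	// POST /log/level (level=debug) -> changes the default output level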
func Printf(format string, v ...interface{}) { DefaultLogger.(*logger).outputf(Linfo, format, v) }
func Println(v ...interface{}) { DefaultLogger.(*logger).output(Linfo, v) }
func Debugf(format string, v ...interface{}) { DefaultLogger.(*logger).outputf(Ldebug, format, v) }
func Debug(v ...interface{}) { DefaultLogger.(*logger).output(Ldebug, v) }
func Infof(format string, v ...interface{}) { DefaultLogger.(*logger).outputf(Linfo, format, v) }
func Info(v ...interface{}) { DefaultLogger.(*logger).output(Linfo, v) }
func Warnf(format string, v ...interface{}) { DefaultLogger.(*logger).outputf(Lwarn, format, v) }
func Warn(v ...interface{}) { DefaultLogger.(*logger).output(Lwarn, v) }
func Errorf(format string, v ...interface{}) { DefaultLogger.(*logger).outputf(Lerror, format, v) }
func Error(v ...interface{}) { DefaultLogger.(*logger).output(Lerror, v) }
func Fatalf(format string, v ...interface{}) {
DefaultLogger.(*logger).outputf(Lfatal, format, v)
os.Exit(1)
}
func Fatal(v ...interface{}) {
DefaultLogger.(*logger).output(Lfatal, v)
os.Exit(1)
}
func Panicf(format string, v ...interface{}) {
s := fmt.Sprintf(format, v...)
DefaultLogger.(*logger).outputf(Lpanic, format, v)
panic(s)
}
func Panic(v ...interface{}) {
s := fmt.Sprintln(v...)
DefaultLogger.(*logger).output(Lpanic, v)
panic(s)
}
func GetOutputLevel() Level { return DefaultLogger.GetOutputLevel() }
func SetOutputLevel(lvl Level) { DefaultLogger.SetOutputLevel(lvl) }
func SetOutput(w io.Writer) { DefaultLogger.SetOutput(w) }
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package log
import (
"bytes"
"fmt"
"io"
"os"
"runtime"
"sync"
"sync/atomic"
"time"
)
type logger struct {
level int32
calldepth int
writer atomic.Value
pool sync.Pool
}
type logWriter struct {
io.Writer
}
// New returns a logger with the default level Linfo.
// Output buffers are reused via a bytes pool.
func New(out io.Writer, calldepth int) Logger {
l := &logger{
level: int32(Linfo),
calldepth: calldepth,
pool: sync.Pool{
New: func() interface{} {
return new(bytes.Buffer)
},
},
}
l.writer.Store(&logWriter{out})
return l
}
func (l *logger) Output(id string, lvl Level, calldepth int, a ...interface{}) error {
if int32(lvl) < atomic.LoadInt32(&l.level) || lvl >= maxLevel {
return nil
}
_, file, line, ok := runtime.Caller(calldepth)
if !ok {
file = "???"
line = 0
}
return l.write(id, lvl, file, line, fmt.Sprintln(a...))
}
func (l *logger) Outputf(id string, lvl Level, calldepth int, format string, a ...interface{}) error {
if int32(lvl) < atomic.LoadInt32(&l.level) || lvl >= maxLevel {
return nil
}
_, file, line, ok := runtime.Caller(calldepth)
if !ok {
file = "???"
line = 0
}
return l.write(id, lvl, file, line, fmt.Sprintf(format, a...))
}
func (l *logger) write(id string, lvl Level, file string, line int, s string) error {
now := time.Now()
buf := l.pool.Get().(*bytes.Buffer)
buf.Reset()
l.formatOutput(buf, now, file, line, lvl)
if id != "" {
buf.WriteByte('[')
buf.WriteString(id)
buf.WriteByte(']')
buf.WriteByte(' ')
}
buf.WriteString(s)
if len(s) > 0 && s[len(s)-1] != '\n' {
buf.WriteByte('\n')
}
out := l.writer.Load().(io.Writer)
_, err := out.Write(buf.Bytes())
l.pool.Put(buf)
return err
}
// -----------------------------------------
func (l *logger) outputf(lvl Level, format string, v []interface{}) {
l.Outputf("", lvl, l.calldepth, format, v...)
}
func (l *logger) output(lvl Level, v []interface{}) {
l.Output("", lvl, l.calldepth, v...)
}
func (l *logger) Printf(format string, v ...interface{}) { l.outputf(Linfo, format, v) }
func (l *logger) Println(v ...interface{}) { l.output(Linfo, v) }
func (l *logger) Debugf(format string, v ...interface{}) { l.outputf(Ldebug, format, v) }
func (l *logger) Debug(v ...interface{}) { l.output(Ldebug, v) }
func (l *logger) Infof(format string, v ...interface{}) { l.outputf(Linfo, format, v) }
func (l *logger) Info(v ...interface{}) { l.output(Linfo, v) }
func (l *logger) Warnf(format string, v ...interface{}) { l.outputf(Lwarn, format, v) }
func (l *logger) Warn(v ...interface{}) { l.output(Lwarn, v) }
func (l *logger) Warningf(format string, v ...interface{}) { l.outputf(Lwarn, format, v) }
func (l *logger) Warning(v ...interface{}) { l.output(Lwarn, v) }
func (l *logger) Errorf(format string, v ...interface{}) { l.outputf(Lerror, format, v) }
func (l *logger) Error(v ...interface{}) { l.output(Lerror, v) }
func (l *logger) Fatalf(format string, v ...interface{}) {
l.outputf(Lfatal, format, v)
os.Exit(1)
}
func (l *logger) Fatal(v ...interface{}) {
l.output(Lfatal, v)
os.Exit(1)
}
func (l *logger) Panicf(format string, v ...interface{}) {
s := fmt.Sprintf(format, v...)
l.outputf(Lpanic, format, v)
panic(s)
}
func (l *logger) Panic(v ...interface{}) {
s := fmt.Sprintln(v...)
l.output(Lpanic, v)
panic(s)
}
// -----------------------------------------
func (l *logger) GetOutputLevel() Level {
return Level(atomic.LoadInt32(&l.level))
}
func (l *logger) SetOutput(w io.Writer) {
l.writer.Store(&logWriter{w})
}
func (l *logger) SetOutputLevel(lvl Level) {
if lvl >= maxLevel {
lvl = Lfatal
}
atomic.StoreInt32(&l.level, int32(lvl))
}
func (l *logger) formatOutput(buf *bytes.Buffer, t time.Time, file string, line int, lvl Level) {
year, month, day := t.Date()
itoa(buf, year, 4)
buf.WriteByte('/')
itoa(buf, int(month), 2)
buf.WriteByte('/')
itoa(buf, day, 2)
buf.WriteByte(' ')
hour, min, sec := t.Clock()
itoa(buf, hour, 2)
buf.WriteByte(':')
itoa(buf, min, 2)
buf.WriteByte(':')
itoa(buf, sec, 2)
buf.WriteByte('.')
itoa(buf, t.Nanosecond()/1e3, 6)
buf.WriteByte(' ')
buf.WriteString(levelToStrings[lvl])
buf.WriteByte(' ')
buf.WriteString(file)
buf.WriteByte(':')
itoa(buf, line, -1)
buf.WriteByte(' ')
}
// itoa formats a cheap integer as fixed-width decimal ASCII.
// Use a negative width to avoid zero-padding.
// The buffer is assumed to have enough capacity.
func itoa(buf *bytes.Buffer, i int, width int) {
u := uint(i)
if u == 0 && width <= 1 {
buf.WriteByte('0')
return
}
// assemble decimal in reverse order
var b [32]byte
bp := len(b)
for ; u > 0 || width > 0; u /= 10 {
bp--
width--
b[bp] = byte(u%10) + '0'
}
// avoid slicing b to make an allocation
buf.Write(b[bp:])
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package retry
import (
"context"
"time"
)
// Insist retries f until it succeeds.
// onError is called for every error returned.
// It sleeps for duration after each failed run of f.
func Insist(duration time.Duration, f func() error, onError func(error)) {
err := f()
if err == nil {
return
}
onError(err)
timer := time.NewTimer(duration)
defer timer.Stop()
<-timer.C
for {
err = f()
if err == nil {
return
}
onError(err)
timer.Reset(duration)
<-timer.C
}
}
// InsistContext retries f until it succeeds or the context is done.
// onError is called for every error returned.
// It sleeps for duration after each failed run of f.
func InsistContext(ctx context.Context, duration time.Duration, f func() error, onError func(error)) {
err := f()
if err == nil {
return
}
onError(err)
timer := time.NewTimer(duration)
defer timer.Stop()
select {
case <-ctx.Done():
return
case <-timer.C:
}
for {
err = f()
if err == nil {
return
}
onError(err)
timer.Reset(duration)
select {
case <-ctx.Done():
return
case <-timer.C:
}
}
}
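// A minimal usage sketch (illustrative only; registerService is hypothetical):
//
//	retry.InsistContext(ctx, 3*time.Second, func() error {
//		return registerService(ctx)
//	}, func(err error) {
//		log.Warn("register failed, retrying:", err)
//	})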
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package retry
import (
"errors"
"time"
)
var (
// ErrRetryFailed all retry attempts failed.
ErrRetryFailed = errors.New("retry: all retry attempts failed")
// ErrRetryNext retry next on interrupt.
ErrRetryNext = errors.New("retry: retry next on interrupt")
)
// Retryer is an interface for retrying a specific function.
type Retryer interface {
// On performs a retry on function, until it doesn't return any error.
On(func() error) error
// RuptOn performs a retry on function, until it doesn't return any error or interrupt.
RuptOn(func() (bool, error)) error
}
type retry struct {
attempts int
nextDelay func() uint32
}
// On implements Retryer.On.
func (r *retry) On(caller func() error) error {
var lastErr error
attempt := 1
for attempt <= r.attempts {
if lastErr = caller(); lastErr == nil {
return nil
}
// do not wait on last useless delay
if attempt >= r.attempts {
break
}
time.Sleep(time.Duration(r.nextDelay()) * time.Millisecond)
attempt++
}
return lastErr
}
// RuptOn implements Retryer.RuptOn.
func (r *retry) RuptOn(caller func() (bool, error)) error {
var lastErr error
attempt := 1
for attempt <= r.attempts {
interrupted, err := caller()
if err == nil {
return nil
}
// return last error of method, if interrupted
if err != ErrRetryNext {
lastErr = err
}
if interrupted {
break
}
// do not wait on last useless delay
if attempt >= r.attempts {
break
}
time.Sleep(time.Duration(r.nextDelay()) * time.Millisecond)
attempt++
}
return lastErr
}
// Timed returns a retry with fixed interval delay.
func Timed(attempts int, delay uint32) Retryer {
return &retry{
attempts: attempts,
nextDelay: func() uint32 {
return delay
},
}
}
// ExponentialBackoff returns a retry whose delay grows by expDelay on each attempt (expDelay, 2*expDelay, ...).
func ExponentialBackoff(attempts int, expDelay uint32) Retryer {
next := expDelay
return &retry{
attempts: attempts,
nextDelay: func() uint32 {
r := next
next += expDelay
return r
},
}
}
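// A minimal usage sketch (illustrative only; doRequest is hypothetical):
//
//	err := retry.Timed(3, 200).On(func() error {
//		return doRequest(ctx)
//	})
//	// retry.ExponentialBackoff(5, 100) waits 100ms, 200ms, 300ms, ... between attempts.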
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package task
import "context"
var (
// C alias of Concurrent
C = Concurrent
// CC alias of ConcurrentContext
CC = ConcurrentContext
)
// Concurrent runs the tasks concurrently.
func Concurrent(f func(index int, arg interface{}), args []interface{}) {
ConcurrentContext(context.Background(), f, args)
}
// ConcurrentContext runs the tasks concurrently with a context.
// How to build a []interface{}, see: https://golang.org/doc/faq#convert_slice_of_interface
func ConcurrentContext(ctx context.Context, f func(index int, arg interface{}), args []interface{}) {
tasks := make([]func() error, len(args))
for ii := 0; ii < len(args); ii++ {
index, arg := ii, args[ii]
tasks[ii] = func() error {
f(index, arg)
return nil
}
}
Run(ctx, tasks...)
}
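// A minimal usage sketch (illustrative only):
//
//	args := []interface{}{"a", "b", "c"}
//	task.C(func(index int, arg interface{}) {
//		fmt.Println(index, arg.(string))
//	}, args)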
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package task
import (
"context"
)
type semaphore struct {
ready chan struct{}
}
func newSemaphore(n int) *semaphore {
s := &semaphore{
ready: make(chan struct{}, n),
}
for ii := 0; ii < n; ii++ {
s.ready <- struct{}{}
}
return s
}
func (s *semaphore) Wait() <-chan struct{} {
return s.ready
}
func (s *semaphore) Signal() {
s.ready <- struct{}{}
}
// Run executes a list of tasks in parallel and
// returns the first error, or nil if all tasks are done.
func Run(ctx context.Context, tasks ...func() error) error {
n := len(tasks)
semaphore := newSemaphore(n)
errorCh := make(chan error, 1)
for _, task := range tasks {
<-semaphore.Wait()
go func(task func() error) {
err := task()
if err == nil {
semaphore.Signal()
return
}
select {
case errorCh <- err:
default:
}
}(task)
}
for ii := 0; ii < n; ii++ {
select {
case err := <-errorCh:
return err
case <-ctx.Done():
return ctx.Err()
case <-semaphore.Wait():
}
}
return nil
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package version
import (
"fmt"
"io/ioutil"
"os"
)
var (
version string = ""
fPerm os.FileMode = 0o600
)
func init() {
if len(os.Args) > 1 && os.Args[1] == "-version" {
fmt.Println("version:", version)
os.Exit(0)
}
writeFile(".version", version)
}
func Version() string {
return version
}
func writeFile(fname, field string) {
if field != "" {
ioutil.WriteFile(fname, []byte(field), fPerm)
}
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package bcache
import (
"os"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
)
const (
_ int = iota
statusOK
statusNoent
statusError
)
type BcacheClient struct {
connPool *ConnPool
}
var (
once sync.Once
client *BcacheClient
)
func NewBcacheClient() *BcacheClient {
once.Do(func() {
expireTime := int64(time.Second * ConnectExpireTime)
cp := NewConnPool(UnixSocketPath, 20, 200, expireTime)
client = &BcacheClient{connPool: cp}
})
return client
}
func (c *BcacheClient) Get(key string, buf []byte, offset uint64, size uint32) (int, error) {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("bcache-get", err, bgTime, 1)
}()
req := &GetCacheRequest{
CacheKey: key,
Offset: offset,
Size: size,
}
packet := NewBlockCachePacket()
packet.Opcode = OpBlockCacheGet
err = packet.MarshalData(req)
if err != nil {
log.LogDebugf("get block cache: req(%v) err(%v)", req.CacheKey, err)
return 0, err
}
stat.EndStat("bcache-get-marshal", err, bgTime, 1)
conn, err := c.connPool.Get()
if err != nil {
log.LogDebugf("get block cache: get Conn failed, req(%v) err(%v)", req.CacheKey, err)
return 0, err
}
defer func() {
c.connPool.Put(conn)
}()
stat.EndStat("bcache-get-conn", err, bgTime, 1)
err = packet.WriteToConn(*conn)
if err != nil {
log.LogDebugf("Failed to write to conn, req(%v) err(%v)", req.CacheKey, err)
return 0, errors.NewErrorf("Failed to write to conn, req(%v) err(%v)", req.CacheKey, err)
}
stat.EndStat("bcache-get-writeconn", err, bgTime, 1)
err = packet.ReadFromConn(*conn, 1)
if err != nil {
log.LogDebugf("Failed to read from conn, req(%v), err(%v)", req.CacheKey, err)
return 0, errors.NewErrorf("Failed to read from conn, req(%v), err(%v)", req.CacheKey, err)
}
stat.EndStat("bcache-get-readconn", err, bgTime, 1)
status := parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogDebugf("get block cache: req(%v) err(%v) result(%v)", req.CacheKey, err, packet.GetResultMsg())
return 0, err
}
resp := new(GetCachePathResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogDebugf("get block cache: req(%v) err(%v) PacketData(%v)", req.CacheKey, err, string(packet.Data))
return 0, err
}
cachePath := resp.CachePath
stat.EndStat("bcache-get-meta", err, bgTime, 1)
readBgTime := stat.BeginStat()
subs := strings.Split(cachePath, "/")
if subs[len(subs)-1] != key {
log.LogDebugf("cacheKey(%v) cache path(%v) is not legal",
key, cachePath)
return 0, errors.NewErrorf("cacheKey(%v) cache path is not legal: %v", key, cachePath)
}
f, err := os.Open(cachePath)
if err != nil {
return 0, err
}
defer f.Close()
n, err := f.ReadAt(buf, int64(offset))
if n != int(size) {
log.LogDebugf("get block cache: BCache client GET() error, expected size(%v), but read size(%v)", size, n)
return 0, errors.NewErrorf("BcacheClient GET() error, expected size(%v), but read size(%v)", size, n)
}
if err != nil {
log.LogDebugf("get block cache: BCache client read %v err %v", cachePath, err.Error())
return 0, errors.NewErrorf("get block cache: BCache client read %v err %v", cachePath, err.Error())
}
encryptXOR(buf[:n])
stat.EndStat("bcache-get-read", err, readBgTime, 1)
return n, nil
}
func (c *BcacheClient) Put(key string, buf []byte) error {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("bcache-put", err, bgTime, 1)
}()
req := &PutCacheRequest{
CacheKey: key,
Data: buf,
}
packet := NewBlockCachePacket()
packet.Opcode = OpBlockCachePut
err = packet.MarshalData(req)
if err != nil {
log.LogDebugf("put block cache: req(%v) err(%v)", req.CacheKey, err)
return err
}
conn, err := c.connPool.Get()
if err != nil {
log.LogDebugf("put block cache: get Conn failed, req(%v) err(%v)", req.CacheKey, err)
return err
}
defer func() {
c.connPool.Put(conn)
}()
err = packet.WriteToConn(*conn)
if err != nil {
log.LogDebugf("Failed to write to conn, req(%v) err(%v)", req.CacheKey, err)
return errors.NewErrorf("Failed to write to conn, req(%v) err(%v)", req.CacheKey, err)
}
err = packet.ReadFromConn(*conn, proto.NoReadDeadlineTime)
if err != nil {
log.LogDebugf("Failed to read from conn, req(%v), err(%v)", req.CacheKey, err)
return errors.NewErrorf("Failed to read from conn, req(%v), err(%v)", req.CacheKey, err)
}
status := parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogDebugf("put block cache: req(%v) err(%v) result(%v)", req.CacheKey, err, packet.GetResultMsg())
return err
}
return err
}
func (c *BcacheClient) Evict(key string) error {
req := &DelCacheRequest{CacheKey: key}
packet := NewBlockCachePacket()
packet.Opcode = OpBlockCacheDel
err := packet.MarshalData(req)
if err != nil {
log.LogDebugf("del block cache: req(%v) err(%v)", req.CacheKey, err)
return err
}
conn, err := c.connPool.Get()
if err != nil {
log.LogDebugf("del block cache: get Conn failed, req(%v) err(%v)", req.CacheKey, err)
return err
}
defer func() {
c.connPool.Put(conn)
}()
err = packet.WriteToConn(*conn)
if err != nil {
return err
}
err = packet.ReadFromConn(*conn, proto.NoReadDeadlineTime)
if err != nil {
return err
}
status := parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("del block cache: req(%v) err(%v) result(%v)", req.CacheKey, err, packet.GetResultMsg())
return err
}
log.LogDebugf("del block cache success: req(%v)", req.CacheKey)
return nil
}
func parseStatus(result uint8) (status int) {
switch result {
case proto.OpOk:
status = statusOK
case proto.OpNotExistErr:
status = statusNoent
default:
status = statusError
}
return
}
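// A minimal usage sketch of the client (illustrative only; the key and size values are hypothetical):
//
//	bc := NewBcacheClient()
//	buf := make([]byte, 4096)
//	if n, err := bc.Get("block-key", buf, 0, 4096); err == nil {
//		_ = buf[:n] // cache hit
//	} else {
//		// cache miss: read from the data source, then populate the cache
//		_ = bc.Put("block-key", buf)
//	}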
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package bcache
import (
"net"
"time"
)
const (
DefaultTimeOut = 1 * time.Second
ConnectExpireTime = 20
)
type ConnObject struct {
c *net.Conn
lastActive int64
}
type ConnPool struct {
conns chan *ConnObject
mincap int
maxcap int
expire int64
target string
}
func NewConnPool(target string, mincap, maxcap int, expire int64) *ConnPool {
p := &ConnPool{
conns: make(chan *ConnObject, maxcap),
mincap: mincap,
maxcap: maxcap,
expire: expire,
target: target,
}
return p
}
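// Typical use of the pool (illustrative sketch): borrow a connection for one
// request/response round trip and return it so it can be reused until it
// expires. The expire argument is compared against UnixNano in Get, so it is
// given in nanoseconds.
//
// pool := NewConnPool("/var/run/cubefscache/bcache.socket", 2, 64, int64(20*time.Second))
// conn, err := pool.Get()
// if err == nil {
//     defer pool.Put(conn)
//     // write and read packets on *conn here
// }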
func (connPool *ConnPool) Get() (c *net.Conn, err error) {
var o *ConnObject
for {
select {
case o = <-connPool.conns:
default:
return connPool.NewConnect(connPool.target)
}
if time.Now().UnixNano()-o.lastActive > connPool.expire {
_ = (*o.c).Close()
o = nil
continue
}
return o.c, nil
}
}
func (connPool *ConnPool) NewConnect(target string) (*net.Conn, error) {
conn, err := net.DialTimeout("unix", target, DefaultTimeOut)
return &conn, err
}
func (connPool *ConnPool) Put(c *net.Conn) {
o := &ConnObject{
c: c,
lastActive: time.Now().UnixNano(),
}
select {
case connPool.conns <- o:
return
default:
if o.c != nil {
(*o.c).Close()
}
return
}
}
// Copyright (C) 2020 Juicefs
// Modified work Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package bcache
import (
"bufio"
"bytes"
"container/list"
"crypto/md5"
"encoding/hex"
"fmt"
"hash/crc32"
"io"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
)
const (
PathListSeparator = ";"
CacheConfSeparator = ":"
SpaceCheckInterval = 60 * time.Second
TmpFileCheckInterval = 20 * 60 * time.Second
FilePerm = 0o644
Basedir = "blocks"
)
type ReadCloser interface {
io.Reader
io.ReaderAt
io.Closer
}
type BcacheManager interface {
cache(key string, data []byte, direct bool)
read(key string, offset uint64, len uint32) (io.ReadCloser, error)
queryCachePath(key string, offset uint64, len uint32) (string, error)
load(key string) (ReadCloser, error)
erase(key string)
stats() (int64, int64)
}
func newBcacheManager(conf *bcacheConfig) BcacheManager {
log.LogInfof("init block cache: %s size:%d GB", conf.CacheDir, conf.BlockSize)
if conf.CacheDir == "" {
log.LogWarnf("no cache config,cacheDirs or size is empty!")
return nil
}
// todo cachedir reg match
cacheDirs := strings.Split(conf.CacheDir, PathListSeparator)
if len(cacheDirs) == 0 {
log.LogWarnf("no cache dir config!")
return nil
}
dirSizeMap := make(map[string]int64, len(cacheDirs))
for _, dir := range cacheDirs {
result := strings.Split(dir, CacheConfSeparator)
if len(result) != 2 {
log.LogWarnf("cache dir config error: expect dirPath:cacheSize, got(%v)", dir)
return nil
}
dirPath := result[0]
cacheSize, err := strconv.Atoi(result[1])
if dirPath == "" || err != nil {
log.LogWarnf("cache dir config error!")
return nil
}
dirSizeMap[dirPath] = int64(cacheSize)
conf.CacheSize = conf.CacheSize + int64(cacheSize)
}
bm := &bcacheManager{
bstore: make([]*DiskStore, len(cacheDirs)),
bcacheKeys: make(map[string]*list.Element),
lrulist: list.New(),
blockSize: conf.BlockSize,
pending: make(chan waitFlush, 1024),
}
index := 0
for cacheDir, cacheSize := range dirSizeMap {
disk := NewDiskStore(cacheDir, cacheSize, conf)
bm.bstore[index] = disk
go bm.reBuildCacheKeys(cacheDir, disk)
index++
}
go bm.spaceManager()
go bm.flush()
// go bm.scrub()
return bm
}
type cacheItem struct {
key string
size uint32
}
type keyPair struct {
key string
it *cacheItem
}
// key vid_inode_offset
type waitFlush struct {
Key string
Data []byte
}
type bcacheManager struct {
sync.RWMutex
bcacheKeys map[string]*list.Element
lrulist *list.List
bstore []*DiskStore
blockSize uint32
pending chan waitFlush
}
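// encryptXOR obfuscates block data in place by XOR-ing every byte with 0xF.
// XOR with a constant mask is its own inverse, so the same function is used
// both before writing a block to disk and after reading it back.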
func encryptXOR(data []byte) {
for index, value := range data {
data[index] = value ^ byte(0xF)
}
}
func (bm *bcacheManager) queryCachePath(key string, offset uint64, len uint32) (path string, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("GetCache:GetCachePath", err, bgTime, 1)
}()
bm.Lock()
element, ok := bm.bcacheKeys[key]
bm.Unlock()
if ok {
item := element.Value.(*cacheItem)
path, err := bm.getCachePath(key)
if err != nil {
return "", err
}
bm.Lock()
bm.lrulist.MoveToBack(element)
bm.Unlock()
log.LogDebugf("Cache item found. key=%v offset =%v,len=%v size=%v, path=%v", key, offset, len, item.size, path)
return path, nil
}
log.LogDebugf("Cache item not found. key=%v offset =%v,len=%v", key, offset, len)
return "", os.ErrNotExist
}
func (bm *bcacheManager) getCachePath(key string) (string, error) {
if len(bm.bstore) == 0 {
return "", errors.New("no cache dir")
}
cachePath := bm.selectDiskKv(key).getPath(key)
return cachePath, nil
}
func (bm *bcacheManager) cache(key string, data []byte, direct bool) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Cache:Write", nil, bgTime, 1)
stat.StatBandWidth("Cache", uint32(len(data)))
}()
log.LogDebugf("TRACE cache. key(%v) len(%v) direct(%v)", key, len(data), direct)
if direct {
bm.cacheDirect(key, data)
return
}
select {
case bm.pending <- waitFlush{Key: key, Data: data}:
default:
log.LogDebugf("pending chan is full,skip memory. key =%v,len=%v bytes", key, len(data))
bm.cacheDirect(key, data)
}
}
func (bm *bcacheManager) cacheDirect(key string, data []byte) {
diskKv := bm.selectDiskKv(key)
if diskKv.flushKey(key, data) == nil {
bm.Lock()
item := &cacheItem{
key: key,
size: uint32(len(data)),
}
element := bm.lrulist.PushBack(item)
bm.bcacheKeys[key] = element
bm.Unlock()
}
}
func (bm *bcacheManager) read(key string, offset uint64, len uint32) (io.ReadCloser, error) {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("GetCache:Read", err, bgTime, 1)
if err == nil {
stat.StatBandWidth("GetCache:Read", len)
}
}()
metaBgTime := stat.BeginStat()
bm.Lock()
element, ok := bm.bcacheKeys[key]
bm.Unlock()
stat.EndStat("GetCache:Read:GetMeta", nil, metaBgTime, 1)
log.LogDebugf("Trace read. ok =%v", ok)
if ok {
item := element.Value.(*cacheItem)
f, err := bm.load(key)
if os.IsNotExist(err) {
bm.Lock()
delete(bm.bcacheKeys, key)
bm.Unlock()
d := bm.selectDiskKv(key)
atomic.AddInt64(&d.usedSize, -int64(item.size))
atomic.AddInt64(&d.usedCount, -1)
return nil, os.ErrNotExist
}
if err != nil {
return nil, err
}
defer f.Close()
size := item.size
log.LogDebugf("read. offset =%v,len=%v size=%v", offset, len, size)
if uint32(offset)+len > size {
len = size - uint32(offset)
}
dataBgTime := stat.BeginStat()
buf := make([]byte, len)
n, err := f.ReadAt(buf, int64(offset))
stat.EndStat("GetCache:Read:ReadData", err, dataBgTime, 1)
if err != nil {
return nil, err
} else {
// decrypt
encryptXOR(buf[:n])
return io.NopCloser(bytes.NewBuffer(buf[:n])), nil
}
} else {
err = os.ErrNotExist
}
return nil, err
}
func (bm *bcacheManager) load(key string) (ReadCloser, error) {
if len(bm.bstore) == 0 {
return nil, errors.New("no cache dir")
}
f, err := bm.selectDiskKv(key).load(key)
if err != nil {
return nil, err
}
bm.Lock()
defer bm.Unlock()
if element, ok := bm.bcacheKeys[key]; ok {
bm.lrulist.MoveToBack(element)
}
return f, err
}
func (bm *bcacheManager) erase(key string) {
if len(bm.bstore) == 0 {
return
}
err := bm.selectDiskKv(key).remove(key)
if err == nil {
bm.Lock()
defer bm.Unlock()
if element, ok := bm.bcacheKeys[key]; ok {
bm.lrulist.Remove(element)
}
delete(bm.bcacheKeys, key)
}
}
func (bm *bcacheManager) stats() (int64, int64) {
var usedCount, usedSize int64
for _, item := range bm.bstore {
usedSize += atomic.LoadInt64(&item.usedSize)
usedCount += atomic.LoadInt64(&item.usedCount)
}
return usedCount, usedSize
}
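// selectDiskKv deterministically maps a cache key to one of the configured
// disk stores using a CRC32 hash of the key, so a given key always resolves
// to the same cache directory. For example (illustrative), with two stores a
// key whose hash is 7 lands on bstore[7%2], i.e. bstore[1].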
func (bm *bcacheManager) selectDiskKv(key string) *DiskStore {
return bm.bstore[hashKey(key)%uint32(len(bm.bstore))]
}
func (bm *bcacheManager) spaceManager() {
ticker := time.NewTicker(SpaceCheckInterval)
tmpTicker := time.NewTicker(TmpFileCheckInterval)
defer func() {
ticker.Stop()
tmpTicker.Stop()
}()
for {
select {
case <-ticker.C:
for _, store := range bm.bstore {
useRatio, files := store.diskUsageRatio()
log.LogDebugf("useRation(%v), files(%v)", useRatio, files)
if 1-useRatio < store.freeLimit || files > int64(store.limit) {
bm.freeSpace(store, 1-useRatio, files)
}
}
case <-tmpTicker.C:
for _, store := range bm.bstore {
useRatio, files := store.diskUsageRatio()
log.LogInfof("useRation(%v), files(%v)", useRatio, files)
bm.deleteTmpFile(store)
}
}
}
}
//lru cache
//func (bm *bcacheManager) freeSpace(index int, store *DiskStore, free float32, files int64) {
// var decreaseSpace int64
// var decreaseCnt int
// storeCnt := uint32(len(bm.bstore))
// bm.Lock()
// defer bm.Unlock()
// if free < store.freeLimit {
// decreaseSpace = int64((store.freeLimit - free) * (float32(store.capacity)))
// }
// if files > int64(store.limit) {
// decreaseCnt = int(files - int64(store.limit))
// }
// var lastKey string
// var lastItem cacheItem
// var cnt int
// for key, value := range bm.bcacheKeys {
// if int(hashKey(key)%storeCnt) == index {
// if cnt == 0 || lastItem.atime > value.atime {
// lastKey = key
// lastItem = value
// }
// cnt++
// if cnt > 1 {
// store.remove(lastKey)
// delete(bm.bcacheKeys, lastKey)
// decreaseSpace -= int64(value.size)
// decreaseCnt--
// cnt = 0
// log.LogDebugf("remove %s from cache, age: %d", lastKey, lastItem.atime)
// if decreaseCnt <= 0 && decreaseSpace <= 0 {
// break
// }
// }
// }
// }
//
//}
// lru
func (bm *bcacheManager) freeSpace(store *DiskStore, free float32, files int64) {
var decreaseSpace int64
var decreaseCnt int
if free < store.freeLimit {
decreaseSpace = int64((store.freeLimit - free) * (float32(store.capacity)))
}
if files > int64(store.limit) {
decreaseCnt = int(files - int64(store.limit))
}
cnt := 0
for {
if decreaseCnt <= 0 && decreaseSpace <= 0 {
break
}
// avoid dead loop
if cnt > 500000 {
break
}
bm.Lock()
element := bm.lrulist.Front()
if element == nil {
bm.Unlock()
return
}
item := element.Value.(*cacheItem)
if err := store.remove(item.key); err == nil {
bm.lrulist.Remove(element)
delete(bm.bcacheKeys, item.key)
decreaseSpace -= int64(item.size)
decreaseCnt--
}
// advance the loop counter even when a removal fails, so the dead-loop
// guard above still terminates the loop eventually
cnt++
bm.Unlock()
log.LogDebugf("remove %v from cache", item.key)
}
}
func (bm *bcacheManager) reBuildCacheKeys(dir string, store *DiskStore) {
if _, err := os.Stat(dir); err != nil {
log.LogErrorf("cache dir %s is not exists", dir)
return
}
log.LogDebugf("reBuildCacheKeys(%s)", dir)
c := make(chan keyPair)
keyPrefix := filepath.Join(dir, Basedir)
go func() {
filepath.Walk(dir, bm.walker(c, keyPrefix, true))
close(c)
}()
for value := range c {
bm.Lock()
element := bm.lrulist.PushBack(value.it)
bm.bcacheKeys[value.key] = element
bm.Unlock()
log.LogDebugf("updateStat(%v)", value.it.size)
store.updateStat(value.it.size)
}
}
func (bm *bcacheManager) walker(c chan keyPair, prefix string, initial bool) filepath.WalkFunc {
return func(path string, info os.FileInfo, err error) error {
if err != nil {
log.LogWarnf("walk path %v failed %v", path, err)
return err
}
if info.IsDir() || !strings.HasPrefix(path, prefix) {
return nil
}
if strings.HasSuffix(path, ".tmp") && (initial || checkoutTempFileOuttime(path)) {
os.Remove(path)
log.LogDebugf("Remove tmp file %v", path)
return nil
}
_, key := filepath.Split(path)
size := uint32(info.Size())
pair := keyPair{
key: key,
it: &cacheItem{
key: key,
size: size,
},
}
c <- pair
return nil
}
}
func (bm *bcacheManager) flush() {
for {
pending := <-bm.pending
diskKv := bm.selectDiskKv(pending.Key)
log.LogDebugf("flush data,key(%v), dir(%v)", pending.Key, diskKv.dir)
if diskKv.flushKey(pending.Key, pending.Data) == nil {
bm.Lock()
item := &cacheItem{
key: pending.Key,
size: uint32(len(pending.Data)),
}
element := bm.lrulist.PushBack(item)
bm.bcacheKeys[pending.Key] = element
bm.Unlock()
}
}
}
func hashKey(key string) uint32 {
return crc32.ChecksumIEEE([]byte(key))
}
type DiskStore struct {
sync.Mutex
dir string
mode uint32
capacity int64
freeLimit float32
limit uint32
usedSize int64
usedCount int64
}
func NewDiskStore(dir string, cacheSize int64, config *bcacheConfig) *DiskStore {
if config.Mode == 0 {
config.Mode = FilePerm
}
if config.FreeRatio <= 0 {
config.FreeRatio = 0.15
}
if config.Limit <= 0 {
config.Limit = 50000000
}
if config.Limit > 50000000 {
config.Limit = 50000000
}
c := &DiskStore{
dir: dir,
mode: config.Mode,
capacity: cacheSize,
freeLimit: config.FreeRatio,
limit: config.Limit,
}
log.LogDebugf("ignored method DiskStore.scrub at %p", c.scrub) // TODO: ignored
c.checkBuildCacheDir(dir)
return c
}
func (d *DiskStore) checkBuildCacheDir(dir string) {
mode := os.FileMode(d.mode)
if st, err := os.Stat(dir); os.IsNotExist(err) {
if parent := filepath.Dir(dir); parent != dir {
d.checkBuildCacheDir(parent)
}
os.Mkdir(dir, mode)
} else if err != nil && st.Mode() != mode {
os.Chmod(dir, mode)
}
}
//func (d *DiskStore) flushKey(key string, data []byte) error {
// var err error
// bgTime := stat.BeginStat()
// defer func() {
// stat.EndStat("Cache:Write:FlushData", err, bgTime, 1)
// }()
// cachePath := d.buildCachePath(key, d.dir)
// log.LogDebugf("TRACE BCacheService flushKey Enter. key(%v) cachePath(%v)", key, cachePath)
// d.checkBuildCacheDir(filepath.Dir(cachePath))
// tmp := cachePath + ".tmp"
// f, err := os.OpenFile(tmp, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, os.FileMode(d.mode))
// defer os.Remove(tmp)
// if err != nil {
// log.LogErrorf("Create block tmp file:%s err:%s!", tmp, err)
// return err
// }
// //encrypt
// encryptXOR(data)
// _, err = f.Write(data)
// if err != nil {
// f.Close()
// log.LogErrorf("Write tmp failed: file %s err %s!", tmp, err)
// return err
// }
// err = f.Close()
// if err != nil {
// log.LogErrorf("Close tmp failed: file:%s err:%s!", tmp, err)
// return err
// }
// info, err := os.Stat(cachePath)
// //if already cached
// if !os.IsNotExist(err) {
// atomic.AddInt64(&d.usedSize, -(info.Size()))
// atomic.AddInt64(&d.usedCount, -1)
// os.Remove(cachePath)
// }
// err = os.Rename(tmp, cachePath)
// if err != nil {
// log.LogErrorf("Rename block tmp file:%s err:%s!", tmp, err)
// return err
// }
// atomic.AddInt64(&d.usedSize, int64(len(data)))
// atomic.AddInt64(&d.usedCount, 1)
// log.LogDebugf("TRACE BCacheService flushKey Exit. key(%v) cachePath(%v)", key, cachePath)
// return nil
//
//}
func (d *DiskStore) flushKey(key string, data []byte) error {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Cache:Write:FlushData", err, bgTime, 1)
}()
cachePath := d.buildCachePath(key, d.dir)
info, err := os.Stat(cachePath)
// if already cached
if err == nil && info.Size() > 0 {
return nil
}
log.LogDebugf("TRACE BCacheService flushKey Enter. key(%v) cachePath(%v)", key, cachePath)
d.checkBuildCacheDir(filepath.Dir(cachePath))
tmp := cachePath + ".tmp"
f, err := os.OpenFile(tmp, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, os.FileMode(d.mode))
defer os.Remove(tmp)
if err != nil {
log.LogWarnf("Create block tmp file:%s err:%s!", tmp, err)
return err
}
// encrypt
encryptXOR(data)
_, err = f.Write(data)
if err != nil {
f.Close()
log.LogErrorf("Write tmp failed: file %s err %s!", tmp, err)
return err
}
err = f.Close()
if err != nil {
log.LogErrorf("Close tmp failed: file:%s err:%s!", tmp, err)
return err
}
//info, err := os.Stat(cachePath)
////if already cached
//if !os.IsNotExist(err) {
// atomic.AddInt64(&d.usedSize, -(info.Size()))
// atomic.AddInt64(&d.usedCount, -1)
// os.Remove(cachePath)
//}
err = os.Rename(tmp, cachePath)
if err != nil {
log.LogErrorf("Rename block tmp file:%s err:%s!", tmp, err)
return err
}
atomic.AddInt64(&d.usedSize, int64(len(data)))
atomic.AddInt64(&d.usedCount, 1)
log.LogDebugf("TRACE BCacheService flushKey Exit. key(%v) cachePath(%v)", key, cachePath)
return nil
}
func (d *DiskStore) load(key string) (ReadCloser, error) {
cachePath := d.buildCachePath(key, d.dir)
log.LogDebugf("TRACE BCacheService load Enter. key(%v) cachePath(%v)", key, cachePath)
//if _, err := os.Stat(cachePath); err != nil {
// return nil, errors.NewError(os.ErrNotExist)
//}
f, err := os.OpenFile(cachePath, os.O_RDONLY, os.FileMode(d.mode))
log.LogDebugf("TRACE BCacheService load Exit. err(%v)", err)
return f, err
}
func (d *DiskStore) remove(key string) (err error) {
var size int64
cachePath := d.buildCachePath(key, d.dir)
log.LogDebugf("remove. cachePath(%v)", cachePath)
if info, err := os.Stat(cachePath); err == nil {
size = info.Size()
if err = os.Remove(cachePath); err == nil {
atomic.AddInt64(&d.usedSize, -size)
atomic.AddInt64(&d.usedCount, -1)
}
}
return err
}
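// buildCachePath derives the on-disk location of a cached block. Keys are
// expected to look like "<volume>_<inode>_<offset>"; when the inode part can
// be parsed it spreads files across 512 second-level directories, otherwise
// the key hash is used for both levels. Illustrative result for key
// "1_8388610_0" under dir "/data1/bcache":
//
// /data1/bcache/blocks/<hash(key)&0xFFF%512>/<8388610%512>/1_8388610_0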
func (d *DiskStore) buildCachePath(key string, dir string) string {
parts := strings.Split(key, "_")
if len(parts) < 2 {
// malformed key: fall back to hashing for both directory levels
return fmt.Sprintf("%s/blocks/%d/%d/%s", dir, hashKey(key)&0xFFF%512, hashKey(key)%512, key)
}
inodeId, err := strconv.ParseInt(parts[1], 10, 64)
if err != nil {
return fmt.Sprintf("%s/blocks/%d/%d/%s", dir, hashKey(key)&0xFFF%512, hashKey(key)%512, key)
}
return fmt.Sprintf("%s/blocks/%d/%d/%s", dir, hashKey(key)&0xFFF%512, inodeId%512, key)
}
func (d *DiskStore) diskUsageRatio() (float32, int64) {
log.LogDebugf("usedSize(%v), usedCount(%v)", atomic.LoadInt64(&d.usedSize), atomic.LoadInt64(&d.usedCount))
if atomic.LoadInt64(&d.usedSize) < 0 || atomic.LoadInt64(&d.usedCount) < 0 {
return 0, 0
}
return float32(atomic.LoadInt64(&d.usedSize)) / float32(d.capacity), atomic.LoadInt64(&d.usedCount)
}
func (d *DiskStore) scrub(key string, md5Sum string) error {
defer func() {
if r := recover(); r != nil {
return
}
}()
cachePath := d.buildCachePath(key, d.dir)
f, err := os.Open(cachePath)
if err != nil {
return err
}
defer f.Close()
r := bufio.NewReader(f)
h := md5.New()
_, err = io.Copy(h, r)
if err != nil {
return err
}
if md5Sum != hex.EncodeToString(h.Sum(nil)) {
return errors.New("scrub error")
}
return nil
}
func (d *DiskStore) updateStat(size uint32) {
atomic.AddInt64(&d.usedSize, int64(size))
atomic.AddInt64(&d.usedCount, 1)
}
func (d *DiskStore) getPath(key string) string {
cachePath := d.buildCachePath(key, d.dir)
return cachePath
}
func (bm *bcacheManager) deleteTmpFile(store *DiskStore) {
if _, err := os.Stat(store.dir); err != nil {
log.LogErrorf("cache dir %s is not exists", store.dir)
return
}
log.LogDebugf("clear tmp files in %v", store.dir)
c := make(chan keyPair)
keyPrefix := filepath.Join(store.dir, Basedir)
log.LogDebugf("keyPrefix %v", keyPrefix)
go func() {
filepath.Walk(store.dir, bm.walker(c, keyPrefix, false))
close(c)
}()
// consume chan
for range c {
}
log.LogDebugf("clear tmp files end%v", store.dir)
}
func checkoutTempFileOuttime(file string) bool {
finfo, err := os.Stat(file)
if err != nil {
return false
}
stat_t := finfo.Sys().(*syscall.Stat_t)
now := time.Now()
return now.Sub(timespecToTime(stat_t.Ctim)).Seconds() > 60*60 // 1 hour
}
func timespecToTime(ts syscall.Timespec) time.Time {
return time.Unix(int64(ts.Sec), int64(ts.Nsec))
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package bcache
import (
"encoding/binary"
"encoding/json"
"errors"
"fmt"
"io"
"net"
"strconv"
"syscall"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/buf"
)
const (
OpBlockCachePut uint8 = 0xB1
OpBlockCacheGet uint8 = 0xB2
OpBlockCacheDel uint8 = 0xB3
)
const (
CacheMagic uint8 = 0xFF
)
const (
PacketHeaderSize = 11
)
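// Wire layout of the PacketHeaderSize-byte header marshalled below:
//
// byte 0       Magic      (1 byte)
// byte 1       Opcode     (1 byte)
// byte 2       ResultCode (1 byte)
// bytes 3-6    CRC        (uint32, big endian)
// bytes 7-10   Size       (uint32, big endian, length of Data)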
var Buffers *buf.BufferPool
type PutCacheRequest struct {
CacheKey string `json:"key"`
Data []byte `json:"data"`
}
type GetCacheRequest struct {
CacheKey string `json:"key"`
Offset uint64 `json:"offset"`
Size uint32 `json:"size"`
}
type GetCachePathResponse struct {
CachePath string `json:"path"`
}
type GetCacheDataResponse struct {
Data []byte `json:"data"`
}
type DelCacheRequest struct {
CacheKey string `json:"key"`
}
type BlockCachePacket struct {
Magic uint8
Opcode uint8
ResultCode uint8 // Magic, Opcode and ResultCode take the first 3 header bytes
CRC uint32
Size uint32 // size of Data; header is 3 + 4 (CRC) + 4 (Size) = 11 bytes
Data []byte
StartT int64
}
func NewBlockCachePacket() *BlockCachePacket {
p := new(BlockCachePacket)
p.Magic = CacheMagic
p.StartT = time.Now().UnixNano()
return p
}
func (p *BlockCachePacket) String() string {
return fmt.Sprintf("OpMsg(%v)", p.GetOpMsg())
}
func (p *BlockCachePacket) GetOpMsg() (m string) {
switch p.Opcode {
case OpBlockCachePut:
m = "OpBlockCachePut"
case OpBlockCacheGet:
m = "OpBlockCacheGet"
case OpBlockCacheDel:
m = "OpBlockCacheDel"
default:
// do nothing
}
return
}
func (p *BlockCachePacket) GetResultMsg() (m string) {
if p == nil {
return ""
}
switch p.ResultCode {
case proto.OpErr:
m = "Err: " + string(p.Data)
case proto.OpOk:
m = "Ok"
case proto.OpNotExistErr:
m = "NotExistErr"
default:
return fmt.Sprintf("Unknown ResultCode(%v)", p.ResultCode)
}
return
}
func (p *BlockCachePacket) MarshalHeader(out []byte) {
out[0] = p.Magic
out[1] = p.Opcode
out[2] = p.ResultCode
binary.BigEndian.PutUint32(out[3:7], p.CRC)
binary.BigEndian.PutUint32(out[7:11], p.Size)
}
func (p *BlockCachePacket) UnMarshalHeader(in []byte) error {
p.Magic = in[0]
if p.Magic != CacheMagic {
return errors.New("Bad Magic " + strconv.Itoa(int(p.Magic)))
}
p.Opcode = in[1]
p.ResultCode = in[2]
p.CRC = binary.BigEndian.Uint32(in[3:7])
p.Size = binary.BigEndian.Uint32(in[7:11])
return nil
}
func (p *BlockCachePacket) MarshalData(v interface{}) error {
data, err := json.Marshal(v)
if err == nil {
p.Data = data
p.Size = uint32(len(p.Data))
}
return err
}
func (p *BlockCachePacket) UnmarshalData(v interface{}) error {
return json.Unmarshal(p.Data, v)
}
func (p *BlockCachePacket) WriteToConn(c net.Conn) (err error) {
header, err := Buffers.Get(PacketHeaderSize)
if err != nil {
header = make([]byte, PacketHeaderSize)
}
defer Buffers.Put(header)
c.SetWriteDeadline(time.Now().Add(proto.WriteDeadlineTime * time.Second))
p.MarshalHeader(header)
if _, err = c.Write(header); err == nil {
if p.Data != nil {
_, err = c.Write(p.Data[:p.Size])
}
}
return
}
func (p *BlockCachePacket) ReadFromConn(c net.Conn, timeoutSec int) (err error) {
if timeoutSec != proto.NoReadDeadlineTime {
c.SetReadDeadline(time.Now().Add(time.Second * time.Duration(timeoutSec)))
} else {
c.SetReadDeadline(time.Time{})
}
header, err := Buffers.Get(PacketHeaderSize)
if err != nil {
header = make([]byte, PacketHeaderSize)
}
defer Buffers.Put(header)
var n int
if n, err = io.ReadFull(c, header); err != nil {
return
}
if n != PacketHeaderSize {
return syscall.EBADMSG
}
if err = p.UnMarshalHeader(header); err != nil {
return
}
size := p.Size
//if p.Opcode == OpBlockCachePut || p.Opcode == OpBlockCacheDel {
// size = 0
//}
p.Data = make([]byte, size)
if n, err = io.ReadFull(c, p.Data[:size]); err != nil {
return err
}
if n != int(size) {
return syscall.EBADMSG
}
return nil
}
func (p *BlockCachePacket) PacketOkReplay() {
p.ResultCode = proto.OpOk
p.Size = 0
p.Data = nil
}
func (p *BlockCachePacket) PacketOkWithBody(reply []byte) {
p.Size = uint32(len(reply))
p.Data = make([]byte, p.Size)
copy(p.Data[:p.Size], reply)
p.ResultCode = proto.OpOk
}
func (p *BlockCachePacket) PacketErrorWithBody(code uint8, reply []byte) {
p.Size = uint32(len(reply))
p.Data = make([]byte, p.Size)
copy(p.Data[:p.Size], reply)
p.ResultCode = code
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package bcache
import (
"encoding/json"
"fmt"
"io"
"net"
"os"
"path/filepath"
"runtime"
"strconv"
"github.com/cubefs/cubefs/cmd/common"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
const (
UnixSocketPath = "/var/run/cubefscache/bcache.socket"
// config
CacheDir = "cacheDir"
CacheLimit = "cacheLimit"
CacheFree = "cacheFree"
BlockSize = "blockSize"
MaxFileSize = 128 << 30
MaxBlockSize = 128 << 20
BigExtentSize = 32 << 20
)
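// Illustrative bcache section of a client config consumed by parserConf below;
// the keys come from the constants above, the values are examples only:
//
// "cacheDir":   "/data1/bcache:107374182400;/data2/bcache:107374182400",
// "cacheLimit": "50000000",
// "cacheFree":  "0.15",
// "blockSize":  "1048576"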
type bcacheConfig struct {
CacheDir string
BlockSize uint32
Mode uint32
CacheSize int64
FreeRatio float32
Limit uint32
}
type bcacheStore struct {
bcache BcacheManager
conf *bcacheConfig
control common.Control
stopC chan struct{}
}
func NewServer() *bcacheStore {
return &bcacheStore{}
}
func (s *bcacheStore) Start(cfg *config.Config) (err error) {
runtime.GOMAXPROCS(runtime.NumCPU())
return s.control.Start(s, cfg, doStart)
}
func (s *bcacheStore) Shutdown() {
s.control.Shutdown(s, doShutdown)
}
func (s *bcacheStore) Sync() {
s.control.Sync()
}
func doStart(server common.Server, cfg *config.Config) (err error) {
s, ok := server.(*bcacheStore)
if !ok {
return errors.New("Invalid node Type!")
}
// parse the config file
var bconf *bcacheConfig
bconf, err = s.parserConf(cfg)
if err != nil {
err = errors.NewErrorf("block config parser error.")
panic(err)
}
// start bcache manage
bm := newBcacheManager(bconf)
if bm == nil {
err = errors.NewErrorf("block cache manager init fail.")
panic(err)
}
s.bcache = bm
s.conf = bconf
// start unix domain socket
s.startServer()
return
}
func doShutdown(server common.Server) {
s, ok := server.(*bcacheStore)
if !ok {
return
}
// stop unix domain socket
s.stopServer()
// close connpool
}
func (s *bcacheStore) startServer() (err error) {
// create socket dir
os.MkdirAll(filepath.Dir(UnixSocketPath), FilePerm)
if _, err := os.Stat(UnixSocketPath); err == nil {
existErr := fmt.Sprintf("Another process is running or %s already exist,force delete it.", UnixSocketPath)
log.LogErrorf(existErr)
os.Remove(UnixSocketPath)
}
s.stopC = make(chan struct{})
ln, err := net.Listen("unix", UnixSocketPath)
if err != nil {
panic(err)
}
go func(stopC chan struct{}) {
defer ln.Close()
for {
conn, err := ln.Accept()
select {
case <-stopC:
return
default:
}
if err != nil {
continue
}
go s.serveConn(conn, stopC)
}
}(s.stopC)
log.LogInfof("start blockcache server.")
return
}
func (s *bcacheStore) stopServer() {
if s.stopC != nil {
defer func() {
if r := recover(); r != nil {
log.LogErrorf("action[StopBcacheServer],err:%v", r)
}
}()
close(s.stopC)
}
}
func (s *bcacheStore) serveConn(conn net.Conn, stopC chan struct{}) {
defer conn.Close()
for {
select {
case <-stopC:
return
default:
}
p := &BlockCachePacket{}
if err := p.ReadFromConn(conn, proto.NoReadDeadlineTime); err != nil {
if err != io.EOF {
log.LogDebugf("serve BcacheServer: %v", err.Error())
}
return
}
if err := s.handlePacket(conn, p); err != nil {
log.LogDebugf("serve handlePacket fail: %v", err)
}
}
}
func (s *bcacheStore) handlePacket(conn net.Conn, p *BlockCachePacket) (err error) {
switch p.Opcode {
case OpBlockCachePut:
err = s.opBlockCachePut(conn, p)
case OpBlockCacheGet:
err = s.opBlockCacheGet(conn, p)
case OpBlockCacheDel:
err = s.opBlockCacheEvict(conn, p)
default:
err = fmt.Errorf("unknown Opcode: %d", p.Opcode)
}
return
}
func (s *bcacheStore) opBlockCachePut(conn net.Conn, p *BlockCachePacket) (err error) {
req := &PutCacheRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
s.response(conn, p)
err = errors.NewErrorf("req[%v],err[%v]", req, err.Error())
return
}
s.bcache.cache(req.CacheKey, req.Data, false)
p.PacketOkReplay()
s.response(conn, p)
return
}
func (s *bcacheStore) opBlockCacheGet(conn net.Conn, p *BlockCachePacket) (err error) {
req := &GetCacheRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
s.response(conn, p)
err = errors.NewErrorf("req[%v],err[%v]", req, string(p.Data))
return
}
cachePath, err := s.bcache.queryCachePath(req.CacheKey, req.Offset, req.Size)
if err != nil {
if err == os.ErrNotExist {
p.PacketErrorWithBody(proto.OpNotExistErr, ([]byte)(err.Error()))
} else {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
}
s.response(conn, p)
err = errors.NewErrorf("req[%v],err[%v]", req, string(p.Data))
return
}
resp := &GetCachePathResponse{CachePath: cachePath}
reply, err := json.Marshal(resp)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
s.response(conn, p)
err = errors.NewErrorf("req[%v],err[%v]", req, string(p.Data))
return
}
p.PacketOkWithBody(reply)
s.response(conn, p)
return
}
func (s *bcacheStore) opBlockCacheEvict(conn net.Conn, p *BlockCachePacket) (err error) {
req := &DelCacheRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
s.response(conn, p)
err = errors.NewErrorf("req[%v],err[%v]", req, err.Error())
return
}
s.bcache.erase(req.CacheKey)
p.PacketOkReplay()
s.response(conn, p)
return
}
func (s *bcacheStore) response(conn net.Conn, p *BlockCachePacket) (err error) {
defer func() {
if r := recover(); r != nil {
switch data := r.(type) {
case error:
err = data
default:
err = errors.New(data.(string))
}
}
}()
err = p.WriteToConn(conn)
if err != nil {
log.LogDebugf("response to client[%s], "+
"request[%s]",
err.Error(), p.GetOpMsg())
}
return
}
func (s *bcacheStore) parserConf(cfg *config.Config) (*bcacheConfig, error) {
bconf := &bcacheConfig{}
cacheDir := cfg.GetString(CacheDir)
cacheLimit := cfg.GetString(CacheLimit)
cacheFree := cfg.GetString(CacheFree)
blockSize := cfg.GetString(BlockSize)
bconf.CacheDir = cacheDir
if cacheDir == "" {
return nil, errors.NewErrorf("cacheDir is required.")
}
if v, err := strconv.ParseUint(blockSize, 10, 32); err == nil {
bconf.BlockSize = uint32(v)
}
if v, err := strconv.ParseUint(cacheLimit, 10, 32); err == nil {
bconf.Limit = uint32(v)
}
if v, err := strconv.ParseFloat(cacheFree, 32); err == nil {
bconf.FreeRatio = float32(v)
}
return bconf, nil
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package bcache
import (
"os"
"syscall"
)
func AccessTime(info os.FileInfo) int64 {
linuxFileAttr := info.Sys().(*syscall.Stat_t)
return linuxFileAttr.Atim.Sec
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package common
import "time"
type Retry struct {
retryTimes int
delayTime uint32
}
func (r Retry) On(caller func() error) error {
var lastErr error
for i := 0; i < r.retryTimes; i++ {
err := caller()
if err == nil {
return nil
}
lastErr = err
time.Sleep(time.Duration(r.delayTime) * time.Millisecond)
}
return lastErr
}
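// Usage sketch (illustrative): retry a flaky call up to 3 times, pausing
// 100 milliseconds between attempts; doSomething is a placeholder.
//
// err := Timed(3, 100).On(func() error { return doSomething() })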
// Timed returns a Retry configured with retryTimes attempts and a delayTime
// pause between attempts, in milliseconds.
func Timed(retryTimes int, delayTime uint32) Retry {
return Retry{
retryTimes: retryTimes,
delayTime: delayTime,
}
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
// Package taskpool provides limited pool running task
package common
// TaskPool limited pool
type TaskPool struct {
pool chan func()
}
// New returns task pool with workerCount and poolSize
func New(workerCount, poolSize int) TaskPool {
pool := make(chan func(), poolSize)
for i := 0; i < workerCount; i++ {
go func() {
for {
task, ok := <-pool
if !ok {
break
}
task()
}
}()
}
return TaskPool{pool: pool}
}
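// Usage sketch (illustrative): a pool of 4 workers draining a queue of up to
// 16 pending tasks.
//
// tp := New(4, 16)
// tp.Run(func() { /* blocking submit */ })
// ok := tp.TryRun(func() { /* dropped if the queue is full */ })
// tp.Close()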
// Run add task to pool, block if pool is full
func (tp TaskPool) Run(task func()) {
tp.pool <- task
}
// TryRun try to add task to pool, return immediately
func (tp TaskPool) TryRun(task func()) bool {
select {
case tp.pool <- task:
return true
default:
return false
}
}
// Close the pool, the function is concurrent unsafe
func (tp TaskPool) Close() {
close(tp.pool)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fs
import (
"syscall"
"time"
"github.com/cubefs/cubefs/depends/bazil.org/fuse"
"github.com/cubefs/cubefs/proto"
)
const (
RootInode = proto.RootIno
)
const (
DefaultBlksize = uint32(1) << 12
DefaultMaxNameLen = uint32(256)
)
const (
DefaultInodeExpiration = 120 * time.Second
MaxInodeCache = 10000000 // in terms of the number of items
DefaultMaxInodeCache = 2000000
)
const (
// the expiration duration of the dentry in the cache (used internally)
DentryValidDuration = 5 * time.Second
DefaultReaddirLimit = 1024
)
const (
DeleteExtentsTimeout = 600 * time.Second
)
const (
MaxSizePutOnce = int64(1) << 23
)
const (
DefaultFlag = 0x0f
)
var (
// The following two control the FUSE kernel cache.
// LookupValidDuration is how long the kernel may treat a lookup (dentry) result as valid before asking the filesystem again.
LookupValidDuration = 5 * time.Second
// the expiration duration of the attributes in the FUSE cache
AttrValidDuration = 30 * time.Second
DisableMetaCache = true
)
// ParseError returns the error type.
func ParseError(err error) fuse.Errno {
switch v := err.(type) {
case syscall.Errno:
return fuse.Errno(v)
case fuse.Errno:
return v
default:
return fuse.EIO
}
}
// ParseType returns the dentry type.
func ParseType(t uint32) fuse.DirentType {
if proto.IsDir(t) {
return fuse.DT_Dir
} else if proto.IsSymlink(t) {
return fuse.DT_Link
}
return fuse.DT_File
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fs
import (
"sync"
"time"
)
// DentryCache defines the dentry cache.
type DentryCache struct {
sync.Mutex
cache map[string]uint64
expiration time.Time
}
// NewDentryCache returns a new dentry cache.
func NewDentryCache() *DentryCache {
return &DentryCache{
cache: make(map[string]uint64),
expiration: time.Now().Add(DentryValidDuration),
}
}
// Put puts an item into the cache.
func (dc *DentryCache) Put(name string, ino uint64) {
if dc == nil {
return
}
dc.Lock()
defer dc.Unlock()
dc.cache[name] = ino
dc.expiration = time.Now().Add(DentryValidDuration)
}
// Get gets the item from the cache based on the given key.
func (dc *DentryCache) Get(name string) (uint64, bool) {
if dc == nil {
return 0, false
}
dc.Lock()
defer dc.Unlock()
if dc.expiration.Before(time.Now()) {
dc.cache = make(map[string]uint64)
return 0, false
}
ino, ok := dc.cache[name]
return ino, ok
}
// Delete deletes the item based on the given key.
func (dc *DentryCache) Delete(name string) {
if dc == nil {
return
}
dc.Lock()
defer dc.Unlock()
delete(dc.cache, name)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fs
import (
"container/list"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
const (
// MinDentryCacheEvictNum is used in the foreground eviction.
// When clearing dentries from the cache, it stops as soon as 10 dentries have been evicted.
MinDentryCacheEvictNum = 10
// MaxDentryCacheEvictNum is used in the background. We can evict at most 200000 dentries.
MaxDentryCacheEvictNum = 200000
DentryBgEvictionInterval = 2 * time.Minute
)
// Dcache defines the structure of the dentry cache.
type Dcache struct {
sync.RWMutex
cache map[string]*list.Element
lruList *list.List
expiration time.Duration
maxElements int
}
// NewDcache returns a new dentry cache.
func NewDcache(exp time.Duration, maxElements int) *Dcache {
dc := &Dcache{
cache: make(map[string]*list.Element),
lruList: list.New(),
expiration: exp,
maxElements: maxElements,
}
go dc.backgroundEviction()
return dc
}
// Put puts the given dentry info into the dentry cache.
func (dc *Dcache) Put(info *proto.DentryInfo) {
dc.Lock()
old, ok := dc.cache[info.Name]
if ok {
dc.lruList.Remove(old)
delete(dc.cache, info.Name)
}
if dc.lruList.Len() >= dc.maxElements {
dc.evict(true)
}
dentrySetExpiration(info, dc.expiration)
element := dc.lruList.PushFront(info)
dc.cache[info.Name] = element
dc.Unlock()
// log.LogDebugf("Dcache put inode: inode(%v)", info.Inode)
}
// Get returns the dentry info based on the given name (parentId+name).
func (dc *Dcache) Get(name string) *proto.DentryInfo {
dc.RLock()
element, ok := dc.cache[name]
if !ok {
dc.RUnlock()
return nil
}
info := element.Value.(*proto.DentryInfo)
if dentryExpired(info) && DisableMetaCache {
dc.RUnlock()
// log.LogDebugf("Dcache GetConnect expired: now(%v) inode(%v), expired(%d)", time.Now().Format(LogTimeFormat), info.Inode, info.Expiration())
return nil
}
dc.RUnlock()
return info
}
// Delete deletes the dentry info based on the given name (parentId+name).
func (dc *Dcache) Delete(name string) {
// log.LogDebugf("Dcache Delete: ino(%v)", ino)
dc.Lock()
element, ok := dc.cache[name]
if ok {
dc.lruList.Remove(element)
delete(dc.cache, name)
}
dc.Unlock()
}
// Foreground eviction cares more about the speed.
// Background eviction evicts all expired items from the cache.
// The caller should grab the WRITE lock of the inode cache.
func (dc *Dcache) evict(foreground bool) {
var count int
for i := 0; i < MinDentryCacheEvictNum; i++ {
element := dc.lruList.Back()
if element == nil {
return
}
// For background eviction, if all expired items have been evicted, just return.
// But for foreground eviction, we need to evict at least MinDentryCacheEvictNum dentries.
// Foreground eviction does not care whether the dentry has expired or not.
info := element.Value.(*proto.DentryInfo)
if !foreground && !dentryExpired(info) {
return
}
// log.LogDebugf("Dcache GetConnect expired: now(%v) inode(%v)", time.Now().Format(LogTimeFormat), info.Inode)
dc.lruList.Remove(element)
delete(dc.cache, info.Name)
count++
}
// For background eviction, we need to continue evict all expired items from the cache
if foreground {
return
}
for i := 0; i < MaxDentryCacheEvictNum; i++ {
element := dc.lruList.Back()
if element == nil {
break
}
info := element.Value.(*proto.DentryInfo)
if !dentryExpired(info) {
break
}
// log.LogDebugf("Dcache GetConnect expired: now(%v) inode(%v)", time.Now().Format(LogTimeFormat), info.Inode)
dc.lruList.Remove(element)
delete(dc.cache, info.Name)
count++
}
}
func (dc *Dcache) backgroundEviction() {
t := time.NewTicker(DentryBgEvictionInterval)
defer t.Stop()
for range t.C {
log.LogInfof("Dcache: start BG evict")
if !DisableMetaCache {
log.LogInfof("Dcache: no need to do BG evict")
continue
}
start := time.Now()
dc.Lock()
dc.evict(false)
dc.Unlock()
elapsed := time.Since(start)
log.LogInfof("Dcache: total inode cache(%d), cost(%d)ns", dc.lruList.Len(), elapsed.Nanoseconds())
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fs
import (
"syscall"
"github.com/cubefs/cubefs/depends/bazil.org/fuse"
)
func isDirectIOEnabled(flags fuse.OpenFlags) bool {
return (int(flags) & syscall.O_DIRECT) != 0
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fs
import (
"context"
"fmt"
"io"
"os"
"path"
"strconv"
"strings"
"sync"
"syscall"
"time"
"github.com/cubefs/cubefs/depends/bazil.org/fuse"
"github.com/cubefs/cubefs/depends/bazil.org/fuse/fs"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/meta"
"github.com/cubefs/cubefs/util/auditlog"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
)
// used to locate the position in parent
type DirContext struct {
Name string
}
type DirContexts struct {
sync.RWMutex
dirCtx map[fuse.HandleID]*DirContext
}
func NewDirContexts() (dctx *DirContexts) {
dctx = &DirContexts{}
dctx.dirCtx = make(map[fuse.HandleID]*DirContext, 0)
return
}
func (dctx *DirContexts) GetCopy(handle fuse.HandleID) DirContext {
dctx.RLock()
dirCtx, found := dctx.dirCtx[handle]
dctx.RUnlock()
if found {
return DirContext{dirCtx.Name}
} else {
return DirContext{}
}
}
func (dctx *DirContexts) Put(handle fuse.HandleID, dirCtx *DirContext) {
dctx.Lock()
defer dctx.Unlock()
oldCtx, found := dctx.dirCtx[handle]
if found {
oldCtx.Name = dirCtx.Name
return
}
dctx.dirCtx[handle] = dirCtx
}
func (dctx *DirContexts) Remove(handle fuse.HandleID) {
dctx.Lock()
delete(dctx.dirCtx, handle)
dctx.Unlock()
}
// Dir defines the structure of a directory
type Dir struct {
super *Super
info *proto.InodeInfo
dcache *DentryCache
dctx *DirContexts
parentIno uint64
name string
}
// Functions that Dir needs to implement
var (
_ fs.Node = (*Dir)(nil)
_ fs.NodeCreater = (*Dir)(nil)
_ fs.NodeForgetter = (*Dir)(nil)
_ fs.NodeMkdirer = (*Dir)(nil)
_ fs.NodeMknoder = (*Dir)(nil)
_ fs.NodeRemover = (*Dir)(nil)
_ fs.NodeFsyncer = (*Dir)(nil)
_ fs.NodeRequestLookuper = (*Dir)(nil)
_ fs.HandleReadDirAller = (*Dir)(nil)
_ fs.NodeRenamer = (*Dir)(nil)
_ fs.NodeSetattrer = (*Dir)(nil)
_ fs.NodeSymlinker = (*Dir)(nil)
_ fs.NodeGetxattrer = (*Dir)(nil)
_ fs.NodeListxattrer = (*Dir)(nil)
_ fs.NodeSetxattrer = (*Dir)(nil)
_ fs.NodeRemovexattrer = (*Dir)(nil)
)
// NewDir returns a new directory.
func NewDir(s *Super, i *proto.InodeInfo, pino uint64, dirName string) fs.Node {
return &Dir{
super: s,
info: i,
parentIno: pino,
name: dirName,
dctx: NewDirContexts(),
}
}
// Attr sets the attributes of a directory.
func (d *Dir) Attr(ctx context.Context, a *fuse.Attr) error {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Attr", err, bgTime, 1)
}()
ino := d.info.Inode
info, err := d.super.InodeGet(ino)
if err != nil {
log.LogErrorf("Attr: ino(%v) err(%v)", ino, err)
return ParseError(err)
}
fillAttr(info, a)
log.LogDebugf("TRACE Attr: inode(%v)", info)
return nil
}
func (d *Dir) Release(ctx context.Context, req *fuse.ReleaseRequest) (err error) {
d.dctx.Remove(req.Handle)
return nil
}
// Create handles the create request.
func (d *Dir) Create(ctx context.Context, req *fuse.CreateRequest, resp *fuse.CreateResponse) (fs.Node, fs.Handle, error) {
start := time.Now()
bgTime := stat.BeginStat()
var err error
var newInode uint64
metric := exporter.NewTPCnt("filecreate")
fullPath := path.Join(d.getCwd(), req.Name)
defer func() {
stat.EndStat("Create", err, bgTime, 1)
metric.SetWithLabels(err, map[string]string{exporter.Vol: d.super.volname})
auditlog.LogClientOp("Create", fullPath, "nil", err, time.Since(start).Microseconds(), newInode, 0)
}()
info, err := d.super.mw.Create_ll(d.info.Inode, req.Name, proto.Mode(req.Mode.Perm()), req.Uid, req.Gid, nil, fullPath)
if err != nil {
log.LogErrorf("Create: parent(%v) req(%v) err(%v)", d.info.Inode, req, err)
return nil, nil, ParseError(err)
}
d.super.ic.Put(info)
child := NewFile(d.super, info, uint32(req.Flags&DefaultFlag), d.info.Inode, req.Name)
newInode = info.Inode
d.super.ec.OpenStream(info.Inode)
d.super.fslock.Lock()
d.super.nodeCache[info.Inode] = child
d.super.fslock.Unlock()
if d.super.keepCache {
resp.Flags |= fuse.OpenKeepCache
}
resp.EntryValid = LookupValidDuration
d.super.ic.Delete(d.info.Inode)
elapsed := time.Since(start)
log.LogDebugf("TRACE Create: parent(%v) req(%v) resp(%v) ino(%v) (%v)ns", d.info.Inode, req, resp, info.Inode, elapsed.Nanoseconds())
return child, child, nil
}
// Forget is called when the kernel evicts the inode.
func (d *Dir) Forget() {
bgTime := stat.BeginStat()
ino := d.info.Inode
defer func() {
stat.EndStat("Forget", nil, bgTime, 1)
log.LogDebugf("TRACE Forget: ino(%v)", ino)
}()
d.super.ic.Delete(ino)
d.super.fslock.Lock()
delete(d.super.nodeCache, ino)
d.super.fslock.Unlock()
}
// Mkdir handles the mkdir request.
func (d *Dir) Mkdir(ctx context.Context, req *fuse.MkdirRequest) (fs.Node, error) {
start := time.Now()
bgTime := stat.BeginStat()
var err error
var newInode uint64
metric := exporter.NewTPCnt("mkdir")
fullPath := path.Join(d.getCwd(), req.Name)
defer func() {
stat.EndStat("Mkdir", err, bgTime, 1)
metric.SetWithLabels(err, map[string]string{exporter.Vol: d.super.volname})
auditlog.LogClientOp("Mkdir", fullPath, "nil", err, time.Since(start).Microseconds(), newInode, 0)
}()
info, err := d.super.mw.Create_ll(d.info.Inode, req.Name, proto.Mode(os.ModeDir|req.Mode.Perm()), req.Uid, req.Gid, nil, fullPath)
if err != nil {
log.LogErrorf("Mkdir: parent(%v) req(%v) err(%v)", d.info.Inode, req, err)
return nil, ParseError(err)
}
d.super.ic.Put(info)
child := NewDir(d.super, info, d.info.Inode, req.Name)
newInode = info.Inode
d.super.fslock.Lock()
d.super.nodeCache[info.Inode] = child
d.super.fslock.Unlock()
d.super.ic.Delete(d.info.Inode)
elapsed := time.Since(start)
log.LogDebugf("TRACE Mkdir: parent(%v) req(%v) ino(%v) (%v)ns", d.info.Inode, req, info.Inode, elapsed.Nanoseconds())
return child, nil
}
// Remove handles the remove request.
func (d *Dir) Remove(ctx context.Context, req *fuse.RemoveRequest) error {
start := time.Now()
d.dcache.Delete(req.Name)
dcacheKey := d.buildDcacheKey(d.info.Inode, req.Name)
d.super.dc.Delete(dcacheKey)
bgTime := stat.BeginStat()
var err error
var deletedInode uint64
metric := exporter.NewTPCnt("remove")
fullPath := path.Join(d.getCwd(), req.Name)
defer func() {
stat.EndStat("Remove", err, bgTime, 1)
metric.SetWithLabels(err, map[string]string{exporter.Vol: d.super.volname})
auditlog.LogClientOp("Remove", fullPath, "nil", err, time.Since(start).Microseconds(), deletedInode, 0)
}()
info, err := d.super.mw.Delete_ll(d.info.Inode, req.Name, req.Dir, fullPath)
if err != nil {
log.LogErrorf("Remove: parent(%v) name(%v) err(%v)", d.info.Inode, req.Name, err)
return ParseError(err)
}
if info != nil {
deletedInode = info.Inode
}
d.super.ic.Delete(d.info.Inode)
if info != nil && info.Nlink == 0 && !proto.IsDir(info.Mode) {
d.super.orphan.Put(info.Inode)
log.LogDebugf("Remove: add to orphan inode list, ino(%v)", info.Inode)
}
elapsed := time.Since(start)
log.LogDebugf("TRACE Remove: parent(%v) req(%v) inode(%v) (%v)ns", d.info.Inode, req, info, elapsed.Nanoseconds())
return nil
}
func (d *Dir) Fsync(ctx context.Context, req *fuse.FsyncRequest) error {
return nil
}
// Lookup handles the lookup request.
func (d *Dir) Lookup(ctx context.Context, req *fuse.LookupRequest, resp *fuse.LookupResponse) (fs.Node, error) {
var (
ino uint64
err error
dcachev2 bool
)
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Lookup", err, bgTime, 1)
}()
log.LogDebugf("TRACE Lookup: parent(%v) req(%v)", d.info.Inode, req)
log.LogDebugf("TRACE Lookup: parent(%v) path(%v) d.super.bcacheDir(%v)", d.info.Inode, d.getCwd(), d.super.bcacheDir)
if d.needDentrycache() {
dcachev2 = true
}
if dcachev2 {
lookupMetric := exporter.NewCounter("lookupDcache")
lookupMetric.AddWithLabels(1, map[string]string{exporter.Vol: d.super.volname})
dcacheKey := d.buildDcacheKey(d.info.Inode, req.Name)
dentryInfo := d.super.dc.Get(dcacheKey)
if dentryInfo == nil {
lookupMetric := exporter.NewCounter("lookupDcacheMiss")
lookupMetric.AddWithLabels(1, map[string]string{exporter.Vol: d.super.volname})
ino, _, err = d.super.mw.Lookup_ll(d.info.Inode, req.Name)
if err != nil {
if err != syscall.ENOENT {
log.LogErrorf("Lookup: parent(%v) name(%v) err(%v)", d.info.Inode, req.Name, err)
}
return nil, ParseError(err)
}
info := &proto.DentryInfo{
Name: dcacheKey,
Inode: ino,
}
d.super.dc.Put(info)
} else {
lookupMetric := exporter.NewCounter("lookupDcacheHit")
lookupMetric.AddWithLabels(1, map[string]string{exporter.Vol: d.super.volname})
ino = dentryInfo.Inode
}
} else {
cino, ok := d.dcache.Get(req.Name)
if !ok {
cino, _, err = d.super.mw.Lookup_ll(d.info.Inode, req.Name)
if err != nil {
if err != syscall.ENOENT {
log.LogErrorf("Lookup: parent(%v) name(%v) err(%v)", d.info.Inode, req.Name, err)
}
return nil, ParseError(err)
}
}
ino = cino
}
info, err := d.super.InodeGet(ino)
if err != nil {
log.LogErrorf("Lookup: parent(%v) name(%v) ino(%v) err(%v)", d.info.Inode, req.Name, ino, err)
dummyInodeInfo := &proto.InodeInfo{Inode: ino}
dummyChild := NewFile(d.super, dummyInodeInfo, DefaultFlag, d.info.Inode, req.Name)
return dummyChild, nil
}
mode := proto.OsMode(info.Mode)
d.super.fslock.Lock()
child, ok := d.super.nodeCache[ino]
if !ok {
if mode.IsDir() {
child = NewDir(d.super, info, d.info.Inode, req.Name)
} else {
child = NewFile(d.super, info, DefaultFlag, d.info.Inode, req.Name)
}
d.super.nodeCache[ino] = child
}
d.super.fslock.Unlock()
resp.EntryValid = LookupValidDuration
log.LogDebugf("TRACE Lookup exit: parent(%v) req(%v) cost (%d)", d.info.Inode, req, time.Since(*bgTime).Microseconds())
return child, nil
}
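// buildDcacheKey composes the dentry-cache key as "<parentInode>_<name>",
// e.g. (illustrative) parent inode 1 and name "foo" yield "1_foo".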
func (d *Dir) buildDcacheKey(inode uint64, name string) string {
return fmt.Sprintf("%v_%v", inode, name)
}
func (d *Dir) ReadDir(ctx context.Context, req *fuse.ReadRequest, resp *fuse.ReadResponse) ([]fuse.Dirent, error) {
var err error
var limit uint64 = DefaultReaddirLimit
start := time.Now()
bgTime := stat.BeginStat()
// var err error
metric := exporter.NewTPCnt("readdir")
defer func() {
stat.EndStat("ReadDirLimit", err, bgTime, 1)
metric.SetWithLabels(err, map[string]string{exporter.Vol: d.super.volname})
}()
var dirCtx DirContext
if req.Offset != 0 {
dirCtx = d.dctx.GetCopy(req.Handle)
} else {
dirCtx = DirContext{}
}
children, err := d.super.mw.ReadDirLimit_ll(d.info.Inode, dirCtx.Name, limit)
if err != nil {
log.LogErrorf("readdirlimit: Readdir: ino(%v) err(%v) offset %v", d.info.Inode, err, req.Offset)
return make([]fuse.Dirent, 0), ParseError(err)
}
if req.Offset == 0 {
if len(children) == 0 {
dirents := make([]fuse.Dirent, 0, len(children))
dirents = append(dirents, fuse.Dirent{
Inode: d.info.Inode,
Type: fuse.DT_Dir,
Name: ".",
})
pid := uint64(req.Pid)
if d.info.Inode == 1 {
pid = d.info.Inode
}
dirents = append(dirents, fuse.Dirent{
Inode: pid,
Type: fuse.DT_Dir,
Name: "..",
})
return dirents, io.EOF
}
children = append([]proto.Dentry{{
Name: ".",
Inode: d.info.Inode,
Type: uint32(os.ModeDir),
}, {
Name: "..",
Inode: uint64(req.Pid),
Type: uint32(os.ModeDir),
}}, children...)
}
// skip the first one, which is already accessed
childrenNr := uint64(len(children))
if childrenNr == 0 || (dirCtx.Name != "" && childrenNr == 1) {
return make([]fuse.Dirent, 0), io.EOF
} else if childrenNr < limit {
err = io.EOF
}
if dirCtx.Name != "" {
children = children[1:]
}
/* update dirCtx */
dirCtx.Name = children[len(children)-1].Name
d.dctx.Put(req.Handle, &dirCtx)
inodes := make([]uint64, 0, len(children))
dirents := make([]fuse.Dirent, 0, len(children))
log.LogDebugf("Readdir ino(%v) path(%v) d.super.bcacheDir(%v)", d.info.Inode, d.getCwd(), d.super.bcacheDir)
var dcache *DentryCache
if !d.super.disableDcache {
dcache = NewDentryCache()
}
var dcachev2 bool
if d.needDentrycache() {
dcachev2 = true
}
for _, child := range children {
dentry := fuse.Dirent{
Inode: child.Inode,
Type: ParseType(child.Type),
Name: child.Name,
}
inodes = append(inodes, child.Inode)
dirents = append(dirents, dentry)
if dcachev2 {
info := &proto.DentryInfo{
Name: d.buildDcacheKey(d.info.Inode, child.Name),
Inode: child.Inode,
}
d.super.dc.Put(info)
} else {
dcache.Put(child.Name, child.Inode)
}
}
infos := d.super.mw.BatchInodeGet(inodes)
for _, info := range infos {
d.super.ic.Put(info)
}
d.dcache = dcache
elapsed := time.Since(start)
log.LogDebugf("TRACE ReadDir exit: ino(%v) (%v)ns %v", d.info.Inode, elapsed.Nanoseconds(), req)
return dirents, err
}
// ReadDirAll gets all the dentries in a directory and puts them into the cache.
func (d *Dir) ReadDirAll(ctx context.Context) ([]fuse.Dirent, error) {
start := time.Now()
bgTime := stat.BeginStat()
var err error
metric := exporter.NewTPCnt("readdir")
defer func() {
stat.EndStat("ReadDirAll", err, bgTime, 1)
metric.SetWithLabels(err, map[string]string{exporter.Vol: d.super.volname})
}()
// transform ReadDirAll to ReadDirLimit_ll
noMore := false
from := ""
var children []proto.Dentry
for !noMore {
batches, err := d.super.mw.ReadDirLimit_ll(d.info.Inode, from, DefaultReaddirLimit)
if err != nil {
log.LogErrorf("Readdir: ino(%v) err(%v) from(%v)", d.info.Inode, err, from)
return make([]fuse.Dirent, 0), ParseError(err)
}
batchNr := uint64(len(batches))
if batchNr == 0 || (from != "" && batchNr == 1) {
noMore = true
break
} else if batchNr < DefaultReaddirLimit {
noMore = true
}
if from != "" {
batches = batches[1:]
}
children = append(children, batches...)
from = batches[len(batches)-1].Name
}
inodes := make([]uint64, 0, len(children))
dirents := make([]fuse.Dirent, 0, len(children))
log.LogDebugf("Readdir ino(%v) path(%v) d.super.bcacheDir(%v)", d.info.Inode, d.getCwd(), d.super.bcacheDir)
var dcache *DentryCache
if !d.super.disableDcache {
dcache = NewDentryCache()
}
var dcachev2 bool
if d.needDentrycache() {
dcachev2 = true
}
for _, child := range children {
dentry := fuse.Dirent{
Inode: child.Inode,
Type: ParseType(child.Type),
Name: child.Name,
}
inodes = append(inodes, child.Inode)
dirents = append(dirents, dentry)
if dcachev2 {
info := &proto.DentryInfo{
Name: d.buildDcacheKey(d.info.Inode, child.Name),
Inode: child.Inode,
}
d.super.dc.Put(info)
} else {
dcache.Put(child.Name, child.Inode)
}
}
infos := d.super.mw.BatchInodeGet(inodes)
for _, info := range infos {
d.super.ic.Put(info)
}
d.dcache = dcache
elapsed := time.Since(start)
log.LogDebugf("TRACE ReadDirAll: ino(%v) (%v)ns", d.info.Inode, elapsed.Nanoseconds())
return dirents, nil
}
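// Note on the pagination contract used by ReadDirAll above (an editorial
// sketch, not additional behavior): ReadDirLimit_ll(ino, from, limit) returns
// up to `limit` dentries starting at name `from`, and when `from` is non-empty
// the first entry repeats `from` itself, which is why every follow-up batch
// drops element 0. A minimal caller following the same pattern:
//
//	from := ""
//	for {
//		batch, err := mw.ReadDirLimit_ll(ino, from, DefaultReaddirLimit)
//		if err != nil || len(batch) == 0 {
//			break
//		}
//		if from != "" {
//			batch = batch[1:] // drop the duplicated boundary entry
//		}
//		if len(batch) == 0 {
//			break
//		}
//		// consume batch ...
//		from = batch[len(batch)-1].Name
//	}
//
// Here mw and ino stand for any meta wrapper and directory inode.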
// Rename handles the rename request.
func (d *Dir) Rename(ctx context.Context, req *fuse.RenameRequest, newDir fs.Node) error {
dstDir, ok := newDir.(*Dir)
if !ok {
log.LogErrorf("Rename: NOT DIR, parent(%v) req(%v)", d.info.Inode, req)
return fuse.ENOTSUP
}
start := time.Now()
var srcInode uint64 // must exist
var dstInode uint64 // may not exist
var err error
if ino, ok := dstDir.dcache.Get(req.NewName); ok {
dstInode = ino
}
if ino, ok := d.dcache.Get(req.OldName); ok {
srcInode = ino
} else {
// normally not reached: fall back to a direct lookup on dcache miss
if ino, _, err := d.super.mw.Lookup_ll(d.info.Inode, req.OldName); err == nil {
srcInode = ino
}
}
d.dcache.Delete(req.OldName)
dcacheKey := d.buildDcacheKey(d.info.Inode, req.OldName)
d.super.dc.Delete(dcacheKey)
bgTime := stat.BeginStat()
metric := exporter.NewTPCnt("rename")
srcPath := path.Join(d.getCwd(), req.OldName)
dstPath := path.Join(dstDir.getCwd(), req.NewName)
defer func() {
stat.EndStat("Rename", err, bgTime, 1)
metric.SetWithLabels(err, map[string]string{exporter.Vol: d.super.volname})
d.super.fslock.Lock()
node, ok := d.super.nodeCache[srcInode]
if ok && srcInode != 0 {
if dir, ok := node.(*Dir); ok {
dir.name = req.NewName
dir.parentIno = dstDir.info.Inode
} else {
file := node.(*File)
file.name = req.NewName
file.parentIno = dstDir.info.Inode
}
}
d.super.fslock.Unlock()
auditlog.LogClientOp("Rename", srcPath, dstPath, err, time.Since(start).Microseconds(), srcInode, dstInode)
}()
// changePathMap := d.super.mw.GetChangeQuota(d.getCwd()+"/"+req.OldName, dstDir.getCwd()+"/"+req.NewName)
if d.super.mw.EnableQuota {
if !d.canRenameByQuota(dstDir, req.OldName) {
return fuse.EPERM
}
}
err = d.super.mw.Rename_ll(d.info.Inode, req.OldName, dstDir.info.Inode, req.NewName, srcPath, dstPath, true)
if err != nil {
log.LogErrorf("Rename: parent(%v) req(%v) err(%v)", d.info.Inode, req, err)
return ParseError(err)
}
// if len(changePathMap) != 0 {
// d.super.mw.BatchModifyQuotaPath(changePathMap)
// }
d.super.ic.Delete(d.info.Inode)
d.super.ic.Delete(dstDir.info.Inode)
elapsed := time.Since(start)
log.LogDebugf("TRACE Rename: SrcParent(%v) OldName(%v) DstParent(%v) NewName(%v) (%v)ns", d.info.Inode, req.OldName, dstDir.info.Inode, req.NewName, elapsed.Nanoseconds())
return nil
}
// Setattr handles the setattr request.
func (d *Dir) Setattr(ctx context.Context, req *fuse.SetattrRequest, resp *fuse.SetattrResponse) error {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Setattr", err, bgTime, 1)
}()
ino := d.info.Inode
start := time.Now()
info, err := d.super.InodeGet(ino)
if err != nil {
log.LogErrorf("Setattr: ino(%v) err(%v)", ino, err)
return ParseError(err)
}
if valid := setattr(info, req); valid != 0 {
err = d.super.mw.Setattr(ino, valid, info.Mode, info.Uid, info.Gid, info.AccessTime.Unix(),
info.ModifyTime.Unix())
if err != nil {
d.super.ic.Delete(ino)
return ParseError(err)
}
}
fillAttr(info, &resp.Attr)
elapsed := time.Since(start)
log.LogDebugf("TRACE Setattr: ino(%v) req(%v) inodeSize(%v) (%v)ns", ino, req, info.Size, elapsed.Nanoseconds())
return nil
}
func (d *Dir) Mknod(ctx context.Context, req *fuse.MknodRequest) (fs.Node, error) {
if req.Rdev != 0 {
return nil, fuse.ENOSYS
}
start := time.Now()
bgTime := stat.BeginStat()
var err error
metric := exporter.NewTPCnt("mknod")
defer func() {
stat.EndStat("Mknod", err, bgTime, 1)
metric.SetWithLabels(err, map[string]string{exporter.Vol: d.super.volname})
}()
fullPath := path.Join(d.getCwd(), req.Name)
info, err := d.super.mw.Create_ll(d.info.Inode, req.Name, proto.Mode(req.Mode), req.Uid, req.Gid, nil, fullPath)
if err != nil {
log.LogErrorf("Mknod: parent(%v) req(%v) err(%v)", d.info.Inode, req, err)
return nil, ParseError(err)
}
d.super.ic.Put(info)
child := NewFile(d.super, info, DefaultFlag, d.info.Inode, req.Name)
d.super.fslock.Lock()
d.super.nodeCache[info.Inode] = child
d.super.fslock.Unlock()
elapsed := time.Since(start)
log.LogDebugf("TRACE Mknod: parent(%v) req(%v) ino(%v) (%v)ns", d.info.Inode, req, info.Inode, elapsed.Nanoseconds())
return child, nil
}
// Symlink handles the symlink request.
func (d *Dir) Symlink(ctx context.Context, req *fuse.SymlinkRequest) (fs.Node, error) {
parentIno := d.info.Inode
start := time.Now()
bgTime := stat.BeginStat()
var err error
metric := exporter.NewTPCnt("symlink")
defer func() {
stat.EndStat("Symlink", err, bgTime, 1)
metric.SetWithLabels(err, map[string]string{exporter.Vol: d.super.volname})
}()
fullPath := path.Join(d.getCwd(), req.NewName)
info, err := d.super.mw.Create_ll(parentIno, req.NewName, proto.Mode(os.ModeSymlink|os.ModePerm), req.Uid, req.Gid, []byte(req.Target), fullPath)
if err != nil {
log.LogErrorf("Symlink: parent(%v) NewName(%v) err(%v)", parentIno, req.NewName, err)
return nil, ParseError(err)
}
d.super.ic.Put(info)
child := NewFile(d.super, info, DefaultFlag, d.info.Inode, req.NewName)
d.super.fslock.Lock()
d.super.nodeCache[info.Inode] = child
d.super.fslock.Unlock()
elapsed := time.Since(start)
log.LogDebugf("TRACE Symlink: parent(%v) req(%v) ino(%v) (%v)ns", parentIno, req, info.Inode, elapsed.Nanoseconds())
return child, nil
}
// Link handles the link request.
func (d *Dir) Link(ctx context.Context, req *fuse.LinkRequest, old fs.Node) (fs.Node, error) {
var oldInode *proto.InodeInfo
switch old := old.(type) {
case *File:
oldInode = old.info
default:
return nil, fuse.EPERM
}
if !proto.IsRegular(oldInode.Mode) {
log.LogErrorf("Link: not regular, parent(%v) name(%v) ino(%v) mode(%v)", d.info.Inode, req.NewName, oldInode.Inode, proto.OsMode(oldInode.Mode))
return nil, fuse.EPERM
}
start := time.Now()
bgTime := stat.BeginStat()
var err error
metric := exporter.NewTPCnt("link")
defer func() {
stat.EndStat("Link", err, bgTime, 1)
metric.SetWithLabels(err, map[string]string{exporter.Vol: d.super.volname})
}()
fullPath := path.Join(d.getCwd(), req.NewName)
info, err := d.super.mw.Link(d.info.Inode, req.NewName, oldInode.Inode, fullPath)
if err != nil {
log.LogErrorf("Link: parent(%v) name(%v) ino(%v) err(%v)", d.info.Inode, req.NewName, oldInode.Inode, err)
return nil, ParseError(err)
}
d.super.ic.Put(info)
d.super.fslock.Lock()
newFile, ok := d.super.nodeCache[info.Inode]
if !ok {
newFile = NewFile(d.super, info, DefaultFlag, d.info.Inode, req.NewName)
d.super.nodeCache[info.Inode] = newFile
}
d.super.fslock.Unlock()
elapsed := time.Since(start)
log.LogDebugf("TRACE Link: parent(%v) name(%v) ino(%v) (%v)ns", d.info.Inode, req.NewName, info.Inode, elapsed.Nanoseconds())
return newFile, nil
}
// Getxattr handles the getxattr request.
func (d *Dir) Getxattr(ctx context.Context, req *fuse.GetxattrRequest, resp *fuse.GetxattrResponse) error {
if !d.super.enableXattr {
return fuse.ENOSYS
}
ino := d.info.Inode
name := req.Name
size := req.Size
pos := req.Position
var value []byte
var info *proto.XAttrInfo
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Getxattr", err, bgTime, 1)
}()
if name == meta.SummaryKey {
if !d.super.mw.EnableSummary {
return fuse.ENOSYS
}
var summaryInfo meta.SummaryInfo
cacheSummaryInfo := d.super.sc.Get(ino)
if cacheSummaryInfo != nil {
summaryInfo = *cacheSummaryInfo
} else {
summaryInfo, err = d.super.mw.GetSummary_ll(ino, 20)
if err != nil {
log.LogErrorf("GetXattr: ino(%v) name(%v) err(%v)", ino, name, err)
return ParseError(err)
}
d.super.sc.Put(ino, &summaryInfo)
}
files := summaryInfo.Files
subdirs := summaryInfo.Subdirs
fbytes := summaryInfo.Fbytes
summaryStr := "Files:" + strconv.FormatInt(int64(files), 10) + "," +
"Dirs:" + strconv.FormatInt(int64(subdirs), 10) + "," +
"Bytes:" + strconv.FormatInt(int64(fbytes), 10)
value = []byte(summaryStr)
} else {
info, err = d.super.mw.XAttrGet_ll(ino, name)
if err != nil {
log.LogErrorf("GetXattr: ino(%v) name(%v) err(%v)", ino, name, err)
return ParseError(err)
}
value = info.Get(name)
}
if pos > 0 {
value = value[pos:]
}
if size > 0 && size < uint32(len(value)) {
value = value[:size]
}
resp.Xattr = value
log.LogDebugf("TRACE GetXattr: ino(%v) name(%v)", ino, name)
return nil
}
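// Note on the summary xattr handled above (descriptive only, derived from the
// code in this function): when req.Name equals meta.SummaryKey the value is a
// computed plain-text triple rather than a stored xattr, formatted as
//
//	Files:<file count>,Dirs:<subdir count>,Bytes:<total bytes>
//
// e.g. a directory with 3 files, 1 subdirectory and 4096 bytes yields
// "Files:3,Dirs:1,Bytes:4096". Position and size handling then slice this
// string exactly like a regular xattr value.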
// Listxattr handles the listxattr request.
func (d *Dir) Listxattr(ctx context.Context, req *fuse.ListxattrRequest, resp *fuse.ListxattrResponse) error {
if !d.super.enableXattr {
return fuse.ENOSYS
}
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Getxattr", err, bgTime, 1)
}()
ino := d.info.Inode
_ = req.Size // ignore currently
_ = req.Position // ignore currently
keys, err := d.super.mw.XAttrsList_ll(ino)
if err != nil {
log.LogErrorf("ListXattr: ino(%v) err(%v)", ino, err)
return ParseError(err)
}
for _, key := range keys {
resp.Append(key)
}
log.LogDebugf("TRACE Listxattr: ino(%v)", ino)
return nil
}
// Setxattr handles the setxattr request.
func (d *Dir) Setxattr(ctx context.Context, req *fuse.SetxattrRequest) error {
if !d.super.enableXattr {
return fuse.ENOSYS
}
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Setxattr", err, bgTime, 1)
}()
ino := d.info.Inode
name := req.Name
value := req.Xattr
if name == meta.SummaryKey {
log.LogErrorf("Set 'DirStat' is not supported.")
return fuse.ENOSYS
}
// TODO: implement flags to improve compatibility (Mofei Zhang)
if err = d.super.mw.XAttrSet_ll(ino, []byte(name), []byte(value)); err != nil {
log.LogErrorf("Setxattr: ino(%v) name(%v) err(%v)", ino, name, err)
return ParseError(err)
}
log.LogDebugf("TRACE Setxattr: ino(%v) name(%v)", ino, name)
return nil
}
// Removexattr handles the removexattr request.
func (d *Dir) Removexattr(ctx context.Context, req *fuse.RemovexattrRequest) error {
if !d.super.enableXattr {
return fuse.ENOSYS
}
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Removexattr", err, bgTime, 1)
}()
ino := d.info.Inode
name := req.Name
if name == meta.SummaryKey {
log.LogErrorf("Remove 'DirStat' is not supported.")
return fuse.ENOSYS
}
if err = d.super.mw.XAttrDel_ll(ino, name); err != nil {
log.LogErrorf("Removexattr: ino(%v) name(%v) err(%v)", ino, name, err)
return ParseError(err)
}
log.LogDebugf("TRACE RemoveXattr: ino(%v) name(%v)", ino, name)
return nil
}
func (d *Dir) getCwd() string {
dirPath := ""
if d.info.Inode == d.super.rootIno {
return "/"
}
curIno := d.info.Inode
for curIno != d.super.rootIno {
d.super.fslock.Lock()
node, ok := d.super.nodeCache[curIno]
d.super.fslock.Unlock()
if !ok {
log.LogErrorf("Get node cache failed: ino(%v)", curIno)
return "unknown" + dirPath
}
curDir, ok := node.(*Dir)
if !ok {
log.LogErrorf("Type error: Can not convert node -> *Dir, ino(%v)", curDir.parentIno)
return "unknown" + dirPath
}
dirPath = "/" + curDir.name + dirPath
curIno = curDir.parentIno
}
return dirPath
}
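// Worked example for getCwd (illustrative): with nodeCache containing
// {5: Dir{name: "a", parentIno: rootIno}, 9: Dir{name: "b", parentIno: 5}},
// calling getCwd on the directory with inode 9 climbs 9 -> 5 -> root and
// returns "/a/b"; a missing or non-*Dir entry along the way short-circuits to
// an "unknown"-prefixed path instead.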
func (d *Dir) needDentrycache() bool {
return !DisableMetaCache && d.super.bcacheDir != "" && strings.HasPrefix(d.getCwd(), d.super.bcacheDir)
}
func dentryExpired(info *proto.DentryInfo) bool {
return time.Now().UnixNano() > info.Expiration()
}
func dentrySetExpiration(info *proto.DentryInfo, t time.Duration) {
info.SetExpiration(time.Now().Add(t).UnixNano())
}
func (d *Dir) canRenameByQuota(dstDir *Dir, srcName string) bool {
fullPaths := d.super.mw.GetQuotaFullPaths()
if len(fullPaths) == 0 {
return true
}
var srcPath string
if d.getCwd() == "/" {
srcPath = "/" + srcName
} else {
srcPath = d.getCwd() + "/" + srcName
}
for _, fullPath := range fullPaths {
log.LogDebugf("srcPath [%v] fullPath[%v].", srcPath, fullPath)
if proto.IsAncestor(srcPath, fullPath) {
return false
}
}
return true
}
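// Worked example for canRenameByQuota (illustrative; it assumes that
// proto.IsAncestor(a, b) reports whether path a is an ancestor of, or equal
// to, path b): with quota full paths ["/q1"], renaming "q1" in the root
// directory gives srcPath "/q1", which is an ancestor of the quota root, so
// the rename is refused with EPERM; renaming "/data/file" matches no quota
// path and is allowed.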
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fs
import (
"context"
"fmt"
"io"
"path"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"github.com/cubefs/cubefs/depends/bazil.org/fuse"
"github.com/cubefs/cubefs/depends/bazil.org/fuse/fs"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/data/blobstore"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
)
// File defines the structure of a file.
type File struct {
super *Super
info *proto.InodeInfo
idle int32
parentIno uint64
name string
sync.RWMutex
fReader *blobstore.Reader
fWriter *blobstore.Writer
}
// Interfaces that File must implement
var (
_ fs.Node = (*File)(nil)
_ fs.Handle = (*File)(nil)
_ fs.NodeForgetter = (*File)(nil)
_ fs.NodeOpener = (*File)(nil)
_ fs.HandleReleaser = (*File)(nil)
_ fs.HandleReader = (*File)(nil)
_ fs.HandleWriter = (*File)(nil)
_ fs.HandleFlusher = (*File)(nil)
_ fs.NodeFsyncer = (*File)(nil)
_ fs.NodeSetattrer = (*File)(nil)
_ fs.NodeReadlinker = (*File)(nil)
_ fs.NodeGetxattrer = (*File)(nil)
_ fs.NodeListxattrer = (*File)(nil)
_ fs.NodeSetxattrer = (*File)(nil)
_ fs.NodeRemovexattrer = (*File)(nil)
)
// NewFile returns a new file.
func NewFile(s *Super, i *proto.InodeInfo, flag uint32, pino uint64, filename string) fs.Node {
if proto.IsCold(s.volType) {
var (
fReader *blobstore.Reader
fWriter *blobstore.Writer
clientConf blobstore.ClientConfig
)
clientConf = blobstore.ClientConfig{
VolName: s.volname,
VolType: s.volType,
Ino: i.Inode,
BlockSize: s.EbsBlockSize,
Bc: s.bc,
Mw: s.mw,
Ec: s.ec,
Ebsc: s.ebsc,
EnableBcache: s.enableBcache,
WConcurrency: s.writeThreads,
ReadConcurrency: s.readThreads,
CacheAction: s.CacheAction,
FileCache: false,
FileSize: i.Size,
CacheThreshold: s.CacheThreshold,
}
log.LogDebugf("Trace NewFile:flag(%v). clientConf(%v)", flag, clientConf)
switch flag {
case syscall.O_RDONLY:
fReader = blobstore.NewReader(clientConf)
case syscall.O_WRONLY:
fWriter = blobstore.NewWriter(clientConf)
case syscall.O_RDWR:
fReader = blobstore.NewReader(clientConf)
fWriter = blobstore.NewWriter(clientConf)
default:
// nothing to do
}
log.LogDebugf("Trace NewFile:fReader(%v) fWriter(%v) ", fReader, fWriter)
return &File{super: s, info: i, fWriter: fWriter, fReader: fReader, parentIno: pino, name: filename}
}
return &File{super: s, info: i, parentIno: pino, name: filename}
}
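// Open-flag handling in NewFile, restated for clarity (descriptive only; the
// constants come from the syscall package): for cold volumes the flag decides
// which blobstore endpoints are created.
//
//	flag              fReader          fWriter
//	syscall.O_RDONLY  NewReader(conf)  nil
//	syscall.O_WRONLY  nil              NewWriter(conf)
//	syscall.O_RDWR    NewReader(conf)  NewWriter(conf)
//	other             nil              nil
//
// Hot volumes skip blobstore entirely and return a plain File.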
// getParentPath returns the path of the file's parent directory.
func (f *File) getParentPath() string {
if f.parentIno == f.super.rootIno {
return "/"
}
f.super.fslock.Lock()
node, ok := f.super.nodeCache[f.parentIno]
f.super.fslock.Unlock()
if !ok {
log.LogErrorf("Get node cache failed: ino(%v)", f.parentIno)
return "unknown"
}
parentDir, ok := node.(*Dir)
if !ok {
log.LogErrorf("Type error: Can not convert node -> *Dir, ino(%v)", f.parentIno)
return "unknown"
}
return parentDir.getCwd()
}
// Attr fills in the attributes of the file.
func (f *File) Attr(ctx context.Context, a *fuse.Attr) error {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Attr", err, bgTime, 1)
}()
ino := f.info.Inode
info, err := f.super.InodeGet(ino)
if err != nil {
log.LogErrorf("Attr: ino(%v) err(%v)", ino, err)
if err == fuse.ENOENT {
a.Inode = ino
return nil
}
return ParseError(err)
}
fillAttr(info, a)
a.ParentIno = f.parentIno
fileSize, gen := f.fileSizeVersion2(ino)
log.LogDebugf("Attr: ino(%v) fileSize(%v) gen(%v) inode.gen(%v)", ino, fileSize, gen, info.Generation)
if gen >= info.Generation {
a.Size = uint64(fileSize)
}
if proto.IsSymlink(info.Mode) {
a.Size = uint64(len(info.Target))
}
log.LogDebugf("TRACE Attr: inode(%v) attr(%v)", info, a)
return nil
}
// Forget handles the forget request. The inode is evicted from the meta service only if it is on the orphan list.
func (f *File) Forget() {
var err error
bgTime := stat.BeginStat()
ino := f.info.Inode
defer func() {
stat.EndStat("Forget", err, bgTime, 1)
log.LogDebugf("TRACE Forget: ino(%v)", ino)
}()
// TODO: why can't fWriter be closed here?
//log.LogErrorf("TRACE Forget: ino(%v)", ino)
//if f.fWriter != nil {
// f.fWriter.Close()
//}
if DisableMetaCache {
f.super.ic.Delete(ino)
f.super.fslock.Lock()
delete(f.super.nodeCache, ino)
f.super.fslock.Unlock()
if err := f.super.ec.EvictStream(ino); err != nil {
log.LogWarnf("Forget: stream not ready to evict, ino(%v) err(%v)", ino, err)
return
}
}
if !f.super.orphan.Evict(ino) {
return
}
fullPath := f.getParentPath() + f.name
if err := f.super.mw.Evict(ino, fullPath); err != nil {
log.LogWarnf("Forget Evict: ino(%v) err(%v)", ino, err)
}
}
// Open handles the open request.
func (f *File) Open(ctx context.Context, req *fuse.OpenRequest, resp *fuse.OpenResponse) (handle fs.Handle, err error) {
bgTime := stat.BeginStat()
var needBCache bool
defer func() {
stat.EndStat("Open", err, bgTime, 1)
}()
ino := f.info.Inode
log.LogDebugf("TRACE open ino(%v) info(%v)", ino, f.info)
start := time.Now()
if f.super.bcacheDir != "" && !f.filterFilesSuffix(f.super.bcacheFilterFiles) {
parentPath := f.getParentPath()
if parentPath != "" && !strings.HasSuffix(parentPath, "/") {
parentPath = parentPath + "/"
}
log.LogDebugf("TRACE open ino(%v) parentPath(%v)", ino, parentPath)
if strings.HasPrefix(parentPath, f.super.bcacheDir) {
needBCache = true
}
}
if needBCache {
f.super.ec.OpenStreamWithCache(ino, needBCache)
} else {
f.super.ec.OpenStream(ino)
}
log.LogDebugf("TRACE open ino(%v) f.super.bcacheDir(%v) needBCache(%v)", ino, f.super.bcacheDir, needBCache)
f.super.ec.RefreshExtentsCache(ino)
if f.super.keepCache && resp != nil {
resp.Flags |= fuse.OpenKeepCache
}
if proto.IsCold(f.super.volType) {
log.LogDebugf("TRANCE open ino(%v) info(%v)", ino, f.info)
fileSize, _ := f.fileSizeVersion2(ino)
clientConf := blobstore.ClientConfig{
VolName: f.super.volname,
VolType: f.super.volType,
BlockSize: f.super.EbsBlockSize,
Ino: f.info.Inode,
Bc: f.super.bc,
Mw: f.super.mw,
Ec: f.super.ec,
Ebsc: f.super.ebsc,
EnableBcache: f.super.enableBcache,
WConcurrency: f.super.writeThreads,
ReadConcurrency: f.super.readThreads,
CacheAction: f.super.CacheAction,
FileCache: false,
FileSize: uint64(fileSize),
CacheThreshold: f.super.CacheThreshold,
}
f.fWriter.FreeCache()
switch req.Flags & 0x0f {
case syscall.O_RDONLY:
f.fReader = blobstore.NewReader(clientConf)
f.fWriter = nil
case syscall.O_WRONLY:
f.fWriter = blobstore.NewWriter(clientConf)
f.fReader = nil
case syscall.O_RDWR:
f.fReader = blobstore.NewReader(clientConf)
f.fWriter = blobstore.NewWriter(clientConf)
default:
f.fWriter = blobstore.NewWriter(clientConf)
f.fReader = nil
}
log.LogDebugf("TRACE file open,ino(%v) req.Flags(%v) reader(%v) writer(%v)", ino, req.Flags, f.fReader, f.fWriter)
}
elapsed := time.Since(start)
log.LogDebugf("TRACE Open: ino(%v) req(%v) resp(%v) (%v)ns", ino, req, resp, elapsed.Nanoseconds())
return f, nil
}
// Release handles the release request.
func (f *File) Release(ctx context.Context, req *fuse.ReleaseRequest) (err error) {
ino := f.info.Inode
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Release", err, bgTime, 1)
log.LogInfof("action[Release] %v", f.fWriter)
f.fWriter.FreeCache()
if DisableMetaCache {
f.super.ic.Delete(ino)
}
}()
log.LogDebugf("TRACE Release enter: ino(%v) req(%v)", ino, req)
start := time.Now()
//log.LogErrorf("TRACE Release close stream: ino(%v) req(%v)", ino, req)
//if f.fWriter != nil {
// f.fWriter.Close()
//}
err = f.super.ec.CloseStream(ino)
if err != nil {
log.LogErrorf("Release: close writer failed, ino(%v) req(%v) err(%v)", ino, req, err)
return ParseError(err)
}
elapsed := time.Since(start)
log.LogDebugf("TRACE Release: ino(%v) req(%v) (%v)ns", ino, req, elapsed.Nanoseconds())
return nil
}
// Read handles the read request.
func (f *File) Read(ctx context.Context, req *fuse.ReadRequest, resp *fuse.ReadResponse) (err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Read", err, bgTime, 1)
stat.StatBandWidth("Read", uint32(req.Size))
}()
log.LogDebugf("TRACE Read enter: ino(%v) offset(%v) reqsize(%v) req(%v)", f.info.Inode, req.Offset, req.Size, req)
start := time.Now()
metric := exporter.NewTPCnt("fileread")
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: f.super.volname})
}()
var size int
if proto.IsHot(f.super.volType) {
size, err = f.super.ec.Read(f.info.Inode, resp.Data[fuse.OutHeaderSize:], int(req.Offset), req.Size)
} else {
size, err = f.fReader.Read(ctx, resp.Data[fuse.OutHeaderSize:], int(req.Offset), req.Size)
}
if err != nil && err != io.EOF {
msg := fmt.Sprintf("Read: ino(%v) req(%v) err(%v) size(%v)", f.info.Inode, req, err, size)
f.super.handleError("Read", msg)
errMetric := exporter.NewCounter("fileReadFailed")
errMetric.AddWithLabels(1, map[string]string{exporter.Vol: f.super.volname, exporter.Err: "EIO"})
return ParseError(err)
}
if size > req.Size {
msg := fmt.Sprintf("Read: read size larger than request size, ino(%v) req(%v) size(%v)", f.info.Inode, req, size)
f.super.handleError("Read", msg)
errMetric := exporter.NewCounter("fileReadFailed")
errMetric.AddWithLabels(1, map[string]string{exporter.Vol: f.super.volname, exporter.Err: "ERANGE"})
return fuse.ERANGE
}
if size > 0 {
resp.Data = resp.Data[:size+fuse.OutHeaderSize]
} else if size <= 0 {
resp.Data = resp.Data[:fuse.OutHeaderSize]
log.LogWarnf("Read: ino(%v) offset(%v) reqsize(%v) req(%v) size(%v)", f.info.Inode, req.Offset, req.Size, req, size)
}
elapsed := time.Since(start)
log.LogDebugf("TRACE Read: ino(%v) offset(%v) reqsize(%v) req(%v) size(%v) (%v)ns", f.info.Inode, req.Offset, req.Size, req, size, elapsed.Nanoseconds())
return nil
}
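// Buffer layout assumed by Read above (a descriptive note): resp.Data arrives
// with fuse.OutHeaderSize bytes reserved at the front for the FUSE reply
// header, so the payload is read into resp.Data[fuse.OutHeaderSize:] and the
// slice is re-cut afterwards:
//
//	n, _ := read(resp.Data[fuse.OutHeaderSize:]) // read is a stand-in for ec.Read / fReader.Read
//	resp.Data = resp.Data[:fuse.OutHeaderSize+n] // header + n payload bytes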
// Write handles the write request.
func (f *File) Write(ctx context.Context, req *fuse.WriteRequest, resp *fuse.WriteResponse) (err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Write", err, bgTime, 1)
stat.StatBandWidth("Write", uint32(len(req.Data)))
}()
ino := f.info.Inode
reqlen := len(req.Data)
log.LogDebugf("TRACE Write enter: ino(%v) offset(%v) len(%v) flags(%v) fileflags(%v) quotaIds(%v) req(%v)",
ino, req.Offset, reqlen, req.Flags, req.FileFlags, f.info.QuotaInfos, req)
if proto.IsHot(f.super.volType) {
filesize, _ := f.fileSize(ino)
if req.Offset > int64(filesize) && reqlen == 1 && req.Data[0] == 0 {
// workaround: posix_fallocate would write 1 byte if fallocate is not supported.
fullPath := path.Join(f.getParentPath(), f.name)
err = f.super.ec.Truncate(f.super.mw, f.parentIno, ino, int(req.Offset)+reqlen, fullPath)
if err == nil {
resp.Size = reqlen
}
log.LogDebugf("fallocate: ino(%v) origFilesize(%v) req(%v) err(%v)", f.info.Inode, filesize, req, err)
return
}
}
defer func() {
f.super.ic.Delete(ino)
}()
var waitForFlush bool
var flags int
if isDirectIOEnabled(req.FileFlags) || (req.FileFlags&fuse.OpenSync != 0) {
waitForFlush = true
if f.super.enSyncWrite {
flags |= proto.FlagsSyncWrite
}
if proto.IsCold(f.super.volType) {
waitForFlush = false
flags |= proto.FlagsSyncWrite
}
}
if req.FileFlags&fuse.OpenAppend != 0 || proto.IsCold(f.super.volType) {
flags |= proto.FlagsAppend
}
start := time.Now()
metric := exporter.NewTPCnt("filewrite")
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: f.super.volname})
}()
checkFunc := func() error {
if !f.super.mw.EnableQuota {
return nil
}
if ok := f.super.ec.UidIsLimited(req.Uid); ok {
return ParseError(syscall.ENOSPC)
}
var quotaIds []uint32
for quotaId := range f.info.QuotaInfos {
quotaIds = append(quotaIds, quotaId)
}
if limited := f.super.mw.IsQuotaLimited(quotaIds); limited {
return ParseError(syscall.ENOSPC)
}
return nil
}
var size int
if proto.IsHot(f.super.volType) {
f.super.ec.GetStreamer(ino).SetParentInode(f.parentIno)
if size, err = f.super.ec.Write(ino, int(req.Offset), req.Data, flags, checkFunc); err == ParseError(syscall.ENOSPC) {
return
}
} else {
atomic.StoreInt32(&f.idle, 0)
size, err = f.fWriter.Write(ctx, int(req.Offset), req.Data, flags)
}
if err != nil {
msg := fmt.Sprintf("Write: ino(%v) offset(%v) len(%v) err(%v)", ino, req.Offset, reqlen, err)
f.super.handleError("Write", msg)
errMetric := exporter.NewCounter("fileWriteFailed")
errMetric.AddWithLabels(1, map[string]string{exporter.Vol: f.super.volname, exporter.Err: "EIO"})
if err == syscall.EOPNOTSUPP {
return fuse.ENOTSUP
}
return fuse.EIO
}
resp.Size = size
if size != reqlen {
log.LogErrorf("Write: ino(%v) offset(%v) len(%v) size(%v)", ino, req.Offset, reqlen, size)
}
// only hot volumes need to wait for the flush
if waitForFlush {
err = f.super.ec.Flush(ino)
if err != nil {
msg := fmt.Sprintf("Write: failed to wait for flush, ino(%v) offset(%v) len(%v) err(%v) req(%v)", ino, req.Offset, reqlen, err, req)
f.super.handleError("Wrtie", msg)
errMetric := exporter.NewCounter("fileWriteFailed")
errMetric.AddWithLabels(1, map[string]string{exporter.Vol: f.super.volname, exporter.Err: "EIO"})
return ParseError(err)
}
}
elapsed := time.Since(start)
log.LogDebugf("TRACE Write: ino(%v) offset(%v) len(%v) flags(%v) fileflags(%v) req(%v) (%v)ns ",
ino, req.Offset, reqlen, req.Flags, req.FileFlags, req, elapsed.Nanoseconds())
return nil
}
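// Flag mapping used by Write above, summarized (descriptive only): O_DIRECT or
// O_SYNC makes hot-volume writes wait for an explicit Flush (adding
// FlagsSyncWrite when enSyncWrite is on), while cold volumes never wait but
// always carry FlagsSyncWrite; O_APPEND, and every cold-volume write, adds
// FlagsAppend. The single zero byte written past EOF is the posix_fallocate
// fallback and is turned into a Truncate instead of a data write.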
// Flush handles the flush request; it flushes only when fsyncOnClose is enabled.
func (f *File) Flush(ctx context.Context, req *fuse.FlushRequest) (err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Flush", err, bgTime, 1)
}()
if !f.super.fsyncOnClose {
return fuse.ENOSYS
}
log.LogDebugf("TRACE Flush enter: ino(%v)", f.info.Inode)
start := time.Now()
metric := exporter.NewTPCnt("filesync")
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: f.super.volname})
}()
if proto.IsHot(f.super.volType) {
err = f.super.ec.Flush(f.info.Inode)
} else {
f.Lock()
err = f.fWriter.Flush(f.info.Inode, ctx)
f.Unlock()
}
log.LogDebugf("TRACE Flush: ino(%v) err(%v)", f.info.Inode, err)
if err != nil {
msg := fmt.Sprintf("Flush: ino(%v) err(%v)", f.info.Inode, err)
f.super.handleError("Flush", msg)
log.LogErrorf("TRACE Flush err: ino(%v) err(%v)", f.info.Inode, err)
return ParseError(err)
}
if DisableMetaCache {
f.super.ic.Delete(f.info.Inode)
}
elapsed := time.Since(start)
log.LogDebugf("TRACE Flush: ino(%v) (%v)ns", f.info.Inode, elapsed.Nanoseconds())
return nil
}
// Fsync handles the fsync request.
func (f *File) Fsync(ctx context.Context, req *fuse.FsyncRequest) (err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Fsync", err, bgTime, 1)
}()
log.LogDebugf("TRACE Fsync enter: ino(%v)", f.info.Inode)
start := time.Now()
if proto.IsHot(f.super.volType) {
err = f.super.ec.Flush(f.info.Inode)
} else {
err = f.fWriter.Flush(f.info.Inode, ctx)
}
if err != nil {
msg := fmt.Sprintf("Fsync: ino(%v) err(%v)", f.info.Inode, err)
f.super.handleError("Fsync", msg)
return ParseError(err)
}
f.super.ic.Delete(f.info.Inode)
elapsed := time.Since(start)
log.LogDebugf("TRACE Fsync: ino(%v) (%v)ns", f.info.Inode, elapsed.Nanoseconds())
return nil
}
// Setattr handles the setattr request.
func (f *File) Setattr(ctx context.Context, req *fuse.SetattrRequest, resp *fuse.SetattrResponse) error {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Setattr", err, bgTime, 1)
}()
ino := f.info.Inode
start := time.Now()
if req.Valid.Size() && proto.IsHot(f.super.volType) {
// When an NFS client opens a file with the trunc flag on a CubeFS mount, the CubeFS client may receive only the setattr
// without the preceding open, so the streamer may not be opened yet and I/O would eventually fail. Open the stream here regardless of whether it is already open.
if err := f.super.ec.OpenStream(ino); err != nil {
log.LogErrorf("Setattr: OpenStream ino(%v) size(%v) err(%v)", ino, req.Size, err)
return ParseError(err)
}
defer f.super.ec.CloseStream(ino)
if err := f.super.ec.Flush(ino); err != nil {
log.LogErrorf("Setattr: truncate wait for flush ino(%v) size(%v) err(%v)", ino, req.Size, err)
return ParseError(err)
}
fullPath := path.Join(f.getParentPath(), f.name)
if err := f.super.ec.Truncate(f.super.mw, f.parentIno, ino, int(req.Size), fullPath); err != nil {
log.LogErrorf("Setattr: truncate ino(%v) size(%v) err(%v)", ino, req.Size, err)
return ParseError(err)
}
f.super.ic.Delete(ino)
f.super.ec.RefreshExtentsCache(ino)
}
info, err := f.super.InodeGet(ino)
if err != nil {
log.LogErrorf("Setattr: InodeGet failed, ino(%v) err(%v)", ino, err)
return ParseError(err)
}
if req.Valid.Size() && proto.IsHot(f.super.volType) {
if req.Size != info.Size {
log.LogWarnf("Setattr: truncate ino(%v) reqSize(%v) inodeSize(%v)", ino, req.Size, info.Size)
}
}
if valid := setattr(info, req); valid != 0 {
err = f.super.mw.Setattr(ino, valid, info.Mode, info.Uid, info.Gid, info.AccessTime.Unix(),
info.ModifyTime.Unix())
if err != nil {
f.super.ic.Delete(ino)
return ParseError(err)
}
}
fillAttr(info, &resp.Attr)
elapsed := time.Since(start)
log.LogDebugf("TRACE Setattr: ino(%v) req(%v) (%v)ns", ino, req, elapsed.Nanoseconds())
return nil
}
// Readlink handles the readlink request.
func (f *File) Readlink(ctx context.Context, req *fuse.ReadlinkRequest) (string, error) {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Readlink", err, bgTime, 1)
}()
ino := f.info.Inode
info, err := f.super.InodeGet(ino)
if err != nil {
log.LogErrorf("Readlink: ino(%v) err(%v)", ino, err)
return "", ParseError(err)
}
log.LogDebugf("TRACE Readlink: ino(%v) target(%v)", ino, string(info.Target))
return string(info.Target), nil
}
// Getxattr handles the getxattr request.
func (f *File) Getxattr(ctx context.Context, req *fuse.GetxattrRequest, resp *fuse.GetxattrResponse) error {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Getxattr", err, bgTime, 1)
}()
if !f.super.enableXattr {
return fuse.ENOSYS
}
ino := f.info.Inode
name := req.Name
size := req.Size
pos := req.Position
info, err := f.super.mw.XAttrGet_ll(ino, name)
if err != nil {
log.LogErrorf("GetXattr: ino(%v) name(%v) err(%v)", ino, name, err)
return ParseError(err)
}
value := info.Get(name)
if pos > 0 {
value = value[pos:]
}
if size > 0 && size < uint32(len(value)) {
value = value[:size]
}
resp.Xattr = value
log.LogDebugf("TRACE GetXattr: ino(%v) name(%v)", ino, name)
return nil
}
// Listxattr handles the listxattr request.
func (f *File) Listxattr(ctx context.Context, req *fuse.ListxattrRequest, resp *fuse.ListxattrResponse) error {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Listxattr", err, bgTime, 1)
}()
if !f.super.enableXattr {
return fuse.ENOSYS
}
ino := f.info.Inode
_ = req.Size // ignore currently
_ = req.Position // ignore currently
keys, err := f.super.mw.XAttrsList_ll(ino)
if err != nil {
log.LogErrorf("ListXattr: ino(%v) err(%v)", ino, err)
return ParseError(err)
}
for _, key := range keys {
resp.Append(key)
}
log.LogDebugf("TRACE Listxattr: ino(%v)", ino)
return nil
}
// Setxattr handles the setxattr request.
func (f *File) Setxattr(ctx context.Context, req *fuse.SetxattrRequest) error {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Setxattr", err, bgTime, 1)
}()
if !f.super.enableXattr {
return fuse.ENOSYS
}
ino := f.info.Inode
name := req.Name
value := req.Xattr
// TODO: implement flags to improve compatibility (Mofei Zhang)
if err = f.super.mw.XAttrSet_ll(ino, []byte(name), []byte(value)); err != nil {
log.LogErrorf("Setxattr: ino(%v) name(%v) err(%v)", ino, name, err)
return ParseError(err)
}
log.LogDebugf("TRACE Setxattr: ino(%v) name(%v)", ino, name)
return nil
}
// Removexattr handles the removexattr request.
func (f *File) Removexattr(ctx context.Context, req *fuse.RemovexattrRequest) error {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("Removexattr", err, bgTime, 1)
}()
if !f.super.enableXattr {
return fuse.ENOSYS
}
ino := f.info.Inode
name := req.Name
if err = f.super.mw.XAttrDel_ll(ino, name); err != nil {
log.LogErrorf("Removexattr: ino(%v) name(%v) err(%v)", ino, name, err)
return ParseError(err)
}
log.LogDebugf("TRACE RemoveXattr: ino(%v) name(%v)", ino, name)
return nil
}
func (f *File) fileSize(ino uint64) (size int, gen uint64) {
size, gen, valid := f.super.ec.FileSize(ino)
if !valid {
if info, err := f.super.InodeGet(ino); err == nil {
size = int(info.Size)
gen = info.Generation
}
}
log.LogDebugf("TRANCE fileSize: ino(%v) fileSize(%v) gen(%v) valid(%v)", ino, size, gen, valid)
return
}
func (f *File) fileSizeVersion2(ino uint64) (size int, gen uint64) {
size, gen, valid := f.super.ec.FileSize(ino)
if proto.IsCold(f.super.volType) {
valid = false
}
if !valid {
if info, err := f.super.InodeGet(ino); err == nil {
size = int(info.Size)
if f.fWriter != nil {
cacheSize := f.fWriter.CacheFileSize()
if cacheSize > size {
size = cacheSize
}
}
gen = info.Generation
}
}
log.LogDebugf("TRACE fileSizeVersion2: ino(%v) fileSize(%v) gen(%v) valid(%v)", ino, size, gen, valid)
return
}
// filterFilesSuffix returns true if this file should not be cached in the block cache.
func (f *File) filterFilesSuffix(filterFiles string) bool {
if f.name == "" {
log.LogWarnf("this file inode[%v], name is nil", f.info)
return true
}
if filterFiles == "" {
return false
}
suffixs := strings.Split(filterFiles, ";")
for _, suffix := range suffixs {
// each entry names one file type, e.g. "py" becomes the suffix ".py"
suffix = "." + suffix
if suffix != "." && strings.Contains(f.name, suffix) {
log.LogDebugf("fileName:%s,filter:%s,suffix:%s,suffixs:%v", f.name, filterFiles, suffix, suffixs)
return true
}
}
return false
}
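// Example of the filter format accepted above (illustrative): with
// bcacheFilterFiles set to "tmp;log" the suffix list becomes [".tmp", ".log"],
// so "build.log" and "data.tmp" are excluded from the block cache while
// "notes.txt" is not. Because the match uses strings.Contains, a name merely
// containing ".log" anywhere (e.g. "a.log.1") is filtered as well.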
//go:build gofuzz
// +build gofuzz
// Copyright 2023 ADA Logics Ltd
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package fs
import (
fuzz "github.com/AdaLogics/go-fuzz-headers"
"github.com/cubefs/cubefs/proto"
)
type NewFileParam struct {
Pino uint64
Flag uint32
FileName string
Info *proto.InodeInfo
Super *Super
}
type NewDirParam struct {
Pino uint64
FileName string
Info *proto.InodeInfo
Super *Super
}
func FuzzNewFile(data []byte) int {
f := fuzz.NewConsumer(data)
param := NewFileParam{}
err := f.GenerateStruct(&param)
if err != nil {
return 0
}
file := NewFile(param.Super, param.Info, param.Flag, param.Pino, param.FileName)
if file == nil {
return 0
}
return 1
}
func FuzzNewDir(data []byte) int {
f := fuzz.NewConsumer(data)
param := NewDirParam{}
err := f.GenerateStruct(&param)
if err != nil {
return 0
}
dir := NewDir(param.Super, param.Info, param.Pino, param.FileName)
if dir == nil {
return 0
}
return 1
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fs
import (
"container/list"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
const (
// MinInodeCacheEvictNum is used in the foreground eviction.
// When clearing the inodes from the cache, it stops as soon as 10 inodes have been evicted.
MinInodeCacheEvictNum = 10
// MaxInodeCacheEvictNum is used in the background. At most 200000 inodes can be evicted per pass.
MaxInodeCacheEvictNum = 200000
BgEvictionInterval = 2 * time.Minute
)
// InodeCache defines the structure of the inode cache.
type InodeCache struct {
sync.RWMutex
cache map[uint64]*list.Element
lruList *list.List
expiration time.Duration
maxElements int
}
// NewInodeCache returns a new inode cache.
func NewInodeCache(exp time.Duration, maxElements int) *InodeCache {
ic := &InodeCache{
cache: make(map[uint64]*list.Element),
lruList: list.New(),
expiration: exp,
maxElements: maxElements,
}
go ic.backgroundEviction()
return ic
}
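// Typical usage of the inode cache (a minimal sketch; the expiration and size
// below are arbitrary example values):
//
//	ic := NewInodeCache(30*time.Second, 100000)
//	ic.Put(info) // info is a *proto.InodeInfo
//	if cached := ic.Get(info.Inode); cached != nil {
//		// served from cache until it expires or is evicted
//	}
//	ic.Delete(info.Inode) // drop after a metadata-changing operation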
// Put puts the given inode info into the inode cache.
func (ic *InodeCache) Put(info *proto.InodeInfo) {
ic.Lock()
old, ok := ic.cache[info.Inode]
if ok {
ic.lruList.Remove(old)
delete(ic.cache, info.Inode)
}
if ic.lruList.Len() >= ic.maxElements {
ic.evict(true)
}
inodeSetExpiration(info, ic.expiration)
element := ic.lruList.PushFront(info)
ic.cache[info.Inode] = element
ic.Unlock()
// log.LogDebugf("InodeCache put inode: inode(%v)", info.Inode)
}
// Get returns the inode info based on the given inode number.
func (ic *InodeCache) Get(ino uint64) *proto.InodeInfo {
ic.RLock()
element, ok := ic.cache[ino]
if !ok {
ic.RUnlock()
return nil
}
info := element.Value.(*proto.InodeInfo)
if inodeExpired(info) && DisableMetaCache {
ic.RUnlock()
// log.LogDebugf("InodeCache GetConnect expired: now(%v) inode(%v), expired(%d)", time.Now().Format(LogTimeFormat), info.Inode, info.Expiration())
return nil
}
ic.RUnlock()
return info
}
// Delete deletes the inode info based on the given inode number.
func (ic *InodeCache) Delete(ino uint64) {
// log.LogDebugf("InodeCache Delete: ino(%v)", ino)
ic.Lock()
element, ok := ic.cache[ino]
if ok {
ic.lruList.Remove(element)
delete(ic.cache, ino)
}
ic.Unlock()
}
// Foreground eviction cares more about the speed.
// Background eviction evicts all expired items from the cache.
// The caller should grab the WRITE lock of the inode cache.
func (ic *InodeCache) evict(foreground bool) {
var count int
for i := 0; i < MinInodeCacheEvictNum; i++ {
element := ic.lruList.Back()
if element == nil {
return
}
// For background eviction, return as soon as all expired items have been evicted.
// Foreground eviction must evict at least MinInodeCacheEvictNum inodes,
// regardless of whether they have expired.
info := element.Value.(*proto.InodeInfo)
if !foreground && !inodeExpired(info) {
return
}
// log.LogDebugf("InodeCache GetConnect expired: now(%v) inode(%v)", time.Now().Format(LogTimeFormat), info.Inode)
ic.lruList.Remove(element)
delete(ic.cache, info.Inode)
count++
}
// For background eviction, we need to continue evict all expired items from the cache
if foreground {
return
}
for i := 0; i < MaxInodeCacheEvictNum; i++ {
element := ic.lruList.Back()
if element == nil {
break
}
info := element.Value.(*proto.InodeInfo)
if !inodeExpired(info) {
break
}
// log.LogDebugf("InodeCache GetConnect expired: now(%v) inode(%v)", time.Now().Format(LogTimeFormat), info.Inode)
ic.lruList.Remove(element)
delete(ic.cache, info.Inode)
count++
}
}
func (ic *InodeCache) backgroundEviction() {
t := time.NewTicker(BgEvictionInterval)
defer t.Stop()
for range t.C {
log.LogInfof("InodeCache: start BG evict")
if !DisableMetaCache {
log.LogInfof("InodeCache: no need to do BG evict")
continue
}
start := time.Now()
ic.Lock()
ic.evict(false)
ic.Unlock()
elapsed := time.Since(start)
log.LogInfof("InodeCache: total inode cache(%d), cost(%d)ns", ic.lruList.Len(), elapsed.Nanoseconds())
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fs
import (
"time"
"github.com/cubefs/cubefs/depends/bazil.org/fuse"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
const (
LogTimeFormat = "20060102150405000"
)
func (s *Super) InodeGet(ino uint64) (*proto.InodeInfo, error) {
info := s.ic.Get(ino)
if info != nil {
return info, nil
}
info, err := s.mw.InodeGet_ll(ino)
if err != nil || info == nil {
log.LogErrorf("InodeGet: ino(%v) err(%v) info(%v)", ino, err, info)
if err != nil {
return nil, ParseError(err)
} else {
return nil, fuse.ENOENT
}
}
s.ic.Put(info)
s.fslock.Lock()
node, isFind := s.nodeCache[ino]
s.fslock.Unlock()
if isFind {
dir, ok := node.(*Dir)
if ok {
dir.info = info
} else {
node.(*File).info = info
}
}
s.ec.RefreshExtentsCache(ino)
return info, nil
}
func setattr(info *proto.InodeInfo, req *fuse.SetattrRequest) (valid uint32) {
if req.Valid.Mode() {
info.Mode = proto.Mode(req.Mode)
valid |= proto.AttrMode
}
if req.Valid.Uid() {
info.Uid = req.Uid
valid |= proto.AttrUid
}
if req.Valid.Gid() {
info.Gid = req.Gid
valid |= proto.AttrGid
}
if req.Valid.Atime() {
info.AccessTime = req.Atime
valid |= proto.AttrAccessTime
}
if req.Valid.Mtime() {
info.ModifyTime = req.Mtime
valid |= proto.AttrModifyTime
}
return
}
func fillAttr(info *proto.InodeInfo, attr *fuse.Attr) {
attr.Valid = AttrValidDuration
attr.Nlink = info.Nlink
attr.Inode = info.Inode
attr.Mode = proto.OsMode(info.Mode)
attr.Size = info.Size
attr.Blocks = attr.Size >> 9 // In 512 bytes
attr.Atime = info.AccessTime
attr.Ctime = info.CreateTime
attr.Mtime = info.ModifyTime
attr.BlockSize = DefaultBlksize
attr.Uid = info.Uid
attr.Gid = info.Gid
}
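// Worked example for the Blocks computation in fillAttr above: attr.Blocks
// counts 512-byte units, so a 4096-byte file reports Blocks = 4096 >> 9 = 8,
// while attr.Size stays in bytes and attr.BlockSize advertises the preferred
// I/O size (DefaultBlksize).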
func inodeExpired(info *proto.InodeInfo) bool {
return time.Now().UnixNano() > info.Expiration()
}
func inodeSetExpiration(info *proto.InodeInfo, t time.Duration) {
info.SetExpiration(time.Now().Add(t).UnixNano())
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fs
import (
"container/list"
"sync"
)
// OrphanInodeList defines the orphan inode list, which is a list of orphan inodes.
// An orphan inode is the inode whose nlink value is 0.
type OrphanInodeList struct {
sync.RWMutex
cache map[uint64]*list.Element
list *list.List
}
// NewOrphanInodeList returns a new orphan inode list.
func NewOrphanInodeList() *OrphanInodeList {
return &OrphanInodeList{
cache: make(map[uint64]*list.Element),
list: list.New(),
}
}
// Put puts an inode into the orphan inode list.
func (l *OrphanInodeList) Put(ino uint64) {
l.Lock()
defer l.Unlock()
_, ok := l.cache[ino]
if !ok {
element := l.list.PushFront(ino)
l.cache[ino] = element
}
}
// Evict removes the given inode from the orphan inode list and reports whether it was present.
func (l *OrphanInodeList) Evict(ino uint64) bool {
l.Lock()
defer l.Unlock()
element, ok := l.cache[ino]
if !ok {
return false
}
l.list.Remove(element)
delete(l.cache, ino)
return true
}
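// How the orphan list fits together (a descriptive note based on this file and
// File.Forget above): an inode is Put here once its nlink drops to zero while
// the kernel still references it, and Forget later calls Evict; only when
// Evict returns true does the client ask the meta service to evict the inode,
// so the final eviction happens at most once.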
package fs
import (
"container/list"
"sync"
"time"
"github.com/cubefs/cubefs/sdk/meta"
)
const (
MinSummaryCacheEvictNum = 10
MaxSummaryCacheEvictNum = 200000
SummaryBgEvictionInterval = 2 * time.Minute
DefaultSummaryExpiration = 2 * time.Minute
MaxSummaryCache = 1000000
)
// SummaryCache defines the structure of the content-summary cache.
type SummaryCache struct {
sync.RWMutex
cache map[uint64]*list.Element
lruList *list.List
expiration time.Duration
maxElements int
}
// summaryCacheElement defines the structure of the content-summary cache's element.
type summaryCacheElement struct {
ino uint64
info *meta.SummaryInfo
expiration int64
}
// NewSummaryCache returns a new content-summary cache.
func NewSummaryCache(exp time.Duration, maxElement int) *SummaryCache {
sc := &SummaryCache{
cache: make(map[uint64]*list.Element),
lruList: list.New(),
expiration: exp,
maxElements: maxElement,
}
go sc.backgroundEviction()
return sc
}
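// Typical usage of the summary cache (a minimal sketch; the SummaryInfo fields
// shown are the ones read elsewhere in this package):
//
//	sc := NewSummaryCache(DefaultSummaryExpiration, MaxSummaryCache)
//	sc.Put(ino, &meta.SummaryInfo{Files: 3, Subdirs: 1, Fbytes: 4096})
//	if info := sc.Get(ino); info != nil {
//		// valid for up to DefaultSummaryExpiration
//	}
//	sc.Delete(ino)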
// Put puts the given summary info into the content-summary cache.
func (sc *SummaryCache) Put(inode uint64, summaryInfo *meta.SummaryInfo) {
sc.Lock()
old, ok := sc.cache[inode]
if ok {
sc.lruList.Remove(old)
delete(sc.cache, inode)
}
if sc.lruList.Len() >= sc.maxElements {
sc.evict(true)
}
element := sc.lruList.PushFront(&summaryCacheElement{
ino: inode,
info: summaryInfo,
expiration: time.Now().Add(sc.expiration).UnixNano(),
})
sc.cache[inode] = element
sc.Unlock()
}
// Get returns the content-summary info based on the given inode number.
func (sc *SummaryCache) Get(inode uint64) *meta.SummaryInfo {
sc.RLock()
element, ok := sc.cache[inode]
if !ok {
sc.RUnlock()
return nil
}
info := element.Value.(*summaryCacheElement)
if cacheExpired(info) {
sc.RUnlock()
return nil
}
sc.RUnlock()
return info.info
}
// Delete deletes the content-summary info based on the given inode number.
func (sc *SummaryCache) Delete(inode uint64) {
sc.Lock()
element, ok := sc.cache[inode]
if ok {
sc.lruList.Remove(element)
delete(sc.cache, inode)
}
sc.Unlock()
}
func (sc *SummaryCache) evict(foreground bool) {
for i := 0; i < MinSummaryCacheEvictNum; i++ {
element := sc.lruList.Back()
if element == nil {
return
}
info := element.Value.(*summaryCacheElement)
if !foreground && !cacheExpired(info) {
return
}
sc.lruList.Remove(element)
delete(sc.cache, info.ino)
}
if foreground {
return
}
for i := 0; i < MaxSummaryCacheEvictNum; i++ {
element := sc.lruList.Back()
if element == nil {
break
}
info := element.Value.(*summaryCacheElement)
if !cacheExpired(info) {
break
}
sc.lruList.Remove(element)
delete(sc.cache, info.ino)
}
}
func (sc *SummaryCache) backgroundEviction() {
t := time.NewTicker(SummaryBgEvictionInterval)
defer t.Stop()
for range t.C {
sc.Lock()
sc.evict(false)
sc.Unlock()
}
}
func cacheExpired(info *summaryCacheElement) bool {
return time.Now().UnixNano() > info.expiration
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fs
import (
"context"
"fmt"
"net/http"
"os"
"path"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/blobstore/api/access"
"github.com/cubefs/cubefs/blockcache/bcache"
"github.com/cubefs/cubefs/client/common"
"github.com/cubefs/cubefs/depends/bazil.org/fuse"
"github.com/cubefs/cubefs/depends/bazil.org/fuse/fs"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/data/blobstore"
"github.com/cubefs/cubefs/sdk/data/stream"
"github.com/cubefs/cubefs/sdk/meta"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/auditlog"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/ump"
)
// Super defines the struct of a super block.
type Super struct {
cluster string
volname string
masters string
mountPoint string
subDir string
owner string
ic *InodeCache
dc *Dcache
mw *meta.MetaWrapper
ec *stream.ExtentClient
orphan *OrphanInodeList
enSyncWrite bool
keepCache bool
nodeCache map[uint64]fs.Node
fslock sync.Mutex
disableDcache bool
fsyncOnClose bool
enableXattr bool
rootIno uint64
state fs.FSStatType
sockaddr string
suspendCh chan interface{}
// data lake
volType int
ebsEndpoint string
CacheAction int
CacheThreshold int
EbsBlockSize int
enableBcache bool
bcacheDir string
bcacheFilterFiles string
bcacheCheckInterval int64
bcacheBatchCnt int64
readThreads int
writeThreads int
bc *bcache.BcacheClient
ebsc *blobstore.BlobStoreClient
sc *SummaryCache
taskPool []common.TaskPool
closeC chan struct{}
enableVerRead bool
}
// Interfaces that Super must implement
var (
_ fs.FS = (*Super)(nil)
_ fs.FSStatfser = (*Super)(nil)
)
const (
BlobWriterIdleTimeoutPeriod = 10
DefaultTaskPoolSize = 30
)
// NewSuper returns a new Super.
func NewSuper(opt *proto.MountOptions) (s *Super, err error) {
s = new(Super)
masters := strings.Split(opt.Master, meta.HostsSeparator)
metaConfig := &meta.MetaConfig{
Volume: opt.Volname,
Owner: opt.Owner,
Masters: masters,
Authenticate: opt.Authenticate,
TicketMess: opt.TicketMess,
ValidateOwner: opt.Authenticate || opt.AccessKey == "",
EnableSummary: opt.EnableSummary && opt.EnableXattr,
MetaSendTimeout: opt.MetaSendTimeout,
}
s.mw, err = meta.NewMetaWrapper(metaConfig)
if err != nil {
return nil, errors.Trace(err, "NewMetaWrapper failed!"+err.Error())
}
s.SetTransaction(opt.EnableTransaction, opt.TxTimeout, opt.TxConflictRetryNum, opt.TxConflictRetryInterval)
s.mw.EnableQuota = opt.EnableQuota
s.volname = opt.Volname
s.masters = opt.Master
s.mountPoint = opt.MountPoint
s.subDir = opt.SubDir
s.owner = opt.Owner
s.cluster = s.mw.Cluster()
inodeExpiration := DefaultInodeExpiration
if opt.IcacheTimeout >= 0 {
inodeExpiration = time.Duration(opt.IcacheTimeout) * time.Second
}
if opt.LookupValid >= 0 {
LookupValidDuration = time.Duration(opt.LookupValid) * time.Second
}
if opt.AttrValid >= 0 {
AttrValidDuration = time.Duration(opt.AttrValid) * time.Second
}
if opt.EnSyncWrite > 0 {
s.enSyncWrite = true
}
s.keepCache = opt.KeepCache
if opt.MaxStreamerLimit > 0 {
s.ic = NewInodeCache(inodeExpiration, MaxInodeCache)
s.dc = NewDcache(inodeExpiration, MaxInodeCache)
} else {
s.ic = NewInodeCache(inodeExpiration, DefaultMaxInodeCache)
s.dc = NewDcache(inodeExpiration, DefaultMaxInodeCache)
}
s.orphan = NewOrphanInodeList()
s.nodeCache = make(map[uint64]fs.Node)
s.disableDcache = opt.DisableDcache
s.fsyncOnClose = opt.FsyncOnClose
s.enableXattr = opt.EnableXattr
s.bcacheCheckInterval = opt.BcacheCheckIntervalS
s.bcacheFilterFiles = opt.BcacheFilterFiles
s.bcacheBatchCnt = opt.BcacheBatchCnt
s.closeC = make(chan struct{}, 1)
s.taskPool = []common.TaskPool{common.New(DefaultTaskPoolSize, DefaultTaskPoolSize), common.New(DefaultTaskPoolSize, DefaultTaskPoolSize)}
if s.mw.EnableSummary {
s.sc = NewSummaryCache(DefaultSummaryExpiration, MaxSummaryCache)
}
if opt.MaxStreamerLimit > 0 {
DisableMetaCache = false
s.fsyncOnClose = false
}
if !strings.HasSuffix(opt.MountPoint, "/") {
opt.MountPoint = opt.MountPoint + "/"
}
if !strings.HasSuffix(opt.SubDir, "/") {
opt.SubDir = opt.SubDir + "/"
}
if opt.BcacheDir != "" && !strings.HasSuffix(opt.BcacheDir, "/") {
opt.BcacheDir = opt.BcacheDir + "/"
}
// when the block cache is enabled, default the bcache dir to the mount point
if opt.EnableBcache && opt.BcacheDir == "" {
s.bcacheDir = opt.MountPoint
}
if s.bcacheDir == opt.MountPoint {
s.bcacheDir = "/"
} else {
s.bcacheDir = strings.ReplaceAll(opt.BcacheDir, opt.MountPoint, "/")
if s.bcacheDir != "" && !strings.HasSuffix(s.bcacheDir, "/") {
s.bcacheDir = s.bcacheDir + "/"
}
}
s.volType = opt.VolType
s.ebsEndpoint = opt.EbsEndpoint
s.CacheAction = opt.CacheAction
s.CacheThreshold = opt.CacheThreshold
s.EbsBlockSize = opt.EbsBlockSize
s.enableBcache = opt.EnableBcache
s.readThreads = int(opt.ReadThreads)
s.writeThreads = int(opt.WriteThreads)
if s.enableBcache {
s.bc = bcache.NewBcacheClient()
}
extentConfig := &stream.ExtentConfig{
Volume: opt.Volname,
Masters: masters,
FollowerRead: opt.FollowerRead,
NearRead: opt.NearRead,
ReadRate: opt.ReadRate,
WriteRate: opt.WriteRate,
VolumeType: opt.VolType,
BcacheEnable: opt.EnableBcache,
BcacheDir: opt.BcacheDir,
MaxStreamerLimit: opt.MaxStreamerLimit,
VerReadSeq: opt.VerReadSeq,
OnAppendExtentKey: s.mw.AppendExtentKey,
OnSplitExtentKey: s.mw.SplitExtentKey,
OnGetExtents: s.mw.GetExtents,
OnTruncate: s.mw.Truncate,
OnEvictIcache: s.ic.Delete,
OnLoadBcache: s.bc.Get,
OnCacheBcache: s.bc.Put,
OnEvictBcache: s.bc.Evict,
DisableMetaCache: DisableMetaCache,
MinWriteAbleDataPartitionCnt: opt.MinWriteAbleDataPartitionCnt,
}
s.ec, err = stream.NewExtentClient(extentConfig)
if err != nil {
return nil, errors.Trace(err, "NewExtentClient failed!")
}
s.mw.VerReadSeq = s.ec.GetReadVer()
if proto.IsCold(opt.VolType) {
s.ebsc, err = blobstore.NewEbsClient(access.Config{
ConnMode: access.NoLimitConnMode,
Consul: access.ConsulConfig{
Address: opt.EbsEndpoint,
},
MaxSizePutOnce: MaxSizePutOnce,
Logger: &access.Logger{
Filename: path.Join(opt.Logpath, "client/ebs.log"),
},
})
if err != nil {
return nil, errors.Trace(err, "NewEbsClient failed!")
}
}
s.mw.Client = s.ec
if !opt.EnablePosixACL {
opt.EnablePosixACL = s.ec.GetEnablePosixAcl()
}
if s.rootIno, err = s.mw.GetRootIno(opt.SubDir); err != nil {
return nil, err
}
s.suspendCh = make(chan interface{})
if proto.IsCold(opt.VolType) {
go s.scheduleFlush()
}
if s.mw.EnableSummary {
s.sc = NewSummaryCache(DefaultSummaryExpiration, MaxSummaryCache)
}
if opt.NeedRestoreFuse {
atomic.StoreUint32((*uint32)(&s.state), uint32(fs.FSStatRestore))
}
log.LogInfof("NewSuper: cluster(%v) volname(%v) icacheExpiration(%v) LookupValidDuration(%v) AttrValidDuration(%v) state(%v)",
s.cluster, s.volname, inodeExpiration, LookupValidDuration, AttrValidDuration, s.state)
go s.loopSyncMeta()
return s, nil
}
func (s *Super) scheduleFlush() {
t := time.NewTicker(2 * time.Second)
defer t.Stop()
for {
select {
case <-t.C:
ctx := context.Background()
s.fslock.Lock()
for ino, node := range s.nodeCache {
if _, ok := node.(*File); !ok {
continue
}
file := node.(*File)
if atomic.LoadInt32(&file.idle) >= BlobWriterIdleTimeoutPeriod {
if file.fWriter != nil {
atomic.StoreInt32(&file.idle, 0)
go file.fWriter.Flush(ino, ctx)
}
} else {
atomic.AddInt32(&file.idle, 1)
}
}
s.fslock.Unlock()
}
}
}
// Root returns the root directory node of the mounted volume.
func (s *Super) Root() (fs.Node, error) {
inode, err := s.InodeGet(s.rootIno)
if err != nil {
return nil, err
}
root := NewDir(s, inode, inode.Inode, "")
return root, nil
}
func (s *Super) Node(ino, pino uint64, mode uint32) (fs.Node, error) {
var node fs.Node
// Create a fake InodeInfo. All File or Dir operations only use
// InodeInfo.Inode.
fakeInfo := &proto.InodeInfo{Inode: ino, Mode: mode}
if proto.OsMode(fakeInfo.Mode).IsDir() {
node = NewDir(s, fakeInfo, pino, "")
} else {
node = NewFile(s, fakeInfo, DefaultFlag, pino, "")
// The node is saved in the FuseContextNodes list, which means
// it will not be evicted. So we create a streamer for it,
// leaving the streamer's refcnt at 0.
file := node.(*File)
file.Open(nil, nil, nil)
file.Release(nil, nil)
}
s.fslock.Lock()
s.nodeCache[ino] = node
s.fslock.Unlock()
return node, nil
}
// Statfs handles the Statfs request and returns a set of statistics.
func (s *Super) Statfs(ctx context.Context, req *fuse.StatfsRequest, resp *fuse.StatfsResponse) error {
const defaultMaxMetaPartitionInodeID uint64 = 1<<63 - 1
total, used, inodeCount := s.mw.Statfs()
resp.Blocks = total / uint64(DefaultBlksize)
resp.Bfree = (total - used) / uint64(DefaultBlksize)
resp.Bavail = resp.Bfree
resp.Bsize = DefaultBlksize
resp.Namelen = DefaultMaxNameLen
resp.Frsize = DefaultBlksize
resp.Files = inodeCount
resp.Ffree = defaultMaxMetaPartitionInodeID - inodeCount
return nil
}
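// Worked example for the Statfs arithmetic above (illustrative; it assumes
// DefaultBlksize is 4096): with total = 1 TiB and used = 256 GiB the reply
// reports Blocks = 2^40/4096 = 268435456, Bfree = Bavail = 768 GiB/4096 =
// 201326592, and Ffree = (2^63 - 1) - inodeCount.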
// ClusterName returns the cluster name.
func (s *Super) ClusterName() string {
return s.cluster
}
func (s *Super) GetRate(w http.ResponseWriter, r *http.Request) {
w.Write([]byte(s.ec.GetRate()))
}
func (s *Super) SetRate(w http.ResponseWriter, r *http.Request) {
if err := r.ParseForm(); err != nil {
w.Write([]byte(err.Error()))
return
}
if rate := r.FormValue("read"); rate != "" {
val, err := strconv.Atoi(rate)
if err != nil {
w.Write([]byte("Set read rate failed\n"))
} else {
msg := s.ec.SetReadRate(val)
w.Write([]byte(fmt.Sprintf("Set read rate to %v successfully\n", msg)))
}
}
if rate := r.FormValue("write"); rate != "" {
val, err := strconv.Atoi(rate)
if err != nil {
w.Write([]byte("Set write rate failed\n"))
} else {
msg := s.ec.SetWriteRate(val)
w.Write([]byte(fmt.Sprintf("Set write rate to %v successfully\n", msg)))
}
}
}
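// Example request against SetRate (illustrative; the URL is hypothetical and
// depends on where this handler is registered):
//
//	resp, err := http.PostForm("http://127.0.0.1:17410/rate",
//		url.Values{"read": {"100"}, "write": {"50"}})
//
// The handler reads the "read" and "write" form values and applies them via
// ec.SetReadRate / ec.SetWriteRate.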
func (s *Super) exporterKey(act string) string {
return fmt.Sprintf("%v_fuseclient_%v", s.cluster, act)
}
func (s *Super) umpKey(act string) string {
return fmt.Sprintf("%v_fuseclient_%v", s.cluster, act)
}
func (s *Super) handleError(op, msg string) {
log.LogError(msg)
ump.Alarm(s.umpKey(op), msg)
}
func replyFail(w http.ResponseWriter, r *http.Request, msg string) {
w.WriteHeader(http.StatusBadRequest)
w.Write([]byte(msg))
}
func replySucc(w http.ResponseWriter, r *http.Request, msg string) {
w.WriteHeader(http.StatusOK)
w.Write([]byte(msg))
}
func (s *Super) SetSockAddr(addr string) {
s.sockaddr = addr
}
func (s *Super) SetSuspend(w http.ResponseWriter, r *http.Request) {
var (
err error
ret string
)
if err = r.ParseForm(); err != nil {
replyFail(w, r, err.Error())
return
}
sockaddr := r.FormValue("sock")
if sockaddr == "" {
err = fmt.Errorf("NeedAfterAlloc parameter 'sock' for IPC")
replyFail(w, r, err.Error())
return
}
s.fslock.Lock()
if s.sockaddr != "" ||
!atomic.CompareAndSwapUint32((*uint32)(&s.state), uint32(fs.FSStatResume), uint32(fs.FSStatSuspend)) {
s.fslock.Unlock()
err = fmt.Errorf("Already in suspend: sock '%s', state %v", s.sockaddr, s.state)
replyFail(w, r, err.Error())
return
}
s.sockaddr = sockaddr
s.fslock.Unlock()
// wait
msg := <-s.suspendCh
switch msg.(type) {
case error:
err = msg.(error)
case string:
ret = msg.(string)
default:
err = fmt.Errorf("Unknown return type: %v", msg)
}
if err != nil {
s.fslock.Lock()
atomic.StoreUint32((*uint32)(&s.state), uint32(fs.FSStatResume))
s.sockaddr = ""
s.fslock.Unlock()
replyFail(w, r, err.Error())
return
}
if !atomic.CompareAndSwapUint32((*uint32)(&s.state), uint32(fs.FSStatSuspend), uint32(fs.FSStatShutdown)) {
s.fslock.Lock()
atomic.StoreUint32((*uint32)(&s.state), uint32(fs.FSStatResume))
s.sockaddr = ""
s.fslock.Unlock()
err = fmt.Errorf("Invalid old state %v", s.state)
replyFail(w, r, err.Error())
return
}
replySucc(w, r, fmt.Sprintf("set suspend successfully: %s", ret))
}
func (s *Super) SetResume(w http.ResponseWriter, r *http.Request) {
s.fslock.Lock()
atomic.StoreUint32((*uint32)(&s.state), uint32(fs.FSStatResume))
s.sockaddr = ""
s.fslock.Unlock()
replySucc(w, r, "set resume successfully")
}
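// Illustrative sequence for the suspend/resume handlers above. The /suspend and
// /resume routes and the port are assumptions for this example; the form key
// "sock" comes from the code. A controller first suspends the client, handing
// over a unix socket for IPC, and resumes it if the takeover fails:
//
//	curl "http://127.0.0.1:<profPort>/suspend?sock=/var/run/client.sock"
//	curl "http://127.0.0.1:<profPort>/resume"
//
// SetSuspend blocks on s.suspendCh until Notify delivers a result, so the first
// request returns only after the state has moved to FSStatShutdown (success) or
// back to FSStatResume (failure).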
func (s *Super) EnableAuditLog(w http.ResponseWriter, r *http.Request) {
var err error
if err = r.ParseForm(); err != nil {
auditlog.BuildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
logPath := r.FormValue("path")
if logPath == "" {
err = fmt.Errorf("path cannot be empty")
auditlog.BuildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
prefix := r.FormValue("prefix")
if prefix == "" {
err = fmt.Errorf("prefix cannot be empty")
auditlog.BuildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
logSize := 0
if logSizeStr := r.FormValue("logsize"); logSizeStr != "" {
val, err := strconv.Atoi(logSizeStr)
if err != nil {
err = fmt.Errorf("logSize error")
auditlog.BuildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
logSize = val
} else {
logSize = auditlog.DefaultAuditLogSize
}
dir, logModule, logMaxSize, err := auditlog.GetAuditLogInfo()
if err != nil {
_, err = auditlog.InitAuditWithPrefix(logPath, prefix, int64(auditlog.DefaultAuditLogSize),
auditlog.NewAuditPrefix(s.masters, s.volname, s.subDir, s.mountPoint))
if err != nil {
err = errors.NewErrorf("Init audit log fail: %v\n", err)
auditlog.BuildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
info := fmt.Sprintf("audit log is initialized with params: logDir(%v) logModule(%v) logMaxSize(%v)",
logPath, prefix, logSize)
auditlog.BuildSuccessResp(w, info)
} else {
info := fmt.Sprintf("audit log is already initialized with params: logDir(%v) logModule(%v) logMaxSize(%v)",
dir, logModule, logMaxSize)
auditlog.BuildSuccessResp(w, info)
}
}
func (s *Super) State() (state fs.FSStatType, sockaddr string) {
return fs.FSStatType(atomic.LoadUint32((*uint32)(&s.state))), s.sockaddr
}
func (s *Super) Notify(stat fs.FSStatType, msg interface{}) {
if stat == fs.FSStatSuspend {
s.suspendCh <- msg
} else if stat == fs.FSStatRestore {
s.fslock.Lock()
atomic.StoreUint32((*uint32)(&s.state), uint32(fs.FSStatResume))
s.sockaddr = ""
s.fslock.Unlock()
}
}
func (s *Super) loopSyncMeta() {
if s.bcacheDir == "" {
return
}
for {
finishC := s.syncMeta()
select {
case <-finishC:
time.Sleep(time.Second * time.Duration(s.bcacheCheckInterval))
case <-s.closeC:
return
}
}
}
func (s *Super) syncMeta() <-chan struct{} {
finishC := make(chan struct{})
start := time.Now()
cacheLen := s.ic.lruList.Len()
allInodes := func() <-chan uint64 {
out := make(chan uint64)
go func() {
for i := s.ic.lruList.Front(); i != nil; i = i.Next() {
oldInfo := i.Value.(*proto.InodeInfo)
out <- oldInfo.Inode
}
close(out)
}()
return out
}()
getChanged := func(in <-chan uint64, batchCnt int64) <-chan uint64 {
out := make(chan uint64)
changed := make([]uint64, 0)
s.taskPool[0].Run(func() {
tmpInodes := make([]uint64, 0, batchCnt)
for i := range in {
tmpInodes = append(tmpInodes, i)
if len(tmpInodes) == int(batchCnt) {
changed = append(changed, s.getModifyInodes(tmpInodes)...)
tmpInodes = tmpInodes[:0]
}
}
if len(tmpInodes) != 0 {
changed = append(changed, s.getModifyInodes(tmpInodes)...)
}
for i := range changed {
out <- changed[i]
}
close(out)
})
return out
}
batCh := make([]<-chan uint64, DefaultTaskPoolSize/3)
for i := range batCh {
batCh[i] = getChanged(allInodes, s.bcacheBatchCnt)
}
mergeChanged := func(cs []<-chan uint64) <-chan uint64 {
var wg sync.WaitGroup
out := make(chan uint64)
wg.Add(len(cs))
for _, c := range cs {
go func(c <-chan uint64) {
for n := range c {
out <- n
}
wg.Done()
}(c)
}
go func() {
wg.Wait()
close(out)
}()
return out
}
var changeCnt int
for ino := range mergeChanged(batCh) {
inode := ino
changeCnt++
log.LogDebugf("sync meta,inode:%d changed", inode)
s.ic.Delete(inode)
s.taskPool[1].Run(func() {
common.Timed(3, 100).On(func() error {
extents := s.ec.GetExtents(inode)
if err := s.ec.ForceRefreshExtentsCache(inode); err != nil {
if err != os.ErrNotExist {
log.LogErrorf("ForceRefreshExtentsCache failed:%v", err)
}
}
log.LogDebugf("inode:%d,extents is :%v", inode, extents)
for _, extent := range extents {
cacheKey := util.GenerateRepVolKey(s.volname, inode, extent.PartitionId, extent.ExtentId, extent.FileOffset)
// retry so that the eviction is more likely to succeed
if s.bc != nil {
common.Timed(3, 100).On(func() error {
return s.bc.Evict(cacheKey)
})
}
}
return nil
})
})
}
log.LogDebugf("total cache cnt:%d,changedCnt:%d,sync meta cost:%v", cacheLen, changeCnt, time.Since(start))
close(finishC)
return finishC
}
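// Minimal sketch of the fan-out/fan-in pattern used by syncMeta above: a single
// producer feeds inode IDs into a channel, several workers consume them and emit
// the "changed" ones, and a merge goroutine closes the output once every worker
// is done. The helper below is illustrative only and not part of this package.
//
//	func mergeUint64(cs ...<-chan uint64) <-chan uint64 {
//		out := make(chan uint64)
//		var wg sync.WaitGroup
//		wg.Add(len(cs))
//		for _, c := range cs {
//			go func(c <-chan uint64) {
//				defer wg.Done()
//				for v := range c {
//					out <- v
//				}
//			}(c)
//		}
//		go func() { wg.Wait(); close(out) }()
//		return out
//	}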
func (s *Super) getModifyInodes(inodes []uint64) (changedNodes []uint64) {
inodeInfos := s.mw.BatchInodeGet(inodes)
// get deleted files
if len(inodeInfos) != len(inodes) {
changedNodes = append(changedNodes, getDelInodes(inodes, inodeInfos)...)
log.LogDebugf("len inodes is %d, len get inode infos is :%d, del inodes is:%v", len(inodes), len(inodeInfos), changedNodes)
}
for _, newInfo := range inodeInfos {
oldInfo := s.ic.Get(newInfo.Inode)
if oldInfo == nil {
continue
}
if !oldInfo.ModifyTime.Equal(newInfo.ModifyTime) || newInfo.Generation != s.ec.GetExtentCacheGen(newInfo.Inode) {
log.LogDebugf("oldInfo:ino(%d) modifyTime(%v) gen(%d),newInfo:ino(%d) modifyTime(%d) gen(%d)", oldInfo.Inode, oldInfo.ModifyTime.Unix(), s.ec.GetExtentCacheGen(newInfo.Inode), newInfo.Inode, newInfo.ModifyTime.Unix(), newInfo.Generation)
changedNodes = append(changedNodes, newInfo.Inode)
} else {
log.LogDebugf("oldInfo:ino(%d) modifyTime(%v) gen(%d),newInfo:ino(%d) modifyTime(%d) gen(%d)", oldInfo.Inode, oldInfo.ModifyTime.Unix(), s.ec.GetExtentCacheGen(newInfo.Inode), newInfo.Inode, newInfo.ModifyTime.Unix(), newInfo.Generation)
}
}
return
}
func getDelInodes(src []uint64, act []*proto.InodeInfo) []uint64 {
delInodes := make([]uint64, 0)
m := make(map[uint64]struct{})
for _, iInfo := range act {
m[iInfo.Inode] = struct{}{}
}
for _, inode := range src {
if _, ok := m[inode]; !ok {
delInodes = append(delInodes, inode)
}
}
return delInodes
}
func (s *Super) Close() {
close(s.closeC)
}
func (s *Super) SetTransaction(txMaskStr string, timeout int64, retryNum int64, retryInterval int64) {
// maskStr := proto.GetMaskString(txMask)
mask, err := proto.GetMaskFromString(txMaskStr)
if err != nil {
log.LogErrorf("SetTransaction: err[%v], op[%v], timeout[%v]", err, txMaskStr, timeout)
return
}
s.mw.EnableTransaction = mask
if timeout <= 0 {
timeout = proto.DefaultTransactionTimeout
}
s.mw.TxTimeout = timeout
if retryNum <= 0 {
retryNum = proto.DefaultTxConflictRetryNum
}
s.mw.TxConflictRetryNum = retryNum
if retryInterval <= 0 {
retryInterval = proto.DefaultTxConflictRetryInterval
}
s.mw.TxConflictRetryInterval = retryInterval
log.LogDebugf("SetTransaction: mask[%v], op[%v], timeout[%v], retryNum[%v], retryInterval[%v ms]",
mask, txMaskStr, timeout, retryNum, retryInterval)
}
package common
import (
"sync"
"sync/atomic"
"github.com/cubefs/cubefs/util/config"
)
const (
StateStandby uint32 = iota
StateStart
StateRunning
StateShutdown
StateStopped
)
type Control struct {
state uint32
wg sync.WaitGroup
}
type Server interface {
Start(cfg *config.Config) error
Shutdown()
// Sync blocks the calling goroutine until the server has shut down.
Sync()
}
type (
DoStartFunc func(s Server, cfg *config.Config) (err error)
DoShutdownFunc func(s Server)
)
func (c *Control) Start(s Server, cfg *config.Config, do DoStartFunc) (err error) {
if atomic.CompareAndSwapUint32(&c.state, StateStandby, StateStart) {
defer func() {
var newState uint32
if err != nil {
newState = StateStandby
} else {
newState = StateRunning
}
atomic.StoreUint32(&c.state, newState)
}()
if err = do(s, cfg); err != nil {
return
}
c.wg.Add(1)
}
return
}
func (c *Control) Shutdown(s Server, do DoShutdownFunc) {
if atomic.CompareAndSwapUint32(&c.state, StateRunning, StateShutdown) {
do(s)
c.wg.Done()
atomic.StoreUint32(&c.state, StateStopped)
}
}
func (c *Control) Sync() {
if atomic.LoadUint32(&c.state) == StateRunning {
c.wg.Wait()
}
}
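// Illustrative usage of Control. The sampleNode type and the callback bodies
// below are assumptions for this example, not part of this package:
//
//	type sampleNode struct{ ctl Control }
//
//	func (n *sampleNode) Start(cfg *config.Config) error {
//		return n.ctl.Start(n, cfg, func(s Server, cfg *config.Config) error {
//			// bring up listeners, background loops, etc.
//			return nil
//		})
//	}
//
//	func (n *sampleNode) Shutdown() {
//		n.ctl.Shutdown(n, func(s Server) {
//			// release resources
//		})
//	}
//
//	func (n *sampleNode) Sync() { n.ctl.Sync() }
//
// Start moves the state Standby -> Start -> Running (or back to Standby on
// error), Shutdown moves Running -> Shutdown -> Stopped, and Sync blocks until
// Shutdown has completed.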
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"encoding/binary"
"encoding/json"
"fmt"
"hash/crc32"
"math"
"net"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/repl"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
// DataPartitionRepairTask defines the repair task for the data partition.
type DataPartitionRepairTask struct {
TaskType uint8
addr string
extents map[uint64]*storage.ExtentInfo
ExtentsToBeCreated []*storage.ExtentInfo
ExtentsToBeRepaired []*storage.ExtentInfo
LeaderTinyDeleteRecordFileSize int64
LeaderAddr string
}
func NewDataPartitionRepairTask(extentFiles []*storage.ExtentInfo, tinyDeleteRecordFileSize int64, source, leaderAddr string) (task *DataPartitionRepairTask) {
task = &DataPartitionRepairTask{
extents: make(map[uint64]*storage.ExtentInfo),
ExtentsToBeCreated: make([]*storage.ExtentInfo, 0),
ExtentsToBeRepaired: make([]*storage.ExtentInfo, 0),
LeaderTinyDeleteRecordFileSize: tinyDeleteRecordFileSize,
LeaderAddr: leaderAddr,
}
for _, extentFile := range extentFiles {
extentFile.Source = source
task.extents[extentFile.FileID] = extentFile
}
return
}
// Main function to perform the repair.
// The repair process can be described as follows:
// There are two types of repairs: the normal extent repair and the tiny extent repair.
// 1. normal extent repair:
// - the leader collects all the extent information from the followers.
// - for each extent, we compare all the replicas to find the one with the largest size.
// - periodically check the size of the local extent, and if it is smaller than the largest size,
// add it to the toBeRepaired list, and generate the corresponding tasks.
// 2. tiny extent repair:
// - when creating a new partition, add all tiny extents to the toBeRepaired list,
// and the repair task will create all the tiny extents first.
// - the leader of the replicas periodically collects the extent information of each follower.
// - for each extent, we compare all the replicas to find the one with the largest size.
// - periodically check the size of the local extent, and if it is smaller than the largest size,
// add it to the toBeRepaired list, and generate the corresponding tasks.
// A minimal sketch of the size comparison is given after this function.
func (dp *DataPartition) repair(extentType uint8) {
start := time.Now().UnixNano()
log.LogInfof("action[repair] partition(%v) start.", dp.partitionID)
var tinyExtents []uint64 // unavailable extents
if proto.IsTinyExtentType(extentType) {
tinyExtents = dp.brokenTinyExtents()
if len(tinyExtents) == 0 {
return
}
}
// Use a copy of the replicas to avoid an index panic if dp's replica list changes concurrently.
replica := dp.getReplicaCopy()
repairTasks := make([]*DataPartitionRepairTask, len(replica))
err := dp.buildDataPartitionRepairTask(repairTasks, extentType, tinyExtents, replica)
if err != nil {
log.LogErrorf(errors.Stack(err))
log.LogErrorf("action[repair] partition(%v) err(%v).",
dp.partitionID, err)
dp.moveToBrokenTinyExtentC(extentType, tinyExtents)
return
}
log.LogInfof("action[repair] partition(%v) before prepareRepairTasks", dp.partitionID)
// compare all the extents in the replicas to compute the good and bad ones
availableTinyExtents, brokenTinyExtents := dp.prepareRepairTasks(repairTasks)
// notify the replicas to repair the extent
err = dp.NotifyExtentRepair(repairTasks)
if err != nil {
dp.sendAllTinyExtentsToC(extentType, availableTinyExtents, brokenTinyExtents)
log.LogErrorf("action[repair] partition(%v) err(%v).",
dp.partitionID, err)
log.LogError(errors.Stack(err))
return
}
log.LogDebugf("DoRepair")
// ask the leader to do the repair
dp.DoRepair(repairTasks)
end := time.Now().UnixNano()
// every time we need to figure out which extents need to be repaired and which ones do not.
dp.sendAllTinyExtentsToC(extentType, availableTinyExtents, brokenTinyExtents)
// error check
if dp.extentStore.AvailableTinyExtentCnt()+dp.extentStore.BrokenTinyExtentCnt() > storage.TinyExtentCount {
log.LogWarnf("action[repair] partition(%v) GoodTinyExtents(%v) "+
"BadTinyExtents(%v) finish cost[%vms].", dp.partitionID, dp.extentStore.AvailableTinyExtentCnt(),
dp.extentStore.BrokenTinyExtentCnt(), (end-start)/int64(time.Millisecond))
}
log.LogInfof("action[repair] partition(%v) GoodTinyExtents(%v) BadTinyExtents(%v)"+
" finish cost[%vms] masterAddr(%v).", dp.partitionID, dp.extentStore.AvailableTinyExtentCnt(),
dp.extentStore.BrokenTinyExtentCnt(), (end-start)/int64(time.Millisecond), MasterClient.Nodes())
}
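// Minimal sketch of the size comparison described in the comment above repair
// (illustrative only; prepareRepairTasks below is the real implementation):
// for every extent, keep the replica copy with the largest total size, then
// mark every smaller copy as needing repair.
//
//	maxSize := make(map[uint64]*storage.ExtentInfo)
//	for _, task := range repairTasks {
//		if task == nil {
//			continue
//		}
//		for id, info := range task.extents {
//			if cur, ok := maxSize[id]; !ok || info.TotalSize() > cur.TotalSize() {
//				maxSize[id] = info
//			}
//		}
//	}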
func (dp *DataPartition) buildDataPartitionRepairTask(repairTasks []*DataPartitionRepairTask, extentType uint8, tinyExtents []uint64, replica []string) (err error) {
// get the local extent info
extents, leaderTinyDeleteRecordFileSize, err := dp.getLocalExtentInfo(extentType, tinyExtents)
if err != nil {
return err
}
// new repair task for the leader
log.LogInfof("buildDataPartitionRepairTask dp %v, extent type %v, len extent %v, replica size %v", dp.partitionID, extentType, len(extents), len(replica))
repairTasks[0] = NewDataPartitionRepairTask(extents, leaderTinyDeleteRecordFileSize, replica[0], replica[0])
repairTasks[0].addr = replica[0]
// new repair tasks for the followers
for index := 1; index < len(replica); index++ {
extents, err := dp.getRemoteExtentInfo(extentType, tinyExtents, replica[index])
if err != nil {
log.LogErrorf("buildDataPartitionRepairTask PartitionID(%v) on (%v) err(%v)", dp.partitionID, replica[index], err)
continue
}
log.LogInfof("buildDataPartitionRepairTask dp %v, add new add %v, extent type %v", dp.partitionID, replica[index], extentType)
repairTasks[index] = NewDataPartitionRepairTask(extents, leaderTinyDeleteRecordFileSize, replica[index], replica[0])
repairTasks[index].addr = replica[index]
}
return
}
func (dp *DataPartition) getLocalExtentInfo(extentType uint8, tinyExtents []uint64) (extents []*storage.ExtentInfo, leaderTinyDeleteRecordFileSize int64, err error) {
var localExtents []*storage.ExtentInfo
if proto.IsNormalExtentType(extentType) {
localExtents, leaderTinyDeleteRecordFileSize, err = dp.extentStore.GetAllWatermarks(storage.NormalExtentFilter())
} else {
localExtents, leaderTinyDeleteRecordFileSize, err = dp.extentStore.GetAllWatermarks(storage.TinyExtentFilter(tinyExtents))
}
if err != nil {
err = errors.Trace(err, "getLocalExtentInfo extent DataPartition(%v) GetAllWaterMark", dp.partitionID)
return
}
if len(localExtents) <= 0 {
extents = make([]*storage.ExtentInfo, 0)
return
}
extents = make([]*storage.ExtentInfo, 0, len(localExtents))
for _, et := range localExtents {
newEt := *et
extents = append(extents, &newEt)
}
return
}
func (dp *DataPartition) getRemoteExtentInfo(extentType uint8, tinyExtents []uint64,
target string) (extentFiles []*storage.ExtentInfo, err error) {
p := repl.NewPacketToGetAllWatermarks(dp.partitionID, extentType)
extentFiles = make([]*storage.ExtentInfo, 0)
if proto.IsTinyExtentType(extentType) {
p.Data, err = json.Marshal(tinyExtents)
if err != nil {
err = errors.Trace(err, "getRemoteExtentInfo DataPartition(%v) GetAllWatermarks", dp.partitionID)
return
}
p.Size = uint32(len(p.Data))
}
var conn *net.TCPConn
conn, err = gConnPool.GetConnect(target) // get remote connection
if err != nil {
err = errors.Trace(err, "getRemoteExtentInfo DataPartition(%v) get host(%v) connect", dp.partitionID, target)
return
}
defer func() {
gConnPool.PutConnect(conn, err != nil)
}()
err = p.WriteToConn(conn) // write command to the remote host
if err != nil {
err = errors.Trace(err, "getRemoteExtentInfo DataPartition(%v) write to host(%v)", dp.partitionID, target)
return
}
reply := new(repl.Packet)
err = reply.ReadFromConnWithVer(conn, proto.GetAllWatermarksDeadLineTime) // read the response
if err != nil {
err = errors.Trace(err, "getRemoteExtentInfo DataPartition(%v) read from host(%v)", dp.partitionID, target)
return
}
err = json.Unmarshal(reply.Data[:reply.Size], &extentFiles)
if err != nil {
err = errors.Trace(err, "getRemoteExtentInfo DataPartition(%v) unmarshal json(%v) from host(%v)",
dp.partitionID, string(reply.Data[:reply.Size]), target)
return
}
return
}
// DoRepair asks the leader to perform the repair tasks.
func (dp *DataPartition) DoRepair(repairTasks []*DataPartitionRepairTask) {
store := dp.extentStore
for _, extentInfo := range repairTasks[0].ExtentsToBeCreated {
if !AutoRepairStatus {
log.LogWarnf("AutoRepairStatus is False,so cannot Create extent(%v),pid=%d", extentInfo.String(), dp.partitionID)
continue
}
if dp.ExtentStore().IsDeletedNormalExtent(extentInfo.FileID) {
continue
}
dp.disk.allocCheckLimit(proto.IopsWriteType, 1)
store.Create(extentInfo.FileID)
}
log.LogDebugf("action[DoRepair] leader to repair len[%v], {%v}", len(repairTasks[0].ExtentsToBeRepaired), repairTasks[0].ExtentsToBeRepaired)
for _, extentInfo := range repairTasks[0].ExtentsToBeRepaired {
log.LogDebugf("action[DoRepair] leader to repair len[%v], {%v}", len(repairTasks[0].ExtentsToBeRepaired), extentInfo)
err := dp.streamRepairExtent(extentInfo)
if err != nil {
err = errors.Trace(err, "doStreamExtentFixRepair %v", dp.applyRepairKey(int(extentInfo.FileID)))
localExtentInfo, opErr := dp.ExtentStore().Watermark(uint64(extentInfo.FileID))
if opErr != nil {
err = errors.Trace(err, opErr.Error())
}
err = errors.Trace(err, "partition(%v) remote(%v) local(%v)",
dp.partitionID, extentInfo, localExtentInfo)
log.LogWarnf("action[doStreamExtentFixRepair] err(%v).", err)
}
}
}
func (dp *DataPartition) moveToBrokenTinyExtentC(extentType uint8, extents []uint64) {
if proto.IsTinyExtentType(extentType) {
dp.extentStore.SendAllToBrokenTinyExtentC(extents)
}
}
func (dp *DataPartition) sendAllTinyExtentsToC(extentType uint8, availableTinyExtents, brokenTinyExtents []uint64) {
if !proto.IsTinyExtentType(extentType) {
return
}
for _, extentID := range availableTinyExtents {
if storage.IsTinyExtent(extentID) {
dp.extentStore.SendToAvailableTinyExtentC(extentID)
}
}
for _, extentID := range brokenTinyExtents {
if storage.IsTinyExtent(extentID) {
dp.extentStore.SendToBrokenTinyExtentC(extentID)
}
}
}
func (dp *DataPartition) brokenTinyExtents() (brokenTinyExtents []uint64) {
brokenTinyExtents = make([]uint64, 0)
extentsToBeRepaired := MinTinyExtentsToRepair
if dp.extentStore.AvailableTinyExtentCnt() <= MinAvaliTinyExtentCnt {
extentsToBeRepaired = storage.TinyExtentCount
}
for i := 0; i < extentsToBeRepaired; i++ {
extentID, err := dp.extentStore.GetBrokenTinyExtent()
if err != nil {
return
}
brokenTinyExtents = append(brokenTinyExtents, extentID)
}
return
}
func (dp *DataPartition) prepareRepairTasks(repairTasks []*DataPartitionRepairTask) (availableTinyExtents []uint64, brokenTinyExtents []uint64) {
extentInfoMap := make(map[uint64]*storage.ExtentInfo)
deleteExtents := make(map[uint64]bool)
log.LogInfof("action[prepareRepairTasks] dp %v task len %v", dp.partitionID, len(repairTasks))
for index := 0; index < len(repairTasks); index++ {
repairTask := repairTasks[index]
if repairTask == nil {
continue
}
for extentID, extentInfo := range repairTask.extents {
if extentInfo.IsDeleted {
deleteExtents[extentID] = true
continue
}
extentWithMaxSize, ok := extentInfoMap[extentID]
if !ok {
extentInfoMap[extentID] = extentInfo
} else {
if extentInfo.TotalSize() > extentWithMaxSize.TotalSize() {
extentInfoMap[extentID] = extentInfo
}
}
// log.LogInfof("action[prepareRepairTasks] dp %v extentid %v addr[dst %v,leader %v] info %v", dp.partitionID, extentID, repairTask.addr, repairTask.LeaderAddr, extentInfoMap[extentID])
}
}
for extentID := range deleteExtents {
extentInfo := extentInfoMap[extentID]
if extentInfo != nil {
extentInfo.IsDeleted = true
extentInfoMap[extentID] = extentInfo
}
}
dp.buildExtentCreationTasks(repairTasks, extentInfoMap)
availableTinyExtents, brokenTinyExtents = dp.buildExtentRepairTasks(repairTasks, extentInfoMap)
return
}
// Create a new extent if one of the replicas is missing it.
func (dp *DataPartition) buildExtentCreationTasks(repairTasks []*DataPartitionRepairTask, extentInfoMap map[uint64]*storage.ExtentInfo) {
for extentID, extentInfo := range extentInfoMap {
if storage.IsTinyExtent(extentID) {
continue
}
for index := 0; index < len(repairTasks); index++ {
repairTask := repairTasks[index]
if repairTask == nil {
continue
}
if _, ok := repairTask.extents[extentID]; !ok && !extentInfo.IsDeleted {
if storage.IsTinyExtent(extentID) {
continue
}
if extentInfo.IsDeleted {
continue
}
if dp.ExtentStore().IsDeletedNormalExtent(extentID) {
continue
}
ei := &storage.ExtentInfo{Source: extentInfo.Source, FileID: extentID, Size: extentInfo.Size, SnapshotDataOff: extentInfo.SnapshotDataOff}
repairTask.ExtentsToBeCreated = append(repairTask.ExtentsToBeCreated, ei)
repairTask.ExtentsToBeRepaired = append(repairTask.ExtentsToBeRepaired, ei)
log.LogInfof("action[generatorAddExtentsTasks] addFile(%v_%v) on Index(%v).", dp.partitionID, ei, index)
}
}
}
}
// Repair an extent if the replicas do not have the same length.
func (dp *DataPartition) buildExtentRepairTasks(repairTasks []*DataPartitionRepairTask, maxSizeExtentMap map[uint64]*storage.ExtentInfo) (availableTinyExtents []uint64, brokenTinyExtents []uint64) {
availableTinyExtents = make([]uint64, 0)
brokenTinyExtents = make([]uint64, 0)
for extentID, maxFileInfo := range maxSizeExtentMap {
hasBeenRepaired := true
for index := 0; index < len(repairTasks); index++ {
if repairTasks[index] == nil {
continue
}
extentInfo, ok := repairTasks[index].extents[extentID]
if !ok {
continue
}
if extentInfo.IsDeleted {
continue
}
if dp.ExtentStore().IsDeletedNormalExtent(extentID) {
continue
}
if extentInfo.TotalSize() < maxFileInfo.TotalSize() {
fixExtent := &storage.ExtentInfo{Source: maxFileInfo.Source, FileID: extentID, Size: maxFileInfo.Size, SnapshotDataOff: maxFileInfo.SnapshotDataOff}
repairTasks[index].ExtentsToBeRepaired = append(repairTasks[index].ExtentsToBeRepaired, fixExtent)
log.LogInfof("action[generatorFixExtentSizeTasks] fixExtent(%v_%v) on Index(%v) on(%v).",
dp.partitionID, fixExtent, index, repairTasks[index].addr)
hasBeenRepaired = false
}
}
if storage.IsTinyExtent(extentID) {
if hasBeenRepaired {
availableTinyExtents = append(availableTinyExtents, extentID)
} else {
brokenTinyExtents = append(brokenTinyExtents, extentID)
}
}
}
return
}
func (dp *DataPartition) notifyFollower(wg *sync.WaitGroup, index int, members []*DataPartitionRepairTask) (err error) {
p := repl.NewPacketToNotifyExtentRepair(dp.partitionID) // notify all the followers to repair
var conn *net.TCPConn
// target := dp.getReplicaAddr(index)
// Use the address recorded in the task to avoid a panic when dp's replica list has changed.
target := members[index].addr
p.Data, _ = json.Marshal(members[index])
p.Size = uint32(len(p.Data))
conn, err = gConnPool.GetConnect(target)
defer func() {
wg.Done()
if err == nil {
log.LogInfof(ActionNotifyFollowerToRepair+" to host(%v) Partition(%v) done", target, dp.partitionID)
} else {
log.LogErrorf(ActionNotifyFollowerToRepair+" to host(%v) Partition(%v) failed, err(%v)", target, dp.partitionID, err)
}
}()
if err != nil {
return err
}
defer func() {
gConnPool.PutConnect(conn, err != nil)
}()
if err = p.WriteToConn(conn); err != nil {
return err
}
if err = p.ReadFromConnWithVer(conn, proto.NoReadDeadlineTime); err != nil {
return err
}
return err
}
// NotifyExtentRepair notifies the followers to repair.
func (dp *DataPartition) NotifyExtentRepair(members []*DataPartitionRepairTask) (err error) {
wg := new(sync.WaitGroup)
for i := 1; i < len(members); i++ {
if members[i] == nil || !dp.IsExistReplica(members[i].addr) {
if members[i] != nil {
log.LogInfof("notify extend repair is change ,index(%v),pid(%v),task_member_add(%v),IsExistReplica(%v)",
i, dp.partitionID, members[i].addr, dp.IsExistReplica(members[i].addr))
}
continue
}
wg.Add(1)
go dp.notifyFollower(wg, i, members)
}
wg.Wait()
return
}
// doStreamExtentFixRepair executes the repair of a single extent on a follower.
func (dp *DataPartition) doStreamExtentFixRepair(wg *sync.WaitGroup, remoteExtentInfo *storage.ExtentInfo) {
defer wg.Done()
err := dp.streamRepairExtent(remoteExtentInfo)
if err != nil {
// only decommission repair needs to check the error count
if dp.isDecommissionRecovering() {
atomic.AddUint64(&dp.recoverErrCnt, 1)
if atomic.LoadUint64(&dp.recoverErrCnt) >= dp.dataNode.GetDpMaxRepairErrCnt() {
dp.handleDecommissionRecoverFailed()
return
}
}
err = errors.Trace(err, "doStreamExtentFixRepair %v", dp.applyRepairKey(int(remoteExtentInfo.FileID)))
localExtentInfo, opErr := dp.ExtentStore().Watermark(uint64(remoteExtentInfo.FileID))
if opErr != nil {
err = errors.Trace(err, opErr.Error())
}
err = errors.Trace(err, "partition(%v) remote(%v) local(%v)",
dp.partitionID, remoteExtentInfo, localExtentInfo)
log.LogWarnf("action[doStreamExtentFixRepair] err(%v).", err)
}
}
func (dp *DataPartition) applyRepairKey(extentID int) (m string) {
return fmt.Sprintf("ApplyRepairKey(%v_%v)", dp.partitionID, extentID)
}
// The actual repair of an extent happens here.
func (dp *DataPartition) streamRepairExtent(remoteExtentInfo *storage.ExtentInfo) (err error) {
log.LogDebugf("streamRepairExtent dp %v remote info %v", dp.partitionID, remoteExtentInfo)
store := dp.ExtentStore()
if !store.HasExtent(remoteExtentInfo.FileID) {
log.LogDebugf("streamRepairExtent remote info %v not exist", remoteExtentInfo)
return
}
if !AutoRepairStatus && !storage.IsTinyExtent(remoteExtentInfo.FileID) {
log.LogWarnf("AutoRepairStatus is False,so cannot AutoRepair extent(%v)", remoteExtentInfo.String())
return
}
localExtentInfo, err := store.Watermark(remoteExtentInfo.FileID)
if err != nil {
log.LogDebugf("streamRepairExtent local %v remote info %v", localExtentInfo, remoteExtentInfo)
return errors.Trace(err, "streamRepairExtent Watermark error")
}
log.LogDebugf("streamRepairExtent dp %v remote info %v,local %v", dp.partitionID, remoteExtentInfo, localExtentInfo)
if dp.ExtentStore().IsDeletedNormalExtent(remoteExtentInfo.FileID) {
log.LogDebugf("streamRepairExtent local %v remote info %v", localExtentInfo, remoteExtentInfo)
return nil
}
if localExtentInfo.Size >= remoteExtentInfo.Size && localExtentInfo.SnapshotDataOff >= remoteExtentInfo.SnapshotDataOff {
log.LogDebugf("streamRepairExtent local %v remote info %v", localExtentInfo, remoteExtentInfo)
return nil
}
doWork := func(wType int, currFixOffset uint64, dstOffset uint64, request *repl.Packet) (err error) {
log.LogDebugf("streamRepairExtent. currFixOffset %v dstOffset %v, request %v", currFixOffset, dstOffset, request)
var conn net.Conn
conn, err = dp.getRepairConn(remoteExtentInfo.Source)
if err != nil {
return errors.Trace(err, "streamRepairExtent get conn from host(%v) error", remoteExtentInfo.Source)
}
defer func() {
dp.putRepairConn(conn, err != nil)
}()
if err = request.WriteToConn(conn); err != nil {
err = errors.Trace(err, "streamRepairExtent send streamRead to host(%v) error", remoteExtentInfo.Source)
log.LogWarnf("action[streamRepairExtent] err(%v).", err)
return
}
var hasRecoverySize uint64
var loopTimes uint64
for currFixOffset < dstOffset {
if currFixOffset >= dstOffset {
break
}
reply := repl.NewPacket()
// read 64k streaming repair packet
if err = reply.ReadFromConnWithVer(conn, 60); err != nil {
err = errors.Trace(err, "streamRepairExtent receive data error,localExtentSize(%v) remoteExtentSize(%v)", currFixOffset, dstOffset)
return
}
if reply.ResultCode != proto.OpOk {
err = errors.Trace(fmt.Errorf("unknow result code"),
"streamRepairExtent receive opcode error(%v) ,localExtentSize(%v) remoteExtentSize(%v)", string(reply.Data[:intMin(len(reply.Data), int(reply.Size))]), currFixOffset, remoteExtentInfo.Size)
return
}
if reply.ReqID != request.ReqID || reply.PartitionID != request.PartitionID ||
reply.ExtentID != request.ExtentID {
err = errors.Trace(fmt.Errorf("unavali reply"), "streamRepairExtent receive unavalid "+
"request(%v) reply(%v) ,localExtentSize(%v) remoteExtentSize(%v)", request.GetUniqueLogId(), reply.GetUniqueLogId(), currFixOffset, dstOffset)
return
}
if !storage.IsTinyExtent(reply.ExtentID) && (reply.Size == 0 || reply.ExtentOffset != int64(currFixOffset)) {
err = errors.Trace(fmt.Errorf("unavali reply"), "streamRepairExtent receive unavalid "+
"request(%v) reply(%v) localExtentSize(%v) remoteExtentSize(%v)", request.GetUniqueLogId(), reply.GetUniqueLogId(), currFixOffset, dstOffset)
return
}
if loopTimes%100 == 0 {
log.LogInfof(fmt.Sprintf("action[streamRepairExtent] fix(%v_%v) start fix from (%v)"+
" remoteSize(%v)localSize(%v) reply(%v).", dp.partitionID, localExtentInfo.FileID, remoteExtentInfo.String(),
dstOffset, currFixOffset, reply.GetUniqueLogId()))
}
loopTimes++
actualCrc := crc32.ChecksumIEEE(reply.Data[:reply.Size])
if reply.CRC != actualCrc {
err = fmt.Errorf("streamRepairExtent crc mismatch expectCrc(%v) actualCrc(%v) extent(%v_%v) start fix from (%v)"+
" remoteSize(%v) localSize(%v) request(%v) reply(%v) ", reply.CRC, actualCrc, dp.partitionID, remoteExtentInfo.String(),
remoteExtentInfo.Source, dstOffset, currFixOffset, request.GetUniqueLogId(), reply.GetUniqueLogId())
return errors.Trace(err, "streamRepairExtent receive data error")
}
isEmptyResponse := false
// Write it to local extent file
if storage.IsTinyExtent(uint64(localExtentInfo.FileID)) {
currRecoverySize := uint64(reply.Size)
var remoteAvaliSize uint64
if reply.ArgLen == TinyExtentRepairReadResponseArgLen {
remoteAvaliSize = binary.BigEndian.Uint64(reply.Arg[9:TinyExtentRepairReadResponseArgLen])
}
if reply.Arg != nil { // compatibility with v1.2.0 recovery
isEmptyResponse = reply.Arg[0] == EmptyResponse
}
if isEmptyResponse {
currRecoverySize = binary.BigEndian.Uint64(reply.Arg[1:9])
reply.Size = uint32(currRecoverySize)
}
err = store.TinyExtentRecover(uint64(localExtentInfo.FileID), int64(currFixOffset), int64(currRecoverySize), reply.Data, reply.CRC, isEmptyResponse)
if hasRecoverySize+currRecoverySize >= remoteAvaliSize {
log.LogInfof("streamRepairTinyExtent(%v) recover fininsh,remoteAvaliSize(%v) "+
"hasRecoverySize(%v) currRecoverySize(%v)", dp.applyRepairKey(int(localExtentInfo.FileID)),
remoteAvaliSize, hasRecoverySize+currRecoverySize, currRecoverySize)
break
}
} else {
log.LogDebugf("streamRepairExtent reply size %v, currFixoffset %v, reply %v ", reply.Size, currFixOffset, reply)
_, err = store.Write(uint64(localExtentInfo.FileID), int64(currFixOffset), int64(reply.Size), reply.Data, reply.CRC, wType, BufferWrite)
}
// log.LogDebugf("streamRepairExtent reply size %v, currFixoffset %v, reply %v err %v", reply.Size, currFixOffset, reply, err)
// write to the local extent file
if err != nil {
err = errors.Trace(err, "streamRepairExtent repair data error ")
return
}
hasRecoverySize += uint64(reply.Size)
currFixOffset += uint64(reply.Size)
if currFixOffset >= dstOffset {
log.LogWarnf(fmt.Sprintf("action[streamRepairExtent] fix(%v_%v) start fix from (%v)"+
" remoteSize(%v)localSize(%v) reply(%v).", dp.partitionID, localExtentInfo.FileID, remoteExtentInfo.String(),
dstOffset, currFixOffset, reply.GetUniqueLogId()))
break
}
}
return
}
// size difference between the local extent and the remote extent
var request *repl.Packet
sizeDiff := remoteExtentInfo.Size - localExtentInfo.Size
if storage.IsTinyExtent(remoteExtentInfo.FileID) {
if sizeDiff >= math.MaxUint32 {
sizeDiff = math.MaxUint32 - util.MB
}
request = repl.NewTinyExtentRepairReadPacket(dp.partitionID, remoteExtentInfo.FileID, int(localExtentInfo.Size), int(sizeDiff))
currFixOffset := localExtentInfo.Size
return doWork(0, currFixOffset, remoteExtentInfo.Size, request)
} else {
if sizeDiff > 0 {
log.LogDebugf("streamRepairExtent. local info %v, remote %v", localExtentInfo, remoteExtentInfo)
request = repl.NewExtentRepairReadPacket(dp.partitionID, remoteExtentInfo.FileID, int(localExtentInfo.Size), int(sizeDiff))
currFixOffset := localExtentInfo.Size
if err = doWork(storage.AppendWriteType, currFixOffset, remoteExtentInfo.Size, request); err != nil {
return
}
}
sizeDiffVerAppend := remoteExtentInfo.SnapshotDataOff - localExtentInfo.SnapshotDataOff
if sizeDiffVerAppend > 0 {
request = repl.NewExtentRepairReadPacket(dp.partitionID, remoteExtentInfo.FileID, int(localExtentInfo.SnapshotDataOff), int(sizeDiffVerAppend))
currFixOffset := localExtentInfo.SnapshotDataOff
return doWork(storage.AppendRandomWriteType, currFixOffset, remoteExtentInfo.SnapshotDataOff, request)
}
}
return
}
func intMin(a, b int) int {
if a < b {
return a
} else {
return b
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"context"
"fmt"
syslog "log"
"os"
"path"
"regexp"
"strconv"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"golang.org/x/time/rate"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/loadutil"
"github.com/cubefs/cubefs/util/log"
"github.com/shirou/gopsutil/disk"
)
var (
// RegexpDataPartitionDir validates the directory name of a data partition.
RegexpDataPartitionDir, _ = regexp.Compile(`^datapartition_(\d)+_(\d)+$`)
RegexpCachePartitionDir, _ = regexp.Compile(`^cachepartition_(\d)+_(\d)+$`)
RegexpPreLoadPartitionDir, _ = regexp.Compile(`^preloadpartition_(\d)+_(\d)+$`)
RegexpExpiredDataPartitionDir, _ = regexp.Compile(`^expired_datapartition_(\d)+_(\d)+$`)
)
const (
ExpiredPartitionPrefix = "expired_"
ExpiredPartitionExistTime = time.Hour * time.Duration(24*7)
)
const (
DecommissionDiskMark = "decommissionDiskMark"
)
// Disk represents the structure of the disk
type Disk struct {
sync.RWMutex
Path string
ReadErrCnt uint64 // number of read errors
WriteErrCnt uint64 // number of write errors
Total uint64
Used uint64
Available uint64
Unallocated uint64
Allocated uint64
MaxErrCnt int // maximum number of errors
Status int // disk status such as READONLY
ReservedSpace uint64
DiskRdonlySpace uint64
RejectWrite bool
partitionMap map[uint64]*DataPartition
syncTinyDeleteRecordFromLeaderOnEveryDisk chan bool
space *SpaceManager
dataNode *DataNode
limitFactor map[uint32]*rate.Limiter
limitRead *ioLimiter
limitWrite *ioLimiter
// diskPartition info
diskPartition *disk.PartitionStat
DiskErrPartitionSet map[uint64]struct{}
decommission bool
}
const (
SyncTinyDeleteRecordFromLeaderOnEveryDisk = 5
)
type PartitionVisitor func(dp *DataPartition)
func NewDisk(path string, reservedSpace, diskRdonlySpace uint64, maxErrCnt int, space *SpaceManager) (d *Disk, err error) {
d = new(Disk)
d.Path = path
d.ReservedSpace = reservedSpace
d.DiskRdonlySpace = diskRdonlySpace
d.MaxErrCnt = maxErrCnt
d.RejectWrite = false
d.space = space
d.dataNode = space.dataNode
d.partitionMap = make(map[uint64]*DataPartition)
d.syncTinyDeleteRecordFromLeaderOnEveryDisk = make(chan bool, SyncTinyDeleteRecordFromLeaderOnEveryDisk)
err = d.computeUsage()
if err != nil {
return nil, err
}
err = d.updateSpaceInfo()
if err != nil {
return nil, err
}
// get disk partition info
d.diskPartition, err = loadutil.GetMatchParation(d.Path)
if err != nil {
// log but let execution continue
log.LogErrorf("get partition info error, path is %v error message %v", d.Path, err.Error())
err = nil
}
d.startScheduleToUpdateSpaceInfo()
d.limitFactor = make(map[uint32]*rate.Limiter, 0)
d.limitFactor[proto.FlowReadType] = rate.NewLimiter(rate.Limit(proto.QosDefaultDiskMaxFLowLimit), proto.QosDefaultBurst)
d.limitFactor[proto.FlowWriteType] = rate.NewLimiter(rate.Limit(proto.QosDefaultDiskMaxFLowLimit), proto.QosDefaultBurst)
d.limitFactor[proto.IopsReadType] = rate.NewLimiter(rate.Limit(proto.QosDefaultDiskMaxIoLimit), defaultIOLimitBurst)
d.limitFactor[proto.IopsWriteType] = rate.NewLimiter(rate.Limit(proto.QosDefaultDiskMaxIoLimit), defaultIOLimitBurst)
d.limitRead = newIOLimiter(space.dataNode.diskReadFlow, space.dataNode.diskReadIocc)
d.limitWrite = newIOLimiter(space.dataNode.diskWriteFlow, space.dataNode.diskWriteIocc)
d.DiskErrPartitionSet = make(map[uint64]struct{}, 0)
err = d.initDecommissionStatus()
if err != nil {
log.LogErrorf("action[NewDisk]: failed to load disk decommission status")
// NOTE: continue execution
err = nil
}
return
}
func (d *Disk) MarkDecommissionStatus(decommission bool) {
probePath := path.Join(d.Path, DecommissionDiskMark)
var err error
defer func() {
if err != nil {
log.LogErrorf("action[MarkDecommissionStatus]: %v", err)
return
}
}()
if decommission {
var file *os.File
if file, err = os.Create(probePath); err == nil {
file.Close()
}
} else {
err = os.Remove(probePath)
if os.IsNotExist(err) {
err = nil
}
}
d.decommission = decommission
}
func (d *Disk) GetDecommissionStatus() bool {
return d.decommission
}
func (d *Disk) initDecommissionStatus() error {
probePath := path.Join(d.Path, DecommissionDiskMark)
_, err := os.Stat(probePath)
if err == nil {
d.decommission = true
return nil
}
if os.IsNotExist(err) {
return nil
}
return err
}
func (d *Disk) GetDiskPartition() *disk.PartitionStat {
return d.diskPartition
}
func (d *Disk) updateQosLimiter() {
if d.dataNode.diskReadFlow > 0 {
d.limitFactor[proto.FlowReadType].SetLimit(rate.Limit(d.dataNode.diskReadFlow))
}
if d.dataNode.diskWriteFlow > 0 {
d.limitFactor[proto.FlowWriteType].SetLimit(rate.Limit(d.dataNode.diskWriteFlow))
}
if d.dataNode.diskReadIops > 0 {
d.limitFactor[proto.IopsReadType].SetLimit(rate.Limit(d.dataNode.diskReadIops))
}
if d.dataNode.diskWriteIops > 0 {
d.limitFactor[proto.IopsWriteType].SetLimit(rate.Limit(d.dataNode.diskWriteIops))
}
for i := proto.IopsReadType; i < proto.FlowWriteType; i++ {
log.LogInfof("action[updateQosLimiter] type %v limit %v", proto.QosTypeString(i), d.limitFactor[i].Limit())
}
log.LogInfof("action[updateQosLimiter] read(iocc:%d iops:%d flow:%d) write(iocc:%d iops:%d flow:%d)",
d.dataNode.diskReadIocc, d.dataNode.diskReadIops, d.dataNode.diskReadFlow,
d.dataNode.diskWriteIocc, d.dataNode.diskWriteIops, d.dataNode.diskWriteFlow)
d.limitRead.ResetIO(d.dataNode.diskReadIocc)
d.limitRead.ResetFlow(d.dataNode.diskReadFlow)
d.limitWrite.ResetIO(d.dataNode.diskWriteIocc)
d.limitWrite.ResetFlow(d.dataNode.diskWriteFlow)
}
func (d *Disk) allocCheckLimit(factorType uint32, used uint32) error {
if !(d.dataNode.diskQosEnableFromMaster && d.dataNode.diskQosEnable) {
return nil
}
ctx := context.Background()
d.limitFactor[factorType].WaitN(ctx, int(used))
return nil
}
// PartitionCount returns the number of partitions in the partition map.
func (d *Disk) PartitionCount() int {
d.RLock()
defer d.RUnlock()
return len(d.partitionMap)
}
func (d *Disk) CanWrite() bool {
if d.Status == proto.ReadWrite || !d.RejectWrite {
return true
}
// If ReservedSpace < diskFreeSpace < DiskRdonlySpace, writes are still allowed,
// but the disk and its partitions are read-only and no new partition can be created.
// If diskFreeSpace < ReservedSpace, writes are not allowed either.
if d.Total+d.DiskRdonlySpace > d.Used+d.ReservedSpace {
return true
}
return false
}
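// Worked example of the threshold above (the numbers are illustrative only):
// with Total = 1000 GiB, DiskRdonlySpace = 10 GiB, Used = 900 GiB and
// ReservedSpace = 5 GiB, the check 1010 GiB > 905 GiB holds, so writes to
// existing partitions are still accepted even though the disk may already be
// read-only for new partition creation.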
// Compute the disk usage
func (d *Disk) computeUsage() (err error) {
d.RLock()
defer d.RUnlock()
fs := syscall.Statfs_t{}
err = syscall.Statfs(d.Path, &fs)
if err != nil {
log.LogErrorf("computeUsage. err %v", err)
return
}
repairSize := uint64(d.repairAllocSize())
// total := math.Max(0, int64(fs.Blocks*uint64(fs.Bsize) - d.PreReserveSpace))
total := int64(fs.Blocks*uint64(fs.Bsize) - d.DiskRdonlySpace)
if total < 0 {
total = 0
}
d.Total = uint64(total)
// available := math.Max(0, int64(fs.Bavail*uint64(fs.Bsize) - d.PreReserveSpace))
available := int64(fs.Bavail*uint64(fs.Bsize) - d.DiskRdonlySpace - repairSize)
if available < 0 {
available = 0
}
d.Available = uint64(available)
// used := math.Max(0, int64(total - available))
free := int64(fs.Bfree*uint64(fs.Bsize) - d.DiskRdonlySpace - repairSize)
used := int64(total - free)
if used < 0 {
used = 0
}
d.Used = uint64(used)
allocatedSize := int64(0)
for _, dp := range d.partitionMap {
allocatedSize += int64(dp.Size())
}
log.LogDebugf("computeUsage. fs info [%v,%v,%v,%v] total %v available %v DiskRdonlySpace %v ReservedSpace %v allocatedSize %v",
fs.Blocks, fs.Bsize, fs.Bavail, fs.Bfree, d.Total, d.Available, d.DiskRdonlySpace, d.ReservedSpace, allocatedSize)
atomic.StoreUint64(&d.Allocated, uint64(allocatedSize))
// unallocated = math.Max(0, total - allocatedSize)
unallocated := total - allocatedSize
if unallocated < 0 {
unallocated = 0
}
if d.Available <= 0 {
d.RejectWrite = true
} else {
d.RejectWrite = false
}
d.Unallocated = uint64(unallocated)
log.LogDebugf("action[computeUsage] disk(%v) all(%v) available(%v) used(%v)", d.Path, d.Total, d.Available, d.Used)
return
}
func (d *Disk) repairAllocSize() int {
allocSize := 0
for _, dp := range d.partitionMap {
if dp.DataPartitionCreateType == proto.NormalCreateDataPartition || dp.leaderSize <= dp.used {
continue
}
allocSize += dp.leaderSize - dp.used
}
return allocSize
}
func (d *Disk) incReadErrCnt() {
atomic.AddUint64(&d.ReadErrCnt, 1)
}
func (d *Disk) getReadErrCnt() uint64 {
return atomic.LoadUint64(&d.ReadErrCnt)
}
func (d *Disk) incWriteErrCnt() {
atomic.AddUint64(&d.WriteErrCnt, 1)
}
func (d *Disk) getWriteErrCnt() uint64 {
return atomic.LoadUint64(&d.WriteErrCnt)
}
func (d *Disk) getTotalErrCnt() uint64 {
return d.getReadErrCnt() + d.getWriteErrCnt()
}
func (d *Disk) startScheduleToUpdateSpaceInfo() {
go func() {
updateSpaceInfoTicker := time.NewTicker(5 * time.Second)
checkStatusTicker := time.NewTicker(time.Minute * 2)
defer func() {
updateSpaceInfoTicker.Stop()
checkStatusTicker.Stop()
}()
for {
select {
case <-updateSpaceInfoTicker.C:
d.computeUsage()
d.updateSpaceInfo()
case <-checkStatusTicker.C:
d.checkDiskStatus()
}
}
}()
}
func (d *Disk) doBackendTask() {
for {
partitions := make([]*DataPartition, 0)
d.RLock()
for _, dp := range d.partitionMap {
partitions = append(partitions, dp)
}
d.RUnlock()
for _, dp := range partitions {
dp.extentStore.BackendTask()
}
time.Sleep(time.Minute)
}
}
const (
DiskStatusFile = ".diskStatus"
)
func (d *Disk) checkDiskStatus() {
if d.Status == proto.Unavailable {
log.LogInfof("[checkDiskStatus] disk status is unavailable, no need to check, disk path(%v)", d.Path)
return
}
path := path.Join(d.Path, DiskStatusFile)
fp, err := os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_RDWR, 0o755)
if err != nil {
d.CheckDiskError(err, ReadFlag)
return
}
defer fp.Close()
data := []byte(DiskStatusFile)
_, err = fp.WriteAt(data, 0)
if err != nil {
d.CheckDiskError(err, WriteFlag)
return
}
if err = fp.Sync(); err != nil {
d.CheckDiskError(err, WriteFlag)
return
}
if _, err = fp.ReadAt(data, 0); err != nil {
d.CheckDiskError(err, ReadFlag)
return
}
}
const DiskErrNotAssociatedWithPartition uint64 = 0 // use 0 for disk error without any data partition
func (d *Disk) CheckDiskError(err error, rwFlag uint8) {
if err == nil {
return
}
log.LogWarnf("CheckDiskError disk err: %v, disk:%v", err.Error(), d.Path)
if !IsDiskErr(err.Error()) {
return
}
d.triggerDiskError(rwFlag, DiskErrNotAssociatedWithPartition)
}
func (d *Disk) doDiskError() {
d.Status = proto.Unavailable
// d.ForceExitRaftStore()
}
func (d *Disk) triggerDiskError(rwFlag uint8, dpId uint64) {
mesg := fmt.Sprintf("disk path %v error on %v, dpId %v", d.Path, LocalIP, dpId)
exporter.Warning(mesg)
log.LogWarnf(mesg)
if rwFlag == WriteFlag {
d.incWriteErrCnt()
} else if rwFlag == ReadFlag {
d.incReadErrCnt()
} else {
d.incWriteErrCnt()
d.incReadErrCnt()
}
d.AddDiskErrPartition(dpId)
diskErrCnt := d.getTotalErrCnt()
diskErrPartitionCnt := d.GetDiskErrPartitionCount()
if diskErrPartitionCnt >= d.dataNode.diskUnavailablePartitionErrorCount {
msg := fmt.Sprintf("set disk unavailable for too many disk error, "+
"disk path(%v), ip(%v), diskErrCnt(%v), diskErrPartitionCnt(%v) threshold(%v)",
d.Path, LocalIP, diskErrCnt, diskErrPartitionCnt, d.dataNode.diskUnavailablePartitionErrorCount)
exporter.Warning(msg)
log.LogWarnf(msg)
d.doDiskError()
}
}
func (d *Disk) updateSpaceInfo() (err error) {
var statsInfo syscall.Statfs_t
if err = syscall.Statfs(d.Path, &statsInfo); err != nil {
d.incReadErrCnt()
}
if d.Status == proto.Unavailable {
mesg := fmt.Sprintf("disk path %v error on %v", d.Path, LocalIP)
log.LogErrorf(mesg)
exporter.Warning(mesg)
// d.ForceExitRaftStore()
} else if d.Available <= 0 {
d.Status = proto.ReadOnly
} else {
d.Status = proto.ReadWrite
}
log.LogDebugf("action[updateSpaceInfo] disk(%v) total(%v) available(%v) remain(%v) "+
"restSize(%v) preRestSize (%v) maxErrs(%v) readErrs(%v) writeErrs(%v) status(%v)", d.Path,
d.Total, d.Available, d.Unallocated, d.ReservedSpace, d.DiskRdonlySpace, d.MaxErrCnt, d.ReadErrCnt, d.WriteErrCnt, d.Status)
return
}
// AttachDataPartition adds a data partition to the partition map.
func (d *Disk) AttachDataPartition(dp *DataPartition) {
d.Lock()
d.partitionMap[dp.partitionID] = dp
d.Unlock()
d.computeUsage()
}
// DetachDataPartition removes a data partition from the partition map.
func (d *Disk) DetachDataPartition(dp *DataPartition) {
d.Lock()
delete(d.partitionMap, dp.partitionID)
delete(d.DiskErrPartitionSet, dp.partitionID)
d.Unlock()
d.computeUsage()
}
// GetDataPartition returns the data partition based on the given partition ID.
func (d *Disk) GetDataPartition(partitionID uint64) (partition *DataPartition) {
d.RLock()
defer d.RUnlock()
return d.partitionMap[partitionID]
}
func (d *Disk) GetDataPartitionCount() int {
d.RLock()
defer d.RUnlock()
return len(d.partitionMap)
}
func (d *Disk) ForceExitRaftStore() {
partitionList := d.DataPartitionList()
for _, partitionID := range partitionList {
partition := d.GetDataPartition(partitionID)
partition.partitionStatus = proto.Unavailable
partition.stopRaft()
}
}
// DataPartitionList returns a list of the data partitions
func (d *Disk) DataPartitionList() (partitionIDs []uint64) {
d.Lock()
defer d.Unlock()
partitionIDs = make([]uint64, 0, len(d.partitionMap))
for _, dp := range d.partitionMap {
partitionIDs = append(partitionIDs, dp.partitionID)
}
return
}
func unmarshalPartitionName(name string) (partitionID uint64, partitionSize int, err error) {
arr := strings.Split(name, "_")
if len(arr) != 3 {
err = fmt.Errorf("error DataPartition name(%v)", name)
return
}
if partitionID, err = strconv.ParseUint(arr[1], 10, 64); err != nil {
return
}
if partitionSize, err = strconv.Atoi(arr[2]); err != nil {
return
}
return
}
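// Example (the values are illustrative): a data partition directory such as
// "datapartition_1024_128849018880" encodes the partition ID and size, so
//
//	id, size, err := unmarshalPartitionName("datapartition_1024_128849018880")
//	// id == 1024, size == 128849018880 (120 GiB), err == nil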
func (d *Disk) isPartitionDir(filename string) (isPartitionDir bool) {
isPartitionDir = RegexpDataPartitionDir.MatchString(filename) ||
RegexpCachePartitionDir.MatchString(filename) ||
RegexpPreLoadPartitionDir.MatchString(filename)
return
}
func (d *Disk) isExpiredPartitionDir(filename string) (isExpiredPartitionDir bool) {
isExpiredPartitionDir = RegexpExpiredDataPartitionDir.MatchString(filename)
return
}
// RestorePartition reads the files stored on the local disk and restores the data partitions.
func (d *Disk) RestorePartition(visitor PartitionVisitor) (err error) {
convert := func(node *proto.DataNodeInfo) *DataNodeInfo {
result := &DataNodeInfo{}
result.Addr = node.Addr
result.PersistenceDataPartitions = node.PersistenceDataPartitions
return result
}
var dataNode *proto.DataNodeInfo
for i := 0; i < 3; i++ {
dataNode, err = MasterClient.NodeAPI().GetDataNode(d.space.dataNode.localServerAddr)
if err != nil {
log.LogErrorf("action[RestorePartition]: getDataNode error %v", err)
continue
}
break
}
if dataNode == nil {
log.LogErrorf("action[RestorePartition]: failed to get data node info after retries, err(%v)", err)
return err
}
dinfo := convert(dataNode)
if len(dinfo.PersistenceDataPartitions) == 0 {
log.LogWarnf("action[RestorePartition]: length of PersistenceDataPartitions is 0, ExpiredPartition check " +
"without effect")
}
var (
partitionID uint64
partitionSize int
)
fileInfoList, err := os.ReadDir(d.Path)
if err != nil {
log.LogErrorf("action[RestorePartition] read dir(%v) err(%v).", d.Path, err)
return err
}
var (
wg sync.WaitGroup
toDeleteExpiredPartitionNames = make([]string, 0)
)
for _, fileInfo := range fileInfoList {
filename := fileInfo.Name()
if !d.isPartitionDir(filename) {
if d.isExpiredPartitionDir(filename) {
name := path.Join(d.Path, filename)
toDeleteExpiredPartitionNames = append(toDeleteExpiredPartitionNames, name)
log.LogInfof("action[RestorePartition] find expired partition on path(%s)", name)
}
continue
}
if partitionID, partitionSize, err = unmarshalPartitionName(filename); err != nil {
log.LogErrorf("action[RestorePartition] unmarshal partitionName(%v) from disk(%v) err(%v) ",
filename, d.Path, err.Error())
continue
}
log.LogDebugf("acton[RestorePartition] disk(%v) path(%v) PartitionID(%v) partitionSize(%v).",
d.Path, fileInfo.Name(), partitionID, partitionSize)
if isExpiredPartition(partitionID, dinfo.PersistenceDataPartitions) {
log.LogErrorf("action[RestorePartition]: find expired partition[%s], rename it and you can delete it "+
"manually", filename)
oldName := path.Join(d.Path, filename)
newName := path.Join(d.Path, ExpiredPartitionPrefix+filename)
os.Rename(oldName, newName)
toDeleteExpiredPartitionNames = append(toDeleteExpiredPartitionNames, newName)
continue
}
wg.Add(1)
go func(partitionID uint64, filename string) {
var (
dp *DataPartition
err error
)
defer wg.Done()
if dp, err = LoadDataPartition(path.Join(d.Path, filename), d); err != nil {
mesg := fmt.Sprintf("action[RestorePartition] new partition(%v) err(%v) ",
partitionID, err.Error())
log.LogError(mesg)
exporter.Warning(mesg)
syslog.Println(mesg)
return
}
if visitor != nil {
visitor(dp)
}
}(partitionID, filename)
}
if len(toDeleteExpiredPartitionNames) > 0 {
log.LogInfof("action[RestorePartition] expiredPartitions %v, disk %v", toDeleteExpiredPartitionNames, d.Path)
notDeletedExpiredPartitionNames := d.deleteExpiredPartitions(toDeleteExpiredPartitionNames)
if len(notDeletedExpiredPartitionNames) > 0 {
go func(toDeleteExpiredPartitions []string) {
ticker := time.NewTicker(ExpiredPartitionExistTime)
log.LogInfof("action[RestorePartition] delete expiredPartitions automatically start, toDeleteExpiredPartitions %v", toDeleteExpiredPartitions)
<-ticker.C
d.deleteExpiredPartitions(toDeleteExpiredPartitionNames)
ticker.Stop()
log.LogInfof("action[RestorePartition] delete expiredPartitions automatically finish")
}(notDeletedExpiredPartitionNames)
}
}
wg.Wait()
return err
}
func (d *Disk) deleteExpiredPartitions(toDeleteExpiredPartitionNames []string) (notDeletedExpiredPartitionNames []string) {
notDeletedExpiredPartitionNames = make([]string, 0)
for _, partitionName := range toDeleteExpiredPartitionNames {
dirName, fileName := path.Split(partitionName)
if !d.isExpiredPartitionDir(fileName) {
log.LogInfof("action[deleteExpiredPartitions] partition %v on %v is not expiredPartition", fileName, dirName)
continue
}
dirInfo, err := os.Stat(partitionName)
if err != nil {
log.LogErrorf("action[deleteExpiredPartitions] stat expiredPartition %v fail, err(%v)", partitionName, err)
continue
}
dirStat := dirInfo.Sys().(*syscall.Stat_t)
nowTime := time.Now().Unix()
expiredTime := dirStat.Ctim.Sec
if nowTime-expiredTime >= int64(ExpiredPartitionExistTime.Seconds()) {
err := os.RemoveAll(partitionName)
if err != nil {
log.LogErrorf("action[deleteExpiredPartitions] delete expiredPartition %v automatically fail, err(%v)", partitionName, err)
continue
}
log.LogInfof("action[deleteExpiredPartitions] delete expiredPartition %v automatically", partitionName)
} else {
notDeletedExpiredPartitionNames = append(notDeletedExpiredPartitionNames, partitionName)
}
}
return
}
func (d *Disk) AddSize(size uint64) {
atomic.AddUint64(&d.Allocated, size)
}
func (d *Disk) updateDisk(allocSize uint64) {
d.Lock()
defer d.Unlock()
if d.Available < allocSize {
d.Status = proto.ReadOnly
d.Available = 0
return
}
d.Available = d.Available - allocSize
}
func (d *Disk) getSelectWeight() float64 {
return float64(atomic.LoadUint64(&d.Allocated)) / float64(d.Total)
}
func (d *Disk) AddDiskErrPartition(dpId uint64) {
if _, ok := d.DiskErrPartitionSet[dpId]; !ok {
d.DiskErrPartitionSet[dpId] = struct{}{}
}
}
func (d *Disk) GetDiskErrPartitionList() (diskErrPartitionList []uint64) {
diskErrPartitionList = make([]uint64, 0)
for k := range d.DiskErrPartitionSet {
diskErrPartitionList = append(diskErrPartitionList, k)
}
return diskErrPartitionList
}
func (d *Disk) GetDiskErrPartitionCount() uint64 {
return uint64(len(d.DiskErrPartitionSet))
}
// isExpiredPartition returns whether a partition is expired.
// If a partition is not known to the master, we consider it expired.
func isExpiredPartition(id uint64, partitions []uint64) bool {
if len(partitions) == 0 {
return true
}
for _, existId := range partitions {
if existId == id {
return false
}
}
return true
}
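// Illustrative calls (values are made up for the example):
//
//	isExpiredPartition(7, []uint64{1, 2, 3}) // true: 7 is not known to the master
//	isExpiredPartition(2, []uint64{1, 2, 3}) // false
//	isExpiredPartition(2, nil)               // true: an empty list marks every partition as expired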
//go:build gofuzz
// +build gofuzz
// Copyright 2023 ADA Logics Ltd
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package datanode
import (
fuzz "github.com/AdaLogics/go-fuzz-headers"
)
type NewDiskParam struct {
Path string
ReservedSpace uint64
DiskRdonlySpace uint64
MaxErrCnt int
Space *SpaceManager
}
func FuzzNewDisk(data []byte) int {
f := fuzz.NewConsumer(data)
param := NewDiskParam{}
err := f.GenerateStruct(&param)
if err != nil {
return 0
}
disk, err := NewDisk(param.Path, param.ReservedSpace, param.DiskRdonlySpace, param.MaxErrCnt, param.Space)
if disk == nil {
return 0
}
if err != nil {
return 0
}
return 1
}
package datanode
import (
"context"
"fmt"
"golang.org/x/time/rate"
)
var (
deleteLimiteRater = rate.NewLimiter(rate.Inf, defaultMarkDeleteLimitBurst)
MaxExtentRepairLimit = 20000
MinExtentRepairLimit = 5
CurExtentRepairLimit = MaxExtentRepairLimit
extentRepairLimitRater chan struct{}
)
func initRepairLimit() {
extentRepairLimitRater = make(chan struct{}, MaxExtentRepairLimit)
for i := 0; i < MaxExtentRepairLimit; i++ {
extentRepairLimitRater <- struct{}{}
}
}
func requestDoExtentRepair() (err error) {
err = fmt.Errorf("repair limit, cannot do extentRepair")
select {
case <-extentRepairLimitRater:
return nil
default:
return
}
}
func fininshDoExtentRepair() {
select {
case extentRepairLimitRater <- struct{}{}:
return
default:
return
}
}
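// Illustrative usage of the extent-repair concurrency limiter above (the caller
// shown here is hypothetical):
//
//	if err := requestDoExtentRepair(); err != nil {
//		return err // limit reached, skip this repair for now
//	}
//	defer fininshDoExtentRepair()
//	// ... perform the actual extent repair ...
//
// Each successful requestDoExtentRepair consumes one token from
// extentRepairLimitRater and fininshDoExtentRepair puts it back.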
func setDoExtentRepair(value int) {
if value <= 0 {
value = MaxExtentRepairLimit
}
if value > MaxExtentRepairLimit {
value = MaxExtentRepairLimit
}
if value < MinExtentRepairLimit {
value = MinExtentRepairLimit
}
if CurExtentRepairLimit != value {
CurExtentRepairLimit = value
close(extentRepairLimitRater)
extentRepairLimitRater = make(chan struct{}, CurExtentRepairLimit)
for i := 0; i < CurExtentRepairLimit; i++ {
extentRepairLimitRater <- struct{}{}
}
}
}
func DeleteLimiterWait() {
ctx := context.Background()
deleteLimiteRater.Wait(ctx)
}
func setLimiter(limiter *rate.Limiter, limitValue uint64) {
r := limitValue
l := rate.Limit(r)
if r == 0 {
l = rate.Inf
}
limiter.SetLimit(l)
}
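// exampleRepairAndDeleteLimits is an illustrative sketch and not part of the
// original flow: it shows the intended use of the extent-repair token bucket
// (acquire a token before a repair, release it when done) and how setLimiter
// treats a limit of zero as unlimited. The limit values are hypothetical.
func exampleRepairAndDeleteLimits() {
	initRepairLimit()
	if err := requestDoExtentRepair(); err == nil {
		// ... perform one extent repair ...
		fininshDoExtentRepair()
	}
	// A value of 0 maps to rate.Inf, so DeleteLimiterWait callers are not throttled.
	setLimiter(deleteLimiteRater, 0)
	// A positive value throttles mark-delete operations to that many per second.
	setLimiter(deleteLimiteRater, 128)
}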
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"context"
"math"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/util/log"
"golang.org/x/time/rate"
)
const minusOne = ^uint32(0)
type ioLimiter struct {
limit int
flow *rate.Limiter
io atomic.Value
}
type LimiterStatus struct {
FlowLimit int
FlowUsed int
IOConcurrency int
IOQueue int
IORunning int
IOWaiting int
}
// The flow rate limiter's burst is double the limit.
// The max io queue size is 8 times the io concurrency.
func newIOLimiter(flowLimit, ioConcurrency int) *ioLimiter {
flow := rate.NewLimiter(rate.Inf, 0)
if flowLimit > 0 {
flow = rate.NewLimiter(rate.Limit(flowLimit), 2*flowLimit)
}
l := &ioLimiter{limit: flowLimit, flow: flow}
l.io.Store(newIOQueue(ioConcurrency))
return l
}
func (l *ioLimiter) getIO() *ioQueue {
return l.io.Load().(*ioQueue)
}
func (l *ioLimiter) ResetFlow(flowLimit int) {
l.limit = flowLimit
if flowLimit <= 0 {
l.flow.SetLimit(rate.Inf)
l.flow.SetBurst(0)
} else {
l.flow.SetLimit(rate.Limit(flowLimit))
l.flow.SetBurst(2 * flowLimit)
}
}
func (l *ioLimiter) ResetIO(ioConcurrency int) {
q := l.io.Swap(newIOQueue(ioConcurrency)).(*ioQueue)
q.Close()
}
func (l *ioLimiter) Run(size int, taskFn func()) {
if size > 0 {
if err := l.flow.WaitN(context.Background(), size); err != nil {
log.LogWarnf("action[limitio] run wait flow with %d %s", size, err.Error())
}
}
l.getIO().Run(taskFn)
}
func (l *ioLimiter) TryRun(size int, taskFn func()) bool {
if ok := l.getIO().TryRun(taskFn); !ok {
return false
}
if size > 0 {
if err := l.flow.WaitN(context.Background(), size); err != nil {
log.LogWarnf("action[limitio] tryrun wait flow with %d %s", size, err.Error())
return false
}
}
return true
}
func (l *ioLimiter) Status() (st LimiterStatus) {
st = l.getIO().Status()
limit := l.limit
st.FlowLimit = limit
if limit > 0 {
now := time.Now()
reserve := l.flow.ReserveN(now, l.flow.Burst())
duration := reserve.DelayFrom(now)
reserve.Cancel()
if ms := duration.Microseconds(); ms > 0 {
st.FlowUsed = int(math.Ceil(float64(limit) * (float64(ms) / 1e6)))
}
}
return
}
func (l *ioLimiter) Close() {
q := l.io.Swap(newIOQueue(0)).(*ioQueue)
q.Close()
}
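// exampleIOLimiterUsage is an illustrative sketch and not part of the original
// code path: it shows how a caller is expected to drive ioLimiter. With
// flowLimit=64 the burst is 2*64, and with ioConcurrency=4 the internal queue
// holds up to 8*4 waiting tasks. The numbers are hypothetical.
func exampleIOLimiterUsage() LimiterStatus {
	l := newIOLimiter(64, 4)
	defer l.Close()
	// Run blocks: it waits for flow tokens, then queues the task and waits for it to finish.
	l.Run(16, func() {
		// ... perform a 16-unit write ...
	})
	// TryRun refuses instead of blocking when the io queue is full.
	if ok := l.TryRun(16, func() {}); !ok {
		// The caller may fall back to returning a busy error here.
	}
	// Limits can be adjusted at runtime without recreating the limiter.
	l.ResetFlow(128)
	l.ResetIO(8)
	return l.Status()
}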
type task struct {
fn func()
done chan struct{}
}
type ioQueue struct {
wg sync.WaitGroup
once sync.Once
running uint32
concurrency int
stopCh chan struct{}
queue chan *task
}
func newIOQueue(concurrency int) *ioQueue {
q := &ioQueue{concurrency: concurrency}
if q.concurrency <= 0 {
return q
}
q.stopCh = make(chan struct{})
q.queue = make(chan *task, 8*concurrency)
q.wg.Add(concurrency)
for ii := 0; ii < concurrency; ii++ {
go func() {
defer q.wg.Done()
for {
select {
case <-q.stopCh:
return
case task := <-q.queue:
atomic.AddUint32(&q.running, 1)
task.fn()
atomic.AddUint32(&q.running, minusOne)
close(task.done)
}
}
}()
}
return q
}
func (q *ioQueue) Run(taskFn func()) {
if q.concurrency <= 0 {
taskFn()
return
}
select {
case <-q.stopCh:
taskFn()
return
default:
}
task := &task{fn: taskFn, done: make(chan struct{})}
select {
case <-q.stopCh:
taskFn()
case q.queue <- task:
<-task.done
}
}
func (q *ioQueue) TryRun(taskFn func()) bool {
if q.concurrency <= 0 {
taskFn()
return true
}
select {
case <-q.stopCh:
taskFn()
return true
default:
}
task := &task{fn: taskFn, done: make(chan struct{})}
select {
case <-q.stopCh:
taskFn()
return true
case q.queue <- task:
<-task.done
return true
default:
return false
}
}
func (q *ioQueue) Status() (st LimiterStatus) {
st.IOConcurrency = q.concurrency
st.IOQueue = cap(q.queue)
st.IORunning = int(atomic.LoadUint32(&q.running))
st.IOWaiting = len(q.queue)
return
}
func (q *ioQueue) Close() {
q.once.Do(func() {
if q.concurrency > 0 {
close(q.stopCh)
}
})
q.wg.Wait()
	// Drain any remaining queued tasks after close so their callers are not
	// left blocked; exit after one minute without new tasks.
go func() {
waitTimer := time.NewTimer(time.Minute)
defer waitTimer.Stop()
for {
select {
case task := <-q.queue:
task.fn()
close(task.done)
waitTimer.Reset(time.Minute)
case <-waitTimer.C:
return
}
}
}()
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"fmt"
"time"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
const (
StatPeriod = time.Minute * time.Duration(1)
MetricPartitionIOName = "dataPartitionIO"
MetricPartitionIOBytesName = "dataPartitionIOBytes"
MetricLackDpCount = "lackDataPartitionCount"
MetricCapacityToCreateDp = "capacityToCreateDp"
MetricConnectionCnt = "connectionCnt"
MetricDpCount = "dataPartitionCount"
MetricTotalDpSize = "totalDpSize"
MetricCapacity = "capacity"
)
type DataNodeMetrics struct {
dataNode *DataNode
stopC chan struct{}
MetricIOBytes *exporter.Counter
MetricLackDpCount *exporter.GaugeVec
MetricCapacityToCreateDp *exporter.GaugeVec
MetricConnectionCnt *exporter.Gauge
MetricDpCount *exporter.Gauge
MetricTotalDpSize *exporter.Gauge
MetricCapacity *exporter.GaugeVec
}
func (d *DataNode) registerMetrics() {
d.metrics = &DataNodeMetrics{
dataNode: d,
stopC: make(chan struct{}),
}
d.metrics.MetricIOBytes = exporter.NewCounter(MetricPartitionIOBytesName)
d.metrics.MetricLackDpCount = exporter.NewGaugeVec(MetricLackDpCount, "", []string{"type"})
d.metrics.MetricCapacityToCreateDp = exporter.NewGaugeVec(MetricCapacityToCreateDp, "", []string{"type"})
d.metrics.MetricConnectionCnt = exporter.NewGauge(MetricConnectionCnt)
d.metrics.MetricDpCount = exporter.NewGauge(MetricDpCount)
d.metrics.MetricTotalDpSize = exporter.NewGauge(MetricTotalDpSize)
d.metrics.MetricCapacity = exporter.NewGaugeVec(MetricCapacity, "", []string{"type"})
}
func (d *DataNode) startMetrics() {
go d.metrics.statMetrics()
log.LogInfof("startMetrics")
}
func (d *DataNode) closeMetrics() {
close(d.metrics.stopC)
log.LogInfof("closeMetrics")
}
func GetIoMetricLabels(partition *DataPartition, tp string) map[string]string {
labels := make(map[string]string)
labels[exporter.Vol] = partition.volumeID
labels[exporter.Type] = tp
labels[exporter.Disk] = partition.disk.Path
if exporter.EnablePid {
labels[exporter.PartId] = fmt.Sprintf("%d", partition.partitionID)
}
return labels
}
func (dm *DataNodeMetrics) statMetrics() {
ticker := time.NewTicker(StatPeriod)
for {
select {
case <-dm.stopC:
ticker.Stop()
log.LogInfof("stop metrics ticker")
return
case <-ticker.C:
dm.doStat()
}
}
}
func (dm *DataNodeMetrics) doStat() {
dm.setLackDpCountMetrics()
dm.setCapacityToCreateDpMetrics()
dm.setConnectionCntMetrics()
dm.setDpCountMetrics()
dm.setTotalDpSizeMetrics()
dm.setCapacityMetrics()
}
func (dm *DataNodeMetrics) setLackDpCountMetrics() {
lackPartitionsInMem := dm.dataNode.space.stats.LackPartitionsInMem
lackPartitionsInDisk := dm.dataNode.space.stats.LackPartitionsInDisk
dm.MetricLackDpCount.SetWithLabelValues(float64(lackPartitionsInMem), "inMemory")
dm.MetricLackDpCount.SetWithLabelValues(float64(lackPartitionsInDisk), "inDisk")
}
func (dm *DataNodeMetrics) setCapacityToCreateDpMetrics() {
remainingCapacityToCreateDp := dm.dataNode.space.stats.RemainingCapacityToCreatePartition
maxCapacityToCreateDp := dm.dataNode.space.stats.MaxCapacityToCreatePartition
dm.MetricCapacityToCreateDp.SetWithLabelValues(float64(remainingCapacityToCreateDp), "remaining")
dm.MetricCapacityToCreateDp.SetWithLabelValues(float64(maxCapacityToCreateDp), "max")
}
func (dm *DataNodeMetrics) setConnectionCntMetrics() {
connectionCnt := dm.dataNode.space.stats.ConnectionCnt
dm.MetricConnectionCnt.Set(float64(connectionCnt))
}
func (dm *DataNodeMetrics) setDpCountMetrics() {
dpCount := dm.dataNode.space.stats.CreatedPartitionCnt
dm.MetricDpCount.Set(float64(dpCount))
}
func (dm *DataNodeMetrics) setTotalDpSizeMetrics() {
totalDpSize := dm.dataNode.space.stats.TotalPartitionSize
dm.MetricTotalDpSize.Set(float64(totalDpSize))
}
func (dm *DataNodeMetrics) setCapacityMetrics() {
total := dm.dataNode.space.stats.Total
used := dm.dataNode.space.stats.Used
available := dm.dataNode.space.stats.Available
dm.MetricCapacity.SetWithLabelValues(float64(total), "total")
dm.MetricCapacity.SetWithLabelValues(float64(used), "used")
dm.MetricCapacity.SetWithLabelValues(float64(available), "available")
}
package datanode
import (
"sync/atomic"
"time"
"github.com/cubefs/cubefs/util/log"
"golang.org/x/time/rate"
)
const (
defaultMarkDeleteLimitRate = rate.Inf
defaultMarkDeleteLimitBurst = 512
defaultIOLimitBurst = 512
UpdateNodeInfoTicket = 1 * time.Minute
RepairTimeOut = time.Hour * 24
MaxRepairErrCnt = 1000
)
var nodeInfoStopC = make(chan struct{})
func (m *DataNode) startUpdateNodeInfo() {
ticker := time.NewTicker(UpdateNodeInfoTicket)
defer ticker.Stop()
for {
select {
case <-nodeInfoStopC:
log.LogInfo("datanode nodeinfo goroutine stopped")
return
case <-ticker.C:
m.updateNodeInfo()
}
}
}
func (m *DataNode) stopUpdateNodeInfo() {
nodeInfoStopC <- struct{}{}
}
func (m *DataNode) updateNodeInfo() {
clusterInfo, err := MasterClient.AdminAPI().GetClusterInfo()
if err != nil {
log.LogErrorf("[updateDataNodeInfo] %s", err.Error())
return
}
setLimiter(deleteLimiteRater, clusterInfo.DataNodeDeleteLimitRate)
setDoExtentRepair(int(clusterInfo.DataNodeAutoRepairLimitRate))
atomic.StoreUint64(&m.dpMaxRepairErrCnt, clusterInfo.DpMaxRepairErrCnt)
log.LogInfof("updateNodeInfo from master:"+
"deleteLimite(%v), autoRepairLimit(%v), dpMaxRepairErrCnt(%v)",
clusterInfo.DataNodeDeleteLimitRate, clusterInfo.DataNodeAutoRepairLimitRate,
clusterInfo.DpMaxRepairErrCnt)
}
func (m *DataNode) GetDpMaxRepairErrCnt() uint64 {
dpMaxRepairErrCnt := atomic.LoadUint64(&m.dpMaxRepairErrCnt)
if dpMaxRepairErrCnt == 0 {
return MaxRepairErrCnt
}
return dpMaxRepairErrCnt
}
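// exampleNodeLimitsDefaults is an illustrative sketch and not part of the
// original flow: it shows the defaulting behaviour applied to values pushed by
// the master. setDoExtentRepair clamps the auto-repair limit into
// [MinExtentRepairLimit, MaxExtentRepairLimit], with non-positive values
// meaning "use the max", and GetDpMaxRepairErrCnt falls back to
// MaxRepairErrCnt when the cluster value is 0. The values are hypothetical.
func exampleNodeLimitsDefaults(m *DataNode) uint64 {
	// initRepairLimit must run before setDoExtentRepair resizes the token
	// channel; it is called here only to keep the sketch self-contained.
	initRepairLimit()
	setDoExtentRepair(0)     // non-positive -> MaxExtentRepairLimit
	setDoExtentRepair(50000) // above the max -> clamped to MaxExtentRepairLimit
	setDoExtentRepair(1)     // below the min -> clamped to MinExtentRepairLimit
	return m.GetDpMaxRepairErrCnt() // 0 from the master means MaxRepairErrCnt
}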
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"encoding/json"
"fmt"
"hash/crc32"
"math"
"net"
"os"
"path"
"sort"
"strings"
"sync"
"sync/atomic"
"time"
raftProto "github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/raftstore"
"github.com/cubefs/cubefs/repl"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
const (
DataPartitionPrefix = "datapartition"
CachePartitionPrefix = "cachepartition"
PreLoadPartitionPrefix = "preloadpartition"
DataPartitionMetadataFileName = "META"
TempMetadataFileName = ".meta"
ApplyIndexFile = "APPLY"
TempApplyIndexFile = ".apply"
TimeLayout = "2006-01-02 15:04:05"
)
const (
RaftStatusStopped = 0
RaftStatusRunning = 1
)
type DataPartitionMetadata struct {
VolumeID string
PartitionID uint64
PartitionSize int
PartitionType int
CreateTime string
Peers []proto.Peer
Hosts []string
DataPartitionCreateType int
LastTruncateID uint64
ReplicaNum int
StopRecover bool
VerList []*proto.VolVersionInfo
ApplyID uint64
}
func (md *DataPartitionMetadata) Validate() (err error) {
md.VolumeID = strings.TrimSpace(md.VolumeID)
if len(md.VolumeID) == 0 || md.PartitionID == 0 || md.PartitionSize == 0 {
err = errors.New("illegal data partition metadata")
return
}
return
}
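// exampleLoadMetadataFromJSON is an illustrative sketch and not part of the
// original loading path (see LoadDataPartition below): it shows the shape of a
// META document that passes Validate. All field values are hypothetical.
func exampleLoadMetadataFromJSON() (*DataPartitionMetadata, error) {
	raw := []byte(`{
		"VolumeID": "vol-example",
		"PartitionID": 1001,
		"PartitionSize": 134217728,
		"PartitionType": 0,
		"ReplicaNum": 3,
		"Hosts": ["192.168.0.1:17310", "192.168.0.2:17310", "192.168.0.3:17310"]
	}`)
	md := &DataPartitionMetadata{}
	if err := json.Unmarshal(raw, md); err != nil {
		return nil, err
	}
	// Validate rejects metadata with an empty VolumeID or a zero PartitionID/PartitionSize.
	if err := md.Validate(); err != nil {
		return nil, err
	}
	return md, nil
}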
// MetaMultiSnapshotInfo
type MetaMultiSnapshotInfo struct {
VerSeq uint64
Status int8
Ctime time.Time
}
type DataPartition struct {
clusterID string
volumeID string
partitionID uint64
partitionStatus int
partitionSize int
partitionType int
replicaNum int
replicas []string // addresses of the replicas
replicasLock sync.RWMutex
disk *Disk
dataNode *DataNode
isLeader bool
isRaftLeader bool
path string
used int
leaderSize int
extentStore *storage.ExtentStore
raftPartition raftstore.Partition
config *dataPartitionCfg
appliedID uint64 // apply id used in Raft
lastTruncateID uint64 // truncate id used in Raft
metaAppliedID uint64 // apply id while do meta persist
minAppliedID uint64
maxAppliedID uint64
stopOnce sync.Once
stopRaftC chan uint64
storeC chan uint64
stopC chan bool
raftStatus int32
intervalToUpdateReplicas int64 // interval to ask the master for updating the replica information
snapshot []*proto.File
snapshotMutex sync.RWMutex
intervalToUpdatePartitionSize int64
loadExtentHeaderStatus int
DataPartitionCreateType int
isLoadingDataPartition int32
persistMetaMutex sync.RWMutex
// snapshot
verSeq uint64
verSeqPrepare uint64
verSeqCommitStatus int8
volVersionInfoList *proto.VolVersionInfoList
decommissionRepairProgress float64 // record repair progress for decommission datapartition
stopRecover bool
	recoverErrCnt uint64 // do not reset; if it reaches the max err cnt, delete this dp
diskErrCnt uint64 // number of disk io errors while reading or writing
}
func (dp *DataPartition) IsForbidden() bool {
return dp.config.Forbidden
}
func (dp *DataPartition) SetForbidden(status bool) {
dp.config.Forbidden = status
}
func CreateDataPartition(dpCfg *dataPartitionCfg, disk *Disk, request *proto.CreateDataPartitionRequest) (dp *DataPartition, err error) {
if dp, err = newDataPartition(dpCfg, disk, true); err != nil {
return
}
dp.ForceLoadHeader()
if request.CreateType == proto.NormalCreateDataPartition {
err = dp.StartRaft(false)
} else {
		// reserve the space reported by the leader on this disk
disk.updateDisk(uint64(request.LeaderSize))
// ensure heartbeat report Recovering
dp.partitionStatus = proto.Recovering
go dp.StartRaftAfterRepair(false)
}
if err != nil {
return nil, err
}
// persist file metadata
go dp.StartRaftLoggingSchedule()
dp.DataPartitionCreateType = request.CreateType
dp.replicaNum = request.ReplicaNum
err = dp.PersistMetadata()
disk.AddSize(uint64(dp.Size()))
return
}
func (dp *DataPartition) IsEquareCreateDataPartitionRequst(request *proto.CreateDataPartitionRequest) (err error) {
if len(dp.config.Peers) != len(request.Members) {
return fmt.Errorf("exist partition(%v) peers len(%v) members len(%v)",
dp.partitionID, len(dp.config.Peers), len(request.Members))
}
for index, host := range dp.config.Hosts {
requestHost := request.Hosts[index]
if host != requestHost {
return fmt.Errorf("exist partition(%v) index(%v) requestHost(%v) persistHost(%v)",
dp.partitionID, index, requestHost, host)
}
}
for index, peer := range dp.config.Peers {
requestPeer := request.Members[index]
if requestPeer.ID != peer.ID || requestPeer.Addr != peer.Addr {
return fmt.Errorf("exist partition(%v) index(%v) requestPeer(%v) persistPeers(%v)",
dp.partitionID, index, requestPeer, peer)
}
}
if dp.config.VolName != request.VolumeId {
return fmt.Errorf("exist partition Partition(%v) requestVolName(%v) persistVolName(%v)",
dp.partitionID, request.VolumeId, dp.config.VolName)
}
return
}
func (dp *DataPartition) ForceSetDataPartitionToLoadding() {
atomic.StoreInt32(&dp.isLoadingDataPartition, 1)
}
func (dp *DataPartition) ForceSetDataPartitionToFininshLoad() {
atomic.StoreInt32(&dp.isLoadingDataPartition, 0)
}
func (dp *DataPartition) IsDataPartitionLoading() bool {
return atomic.LoadInt32(&dp.isLoadingDataPartition) == 1
}
func (dp *DataPartition) ForceSetRaftRunning() {
atomic.StoreInt32(&dp.raftStatus, RaftStatusRunning)
}
// LoadDataPartition loads and returns a partition instance based on the specified directory.
// It reads the partition metadata file stored under the specified directory
// and creates the partition instance.
func LoadDataPartition(partitionDir string, disk *Disk) (dp *DataPartition, err error) {
var metaFileData []byte
if metaFileData, err = os.ReadFile(path.Join(partitionDir, DataPartitionMetadataFileName)); err != nil {
return
}
meta := &DataPartitionMetadata{}
if err = json.Unmarshal(metaFileData, meta); err != nil {
return
}
if err = meta.Validate(); err != nil {
return
}
dpCfg := &dataPartitionCfg{
VolName: meta.VolumeID,
PartitionSize: meta.PartitionSize,
PartitionType: meta.PartitionType,
PartitionID: meta.PartitionID,
ReplicaNum: meta.ReplicaNum,
Peers: meta.Peers,
Hosts: meta.Hosts,
RaftStore: disk.space.GetRaftStore(),
NodeID: disk.space.GetNodeID(),
ClusterID: disk.space.GetClusterID(),
}
if dp, err = newDataPartition(dpCfg, disk, false); err != nil {
return
}
dp.stopRecover = meta.StopRecover
dp.metaAppliedID = meta.ApplyID
dp.computeUsage()
dp.ForceSetDataPartitionToLoadding()
disk.space.AttachPartition(dp)
if err = dp.LoadAppliedID(); err != nil {
log.LogErrorf("action[loadApplyIndex] %v", err)
return
}
log.LogInfof("Action(LoadDataPartition) PartitionID(%v) meta(%v) stopRecover(%v)", dp.partitionID, meta, meta.StopRecover)
dp.DataPartitionCreateType = meta.DataPartitionCreateType
dp.lastTruncateID = meta.LastTruncateID
if meta.DataPartitionCreateType == proto.NormalCreateDataPartition {
err = dp.StartRaft(true)
} else {
// init leaderSize to partitionSize
dp.leaderSize = dp.partitionSize
dp.partitionStatus = proto.Recovering
go dp.StartRaftAfterRepair(true)
}
if err != nil {
log.LogErrorf("PartitionID(%v) start raft err(%v)..", dp.partitionID, err)
disk.space.DetachDataPartition(dp.partitionID)
return
}
go dp.StartRaftLoggingSchedule()
disk.AddSize(uint64(dp.Size()))
dp.ForceLoadHeader()
return
}
func newDataPartition(dpCfg *dataPartitionCfg, disk *Disk, isCreate bool) (dp *DataPartition, err error) {
partitionID := dpCfg.PartitionID
var dataPath string
if proto.IsNormalDp(dpCfg.PartitionType) {
dataPath = path.Join(disk.Path, fmt.Sprintf(DataPartitionPrefix+"_%v_%v", partitionID, dpCfg.PartitionSize))
} else if proto.IsCacheDp(dpCfg.PartitionType) {
dataPath = path.Join(disk.Path, fmt.Sprintf(CachePartitionPrefix+"_%v_%v", partitionID, dpCfg.PartitionSize))
} else if proto.IsPreLoadDp(dpCfg.PartitionType) {
dataPath = path.Join(disk.Path, fmt.Sprintf(PreLoadPartitionPrefix+"_%v_%v", partitionID, dpCfg.PartitionSize))
} else {
return nil, fmt.Errorf("newDataPartition fail, dataPartitionCfg(%v)", dpCfg)
}
partition := &DataPartition{
volumeID: dpCfg.VolName,
clusterID: dpCfg.ClusterID,
partitionID: partitionID,
replicaNum: dpCfg.ReplicaNum,
disk: disk,
dataNode: disk.dataNode,
path: dataPath,
partitionSize: dpCfg.PartitionSize,
partitionType: dpCfg.PartitionType,
replicas: make([]string, 0),
stopC: make(chan bool),
stopRaftC: make(chan uint64),
storeC: make(chan uint64, 128),
snapshot: make([]*proto.File, 0),
partitionStatus: proto.ReadWrite,
config: dpCfg,
raftStatus: RaftStatusStopped,
verSeq: dpCfg.VerSeq,
DataPartitionCreateType: dpCfg.CreateType,
volVersionInfoList: &proto.VolVersionInfoList{},
}
atomic.StoreUint64(&partition.recoverErrCnt, 0)
log.LogInfof("action[newDataPartition] dp %v replica num %v", partitionID, dpCfg.ReplicaNum)
partition.replicasInit()
partition.extentStore, err = storage.NewExtentStore(partition.path, dpCfg.PartitionID, dpCfg.PartitionSize,
partition.partitionType, isCreate)
if err != nil {
log.LogWarnf("action[newDataPartition] dp %v NewExtentStore failed %v", partitionID, err.Error())
return
}
// store applyid
if err = partition.storeAppliedID(partition.appliedID); err != nil {
log.LogErrorf("action[newDataPartition] dp %v initial Apply [%v] failed: %v",
partition.partitionID, partition.appliedID, err)
return
}
disk.AttachDataPartition(partition)
dp = partition
go partition.statusUpdateScheduler()
go partition.startEvict()
if isCreate {
if err = dp.getVerListFromMaster(); err != nil {
log.LogErrorf("action[newDataPartition] vol %v dp %v loadFromMaster verList failed err %v", dp.volumeID, dp.partitionID, err)
return
}
}
log.LogInfof("action[newDataPartition] dp %v replica num %v CreateType %v create success",
dp.partitionID, dpCfg.ReplicaNum, dp.DataPartitionCreateType)
return
}
func (partition *DataPartition) HandleVersionOp(req *proto.MultiVersionOpRequest) (err error) {
var (
verData []byte
pItem *RaftCmdItem
)
if verData, err = json.Marshal(req); err != nil {
return
}
pItem = &RaftCmdItem{
Op: uint32(proto.OpVersionOp),
K: []byte("version"),
V: verData,
}
data, _ := MarshalRaftCmd(pItem)
_, err = partition.Submit(data)
return
}
func (partition *DataPartition) fsmVersionOp(opItem *RaftCmdItem) (err error) {
req := new(proto.MultiVersionOpRequest)
if err = json.Unmarshal(opItem.V, req); err != nil {
log.LogErrorf("action[fsmVersionOp] dp[%v] op item %v", partition.partitionID, opItem)
return
}
if len(req.VolVerList) == 0 {
return
}
lastSeq := req.VolVerList[len(req.VolVerList)-1].Ver
partition.volVersionInfoList.RWLock.Lock()
if len(partition.volVersionInfoList.VerList) == 0 {
partition.volVersionInfoList.VerList = make([]*proto.VolVersionInfo, len(req.VolVerList))
copy(partition.volVersionInfoList.VerList, req.VolVerList)
partition.verSeq = lastSeq
		log.LogInfof("action[fsmVersionOp] dp %v seq %v updateVerList request ver %v verlist %v dp verlist nil and set",
partition.partitionID, partition.verSeq, lastSeq, req.VolVerList)
partition.volVersionInfoList.RWLock.Unlock()
return
}
lastVerInfo := partition.volVersionInfoList.GetLastVolVerInfo()
log.LogInfof("action[fsmVersionOp] dp %v seq %v lastVerList seq %v req seq %v op %v",
partition.partitionID, partition.verSeq, lastVerInfo.Ver, lastSeq, req.Op)
if lastVerInfo.Ver >= lastSeq {
if lastVerInfo.Ver == lastSeq {
if req.Op == proto.CreateVersionCommit {
lastVerInfo.Status = proto.VersionNormal
}
}
partition.volVersionInfoList.RWLock.Unlock()
return
}
var status uint8 = proto.VersionPrepare
if req.Op == proto.CreateVersionCommit {
status = proto.VersionNormal
}
partition.volVersionInfoList.VerList = append(partition.volVersionInfoList.VerList, &proto.VolVersionInfo{
Status: status,
Ver: lastSeq,
})
partition.verSeq = lastSeq
err = partition.PersistMetadata()
	log.LogInfof("action[fsmVersionOp] dp %v seq %v updateVerList request add new seq %v verlist (%v) err (%v)",
partition.partitionID, partition.verSeq, lastSeq, partition.volVersionInfoList, err)
partition.volVersionInfoList.RWLock.Unlock()
return
}
func (dp *DataPartition) getVerListFromMaster() (err error) {
var verList *proto.VolVersionInfoList
verList, err = MasterClient.AdminAPI().GetVerList(dp.volumeID)
if err != nil {
log.LogErrorf("action[onStart] GetVerList err[%v]", err)
return
}
for _, info := range verList.VerList {
if info.Status != proto.VersionNormal {
continue
}
dp.volVersionInfoList.VerList = append(dp.volVersionInfoList.VerList, info)
}
log.LogDebugf("action[onStart] dp %v verList %v", dp.partitionID, dp.volVersionInfoList.VerList)
dp.verSeq = dp.volVersionInfoList.GetLastVer()
return
}
func (dp *DataPartition) replicasInit() {
replicas := make([]string, 0)
if dp.config.Hosts == nil {
return
}
replicas = append(replicas, dp.config.Hosts...)
dp.replicasLock.Lock()
dp.replicas = replicas
dp.replicasLock.Unlock()
if dp.config.Hosts != nil && len(dp.config.Hosts) >= 1 {
leaderAddr := strings.Split(dp.config.Hosts[0], ":")
if len(leaderAddr) == 2 && strings.TrimSpace(leaderAddr[0]) == LocalIP {
dp.isLeader = true
}
}
}
func (dp *DataPartition) GetExtentCount() int {
return dp.extentStore.GetExtentCount()
}
func (dp *DataPartition) Path() string {
return dp.path
}
// IsRaftLeader tells if the given address belongs to the raft leader.
func (dp *DataPartition) IsRaftLeader() (addr string, ok bool) {
if dp.raftStopped() {
return
}
leaderID, _ := dp.raftPartition.LeaderTerm()
if leaderID == 0 {
return
}
ok = leaderID == dp.config.NodeID
for _, peer := range dp.config.Peers {
if leaderID == peer.ID {
addr = peer.Addr
return
}
}
return
}
func (dp *DataPartition) Replicas() []string {
dp.replicasLock.RLock()
defer dp.replicasLock.RUnlock()
return dp.replicas
}
func (dp *DataPartition) getReplicaCopy() []string {
dp.replicasLock.RLock()
defer dp.replicasLock.RUnlock()
tmpCopy := make([]string, len(dp.replicas))
copy(tmpCopy, dp.replicas)
return tmpCopy
}
func (dp *DataPartition) getReplicaAddr(index int) string {
dp.replicasLock.RLock()
defer dp.replicasLock.RUnlock()
return dp.replicas[index]
}
func (dp *DataPartition) getReplicaLen() int {
dp.replicasLock.RLock()
defer dp.replicasLock.RUnlock()
return len(dp.replicas)
}
func (dp *DataPartition) IsExistReplica(addr string) bool {
dp.replicasLock.RLock()
defer dp.replicasLock.RUnlock()
for _, host := range dp.replicas {
if host == addr {
return true
}
}
return false
}
func (dp *DataPartition) ReloadSnapshot() {
files, err := dp.extentStore.SnapShot()
if err != nil {
log.LogErrorf("ReloadSnapshot err %v", err)
return
}
dp.snapshotMutex.Lock()
for _, f := range dp.snapshot {
storage.PutSnapShotFileToPool(f)
}
dp.snapshot = files
dp.snapshotMutex.Unlock()
}
// SnapShot returns the snapshot of the data partition.
func (dp *DataPartition) SnapShot() (files []*proto.File) {
dp.snapshotMutex.RLock()
defer dp.snapshotMutex.RUnlock()
return dp.snapshot
}
// Stop closes the store and the raft store.
func (dp *DataPartition) Stop() {
dp.stopOnce.Do(func() {
if dp.stopC != nil {
close(dp.stopC)
}
// Close the store and raftstore.
dp.stopRaft()
dp.extentStore.Close()
err := dp.storeAppliedID(atomic.LoadUint64(&dp.appliedID))
if err != nil {
log.LogErrorf("action[Stop]: failed to store applied index")
}
})
}
// Disk returns the disk instance.
func (dp *DataPartition) Disk() *Disk {
return dp.disk
}
// func (dp *DataPartition) IsRejectWrite() bool {
// return dp.Disk().RejectWrite
// }
// Status returns the partition status.
func (dp *DataPartition) Status() int {
return dp.partitionStatus
}
// Size returns the partition size.
func (dp *DataPartition) Size() int {
return dp.partitionSize
}
// Used returns the used space.
func (dp *DataPartition) Used() int {
return dp.used
}
// Available returns the available space.
func (dp *DataPartition) Available() int {
return dp.partitionSize - dp.used
}
func (dp *DataPartition) ForceLoadHeader() {
dp.loadExtentHeaderStatus = FinishLoadDataPartitionExtentHeader
}
// PersistMetadata persists the file metadata on the disk.
func (dp *DataPartition) PersistMetadata() (err error) {
dp.persistMetaMutex.Lock()
defer dp.persistMetaMutex.Unlock()
var (
metadataFile *os.File
metaData []byte
)
fileName := path.Join(dp.Path(), TempMetadataFileName)
if metadataFile, err = os.OpenFile(fileName, os.O_CREATE|os.O_RDWR, 0o666); err != nil {
return
}
defer func() {
metadataFile.Sync()
metadataFile.Close()
os.Remove(fileName)
}()
md := &DataPartitionMetadata{
VolumeID: dp.config.VolName,
PartitionID: dp.config.PartitionID,
ReplicaNum: dp.config.ReplicaNum,
PartitionSize: dp.config.PartitionSize,
PartitionType: dp.config.PartitionType,
Peers: dp.config.Peers,
Hosts: dp.config.Hosts,
DataPartitionCreateType: dp.DataPartitionCreateType,
CreateTime: time.Now().Format(TimeLayout),
LastTruncateID: dp.lastTruncateID,
StopRecover: dp.stopRecover,
VerList: dp.volVersionInfoList.VerList,
ApplyID: dp.appliedID,
}
if metaData, err = json.Marshal(md); err != nil {
return
}
if _, err = metadataFile.Write(metaData); err != nil {
return
}
dp.metaAppliedID = dp.appliedID
log.LogInfof("PersistMetadata DataPartition(%v) data(%v)", dp.partitionID, string(metaData))
err = os.Rename(fileName, path.Join(dp.Path(), DataPartitionMetadataFileName))
return
}
func (dp *DataPartition) statusUpdateScheduler() {
ticker := time.NewTicker(time.Minute)
snapshotTicker := time.NewTicker(time.Minute * 5)
var index int
for {
select {
case <-ticker.C:
dp.statusUpdate()
// only repair tiny extent
if !dp.isNormalType() {
dp.LaunchRepair(proto.TinyExtentType)
continue
}
index++
if index >= math.MaxUint32 {
index = 0
}
if index%2 == 0 {
dp.LaunchRepair(proto.TinyExtentType)
} else {
dp.LaunchRepair(proto.NormalExtentType)
}
case <-snapshotTicker.C:
dp.ReloadSnapshot()
case <-dp.stopC:
ticker.Stop()
snapshotTicker.Stop()
return
}
}
}
func (dp *DataPartition) statusUpdate() {
status := proto.ReadWrite
dp.computeUsage()
if dp.used >= dp.partitionSize {
status = proto.ReadOnly
}
if dp.isNormalType() && dp.extentStore.GetExtentCount() >= storage.MaxExtentCount {
status = proto.ReadOnly
}
if dp.isNormalType() && dp.raftStatus == RaftStatusStopped {
// dp is still recovering
if dp.DataPartitionCreateType == proto.DecommissionedCreateDataPartition {
status = proto.Recovering
} else {
status = proto.Unavailable
}
}
if dp.getDiskErrCnt() > 0 {
dp.partitionStatus = proto.Unavailable
}
log.LogInfof("action[statusUpdate] dp %v raft status %v dp.status %v, status %v, disk status %v",
dp.partitionID, dp.raftStatus, dp.Status(), status, float64(dp.disk.Status))
// dp.partitionStatus = int(math.Min(float64(status), float64(dp.disk.Status)))
dp.partitionStatus = status
}
func (dp *DataPartition) computeUsage() {
if time.Now().Unix()-dp.intervalToUpdatePartitionSize < IntervalToUpdatePartitionSize {
return
}
dp.used = int(dp.ExtentStore().GetStoreUsedSize())
dp.intervalToUpdatePartitionSize = time.Now().Unix()
}
func (dp *DataPartition) ExtentStore() *storage.ExtentStore {
return dp.extentStore
}
func (dp *DataPartition) checkIsDiskError(err error, rwFlag uint8) {
if err == nil {
return
}
log.LogWarnf("checkIsDiskError: disk path %v, error: %v, partition:%v, rwFlag:%v",
dp.Path(), err.Error(), dp.partitionID, rwFlag)
if !IsDiskErr(err.Error()) {
return
}
dp.stopRaft()
dp.incDiskErrCnt()
dp.disk.triggerDiskError(rwFlag, dp.partitionID)
// must after change disk.status
dp.statusUpdate()
return
}
func newRaftApplyError(err error) error {
return errors.NewErrorf("[Custom Error]: unhandled raft apply error, err(%s)", err)
}
func isRaftApplyError(errMsg string) bool {
return strings.Contains(errMsg, "[Custom Error]: unhandled raft apply error")
}
// String returns the string format of the data partition information.
func (dp *DataPartition) String() (m string) {
return fmt.Sprintf(DataPartitionPrefix+"_%v_%v", dp.partitionID, dp.partitionSize)
}
// LaunchRepair launches the repair of extents.
func (dp *DataPartition) LaunchRepair(extentType uint8) {
if dp.partitionStatus == proto.Unavailable {
return
}
if err := dp.updateReplicas(false); err != nil {
log.LogErrorf("action[LaunchRepair] partition(%v) err(%v).", dp.partitionID, err)
return
}
if !dp.isLeader {
return
}
if dp.extentStore.BrokenTinyExtentCnt() == 0 {
dp.extentStore.MoveAllToBrokenTinyExtentC(MinTinyExtentsToRepair)
}
dp.repair(extentType)
}
func (dp *DataPartition) updateReplicas(isForce bool) (err error) {
if !isForce && time.Now().Unix()-dp.intervalToUpdateReplicas <= IntervalToUpdateReplica {
return
}
dp.isLeader = false
isLeader, replicas, err := dp.fetchReplicasFromMaster()
if err != nil {
return
}
dp.replicasLock.Lock()
defer dp.replicasLock.Unlock()
if !dp.compareReplicas(dp.replicas, replicas) {
log.LogInfof("action[updateReplicas] partition(%v) replicas changed from (%v) to (%v).",
dp.partitionID, dp.replicas, replicas)
}
dp.isLeader = isLeader
dp.replicas = replicas
dp.intervalToUpdateReplicas = time.Now().Unix()
	log.LogInfof("ActionUpdateReplicationHosts partition(%v), force(%v)", dp.partitionID, isForce)
return
}
// Compare the fetched replica with the local one.
func (dp *DataPartition) compareReplicas(v1, v2 []string) (equals bool) {
if len(v1) == len(v2) {
for i := 0; i < len(v1); i++ {
if v1[i] != v2[i] {
return false
}
}
return true
}
return false
}
// Fetch the replica information from the master.
func (dp *DataPartition) fetchReplicasFromMaster() (isLeader bool, replicas []string, err error) {
var partition *proto.DataPartitionInfo
retry := 0
for {
if partition, err = MasterClient.AdminAPI().GetDataPartition(dp.volumeID, dp.partitionID); err != nil {
retry++
if retry > 5 {
isLeader = false
return
}
} else {
break
}
time.Sleep(10 * time.Second)
}
replicas = append(replicas, partition.Hosts...)
if partition.Hosts != nil && len(partition.Hosts) >= 1 {
leaderAddr := strings.Split(partition.Hosts[0], ":")
if len(leaderAddr) == 2 && strings.TrimSpace(leaderAddr[0]) == LocalIP {
isLeader = true
}
}
return
}
func (dp *DataPartition) Load() (response *proto.LoadDataPartitionResponse) {
response = &proto.LoadDataPartitionResponse{}
response.PartitionId = uint64(dp.partitionID)
response.PartitionStatus = dp.partitionStatus
response.Used = uint64(dp.Used())
var err error
if dp.loadExtentHeaderStatus != FinishLoadDataPartitionExtentHeader {
response.PartitionSnapshot = make([]*proto.File, 0)
} else {
response.PartitionSnapshot = dp.SnapShot()
}
if err != nil {
response.Status = proto.TaskFailed
response.Result = err.Error()
return
}
return
}
// DoExtentStoreRepair performs the repairs of the extent store.
// 1. when the extent size is smaller than the max size on the record, start to repair the missing part.
// 2. if the extent does not even exist, create the extent first, and then repair.
func (dp *DataPartition) DoExtentStoreRepair(repairTask *DataPartitionRepairTask) {
if dp.stopRecover && dp.isDecommissionRecovering() {
log.LogWarnf("DoExtentStoreRepair %v receive stop signal", dp.partitionID)
return
}
store := dp.extentStore
log.LogDebugf("DoExtentStoreRepair.dp %v len extents %v", dp.partitionID, len(repairTask.ExtentsToBeCreated))
for _, extentInfo := range repairTask.ExtentsToBeCreated {
log.LogDebugf("DoExtentStoreRepair.dp %v len extentInfo %v", dp.partitionID, extentInfo)
if storage.IsTinyExtent(extentInfo.FileID) {
continue
}
if store.HasExtent(uint64(extentInfo.FileID)) {
continue
}
if !AutoRepairStatus {
log.LogWarnf("AutoRepairStatus is False,so cannot Create extent(%v)", extentInfo.String())
continue
}
dp.disk.allocCheckLimit(proto.IopsWriteType, 1)
err := store.Create(uint64(extentInfo.FileID))
if err != nil {
continue
}
}
var (
wg *sync.WaitGroup
recoverIndex int
)
wg = new(sync.WaitGroup)
for _, extentInfo := range repairTask.ExtentsToBeRepaired {
if dp.stopRecover && dp.isDecommissionRecovering() {
log.LogWarnf("DoExtentStoreRepair %v receive stop signal", dp.partitionID)
return
}
if !store.HasExtent(uint64(extentInfo.FileID)) {
continue
}
wg.Add(1)
// repair the extents
go dp.doStreamExtentFixRepair(wg, extentInfo)
recoverIndex++
if recoverIndex%NumOfFilesToRecoverInParallel == 0 {
wg.Wait()
}
}
wg.Wait()
dp.doStreamFixTinyDeleteRecord(repairTask)
}
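// exampleBatchedRepairPattern is an illustrative sketch and not part of the
// original repair flow: it isolates the batching pattern used above in
// DoExtentStoreRepair, where at most NumOfFilesToRecoverInParallel repairs run
// concurrently before the loop waits for the whole batch. The items slice and
// repairOne callback are hypothetical.
func exampleBatchedRepairPattern(items []uint64, repairOne func(uint64)) {
	wg := new(sync.WaitGroup)
	for i, item := range items {
		wg.Add(1)
		go func(id uint64) {
			defer wg.Done()
			repairOne(id)
		}(item)
		// Wait for the current batch before launching more goroutines.
		if (i+1)%NumOfFilesToRecoverInParallel == 0 {
			wg.Wait()
		}
	}
	wg.Wait()
}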
func (dp *DataPartition) pushSyncDeleteRecordFromLeaderMesg() bool {
select {
case dp.Disk().syncTinyDeleteRecordFromLeaderOnEveryDisk <- true:
return true
default:
return false
}
}
func (dp *DataPartition) consumeTinyDeleteRecordFromLeaderMesg() {
select {
case <-dp.Disk().syncTinyDeleteRecordFromLeaderOnEveryDisk:
return
default:
return
}
}
func (dp *DataPartition) doStreamFixTinyDeleteRecord(repairTask *DataPartitionRepairTask) {
var (
localTinyDeleteFileSize int64
err error
conn net.Conn
)
if !dp.pushSyncDeleteRecordFromLeaderMesg() {
return
}
defer func() {
dp.consumeTinyDeleteRecordFromLeaderMesg()
}()
if localTinyDeleteFileSize, err = dp.extentStore.LoadTinyDeleteFileOffset(); err != nil {
return
}
log.LogInfof(ActionSyncTinyDeleteRecord+" start PartitionID(%v) localTinyDeleteFileSize(%v) leaderTinyDeleteFileSize(%v) leaderAddr(%v)",
dp.partitionID, localTinyDeleteFileSize, repairTask.LeaderTinyDeleteRecordFileSize, repairTask.LeaderAddr)
if localTinyDeleteFileSize >= repairTask.LeaderTinyDeleteRecordFileSize {
return
}
if repairTask.LeaderTinyDeleteRecordFileSize-localTinyDeleteFileSize < MinTinyExtentDeleteRecordSyncSize {
return
}
defer func() {
log.LogInfof(ActionSyncTinyDeleteRecord+" end PartitionID(%v) localTinyDeleteFileSize(%v) leaderTinyDeleteFileSize(%v) leaderAddr(%v) err(%v)",
dp.partitionID, localTinyDeleteFileSize, repairTask.LeaderTinyDeleteRecordFileSize, repairTask.LeaderAddr, err)
}()
p := repl.NewPacketToReadTinyDeleteRecord(dp.partitionID, localTinyDeleteFileSize)
if conn, err = dp.getRepairConn(repairTask.LeaderAddr); err != nil {
return
}
defer func() {
dp.putRepairConn(conn, err != nil)
}()
if err = p.WriteToConn(conn); err != nil {
return
}
store := dp.extentStore
start := time.Now().Unix()
for localTinyDeleteFileSize < repairTask.LeaderTinyDeleteRecordFileSize {
if dp.stopRecover && dp.isDecommissionRecovering() {
log.LogWarnf("doStreamFixTinyDeleteRecord %v receive stop signal", dp.partitionID)
return
}
if localTinyDeleteFileSize >= repairTask.LeaderTinyDeleteRecordFileSize {
return
}
if err = p.ReadFromConnWithVer(conn, proto.ReadDeadlineTime); err != nil {
return
}
if p.IsErrPacket() {
logContent := fmt.Sprintf("action[doStreamFixTinyDeleteRecord] %v.",
p.LogMessage(p.GetOpMsg(), conn.RemoteAddr().String(), start, fmt.Errorf(string(p.Data[:p.Size]))))
err = fmt.Errorf(logContent)
return
}
if p.CRC != crc32.ChecksumIEEE(p.Data[:p.Size]) {
err = fmt.Errorf("crc not match")
return
}
if p.Size%storage.DeleteTinyRecordSize != 0 {
			err = fmt.Errorf("invalid tiny delete record size")
return
}
var index int
for (index+1)*storage.DeleteTinyRecordSize <= int(p.Size) {
record := p.Data[index*storage.DeleteTinyRecordSize : (index+1)*storage.DeleteTinyRecordSize]
extentID, offset, size := storage.UnMarshalTinyExtent(record)
localTinyDeleteFileSize += storage.DeleteTinyRecordSize
index++
if !storage.IsTinyExtent(extentID) {
continue
}
DeleteLimiterWait()
dp.disk.allocCheckLimit(proto.IopsWriteType, 1)
// log.LogInfof("doStreamFixTinyDeleteRecord Delete PartitionID(%v)_Extent(%v)_Offset(%v)_Size(%v)", dp.partitionID, extentID, offset, size)
store.MarkDelete(extentID, int64(offset), int64(size))
}
}
}
// ChangeRaftMember is a wrapper function of changing the raft member.
func (dp *DataPartition) ChangeRaftMember(changeType raftProto.ConfChangeType, peer raftProto.Peer, context []byte) (resp interface{}, err error) {
resp, err = dp.raftPartition.ChangeMember(changeType, peer, context)
return
}
func (dp *DataPartition) canRemoveSelf() (canRemove bool, err error) {
var partition *proto.DataPartitionInfo
retry := 0
for {
if partition, err = MasterClient.AdminAPI().GetDataPartition(dp.volumeID, dp.partitionID); err != nil {
log.LogErrorf("action[canRemoveSelf] err[%v]", err)
retry++
if retry > 60 {
return
}
} else {
break
}
time.Sleep(10 * time.Second)
}
canRemove = false
var existInPeers bool
for _, peer := range partition.Peers {
if dp.config.NodeID == peer.ID {
existInPeers = true
}
}
if !existInPeers {
canRemove = true
return
}
if dp.config.NodeID == partition.OfflinePeerID {
canRemove = true
return
}
return
}
func (dp *DataPartition) getRepairConn(target string) (net.Conn, error) {
return dp.dataNode.getRepairConnFunc(target)
}
func (dp *DataPartition) putRepairConn(conn net.Conn, forceClose bool) {
log.LogDebugf("action[putRepairConn], forceClose: %v", forceClose)
dp.dataNode.putRepairConnFunc(conn, forceClose)
}
func (dp *DataPartition) isNormalType() bool {
return proto.IsNormalDp(dp.partitionType)
}
type SimpleVolView struct {
vv *proto.SimpleVolView
lastUpdateTime time.Time
}
type VolMap struct {
sync.Mutex
volMap map[string]*SimpleVolView
}
var volViews = VolMap{
Mutex: sync.Mutex{},
volMap: make(map[string]*SimpleVolView),
}
func (vo *VolMap) getSimpleVolView(VolumeID string) (vv *proto.SimpleVolView, err error) {
vo.Lock()
if volView, ok := vo.volMap[VolumeID]; ok && time.Since(volView.lastUpdateTime) < 5*time.Minute {
vo.Unlock()
return volView.vv, nil
}
vo.Unlock()
volView := &SimpleVolView{
vv: nil,
lastUpdateTime: time.Time{},
}
if vv, err = MasterClient.AdminAPI().GetVolumeSimpleInfo(VolumeID); err != nil {
log.LogErrorf("action[GetVolumeSimpleInfo] cannot get vol(%v) from master(%v) err(%v).",
VolumeID, MasterClient.Leader(), err)
return nil, err
}
log.LogDebugf("get volume info, vol(%s), vol(%v)", vv.Name, volView)
volView.vv = vv
volView.lastUpdateTime = time.Now()
vo.Lock()
vo.volMap[VolumeID] = volView
vo.Unlock()
return
}
func (dp *DataPartition) doExtentTtl(ttl int) {
if ttl <= 0 {
		log.LogWarnf("[doExtentTtl] invalid ttl(%v), use default 30", ttl)
ttl = 30
}
extents := dp.extentStore.DumpExtents()
for _, ext := range extents {
if storage.IsTinyExtent(ext.FileID) {
continue
}
if time.Now().Unix()-ext.AccessTime > int64(ttl)*util.OneDaySec() {
log.LogDebugf("action[doExtentTtl] ttl delete dp(%v) extent(%v).", dp.partitionID, ext)
dp.extentStore.MarkDelete(ext.FileID, 0, 0)
}
}
}
func (dp *DataPartition) doExtentEvict(vv *proto.SimpleVolView) {
var (
needDieOut bool
freeSpace int
freeExtentCount int
)
needDieOut = false
if vv.CacheHighWater < vv.CacheLowWater || vv.CacheLowWater < 0 || vv.CacheHighWater > 100 {
log.LogErrorf("action[doExtentEvict] invalid policy dp(%v), CacheHighWater(%v) CacheLowWater(%v).",
dp.partitionID, vv.CacheHighWater, vv.CacheLowWater)
return
}
	// if dp usage is larger than the space high water, do die out.
freeSpace = 0
if dp.Used()*100/dp.Size() > vv.CacheHighWater {
needDieOut = true
freeSpace = dp.Used() - dp.Size()*vv.CacheLowWater/100
} else if dp.partitionStatus == proto.ReadOnly {
needDieOut = true
freeSpace = dp.Used() * (vv.CacheHighWater - vv.CacheLowWater) / 100
}
	// if the dp extent count is larger than the upper limit, do die out.
freeExtentCount = 0
extInfos := dp.extentStore.DumpExtents()
maxExtentCount := dp.Size() / util.DefaultTinySizeLimit
if len(extInfos) > maxExtentCount {
needDieOut = true
freeExtentCount = len(extInfos) - vv.CacheLowWater*maxExtentCount/100
}
log.LogDebugf("action[doExtentEvict], vol %v, LRU(%v, %v), dp %v, usage %v, status(%d), extents %v, freeSpace %v, freeExtentCount %v, needDieOut %v",
vv.Name, vv.CacheLowWater, vv.CacheHighWater, dp.partitionID, dp.Used()*100/dp.Size(), dp.partitionStatus, len(extInfos),
freeSpace, freeExtentCount, needDieOut)
if !needDieOut {
return
}
sort.Sort(extInfos)
for _, ext := range extInfos {
if storage.IsTinyExtent(ext.FileID) {
continue
}
freeSpace -= int(ext.Size)
freeExtentCount--
dp.extentStore.MarkDelete(ext.FileID, 0, 0)
log.LogDebugf("action[doExtentEvict] die out. vol %v, dp(%v), extent(%v).", vv.Name, dp.partitionID, *ext)
if freeSpace <= 0 && freeExtentCount <= 0 {
log.LogDebugf("[doExtentEvict] die out done, vol(%s), dp (%d)", vv.Name, dp.partitionID)
break
}
}
}
func (dp *DataPartition) startEvict() {
	// only cache dp performs evict; other partition types return immediately.
if !proto.IsCacheDp(dp.partitionType) {
return
}
log.LogDebugf("[startEvict] start do dp(%d) evict op", dp.partitionID)
vv, err := volViews.getSimpleVolView(dp.volumeID)
if err != nil {
err := fmt.Errorf("[startEvict] get vol [%s] info error, err %s", dp.volumeID, err.Error())
log.LogError(err)
panic(err)
}
lruInterval := getWithDefault(vv.CacheLruInterval, 5)
cacheTtl := getWithDefault(vv.CacheTtl, 30)
lruTimer := time.NewTicker(time.Duration(lruInterval) * time.Minute)
ttlTimer := time.NewTicker(time.Duration(util.OneDaySec()) * time.Second)
defer func() {
lruTimer.Stop()
ttlTimer.Stop()
}()
for {
// check volume type and dp type.
if proto.IsHot(vv.VolType) || !proto.IsCacheDp(dp.partitionType) {
log.LogErrorf("action[startEvict] cannot startEvict, vol(%v), dp(%v).", vv.Name, dp.partitionID)
return
}
select {
case <-lruTimer.C:
log.LogDebugf("start [doExtentEvict] vol(%s), dp(%d).", vv.Name, dp.partitionID)
evictStart := time.Now()
dp.doExtentEvict(vv)
log.LogDebugf("action[doExtentEvict] vol(%v), dp(%v), cost (%v)ms, .", vv.Name, dp.partitionID, time.Since(evictStart))
case <-ttlTimer.C:
log.LogDebugf("start [doExtentTtl] vol(%s), dp(%d).", vv.Name, dp.partitionID)
ttlStart := time.Now()
dp.doExtentTtl(cacheTtl)
log.LogDebugf("action[doExtentTtl] vol(%v), dp(%v), cost (%v)ms.", vv.Name, dp.partitionID, time.Since(ttlStart))
case <-dp.stopC:
			log.LogWarn("task[startEvict] stopped", dp.volumeID, dp.partitionID)
return
}
// loop update vol info
newVV, err := volViews.getSimpleVolView(dp.volumeID)
if err != nil {
err := fmt.Errorf("[startEvict] get vol [%s] info error, err %s", dp.volumeID, err.Error())
log.LogError(err)
continue
}
vv = newVV
if lruInterval != vv.CacheLruInterval || cacheTtl != vv.CacheTtl {
lruInterval = getWithDefault(vv.CacheLruInterval, 5)
cacheTtl = getWithDefault(vv.CacheTtl, 30)
			// Reset the existing ticker instead of allocating a new one so the old ticker is not leaked.
			lruTimer.Reset(time.Duration(lruInterval) * time.Minute)
log.LogInfof("[startEvict] update vol config, dp(%d) %v ", dp.partitionID, *vv)
}
}
}
func getWithDefault(base, def int) int {
if base <= 0 {
return def
}
return base
}
func (dp *DataPartition) StopDecommissionRecover(stop bool) {
// only work for decommission repair
if !dp.isDecommissionRecovering() {
log.LogWarnf("[StopDecommissionRecover] dp(%d) is not in recovering status: type %d status %d",
dp.partitionID, dp.partitionType, dp.Status())
return
}
// for check timeout
dp.stopRecover = stop
dp.PersistMetadata()
}
func (dp *DataPartition) isDecommissionRecovering() bool {
// decommission recover failed or success will set to normal
return dp.DataPartitionCreateType == proto.DecommissionedCreateDataPartition
}
func (dp *DataPartition) handleDecommissionRecoverFailed() {
if !dp.isDecommissionRecovering() {
return
}
// prevent status changing from Unavailable to Recovering again in statusUpdate()
dp.partitionType = proto.NormalCreateDataPartition
dp.partitionStatus = proto.Unavailable
log.LogWarnf("[handleDecommissionRecoverFailed] dp(%d) recover failed reach max limit", dp.partitionID)
dp.PersistMetadata()
dp.StopDecommissionRecover(true)
}
func (dp *DataPartition) incDiskErrCnt() {
diskErrCnt := atomic.AddUint64(&dp.diskErrCnt, 1)
log.LogWarnf("[incDiskErrCnt]: dp(%v) disk err count:%v", dp.partitionID, diskErrCnt)
}
func (dp *DataPartition) getDiskErrCnt() uint64 {
return atomic.LoadUint64(&dp.diskErrCnt)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"bytes"
"encoding/binary"
"encoding/json"
"fmt"
"io"
"net"
"strings"
"sync/atomic"
"github.com/cubefs/cubefs/depends/tiglabs/raft"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/repl"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
type RaftCmdItem struct {
Op uint32 `json:"op"`
K []byte `json:"k"`
V []byte `json:"v"`
}
type rndWrtOpItem struct {
opcode uint8
extentID uint64
offset int64
size int64
data []byte
crc uint32
}
// Marshal random write value to binary data.
// Binary frame structure:
// +-------+---------------+--------+----------+--------+------+------+------+
// | Item  | magic version | opcode | extentID | offset | size | crc  | data |
// +-------+---------------+--------+----------+--------+------+------+------+
// | bytes | 4             | 1      | 8        | 8      | 8    | 4    | size |
// +-------+---------------+--------+----------+--------+------+------+------+
const (
BinaryMarshalMagicVersion = 0xFF
)
func MarshalRandWriteRaftLog(opcode uint8, extentID uint64, offset, size int64, data []byte, crc uint32) (result []byte, err error) {
buff := bytes.NewBuffer(make([]byte, 0))
buff.Grow(8 + 8*2 + 4 + int(size) + 4 + 4)
if err = binary.Write(buff, binary.BigEndian, uint32(BinaryMarshalMagicVersion)); err != nil {
return
}
if err = binary.Write(buff, binary.BigEndian, opcode); err != nil {
return
}
if err = binary.Write(buff, binary.BigEndian, extentID); err != nil {
return
}
if err = binary.Write(buff, binary.BigEndian, offset); err != nil {
return
}
if err = binary.Write(buff, binary.BigEndian, size); err != nil {
return
}
if err = binary.Write(buff, binary.BigEndian, crc); err != nil {
return
}
if _, err = buff.Write(data); err != nil {
return
}
result = buff.Bytes()
return
}
// UnmarshalRandWriteRaftLog unmarshals a random write entry from its binary representation.
func UnmarshalRandWriteRaftLog(raw []byte) (opItem *rndWrtOpItem, err error) {
opItem = new(rndWrtOpItem)
buff := bytes.NewBuffer(raw)
var version uint32
if err = binary.Read(buff, binary.BigEndian, &version); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &opItem.opcode); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &opItem.extentID); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &opItem.offset); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &opItem.size); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &opItem.crc); err != nil {
return
}
opItem.data = make([]byte, opItem.size)
if _, err = buff.Read(opItem.data); err != nil {
return
}
return
}
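// exampleRandWriteRaftLogRoundTrip is an illustrative sketch and not part of
// the original code path: it marshals a random-write entry and unmarshals it
// again, mirroring the frame layout documented above (magic version, opcode,
// extentID, offset, size, crc, data). The payload, extent id and crc value are
// hypothetical; the real caller computes crc with crc32.ChecksumIEEE.
func exampleRandWriteRaftLogRoundTrip() error {
	data := []byte("hello extent")
	crc := uint32(0xABCD)
	raw, err := MarshalRandWriteRaftLog(proto.OpRandomWrite, 100, 0, int64(len(data)), data, crc)
	if err != nil {
		return err
	}
	item, err := UnmarshalRandWriteRaftLog(raw)
	if err != nil {
		return err
	}
	if item.extentID != 100 || item.crc != crc || !bytes.Equal(item.data, data) {
		return fmt.Errorf("round trip mismatch: %+v", item)
	}
	return nil
}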
func MarshalRaftCmd(raftOpItem *RaftCmdItem) (raw []byte, err error) {
if raw, err = json.Marshal(raftOpItem); err != nil {
return
}
return
}
func UnmarshalRaftCmd(raw []byte) (raftOpItem *RaftCmdItem, err error) {
raftOpItem = new(RaftCmdItem)
defer func() {
		log.LogDebugf("UnmarshalRaftCmd result %v", err)
}()
if err = json.Unmarshal(raw, raftOpItem); err != nil {
return
}
return
}
func UnmarshalOldVersionRaftLog(raw []byte) (opItem *rndWrtOpItem, err error) {
raftOpItem := new(RaftCmdItem)
defer func() {
		log.LogDebugf("UnmarshalOldVersionRaftLog result %v", err)
}()
if err = json.Unmarshal(raw, raftOpItem); err != nil {
return
}
opItem, err = UnmarshalOldVersionRandWriteOpItem(raftOpItem.V)
if err != nil {
return
}
opItem.opcode = uint8(raftOpItem.Op)
return
}
func UnmarshalOldVersionRandWriteOpItem(raw []byte) (result *rndWrtOpItem, err error) {
var opItem rndWrtOpItem
buff := bytes.NewBuffer(raw)
if err = binary.Read(buff, binary.BigEndian, &opItem.extentID); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &opItem.offset); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &opItem.size); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &opItem.crc); err != nil {
return
}
opItem.data = make([]byte, opItem.size)
if _, err = buff.Read(opItem.data); err != nil {
return
}
result = &opItem
return
}
// CheckLeader checks whether this replica is the raft leader before serving a read.
func (dp *DataPartition) CheckLeader(request *repl.Packet, connect net.Conn) (err error) {
// and use another getRaftLeaderAddr() to return the actual address
_, ok := dp.IsRaftLeader()
if !ok {
err = raft.ErrNotLeader
logContent := fmt.Sprintf("action[ReadCheck] %v.", request.LogMessage(request.GetOpMsg(), connect.RemoteAddr().String(), request.StartT, err))
log.LogWarnf(logContent)
return
}
return
}
type ItemIterator struct {
applyID uint64
}
// NewItemIterator creates a new item iterator.
func NewItemIterator(applyID uint64) *ItemIterator {
si := new(ItemIterator)
si.applyID = applyID
return si
}
// ApplyIndex returns the appliedID
func (si *ItemIterator) ApplyIndex() uint64 {
return si.applyID
}
// Close Closes the iterator.
func (si *ItemIterator) Close() {
// do nothing
}
// Next returns the next item in the iterator.
func (si *ItemIterator) Next() (data []byte, err error) {
// appIDBuf := make([]byte, 8)
// binary.BigEndian.PutUint64(appIDBuf, si.applyID)
// data = appIDBuf[:]
err = io.EOF
return
}
// ApplyRandomWrite random write apply
func (dp *DataPartition) ApplyRandomWrite(command []byte, raftApplyID uint64) (respStatus interface{}, err error) {
opItem := &rndWrtOpItem{}
respStatus = proto.OpOk
defer func() {
if err == nil {
dp.uploadApplyID(raftApplyID)
log.LogDebugf("action[ApplyRandomWrite] dp(%v) raftApplyID(%v) success!", dp.partitionID, raftApplyID)
} else {
if respStatus == proto.OpExistErr { // for tryAppendWrite
err = nil
log.LogDebugf("[ApplyRandomWrite] ApplyID(%v) Partition(%v)_Extent(%v)_ExtentOffset(%v)_Size(%v) apply err(%v) retry[20]",
raftApplyID, dp.partitionID, opItem.extentID, opItem.offset, opItem.size, err)
return
}
err = fmt.Errorf("[ApplyRandomWrite] ApplyID(%v) Partition(%v)_Extent(%v)_ExtentOffset(%v)_Size(%v) apply err(%v) retry[20]",
raftApplyID, dp.partitionID, opItem.extentID, opItem.offset, opItem.size, err)
log.LogErrorf("action[ApplyRandomWrite] Partition(%v) failed err %v", dp.partitionID, err)
exporter.Warning(err.Error())
if respStatus == proto.OpOk {
respStatus = proto.OpDiskErr
}
panic(newRaftApplyError(err))
}
}()
if opItem, err = UnmarshalRandWriteRaftLog(command); err != nil {
log.LogErrorf("[ApplyRandomWrite] ApplyID(%v) Partition(%v) unmarshal failed(%v)", raftApplyID, dp.partitionID, err)
return
}
log.LogDebugf("[ApplyRandomWrite] ApplyID(%v) Partition(%v)_Extent(%v)_ExtentOffset(%v)_Size(%v)",
raftApplyID, dp.partitionID, opItem.extentID, opItem.offset, opItem.size)
for i := 0; i < 20; i++ {
dp.disk.allocCheckLimit(proto.FlowWriteType, uint32(opItem.size))
dp.disk.allocCheckLimit(proto.IopsWriteType, 1)
var syncWrite bool
writeType := storage.RandomWriteType
if opItem.opcode == proto.OpRandomWrite || opItem.opcode == proto.OpSyncRandomWrite {
if dp.verSeq > 0 {
err = storage.VerNotConsistentError
log.LogErrorf("action[ApplyRandomWrite] volume [%v] dp [%v] %v,client need update to newest version!", dp.volumeID, dp.partitionID, err)
return
}
} else if opItem.opcode == proto.OpRandomWriteAppend || opItem.opcode == proto.OpSyncRandomWriteAppend {
writeType = storage.AppendRandomWriteType
} else if opItem.opcode == proto.OpTryWriteAppend || opItem.opcode == proto.OpSyncTryWriteAppend {
writeType = storage.AppendWriteType
}
if opItem.opcode == proto.OpSyncRandomWriteAppend || opItem.opcode == proto.OpSyncRandomWrite || opItem.opcode == proto.OpSyncRandomWriteVer {
syncWrite = true
}
dp.disk.limitWrite.Run(int(opItem.size), func() {
respStatus, err = dp.ExtentStore().Write(opItem.extentID, opItem.offset, opItem.size, opItem.data, opItem.crc, writeType, syncWrite)
})
if err == nil {
break
}
if IsDiskErr(err.Error()) {
panic(newRaftApplyError(err))
}
if strings.Contains(err.Error(), storage.ExtentNotFoundError.Error()) {
err = nil
return
}
if (opItem.opcode == proto.OpTryWriteAppend || opItem.opcode == proto.OpSyncTryWriteAppend) && respStatus == proto.OpTryOtherExtent {
err = nil
return
}
log.LogErrorf("[ApplyRandomWrite] ApplyID(%v) Partition(%v)_Extent(%v)_ExtentOffset(%v)_Size(%v) apply err(%v) retry(%v)",
raftApplyID, dp.partitionID, opItem.extentID, opItem.offset, opItem.size, err, i)
}
return
}
// RandomWriteSubmit submits the proposal to raft.
func (dp *DataPartition) RandomWriteSubmit(pkg *repl.Packet) (err error) {
val, err := MarshalRandWriteRaftLog(pkg.Opcode, pkg.ExtentID, pkg.ExtentOffset, int64(pkg.Size), pkg.Data, pkg.CRC)
if err != nil {
log.LogErrorf("action[RandomWriteSubmit] [%v] marshal error %v", dp.partitionID, err)
return
}
pkg.ResultCode, err = dp.Submit(val)
return
}
func (dp *DataPartition) Submit(val []byte) (retCode uint8, err error) {
var resp interface{}
resp, err = dp.Put(nil, val)
retCode, _ = resp.(uint8)
if err != nil {
log.LogErrorf("action[RandomWriteSubmit] submit err %v", err)
return
}
return
}
func (dp *DataPartition) CheckWriteVer(p *repl.Packet) (err error) {
log.LogDebugf("action[CheckWriteVer] packet %v dpseq %v ", p, dp.verSeq)
if atomic.LoadUint64(&dp.verSeq) == p.VerSeq {
return
}
if p.Opcode == proto.OpSyncRandomWrite || p.Opcode == proto.OpRandomWrite {
		err = fmt.Errorf("volume enables multi version")
log.LogErrorf("action[CheckWriteVer] error %v", err)
return
}
if p.VerSeq < dp.verSeq {
p.ExtentType |= proto.MultiVersionFlag
p.ExtentType |= proto.VersionListFlag
if p.Opcode == proto.OpRandomWriteVer || p.Opcode == proto.OpSyncRandomWriteVer {
err = storage.VerNotConsistentError
			log.LogDebugf("action[CheckWriteVer] dp %v client verSeq[%v] smaller than dataPartition ver[%v]",
dp.config.PartitionID, p.VerSeq, dp.verSeq)
}
p.VerSeq = dp.verSeq
dp.volVersionInfoList.RWLock.RLock()
p.VerList = make([]*proto.VolVersionInfo, len(dp.volVersionInfoList.VerList))
copy(p.VerList, dp.volVersionInfoList.VerList)
dp.volVersionInfoList.RWLock.RUnlock()
log.LogInfof("action[CheckWriteVer] partitionId %v reqId %v verList %v seq %v dpVerList %v",
p.PartitionID, p.ReqID, p.VerList, p.VerSeq, dp.volVersionInfoList.VerList)
return
} else if p.VerSeq > dp.verSeq {
log.LogWarnf("action[CheckWriteVer] partitionId %v reqId %v verList (%v) seq %v old one(%v)",
p.PartitionID, p.ReqID, p.VerList, p.VerSeq, dp.volVersionInfoList.VerList)
dp.verSeq = p.VerSeq
dp.volVersionInfoList.RWLock.Lock()
dp.volVersionInfoList.VerList = make([]*proto.VolVersionInfo, len(p.VerList))
copy(dp.volVersionInfoList.VerList, p.VerList)
dp.volVersionInfoList.RWLock.Unlock()
}
return
}
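// CheckWriteVer above folds the multi-version handling into one function. The
// hypothetical helper below (exampleVerCheckAction, illustration only, not
// called anywhere) restates the three outcomes as a sketch: equal sequences
// pass straight through, an older client is told to refresh and gets the
// partition's version list back, and a newer client advances the partition's
// own sequence and list.
func exampleVerCheckAction(clientSeq, dpSeq uint64) string {
	switch {
	case clientSeq == dpSeq:
		return "accept the write as-is"
	case clientSeq < dpSeq:
		return "reject versioned writes with VerNotConsistentError and reply with the dp version list"
	default: // clientSeq > dpSeq
		return "adopt the client's newer sequence and version list"
	}
}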
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"encoding/binary"
"encoding/json"
"fmt"
"net"
"os"
"path"
"strconv"
"strings"
"sync/atomic"
"time"
raftproto "github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/raftstore"
"github.com/cubefs/cubefs/repl"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
type dataPartitionCfg struct {
VolName string `json:"vol_name"`
ClusterID string `json:"cluster_id"`
PartitionID uint64 `json:"partition_id"`
PartitionSize int `json:"partition_size"`
PartitionType int `json:"partition_type"`
Peers []proto.Peer `json:"peers"`
Hosts []string `json:"hosts"`
NodeID uint64 `json:"-"`
RaftStore raftstore.RaftStore `json:"-"`
ReplicaNum int
VerSeq uint64 `json:"ver_seq"`
CreateType int
Forbidden bool
}
func (dp *DataPartition) raftPort() (heartbeat, replica int, err error) {
raftConfig := dp.config.RaftStore.RaftConfig()
heartbeatAddrSplits := strings.Split(raftConfig.HeartbeatAddr, ":")
replicaAddrSplits := strings.Split(raftConfig.ReplicateAddr, ":")
if len(heartbeatAddrSplits) != 2 {
err = errors.New("illegal heartbeat address")
return
}
if len(replicaAddrSplits) != 2 {
err = errors.New("illegal replica address")
return
}
heartbeat, err = strconv.Atoi(heartbeatAddrSplits[1])
if err != nil {
return
}
replica, err = strconv.Atoi(replicaAddrSplits[1])
if err != nil {
return
}
return
}
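// raftPort above derives the heartbeat and replica ports by splitting the
// "host:port" strings from the raft config. A minimal sketch of that single
// step, assuming a well-formed address (examplePortOf is a hypothetical
// helper for illustration only and is not called anywhere):
func examplePortOf(addr string) (int, error) {
	parts := strings.Split(addr, ":")
	if len(parts) != 2 {
		return 0, errors.New("illegal address")
	}
	return strconv.Atoi(parts[1])
}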
// StartRaft starts the raft instance when the data partition starts or is restored.
func (dp *DataPartition) StartRaft(isLoad bool) (err error) {
// cache or preload partition not support raft and repair.
if !dp.isNormalType() {
return nil
}
var (
heartbeatPort int
replicaPort int
peers []raftstore.PeerAddress
)
defer func() {
if r := recover(); r != nil {
mesg := fmt.Sprintf("StartRaft(%v) Raft Panic (%v)", dp.partitionID, r)
log.LogError(mesg)
if isLoad {
err = errors.New(mesg)
} else {
log.LogFlush()
panic(mesg)
}
}
}()
if heartbeatPort, replicaPort, err = dp.raftPort(); err != nil {
return
}
for _, peer := range dp.config.Peers {
addr := strings.Split(peer.Addr, ":")[0]
rp := raftstore.PeerAddress{
Peer: raftproto.Peer{
ID: peer.ID,
},
Address: addr,
HeartbeatPort: heartbeatPort,
ReplicaPort: replicaPort,
}
peers = append(peers, rp)
}
log.LogDebugf("start partition(%v) raft peers: %s path: %s",
dp.partitionID, peers, dp.path)
pc := &raftstore.PartitionConfig{
ID: uint64(dp.partitionID),
Applied: dp.appliedID,
Peers: peers,
SM: dp,
WalPath: dp.path,
}
dp.raftPartition, err = dp.config.RaftStore.CreatePartition(pc)
if err == nil {
dp.ForceSetRaftRunning()
dp.ForceSetDataPartitionToFininshLoad()
}
return
}
func (dp *DataPartition) raftStopped() bool {
return atomic.LoadInt32(&dp.raftStatus) == RaftStatusStopped
}
func (dp *DataPartition) stopRaft() {
if atomic.CompareAndSwapInt32(&dp.raftStatus, RaftStatusRunning, RaftStatusStopped) {
// cache or preload partition not support raft and repair.
if !dp.isNormalType() {
return
}
log.LogErrorf("[FATAL] stop raft partition(%v)", dp.partitionID)
dp.raftPartition.Stop()
}
}
func (dp *DataPartition) CanRemoveRaftMember(peer proto.Peer, force bool) error {
if !dp.isNormalType() {
return fmt.Errorf("CanRemoveRaftMember (%v) not support", dp)
}
downReplicas := dp.config.RaftStore.RaftServer().GetDownReplicas(dp.partitionID)
hasExist := false
for _, p := range dp.config.Peers {
if p.ID == peer.ID {
hasExist = true
break
}
}
if !hasExist {
log.LogInfof("action[CanRemoveRaftMember] replicaNum %v peers %v, peer %v not found", dp.replicaNum, len(dp.config.Peers), peer)
return nil
}
hasDownReplicasExcludePeer := make([]uint64, 0)
for _, nodeID := range downReplicas {
if nodeID.NodeID == peer.ID {
continue
}
// check nodeID is valid
hasDownReplicasExcludePeer = append(hasDownReplicasExcludePeer, nodeID.NodeID)
}
log.LogInfof("action[CanRemoveRaftMember] dp %v replicaNum %v peers %v", dp.partitionID, dp.replicaNum, len(dp.config.Peers))
if dp.replicaNum == 2 && len(dp.config.Peers) == 2 && force {
return nil
}
sumReplicas := len(dp.config.Peers)
if sumReplicas%2 == 1 {
if sumReplicas-len(hasDownReplicasExcludePeer) > (sumReplicas/2 + 1) {
return nil
}
} else {
if sumReplicas-len(hasDownReplicasExcludePeer) >= (sumReplicas/2 + 1) {
return nil
}
}
return fmt.Errorf("hasDownReplicasExcludePeer(%v) too much,so donnot offline (%v)", downReplicas, peer)
}
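// CanRemoveRaftMember above allows the removal only when the replicas that
// would remain reachable still form a majority of the current peer set. A
// hypothetical restatement of that arithmetic (exampleRemovalKeepsQuorum,
// illustration only, not called anywhere):
func exampleRemovalKeepsQuorum(totalPeers, downExcludingPeer int) bool {
	alive := totalPeers - downExcludingPeer
	majority := totalPeers/2 + 1
	if totalPeers%2 == 1 {
		// odd-sized groups require strictly more than a bare majority here
		return alive > majority
	}
	return alive >= majority
}

// For example, with 3 peers and no other replica down, alive=3 > majority=2,
// so the member may be removed; with 3 peers and one other replica already
// down, alive=2 fails the check and the request is refused.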
// StartRaftLoggingSchedule starts the task schedule as follows:
// 1. write the raft applied id into disk.
// 2. collect the applied ids from raft members.
// 3. truncate and delete the saved raft logs below the minimum applied id in order to free disk space.
func (dp *DataPartition) StartRaftLoggingSchedule() {
// cache or preload partition not support raft and repair.
if !dp.isNormalType() {
return
}
getAppliedIDTimer := time.NewTimer(time.Second * 1)
truncateRaftLogTimer := time.NewTimer(time.Minute * 10)
storeAppliedIDTimer := time.NewTimer(time.Second * 10)
log.LogDebugf("[startSchedule] hello DataPartition schedule")
for {
select {
case <-dp.stopC:
log.LogDebugf("[startSchedule] stop partition(%v)", dp.partitionID)
getAppliedIDTimer.Stop()
truncateRaftLogTimer.Stop()
storeAppliedIDTimer.Stop()
return
case extentID := <-dp.stopRaftC:
dp.stopRaft()
log.LogErrorf("action[ExtentRepair] stop raft partition(%v)_%v", dp.partitionID, extentID)
case <-getAppliedIDTimer.C:
if !dp.raftStopped() {
dp.updateMaxMinAppliedID()
}
getAppliedIDTimer.Reset(time.Minute * 1)
case <-truncateRaftLogTimer.C:
if dp.raftStopped() {
break
}
if dp.minAppliedID > dp.lastTruncateID { // Has changed
appliedID := atomic.LoadUint64(&dp.appliedID)
if err := dp.storeAppliedID(appliedID); err != nil {
log.LogErrorf("partition [%v] persist applied ID [%v] during scheduled truncate raft log failed: %v", dp.partitionID, appliedID, err)
truncateRaftLogTimer.Reset(time.Minute)
continue
}
dp.raftPartition.Truncate(dp.minAppliedID)
dp.lastTruncateID = dp.minAppliedID
if err := dp.PersistMetadata(); err != nil {
log.LogErrorf("partition [%v] persist metadata during scheduled truncate raft log failed: %v", dp.partitionID, err)
truncateRaftLogTimer.Reset(time.Minute)
continue
}
log.LogInfof("partition [%v] scheduled truncate raft log [applied: %v, truncated: %v]", dp.partitionID, appliedID, dp.minAppliedID)
}
truncateRaftLogTimer.Reset(time.Minute)
case <-storeAppliedIDTimer.C:
appliedID := atomic.LoadUint64(&dp.appliedID)
if err := dp.storeAppliedID(appliedID); err != nil {
log.LogErrorf("partition [%v] scheduled persist applied ID [%v] failed: %v", dp.partitionID, appliedID, err)
}
storeAppliedIDTimer.Reset(time.Second * 10)
}
}
}
// StartRaftAfterRepair starts the raft after repairing a partition.
// It can only happen after all the extent files are repaired by the leader.
// When the repair is finished, the local dp.partitionSize is the same as the leader's dp.partitionSize.
// The repair task can be done in statusUpdateScheduler->LaunchRepair.
func (dp *DataPartition) StartRaftAfterRepair(isLoad bool) {
log.LogDebugf("StartRaftAfterRepair enter")
// cache or preload partition not support raft and repair.
if !dp.isNormalType() {
return
}
var (
initPartitionSize, initMaxExtentID uint64
currLeaderPartitionSize uint64
err error
)
timer := time.NewTicker(5 * time.Second)
for {
select {
case <-timer.C:
err = nil
if dp.isLeader { // the primary does not need to wait for the repair
if err := dp.StartRaft(isLoad); err != nil {
log.LogErrorf("PartitionID(%v) leader start raft err(%v).", dp.partitionID, err)
continue
}
log.LogDebugf("PartitionID(%v) leader started.", dp.partitionID)
return
}
if dp.stopRecover && dp.isDecommissionRecovering() {
log.LogDebugf("action[StartRaftAfterRepair] PartitionID(%v) receive stop signal.", dp.partitionID)
continue
}
// wait for dp.replicas to be updated
if dp.getReplicaLen() == 0 {
continue
}
if initMaxExtentID == 0 || initPartitionSize == 0 {
initMaxExtentID, initPartitionSize, err = dp.getLeaderMaxExtentIDAndPartitionSize()
}
if err != nil {
log.LogErrorf("action[StartRaftAfterRepair] PartitionID(%v) get MaxExtentID err(%v)", dp.partitionID, err)
continue
}
// get the partition size from the primary and compare it with the local one
currLeaderPartitionSize, err = dp.getLeaderPartitionSize(initMaxExtentID)
if err != nil {
log.LogErrorf("action[StartRaftAfterRepair] PartitionID(%v) get leader size err(%v)", dp.partitionID, err)
continue
}
dp.leaderSize = int(currLeaderPartitionSize)
if currLeaderPartitionSize < initPartitionSize {
initPartitionSize = currLeaderPartitionSize
}
localSize := dp.extentStore.StoreSizeExtentID(initMaxExtentID)
dp.decommissionRepairProgress = float64(localSize) / float64(initPartitionSize)
log.LogInfof("action[StartRaftAfterRepair] PartitionID(%v) initMaxExtentID(%v) initPartitionSize(%v) currLeaderPartitionSize(%v)"+
"localSize(%v)", dp.partitionID, initMaxExtentID, initPartitionSize, currLeaderPartitionSize, localSize)
if initPartitionSize > localSize {
log.LogErrorf("action[StartRaftAfterRepair] PartitionID(%v) leader size(%v) local size(%v) wait snapshot recover", dp.partitionID, initPartitionSize, localSize)
continue
}
if err := dp.StartRaft(isLoad); err != nil {
log.LogErrorf("action[StartRaftAfterRepair] PartitionID(%v) start raft err(%v). Retry after 20s.", dp.partitionID, err)
timer.Reset(5 * time.Second)
continue
}
// start raft
dp.DataPartitionCreateType = proto.NormalCreateDataPartition
log.LogInfof("action[StartRaftAfterRepair] PartitionID(%v) change to NormalCreateDataPartition",
dp.partitionID)
dp.decommissionRepairProgress = float64(1)
dp.PersistMetadata()
log.LogInfof("action[StartRaftAfterRepair] PartitionID(%v) raft started!", dp.partitionID)
return
case <-dp.stopC:
log.LogDebugf("action[StartRaftAfterRepair] PartitionID(%v) receive dp stop signal!!.", dp.partitionID)
timer.Stop()
return
}
}
}
// Add a raft node.
func (dp *DataPartition) addRaftNode(req *proto.AddDataPartitionRaftMemberRequest, index uint64) (isUpdated bool, err error) {
// cache or preload partition not support raft and repair.
if !dp.isNormalType() {
return false, fmt.Errorf("addRaftNode (%v) not support", dp)
}
var (
heartbeatPort int
replicaPort int
)
if heartbeatPort, replicaPort, err = dp.raftPort(); err != nil {
return
}
log.LogInfof("action[addRaftNode] add raft node peer [%v]", req.AddPeer)
found := false
for _, peer := range dp.config.Peers {
if peer.ID == req.AddPeer.ID {
found = true
break
}
}
isUpdated = !found
if !isUpdated {
return
}
data, _ := json.Marshal(req)
log.LogInfof("addRaftNode: partitionID(%v) nodeID(%v) index(%v) data(%v) ",
req.PartitionId, dp.config.NodeID, index, string(data))
dp.config.Peers = append(dp.config.Peers, req.AddPeer)
dp.config.Hosts = append(dp.config.Hosts, req.AddPeer.Addr)
dp.replicasLock.Lock()
dp.replicas = make([]string, len(dp.config.Hosts))
copy(dp.replicas, dp.config.Hosts)
dp.replicasLock.Unlock()
addr := strings.Split(req.AddPeer.Addr, ":")[0]
dp.config.RaftStore.AddNodeWithPort(req.AddPeer.ID, addr, heartbeatPort, replicaPort)
return
}
// Delete a raft node.
func (dp *DataPartition) removeRaftNode(req *proto.RemoveDataPartitionRaftMemberRequest, index uint64) (isUpdated bool, err error) {
// cache or preload partition not support raft and repair.
if !dp.isNormalType() {
return false, fmt.Errorf("removeRaftNode (%v) not support", dp)
}
var canRemoveSelf bool
if canRemoveSelf, err = dp.canRemoveSelf(); err != nil {
return
}
peerIndex := -1
data, _ := json.Marshal(req)
isUpdated = false
log.LogInfof("Start RemoveRaftNode PartitionID(%v) nodeID(%v) do RaftLog (%v) ",
req.PartitionId, dp.config.NodeID, string(data))
for i, peer := range dp.config.Peers {
if peer.ID == req.RemovePeer.ID {
peerIndex = i
isUpdated = true
break
}
}
if !isUpdated {
log.LogInfof("NoUpdate RemoveRaftNode PartitionID(%v) nodeID(%v) do RaftLog (%v) ",
req.PartitionId, dp.config.NodeID, string(data))
return
}
hostIndex := -1
for index, host := range dp.config.Hosts {
if host == req.RemovePeer.Addr {
hostIndex = index
break
}
}
if hostIndex != -1 {
dp.config.Hosts = append(dp.config.Hosts[:hostIndex], dp.config.Hosts[hostIndex+1:]...)
}
dp.config.Peers = append(dp.config.Peers[:peerIndex], dp.config.Peers[peerIndex+1:]...)
if dp.config.NodeID == req.RemovePeer.ID && !dp.IsDataPartitionLoading() && canRemoveSelf {
dp.raftPartition.Delete()
dp.Disk().space.DeletePartition(dp.partitionID)
isUpdated = false
}
// update dp replicas after removing a raft node
if isUpdated {
dp.replicasLock.Lock()
dp.replicas = make([]string, len(dp.config.Hosts))
copy(dp.replicas, dp.config.Hosts)
dp.replicasLock.Unlock()
}
log.LogInfof("Finish RemoveRaftNode PartitionID(%v) nodeID(%v) do RaftLog (%v) ",
req.PartitionId, dp.config.NodeID, string(data))
return
}
func (dp *DataPartition) storeAppliedID(applyIndex uint64) (err error) {
filename := path.Join(dp.Path(), TempApplyIndexFile)
fp, err := os.OpenFile(filename, os.O_RDWR|os.O_APPEND|os.O_TRUNC|os.O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
fp.Close()
os.Remove(filename)
}()
if _, err = fp.WriteString(fmt.Sprintf("%d", applyIndex)); err != nil {
return
}
fp.Sync()
err = os.Rename(filename, path.Join(dp.Path(), ApplyIndexFile))
return
}
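// storeAppliedID above uses the usual write-temp-then-rename pattern so a
// crash can never leave a half-written ApplyIndexFile behind. A minimal
// generic sketch of the same idea (exampleAtomicWriteUint64 is a hypothetical
// helper, illustration only, not called anywhere):
func exampleAtomicWriteUint64(dir, name string, v uint64) (err error) {
	tmp := path.Join(dir, "."+name+".tmp")
	fp, err := os.OpenFile(tmp, os.O_RDWR|os.O_TRUNC|os.O_CREATE, 0o755)
	if err != nil {
		return
	}
	defer func() {
		fp.Close()
		os.Remove(tmp) // no-op after a successful rename
	}()
	if _, err = fp.WriteString(fmt.Sprintf("%d", v)); err != nil {
		return
	}
	fp.Sync()
	return os.Rename(tmp, path.Join(dir, name))
}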
// LoadAppliedID loads the applied IDs to the memory.
func (dp *DataPartition) LoadAppliedID() (err error) {
filename := path.Join(dp.Path(), ApplyIndexFile)
if _, err = os.Stat(filename); err != nil {
return
}
data, err := os.ReadFile(filename)
if err != nil {
err = errors.NewErrorf("[loadApplyIndex] OpenFile: %s", err.Error())
return
}
if len(data) == 0 {
err = errors.NewErrorf("[loadApplyIndex]: ApplyIndex is empty")
return
}
if _, err = fmt.Sscanf(string(data), "%d", &dp.appliedID); err != nil {
err = errors.NewErrorf("[loadApplyID] ReadApplyID: %s", err.Error())
return
}
dp.extentStore.ApplyId = dp.appliedID
return
}
func (dp *DataPartition) SetMinAppliedID(id uint64) {
dp.minAppliedID = id
}
func (dp *DataPartition) GetAppliedID() (id uint64) {
return dp.appliedID
}
func (s *DataNode) parseRaftConfig(cfg *config.Config) (err error) {
s.raftDir = cfg.GetString(ConfigKeyRaftDir)
if s.raftDir == "" {
return fmt.Errorf("bad raftDir config")
}
s.tickInterval = int(cfg.GetFloat(CfgTickInterval))
s.raftHeartbeat = cfg.GetString(ConfigKeyRaftHeartbeat)
s.raftReplica = cfg.GetString(ConfigKeyRaftReplica)
s.raftRecvBufSize = int(cfg.GetInt(CfgRaftRecvBufSize))
log.LogDebugf("[parseRaftConfig] load raftDir(%v).", s.raftDir)
log.LogDebugf("[parseRaftConfig] load raftHearbeat(%v).", s.raftHeartbeat)
log.LogDebugf("[parseRaftConfig] load raftReplica(%v).", s.raftReplica)
return
}
func (s *DataNode) startRaftServer(cfg *config.Config) (err error) {
log.LogInfo("Start: startRaftServer")
s.parseRaftConfig(cfg)
if s.clusterUuidEnable {
if err = config.CheckOrStoreClusterUuid(s.raftDir, s.clusterUuid, false); err != nil {
log.LogErrorf("CheckOrStoreClusterUuid failed: %v", err)
return fmt.Errorf("CheckOrStoreClusterUuid failed: %v", err)
}
}
constCfg := config.ConstConfig{
Listen: s.port,
RaftHeartbetPort: s.raftHeartbeat,
RaftReplicaPort: s.raftReplica,
}
ok := false
if ok, err = config.CheckOrStoreConstCfg(s.raftDir, config.DefaultConstConfigFile, &constCfg); !ok {
log.LogErrorf("constCfg check failed %v %v %v %v", s.raftDir, config.DefaultConstConfigFile, constCfg, err)
return fmt.Errorf("constCfg check failed %v %v %v %v", s.raftDir, config.DefaultConstConfigFile, constCfg, err)
}
if _, err = os.Stat(s.raftDir); err != nil {
if err = os.MkdirAll(s.raftDir, 0o755); err != nil {
err = errors.NewErrorf("create raft server dir: %s", err.Error())
log.LogErrorf("action[startRaftServer] cannot start raft server err(%v)", err)
return
}
}
heartbeatPort, err := strconv.Atoi(s.raftHeartbeat)
if err != nil {
err = errors.NewErrorf("Raft heartbeat port configuration error: %s", err.Error())
return
}
replicatePort, err := strconv.Atoi(s.raftReplica)
if err != nil {
err = errors.NewErrorf("Raft replica port configuration error: %s", err.Error())
return
}
raftConf := &raftstore.Config{
NodeID: s.nodeID,
RaftPath: s.raftDir,
IPAddr: LocalIP,
HeartbeatPort: heartbeatPort,
ReplicaPort: replicatePort,
NumOfLogsToRetain: DefaultRaftLogsToRetain,
TickInterval: s.tickInterval,
RecvBufSize: s.raftRecvBufSize,
}
s.raftStore, err = raftstore.NewRaftStore(raftConf, cfg)
if err != nil {
err = errors.NewErrorf("new raftStore: %s", err.Error())
log.LogErrorf("action[startRaftServer] cannot start raft server err(%v)", err)
}
return
}
func (s *DataNode) stopRaftServer() {
if s.raftStore != nil {
s.raftStore.Stop()
}
}
// NewPacketToBroadcastMinAppliedID returns a new packet to broadcast the min applied ID.
func NewPacketToBroadcastMinAppliedID(partitionID uint64, minAppliedID uint64) (p *repl.Packet) {
p = new(repl.Packet)
p.Opcode = proto.OpBroadcastMinAppliedID
p.PartitionID = partitionID
p.Magic = proto.ProtoMagic
p.ReqID = proto.GenerateRequestID()
p.Data = make([]byte, 8)
binary.BigEndian.PutUint64(p.Data, minAppliedID)
p.Size = uint32(len(p.Data))
return
}
// NewPacketToGetAppliedID returns a new packet to get the applied ID.
func NewPacketToGetAppliedID(partitionID uint64) (p *repl.Packet) {
p = new(repl.Packet)
p.Opcode = proto.OpGetAppliedId
p.PartitionID = partitionID
p.Magic = proto.ProtoMagic
p.ReqID = proto.GenerateRequestID()
return
}
// NewPacketToGetPartitionSize returns a new packet to get the partition size.
func NewPacketToGetPartitionSize(partitionID uint64) (p *repl.Packet) {
p = new(repl.Packet)
p.Opcode = proto.OpGetPartitionSize
p.PartitionID = partitionID
p.Magic = proto.ProtoMagic
p.ReqID = proto.GenerateRequestID()
return
}
// NewPacketToGetMaxExtentIDAndPartitionSIze returns a new packet to get the max extent ID and the partition size.
func NewPacketToGetMaxExtentIDAndPartitionSIze(partitionID uint64) (p *repl.Packet) {
p = new(repl.Packet)
p.Opcode = proto.OpGetMaxExtentIDAndPartitionSize
p.PartitionID = partitionID
p.Magic = proto.ProtoMagic
p.ReqID = proto.GenerateRequestID()
return
}
func (dp *DataPartition) findMinAppliedID(allAppliedIDs []uint64) (minAppliedID uint64, index int) {
index = 0
minAppliedID = allAppliedIDs[0]
for i := 1; i < len(allAppliedIDs); i++ {
if allAppliedIDs[i] < minAppliedID {
minAppliedID = allAppliedIDs[i]
index = i
}
}
return minAppliedID, index
}
func (dp *DataPartition) findMaxAppliedID(allAppliedIDs []uint64) (maxAppliedID uint64, index int) {
for i := 0; i < len(allAppliedIDs); i++ {
if allAppliedIDs[i] > maxAppliedID {
maxAppliedID = allAppliedIDs[i]
index = i
}
}
return maxAppliedID, index
}
// Get the partition size from the leader.
func (dp *DataPartition) getLeaderPartitionSize(maxExtentID uint64) (size uint64, err error) {
var conn *net.TCPConn
p := NewPacketToGetPartitionSize(dp.partitionID)
p.ExtentID = maxExtentID
target := dp.getReplicaAddr(0)
conn, err = gConnPool.GetConnect(target) // get remote connect
if err != nil {
err = errors.Trace(err, " partition(%v) get host(%v) connect", dp.partitionID, target)
return
}
defer func() {
gConnPool.PutConnect(conn, err != nil)
}()
err = p.WriteToConn(conn) // write command to the remote host
if err != nil {
err = errors.Trace(err, "partition(%v) write to host(%v)", dp.partitionID, target)
return
}
err = p.ReadFromConnWithVer(conn, 60)
if err != nil {
err = errors.Trace(err, "partition(%v) read from host(%v)", dp.partitionID, target)
return
}
if p.ResultCode != proto.OpOk {
err = errors.Trace(err, "partition(%v) result code not ok (%v) from host(%v)", dp.partitionID, p.ResultCode, target)
return
}
size = binary.BigEndian.Uint64(p.Data)
log.LogInfof("partition(%v) MaxExtentID(%v) size(%v)", dp.partitionID, maxExtentID, size)
return
}
func (dp *DataPartition) getMaxExtentIDAndPartitionSize(target string) (maxExtentID, PartitionSize uint64, err error) {
var conn *net.TCPConn
p := NewPacketToGetMaxExtentIDAndPartitionSIze(dp.partitionID)
conn, err = gConnPool.GetConnect(target) // get remote connect
if err != nil {
err = errors.Trace(err, " partition(%v) get host(%v) connect", dp.partitionID, target)
return
}
defer func() {
gConnPool.PutConnect(conn, err != nil)
}()
err = p.WriteToConn(conn) // write command to the remote host
if err != nil {
err = errors.Trace(err, "partition(%v) write to host(%v)", dp.partitionID, target)
return
}
err = p.ReadFromConnWithVer(conn, 60)
if err != nil {
err = errors.Trace(err, "partition(%v) read from host(%v)", dp.partitionID, target)
return
}
if p.ResultCode != proto.OpOk {
err = errors.Trace(err, "partition(%v) result code not ok (%v) from host(%v)", dp.partitionID, p.ResultCode, target)
return
}
maxExtentID = binary.BigEndian.Uint64(p.Data[0:8])
PartitionSize = binary.BigEndian.Uint64(p.Data[8:16])
log.LogInfof("partition(%v) maxExtentID(%v) PartitionSize(%v) on leader", dp.partitionID, maxExtentID, PartitionSize)
return
}
// Get the max extent ID and the partition size from the leader.
func (dp *DataPartition) getLeaderMaxExtentIDAndPartitionSize() (maxExtentID, PartitionSize uint64, err error) {
target := dp.getReplicaAddr(0)
return dp.getMaxExtentIDAndPartitionSize(target)
}
// Get the max extent ID and the partition size from the second replica (a follower member).
func (dp *DataPartition) getMemberExtentIDAndPartitionSize() (maxExtentID, PartitionSize uint64, err error) {
target := dp.getReplicaAddr(1)
return dp.getMaxExtentIDAndPartitionSize(target)
}
func (dp *DataPartition) broadcastMinAppliedID(minAppliedID uint64) (err error) {
for i := 0; i < dp.getReplicaLen(); i++ {
p := NewPacketToBroadcastMinAppliedID(dp.partitionID, minAppliedID)
replicaHostParts := strings.Split(dp.getReplicaAddr(i), ":")
replicaHost := strings.TrimSpace(replicaHostParts[0])
if LocalIP == replicaHost {
log.LogDebugf("partition(%v) local no send msg. localIP(%v) replicaHost(%v) appliedId(%v)",
dp.partitionID, LocalIP, replicaHost, dp.appliedID)
dp.minAppliedID = minAppliedID
continue
}
target := dp.getReplicaAddr(i)
var conn *net.TCPConn
conn, err = gConnPool.GetConnect(target)
if err != nil {
return
}
err = p.WriteToConn(conn)
if err != nil {
gConnPool.PutConnect(conn, true)
return
}
err = p.ReadFromConnWithVer(conn, 60)
if err != nil {
gConnPool.PutConnect(conn, true)
return
}
gConnPool.PutConnect(conn, false)
log.LogDebugf("partition(%v) minAppliedID(%v)", dp.partitionID, minAppliedID)
}
return
}
// Get all replica applied ids
func (dp *DataPartition) getAllReplicaAppliedID() (allAppliedID []uint64, replyNum uint8) {
allAppliedID = make([]uint64, dp.getReplicaLen())
for i := 0; i < dp.getReplicaLen(); i++ {
p := NewPacketToGetAppliedID(dp.partitionID)
replicaHostParts := strings.Split(dp.getReplicaAddr(i), ":")
replicaHost := strings.TrimSpace(replicaHostParts[0])
if LocalIP == replicaHost {
log.LogDebugf("partition(%v) local no send msg. localIP(%v) replicaHost(%v) appliedId(%v)",
dp.partitionID, LocalIP, replicaHost, dp.appliedID)
allAppliedID[i] = dp.appliedID
replyNum++
continue
}
target := dp.getReplicaAddr(i)
appliedID, err := dp.getRemoteAppliedID(target, p)
if err != nil {
log.LogErrorf("partition(%v) getRemoteAppliedID Failed(%v).", dp.partitionID, err)
continue
}
if appliedID == 0 {
log.LogDebugf("[getAllReplicaAppliedID] partition(%v) local appliedID(%v) replicaHost(%v) appliedID=0",
dp.partitionID, dp.appliedID, replicaHost)
}
allAppliedID[i] = appliedID
replyNum++
}
return
}
// Get target members' applied id
func (dp *DataPartition) getRemoteAppliedID(target string, p *repl.Packet) (appliedID uint64, err error) {
var conn *net.TCPConn
start := time.Now().UnixNano()
defer func() {
if err != nil {
err = fmt.Errorf(p.LogMessage(p.GetOpMsg(), target, start, err))
log.LogErrorf(err.Error())
}
}()
conn, err = gConnPool.GetConnect(target)
if err != nil {
return
}
defer func() {
gConnPool.PutConnect(conn, err != nil)
}()
err = p.WriteToConn(conn) // write command to the remote host
if err != nil {
return
}
err = p.ReadFromConnWithVer(conn, 60)
if err != nil {
return
}
if p.ResultCode != proto.OpOk {
err = errors.NewErrorf("partition(%v) result code not ok (%v) from host(%v)", dp.partitionID, p.ResultCode, target)
return
}
appliedID = binary.BigEndian.Uint64(p.Data)
log.LogDebugf("[getRemoteAppliedID] partition(%v) remoteAppliedID(%v)", dp.partitionID, appliedID)
return
}
// Get all members' applied ids and find the minimum one
func (dp *DataPartition) updateMaxMinAppliedID() {
var (
minAppliedID uint64
maxAppliedID uint64
)
// Get the applied id by the leader
_, isLeader := dp.IsRaftLeader()
if !isLeader {
return
}
// if leader has not applied the raft, no need to get others
if dp.appliedID == 0 {
return
}
allAppliedID, replyNum := dp.getAllReplicaAppliedID()
if replyNum == 0 {
log.LogDebugf("[updateMaxMinAppliedID] PartitionID(%v) Get appliedId failed!", dp.partitionID)
return
}
if replyNum == uint8(len(allAppliedID)) { // update dp.minAppliedID when every member had replied
minAppliedID, _ = dp.findMinAppliedID(allAppliedID)
log.LogDebugf("[updateMaxMinAppliedID] PartitionID(%v) localID(%v) OK! oldMinID(%v) newMinID(%v) allAppliedID(%v)",
dp.partitionID, dp.appliedID, dp.minAppliedID, minAppliedID, allAppliedID)
dp.broadcastMinAppliedID(minAppliedID)
}
maxAppliedID, _ = dp.findMaxAppliedID(allAppliedID)
log.LogDebugf("[updateMaxMinAppliedID] PartitionID(%v) localID(%v) OK! oldMaxID(%v) newMaxID(%v)",
dp.partitionID, dp.appliedID, dp.maxAppliedID, maxAppliedID)
dp.maxAppliedID = maxAppliedID
}
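// updateMaxMinAppliedID only broadcasts a new minimum when every replica has
// answered, so an unreachable member can never cause the raft log to be
// truncated past an index it still needs. A worked example, assuming three
// replicas reported their applied indexes:
//
//	allAppliedID := []uint64{120, 95, 130}
//	// findMinAppliedID -> (95, 1): safe truncation point, broadcast to members
//	// findMaxAppliedID -> (130, 2): recorded locally as dp.maxAppliedID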
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"bytes"
"encoding/binary"
"encoding/json"
"fmt"
"sync"
"sync/atomic"
"github.com/cubefs/cubefs/depends/tiglabs/raft"
raftproto "github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
/* The functions below implement the interfaces defined in the raft library. */
// Apply puts the data onto the disk.
func (dp *DataPartition) Apply(command []byte, index uint64) (resp interface{}, err error) {
buff := bytes.NewBuffer(command)
var version uint32
if err = binary.Read(buff, binary.BigEndian, &version); err != nil {
return
}
resp = proto.OpOk
if version != BinaryMarshalMagicVersion {
var opItem *RaftCmdItem
if opItem, err = UnmarshalRaftCmd(command); err != nil {
log.LogErrorf("[ApplyRandomWrite] ApplyID(%v) Partition(%v) unmarshal failed(%v)", index, dp.partitionID, err)
return
}
log.LogInfof("[ApplyRandomWrite] ApplyID(%v) Partition(%v) opItem Op(%v)", index, dp.partitionID, opItem.Op)
if opItem.Op == uint32(proto.OpVersionOp) {
dp.fsmVersionOp(opItem)
return
}
return
}
if index > dp.metaAppliedID {
resp, err = dp.ApplyRandomWrite(command, index)
return
}
log.LogDebugf("[DataPartition.Apply] dp[%v] metaAppliedID(%v) index(%v) no need apply", dp.partitionID, dp.metaAppliedID, index)
return
}
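// Apply above distinguishes the two command encodings by their leading
// 4-byte big-endian version: BinaryMarshalMagicVersion selects the
// random-write path, anything else is decoded as a generic RaftCmdItem. A
// hypothetical sketch of reading that prefix (exampleCommandVersion,
// illustration only, not called anywhere):
func exampleCommandVersion(command []byte) (version uint32, err error) {
	err = binary.Read(bytes.NewBuffer(command), binary.BigEndian, &version)
	return
}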
// ApplyMemberChange supports adding new raft member or deleting an existing raft member.
// It does not support updating an existing member at this point.
func (dp *DataPartition) ApplyMemberChange(confChange *raftproto.ConfChange, index uint64) (resp interface{}, err error) {
defer func(index uint64) {
if err == nil {
dp.uploadApplyID(index)
} else {
err = fmt.Errorf("[ApplyMemberChange] ApplyID(%v) Partition(%v) apply err(%v)]", index, dp.partitionID, err)
exporter.Warning(err.Error())
panic(newRaftApplyError(err))
}
}(index)
// Change the status in memory
var (
isUpdated bool
)
switch confChange.Type {
case raftproto.ConfAddNode:
req := &proto.AddDataPartitionRaftMemberRequest{}
if err = json.Unmarshal(confChange.Context, req); err != nil {
return
}
log.LogInfof("action[ApplyMemberChange] ConfAddNode [%v], partitionId [%v]", req.AddPeer, req.PartitionId)
isUpdated, err = dp.addRaftNode(req, index)
if isUpdated && err == nil {
// Perform the replica update asynchronously after the member-change apply has finished.
updateWG := sync.WaitGroup{}
updateWG.Add(1)
go func() {
defer updateWG.Done()
// may fetch a stale replica list, e.g. 3 replicas reported as 2 while the add-raft-member request has not yet returned
//if err = dp.updateReplicas(true); err != nil {
// log.LogErrorf("ApplyMemberChange: update partition %v replicas failed: %v", dp.partitionID, err)
// return
//}
if dp.isLeader {
dp.ExtentStore().MoveAllToBrokenTinyExtentC(storage.TinyExtentCount)
}
}()
updateWG.Wait()
}
case raftproto.ConfRemoveNode:
req := &proto.RemoveDataPartitionRaftMemberRequest{}
if err = json.Unmarshal(confChange.Context, req); err != nil {
return
}
log.LogInfof("action[ApplyMemberChange] ConfRemoveNode [%v], partitionId [%v]", req.RemovePeer, req.PartitionId)
isUpdated, err = dp.removeRaftNode(req, index)
case raftproto.ConfUpdateNode:
log.LogDebugf("[updateRaftNode]: not support.")
default:
// do nothing
}
if err != nil {
log.LogErrorf("action[ApplyMemberChange] dp(%v) type(%v) err(%v).", dp.partitionID, confChange.Type, err)
if IsDiskErr(err.Error()) {
panic(newRaftApplyError(err))
}
return
}
if isUpdated {
dp.DataPartitionCreateType = proto.NormalCreateDataPartition
if err = dp.PersistMetadata(); err != nil {
log.LogErrorf("action[ApplyMemberChange] dp(%v) PersistMetadata err(%v).", dp.partitionID, err)
if IsDiskErr(err.Error()) {
panic(newRaftApplyError(err))
}
return
}
}
return
}
// Snapshot persists the in-memory data (as a snapshot) to the disk.
// Note that the data in each data partition has already been saved on the disk. Therefore there is no need to take the
// snapshot in this case.
func (dp *DataPartition) Snapshot() (raftproto.Snapshot, error) {
snapIterator := NewItemIterator(dp.raftPartition.AppliedIndex())
log.LogInfof("SendSnapShot PartitionID(%v) Snapshot lastTruncateID(%v) currentApplyID(%v) firstCommitID(%v)",
dp.partitionID, dp.lastTruncateID, dp.appliedID, dp.raftPartition.CommittedIndex())
return snapIterator, nil
}
// ApplySnapshot asks the raft leader for the snapshot data to recover the contents on the local disk.
func (dp *DataPartition) ApplySnapshot(peers []raftproto.Peer, iterator raftproto.SnapIterator) (err error) {
// Raft logs that have not been applied are never deleted, so a snapshot is not needed.
log.LogInfof("PartitionID(%v) ApplySnapshot to (%v)", dp.partitionID, dp.raftPartition.CommittedIndex())
return
}
// HandleFatalEvent notifies the application when panic happens.
func (dp *DataPartition) HandleFatalEvent(err *raft.FatalError) {
if isRaftApplyError(err.Err.Error()) {
dp.stopRaft()
dp.checkIsDiskError(err.Err, 0)
log.LogCriticalf("action[HandleFatalEvent] raft apply err(%v), partitionId:%v", err, dp.partitionID)
} else {
log.LogFatalf("action[HandleFatalEvent] err(%v), partitionId:%v", err, dp.partitionID)
}
}
// HandleLeaderChange notifies the application when the raft leader has changed.
func (dp *DataPartition) HandleLeaderChange(leader uint64) {
defer func() {
if r := recover(); r != nil {
mesg := fmt.Sprintf("HandleLeaderChange(%v) Raft Panic (%v)", dp.partitionID, r)
panic(mesg)
}
}()
if dp.config.NodeID == leader {
dp.isRaftLeader = true
}
}
// Put submits the raft log to the raft store.
func (dp *DataPartition) Put(key interface{}, val interface{}) (resp interface{}, err error) {
if dp.raftStopped() {
err = fmt.Errorf("%s key=%v", RaftNotStarted, key)
return
}
resp, err = dp.raftPartition.Submit(val.([]byte))
return
}
// Get returns the raft log based on the given key. It is not needed for replicating data partition.
func (dp *DataPartition) Get(key interface{}) (interface{}, error) {
return nil, nil
}
// Del deletes the raft log based on the given key. It is not needed for replicating data partition.
func (dp *DataPartition) Del(key interface{}) (interface{}, error) {
return nil, nil
}
func (dp *DataPartition) uploadApplyID(applyID uint64) {
atomic.StoreUint64(&dp.appliedID, applyID)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"bytes"
"errors"
"fmt"
"net"
"net/http"
"os"
"os/exec"
"regexp"
"runtime"
"strconv"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"github.com/cubefs/cubefs/cmd/common"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/raftstore"
"github.com/cubefs/cubefs/repl"
masterSDK "github.com/cubefs/cubefs/sdk/master"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/atomicutil"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/loadutil"
"github.com/cubefs/cubefs/util/log"
"github.com/xtaci/smux"
)
var (
ErrIncorrectStoreType = errors.New("Incorrect store type")
ErrNoSpaceToCreatePartition = errors.New("No disk space to create a data partition")
ErrNewSpaceManagerFailed = errors.New("Create new space manager failed")
ErrGetMasterDatanodeInfoFailed = errors.New("Failed to get datanode info from master")
LocalIP, serverPort string
gConnPool = util.NewConnectPool()
// MasterClient = masterSDK.NewMasterClient(nil, false)
MasterClient *masterSDK.MasterCLientWithResolver
)
const (
DefaultZoneName = proto.DefaultZoneName
DefaultRaftDir = "raft"
DefaultRaftLogsToRetain = 10 // Count of raft logs per data partition
DefaultDiskMaxErr = 1
DefaultDiskRetainMin = 5 * util.GB // GB
DefaultNameResolveInterval = 1 // minutes
DefaultDiskUnavailableErrorCount = 5
DefaultDiskUnavailablePartitionErrorCount = 3
)
const (
ModuleName = "dataNode"
)
const (
ConfigKeyLocalIP = "localIP" // string
ConfigKeyPort = "port" // int
ConfigKeyMasterAddr = "masterAddr" // array
ConfigKeyZone = "zoneName" // string
ConfigKeyDisks = "disks" // array
ConfigKeyRaftDir = "raftDir" // string
ConfigKeyRaftHeartbeat = "raftHeartbeat" // string
ConfigKeyRaftReplica = "raftReplica" // string
CfgTickInterval = "tickInterval" // int
CfgRaftRecvBufSize = "raftRecvBufSize" // int
ConfigKeyDiskPath = "diskPath" // string
configNameResolveInterval = "nameResolveInterval" // int
/*
* Metrics Degrade Level
* minus value: turn off metrics collection.
* 0 or 1: full metrics.
* 2: 1/2 of the metrics will be collected.
* 3: 1/3 of the metrics will be collected.
* ...
*/
CfgMetricsDegrade = "metricsDegrade" // int
CfgDiskRdonlySpace = "diskRdonlySpace" // int
// smux Config
ConfigKeyEnableSmuxClient = "enableSmuxConnPool" // bool
ConfigKeySmuxPortShift = "smuxPortShift" // int
ConfigKeySmuxMaxConn = "smuxMaxConn" // int
ConfigKeySmuxStreamPerConn = "smuxStreamPerConn" // int
ConfigKeySmuxMaxBuffer = "smuxMaxBuffer" // int
ConfigKeySmuxTotalStream = "sumxTotalStream" // int
// rate limit control enable
ConfigDiskQosEnable = "diskQosEnable" // bool
ConfigDiskReadIocc = "diskReadIocc" // int
ConfigDiskReadIops = "diskReadIops" // int
ConfigDiskReadFlow = "diskReadFlow" // int
ConfigDiskWriteIocc = "diskWriteIocc" // int
ConfigDiskWriteIops = "diskWriteIops" // int
ConfigDiskWriteFlow = "diskWriteFlow" // int
ConfigServiceIDKey = "serviceIDKey"
// disk status becomes unavailable if disk error partition count reaches this value
ConfigKeyDiskUnavailablePartitionErrorCount = "diskUnavailablePartitionErrorCount"
)
const cpuSampleDuration = 1 * time.Second
// DataNode defines the structure of a data node.
type DataNode struct {
space *SpaceManager
port string
zoneName string
clusterID string
localIP string
bindIp bool
localServerAddr string
nodeID uint64
raftDir string
raftHeartbeat string
raftReplica string
raftStore raftstore.RaftStore
tickInterval int
raftRecvBufSize int
startTime int64
tcpListener net.Listener
stopC chan bool
smuxPortShift int
enableSmuxConnPool bool
smuxConnPool *util.SmuxConnectPool
smuxListener net.Listener
smuxServerConfig *smux.Config
smuxConnPoolConfig *util.SmuxConnPoolConfig
getRepairConnFunc func(target string) (net.Conn, error)
putRepairConnFunc func(conn net.Conn, forceClose bool)
metrics *DataNodeMetrics
metricsDegrade int64
metricsCnt uint64
volUpdating sync.Map // map[string]*verOp2Phase
control common.Control
diskQosEnable bool
diskQosEnableFromMaster bool
diskReadIocc int
diskReadIops int
diskReadFlow int
diskWriteIocc int
diskWriteIops int
diskWriteFlow int
dpMaxRepairErrCnt uint64
dpRepairTimeOut uint64
clusterUuid string
clusterUuidEnable bool
serviceIDKey string
cpuUtil atomicutil.Float64
cpuSamplerDone chan struct{}
diskUnavailablePartitionErrorCount uint64 // disk status becomes unavailable when disk error partition count reaches this value
}
type verOp2Phase struct {
verSeq uint64
verPrepare uint64
status uint32
step uint32
op uint8
sync.Mutex
}
func NewServer() *DataNode {
return &DataNode{}
}
func (s *DataNode) Start(cfg *config.Config) (err error) {
runtime.GOMAXPROCS(runtime.NumCPU())
return s.control.Start(s, cfg, doStart)
}
// Shutdown shuts down the current data node.
func (s *DataNode) Shutdown() {
s.control.Shutdown(s, doShutdown)
}
// Sync keeps data node in sync.
func (s *DataNode) Sync() {
s.control.Sync()
}
// Workflow of starting up a data node.
func doStart(server common.Server, cfg *config.Config) (err error) {
s, ok := server.(*DataNode)
if !ok {
return errors.New("Invalid node Type!")
}
s.stopC = make(chan bool)
// parse the config file
if err = s.parseConfig(cfg); err != nil {
return
}
exporter.Init(ModuleName, cfg)
s.registerMetrics()
s.register(cfg)
// parse the smux config
if err = s.parseSmuxConfig(cfg); err != nil {
return
}
// connection pool must be created before initSpaceManager
s.initConnPool()
// init limit
initRepairLimit()
// start the raft server
if err = s.startRaftServer(cfg); err != nil {
return
}
// create space manager (disk, partition, etc.)
if err = s.startSpaceManager(cfg); err != nil {
return
}
// check local partitions against the master; if any are missing, do not start
if _, err = s.checkLocalPartitionMatchWithMaster(); err != nil {
log.LogError(err)
exporter.Warning(err.Error())
return
}
// tcp listening & tcp connection pool
if err = s.startTCPService(); err != nil {
return
}
// smux listening & smux connection pool
if err = s.startSmuxService(cfg); err != nil {
return
}
go s.registerHandler()
s.scheduleTask()
// start metrics (LackDpCount, etc.)
s.startMetrics()
// start cpu sampler
s.startCpuSample()
return
}
func doShutdown(server common.Server) {
s, ok := server.(*DataNode)
if !ok {
return
}
s.closeMetrics()
close(s.stopC)
s.space.Stop()
s.stopUpdateNodeInfo()
s.stopTCPService()
s.stopRaftServer()
s.stopSmuxService()
s.closeSmuxConnPool()
MasterClient.Stop()
// stop cpu sample
close(s.cpuSamplerDone)
}
func (s *DataNode) parseConfig(cfg *config.Config) (err error) {
var (
port string
regexpPort *regexp.Regexp
)
LocalIP = cfg.GetString(ConfigKeyLocalIP)
port = cfg.GetString(proto.ListenPort)
s.bindIp = cfg.GetBool(proto.BindIpKey)
serverPort = port
if regexpPort, err = regexp.Compile(`^(\d)+$`); err != nil {
return fmt.Errorf("Err:no port")
}
if !regexpPort.MatchString(port) {
return fmt.Errorf("Err:port must string")
}
s.port = port
/*for _, ip := range cfg.GetSlice(proto.MasterAddr) {
MasterClient.AddNode(ip.(string))
}*/
updateInterval := cfg.GetInt(configNameResolveInterval)
if updateInterval <= 0 || updateInterval > 60 {
log.LogWarnf("name resolving interval[1-60] is set to default: %v", DefaultNameResolveInterval)
updateInterval = DefaultNameResolveInterval
}
addrs := cfg.GetSlice(proto.MasterAddr)
if len(addrs) == 0 {
return fmt.Errorf("Err:masterAddr unavalid")
}
masters := make([]string, 0, len(addrs))
for _, addr := range addrs {
masters = append(masters, addr.(string))
}
MasterClient = masterSDK.NewMasterCLientWithResolver(masters, false, updateInterval)
if MasterClient == nil {
err = fmt.Errorf("parseConfig: masters addrs format err[%v]", masters)
log.LogErrorf("parseConfig: masters addrs format err[%v]", masters)
return err
}
if err = MasterClient.Start(); err != nil {
return err
}
s.zoneName = cfg.GetString(ConfigKeyZone)
if s.zoneName == "" {
s.zoneName = DefaultZoneName
}
s.metricsDegrade = cfg.GetInt64(CfgMetricsDegrade)
s.serviceIDKey = cfg.GetString(ConfigServiceIDKey)
diskUnavailablePartitionErrorCount := cfg.GetInt64(ConfigKeyDiskUnavailablePartitionErrorCount)
if diskUnavailablePartitionErrorCount <= 0 || diskUnavailablePartitionErrorCount > 100 {
diskUnavailablePartitionErrorCount = DefaultDiskUnavailablePartitionErrorCount
log.LogDebugf("action[parseConfig] ConfigKeyDiskUnavailablePartitionErrorCount(%v) out of range, set as default(%v)",
diskUnavailablePartitionErrorCount, DefaultDiskUnavailablePartitionErrorCount)
}
s.diskUnavailablePartitionErrorCount = uint64(diskUnavailablePartitionErrorCount)
log.LogDebugf("action[parseConfig] load diskUnavailablePartitionErrorCount(%v)", s.diskUnavailablePartitionErrorCount)
log.LogDebugf("action[parseConfig] load masterAddrs(%v).", MasterClient.Nodes())
log.LogDebugf("action[parseConfig] load port(%v).", s.port)
log.LogDebugf("action[parseConfig] load zoneName(%v).", s.zoneName)
return
}
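// An illustrative config fragment using the key names from the const block
// above (placeholder values only, not recommendations; the exact set of keys
// consumed at startup is defined by parseConfig, parseRaftConfig and
// parseSmuxConfig):
//
//	{
//	    "localIP":       "192.168.0.11",
//	    "port":          "17310",
//	    "zoneName":      "default",
//	    "masterAddr":    ["192.168.0.1:17010", "192.168.0.2:17010"],
//	    "disks":         ["/data/disk1:10737418240"],
//	    "raftDir":       "/data/raft",
//	    "raftHeartbeat": "17330",
//	    "raftReplica":   "17340"
//	}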
func (s *DataNode) initQosLimit(cfg *config.Config) {
dn := s.space.dataNode
dn.diskQosEnable = cfg.GetBoolWithDefault(ConfigDiskQosEnable, true)
dn.diskReadIocc = cfg.GetInt(ConfigDiskReadIocc)
dn.diskReadIops = cfg.GetInt(ConfigDiskReadIops)
dn.diskReadFlow = cfg.GetInt(ConfigDiskReadFlow)
dn.diskWriteIocc = cfg.GetInt(ConfigDiskWriteIocc)
dn.diskWriteIops = cfg.GetInt(ConfigDiskWriteIops)
dn.diskWriteFlow = cfg.GetInt(ConfigDiskWriteFlow)
log.LogWarnf("action[initQosLimit] set qos [%v], read(iocc:%d iops:%d flow:%d) write(iocc:%d iops:%d flow:%d)",
dn.diskQosEnable, dn.diskReadIocc, dn.diskReadIops, dn.diskReadFlow, dn.diskWriteIocc, dn.diskWriteIops, dn.diskWriteFlow)
}
func (s *DataNode) updateQosLimit() {
for _, disk := range s.space.disks {
disk.updateQosLimiter()
}
}
func (s *DataNode) startSpaceManager(cfg *config.Config) (err error) {
s.startTime = time.Now().Unix()
s.space = NewSpaceManager(s)
if len(strings.TrimSpace(s.port)) == 0 {
err = ErrNewSpaceManagerFailed
return
}
s.space.SetRaftStore(s.raftStore)
s.space.SetNodeID(s.nodeID)
s.space.SetClusterID(s.clusterID)
s.initQosLimit(cfg)
diskRdonlySpace := uint64(cfg.GetInt64(CfgDiskRdonlySpace))
if diskRdonlySpace < DefaultDiskRetainMin {
diskRdonlySpace = DefaultDiskRetainMin
}
log.LogInfof("startSpaceManager preReserveSpace %d", diskRdonlySpace)
paths := make([]string, 0)
diskPath := cfg.GetString(ConfigKeyDiskPath)
if diskPath != "" {
paths, err = parseDiskPath(diskPath)
if err != nil {
log.LogErrorf("parse diskpath failed, path %s, err %s", diskPath, err.Error())
return err
}
} else {
for _, p := range cfg.GetSlice(ConfigKeyDisks) {
paths = append(paths, p.(string))
}
}
var wg sync.WaitGroup
for _, d := range paths {
log.LogDebugf("action[startSpaceManager] load disk raw config(%v).", d)
// format "PATH:RESET_SIZE
arr := strings.Split(d, ":")
if len(arr) != 2 {
return errors.New("Invalid disk configuration. Example: PATH:RESERVE_SIZE")
}
path := arr[0]
fileInfo, err := os.Stat(path)
if err != nil {
log.LogErrorf("Stat disk path [%v] error: [%s]", path, err)
continue
}
if !fileInfo.IsDir() {
return errors.New("Disk path is not dir")
}
if s.clusterUuidEnable {
if err = config.CheckOrStoreClusterUuid(path, s.clusterUuid, false); err != nil {
log.LogErrorf("CheckOrStoreClusterUuid failed: %v", err)
return fmt.Errorf("CheckOrStoreClusterUuid failed: %v", err.Error())
}
}
reservedSpace, err := strconv.ParseUint(arr[1], 10, 64)
if err != nil {
return fmt.Errorf("Invalid disk reserved space. Error: %s", err.Error())
}
if reservedSpace < DefaultDiskRetainMin {
reservedSpace = DefaultDiskRetainMin
}
wg.Add(1)
go func(wg *sync.WaitGroup, path string, reservedSpace uint64) {
defer wg.Done()
s.space.LoadDisk(path, reservedSpace, diskRdonlySpace, DefaultDiskMaxErr)
}(&wg, path, reservedSpace)
}
wg.Wait()
// start async sample
s.space.StartDiskSample()
s.updateQosLimit() // load from config
return nil
}
// parseDiskPath executes a shell command to find all mount points matching the configured prefix.
// output example: /disk1:1024, /disk2:1024
func parseDiskPath(pathStr string) (disks []string, err error) {
log.LogInfof("parse diskpath, %s", pathStr)
arr := strings.Split(pathStr, ":")
if len(arr) != 2 {
return disks, fmt.Errorf("diskPath cfg should be diskPathPrefix:RESERVE_SIZE")
}
shell := fmt.Sprintf("mount | grep %s | awk '{print $3}'", arr[0])
cmd := exec.Command("/bin/sh", "-c", shell)
log.LogWarnf("execute diskPath shell, %s", shell)
out, err := cmd.CombinedOutput()
if err != nil {
return disks, fmt.Errorf("execute shell failed, %s", err.Error())
}
disks = make([]string, 0)
lines := bytes.Split(out, []byte("\n"))
for _, line := range lines {
str := strings.TrimSpace(string(line))
if str == "" {
continue
}
disks = append(disks, fmt.Sprintf("%s:%s", str, arr[1]))
}
return disks, nil
}
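// For example, with a diskPath value of "/data/disk:10737418240" the shell
// above becomes `mount | grep /data/disk | awk '{print $3}'`; if it prints
// /data/disk1 and /data/disk2, parseDiskPath returns
// ["/data/disk1:10737418240", "/data/disk2:10737418240"], i.e. every matching
// mount point paired with the shared reserve size (illustrative paths only).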
// register registers the data node on the master and reports information such as the IP address.
// The startup of a data node will be blocked until the registration succeeds.
func (s *DataNode) register(cfg *config.Config) {
var err error
timer := time.NewTimer(0)
// get the IP address, cluster ID and node ID from the master
for {
select {
case <-timer.C:
var ci *proto.ClusterInfo
if ci, err = MasterClient.AdminAPI().GetClusterInfo(); err != nil {
log.LogErrorf("action[registerToMaster] cannot get ip from master(%v) err(%v).",
MasterClient.Leader(), err)
timer.Reset(2 * time.Second)
continue
}
masterAddr := MasterClient.Leader()
s.clusterUuid = ci.ClusterUuid
s.clusterUuidEnable = ci.ClusterUuidEnable
s.clusterID = ci.Cluster
if LocalIP == "" {
LocalIP = string(ci.Ip)
}
s.localServerAddr = fmt.Sprintf("%s:%v", LocalIP, s.port)
if !util.IsIPV4(LocalIP) {
log.LogErrorf("action[registerToMaster] got an invalid local ip(%v) from master(%v).",
LocalIP, masterAddr)
timer.Reset(2 * time.Second)
continue
}
// register this data node on the master
var nodeID uint64
if nodeID, err = MasterClient.NodeAPI().AddDataNodeWithAuthNode(fmt.Sprintf("%s:%v", LocalIP, s.port),
s.zoneName, s.serviceIDKey); err != nil {
log.LogErrorf("action[registerToMaster] cannot register this node to master[%v] err(%v).",
masterAddr, err)
timer.Reset(2 * time.Second)
continue
}
exporter.RegistConsul(s.clusterID, ModuleName, cfg)
s.nodeID = nodeID
log.LogDebugf("register: register DataNode: nodeID(%v)", s.nodeID)
return
case <-s.stopC:
timer.Stop()
return
}
}
}
type DataNodeInfo struct {
Addr string
PersistenceDataPartitions []uint64
}
func (s *DataNode) checkLocalPartitionMatchWithMaster() (lackPartitions []uint64, err error) {
convert := func(node *proto.DataNodeInfo) *DataNodeInfo {
result := &DataNodeInfo{}
result.Addr = node.Addr
result.PersistenceDataPartitions = node.PersistenceDataPartitions
return result
}
var dataNode *proto.DataNodeInfo
for i := 0; i < 3; i++ {
if dataNode, err = MasterClient.NodeAPI().GetDataNode(s.localServerAddr); err != nil {
log.LogErrorf("checkLocalPartitionMatchWithMaster error %v", err)
continue
}
break
}
if dataNode == nil {
err = ErrGetMasterDatanodeInfoFailed
return
}
dinfo := convert(dataNode)
if len(dinfo.PersistenceDataPartitions) == 0 {
return
}
for _, partitionID := range dinfo.PersistenceDataPartitions {
dp := s.space.Partition(partitionID)
if dp == nil {
lackPartitions = append(lackPartitions, partitionID)
}
}
if len(lackPartitions) == 0 {
log.LogInfo("checkLocalPartitionMatchWithMaster no lack")
} else {
log.LogErrorf("checkLocalPartitionMatchWithMaster lack ids [%v]", lackPartitions)
}
return
}
func (s *DataNode) checkPartitionInMemoryMatchWithInDisk() (lackPartitions []uint64) {
s.space.partitionMutex.RLock()
partitions := make([]*DataPartition, 0)
for _, dp := range s.space.partitions {
partitions = append(partitions, dp)
}
s.space.partitionMutex.RUnlock()
for _, dp := range partitions {
stat, err := os.Stat(dp.path)
if err != nil {
lackPartitions = append(lackPartitions, dp.partitionID)
log.LogErrorf("action[checkPartitionInMemoryMatchWithInDisk] stat dataPartition[%v] fail, path[%v], err[%v]", dp.partitionID, dp.Path(), err)
continue
}
if !stat.IsDir() {
lackPartitions = append(lackPartitions, dp.partitionID)
log.LogErrorf("action[checkPartitionInMemoryMatchWithInDisk] dataPartition[%v] is not directory, path[%v]", dp.partitionID, dp.Path())
continue
}
}
return
}
func (s *DataNode) registerHandler() {
http.HandleFunc("/disks", s.getDiskAPI)
http.HandleFunc("/partitions", s.getPartitionsAPI)
http.HandleFunc("/partition", s.getPartitionAPI)
http.HandleFunc("/extent", s.getExtentAPI)
http.HandleFunc("/block", s.getBlockCrcAPI)
http.HandleFunc("/stats", s.getStatAPI)
http.HandleFunc("/raftStatus", s.getRaftStatus)
http.HandleFunc("/setAutoRepairStatus", s.setAutoRepairStatus)
http.HandleFunc("/getTinyDeleted", s.getTinyDeleted)
http.HandleFunc("/getNormalDeleted", s.getNormalDeleted)
http.HandleFunc("/getSmuxPoolStat", s.getSmuxPoolStat())
http.HandleFunc("/setMetricsDegrade", s.setMetricsDegrade)
http.HandleFunc("/getMetricsDegrade", s.getMetricsDegrade)
http.HandleFunc("/qosEnable", s.setQosEnable())
http.HandleFunc("/genClusterVersionFile", s.genClusterVersionFile)
http.HandleFunc("/setDiskBad", s.setDiskBadAPI)
http.HandleFunc("/setDiskQos", s.setDiskQos)
http.HandleFunc("/getDiskQos", s.getDiskQos)
}
func (s *DataNode) startTCPService() (err error) {
log.LogInfo("Start: startTCPService")
addr := fmt.Sprintf(":%v", s.port)
if s.bindIp {
addr = fmt.Sprintf("%s:%v", LocalIP, s.port)
}
l, err := net.Listen(NetworkProtocol, addr)
log.LogDebugf("action[startTCPService] listen %v address(%v).", NetworkProtocol, addr)
if err != nil {
log.LogError("failed to listen, err:", err)
return
}
s.tcpListener = l
go func(ln net.Listener) {
for {
conn, err := ln.Accept()
if err != nil {
log.LogErrorf("action[startTCPService] failed to accept, err:%s", err.Error())
break
}
log.LogDebugf("action[startTCPService] accept connection from %s.", conn.RemoteAddr().String())
go s.serveConn(conn)
}
}(l)
return
}
func (s *DataNode) stopTCPService() (err error) {
if s.tcpListener != nil {
s.tcpListener.Close()
log.LogDebugf("action[stopTCPService] stop tcp service.")
}
return
}
func (s *DataNode) serveConn(conn net.Conn) {
space := s.space
space.Stats().AddConnection()
c, _ := conn.(*net.TCPConn)
c.SetKeepAlive(true)
c.SetNoDelay(true)
packetProcessor := repl.NewReplProtocol(conn, s.Prepare, s.OperatePacket, s.Post)
packetProcessor.ServerConn()
space.Stats().RemoveConnection()
}
func (s *DataNode) startSmuxService(cfg *config.Config) (err error) {
log.LogInfo("Start: startSmuxService")
addr := fmt.Sprintf(":%v", s.port)
if s.bindIp {
addr = fmt.Sprintf("%s:%v", LocalIP, s.port)
}
addr = util.ShiftAddrPort(addr, s.smuxPortShift)
log.LogInfof("SmuxListenAddr: (%v)", addr)
// server
l, err := net.Listen(NetworkProtocol, addr)
log.LogDebugf("action[startSmuxService] listen %v address(%v).", NetworkProtocol, addr)
if err != nil {
log.LogError("failed to listen smux addr, err:", err)
return
}
s.smuxListener = l
go func(ln net.Listener) {
for {
conn, err := ln.Accept()
if err != nil {
log.LogErrorf("action[startSmuxService] failed to accept, err:%s", err.Error())
break
}
log.LogDebugf("action[startSmuxService] accept connection from %s.", conn.RemoteAddr().String())
go s.serveSmuxConn(conn)
}
}(l)
return
}
func (s *DataNode) stopSmuxService() (err error) {
if s.smuxListener != nil {
s.smuxListener.Close()
log.LogDebugf("action[stopSmuxService] stop smux service.")
}
return
}
func (s *DataNode) serveSmuxConn(conn net.Conn) {
space := s.space
space.Stats().AddConnection()
c, _ := conn.(*net.TCPConn)
c.SetKeepAlive(true)
c.SetNoDelay(true)
var sess *smux.Session
var err error
sess, err = smux.Server(conn, s.smuxServerConfig)
if err != nil {
log.LogErrorf("action[serveSmuxConn] failed to serve smux connection, addr(%v), err(%v)", c.RemoteAddr(), err)
return
}
defer func() {
sess.Close()
space.Stats().RemoveConnection()
}()
for {
stream, err := sess.AcceptStream()
if err != nil {
if util.FilterSmuxAcceptError(err) != nil {
log.LogErrorf("action[startSmuxService] failed to accept, err: %s", err)
} else {
log.LogInfof("action[startSmuxService] accept done, err: %s", err)
}
break
}
go s.serveSmuxStream(stream)
}
}
func (s *DataNode) serveSmuxStream(stream *smux.Stream) {
packetProcessor := repl.NewReplProtocol(stream, s.Prepare, s.OperatePacket, s.Post)
if s.enableSmuxConnPool {
packetProcessor.SetSmux(s.getRepairConnFunc, s.putRepairConnFunc)
}
packetProcessor.ServerConn()
}
func (s *DataNode) parseSmuxConfig(cfg *config.Config) error {
s.enableSmuxConnPool = cfg.GetBool(ConfigKeyEnableSmuxClient)
s.smuxPortShift = int(cfg.GetInt64(ConfigKeySmuxPortShift))
if s.smuxPortShift == 0 {
s.smuxPortShift = util.DefaultSmuxPortShift
}
// smux server cfg
s.smuxServerConfig = util.DefaultSmuxConfig()
maxBuffer := cfg.GetInt64(ConfigKeySmuxMaxBuffer)
if maxBuffer > 0 {
s.smuxServerConfig.MaxReceiveBuffer = int(maxBuffer)
if s.smuxServerConfig.MaxStreamBuffer > int(maxBuffer) {
s.smuxServerConfig.MaxStreamBuffer = int(maxBuffer)
}
if err := smux.VerifyConfig(s.smuxServerConfig); err != nil {
return err
}
}
// smux conn pool config
if s.enableSmuxConnPool {
s.smuxConnPoolConfig = util.DefaultSmuxConnPoolConfig()
if maxBuffer > 0 {
s.smuxConnPoolConfig.MaxReceiveBuffer = int(maxBuffer)
if s.smuxConnPoolConfig.MaxStreamBuffer > int(maxBuffer) {
s.smuxConnPoolConfig.MaxStreamBuffer = int(maxBuffer)
}
}
maxConn := cfg.GetInt64(ConfigKeySmuxMaxConn)
if maxConn > 0 {
if s.smuxConnPoolConfig.ConnsPerAddr < int(maxConn) {
s.smuxConnPoolConfig.ConnsPerAddr = int(maxConn)
}
}
maxStreamPerConn := cfg.GetInt64(ConfigKeySmuxStreamPerConn)
if maxStreamPerConn > 0 {
s.smuxConnPoolConfig.StreamsPerConn = int(maxStreamPerConn)
}
totalStreams := cfg.GetInt64(ConfigKeySmuxTotalStream)
if totalStreams > 0 {
s.smuxConnPoolConfig.TotalStreams = int(totalStreams)
}
if err := util.VerifySmuxPoolConfig(s.smuxConnPoolConfig); err != nil {
return err
}
}
log.LogDebugf("[parseSmuxConfig] load smuxPortShift(%v).", s.smuxPortShift)
log.LogDebugf("[parseSmuxConfig] load enableSmuxConnPool(%v).", s.enableSmuxConnPool)
log.LogDebugf("[parseSmuxConfig] load smuxServerConfig(%v).", s.smuxServerConfig)
log.LogDebugf("[parseSmuxConfig] load smuxConnPoolConfig(%v).", s.smuxConnPoolConfig)
return nil
}
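// An illustrative smux fragment using the keys from the const block above
// (placeholder values only, not tuning advice):
//
//	{
//	    "enableSmuxConnPool": true,
//	    "smuxPortShift":      500,
//	    "smuxMaxConn":        10,
//	    "smuxStreamPerConn":  2,
//	    "smuxMaxBuffer":      1048576
//	}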
func (s *DataNode) initConnPool() {
if s.enableSmuxConnPool {
log.LogInfof("Start: init smux conn pool")
s.smuxConnPool = util.NewSmuxConnectPool(s.smuxConnPoolConfig)
s.getRepairConnFunc = func(target string) (net.Conn, error) {
addr := util.ShiftAddrPort(target, s.smuxPortShift)
log.LogDebugf("[dataNode.getRepairConnFunc] get smux conn, addr(%v)", addr)
return s.smuxConnPool.GetConnect(addr)
}
s.putRepairConnFunc = func(conn net.Conn, forceClose bool) {
log.LogDebugf("[dataNode.putRepairConnFunc] put smux conn, addr(%v), forceClose(%v)", conn.RemoteAddr().String(), forceClose)
s.smuxConnPool.PutConnect(conn.(*smux.Stream), forceClose)
}
} else {
s.getRepairConnFunc = func(target string) (conn net.Conn, err error) {
log.LogDebugf("[dataNode.getRepairConnFunc] get tcp conn, addr(%v)", target)
return gConnPool.GetConnect(target)
}
s.putRepairConnFunc = func(conn net.Conn, forceClose bool) {
log.LogDebugf("[dataNode.putRepairConnFunc] put tcp conn, addr(%v), forceClose(%v)", conn.RemoteAddr().String(), forceClose)
gConnPool.PutConnect(conn.(*net.TCPConn), forceClose)
}
}
}
func (s *DataNode) closeSmuxConnPool() {
if s.smuxConnPool != nil {
s.smuxConnPool.Close()
log.LogDebugf("action[stopSmuxService] stop smux conn pool")
}
}
func (s *DataNode) shallDegrade() bool {
level := atomic.LoadInt64(&s.metricsDegrade)
if level < 0 {
return true
}
if level == 0 {
return false
}
cnt := atomic.LoadUint64(&s.metricsCnt)
return cnt%uint64(level) != 0
}
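// Worked example of the degrade levels documented in the const block above:
// with metricsDegrade = 3 and a steadily increasing metricsCnt, shallDegrade
// returns false only when cnt%3 == 0, so roughly 1/3 of the metrics are still
// collected; a negative level suppresses collection entirely, and 0 or 1
// keeps it at the full rate.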
func (s *DataNode) scheduleTask() {
go s.startUpdateNodeInfo()
s.scheduleToCheckLackPartitions()
}
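// startCpuSample launches a background loop that periodically samples CPU utilization
// until cpuSamplerDone is closed.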
func (s *DataNode) startCpuSample() {
s.cpuSamplerDone = make(chan struct{})
go func() {
for {
select {
case <-s.cpuSamplerDone:
return
default:
// GetCpuUtilPercent blocks for cpuSampleDuration while sampling
used, err := loadutil.GetCpuUtilPercent(cpuSampleDuration)
if err == nil {
s.cpuUtil.Store(used)
}
}
}
}()
}
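// scheduleToCheckLackPartitions periodically compares the partitions reported by the
// master with those loaded in memory, and the in-memory partitions with those present
// on disk, logging and exporting any that are missing.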
func (s *DataNode) scheduleToCheckLackPartitions() {
go func() {
for {
lackPartitionsInMem, err := s.checkLocalPartitionMatchWithMaster()
if err != nil {
log.LogError(err)
}
if len(lackPartitionsInMem) > 0 {
err = fmt.Errorf("action[scheduleToLackDataPartitions] lackPartitions %v in datanode %v memory",
lackPartitionsInMem, s.localServerAddr)
log.LogErrorf(err.Error())
}
s.space.stats.updateMetricLackPartitionsInMem(uint64(len(lackPartitionsInMem)))
lackPartitionsInDisk := s.checkPartitionInMemoryMatchWithInDisk()
if len(lackPartitionsInDisk) > 0 {
err = fmt.Errorf("action[scheduleToLackDataPartitions] lackPartitions %v in datanode %v disk",
lackPartitionsInDisk, s.localServerAddr)
log.LogErrorf(err.Error())
}
s.space.stats.updateMetricLackPartitionsInDisk(uint64(len(lackPartitionsInDisk)))
time.Sleep(1 * time.Minute)
}
}()
}
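// IsDiskErr reports whether the error message indicates an I/O, read-only filesystem or
// permission error, all of which are treated as disk failures.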
func IsDiskErr(errMsg string) bool {
return strings.Contains(errMsg, syscall.EIO.Error()) ||
strings.Contains(errMsg, syscall.EROFS.Error()) ||
strings.Contains(errMsg, syscall.EACCES.Error())
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"encoding/json"
"fmt"
"net/http"
"os"
"path"
"strconv"
"sync/atomic"
"github.com/cubefs/cubefs/depends/tiglabs/raft"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/log"
)
var AutoRepairStatus = true
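// getDiskAPI returns the space usage and status of every disk managed by this datanode.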
func (s *DataNode) getDiskAPI(w http.ResponseWriter, r *http.Request) {
disks := make([]interface{}, 0)
for _, diskItem := range s.space.GetDisks() {
disk := &struct {
Path string `json:"path"`
Total uint64 `json:"total"`
Used uint64 `json:"used"`
Available uint64 `json:"available"`
Unallocated uint64 `json:"unallocated"`
Allocated uint64 `json:"allocated"`
Status int `json:"status"`
RestSize uint64 `json:"restSize"`
DiskRdoSize uint64 `json:"diskRdoSize"`
Partitions int `json:"partitions"`
Decommission bool `json:"decommission"`
}{
Path: diskItem.Path,
Total: diskItem.Total,
Used: diskItem.Used,
Available: diskItem.Available,
Unallocated: diskItem.Unallocated,
Allocated: diskItem.Allocated,
Status: diskItem.Status,
RestSize: diskItem.ReservedSpace,
DiskRdoSize: diskItem.DiskRdonlySpace,
Partitions: diskItem.PartitionCount(),
Decommission: diskItem.GetDecommissionStatus(),
}
disks = append(disks, disk)
}
diskReport := &struct {
Disks []interface{} `json:"disks"`
Zone string `json:"zone"`
}{
Disks: disks,
Zone: s.zoneName,
}
s.buildSuccessResp(w, diskReport)
}
func (s *DataNode) getStatAPI(w http.ResponseWriter, r *http.Request) {
response := &proto.DataNodeHeartbeatResponse{}
s.buildHeartBeatResponse(response)
s.buildSuccessResp(w, response)
}
func (s *DataNode) setAutoRepairStatus(w http.ResponseWriter, r *http.Request) {
const (
paramAutoRepair = "autoRepair"
)
if err := r.ParseForm(); err != nil {
err = fmt.Errorf("parse form fail: %v", err)
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
autoRepair, err := strconv.ParseBool(r.FormValue(paramAutoRepair))
if err != nil {
err = fmt.Errorf("parse param %v fail: %v", paramAutoRepair, err)
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
AutoRepairStatus = autoRepair
s.buildSuccessResp(w, autoRepair)
}
func (s *DataNode) getRaftStatus(w http.ResponseWriter, r *http.Request) {
const (
paramRaftID = "raftID"
)
if err := r.ParseForm(); err != nil {
err = fmt.Errorf("parse form fail: %v", err)
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
raftID, err := strconv.ParseUint(r.FormValue(paramRaftID), 10, 64)
if err != nil {
err = fmt.Errorf("parse param %v fail: %v", paramRaftID, err)
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
raftStatus := s.raftStore.RaftStatus(raftID)
s.buildSuccessResp(w, raftStatus)
}
func (s *DataNode) getPartitionsAPI(w http.ResponseWriter, r *http.Request) {
partitions := make([]interface{}, 0)
s.space.RangePartitions(func(dp *DataPartition) bool {
partition := &struct {
ID uint64 `json:"id"`
Size int `json:"size"`
Used int `json:"used"`
Status int `json:"status"`
Path string `json:"path"`
Replicas []string `json:"replicas"`
}{
ID: dp.partitionID,
Size: dp.Size(),
Used: dp.Used(),
Status: dp.Status(),
Path: dp.Path(),
Replicas: dp.Replicas(),
}
partitions = append(partitions, partition)
return true
})
result := &struct {
Partitions []interface{} `json:"partitions"`
PartitionCount int `json:"partitionCount"`
}{
Partitions: partitions,
PartitionCount: len(partitions),
}
s.buildSuccessResp(w, result)
}
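// getPartitionAPI returns the detailed state of a single data partition, including its
// extent watermarks and raft status.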
func (s *DataNode) getPartitionAPI(w http.ResponseWriter, r *http.Request) {
const (
paramPartitionID = "id"
)
var (
partitionID uint64
files []*storage.ExtentInfo
err error
tinyDeleteRecordSize int64
raftSt *raft.Status
)
if err = r.ParseForm(); err != nil {
err = fmt.Errorf("parse form fail: %v", err)
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
if partitionID, err = strconv.ParseUint(r.FormValue(paramPartitionID), 10, 64); err != nil {
err = fmt.Errorf("parse param %v fail: %v", paramPartitionID, err)
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
partition := s.space.Partition(partitionID)
if partition == nil {
s.buildFailureResp(w, http.StatusNotFound, "partition not exist")
return
}
if files, tinyDeleteRecordSize, err = partition.ExtentStore().GetAllWatermarks(nil); err != nil {
err = fmt.Errorf("get watermark fail: %v", err)
s.buildFailureResp(w, http.StatusInternalServerError, err.Error())
return
}
if partition.IsDataPartitionLoading() {
raftSt = &raft.Status{Stopped: true}
} else {
raftSt = partition.raftPartition.Status()
}
result := &struct {
VolName string `json:"volName"`
ID uint64 `json:"id"`
Size int `json:"size"`
Used int `json:"used"`
Status int `json:"status"`
Path string `json:"path"`
Files []*storage.ExtentInfo `json:"extents"`
FileCount int `json:"fileCount"`
Replicas []string `json:"replicas"`
TinyDeleteRecordSize int64 `json:"tinyDeleteRecordSize"`
RaftStatus *raft.Status `json:"raftStatus"`
}{
VolName: partition.volumeID,
ID: partition.partitionID,
Size: partition.Size(),
Used: partition.Used(),
Status: partition.Status(),
Path: partition.Path(),
Files: files,
FileCount: len(files),
Replicas: partition.Replicas(),
TinyDeleteRecordSize: tinyDeleteRecordSize,
RaftStatus: raftSt,
}
if partition.isNormalType() {
result.RaftStatus = partition.raftPartition.Status()
}
s.buildSuccessResp(w, result)
}
func (s *DataNode) getExtentAPI(w http.ResponseWriter, r *http.Request) {
var (
partitionID uint64
extentID int
err error
extentInfo *storage.ExtentInfo
)
if err = r.ParseForm(); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
if partitionID, err = strconv.ParseUint(r.FormValue("partitionID"), 10, 64); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
if extentID, err = strconv.Atoi(r.FormValue("extentID")); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
partition := s.space.Partition(partitionID)
if partition == nil {
s.buildFailureResp(w, http.StatusNotFound, "partition not exist")
return
}
if extentInfo, err = partition.ExtentStore().Watermark(uint64(extentID)); err != nil {
s.buildFailureResp(w, 500, err.Error())
return
}
s.buildSuccessResp(w, extentInfo)
}
func (s *DataNode) getBlockCrcAPI(w http.ResponseWriter, r *http.Request) {
var (
partitionID uint64
extentID int
err error
blocks []*storage.BlockCrc
)
if err = r.ParseForm(); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
if partitionID, err = strconv.ParseUint(r.FormValue("partitionID"), 10, 64); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
if extentID, err = strconv.Atoi(r.FormValue("extentID")); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
partition := s.space.Partition(partitionID)
if partition == nil {
s.buildFailureResp(w, http.StatusNotFound, "partition not exist")
return
}
if blocks, err = partition.ExtentStore().ScanBlocks(uint64(extentID)); err != nil {
s.buildFailureResp(w, 500, err.Error())
return
}
s.buildSuccessResp(w, blocks)
}
func (s *DataNode) getTinyDeleted(w http.ResponseWriter, r *http.Request) {
var (
partitionID uint64
err error
extentInfo []storage.ExtentDeleted
)
if err = r.ParseForm(); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
if partitionID, err = strconv.ParseUint(r.FormValue("id"), 10, 64); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
partition := s.space.Partition(partitionID)
if partition == nil {
s.buildFailureResp(w, http.StatusNotFound, "partition not exist")
return
}
if extentInfo, err = partition.ExtentStore().GetHasDeleteTinyRecords(); err != nil {
s.buildFailureResp(w, 500, err.Error())
return
}
s.buildSuccessResp(w, extentInfo)
}
func (s *DataNode) getNormalDeleted(w http.ResponseWriter, r *http.Request) {
var (
partitionID uint64
err error
extentInfo []storage.ExtentDeleted
)
if err = r.ParseForm(); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
if partitionID, err = strconv.ParseUint(r.FormValue("id"), 10, 64); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
partition := s.space.Partition(partitionID)
if partition == nil {
s.buildFailureResp(w, http.StatusNotFound, "partition not exist")
return
}
if extentInfo, err = partition.ExtentStore().GetHasDeleteExtent(); err != nil {
s.buildFailureResp(w, 500, err.Error())
return
}
s.buildSuccessResp(w, extentInfo)
}
func (s *DataNode) setQosEnable() func(http.ResponseWriter, *http.Request) {
return func(w http.ResponseWriter, r *http.Request) {
var (
err error
enable bool
)
if err = r.ParseForm(); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
if enable, err = strconv.ParseBool(r.FormValue("enable")); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
s.diskQosEnable = enable
s.buildSuccessResp(w, "success")
}
}
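// setDiskQos updates the disk QoS limits (iocc/iops/flow for read and write) from the
// request form values and re-applies the limits when anything changed.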
func (s *DataNode) setDiskQos(w http.ResponseWriter, r *http.Request) {
if err := r.ParseForm(); err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
parser := func(key string) (val int64, err error, has bool) {
valStr := r.FormValue(key)
if valStr == "" {
return 0, nil, false
}
has = true
val, err = strconv.ParseInt(valStr, 10, 64)
return
}
updated := false
for key, pVal := range map[string]*int{
ConfigDiskReadIocc: &s.diskReadIocc,
ConfigDiskReadIops: &s.diskReadIops,
ConfigDiskReadFlow: &s.diskReadFlow,
ConfigDiskWriteIocc: &s.diskWriteIocc,
ConfigDiskWriteIops: &s.diskWriteIops,
ConfigDiskWriteFlow: &s.diskWriteFlow,
} {
val, err, has := parser(key)
if err != nil {
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
if has {
updated = true
*pVal = int(val)
}
}
if updated {
s.updateQosLimit()
}
s.buildSuccessResp(w, "success")
}
func (s *DataNode) getDiskQos(w http.ResponseWriter, r *http.Request) {
disks := make([]interface{}, 0)
for _, diskItem := range s.space.GetDisks() {
disk := &struct {
Path string `json:"path"`
Read LimiterStatus `json:"read"`
Write LimiterStatus `json:"write"`
}{
Path: diskItem.Path,
Read: diskItem.limitRead.Status(),
Write: diskItem.limitWrite.Status(),
}
disks = append(disks, disk)
}
diskStatus := &struct {
Disks []interface{} `json:"disks"`
Zone string `json:"zone"`
}{
Disks: disks,
Zone: s.zoneName,
}
s.buildSuccessResp(w, diskStatus)
}
func (s *DataNode) getSmuxPoolStat() func(http.ResponseWriter, *http.Request) {
return func(w http.ResponseWriter, r *http.Request) {
if !s.enableSmuxConnPool {
s.buildFailureResp(w, 500, "smux pool not supported")
return
}
if s.smuxConnPool == nil {
s.buildFailureResp(w, 500, "smux pool now is nil")
return
}
stat := s.smuxConnPool.GetStat()
s.buildSuccessResp(w, stat)
}
}
func (s *DataNode) setMetricsDegrade(w http.ResponseWriter, r *http.Request) {
if err := r.ParseForm(); err != nil {
w.Write([]byte(err.Error()))
return
}
if level := r.FormValue("level"); level != "" {
val, err := strconv.Atoi(level)
if err != nil {
w.Write([]byte("Set metrics degrade level failed\n"))
} else {
atomic.StoreInt64(&s.metricsDegrade, int64(val))
w.Write([]byte(fmt.Sprintf("Set metrics degrade level to %v successfully\n", val)))
}
}
}
func (s *DataNode) getMetricsDegrade(w http.ResponseWriter, r *http.Request) {
w.Write([]byte(fmt.Sprintf("%v\n", atomic.LoadInt64(&s.metricsDegrade))))
}
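// genClusterVersionFile writes the cluster version file onto every partition disk and the
// raft directory, failing if any of those paths already contains one.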
func (s *DataNode) genClusterVersionFile(w http.ResponseWriter, r *http.Request) {
paths := make([]string, 0)
s.space.RangePartitions(func(partition *DataPartition) bool {
paths = append(paths, partition.disk.Path)
return true
})
paths = append(paths, s.raftDir)
for _, p := range paths {
if _, err := os.Stat(path.Join(p, config.ClusterVersionFile)); err == nil || os.IsExist(err) {
s.buildFailureResp(w, http.StatusCreated, "cluster version file already exists in "+p)
return
}
}
for _, p := range paths {
if err := config.CheckOrStoreClusterUuid(p, s.clusterUuid, true); err != nil {
s.buildFailureResp(w, http.StatusInternalServerError, "Failed to create cluster version file in "+p)
return
}
}
s.buildSuccessResp(w, "Generate cluster version file success")
}
func (s *DataNode) buildSuccessResp(w http.ResponseWriter, data interface{}) {
s.buildJSONResp(w, http.StatusOK, data, "")
}
func (s *DataNode) buildFailureResp(w http.ResponseWriter, code int, msg string) {
s.buildJSONResp(w, code, nil, msg)
}
// Create response for the API request.
func (s *DataNode) buildJSONResp(w http.ResponseWriter, code int, data interface{}, msg string) {
var (
jsonBody []byte
err error
)
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(code)
body := proto.HTTPReply{Code: int32(code), Msg: msg, Data: data}
if jsonBody, err = json.Marshal(body); err != nil {
return
}
w.Write(jsonBody)
}
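// setDiskBadAPI manually marks the disk at the given path as unavailable, triggering the
// normal disk-error handling.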
func (s *DataNode) setDiskBadAPI(w http.ResponseWriter, r *http.Request) {
const (
paramDiskPath = "diskPath"
)
var (
err error
diskPath string
disk *Disk
)
if err = r.ParseForm(); err != nil {
err = fmt.Errorf("parse form fail: %v", err)
log.LogErrorf("[setDiskBadAPI] %v", err.Error())
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
if diskPath = r.FormValue(paramDiskPath); diskPath == "" {
err = fmt.Errorf("param(%v) is empty", paramDiskPath)
log.LogErrorf("[setDiskBadAPI] %v", err.Error())
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
if disk, err = s.space.GetDisk(diskPath); err != nil {
err = fmt.Errorf("not exit such dissk, path: %v", diskPath)
log.LogErrorf("[setDiskBadAPI] %v", err.Error())
s.buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
if disk.Status == proto.Unavailable {
msg := fmt.Sprintf("disk(%v) status was already unavailable, nothing to do", disk.Path)
log.LogInfof("[setDiskBadAPI] %v", msg)
s.buildSuccessResp(w, msg)
return
}
log.LogWarnf("[setDiskBadAPI] set bad disk, path: %v", disk.Path)
disk.doDiskError()
s.buildSuccessResp(w, "OK")
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"fmt"
"math"
"os"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/raftstore"
"github.com/cubefs/cubefs/util/atomicutil"
"github.com/cubefs/cubefs/util/loadutil"
"github.com/cubefs/cubefs/util/log"
"github.com/shirou/gopsutil/disk"
)
// SpaceManager manages the disk space.
type SpaceManager struct {
clusterID string
disks map[string]*Disk
partitions map[uint64]*DataPartition
raftStore raftstore.RaftStore
nodeID uint64
diskMutex sync.RWMutex
partitionMutex sync.RWMutex
stats *Stats
stopC chan bool
selectedIndex int // TODO what is selected index
diskList []string
dataNode *DataNode
createPartitionMutex sync.RWMutex
diskUtils map[string]*atomicutil.Float64
samplerDone chan struct{}
}
const diskSampleDuration = 1 * time.Second
// NewSpaceManager creates a new space manager.
func NewSpaceManager(dataNode *DataNode) *SpaceManager {
space := &SpaceManager{}
space.disks = make(map[string]*Disk)
space.diskList = make([]string, 0)
space.partitions = make(map[uint64]*DataPartition)
space.stats = NewStats(dataNode.zoneName)
space.stopC = make(chan bool)
space.dataNode = dataNode
space.diskUtils = make(map[string]*atomicutil.Float64)
go space.statUpdateScheduler()
return space
}
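// Stop shuts down the space manager: it stops the stat scheduler and disk sampler,
// stops raft on every partition, and then stops the partitions themselves with bounded
// parallelism.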
func (manager *SpaceManager) Stop() {
defer func() {
recover()
}()
close(manager.stopC)
// stop sampler
close(manager.samplerDone)
// Stop data partitions in parallel.
const maxParallelism = 128
parallelism := int(math.Min(float64(maxParallelism), float64(len(manager.partitions))))
wg := sync.WaitGroup{}
partitionC := make(chan *DataPartition, parallelism)
wg.Add(1)
// Close raft store.
for _, partition := range manager.partitions {
partition.stopRaft()
}
go func(c chan<- *DataPartition) {
defer wg.Done()
for _, partition := range manager.partitions {
c <- partition
}
close(c)
}(partitionC)
for i := 0; i < parallelism; i++ {
wg.Add(1)
go func(c <-chan *DataPartition) {
defer wg.Done()
var partition *DataPartition
for {
if partition = <-c; partition == nil {
return
}
partition.Stop()
}
}(partitionC)
}
wg.Wait()
}
func (manager *SpaceManager) GetAllDiskPartitions() []*disk.PartitionStat {
manager.diskMutex.RLock()
defer manager.diskMutex.RUnlock()
partitions := make([]*disk.PartitionStat, 0, len(manager.disks))
for _, disk := range manager.disks {
partition := disk.GetDiskPartition()
if partition != nil {
partitions = append(partitions, partition)
}
}
return partitions
}
func (manager *SpaceManager) FillIoUtils(samples map[string]loadutil.DiskIoSample) {
manager.diskMutex.RLock()
defer manager.diskMutex.RUnlock()
for _, sample := range samples {
util := manager.diskUtils[sample.GetPartition().Device]
if util != nil {
util.Store(sample.GetIoUtilPercent())
}
}
}
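// StartDiskSample launches a background loop that samples disk IO utilization for all
// disk partitions until the sampler is stopped or a sampling error occurs.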
func (manager *SpaceManager) StartDiskSample() {
manager.samplerDone = make(chan struct{})
go func() {
for {
select {
case <-manager.samplerDone:
return
default:
partitions := manager.GetAllDiskPartitions()
samples, err := loadutil.GetDisksIoSample(partitions, diskSampleDuration)
if err != nil {
log.LogErrorf("failed to sample disk %v\n", err.Error())
return
}
manager.FillIoUtils(samples)
}
}
}()
}
func (manager *SpaceManager) GetDiskUtils() map[string]float64 {
utils := make(map[string]float64)
manager.diskMutex.RLock()
defer manager.diskMutex.RUnlock()
for device, used := range manager.diskUtils {
utils[device] = used.Load()
}
return utils
}
func (manager *SpaceManager) SetNodeID(nodeID uint64) {
manager.nodeID = nodeID
}
func (manager *SpaceManager) GetNodeID() (nodeID uint64) {
return manager.nodeID
}
func (manager *SpaceManager) SetClusterID(clusterID string) {
manager.clusterID = clusterID
}
func (manager *SpaceManager) GetClusterID() (clusterID string) {
return manager.clusterID
}
func (manager *SpaceManager) SetRaftStore(raftStore raftstore.RaftStore) {
manager.raftStore = raftStore
}
func (manager *SpaceManager) GetRaftStore() (raftStore raftstore.RaftStore) {
return manager.raftStore
}
func (manager *SpaceManager) RangePartitions(f func(partition *DataPartition) bool) {
if f == nil {
return
}
manager.partitionMutex.RLock()
partitions := make([]*DataPartition, 0)
for _, dp := range manager.partitions {
partitions = append(partitions, dp)
}
manager.partitionMutex.RUnlock()
for _, partition := range partitions {
if !f(partition) {
break
}
}
}
func (manager *SpaceManager) GetDisks() (disks []*Disk) {
manager.diskMutex.RLock()
defer manager.diskMutex.RUnlock()
disks = make([]*Disk, 0)
for _, disk := range manager.disks {
disks = append(disks, disk)
}
return
}
func (manager *SpaceManager) Stats() *Stats {
return manager.stats
}
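// LoadDisk creates a Disk for the given path if it is not registered yet, restores its
// data partitions into the manager, and starts the disk's background tasks.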
func (manager *SpaceManager) LoadDisk(path string, reservedSpace, diskRdonlySpace uint64, maxErrCnt int) (err error) {
var (
disk *Disk
visitor PartitionVisitor
)
if diskRdonlySpace < reservedSpace {
diskRdonlySpace = reservedSpace
}
log.LogDebugf("action[LoadDisk] load disk from path(%v).", path)
visitor = func(dp *DataPartition) {
manager.partitionMutex.Lock()
defer manager.partitionMutex.Unlock()
if _, has := manager.partitions[dp.partitionID]; !has {
manager.partitions[dp.partitionID] = dp
log.LogDebugf("action[LoadDisk] put partition(%v) to manager manager.", dp.partitionID)
}
}
if _, err = manager.GetDisk(path); err != nil {
disk, err = NewDisk(path, reservedSpace, diskRdonlySpace, maxErrCnt, manager)
if err != nil {
log.LogErrorf("NewDisk fail err:[%v]", err)
return
}
err = disk.RestorePartition(visitor)
if err != nil {
log.LogErrorf("RestorePartition fail err:[%v]", err)
return
}
manager.putDisk(disk)
err = nil
go disk.doBackendTask()
}
return
}
func (manager *SpaceManager) GetDisk(path string) (d *Disk, err error) {
manager.diskMutex.RLock()
defer manager.diskMutex.RUnlock()
disk, has := manager.disks[path]
if has && disk != nil {
d = disk
return
}
err = fmt.Errorf("disk(%v) not exsit", path)
return
}
func (manager *SpaceManager) putDisk(d *Disk) {
manager.diskMutex.Lock()
manager.disks[d.Path] = d
manager.diskList = append(manager.diskList, d.Path)
if d.GetDiskPartition() != nil {
manager.diskUtils[d.GetDiskPartition().Device] = &atomicutil.Float64{}
manager.diskUtils[d.GetDiskPartition().Device].Store(0)
}
manager.diskMutex.Unlock()
}
func (manager *SpaceManager) updateMetrics() {
manager.diskMutex.RLock()
var (
total, used, available uint64
totalPartitionSize, remainingCapacityToCreatePartition uint64
maxCapacityToCreatePartition, partitionCnt uint64
)
maxCapacityToCreatePartition = 0
for _, d := range manager.disks {
if d.Status == proto.Unavailable {
log.LogInfof("disk is broken, not stat disk useage, diskpath %s", d.Path)
continue
}
total += d.Total
used += d.Used
available += d.Available
totalPartitionSize += d.Allocated
remainingCapacityToCreatePartition += d.Unallocated
partitionCnt += uint64(d.PartitionCount())
if maxCapacityToCreatePartition < d.Unallocated {
maxCapacityToCreatePartition = d.Unallocated
}
}
manager.diskMutex.RUnlock()
log.LogDebugf("action[updateMetrics] total(%v) used(%v) available(%v) totalPartitionSize(%v) remainingCapacityToCreatePartition(%v) "+
"partitionCnt(%v) maxCapacityToCreatePartition(%v) ", total, used, available, totalPartitionSize, remainingCapacityToCreatePartition, partitionCnt, maxCapacityToCreatePartition)
manager.stats.updateMetrics(total, used, available, totalPartitionSize,
remainingCapacityToCreatePartition, maxCapacityToCreatePartition, partitionCnt)
}
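// minPartitionCnt selects the read-write disk with the lowest selection weight, skipping
// decommissioned disks; it returns nil when no eligible disk is found.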
func (manager *SpaceManager) minPartitionCnt(decommissionedDisks []string) (d *Disk) {
manager.diskMutex.Lock()
defer manager.diskMutex.Unlock()
var (
minWeight float64
minWeightDisk *Disk
)
decommissionedDiskMap := make(map[string]struct{})
for _, disk := range decommissionedDisks {
decommissionedDiskMap[disk] = struct{}{}
}
minWeight = math.MaxFloat64
for _, disk := range manager.disks {
if _, ok := decommissionedDiskMap[disk.Path]; ok {
log.LogInfof("action[minPartitionCnt] exclude decommissioned disk[%v]", disk.Path)
continue
}
if disk.Status != proto.ReadWrite {
continue
}
diskWeight := disk.getSelectWeight()
if diskWeight < minWeight {
minWeight = diskWeight
minWeightDisk = disk
}
}
if minWeightDisk == nil {
return
}
if minWeightDisk.Status != proto.ReadWrite {
return
}
d = minWeightDisk
return d
}
func (manager *SpaceManager) statUpdateScheduler() {
go func() {
ticker := time.NewTicker(10 * time.Second)
for {
select {
case <-ticker.C:
manager.updateMetrics()
case <-manager.stopC:
ticker.Stop()
return
}
}
}()
}
func (manager *SpaceManager) Partition(partitionID uint64) (dp *DataPartition) {
manager.partitionMutex.RLock()
defer manager.partitionMutex.RUnlock()
dp = manager.partitions[partitionID]
return
}
func (manager *SpaceManager) AttachPartition(dp *DataPartition) {
manager.partitionMutex.Lock()
defer manager.partitionMutex.Unlock()
manager.partitions[dp.partitionID] = dp
}
// DetachDataPartition removes a data partition from the partition map.
func (manager *SpaceManager) DetachDataPartition(partitionID uint64) {
manager.partitionMutex.Lock()
defer manager.partitionMutex.Unlock()
delete(manager.partitions, partitionID)
}
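// CreatePartition creates a data partition for the given request on the least-loaded
// writable disk, or returns the existing partition when the request matches one that is
// already attached.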
func (manager *SpaceManager) CreatePartition(request *proto.CreateDataPartitionRequest) (dp *DataPartition, err error) {
manager.partitionMutex.Lock()
defer manager.partitionMutex.Unlock()
dpCfg := &dataPartitionCfg{
PartitionID: request.PartitionId,
VolName: request.VolumeId,
Peers: request.Members,
Hosts: request.Hosts,
RaftStore: manager.raftStore,
NodeID: manager.nodeID,
ClusterID: manager.clusterID,
PartitionSize: request.PartitionSize,
PartitionType: int(request.PartitionTyp),
ReplicaNum: request.ReplicaNum,
VerSeq: request.VerSeq,
CreateType: request.CreateType,
Forbidden: false,
}
log.LogInfof("action[CreatePartition] dp %v dpCfg.Peers %v request.Members %v",
dpCfg.PartitionID, dpCfg.Peers, request.Members)
dp = manager.partitions[dpCfg.PartitionID]
if dp != nil {
if err = dp.IsEquareCreateDataPartitionRequst(request); err != nil {
return nil, err
}
return
}
disk := manager.minPartitionCnt(request.DecommissionedDisks)
if disk == nil {
return nil, ErrNoSpaceToCreatePartition
}
if dp, err = CreateDataPartition(dpCfg, disk, request); err != nil {
return
}
manager.partitions[dp.partitionID] = dp
return
}
// DeletePartition deletes a partition based on the partition id.
func (manager *SpaceManager) DeletePartition(dpID uint64) {
manager.partitionMutex.Lock()
dp := manager.partitions[dpID]
if dp == nil {
manager.partitionMutex.Unlock()
return
}
delete(manager.partitions, dpID)
manager.partitionMutex.Unlock()
dp.Stop()
dp.Disk().DetachDataPartition(dp)
os.RemoveAll(dp.Path())
}
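// buildHeartBeatResponse fills the heartbeat response with space statistics,
// per-partition reports and the list of bad disks.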
func (s *DataNode) buildHeartBeatResponse(response *proto.DataNodeHeartbeatResponse) {
response.Status = proto.TaskSucceeds
stat := s.space.Stats()
stat.Lock()
response.Used = stat.Used
response.Total = stat.Total
response.Available = stat.Available
response.CreatedPartitionCnt = uint32(stat.CreatedPartitionCnt)
response.TotalPartitionSize = stat.TotalPartitionSize
response.MaxCapacity = stat.MaxCapacityToCreatePartition
response.RemainingCapacity = stat.RemainingCapacityToCreatePartition
response.BadDisks = make([]string, 0)
response.BadDiskStats = make([]proto.BadDiskStat, 0)
response.StartTime = s.startTime
stat.Unlock()
response.ZoneName = s.zoneName
response.PartitionReports = make([]*proto.DataPartitionReport, 0)
space := s.space
space.RangePartitions(func(partition *DataPartition) bool {
leaderAddr, isLeader := partition.IsRaftLeader()
vr := &proto.DataPartitionReport{
VolName: partition.volumeID,
PartitionID: uint64(partition.partitionID),
PartitionStatus: partition.Status(),
Total: uint64(partition.Size()),
Used: uint64(partition.Used()),
DiskPath: partition.Disk().Path,
IsLeader: isLeader,
ExtentCount: partition.GetExtentCount(),
NeedCompare: true,
DecommissionRepairProgress: partition.decommissionRepairProgress,
}
log.LogDebugf("action[Heartbeats] dpid(%v), status(%v) total(%v) used(%v) leader(%v) isLeader(%v).", vr.PartitionID, vr.PartitionStatus, vr.Total, vr.Used, leaderAddr, vr.IsLeader)
response.PartitionReports = append(response.PartitionReports, vr)
return true
})
disks := space.GetDisks()
for _, d := range disks {
if d.Status == proto.Unavailable {
response.BadDisks = append(response.BadDisks, d.Path)
bds := proto.BadDiskStat{
DiskPath: d.Path,
TotalPartitionCnt: d.PartitionCount(),
DiskErrPartitionList: d.GetDiskErrPartitionList(),
}
response.BadDiskStats = append(response.BadDiskStats, bds)
}
}
}
func (manager *SpaceManager) getPartitionIds() []uint64 {
res := make([]uint64, 0)
for id := range manager.partitions {
res = append(res, id)
}
return res
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"sync"
"sync/atomic"
"time"
)
// Stats defines various metrics that will be collected during the execution.
type Stats struct {
inDataSize uint64
outDataSize uint64
inFlow uint64
outFlow uint64
Zone string
ConnectionCnt int64
ClusterID string
TCPAddr string
Start time.Time
Total uint64
Used uint64
Available uint64 // available space
TotalPartitionSize uint64 // dataPartitionCnt * dataPartitionSize
RemainingCapacityToCreatePartition uint64
CreatedPartitionCnt uint64
LackPartitionsInMem uint64
LackPartitionsInDisk uint64
// the maximum capacity among all the disks that can be used to create partition
MaxCapacityToCreatePartition uint64
sync.Mutex
}
// NewStats creates a new Stats.
func NewStats(zone string) (s *Stats) {
s = new(Stats)
s.Zone = zone
return s
}
// AddConnection adds a connection.
func (s *Stats) AddConnection() {
atomic.AddInt64(&s.ConnectionCnt, 1)
}
// RemoveConnection removes a connection.
func (s *Stats) RemoveConnection() {
atomic.AddInt64(&s.ConnectionCnt, -1)
}
// GetConnectionCount gets the connection count.
func (s *Stats) GetConnectionCount() int64 {
return atomic.LoadInt64(&s.ConnectionCnt)
}
func (s *Stats) updateMetrics(
total, used, available, createdPartitionWeights, remainWeightsForCreatePartition,
maxWeightsForCreatePartition, dataPartitionCnt uint64) {
s.Lock()
defer s.Unlock()
s.Total = total
s.Used = used
s.Available = available
s.TotalPartitionSize = createdPartitionWeights
s.RemainingCapacityToCreatePartition = remainWeightsForCreatePartition
s.MaxCapacityToCreatePartition = maxWeightsForCreatePartition
s.CreatedPartitionCnt = dataPartitionCnt
}
func (s *Stats) updateMetricLackPartitionsInMem(lackPartitionsInMem uint64) {
s.Lock()
defer s.Unlock()
s.LackPartitionsInMem = lackPartitionsInMem
}
func (s *Stats) updateMetricLackPartitionsInDisk(lackPartitionsInDisk uint64) {
s.Lock()
defer s.Unlock()
s.LackPartitionsInDisk = lackPartitionsInDisk
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"bytes"
"encoding/binary"
"encoding/json"
"fmt"
"hash/crc32"
"net"
"strconv"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft"
raftProto "github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/repl"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
var ErrForbiddenDataPartition = errors.New("the data partition is forbidden")
func (s *DataNode) getPacketTpLabels(p *repl.Packet) map[string]string {
labels := make(map[string]string)
labels[exporter.Vol] = ""
labels[exporter.Op] = ""
labels[exporter.PartId] = ""
labels[exporter.Disk] = ""
if part, ok := p.Object.(*DataPartition); ok {
labels[exporter.Vol] = part.volumeID
labels[exporter.Op] = p.GetOpMsg()
if exporter.EnablePid {
labels[exporter.PartId] = fmt.Sprintf("%d", part.partitionID)
labels[exporter.Disk] = part.path
}
}
return labels
}
func isColdVolExtentDelErr(p *repl.Packet) bool {
if p.Object == nil {
return false
}
partition, ok := p.Object.(*DataPartition)
if !ok {
return false
}
if proto.IsNormalDp(partition.partitionType) {
return false
}
if p.ResultCode == proto.OpNotExistErr {
return true
}
return false
}
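// OperatePacket dispatches a replication packet to the handler for its opcode and records
// latency metrics and operation logs around the call.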
func (s *DataNode) OperatePacket(p *repl.Packet, c net.Conn) (err error) {
var (
tpLabels map[string]string
tpObject *exporter.TimePointCount
)
log.LogDebugf("action[OperatePacket] %v, pack [%v]", p.GetOpMsg(), p)
shallDegrade := p.ShallDegrade()
sz := p.Size
if !shallDegrade {
tpObject = exporter.NewTPCnt(p.GetOpMsg())
tpLabels = s.getPacketTpLabels(p)
}
start := time.Now().UnixNano()
defer func() {
resultSize := p.Size
p.Size = sz
if p.IsErrPacket() {
err = fmt.Errorf("op(%v) error(%v)", p.GetOpMsg(), string(p.Data[:resultSize]))
logContent := fmt.Sprintf("action[OperatePacket] %v.",
p.LogMessage(p.GetOpMsg(), c.RemoteAddr().String(), start, err))
if isColdVolExtentDelErr(p) {
log.LogInfof(logContent)
} else {
log.LogErrorf(logContent)
}
} else {
logContent := fmt.Sprintf("action[OperatePacket] %v.",
p.LogMessage(p.GetOpMsg(), c.RemoteAddr().String(), start, nil))
switch p.Opcode {
case proto.OpStreamRead, proto.OpRead, proto.OpExtentRepairRead, proto.OpStreamFollowerRead:
case proto.OpReadTinyDeleteRecord:
log.LogRead(logContent)
case proto.OpWrite, proto.OpRandomWrite,
proto.OpRandomWriteVer, proto.OpSyncRandomWriteVer,
proto.OpRandomWriteAppend, proto.OpSyncRandomWriteAppend,
proto.OpTryWriteAppend, proto.OpSyncTryWriteAppend,
proto.OpSyncRandomWrite, proto.OpSyncWrite, proto.OpMarkDelete, proto.OpSplitMarkDelete:
log.LogWrite(logContent)
default:
log.LogInfo(logContent)
}
}
p.Size = resultSize
if !shallDegrade {
tpObject.SetWithLabels(err, tpLabels)
}
}()
switch p.Opcode {
case proto.OpCreateExtent:
s.handlePacketToCreateExtent(p)
case proto.OpWrite, proto.OpSyncWrite:
s.handleWritePacket(p)
case proto.OpStreamRead:
s.handleStreamReadPacket(p, c, StreamRead)
case proto.OpStreamFollowerRead:
s.extentRepairReadPacket(p, c, StreamRead)
case proto.OpExtentRepairRead:
s.handleExtentRepairReadPacket(p, c, RepairRead)
case proto.OpTinyExtentRepairRead:
s.handleTinyExtentRepairReadPacket(p, c)
case proto.OpMarkDelete, proto.OpSplitMarkDelete:
s.handleMarkDeletePacket(p, c)
case proto.OpBatchDeleteExtent:
s.handleBatchMarkDeletePacket(p, c)
case proto.OpRandomWrite, proto.OpSyncRandomWrite,
proto.OpRandomWriteAppend, proto.OpSyncRandomWriteAppend,
proto.OpTryWriteAppend, proto.OpSyncTryWriteAppend,
proto.OpRandomWriteVer, proto.OpSyncRandomWriteVer:
s.handleRandomWritePacket(p)
case proto.OpNotifyReplicasToRepair:
s.handlePacketToNotifyExtentRepair(p)
case proto.OpGetAllWatermarks:
s.handlePacketToGetAllWatermarks(p)
case proto.OpCreateDataPartition:
s.handlePacketToCreateDataPartition(p)
case proto.OpLoadDataPartition:
s.handlePacketToLoadDataPartition(p)
case proto.OpDeleteDataPartition:
s.handlePacketToDeleteDataPartition(p)
case proto.OpDataNodeHeartbeat:
s.handleHeartbeatPacket(p)
case proto.OpGetAppliedId:
s.handlePacketToGetAppliedID(p)
case proto.OpDecommissionDataPartition:
s.handlePacketToDecommissionDataPartition(p)
case proto.OpAddDataPartitionRaftMember:
s.handlePacketToAddDataPartitionRaftMember(p)
case proto.OpRemoveDataPartitionRaftMember:
s.handlePacketToRemoveDataPartitionRaftMember(p)
case proto.OpDataPartitionTryToLeader:
s.handlePacketToDataPartitionTryToLeader(p)
case proto.OpGetPartitionSize:
s.handlePacketToGetPartitionSize(p)
case proto.OpGetMaxExtentIDAndPartitionSize:
s.handlePacketToGetMaxExtentIDAndPartitionSize(p)
case proto.OpReadTinyDeleteRecord:
s.handlePacketToReadTinyDeleteRecordFile(p, c)
case proto.OpBroadcastMinAppliedID:
s.handleBroadcastMinAppliedID(p)
case proto.OpVersionOperation:
s.handleUpdateVerPacket(p)
case proto.OpStopDataPartitionRepair:
s.handlePacketToStopDataPartitionRepair(p)
default:
p.PackErrorBody(repl.ErrorUnknownOp.Error(), repl.ErrorUnknownOp.Error()+strconv.Itoa(int(p.Opcode)))
}
return
}
// Handle OpCreateExtent packet.
func (s *DataNode) handlePacketToCreateExtent(p *repl.Packet) {
var err error
defer func() {
if err != nil {
p.PackErrorBody(ActionCreateExtent, err.Error())
} else {
p.PacketOkReply()
}
}()
partition := p.Object.(*DataPartition)
if partition.Available() <= 0 || !partition.disk.CanWrite() {
err = storage.NoSpaceError
return
} else if partition.disk.Status == proto.Unavailable {
err = storage.BrokenDiskError
return
}
// reject creation when the partition already holds too many extents
if partition.GetExtentCount() >= storage.MaxExtentCount+10 {
err = storage.NoSpaceError
return
}
partition.disk.allocCheckLimit(proto.IopsWriteType, 1)
partition.disk.limitWrite.Run(0, func() {
err = partition.ExtentStore().Create(p.ExtentID)
})
}
// Handle OpCreateDataPartition packet.
func (s *DataNode) handlePacketToCreateDataPartition(p *repl.Packet) {
var (
err error
bytes []byte
dp *DataPartition
)
defer func() {
if err != nil {
p.PackErrorBody(ActionCreateDataPartition, err.Error())
}
}()
task := &proto.AdminTask{}
if err = json.Unmarshal(p.Data, task); err != nil {
err = fmt.Errorf("cannnot unmashal adminTask")
return
}
request := &proto.CreateDataPartitionRequest{}
if task.OpCode != proto.OpCreateDataPartition {
err = fmt.Errorf("from master Task(%v) failed,error unavali opcode(%v)", task.ToString(), task.OpCode)
return
}
bytes, err = json.Marshal(task.Request)
if err != nil {
err = fmt.Errorf("from master Task(%v) cannot unmashal CreateDataPartition, err %s", task.ToString(), err.Error())
return
}
p.AddMesgLog(string(bytes))
if err = json.Unmarshal(bytes, request); err != nil {
err = fmt.Errorf("from master Task(%v) cannot unmashal CreateDataPartitionRequest struct, err(%s)", task.ToString(), err.Error())
return
}
p.PartitionID = request.PartitionId
if dp, err = s.space.CreatePartition(request); err != nil {
err = fmt.Errorf("from master Task(%v) cannot create Partition err(%v)", task.ToString(), err)
return
}
p.PacketOkWithBody([]byte(dp.Disk().Path))
}
func (s *DataNode) commitDelVersion(volumeID string, verSeq uint64) (err error) {
for _, partition := range s.space.partitions {
if partition.config.VolName != volumeID {
continue
}
verListMgr := partition.volVersionInfoList
verListMgr.RWLock.Lock()
for i, ver := range verListMgr.VerList {
if i == len(verListMgr.VerList)-1 {
log.LogWarnf("action[commitDelVersion] dp[%v] seq %v, seqArray size %v newest ver %v",
partition.config.PartitionID, verSeq, len(verListMgr.VerList), ver.Ver)
break
}
if ver.Ver == verSeq {
log.LogInfof("action[commitDelVersion] updateVerList dp[%v] seq %v,seqArray size %v", partition.config.PartitionID, verSeq, len(verListMgr.VerList))
verListMgr.VerList = append(verListMgr.VerList[:i], verListMgr.VerList[i+1:]...)
break
}
}
verListMgr.RWLock.Unlock()
}
return
}
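// commitCreateVersion applies a multi-version operation to every local raft-leader
// partition of the volume and, for a commit, advances the volume's two-phase version
// state.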
func (s *DataNode) commitCreateVersion(req *proto.MultiVersionOpRequest) (err error) {
log.LogInfof("action[commitCreateVersion] handle master version reqeust %v", req)
var (
value interface{}
ok bool
wg sync.WaitGroup
)
if value, ok = s.volUpdating.Load(req.VolumeID); !ok {
log.LogWarnf("action[commitCreateVersion] vol %v not found seq %v", req.VolumeID, req.VerSeq)
return
}
ver2Phase := value.(*verOp2Phase)
log.LogInfof("action[commitCreateVersion] try commit volume %v ver2Phase seq %v with req seq %v",
req.VolumeID, ver2Phase.verPrepare, req.VerSeq)
if req.VerSeq < ver2Phase.verSeq {
log.LogWarnf("action[commitCreateVersion] vol %v seq %v create less than loal %v", req.VolumeID, req.VerSeq, ver2Phase.verSeq)
return
}
if ver2Phase.step != proto.CreateVersionPrepare {
log.LogWarnf("action[commitCreateVersion] vol %v seq %v step not prepare", req.VolumeID, ver2Phase.step)
}
s.space.partitionMutex.RLock()
defer s.space.partitionMutex.RUnlock()
resultCh := make(chan error, len(s.space.partitions))
for _, partition := range s.space.partitions {
if partition.config.VolName != req.VolumeID {
continue
}
if !partition.isRaftLeader {
continue
}
wg.Add(1)
go func(partition *DataPartition) {
defer wg.Done()
log.LogInfof("action[commitCreateVersion] volume %v dp[%v] do HandleVersionOp verSeq[%v]",
partition.volumeID, partition.partitionID, partition.verSeq)
if err = partition.HandleVersionOp(req); err != nil {
log.LogErrorf("action[commitCreateVersion] volume %v dp[%v] do HandleVersionOp verSeq[%v] err %v",
partition.volumeID, partition.partitionID, partition.verSeq, err)
resultCh <- err
return
}
}(partition)
}
wg.Wait()
select {
case err = <-resultCh:
if err != nil {
close(resultCh)
return
}
default:
log.LogInfof("action[commitCreateVersion] volume %v do HandleVersionOp verseq [%v] finished", req.VolumeID, req.VerSeq)
}
close(resultCh)
if req.Op == proto.DeleteVersion {
return
}
if req.Op == proto.CreateVersionPrepare {
log.LogInfof("action[commitCreateVersion] commit volume %v prepare seq %v with commit seq %v",
req.VolumeID, ver2Phase.verPrepare, req.VerSeq)
return
}
ver2Phase.verSeq = req.VerSeq
ver2Phase.step = proto.CreateVersionCommit
ver2Phase.status = proto.VersionWorkingFinished
log.LogInfof("action[commitCreateVersion] commit volume %v prepare seq %v with commit seq %v",
req.VolumeID, ver2Phase.verPrepare, req.VerSeq)
return
}
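// prepareCreateVersion records the prepare phase of a two-phase version change for the
// volume; it reports whether the same sequence is already being prepared so the caller
// can skip duplicate work.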
func (s *DataNode) prepareCreateVersion(req *proto.MultiVersionOpRequest) (err error, opAgain bool) {
var ver2Phase *verOp2Phase
if value, ok := s.volUpdating.Load(req.VolumeID); ok {
ver2Phase = value.(*verOp2Phase)
if req.VerSeq < ver2Phase.verSeq {
err = fmt.Errorf("seq %v create less than loal %v", req.VerSeq, ver2Phase.verSeq)
log.LogInfof("action[prepareCreateVersion] volume %v update to ver %v step %v", req.VolumeID, req.VerSeq, ver2Phase.step)
return
} else if req.VerSeq == ver2Phase.verPrepare {
if ver2Phase.step == proto.VersionWorking {
opAgain = true
return
}
}
}
ver2Phase = &verOp2Phase{}
ver2Phase.step = uint32(req.Op)
ver2Phase.status = proto.VersionWorking
ver2Phase.verPrepare = req.VerSeq
s.volUpdating.Store(req.VolumeID, ver2Phase)
log.LogInfof("action[prepareCreateVersion] volume %v update seq to %v step %v",
req.VolumeID, req.VerSeq, ver2Phase.step)
return
}
// Handle OpVersionOperation packet.
func (s *DataNode) handleUpdateVerPacket(p *repl.Packet) {
var err error
defer func() {
if err != nil {
p.PackErrorBody(ActionUpdateVersion, err.Error())
} else {
p.PacketOkReply()
}
}()
task := &proto.AdminTask{}
err = json.Unmarshal(p.Data, task)
if err != nil {
log.LogErrorf("action[handleUpdateVerPacket] handle master version reqeust err %v", err)
return
}
request := &proto.MultiVersionOpRequest{}
response := &proto.MultiVersionOpResponse{}
response.Op = task.OpCode
response.Status = proto.TaskSucceeds
if task.OpCode == proto.OpVersionOperation {
marshaled, _ := json.Marshal(task.Request)
if err = json.Unmarshal(marshaled, request); err != nil {
log.LogErrorf("action[handleUpdateVerPacket] handle master version reqeust err %v", err)
response.Status = proto.TaskFailed
goto end
}
if request.Op == proto.CreateVersionPrepare {
if err, _ = s.prepareCreateVersion(request); err != nil {
log.LogErrorf("action[handleUpdateVerPacket] handle master version reqeust err %v", err)
goto end
}
if err = s.commitCreateVersion(request); err != nil {
log.LogErrorf("action[handleUpdateVerPacket] handle master version reqeust err %v", err)
goto end
}
} else if request.Op == proto.CreateVersionCommit {
if err = s.commitCreateVersion(request); err != nil {
log.LogErrorf("action[handleUpdateVerPacket] handle master version reqeust err %v", err)
goto end
}
} else if request.Op == proto.DeleteVersion {
if err = s.commitDelVersion(request.VolumeID, request.VerSeq); err != nil {
log.LogErrorf("action[handleUpdateVerPacket] handle master version reqeust err %v", err)
goto end
}
}
response.VerSeq = request.VerSeq
response.Op = request.Op
response.Addr = request.Addr
response.VolumeID = request.VolumeID
} else {
err = fmt.Errorf("illegal opcode")
log.LogErrorf("action[handleUpdateVerPacket] handle master version reqeust err %v", err)
goto end
}
end:
if err != nil {
response.Result = err.Error()
}
task.Response = response
log.LogInfof("action[handleUpdateVerPacket] rsp to client,req vol %v, verseq %v, op %v", request.VolumeID, request.VerSeq, request.Op)
if err = MasterClient.NodeAPI().ResponseDataNodeTask(task); err != nil {
err = errors.Trace(err, "handleUpdateVerPacket to master failed.")
log.LogErrorf(err.Error())
return
}
}
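// checkVolumeForbidden marks the partitions belonging to the listed volumes as forbidden
// and clears the flag on all other partitions.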
func (s *DataNode) checkVolumeForbidden(volNames []string) {
s.space.RangePartitions(func(partition *DataPartition) bool {
for _, volName := range volNames {
if volName == partition.volumeID {
partition.SetForbidden(true)
return true
}
}
partition.SetForbidden(false)
return true
})
}
func (s *DataNode) checkDecommissionDisks(decommissionDisks []string) {
decommissionDiskSet := util.NewSet()
for _, disk := range decommissionDisks {
decommissionDiskSet.Add(disk)
}
disks := s.space.GetDisks()
for _, disk := range disks {
if disk.GetDecommissionStatus() && !decommissionDiskSet.Has(disk.Path) {
log.LogDebugf("action[checkDecommissionDisks] mark %v to be undecommissioned", disk.Path)
disk.MarkDecommissionStatus(false)
continue
}
if !disk.GetDecommissionStatus() && decommissionDiskSet.Has(disk.Path) {
log.LogDebugf("action[checkDecommissionDisks] mark %v to be decommissioned", disk.Path)
disk.MarkDecommissionStatus(true)
continue
}
}
}
// Handle OpHeartbeat packet.
func (s *DataNode) handleHeartbeatPacket(p *repl.Packet) {
var err error
task := &proto.AdminTask{}
err = json.Unmarshal(p.Data, task)
defer func() {
if err != nil {
p.PackErrorBody(ActionCreateDataPartition, err.Error())
} else {
p.PacketOkReply()
}
}()
if err != nil {
return
}
go func() {
request := &proto.HeartBeatRequest{}
response := &proto.DataNodeHeartbeatResponse{}
s.buildHeartBeatResponse(response)
if task.OpCode == proto.OpDataNodeHeartbeat {
marshaled, _ := json.Marshal(task.Request)
_ = json.Unmarshal(marshaled, request)
response.Status = proto.TaskSucceeds
if s.diskQosEnableFromMaster != request.EnableDiskQos {
log.LogWarnf("action[handleHeartbeatPacket] master command disk qos enable change to [%v], local conf enable [%v]",
request.EnableDiskQos,
s.diskQosEnable)
}
// set volume forbidden
s.checkVolumeForbidden(request.ForbiddenVols)
// set decommission disks
s.checkDecommissionDisks(request.DecommissionDisks)
s.diskQosEnableFromMaster = request.EnableDiskQos
var needUpdate bool
for _, pair := range []struct {
replace uint64
origin *int
}{
{request.QosFlowWriteLimit, &s.diskWriteFlow},
{request.QosFlowReadLimit, &s.diskReadFlow},
{request.QosIopsWriteLimit, &s.diskWriteIops},
{request.QosIopsReadLimit, &s.diskReadIops},
} {
if pair.replace > 0 && int(pair.replace) != *pair.origin {
*pair.origin = int(pair.replace)
needUpdate = true
}
}
// set cpu util and io used in here
response.CpuUtil = s.cpuUtil.Load()
response.IoUtils = s.space.GetDiskUtils()
if needUpdate {
log.LogWarnf("action[handleHeartbeatPacket] master change disk qos limit to [flowWrite %v, flowRead %v, iopsWrite %v, iopsRead %v]",
s.diskWriteFlow, s.diskReadFlow, s.diskWriteIops, s.diskReadIops)
s.updateQosLimit()
}
} else {
response.Status = proto.TaskFailed
err = fmt.Errorf("illegal opcode")
response.Result = err.Error()
}
task.Response = response
if err = MasterClient.NodeAPI().ResponseDataNodeTask(task); err != nil {
err = errors.Trace(err, "heartbeat to master(%v) failed.", request.MasterAddr)
log.LogErrorf(err.Error())
return
}
}()
}
// Handle OpDeleteDataPartition packet.
func (s *DataNode) handlePacketToDeleteDataPartition(p *repl.Packet) {
task := &proto.AdminTask{}
err := json.Unmarshal(p.Data, task)
defer func() {
if err != nil {
p.PackErrorBody(ActionDeleteDataPartition, err.Error())
} else {
p.PacketOkReply()
}
}()
if err != nil {
return
}
request := &proto.DeleteDataPartitionRequest{}
if task.OpCode == proto.OpDeleteDataPartition {
bytes, _ := json.Marshal(task.Request)
p.AddMesgLog(string(bytes))
err = json.Unmarshal(bytes, request)
if err != nil {
return
} else {
s.space.DeletePartition(request.PartitionId)
}
} else {
err = fmt.Errorf("illegal opcode ")
}
if err != nil {
err = errors.Trace(err, "delete DataPartition failed,PartitionID(%v)", request.PartitionId)
log.LogErrorf("action[handlePacketToDeleteDataPartition] err(%v).", err)
}
log.LogInfof(fmt.Sprintf("action[handlePacketToDeleteDataPartition] %v error(%v)", request.PartitionId, err))
}
// Handle OpLoadDataPartition packet.
func (s *DataNode) handlePacketToLoadDataPartition(p *repl.Packet) {
task := &proto.AdminTask{}
var err error
defer func() {
if err != nil {
p.PackErrorBody(ActionLoadDataPartition, err.Error())
} else {
p.PacketOkReply()
}
}()
if err = json.Unmarshal(p.Data, task); err != nil {
return
}
go s.asyncLoadDataPartition(task)
}
func (s *DataNode) asyncLoadDataPartition(task *proto.AdminTask) {
var err error
request := &proto.LoadDataPartitionRequest{}
response := &proto.LoadDataPartitionResponse{}
if task.OpCode == proto.OpLoadDataPartition {
bytes, _ := json.Marshal(task.Request)
json.Unmarshal(bytes, request)
dp := s.space.Partition(request.PartitionId)
if dp == nil {
response.Status = proto.TaskFailed
response.PartitionId = uint64(request.PartitionId)
err = fmt.Errorf("DataPartition(%v) not found", request.PartitionId)
response.Result = err.Error()
} else {
response = dp.Load()
response.PartitionId = uint64(request.PartitionId)
response.Status = proto.TaskSucceeds
}
} else {
response.PartitionId = uint64(request.PartitionId)
response.Status = proto.TaskFailed
err = fmt.Errorf("illegal opcode")
response.Result = err.Error()
}
task.Response = response
if err = MasterClient.NodeAPI().ResponseDataNodeTask(task); err != nil {
err = errors.Trace(err, "load DataPartition failed,PartitionID(%v)", request.PartitionId)
log.LogError(errors.Stack(err))
}
}
// Handle OpMarkDelete packet.
func (s *DataNode) handleMarkDeletePacket(p *repl.Packet, c net.Conn) {
var err error
defer func() {
if err != nil {
p.PackErrorBody(ActionBatchMarkDelete, err.Error())
} else {
p.PacketOkReply()
}
}()
partition := p.Object.(*DataPartition)
// NOTE: we cannot reject the mark-delete even if the partition is forbidden,
// because the inode has already been deleted in the meta partition;
// blocking the delete here would leave "orphan extents"
if proto.IsTinyExtentType(p.ExtentType) || p.Opcode == proto.OpSplitMarkDelete {
ext := new(proto.TinyExtentDeleteRecord)
err = json.Unmarshal(p.Data, ext)
if err == nil {
log.LogInfof("handleMarkDeletePacket Delete PartitionID(%v)_Extent(%v)_Offset(%v)_Size(%v)",
p.PartitionID, p.ExtentID, ext.ExtentOffset, ext.Size)
partition.disk.allocCheckLimit(proto.IopsWriteType, 1)
partition.disk.limitWrite.Run(0, func() {
err = partition.ExtentStore().MarkDelete(p.ExtentID, int64(ext.ExtentOffset), int64(ext.Size))
if err != nil {
log.LogErrorf("action[handleMarkDeletePacket]: failed to mark delete extent(%v), %v", p.ExtentID, err)
}
})
}
} else {
log.LogInfof("handleMarkDeletePacket Delete PartitionID(%v)_Extent(%v)",
p.PartitionID, p.ExtentID)
partition.disk.allocCheckLimit(proto.IopsWriteType, 1)
partition.disk.limitWrite.Run(0, func() {
err = partition.ExtentStore().MarkDelete(p.ExtentID, 0, 0)
if err != nil {
log.LogErrorf("action[handleMarkDeletePacket]: failed to mark delete extent(%v), %v", p.ExtentID, err)
}
})
}
}
// Handle OpBatchDeleteExtent packet.
func (s *DataNode) handleBatchMarkDeletePacket(p *repl.Packet, c net.Conn) {
var err error
defer func() {
if err != nil {
log.LogErrorf(fmt.Sprintf("(%v) error(%v).", p.GetUniqueLogId(), err))
p.PackErrorBody(ActionBatchMarkDelete, err.Error())
} else {
p.PacketOkReply()
}
}()
partition := p.Object.(*DataPartition)
// NOTE: we cannot reject the mark-delete even if the partition is forbidden,
// because the inode has already been deleted in the meta partition;
// blocking the delete here would leave "orphan extents"
var exts []*proto.ExtentKey
err = json.Unmarshal(p.Data, &exts)
store := partition.ExtentStore()
if err == nil {
for _, ext := range exts {
if deleteLimiteRater.Allow() {
log.LogInfof(fmt.Sprintf("recive DeleteExtent (%v) from (%v)", ext, c.RemoteAddr().String()))
partition.disk.allocCheckLimit(proto.IopsWriteType, 1)
partition.disk.limitWrite.Run(0, func() {
err = store.MarkDelete(ext.ExtentId, int64(ext.ExtentOffset), int64(ext.Size))
if err != nil {
log.LogErrorf("action[handleBatchMarkDeletePacket]: failed to mark delete extent(%v), %v", p.ExtentID, err)
}
})
if err != nil {
return
}
} else {
log.LogInfof("delete limiter reach(%v), remote (%v) try again.", deleteLimiteRater.Limit(), c.RemoteAddr().String())
err = storage.TryAgainError
}
}
}
}
// Handle OpWrite packet.
func (s *DataNode) handleWritePacket(p *repl.Packet) {
var (
err error
metricPartitionIOLabels map[string]string
partitionIOMetric *exporter.TimePointCount
)
defer func() {
if err != nil {
p.PackErrorBody(ActionWrite, err.Error())
} else {
p.PacketOkReply()
}
}()
partition := p.Object.(*DataPartition)
if partition.IsForbidden() {
err = ErrForbiddenDataPartition
return
}
shallDegrade := p.ShallDegrade()
if !shallDegrade {
metricPartitionIOLabels = GetIoMetricLabels(partition, "write")
}
if partition.Available() <= 0 || !partition.disk.CanWrite() {
err = storage.NoSpaceError
return
} else if partition.disk.Status == proto.Unavailable {
err = storage.BrokenDiskError
return
}
store := partition.ExtentStore()
if proto.IsTinyExtentType(p.ExtentType) {
if !shallDegrade {
partitionIOMetric = exporter.NewTPCnt(MetricPartitionIOName)
}
partition.disk.allocCheckLimit(proto.FlowWriteType, uint32(p.Size))
partition.disk.allocCheckLimit(proto.IopsWriteType, 1)
if writable := partition.disk.limitWrite.TryRun(int(p.Size), func() {
_, err = store.Write(p.ExtentID, p.ExtentOffset, int64(p.Size), p.Data, p.CRC, storage.AppendWriteType, p.IsSyncWrite())
}); !writable {
err = storage.TryAgainError
return
}
if !shallDegrade {
s.metrics.MetricIOBytes.AddWithLabels(int64(p.Size), metricPartitionIOLabels)
partitionIOMetric.SetWithLabels(err, metricPartitionIOLabels)
}
partition.checkIsDiskError(err, WriteFlag)
return
}
if p.Size <= util.BlockSize {
if !shallDegrade {
partitionIOMetric = exporter.NewTPCnt(MetricPartitionIOName)
}
partition.disk.allocCheckLimit(proto.FlowWriteType, uint32(p.Size))
partition.disk.allocCheckLimit(proto.IopsWriteType, 1)
if writable := partition.disk.limitWrite.TryRun(int(p.Size), func() {
_, err = store.Write(p.ExtentID, p.ExtentOffset, int64(p.Size), p.Data, p.CRC, storage.AppendWriteType, p.IsSyncWrite())
}); !writable {
err = storage.TryAgainError
return
}
if !shallDegrade {
s.metrics.MetricIOBytes.AddWithLabels(int64(p.Size), metricPartitionIOLabels)
partitionIOMetric.SetWithLabels(err, metricPartitionIOLabels)
}
partition.checkIsDiskError(err, WriteFlag)
} else {
size := p.Size
offset := 0
for size > 0 {
currSize := util.Min(int(size), util.BlockSize)
data := p.Data[offset : offset+currSize]
crc := crc32.ChecksumIEEE(data)
if !shallDegrade {
partitionIOMetric = exporter.NewTPCnt(MetricPartitionIOName)
}
partition.disk.allocCheckLimit(proto.FlowWriteType, uint32(currSize))
partition.disk.allocCheckLimit(proto.IopsWriteType, 1)
if writable := partition.disk.limitWrite.TryRun(currSize, func() {
_, err = store.Write(p.ExtentID, p.ExtentOffset+int64(offset), int64(currSize), data, crc, storage.AppendWriteType, p.IsSyncWrite())
}); !writable {
err = storage.TryAgainError
return
}
if !shallDegrade {
s.metrics.MetricIOBytes.AddWithLabels(int64(p.Size), metricPartitionIOLabels)
partitionIOMetric.SetWithLabels(err, metricPartitionIOLabels)
}
partition.checkIsDiskError(err, WriteFlag)
if err != nil {
break
}
size -= uint32(currSize)
offset += currSize
}
}
}
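// handleRandomWritePacket submits a random-write packet to raft on the leader replica;
// cache/preload partitions and non-leader replicas are rejected.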
func (s *DataNode) handleRandomWritePacket(p *repl.Packet) {
var (
err error
metricPartitionIOLabels map[string]string
partitionIOMetric *exporter.TimePointCount
)
defer func() {
log.LogDebugf("action[handleRandomWritePacket opcod %v seq %v dpid %v resultCode %v extid %v err %v",
p.Opcode, p.VerSeq, p.PartitionID, p.ResultCode, p.ExtentID, err)
if err != nil {
p.PackErrorBody(ActionWrite, err.Error())
} else {
// Avoid packing version info into the response packet, which would force the client to do extra work to read the buffer.
if p.Opcode == proto.OpRandomWriteVer || p.Opcode == proto.OpSyncRandomWriteVer {
p.Opcode = proto.OpSyncRandomWriteVerRsp
}
if p.Opcode == proto.OpTryWriteAppend && p.ResultCode == proto.OpTryOtherExtent {
p.PackErrorBody(ActionWrite, storage.SnapshotNeedNewExtentError.Error())
p.ResultCode = proto.OpTryOtherExtent
log.LogDebugf("action[handleRandomWritePacket opcod %v seq %v dpid %v resultCode %v extid %v", p.Opcode, p.VerSeq, p.PartitionID, p.ResultCode, p.ExtentID)
return
}
p.PacketOkReply()
}
}()
partition := p.Object.(*DataPartition)
if partition.IsForbidden() {
err = ErrForbiddenDataPartition
return
}
log.LogDebugf("action[handleRandomWritePacket opcod %v seq %v dpid %v dpseq %v extid %v", p.Opcode, p.VerSeq, p.PartitionID, partition.verSeq, p.ExtentID)
// cache or preload partition not support raft and repair.
if !partition.isNormalType() {
err = raft.ErrStopped
return
}
_, isLeader := partition.IsRaftLeader()
if !isLeader {
err = raft.ErrNotLeader
return
}
shallDegrade := p.ShallDegrade()
if !shallDegrade {
metricPartitionIOLabels = GetIoMetricLabels(partition, "randwrite")
partitionIOMetric = exporter.NewTPCnt(MetricPartitionIOName)
}
err = partition.RandomWriteSubmit(p)
if !shallDegrade {
s.metrics.MetricIOBytes.AddWithLabels(int64(p.Size), metricPartitionIOLabels)
partitionIOMetric.SetWithLabels(err, metricPartitionIOLabels)
}
if err != nil && strings.Contains(err.Error(), raft.ErrNotLeader.Error()) {
err = raft.ErrNotLeader
log.LogErrorf("action[handleRandomWritePacket] opcod %v seq %v dpid %v dpseq %v extid %v err %v", p.Opcode, p.VerSeq, p.PartitionID, partition.verSeq, p.ExtentID, err)
return
}
if err == nil && p.ResultCode != proto.OpOk && p.ResultCode != proto.OpTryOtherExtent {
log.LogErrorf("action[handleRandomWritePacket] opcod %v seq %v dpid %v dpseq %v extid %v ResultCode %v",
p.Opcode, p.VerSeq, p.PartitionID, partition.verSeq, p.ExtentID, p.ResultCode)
err = storage.TryAgainError
return
}
log.LogDebugf("action[handleRandomWritePacket] opcod %v seq %v dpid %v dpseq %v after raft submit err %v resultCode %v",
p.Opcode, p.VerSeq, p.PartitionID, partition.verSeq, err, p.ResultCode)
}
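// Handle OpStreamRead packet. Cache/preload partitions and non-leader replicas are
// rejected before the read is served by extentRepairReadPacket.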
func (s *DataNode) handleStreamReadPacket(p *repl.Packet, connect net.Conn, isRepairRead bool) {
var err error
defer func() {
if err != nil {
p.PackErrorBody(ActionStreamRead, err.Error())
p.WriteToConn(connect)
}
}()
partition := p.Object.(*DataPartition)
// cache or preload partition not support raft and repair.
if !partition.isNormalType() {
err = raft.ErrStopped
return
}
if err = partition.CheckLeader(p, connect); err != nil {
return
}
s.extentRepairReadPacket(p, connect, isRepairRead)
}
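// Handle OpExtentRepairRead packet. A repair-read slot is acquired via
// requestDoExtentRepair before the data is streamed back, and released once the
// handler finishes successfully.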
func (s *DataNode) handleExtentRepairReadPacket(p *repl.Packet, connect net.Conn, isRepairRead bool) {
var err error
log.LogDebugf("handleExtentRepairReadPacket %v", p)
defer func() {
if err != nil {
p.PackErrorBody(ActionStreamRead, err.Error())
p.WriteToConn(connect)
return
}
fininshDoExtentRepair()
}()
err = requestDoExtentRepair()
if err != nil {
return
}
s.extentRepairReadPacket(p, connect, isRepairRead)
}
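// Handle OpTinyExtentRepairRead packet.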
func (s *DataNode) handleTinyExtentRepairReadPacket(p *repl.Packet, connect net.Conn) {
s.tinyExtentRepairRead(p, connect)
}
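// extentRepairReadPacket streams the requested extent range back to the client in
// chunks of at most util.ReadBlockSize bytes, applying the per-disk IOPS and flow
// limiters to every chunk.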
func (s *DataNode) extentRepairReadPacket(p *repl.Packet, connect net.Conn, isRepairRead bool) {
var (
err error
metricPartitionIOLabels map[string]string
partitionIOMetric, tpObject *exporter.TimePointCount
)
defer func() {
if err != nil {
p.PackErrorBody(ActionStreamRead, err.Error())
p.WriteToConn(connect)
}
}()
partition := p.Object.(*DataPartition)
needReplySize := p.Size
offset := p.ExtentOffset
store := partition.ExtentStore()
shallDegrade := p.ShallDegrade()
if !shallDegrade {
metricPartitionIOLabels = GetIoMetricLabels(partition, "read")
}
log.LogDebugf("extentRepairReadPacket dp %v offset %v needSize %v", partition.partitionID, offset, needReplySize)
for {
if needReplySize <= 0 {
break
}
err = nil
reply := repl.NewStreamReadResponsePacket(p.ReqID, p.PartitionID, p.ExtentID)
reply.StartT = p.StartT
currReadSize := uint32(util.Min(int(needReplySize), util.ReadBlockSize))
if currReadSize == util.ReadBlockSize {
reply.Data, _ = proto.Buffers.Get(util.ReadBlockSize)
} else {
reply.Data = make([]byte, currReadSize)
}
if !shallDegrade {
partitionIOMetric = exporter.NewTPCnt(MetricPartitionIOName)
tpObject = exporter.NewTPCnt(fmt.Sprintf("Repair_%s", p.GetOpMsg()))
}
reply.ExtentOffset = offset
p.Size = currReadSize
p.ExtentOffset = offset
partition.Disk().allocCheckLimit(proto.IopsReadType, 1)
partition.Disk().allocCheckLimit(proto.FlowReadType, currReadSize)
partition.disk.limitRead.Run(int(currReadSize), func() {
reply.CRC, err = store.Read(reply.ExtentID, offset, int64(currReadSize), reply.Data, isRepairRead)
})
if !shallDegrade {
s.metrics.MetricIOBytes.AddWithLabels(int64(p.Size), metricPartitionIOLabels)
partitionIOMetric.SetWithLabels(err, metricPartitionIOLabels)
tpObject.Set(err)
}
partition.checkIsDiskError(err, ReadFlag)
p.CRC = reply.CRC
if err != nil {
log.LogErrorf("action[operatePacket] err %v", err)
return
}
reply.Size = currReadSize
reply.ResultCode = proto.OpOk
reply.Opcode = p.Opcode
p.ResultCode = proto.OpOk
if err = reply.WriteToConn(connect); err != nil {
return
}
needReplySize -= currReadSize
offset += int64(currReadSize)
if currReadSize == util.ReadBlockSize {
proto.Buffers.Put(reply.Data)
}
logContent := fmt.Sprintf("action[operatePacket] %v.",
reply.LogMessage(reply.GetOpMsg(), connect.RemoteAddr().String(), reply.StartT, err))
log.LogReadf(logContent)
}
p.PacketOkReply()
}
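// Handle OpGetAllWatermarks packet. For normal extents it returns the watermarks of
// all extents; for tiny extents it returns the watermarks of the extent IDs listed
// in the request body.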
func (s *DataNode) handlePacketToGetAllWatermarks(p *repl.Packet) {
var (
buf []byte
fInfoList []*storage.ExtentInfo
err error
)
partition := p.Object.(*DataPartition)
store := partition.ExtentStore()
if proto.IsNormalExtentType(p.ExtentType) {
fInfoList, _, err = store.GetAllWatermarks(storage.NormalExtentFilter())
} else {
extents := make([]uint64, 0)
err = json.Unmarshal(p.Data, &extents)
if err == nil {
fInfoList, _, err = store.GetAllWatermarks(storage.TinyExtentFilter(extents))
}
}
if err != nil {
p.PackErrorBody(ActionGetAllExtentWatermarks, err.Error())
} else {
buf, err = json.Marshal(fInfoList)
if err != nil {
p.PackErrorBody(ActionGetAllExtentWatermarks, err.Error())
} else {
p.PacketOkWithByte(buf)
}
}
}
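// writeEmptyPacketOnTinyExtentRepairRead replies with an empty packet telling the
// peer to skip a hole of (newOffset - currentOffset) bytes in the tiny extent.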
func (s *DataNode) writeEmptyPacketOnTinyExtentRepairRead(reply *repl.Packet, newOffset, currentOffset int64, connect net.Conn) (replySize int64, err error) {
replySize = newOffset - currentOffset
reply.Data = make([]byte, 0)
reply.Size = 0
reply.CRC = crc32.ChecksumIEEE(reply.Data)
reply.ResultCode = proto.OpOk
reply.ExtentOffset = currentOffset
reply.Arg[0] = EmptyResponse
binary.BigEndian.PutUint64(reply.Arg[1:9], uint64(replySize))
err = reply.WriteToConn(connect)
reply.Size = uint32(replySize)
logContent := fmt.Sprintf("action[operatePacket] %v.",
reply.LogMessage(reply.GetOpMsg(), connect.RemoteAddr().String(), reply.StartT, err))
log.LogReadf(logContent)
return
}
func (s *DataNode) attachAvaliSizeOnTinyExtentRepairRead(reply *repl.Packet, avaliSize uint64) {
binary.BigEndian.PutUint64(reply.Arg[9:17], avaliSize)
}
// Handle tinyExtentRepairRead packet.
func (s *DataNode) tinyExtentRepairRead(request *repl.Packet, connect net.Conn) {
var (
err error
needReplySize int64
tinyExtentFinfoSize uint64
)
defer func() {
if err != nil {
request.PackErrorBody(ActionStreamReadTinyExtentRepair, err.Error())
request.WriteToConn(connect)
}
}()
if !storage.IsTinyExtent(request.ExtentID) {
err = fmt.Errorf("unavali extentID (%v)", request.ExtentID)
return
}
partition := request.Object.(*DataPartition)
store := partition.ExtentStore()
tinyExtentFinfoSize, err = store.TinyExtentGetFinfoSize(request.ExtentID)
if err != nil {
return
}
needReplySize = int64(request.Size)
offset := request.ExtentOffset
if uint64(request.ExtentOffset)+uint64(request.Size) > tinyExtentFinfoSize {
needReplySize = int64(tinyExtentFinfoSize - uint64(request.ExtentOffset))
}
avaliReplySize := uint64(needReplySize)
var newOffset, newEnd int64
for {
if needReplySize <= 0 {
break
}
reply := repl.NewTinyExtentStreamReadResponsePacket(request.ReqID, request.PartitionID, request.ExtentID)
reply.ArgLen = TinyExtentRepairReadResponseArgLen
reply.Arg = make([]byte, TinyExtentRepairReadResponseArgLen)
s.attachAvaliSizeOnTinyExtentRepairRead(reply, avaliReplySize)
newOffset, newEnd, err = store.TinyExtentAvaliOffset(request.ExtentID, offset)
if err != nil {
return
}
if newOffset > offset {
var replySize int64
if replySize, err = s.writeEmptyPacketOnTinyExtentRepairRead(reply, newOffset, offset, connect); err != nil {
return
}
needReplySize -= replySize
offset += replySize
continue
}
currNeedReplySize := newEnd - newOffset
currReadSize := uint32(util.Min(int(currNeedReplySize), util.ReadBlockSize))
if currReadSize == util.ReadBlockSize {
reply.Data, _ = proto.Buffers.Get(util.ReadBlockSize)
} else {
reply.Data = make([]byte, currReadSize)
}
reply.ExtentOffset = offset
reply.CRC, err = store.Read(reply.ExtentID, offset, int64(currReadSize), reply.Data, false)
if err != nil {
return
}
reply.Size = uint32(currReadSize)
reply.ResultCode = proto.OpOk
if err = reply.WriteToConn(connect); err != nil {
connect.Close()
return
}
needReplySize -= int64(currReadSize)
offset += int64(currReadSize)
if currReadSize == util.ReadBlockSize {
proto.Buffers.Put(reply.Data)
}
logContent := fmt.Sprintf("action[operatePacket] %v.",
reply.LogMessage(reply.GetOpMsg(), connect.RemoteAddr().String(), reply.StartT, err))
log.LogReadf(logContent)
}
request.PacketOkReply()
}
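// Handle OpReadTinyDeleteRecord packet. Streams the local tiny-delete record file
// from the requested offset in chunks of at most MaxSyncTinyDeleteBufferSize bytes.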
func (s *DataNode) handlePacketToReadTinyDeleteRecordFile(p *repl.Packet, connect net.Conn) {
var err error
defer func() {
if err != nil {
p.PackErrorBody(ActionStreamReadTinyDeleteRecord, err.Error())
p.WriteToConn(connect)
}
}()
partition := p.Object.(*DataPartition)
store := partition.ExtentStore()
localTinyDeleteFileSize, err := store.LoadTinyDeleteFileOffset()
if err != nil {
return
}
needReplySize := localTinyDeleteFileSize - p.ExtentOffset
offset := p.ExtentOffset
reply := repl.NewReadTinyDeleteRecordResponsePacket(p.ReqID, p.PartitionID)
reply.StartT = time.Now().UnixNano()
for {
if needReplySize <= 0 {
break
}
err = nil
currReadSize := uint32(util.Min(int(needReplySize), MaxSyncTinyDeleteBufferSize))
reply.Data = make([]byte, currReadSize)
reply.ExtentOffset = offset
reply.CRC, err = store.ReadTinyDeleteRecords(offset, int64(currReadSize), reply.Data)
if err != nil {
err = fmt.Errorf(ActionStreamReadTinyDeleteRecord+" localTinyDeleteRecordSize(%v) offset(%v)"+
" currReadSize(%v) err(%v)", localTinyDeleteFileSize, offset, currReadSize, err)
return
}
reply.Size = uint32(currReadSize)
reply.ResultCode = proto.OpOk
if err = reply.WriteToConn(connect); err != nil {
return
}
needReplySize -= int64(currReadSize)
offset += int64(currReadSize)
}
p.PacketOkReply()
}
// Handle OpNotifyReplicasToRepair packet.
func (s *DataNode) handlePacketToNotifyExtentRepair(p *repl.Packet) {
var err error
partition := p.Object.(*DataPartition)
mf := new(DataPartitionRepairTask)
err = json.Unmarshal(p.Data, mf)
if err != nil {
p.PackErrorBody(ActionRepair, err.Error())
return
}
partition.DoExtentStoreRepair(mf)
p.PacketOkReply()
}
// Handle OpBroadcastMinAppliedID
func (s *DataNode) handleBroadcastMinAppliedID(p *repl.Packet) {
partition := p.Object.(*DataPartition)
minAppliedID := binary.BigEndian.Uint64(p.Data)
if minAppliedID > 0 {
partition.SetMinAppliedID(minAppliedID)
}
log.LogDebugf("[handleBroadcastMinAppliedID] partition(%v) minAppliedID(%v)", partition.partitionID, minAppliedID)
p.PacketOkReply()
}
// Handle handlePacketToGetAppliedID packet.
func (s *DataNode) handlePacketToGetAppliedID(p *repl.Packet) {
partition := p.Object.(*DataPartition)
appliedID := partition.GetAppliedID()
buf := make([]byte, 8)
binary.BigEndian.PutUint64(buf, appliedID)
p.PacketOkWithBody(buf)
p.AddMesgLog(fmt.Sprintf("_AppliedID(%v)", appliedID))
}
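// Handle the packet that queries the partition's used size, as computed by the
// extent store for the given extent ID.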
func (s *DataNode) handlePacketToGetPartitionSize(p *repl.Packet) {
partition := p.Object.(*DataPartition)
usedSize := partition.extentStore.StoreSizeExtentID(p.ExtentID)
buf := make([]byte, 8)
binary.BigEndian.PutUint64(buf, uint64(usedSize))
p.AddMesgLog(fmt.Sprintf("partitionSize_(%v)", usedSize))
p.PacketOkWithBody(buf)
}
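// Handle the packet that queries the max extent ID and the total partition size.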
func (s *DataNode) handlePacketToGetMaxExtentIDAndPartitionSize(p *repl.Packet) {
partition := p.Object.(*DataPartition)
maxExtentID, totalPartitionSize := partition.extentStore.GetMaxExtentIDAndPartitionSize()
buf := make([]byte, 16)
binary.BigEndian.PutUint64(buf[0:8], uint64(maxExtentID))
binary.BigEndian.PutUint64(buf[8:16], totalPartitionSize)
p.PacketOkWithBody(buf)
}
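// Handle the packet to decommission a data partition: on the raft leader, the new
// peer (if any) is added and the old peer removed through raft membership changes.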
func (s *DataNode) handlePacketToDecommissionDataPartition(p *repl.Packet) {
var (
err error
reqData []byte
isRaftLeader bool
req = &proto.DataPartitionDecommissionRequest{}
)
defer func() {
if err != nil {
p.PackErrorBody(ActionDecommissionPartition, err.Error())
} else {
p.PacketOkReply()
}
}()
adminTask := &proto.AdminTask{}
decode := json.NewDecoder(bytes.NewBuffer(p.Data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
return
}
reqData, err = json.Marshal(adminTask.Request)
if err != nil {
return
}
if err = json.Unmarshal(reqData, req); err != nil {
return
}
p.AddMesgLog(string(reqData))
dp := s.space.Partition(req.PartitionId)
if dp == nil {
err = fmt.Errorf("partition %v not exsit", req.PartitionId)
return
}
p.PartitionID = req.PartitionId
isRaftLeader, err = s.forwardToRaftLeader(dp, p, false)
if !isRaftLeader {
err = raft.ErrNotLeader
return
}
if req.AddPeer.ID == req.RemovePeer.ID {
err = errors.NewErrorf("[opOfflineDataPartition]: AddPeer(%v) same withRemovePeer(%v)", req.AddPeer, req.RemovePeer)
return
}
if req.AddPeer.ID != 0 {
_, err = dp.ChangeRaftMember(raftProto.ConfAddNode, raftProto.Peer{ID: req.AddPeer.ID}, reqData)
if err != nil {
return
}
}
_, err = dp.ChangeRaftMember(raftProto.ConfRemoveNode, raftProto.Peer{ID: req.RemovePeer.ID}, reqData)
if err != nil {
return
}
}
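// Handle the packet to add a raft member to a data partition. The request is
// forwarded to the raft leader when the local replica is not the leader, and is a
// no-op when the peer already exists.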
func (s *DataNode) handlePacketToAddDataPartitionRaftMember(p *repl.Packet) {
var (
err error
reqData []byte
isRaftLeader bool
req = &proto.AddDataPartitionRaftMemberRequest{}
)
defer func() {
if err != nil {
p.PackErrorBody(ActionAddDataPartitionRaftMember, err.Error())
} else {
p.PacketOkReply()
}
}()
adminTask := &proto.AdminTask{}
decode := json.NewDecoder(bytes.NewBuffer(p.Data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
return
}
reqData, err = json.Marshal(adminTask.Request)
if err != nil {
return
}
if err = json.Unmarshal(reqData, req); err != nil {
return
}
log.LogInfof("action[handlePacketToAddDataPartitionRaftMember] %v, partition id %v", req.AddPeer, req.PartitionId)
p.AddMesgLog(string(reqData))
dp := s.space.Partition(req.PartitionId)
if dp == nil {
err = proto.ErrDataPartitionNotExists
return
}
p.PartitionID = req.PartitionId
if dp.IsExistReplica(req.AddPeer.Addr) {
log.LogInfof("handlePacketToAddDataPartitionRaftMember recive MasterCommand: %v "+
"addRaftAddr(%v) has exsit", string(reqData), req.AddPeer.Addr)
return
}
isRaftLeader, err = s.forwardToRaftLeader(dp, p, false)
if !isRaftLeader {
return
}
log.LogInfof("action[handlePacketToAddDataPartitionRaftMember] before ChangeRaftMember %v which is sync. partition id %v", req.AddPeer, req.PartitionId)
if req.AddPeer.ID != 0 {
_, err = dp.ChangeRaftMember(raftProto.ConfAddNode, raftProto.Peer{ID: req.AddPeer.ID}, reqData)
if err != nil {
return
}
}
log.LogInfof("action[handlePacketToAddDataPartitionRaftMember] after ChangeRaftMember %v, partition id %v", req.AddPeer, &req.PartitionId)
}
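// Handle the packet to remove a raft member from a data partition. When Force is
// set, the member is removed directly and the metadata persisted without going
// through a normal raft membership change.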
func (s *DataNode) handlePacketToRemoveDataPartitionRaftMember(p *repl.Packet) {
var (
err error
reqData []byte
isRaftLeader bool
req = &proto.RemoveDataPartitionRaftMemberRequest{}
)
defer func() {
if err != nil {
p.PackErrorBody(ActionRemoveDataPartitionRaftMember, err.Error())
} else {
p.PacketOkReply()
}
}()
adminTask := &proto.AdminTask{}
decode := json.NewDecoder(bytes.NewBuffer(p.Data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
return
}
reqData, err = json.Marshal(adminTask.Request)
p.AddMesgLog(string(reqData))
if err != nil {
return
}
if err = json.Unmarshal(reqData, req); err != nil {
return
}
dp := s.space.Partition(req.PartitionId)
if dp == nil {
return
}
log.LogDebugf("action[handlePacketToRemoveDataPartitionRaftMember], req %v (%s) RemoveRaftPeer(%s) dp %v replicaNum %v",
p.GetReqID(), string(reqData), req.RemovePeer.Addr, dp.partitionID, dp.replicaNum)
p.PartitionID = req.PartitionId
if !dp.IsExistReplica(req.RemovePeer.Addr) {
log.LogWarnf("action[handlePacketToRemoveDataPartitionRaftMember] receive MasterCommand: req %v[%v] "+
"RemoveRaftPeer(%v) has not exist", p.GetReqID(), string(reqData), req.RemovePeer.Addr)
return
}
isRaftLeader, err = s.forwardToRaftLeader(dp, p, req.Force)
if !isRaftLeader {
log.LogWarnf("handlePacketToRemoveDataPartitionRaftMember return no leader")
return
}
if err = dp.CanRemoveRaftMember(req.RemovePeer, req.Force); err != nil {
log.LogWarnf("action[handlePacketToRemoveDataPartitionRaftMember] CanRemoveRaftMember failed "+
"req %v dp %v err %v",
p.GetReqID(), dp.partitionID, err.Error())
return
}
if req.Force {
cc := &raftProto.ConfChange{
Type: raftProto.ConfRemoveNode,
Peer: raftProto.Peer{
ID: req.RemovePeer.ID,
},
Context: reqData,
}
s.raftStore.RaftServer().RemoveRaftForce(dp.partitionID, cc)
dp.ApplyMemberChange(cc, 0)
dp.PersistMetadata()
return
}
if req.RemovePeer.ID != 0 {
log.LogDebugf("action[handlePacketToRemoveDataPartitionRaftMember] ChangeRaftMember "+
"req %v dp %v RemovePeer.ID %v", p.GetReqID(), dp.partitionID, req.RemovePeer.ID)
_, err = dp.ChangeRaftMember(raftProto.ConfRemoveNode, raftProto.Peer{ID: req.RemovePeer.ID}, reqData)
if err != nil {
return
}
}
log.LogDebugf("action[handlePacketToRemoveDataPartitionRaftMember] CanRemoveRaftMember complete "+
"req %v dp %v ", p.GetReqID(), dp.partitionID)
}
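// Handle the packet that asks this replica to campaign for raft leadership of the
// partition.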
func (s *DataNode) handlePacketToDataPartitionTryToLeader(p *repl.Packet) {
var err error
defer func() {
if err != nil {
p.PackErrorBody(ActionDataPartitionTryToLeader, err.Error())
log.LogWarnf("handlePacketToDataPartitionTryToLeader: %v ", err.Error())
} else {
p.PacketOkReply()
log.LogDebugf("handlePacketToDataPartitionTryToLeader: partition %v success ", p.PartitionID)
}
}()
log.LogDebugf("handlePacketToDataPartitionTryToLeader: partition %v ", p.PartitionID)
dp := s.space.Partition(p.PartitionID)
if dp == nil {
err = fmt.Errorf("partition %v not exsit", p.PartitionID)
return
}
if dp.raftStatus != RaftStatusRunning {
err = fmt.Errorf("partition %v raft not running", p.PartitionID)
return
}
if dp.raftPartition.IsRaftLeader() {
log.LogWarnf("handlePacketToDataPartitionTryToLeader: %v is already leader", p.PartitionID)
return
}
err = dp.raftPartition.TryToLeader(dp.partitionID)
}
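// forwardToRaftLeader returns ok=true when the local replica is the raft leader, or
// when force is set and no leader is known; otherwise it forwards the packet to the
// leader and reads the reply back into p.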
func (s *DataNode) forwardToRaftLeader(dp *DataPartition, p *repl.Packet, force bool) (ok bool, err error) {
var (
conn *net.TCPConn
leaderAddr string
)
if leaderAddr, ok = dp.IsRaftLeader(); ok {
return
}
// Return NoLeaderError if no leader address is known.
if leaderAddr == "" {
if force {
ok = true
log.LogInfof("action[forwardToRaftLeader] no leader but replica num %v continue", dp.replicaNum)
return
}
err = storage.NoLeaderError
return
}
// forward the packet to the leader if local one is not the leader
conn, err = gConnPool.GetConnect(leaderAddr)
if err != nil {
return
}
defer func() {
gConnPool.PutConnect(conn, err != nil)
}()
err = p.WriteToConn(conn)
if err != nil {
return
}
if err = p.ReadFromConnWithVer(conn, proto.NoReadDeadlineTime); err != nil {
return
}
return
}
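// Handle OpStopDataPartitionRepair packet: pauses or resumes decommission recovery
// for the partition according to the Stop flag in the request.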
func (s *DataNode) handlePacketToStopDataPartitionRepair(p *repl.Packet) {
task := &proto.AdminTask{}
err := json.Unmarshal(p.Data, task)
defer func() {
if err != nil {
p.PackErrorBody(ActionStopDataPartitionRepair, err.Error())
} else {
p.PacketOkReply()
}
}()
if err != nil {
return
}
request := &proto.StopDataPartitionRepairRequest{}
if task.OpCode != proto.OpStopDataPartitionRepair {
err = fmt.Errorf("action[handlePacketToStopDataPartitionRepair] illegal opcode ")
log.LogWarnf("action[handlePacketToStopDataPartitionRepair] illegal opcode ")
return
}
bytes, _ := json.Marshal(task.Request)
p.AddMesgLog(string(bytes))
err = json.Unmarshal(bytes, request)
if err != nil {
return
}
log.LogDebugf("action[handlePacketToStopDataPartitionRepair] try stop %v", request.PartitionId)
dp := s.space.Partition(request.PartitionId)
if dp == nil {
err = proto.ErrDataPartitionNotExists
log.LogWarnf("action[handlePacketToStopDataPartitionRepair] cannot find dp %v", request.PartitionId)
return
}
dp.StopDecommissionRecover(request.Stop)
log.LogInfof("action[handlePacketToStopDataPartitionRepair] %v stop %v success", request.PartitionId, request.Stop)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"sync/atomic"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/repl"
"github.com/cubefs/cubefs/storage"
)
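// Post runs after a packet has been processed: it decides whether the packet still
// needs a reply, returns a leader-held tiny extent to the available or broken
// channel, and records timing metrics.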
func (s *DataNode) Post(p *repl.Packet) error {
if p.IsMasterCommand() {
p.NeedReply = true
}
if p.IsReadOperation() && p.AfterPre {
p.NeedReply = false
}
s.cleanupPkt(p)
s.addMetrics(p)
return nil
}
func (s *DataNode) cleanupPkt(p *repl.Packet) {
if p.IsMasterCommand() {
return
}
if !p.IsLeaderPacket() {
return
}
s.releaseExtent(p)
}
func (s *DataNode) releaseExtent(p *repl.Packet) {
if p == nil || !storage.IsTinyExtent(p.ExtentID) || p.ExtentID <= 0 || atomic.LoadInt32(&p.IsReleased) == IsReleased {
return
}
if !proto.IsTinyExtentType(p.ExtentType) || !p.IsLeaderPacket() || !p.IsNormalWriteOperation() || !p.IsForwardPkt() {
return
}
if p.Object == nil {
return
}
partition := p.Object.(*DataPartition)
store := partition.ExtentStore()
if p.IsErrPacket() {
store.SendToBrokenTinyExtentC(p.ExtentID)
} else {
store.SendToAvailableTinyExtentC(p.ExtentID)
}
atomic.StoreInt32(&p.IsReleased, IsReleased)
}
func (s *DataNode) addMetrics(p *repl.Packet) {
if p.IsMasterCommand() || p.ShallDegrade() {
return
}
p.AfterTp()
if p.Object == nil {
return
}
partition := p.Object.(*DataPartition)
if partition == nil {
return
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package datanode
import (
"encoding/json"
"fmt"
"hash/crc32"
"sync/atomic"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/repl"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util/log"
)
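// Prepare validates an incoming packet before it enters the replication pipeline:
// it tags the packet for metrics (unless degraded), then checks the store mode, the
// CRC of write payloads and the target partition, and fills in extent information
// where needed.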
func (s *DataNode) Prepare(p *repl.Packet) (err error) {
defer func() {
p.SetPacketHasPrepare()
if err != nil {
p.PackErrorBody(repl.ActionPreparePkt, err.Error())
} else {
p.AfterPre = true
}
}()
if p.IsMasterCommand() {
return
}
atomic.AddUint64(&s.metricsCnt, 1)
if !s.shallDegrade() {
p.BeforeTp(s.clusterID)
p.UnsetDegrade()
} else {
p.SetDegrade()
}
err = s.checkStoreMode(p)
if err != nil {
return
}
if err = s.checkCrc(p); err != nil {
return
}
if err = s.checkPartition(p); err != nil {
return
}
// For certain packets, we need to add some additional extent information.
if err = s.checkPacketAndPrepare(p); err != nil {
return
}
return
}
func (s *DataNode) checkStoreMode(p *repl.Packet) (err error) {
if proto.IsTinyExtentType(p.ExtentType) || proto.IsNormalExtentType(p.ExtentType) {
return
}
log.LogErrorf("action[checkStoreMode] dp [%v] reqId [%v] extent type %v", p.PartitionID, p.ReqID, p.ExtentType)
return ErrIncorrectStoreType
}
func (s *DataNode) checkCrc(p *repl.Packet) (err error) {
if !p.IsNormalWriteOperation() {
return
}
crc := crc32.ChecksumIEEE(p.Data[:p.Size])
if crc != p.CRC {
return storage.CrcMismatchError
}
return
}
func (s *DataNode) checkPartition(p *repl.Packet) (err error) {
dp := s.space.Partition(p.PartitionID)
if dp == nil {
// err = proto.ErrDataPartitionNotExists
err = fmt.Errorf("data partition not exists %v", p.PartitionID)
return
}
p.Object = dp
if p.IsNormalWriteOperation() || p.IsCreateExtentOperation() {
if dp.Available() <= 0 {
err = storage.NoSpaceError
return
}
}
if p.IsNormalWriteOperation() || p.IsRandomWrite() {
dp.disk.allocCheckLimit(proto.FlowWriteType, uint32(p.Size))
dp.disk.allocCheckLimit(proto.IopsWriteType, 1)
}
return
}
func (s *DataNode) checkPacketAndPrepare(p *repl.Packet) error {
partition := p.Object.(*DataPartition)
store := partition.ExtentStore()
var (
extentID uint64
err error
)
log.LogDebugf("action[prepare.checkPacketAndPrepare] pack opcode (%v) p.IsLeaderPacket(%v) p (%v)", p.Opcode, p.IsLeaderPacket(), p)
if p.IsRandomWrite() || p.IsSnapshotModWriteAppendOperation() || p.IsNormalWriteOperation() {
if err = partition.CheckWriteVer(p); err != nil {
return err
}
}
if p.IsLeaderPacket() && proto.IsTinyExtentType(p.ExtentType) && p.IsNormalWriteOperation() {
extentID, err = store.GetAvailableTinyExtent()
if err != nil {
return fmt.Errorf("checkPacketAndPrepare partition %v GetAvailableTinyExtent error %v", p.PartitionID, err.Error())
}
p.ExtentID = extentID
p.ExtentOffset, err = store.GetTinyExtentOffset(extentID)
if err != nil {
return fmt.Errorf("checkPacketAndPrepare partition %v %v GetTinyExtentOffset error %v", p.PartitionID, extentID, err.Error())
}
} else if p.IsSnapshotModWriteAppendOperation() {
if proto.IsTinyExtentType(p.ExtentType) {
extentID, err = store.GetAvailableTinyExtent()
if err != nil {
log.LogErrorf("err %v", err)
return fmt.Errorf("checkPacketAndPrepare partition %v GetAvailableTinyExtent error %v", p.PartitionID, err.Error())
}
p.ExtentID = extentID
p.ExtentOffset, err = store.GetTinyExtentOffset(p.ExtentID)
if err != nil {
err = fmt.Errorf("checkPacketAndPrepare partition %v %v GetTinyExtentOffset error %v", p.PartitionID, extentID, err.Error())
log.LogErrorf("err %v", err)
}
log.LogDebugf("action[prepare.checkPacketAndPrepare] dp %v append randomWrite p.ExtentOffset %v Kernel(file)Offset %v",
p.PartitionID, p.ExtentOffset, p.KernelOffset)
return err
}
p.ExtentOffset, err = store.GetExtentSnapshotModOffset(p.ExtentID, p.Size)
log.LogDebugf("action[prepare.checkPacketAndPrepare] pack (%v) partition %v %v", p, p.PartitionID, extentID)
if err != nil {
return fmt.Errorf("checkPacketAndPrepare partition %v %v GetSnapshotModExtentOffset error %v", p.PartitionID, extentID, err.Error())
}
} else if p.IsLeaderPacket() && p.IsCreateExtentOperation() {
if partition.isNormalType() && partition.GetExtentCount() >= storage.MaxExtentCount*3 {
return fmt.Errorf("checkPacketAndPrepare partition %v has reached maxExtentId", p.PartitionID)
}
p.ExtentID, err = store.NextExtentID()
if err != nil {
return fmt.Errorf("checkPacketAndPrepare partition %v allocCheckLimit NextExtentId error %v", p.PartitionID, err)
}
} else if p.IsLeaderPacket() &&
((p.IsMarkDeleteExtentOperation() && proto.IsTinyExtentType(p.ExtentType)) ||
(p.IsMarkSplitExtentOperation() && !proto.IsTinyExtentType(p.ExtentType))) {
log.LogDebugf("checkPacketAndPrepare. packet opCode %v p.ExtentType %v", p.Opcode, p.ExtentType)
record := new(proto.TinyExtentDeleteRecord)
if err := json.Unmarshal(p.Data[:p.Size], record); err != nil {
return fmt.Errorf("checkPacketAndPrepare failed %v", err.Error())
}
p.Data, _ = json.Marshal(record)
p.Size = uint32(len(p.Data))
}
if (p.IsCreateExtentOperation() || p.IsNormalWriteOperation()) && p.ExtentID == 0 {
return fmt.Errorf("checkPacketAndPrepare partition %v invalid extent id. ", p.PartitionID)
}
p.OrgBuffer = p.Data
return nil
}
package fuse
import (
"sync"
)
const (
NumOfBlockPool = 9
)
const (
BlockSize = 4096
)
const (
PoolSize4K = BlockSize * (1 << iota)
PoolSize8K
PoolSize16K
PoolSize32K
PoolSize64K
PoolSize128K
PoolSize256K
PoolSize512K
PoolSize1024K
)
const (
PoolSizeWithHeader4K = BlockSize*(1<<iota) + OutHeaderSize
PoolSizeWithHeader8K
PoolSizeWithHeader16K
PoolSizeWithHeader32K
PoolSizeWithHeader64K
PoolSizeWithHeader128K
PoolSizeWithHeader256K
PoolSizeWithHeader512K
PoolSizeWithHeader1024K
)
var ReadBlockPool = [NumOfBlockPool]*sync.Pool{}
func InitReadBlockPool() {
ReadBlockPool[0] = &sync.Pool{New: func() interface{} {
return make([]byte, PoolSizeWithHeader4K)
}}
ReadBlockPool[1] = &sync.Pool{New: func() interface{} {
return make([]byte, PoolSizeWithHeader8K)
}}
ReadBlockPool[2] = &sync.Pool{New: func() interface{} {
return make([]byte, PoolSizeWithHeader16K)
}}
ReadBlockPool[3] = &sync.Pool{New: func() interface{} {
return make([]byte, PoolSizeWithHeader32K)
}}
ReadBlockPool[4] = &sync.Pool{New: func() interface{} {
return make([]byte, PoolSizeWithHeader64K)
}}
ReadBlockPool[5] = &sync.Pool{New: func() interface{} {
return make([]byte, PoolSizeWithHeader128K)
}}
ReadBlockPool[6] = &sync.Pool{New: func() interface{} {
return make([]byte, PoolSizeWithHeader256K)
}}
ReadBlockPool[7] = &sync.Pool{New: func() interface{} {
return make([]byte, PoolSizeWithHeader512K)
}}
ReadBlockPool[8] = &sync.Pool{New: func() interface{} {
return make([]byte, PoolSizeWithHeader1024K)
}}
}
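// GetBlockBuf returns a read buffer (with room for the FUSE out header) for the
// given block size, falling back to a plain allocation for sizes outside the pools.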
func GetBlockBuf(size int) []byte {
var data []byte
switch size {
case PoolSize4K:
data = ReadBlockPool[0].Get().([]byte)
case PoolSize8K:
data = ReadBlockPool[1].Get().([]byte)
case PoolSize16K:
data = ReadBlockPool[2].Get().([]byte)
case PoolSize32K:
data = ReadBlockPool[3].Get().([]byte)
case PoolSize64K:
data = ReadBlockPool[4].Get().([]byte)
case PoolSize128K:
data = ReadBlockPool[5].Get().([]byte)
case PoolSize256K:
data = ReadBlockPool[6].Get().([]byte)
case PoolSize512K:
data = ReadBlockPool[7].Get().([]byte)
case PoolSize1024K:
data = ReadBlockPool[8].Get().([]byte)
default:
data = make([]byte, OutHeaderSize+size)
}
return data
}
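// PutBlockBuf returns a buffer obtained from GetBlockBuf to its pool; buffers of
// unrecognized sizes are simply dropped.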
func PutBlockBuf(data []byte) {
switch len(data) {
case PoolSizeWithHeader4K:
ReadBlockPool[0].Put(data)
case PoolSizeWithHeader8K:
ReadBlockPool[1].Put(data)
case PoolSizeWithHeader16K:
ReadBlockPool[2].Put(data)
case PoolSizeWithHeader32K:
ReadBlockPool[3].Put(data)
case PoolSizeWithHeader64K:
ReadBlockPool[4].Put(data)
case PoolSizeWithHeader128K:
ReadBlockPool[5].Put(data)
case PoolSizeWithHeader256K:
ReadBlockPool[6].Put(data)
case PoolSizeWithHeader512K:
ReadBlockPool[7].Put(data)
case PoolSizeWithHeader1024K:
ReadBlockPool[8].Put(data)
default:
return
}
}
package fuse
import "unsafe"
// buffer provides a mechanism for constructing a message from
// multiple segments.
type buffer []byte
// alloc allocates size bytes and returns a pointer to the new
// segment.
func (w *buffer) alloc(size uintptr) unsafe.Pointer {
s := int(size)
if len(*w)+s > cap(*w) {
old := *w
*w = make([]byte, len(*w), 2*cap(*w)+s)
copy(*w, old)
}
l := len(*w)
*w = (*w)[:l+s]
return unsafe.Pointer(&(*w)[l])
}
// reset clears out the contents of the buffer.
func (w *buffer) reset() {
for i := range (*w)[:cap(*w)] {
(*w)[i] = 0
}
*w = (*w)[:0]
}
func newBuffer(extra uintptr) buffer {
const hdrSize = unsafe.Sizeof(outHeader{})
buf := make(buffer, hdrSize, hdrSize+extra)
return buf
}
package fuse
import (
"runtime"
)
func stack() string {
buf := make([]byte, 1024)
return string(buf[:runtime.Stack(buf, false)])
}
func nop(msg interface{}) {}
// Debug is called to output debug messages, including protocol
// traces. The default behavior is to do nothing.
//
// The messages have human-friendly string representations and are
// safe to marshal to JSON.
//
// Implementations must not retain msg.
var Debug func(msg interface{}) = nop
package fuse
import (
"syscall"
)
const (
ENODATA = Errno(syscall.ENODATA)
)
const (
errNoXattr = ENODATA
)
func init() {
errnoNames[errNoXattr] = "ENODATA"
}
// FUSE service loop, for servers that wish to use it.
package fs // import "github.com/cubefs/cubefs/depends/bazil.org/fuse/fs"
import (
"encoding/binary"
"fmt"
"hash/fnv"
"io"
"log"
"net"
"os"
"reflect"
"runtime"
"strings"
"sync"
"time"
"unsafe"
"github.com/cubefs/cubefs/proto"
"bytes"
"github.com/cubefs/cubefs/depends/bazil.org/fuse"
"github.com/cubefs/cubefs/depends/bazil.org/fuse/fuseutil"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/stat"
"golang.org/x/net/context"
"golang.org/x/time/rate"
)
const (
attrValidTime = 1 * time.Minute
entryValidTime = 1 * time.Minute
)
const (
defaultForgetServeLimit = rate.Limit(1 << 16)
defaultForgetServeBurst = 128
)
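// ForgetServeLimit throttles how quickly Forget requests are served, so that a
// burst of kernel forgets does not crowd out other requests.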
var ForgetServeLimit *rate.Limiter = rate.NewLimiter(defaultForgetServeLimit, defaultForgetServeBurst)
// TODO: FINISH DOCS
type FSStatType uint32
const (
FSStatResume FSStatType = iota
FSStatSuspend
FSStatShutdown
FSStatRestore
)
// An FS is the interface required of a file system.
//
// Other FUSE requests can be handled by implementing methods from the
// FS* interfaces, for example FSStatfser.
type FS interface {
// Root is called to obtain the Node for the file system root.
Root() (Node, error)
Node(ino, pino uint64, mode uint32) (Node, error)
State() (FSStatType, string)
Notify(stat FSStatType, msg interface{})
}
type FSStatfser interface {
// Statfs is called to obtain file system metadata.
// It should write that data to resp.
Statfs(ctx context.Context, req *fuse.StatfsRequest, resp *fuse.StatfsResponse) error
}
type FSDestroyer interface {
// Destroy is called when the file system is shutting down.
//
// Linux only sends this request for block device backed (fuseblk)
// filesystems, to allow them to flush writes to disk before the
// unmount completes.
Destroy()
}
type FSInodeGenerator interface {
// GenerateInode is called to pick a dynamic inode number when it
// would otherwise be 0.
//
// Not all filesystems bother tracking inodes, but FUSE requires
// the inode to be set, and fewer duplicates in general makes UNIX
// tools work better.
//
// Operations where the nodes may return 0 inodes include Getattr,
// Setattr and ReadDir.
//
// If FS does not implement FSInodeGenerator, GenerateDynamicInode
// is used.
//
// Implementing this is useful to e.g. constrain the range of
// inode values used for dynamic inodes.
GenerateInode(parentInode uint64, name string) uint64
}
// A Node is the interface required of a file or directory.
// See the documentation for type FS for general information
// pertaining to all methods.
//
// A Node must be usable as a map key, that is, it cannot be a
// function, map or slice.
//
// Other FUSE requests can be handled by implementing methods from the
// Node* interfaces, for example NodeOpener.
//
// Methods returning Node should take care to return the same Node
// when the result is logically the same instance. Without this, each
// Node will get a new NodeID, causing spurious cache invalidations,
// extra lookups and aliasing anomalies. This may not matter for a
// simple, read-only filesystem.
type Node interface {
// Attr fills attr with the standard metadata for the node.
//
// Fields with reasonable defaults are prepopulated. For example,
// all times are set to a fixed moment when the program started.
//
// If Inode is left as 0, a dynamic inode number is chosen.
//
// The result may be cached for the duration set in Valid.
Attr(ctx context.Context, attr *fuse.Attr) error
}
type NodeGetattrer interface {
// Getattr obtains the standard metadata for the receiver.
// It should store that metadata in resp.
//
// If this method is not implemented, the attributes will be
// generated based on Attr(), with zero values filled in.
Getattr(ctx context.Context, req *fuse.GetattrRequest, resp *fuse.GetattrResponse) error
}
type NodeSetattrer interface {
// Setattr sets the standard metadata for the receiver.
//
// Note, this is also used to communicate changes in the size of
// the file, outside of Writes.
//
// req.Valid is a bitmask of what fields are actually being set.
// For example, the method should not change the mode of the file
// unless req.Valid.Mode() is true.
Setattr(ctx context.Context, req *fuse.SetattrRequest, resp *fuse.SetattrResponse) error
}
type NodeSymlinker interface {
// Symlink creates a new symbolic link in the receiver, which must be a directory.
//
// TODO is the above true about directories?
Symlink(ctx context.Context, req *fuse.SymlinkRequest) (Node, error)
}
// This optional request will be called only for symbolic link nodes.
type NodeReadlinker interface {
// Readlink reads a symbolic link.
Readlink(ctx context.Context, req *fuse.ReadlinkRequest) (string, error)
}
type NodeLinker interface {
// Link creates a new directory entry in the receiver based on an
// existing Node. Receiver must be a directory.
Link(ctx context.Context, req *fuse.LinkRequest, old Node) (Node, error)
}
type NodeRemover interface {
// Remove removes the entry with the given name from
// the receiver, which must be a directory. The entry to be removed
// may correspond to a file (unlink) or to a directory (rmdir).
Remove(ctx context.Context, req *fuse.RemoveRequest) error
}
type NodeAccesser interface {
// Access checks whether the calling context has permission for
// the given operations on the receiver. If so, Access should
// return nil. If not, Access should return EPERM.
//
// Note that this call affects the result of the access(2) system
// call but not the open(2) system call. If Access is not
// implemented, the Node behaves as if it always returns nil
// (permission granted), relying on checks in Open instead.
Access(ctx context.Context, req *fuse.AccessRequest) error
}
type NodeStringLookuper interface {
// Lookup looks up a specific entry in the receiver,
// which must be a directory. Lookup should return a Node
// corresponding to the entry. If the name does not exist in
// the directory, Lookup should return ENOENT.
//
// Lookup need not handle the names "." and "..".
Lookup(ctx context.Context, name string) (Node, error)
}
type NodeRequestLookuper interface {
// Lookup looks up a specific entry in the receiver.
// See NodeStringLookuper for more.
Lookup(ctx context.Context, req *fuse.LookupRequest, resp *fuse.LookupResponse) (Node, error)
}
type NodeMkdirer interface {
Mkdir(ctx context.Context, req *fuse.MkdirRequest) (Node, error)
}
type NodeOpener interface {
// Open opens the receiver. After a successful open, a client
// process has a file descriptor referring to this Handle.
//
// Open can also be called on non-files. For example,
// directories are Opened for ReadDir or fchdir(2).
//
// If this method is not implemented, the open will always
// succeed, and the Node itself will be used as the Handle.
//
// XXX note about access. XXX OpenFlags.
Open(ctx context.Context, req *fuse.OpenRequest, resp *fuse.OpenResponse) (Handle, error)
}
type NodeCreater interface {
// Create creates a new directory entry in the receiver, which
// must be a directory.
Create(ctx context.Context, req *fuse.CreateRequest, resp *fuse.CreateResponse) (Node, Handle, error)
}
type NodeForgetter interface {
// Forget about this node. This node will not receive further
// method calls.
//
// Forget is not necessarily seen on unmount, as all nodes are
// implicitly forgotten as part of the unmount.
Forget()
}
type NodeRenamer interface {
Rename(ctx context.Context, req *fuse.RenameRequest, newDir Node) error
}
type NodeMknoder interface {
Mknod(ctx context.Context, req *fuse.MknodRequest) (Node, error)
}
// TODO this should be on Handle not Node
type NodeFsyncer interface {
Fsync(ctx context.Context, req *fuse.FsyncRequest) error
}
type NodeGetxattrer interface {
// Getxattr gets an extended attribute by the given name from the
// node.
//
// If there is no xattr by that name, returns fuse.ErrNoXattr.
Getxattr(ctx context.Context, req *fuse.GetxattrRequest, resp *fuse.GetxattrResponse) error
}
type NodeListxattrer interface {
// Listxattr lists the extended attributes recorded for the node.
Listxattr(ctx context.Context, req *fuse.ListxattrRequest, resp *fuse.ListxattrResponse) error
}
type NodeSetxattrer interface {
// Setxattr sets an extended attribute with the given name and
// value for the node.
Setxattr(ctx context.Context, req *fuse.SetxattrRequest) error
}
type NodeRemovexattrer interface {
// Removexattr removes an extended attribute for the name.
//
// If there is no xattr by that name, returns fuse.ErrNoXattr.
Removexattr(ctx context.Context, req *fuse.RemovexattrRequest) error
}
var startTime = time.Now()
func nodeAttr(ctx context.Context, n Node, attr *fuse.Attr) error {
attr.Valid = attrValidTime
attr.Nlink = 1
attr.Atime = startTime
attr.Mtime = startTime
attr.Ctime = startTime
attr.Crtime = startTime
if err := n.Attr(ctx, attr); err != nil {
return err
}
return nil
}
// A Handle is the interface required of an opened file or directory.
// See the documentation for type FS for general information
// pertaining to all methods.
//
// Other FUSE requests can be handled by implementing methods from the
// Handle* interfaces. The most common to implement are HandleReader,
// HandleReadDirer, and HandleWriter.
//
// TODO implement methods: Getlk, Setlk, Setlkw
type Handle interface {
}
type HandleFlusher interface {
// Flush is called each time the file or directory is closed.
// Because there can be multiple file descriptors referring to a
// single opened file, Flush can be called multiple times.
Flush(ctx context.Context, req *fuse.FlushRequest) error
}
type HandleReadAller interface {
ReadAll(ctx context.Context) ([]byte, error)
}
type HandleReadDirer interface {
ReadDir(ctx context.Context, req *fuse.ReadRequest, resp *fuse.ReadResponse) ([]fuse.Dirent, error)
}
type HandleReadDirAller interface {
ReadDirAll(ctx context.Context) ([]fuse.Dirent, error)
}
type HandleReader interface {
// Read requests to read data from the handle.
//
// There is a page cache in the kernel that normally submits only
// page-aligned reads spanning one or more pages. However, you
// should not rely on this. To see individual requests as
// submitted by the file system clients, set OpenDirectIO.
//
// Note that reads beyond the size of the file as reported by Attr
// are not even attempted (except in OpenDirectIO mode).
Read(ctx context.Context, req *fuse.ReadRequest, resp *fuse.ReadResponse) error
}
type HandleWriter interface {
// Write requests to write data into the handle at the given offset.
// Store the amount of data written in resp.Size.
//
// There is a writeback page cache in the kernel that normally submits
// only page-aligned writes spanning one or more pages. However,
// you should not rely on this. To see individual requests as
// submitted by the file system clients, set OpenDirectIO.
//
// Writes that grow the file are expected to update the file size
// (as seen through Attr). Note that file size changes are
// communicated also through Setattr.
Write(ctx context.Context, req *fuse.WriteRequest, resp *fuse.WriteResponse) error
}
type HandleReleaser interface {
Release(ctx context.Context, req *fuse.ReleaseRequest) error
}
type Config struct {
// Function to send debug log messages to. If nil, use fuse.Debug.
// Note that changing this or fuse.Debug may not affect existing
// calls to Serve.
//
// See fuse.Debug for the rules that log functions must follow.
Debug func(msg interface{})
// Function to put things into context for processing the request.
// The returned context must have ctx as its parent.
//
// Note that changing this may not affect existing calls to Serve.
//
// Must not retain req.
WithContext func(ctx context.Context, req fuse.Request) context.Context
}
// New returns a new FUSE server ready to serve this kernel FUSE
// connection.
//
// Config may be nil.
func New(conn *fuse.Conn, config *Config) *Server {
s := &Server{
conn: conn,
req: map[fuse.RequestID]*serveRequest{},
nodeRef: map[Node]fuse.NodeID{},
dynamicInode: GenerateDynamicInode,
}
if config != nil {
s.debug = config.Debug
s.context = config.WithContext
}
if s.debug == nil {
s.debug = fuse.Debug
}
return s
}
type Server struct {
// set in New
conn *fuse.Conn
debug func(msg interface{})
context func(ctx context.Context, req fuse.Request) context.Context
// set once at Serve time
fs FS
dynamicInode func(parent uint64, name string) uint64
// state, protected by meta
meta sync.Mutex
req map[fuse.RequestID]*serveRequest
node []*serveNode
nodeRef map[Node]fuse.NodeID
handle []*serveHandle
freeNode []fuse.NodeID
freeHandle []fuse.HandleID
nodeGen uint64
// Allocated to ensure worker goroutines finish before Serve returns
wg sync.WaitGroup
}
const (
ContextNodeVersionV1 uint32 = 1
ContextHandleVersionV1 uint32 = 1
ContextNodeVersion uint32 = ContextNodeVersionV1
ContextHandleVersion uint32 = ContextHandleVersionV1
NodeListFileName string = "/tmp/CubeFS-fuse-Nodes.list"
HandleListFileName string = "/tmp/CubeFS-fuse-Handles.list"
)
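// WriteVersion writes a 4-byte big-endian format version at the head of a saved
// context file; ReadVersion reads it back.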
func WriteVersion(file *os.File, version uint32) error {
data := make([]byte, 4)
binary.BigEndian.PutUint32(data, version)
_, err := file.Write(data)
return err
}
func ReadVersion(file *os.File) (uint32, error) {
data := make([]byte, 4)
_, err := file.Read(data)
if err != nil {
return 0, err
}
version := binary.BigEndian.Uint32(data)
return version, nil
}
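// ContextNode is the fixed-size on-disk representation of a serveNode, written to
// the node list file when the FUSE server is suspended.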
type ContextNode struct {
Inode uint64
ParentIno uint64
Generation uint64
Refs uint64
NodeID uint64
Mode uint32
Rsvd uint32
}
func (cn *ContextNode) String() string {
return fmt.Sprintf("nodeid:%v inode:%v parent:%v gen:%v refs:%v mode:%o",
cn.NodeID, cn.Inode, cn.ParentIno, cn.Generation, cn.Refs, cn.Mode)
}
func ContextNodeToBytes(cn *ContextNode) []byte {
var buf []byte = make([]byte, unsafe.Sizeof(ContextNode{}))
binary.BigEndian.PutUint64(buf[0:8], cn.Inode)
binary.BigEndian.PutUint64(buf[8:16], cn.ParentIno)
binary.BigEndian.PutUint64(buf[16:24], cn.Generation)
binary.BigEndian.PutUint64(buf[24:32], cn.Refs)
binary.BigEndian.PutUint64(buf[32:40], cn.NodeID)
binary.BigEndian.PutUint32(buf[40:44], cn.Mode)
return buf
}
func ContextNodeFromBytes(buf []byte) *ContextNode {
cn := &ContextNode{}
cn.Inode = binary.BigEndian.Uint64(buf[0:8])
cn.ParentIno = binary.BigEndian.Uint64(buf[8:16])
cn.Generation = binary.BigEndian.Uint64(buf[16:24])
cn.Refs = binary.BigEndian.Uint64(buf[24:32])
cn.NodeID = binary.BigEndian.Uint64(buf[32:40])
cn.Mode = binary.BigEndian.Uint32(buf[40:44])
return cn
}
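// ContextHandle is the fixed-size on-disk representation of a serveHandle, written
// to the handle list file when the FUSE server is suspended.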
type ContextHandle struct {
HandleID uint64
NodeID uint64
}
func (ch *ContextHandle) String() string {
return fmt.Sprintf("handleid:%v nodeid:%v", ch.HandleID, ch.NodeID)
}
func ContextHandleToBytes(ch *ContextHandle) []byte {
var buf []byte = make([]byte, unsafe.Sizeof(ContextHandle{}))
binary.BigEndian.PutUint64(buf[0:8], ch.HandleID)
binary.BigEndian.PutUint64(buf[8:16], ch.NodeID)
return buf
}
func ContextHandleFromBytes(buf []byte) *ContextHandle {
ch := &ContextHandle{}
ch.HandleID = binary.BigEndian.Uint64(buf[0:8])
ch.NodeID = binary.BigEndian.Uint64(buf[8:16])
return ch
}
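// TrySuspend checks whether the file system has requested suspension. If so, it
// saves the node/handle context and the /dev/fuse fd, then waits until the state
// moves to Shutdown (returns true) or back to Resume (returns false).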
func (s *Server) TrySuspend(fs FS) bool {
var err error
var msg string
var ret bool
stat, sockaddr := fs.State()
if stat == FSStatSuspend {
if msg, err = s.SaveFuseContext(fs); err != nil {
s.CleanupFuseContext()
fs.Notify(stat, err)
goto out
}
if err = s.SaveFuseDevFd(sockaddr); err != nil {
s.CleanupFuseContext()
fs.Notify(stat, err)
goto out
}
fs.Notify(stat, msg)
out:
for {
stat, _ = fs.State()
if stat == FSStatShutdown {
ret = true
break
} else if stat == FSStatResume {
s.CleanupFuseContext()
ret = false
break
} else {
runtime.Gosched()
}
}
}
return ret
}
func (s *Server) CleanupFuseContext() {
os.Remove(NodeListFileName)
os.Remove(HandleListFileName)
}
func (s *Server) SaveFuseContext(fs FS) (msg string, err error) {
var (
nodeListFile *os.File
handleListFile *os.File
ncount int
hcount int
skip uint64
)
// Wait for all received requests to finish
// FIXME: add a timeout to avoid waiting forever
s.wg.Wait()
if nodeListFile, err = os.OpenFile(NodeListFileName, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0644); err != nil {
err = fmt.Errorf("SaveFuseContext: failed to create nodes list file: %v", err)
return
}
defer nodeListFile.Close()
if handleListFile, err = os.OpenFile(HandleListFileName, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0644); err != nil {
err = fmt.Errorf("SaveFuseContext: failed to create s list file: %v", err)
return
}
defer handleListFile.Close()
if err = WriteVersion(nodeListFile, ContextNodeVersion); err != nil {
err = fmt.Errorf("SaveFuseContext: failed to write nodes list file: %v", err)
return
}
if err = WriteVersion(handleListFile, ContextHandleVersion); err != nil {
err = fmt.Errorf("SaveFuseContext: failed to write handles list file: %v", err)
return
}
s.meta.Lock()
// s.node[0] is nil and s.node[1] is root.
// No need to save root since it is created every time fuse is mounted.
skip = 2
for i, sn := range s.node[skip:] {
var (
attr fuse.Attr = fuse.Attr{}
nodeid uint64 = skip + uint64(i)
n int
)
if sn == nil {
continue
}
sn.wg.Wait()
if err = sn.node.Attr(context.TODO(), &attr); err != nil {
s.meta.Unlock()
err = fmt.Errorf("SaveFuseContext: failed to get mode of node %v: %v", sn.inode, err)
return
}
cn := &ContextNode{sn.inode, attr.ParentIno, sn.generation, sn.refs, nodeid, uint32(attr.Mode), 0}
data := ContextNodeToBytes(cn)
if n, err = nodeListFile.Write(data); n != len(data) || err != nil {
s.meta.Unlock()
err = fmt.Errorf("SaveFuseContext: failed to write nodes list file: %v", err)
return
}
ncount++
// Check whether we need to stop.
if ncount%20 == 0 {
stat, _ := fs.State()
if stat != FSStatSuspend {
s.meta.Unlock()
err = fmt.Errorf("SaveFuseContext: detect state changed to %v", stat)
return
}
}
}
skip = 1
for i, sh := range s.handle[skip:] {
var (
handleid uint64 = skip + uint64(i)
n int
)
if sh == nil {
continue
}
if hdl, ok := sh.handle.(HandleFlusher); ok {
if err = hdl.Flush(context.TODO(), nil); err != nil {
s.meta.Unlock()
err = fmt.Errorf("SaveFuseContext: flush handle %v: %v\n",
s.node[sh.nodeID].inode, err)
return
}
}
ch := &ContextHandle{handleid, uint64(sh.nodeID)}
data := ContextHandleToBytes(ch)
if n, err = handleListFile.Write(data); n != len(data) || err != nil {
s.meta.Unlock()
err = fmt.Errorf("SaveFuseContext: failed to write handles list file: %v", err)
return
}
hcount++
// Check whether we need to stop.
if hcount%20 == 0 {
stat, _ := fs.State()
if stat != FSStatSuspend {
s.meta.Unlock()
err = fmt.Errorf("SaveFuseContext: detect state changed to %v", stat)
return
}
}
}
s.meta.Unlock()
if err = nodeListFile.Sync(); err != nil {
err = fmt.Errorf("SaveFuseContext: failed to sync nodes list file: %v", err)
return
}
if err = handleListFile.Sync(); err != nil {
err = fmt.Errorf("SaveFuseContext: failed to sync handles list file: %v", err)
return
}
msg = fmt.Sprintf("Node count: %d Handle count: %d", ncount, hcount)
return
}
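// SaveFuseDevFd sends the /dev/fuse file descriptor over the unix socket at
// sockaddr (via util.SendFd).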
func (s *Server) SaveFuseDevFd(sockaddr string) (err error) {
var addr *net.UnixAddr
var conn *net.UnixConn
var fud *os.File
var socket *os.File
defer func() {
if socket != nil {
socket.Close()
}
if conn != nil {
conn.Close()
}
}()
if addr, err = net.ResolveUnixAddr("unix", sockaddr); err != nil {
return fmt.Errorf("SaveFuseDevFd: failed to create unix addr: %v", err)
}
if conn, err = net.DialUnix("unix", nil, addr); err != nil {
return fmt.Errorf("SaveFuseDevFd: failed to connect unix socket: %v", err)
}
if socket, err = conn.File(); err != nil {
return fmt.Errorf("SaveFuseDevFd: failed to get socket file: %v", err)
}
fud = s.conn.GetFuseDevFile()
if fud == nil {
return fmt.Errorf("SaveFuseDevFd: fuse dev not exist")
}
if err = util.SendFd(socket, fud.Name(), fud.Fd()); err != nil {
return fmt.Errorf("SaveFuseDevFd: failed to send fuse dev file: %v", err)
}
return nil
}
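// TryRestore rebuilds the node and handle tables from the saved context files and,
// if needed, receives the /dev/fuse fd over the unix socket, then waits for the
// state to switch to Resume.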
func (s *Server) TryRestore(fs FS) error {
stat, sockaddr := fs.State()
if stat != FSStatRestore {
return nil
}
err := s.LoadFuseContext(fs, sockaddr)
if err != nil {
return err
}
if s.conn.GetFuseDevFile() == nil {
if err = s.LoadFuseDevFd(sockaddr); err != nil {
return err
}
}
fs.Notify(stat, "")
for {
stat, _ = fs.State()
if stat == FSStatResume {
//s.CleanupFuseContext()
break
} else if stat == FSStatRestore {
runtime.Gosched()
} else {
return fmt.Errorf("Unknown state changed %v", stat)
}
}
return nil
}
func (s *Server) LoadFuseContext(fs FS, sockaddr string) error {
nodeListFile, err := os.OpenFile(NodeListFileName, os.O_RDONLY, 0644)
if err != nil {
err = fmt.Errorf("LoadFuseContext: failed to open nodes list file: %v\n", err)
return err
}
defer nodeListFile.Close()
handleListFile, err := os.OpenFile(HandleListFileName, os.O_RDONLY, 0644)
if err != nil {
err = fmt.Errorf("LoadFuseContext: failed to open handles list file: %v\n", err)
return err
}
defer handleListFile.Close()
cnVersion, err := ReadVersion(nodeListFile)
if err != nil {
err = fmt.Errorf("LoadFuseContext: failed to read nodes version: %v\n", err)
return err
}
chVersion, err := ReadVersion(handleListFile)
if err != nil {
err = fmt.Errorf("LoadFuseContext: failed to read handles version: %v\n", err)
return err
}
for {
var (
data []byte = make([]byte, unsafe.Sizeof(ContextNode{}))
rsize int
)
rsize, err = nodeListFile.Read(data)
if rsize == 0 || err == io.EOF {
err = nil
break
}
if cnVersion == ContextNodeVersionV1 {
cn := ContextNodeFromBytes(data)
sn := &serveNode{inode: cn.Inode, generation: cn.Generation, refs: cn.Refs}
if sn.node, err = fs.Node(cn.Inode, cn.ParentIno, cn.Mode); err != nil {
err = fmt.Errorf("LoadFuseContext: failed to get fs.Node of %v: %v\n", sn.inode, err)
return err
}
for uint64(len(s.node)) < cn.NodeID {
freeNodeID := fuse.NodeID(len(s.node))
s.freeNode = append(s.freeNode, freeNodeID)
s.node = append(s.node, nil)
}
s.node = append(s.node, sn)
s.nodeRef[sn.node] = fuse.NodeID(cn.NodeID)
} else {
err = fmt.Errorf("LoadFuseContext: unrecognize nodes file version %v\n", cnVersion)
return err
}
}
for {
var (
data []byte = make([]byte, unsafe.Sizeof(ContextHandle{}))
rsize int
hdl Handle
)
rsize, err = handleListFile.Read(data)
if rsize == 0 || err == io.EOF {
err = nil
break
}
if chVersion == ContextHandleVersionV1 {
ch := ContextHandleFromBytes(data)
if ch.NodeID > uint64(len(s.node)) {
err = fmt.Errorf("LoadFuseContext: invalid handle(%v) len of s.node %v\n",
ch, len(s.node))
return err
}
sn := s.node[ch.NodeID]
if node, ok := sn.node.(NodeOpener); ok {
// create streamers for cubefs
if hdl, err = node.Open(context.TODO(), nil, nil); err != nil {
err = fmt.Errorf("LoadFuseContext: failed to open handle %v: %v\n", sn.inode, err)
return err
}
} else {
hdl = sn.node
}
sh := &serveHandle{handle: hdl, nodeID: fuse.NodeID(ch.NodeID)}
for uint64(len(s.handle)) < ch.HandleID {
freeHandleID := fuse.HandleID(len(s.handle))
s.freeHandle = append(s.freeHandle, freeHandleID)
s.handle = append(s.handle, nil)
}
s.handle = append(s.handle, sh)
} else {
err = fmt.Errorf("LoadFuseContext: unrecognize handles file version %v\n", chVersion)
return err
}
}
return err
}
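// LoadFuseDevFd receives the /dev/fuse file descriptor from the unix socket at
// sockaddr (via util.RecvFd) and attaches it to the connection.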
func (s *Server) LoadFuseDevFd(sockaddr string) (err error) {
var (
addr *net.UnixAddr
conn *net.UnixConn
fud *os.File
socket *os.File
)
defer func() {
if socket != nil {
socket.Close()
}
if conn != nil {
conn.Close()
}
}()
if addr, err = net.ResolveUnixAddr("unix", sockaddr); err != nil {
err = fmt.Errorf("LoadFuseDevFd: failed to create unix addr: %v", err)
return
}
if conn, err = net.DialUnix("unix", nil, addr); err != nil {
err = fmt.Errorf("LoadFuseDevFd: failed to connect unix socket: %v", err)
return
}
if socket, err = conn.File(); err != nil {
err = fmt.Errorf("LoadFuseDevFd: failed to get socket file: %v", err)
return
}
if fud, err = util.RecvFd(socket); err != nil {
err = fmt.Errorf("LoadFuseDevFd: failed to receive fuse dev file: %v", err)
return
}
s.conn.SetFuseDevFile(fud)
return
}
// Serve serves the FUSE connection by making calls to the methods
// of fs and the Nodes and Handles it makes available. It returns only
// when the connection has been closed or an unexpected error occurs.
func (s *Server) Serve(fs FS, opt *proto.MountOptions) error {
defer s.wg.Wait() // Wait for worker goroutines to complete before return
s.fs = fs
if dyn, ok := fs.(FSInodeGenerator); ok {
s.dynamicInode = dyn.GenerateInode
}
root, err := fs.Root()
if err != nil {
return fmt.Errorf("cannot obtain root node: %v", err)
}
// Recognize the root node if it's ever returned from Lookup,
// passed to Invalidate, etc.
s.nodeRef[root] = 1
s.node = append(s.node, nil, &serveNode{
inode: 1,
generation: s.nodeGen,
node: root,
refs: 1,
})
s.handle = append(s.handle, nil)
if err = s.TryRestore(fs); err != nil {
return fmt.Errorf("restore fail: %v", err)
}
for {
if s.TrySuspend(fs) {
break
}
req, err := s.conn.ReadRequest()
if err != nil {
if err == io.EOF {
break
}
return err
}
switch req.(type) {
case *fuse.ForgetRequest:
ctx := context.Background()
ForgetServeLimit.Wait(ctx)
default:
}
s.wg.Add(1)
go func() {
defer s.wg.Done()
if opt != nil && opt.RequestTimeout > 0 {
s.serveWithTimeOut(req, opt.RequestTimeout)
} else {
s.serve(req)
}
}()
}
return nil
}
// Serve serves a FUSE connection with the default settings. See
// Server.Serve.
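//
// A minimal usage sketch; the mountpoint path and the myFS filesystem
// type are hypothetical stand-ins for the caller's own values:
//
//	conn, err := fuse.Mount("/mnt/cubefs", false)
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer conn.Close()
//	if err := fs.Serve(conn, myFS{}, nil); err != nil {
//		log.Fatal(err)
//	}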
func Serve(c *fuse.Conn, fs FS, opt *proto.MountOptions) error {
server := New(c, nil)
return server.Serve(fs, opt)
}
type nothing struct{}
type serveRequest struct {
Request fuse.Request
cancel func()
}
type serveNode struct {
inode uint64
generation uint64
node Node
refs uint64
// Delay freeing the NodeID until waitgroup is done. This allows
// using the NodeID for short periods of time without holding the
// Server.meta lock.
//
// Rules:
//
// - hold Server.meta while calling wg.Add, then unlock
// - do NOT try to reacquire Server.meta
wg sync.WaitGroup
}
func (sn *serveNode) attr(ctx context.Context, attr *fuse.Attr) error {
err := nodeAttr(ctx, sn.node, attr)
if attr.Inode == 0 {
attr.Inode = sn.inode
}
return err
}
type serveHandle struct {
handle Handle
readData []byte
nodeID fuse.NodeID
}
// NodeRef is deprecated. It remains here to decrease code churn on
// FUSE library users. You may remove it from your program now;
// returning the same Node values are now recognized automatically,
// without needing NodeRef.
type NodeRef struct{}
func (c *Server) saveNode(inode uint64, node Node) (id fuse.NodeID, gen uint64) {
c.meta.Lock()
defer c.meta.Unlock()
if id, ok := c.nodeRef[node]; ok {
sn := c.node[id]
sn.refs++
return id, sn.generation
}
sn := &serveNode{inode: inode, node: node, refs: 1}
if n := len(c.freeNode); n > 0 {
id = c.freeNode[n-1]
c.freeNode = c.freeNode[:n-1]
c.node[id] = sn
c.nodeGen++
} else {
id = fuse.NodeID(len(c.node))
c.node = append(c.node, sn)
}
sn.generation = c.nodeGen
c.nodeRef[node] = id
return id, sn.generation
}
func (c *Server) saveHandle(handle Handle, nodeID fuse.NodeID) (id fuse.HandleID) {
c.meta.Lock()
shandle := &serveHandle{handle: handle, nodeID: nodeID}
if n := len(c.freeHandle); n > 0 {
id = c.freeHandle[n-1]
c.freeHandle = c.freeHandle[:n-1]
c.handle[id] = shandle
} else {
id = fuse.HandleID(len(c.handle))
c.handle = append(c.handle, shandle)
}
c.meta.Unlock()
return
}
type nodeRefcountDropBug struct {
N uint64
Refs uint64
Node fuse.NodeID
}
func (n *nodeRefcountDropBug) String() string {
return fmt.Sprintf("bug: trying to drop %d of %d references to %v", n.N, n.Refs, n.Node)
}
func (c *Server) dropNode(id fuse.NodeID, n uint64) (forget bool) {
c.meta.Lock()
defer c.meta.Unlock()
snode := c.node[id]
if snode == nil {
// this should only happen if refcounts kernel<->us disagree
// *and* two ForgetRequests for the same node race each other;
// this indicates a bug somewhere
c.debug(nodeRefcountDropBug{N: n, Node: id})
// we may end up triggering Forget twice, but that's better
// than not even once, and that's the best we can do
return true
}
if n > snode.refs {
c.debug(nodeRefcountDropBug{N: n, Refs: snode.refs, Node: id})
n = snode.refs
}
snode.refs -= n
if snode.refs == 0 {
snode.wg.Wait()
c.node[id] = nil
delete(c.nodeRef, snode.node)
c.freeNode = append(c.freeNode, id)
return true
}
return false
}
func (c *Server) dropHandle(id fuse.HandleID) {
c.meta.Lock()
c.handle[id] = nil
c.freeHandle = append(c.freeHandle, id)
c.meta.Unlock()
}
type missingHandle struct {
Handle fuse.HandleID
MaxHandle fuse.HandleID
}
func (m missingHandle) String() string {
return fmt.Sprint("missing handle: ", m.Handle, m.MaxHandle)
}
// Returns nil for invalid handles.
func (c *Server) getHandle(id fuse.HandleID) (shandle *serveHandle) {
c.meta.Lock()
defer c.meta.Unlock()
if id < fuse.HandleID(len(c.handle)) {
shandle = c.handle[uint(id)]
}
if shandle == nil {
c.debug(missingHandle{
Handle: id,
MaxHandle: fuse.HandleID(len(c.handle)),
})
}
return
}
type request struct {
Op string
Request *fuse.Header
In interface{} `json:",omitempty"`
}
func (r request) String() string {
return fmt.Sprintf("<- %s", r.In)
}
type logResponseHeader struct {
ID fuse.RequestID
}
func (m logResponseHeader) String() string {
return fmt.Sprintf("ID=%v", m.ID)
}
type response struct {
Op string
Request logResponseHeader
Out interface{} `json:",omitempty"`
// Errno contains the errno value as a string, for example "EPERM".
Errno string `json:",omitempty"`
// Error may contain a free form error message.
Error string `json:",omitempty"`
}
func (r response) errstr() string {
s := r.Errno
if r.Error != "" {
// prefix the errno constant to the long form message
s = s + ": " + r.Error
}
return s
}
func (r response) String() string {
switch {
case r.Errno != "" && r.Out != nil:
return fmt.Sprintf("-> [%v] %v error=%s", r.Request, r.Out, r.errstr())
case r.Errno != "":
return fmt.Sprintf("-> [%v] %s error=%s", r.Request, r.Op, r.errstr())
case r.Out != nil:
// make sure (seemingly) empty values are readable
switch r.Out.(type) {
case string:
return fmt.Sprintf("-> [%v] %s %q", r.Request, r.Op, r.Out)
case []byte:
return fmt.Sprintf("-> [%v] %s [% x]", r.Request, r.Op, r.Out)
default:
return fmt.Sprintf("-> [%v] %v", r.Request, r.Out)
}
default:
return fmt.Sprintf("-> [%v] %s", r.Request, r.Op)
}
}
type notification struct {
Op string
Node fuse.NodeID
Out interface{} `json:",omitempty"`
Err string `json:",omitempty"`
}
func (n notification) String() string {
var buf bytes.Buffer
fmt.Fprintf(&buf, "=> %s %v", n.Op, n.Node)
if n.Out != nil {
// make sure (seemingly) empty values are readable
switch n.Out.(type) {
case string:
fmt.Fprintf(&buf, " %q", n.Out)
case []byte:
fmt.Fprintf(&buf, " [% x]", n.Out)
default:
fmt.Fprintf(&buf, " %s", n.Out)
}
}
if n.Err != "" {
fmt.Fprintf(&buf, " Err:%v", n.Err)
}
return buf.String()
}
type logMissingNode struct {
MaxNode fuse.NodeID
}
func opName(req fuse.Request) string {
t := reflect.Indirect(reflect.ValueOf(req)).Type()
s := t.Name()
s = strings.TrimSuffix(s, "Request")
return s
}
type logLinkRequestOldNodeNotFound struct {
Request *fuse.Header
In *fuse.LinkRequest
}
func (m *logLinkRequestOldNodeNotFound) String() string {
return fmt.Sprintf("In LinkRequest (request %v), node %d not found", m.Request.Hdr().ID, m.In.OldNode)
}
type renameNewDirNodeNotFound struct {
Request *fuse.Header
In *fuse.RenameRequest
}
func (m *renameNewDirNodeNotFound) String() string {
return fmt.Sprintf("In RenameRequest (request %v), node %d not found", m.Request.Hdr().ID, m.In.NewDir)
}
type handlerPanickedError struct {
Request interface{}
Err interface{}
}
var _ error = handlerPanickedError{}
func (h handlerPanickedError) Error() string {
return fmt.Sprintf("handler panicked: %v", h.Err)
}
var _ fuse.ErrorNumber = handlerPanickedError{}
func (h handlerPanickedError) Errno() fuse.Errno {
if err, ok := h.Err.(fuse.ErrorNumber); ok {
return err.Errno()
}
return fuse.DefaultErrno
}
// handlerTerminatedError happens when a handler terminates itself
// with runtime.Goexit. This is most commonly because of incorrect use
// of testing.TB.FailNow, typically via t.Fatal.
type handlerTerminatedError struct {
Request interface{}
}
var _ error = handlerTerminatedError{}
func (h handlerTerminatedError) Error() string {
return fmt.Sprintf("handler terminated (called runtime.Goexit)")
}
var _ fuse.ErrorNumber = handlerTerminatedError{}
func (h handlerTerminatedError) Errno() fuse.Errno {
return fuse.DefaultErrno
}
type handleNotReaderError struct {
handle Handle
}
var _ error = handleNotReaderError{}
func (e handleNotReaderError) Error() string {
return fmt.Sprintf("handle has no Read: %T", e.handle)
}
var _ fuse.ErrorNumber = handleNotReaderError{}
func (e handleNotReaderError) Errno() fuse.Errno {
return fuse.ENOTSUP
}
func initLookupResponse(s *fuse.LookupResponse) {
s.EntryValid = entryValidTime
}
func (c *Server) serve(r fuse.Request) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
parentCtx := ctx
if c.context != nil {
ctx = c.context(ctx, r)
}
req := &serveRequest{Request: r, cancel: cancel}
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("fuse:"+opName(r), nil, bgTime, 1)
}()
c.debug(request{
Op: opName(r),
Request: r.Hdr(),
In: r,
})
node, snode, hdr, ok := c.checkNode(r, req)
if ok {
return
}
done := c.done(r, hdr)
var responded bool
defer func() {
if rec := recover(); rec != nil {
const size = 1 << 16
buf := make([]byte, size)
n := runtime.Stack(buf, false)
buf = buf[:n]
log.Printf("fuse: panic in handler for %v: %v\n%s", r, rec, buf)
err := handlerPanickedError{
Request: r,
Err: rec,
}
done(err)
r.RespondError(err)
return
}
if !responded {
err := handlerTerminatedError{
Request: r,
}
done(err)
r.RespondError(err)
}
}()
if err := c.handleRequest(ctx, node, snode, r, done); err != nil {
if err == context.Canceled {
select {
case <-parentCtx.Done():
// We canceled the parent context because of an
// incoming interrupt request, so return EINTR
// to trigger the right behavior in the client app.
//
// Only do this when it's the parent context that was
// canceled, not a context controlled by the program
// using this library, so we don't return EINTR too
// eagerly -- it might cause busy loops.
//
// Decent write-up on role of EINTR:
// http://250bpm.com/blog:12
err = fuse.EINTR
default:
// nothing
}
}
done(err)
r.RespondError(err)
}
// disarm runtime.Goexit protection
responded = true
}
func (c *Server) done(r fuse.Request, hdr *fuse.Header) func(resp interface{}) {
// Call this before responding.
// After responding is too late: we might get another request
// with the same ID and be very confused.
done := func(resp interface{}) {
msg := response{
Op: opName(r),
Request: logResponseHeader{ID: hdr.ID},
}
if err, ok := resp.(error); ok {
msg.Error = err.Error()
if ferr, ok := err.(fuse.ErrorNumber); ok {
errno := ferr.Errno()
msg.Errno = errno.ErrnoName()
if errno == err {
// it's just a fuse.Errno with no extra detail;
// skip the textual message for log readability
msg.Error = ""
}
} else {
msg.Errno = fuse.DefaultErrno.ErrnoName()
}
} else {
msg.Out = resp
}
c.debug(msg)
c.meta.Lock()
delete(c.req, hdr.ID)
c.meta.Unlock()
}
return done
}
func (c *Server) checkNode(r fuse.Request, req *serveRequest) (Node, *serveNode, *fuse.Header, bool) {
var node Node
var snode *serveNode
c.meta.Lock()
hdr := r.Hdr()
if id := hdr.Node; id != 0 {
if id < fuse.NodeID(len(c.node)) {
snode = c.node[uint(id)]
}
if snode == nil {
c.meta.Unlock()
c.debug(response{
Op: opName(r),
Request: logResponseHeader{ID: hdr.ID},
Error: fuse.ESTALE.ErrnoName(),
// this is the only place that sets both Error and
// Out; not sure if I want to do that; might get rid
// of len(c.node) things altogether
Out: logMissingNode{
MaxNode: fuse.NodeID(len(c.node)),
},
})
r.RespondError(fuse.ESTALE)
return nil, nil, nil, true
}
node = snode.node
}
if c.req[hdr.ID] != nil {
// This happens with OSXFUSE. Assume it's okay and
// that we'll never see an interrupt for this one.
// Otherwise everything wedges. TODO: Report to OSXFUSE?
//
// TODO this might have been because of missing done() calls
} else {
c.req[hdr.ID] = req
}
c.meta.Unlock()
return node, snode, hdr, false
}
func (c *Server) serveWithTimeOut(r fuse.Request, requestTimeout int64) {
ctx, cancel := context.WithTimeout(context.Background(), time.Second*time.Duration(requestTimeout))
defer cancel()
doneChan := make(chan error, 1)
parentCtx := ctx
if c.context != nil {
ctx = c.context(ctx, r)
}
req := &serveRequest{Request: r, cancel: cancel}
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("fuse:"+opName(r), nil, bgTime, 1)
}()
c.debug(request{
Op: opName(r),
Request: r.Hdr(),
In: r,
})
node, snode, hdr, ok := c.checkNode(r, req)
if ok {
return
}
done := c.done(r, hdr)
go func() {
defer func() {
if rec := recover(); rec != nil {
const size = 1 << 16
buf := make([]byte, size)
n := runtime.Stack(buf, false)
buf = buf[:n]
log.Printf("fuse: panic in handler for %v: %v\n%s", r, rec, buf)
err := handlerPanickedError{
Request: r,
Err: rec,
}
done(err)
r.RespondError(err)
return
}
}()
doneChan <- c.handleRequest(ctx, node, snode, r, done)
}()
select {
case err := <-doneChan:
if err != nil {
if err == context.Canceled {
select {
case <-parentCtx.Done():
err = fuse.EINTR
default:
// nothing
}
}
done(err)
r.RespondError(err)
}
case <-ctx.Done():
err := ctx.Err()
if err != nil {
if err.Error() == "context canceled" {
// Context is finished, ignore
} else if err.Error() == "context deadline exceeded" {
log.Printf("request timeout, err: [%v], req: [%v], conn: [%v], pid: [%v]", ctx.Err(), r, r.Hdr().Conn, r.Hdr().Pid)
done(fuse.ETIME)
r.RespondError(fuse.ETIME)
} else {
done(fuse.EIO)
r.RespondError(fuse.EIO)
}
}
}
}
// handleRequest will either a) call done(s) and r.Respond(s) OR b) return an error.
func (c *Server) handleRequest(ctx context.Context, node Node, snode *serveNode, r fuse.Request, done func(resp interface{})) error {
switch r := r.(type) {
default:
// Note: To FUSE, ENOSYS means "this server never implements this request."
// It would be inappropriate to return ENOSYS for other operations in this
// switch that might only be unavailable in some contexts, not all.
return fuse.ENOSYS
case *fuse.StatfsRequest:
s := &fuse.StatfsResponse{}
if fs, ok := c.fs.(FSStatfser); ok {
if err := fs.Statfs(ctx, r, s); err != nil {
return err
}
}
done(s)
r.Respond(s)
return nil
// Node operations.
case *fuse.GetattrRequest:
s := &fuse.GetattrResponse{}
if n, ok := node.(NodeGetattrer); ok {
if err := n.Getattr(ctx, r, s); err != nil {
return err
}
} else {
if err := snode.attr(ctx, &s.Attr); err != nil {
return err
}
}
done(s)
r.Respond(s)
return nil
case *fuse.SetattrRequest:
s := &fuse.SetattrResponse{}
if n, ok := node.(NodeSetattrer); ok {
if err := n.Setattr(ctx, r, s); err != nil {
return err
}
}
if err := snode.attr(ctx, &s.Attr); err != nil {
return err
}
done(s)
r.Respond(s)
return nil
case *fuse.SymlinkRequest:
s := &fuse.SymlinkResponse{}
initLookupResponse(&s.LookupResponse)
n, ok := node.(NodeSymlinker)
if !ok {
return fuse.EIO // XXX or EPERM like Mkdir?
}
n2, err := n.Symlink(ctx, r)
if err != nil {
return err
}
if err := c.saveLookup(ctx, &s.LookupResponse, snode, r.NewName, n2); err != nil {
return err
}
done(s)
r.Respond(s)
return nil
case *fuse.ReadlinkRequest:
n, ok := node.(NodeReadlinker)
if !ok {
return fuse.EIO /// XXX or EPERM?
}
target, err := n.Readlink(ctx, r)
if err != nil {
return err
}
done(target)
r.Respond(target)
return nil
case *fuse.LinkRequest:
n, ok := node.(NodeLinker)
if !ok {
return fuse.EIO /// XXX or EPERM?
}
c.meta.Lock()
var oldNode *serveNode
if int(r.OldNode) < len(c.node) {
oldNode = c.node[r.OldNode]
}
c.meta.Unlock()
if oldNode == nil {
c.debug(logLinkRequestOldNodeNotFound{
Request: r.Hdr(),
In: r,
})
return fuse.EIO
}
n2, err := n.Link(ctx, r, oldNode.node)
if err != nil {
return err
}
s := &fuse.LookupResponse{}
initLookupResponse(s)
if err := c.saveLookup(ctx, s, snode, r.NewName, n2); err != nil {
return err
}
done(s)
r.Respond(s)
return nil
case *fuse.RemoveRequest:
n, ok := node.(NodeRemover)
if !ok {
return fuse.EIO /// XXX or EPERM?
}
err := n.Remove(ctx, r)
if err != nil {
return err
}
done(nil)
r.Respond()
return nil
case *fuse.AccessRequest:
if n, ok := node.(NodeAccesser); ok {
if err := n.Access(ctx, r); err != nil {
return err
}
}
done(nil)
r.Respond()
return nil
case *fuse.LookupRequest:
var n2 Node
var err error
s := &fuse.LookupResponse{}
initLookupResponse(s)
if n, ok := node.(NodeStringLookuper); ok {
n2, err = n.Lookup(ctx, r.Name)
} else if n, ok := node.(NodeRequestLookuper); ok {
n2, err = n.Lookup(ctx, r, s)
} else {
return fuse.ENOENT
}
if err != nil {
return err
}
if err := c.saveLookup(ctx, s, snode, r.Name, n2); err != nil {
return err
}
done(s)
r.Respond(s)
return nil
case *fuse.MkdirRequest:
s := &fuse.MkdirResponse{}
initLookupResponse(&s.LookupResponse)
n, ok := node.(NodeMkdirer)
if !ok {
return fuse.EPERM
}
n2, err := n.Mkdir(ctx, r)
if err != nil {
return err
}
if err := c.saveLookup(ctx, &s.LookupResponse, snode, r.Name, n2); err != nil {
return err
}
done(s)
r.Respond(s)
return nil
case *fuse.OpenRequest:
s := &fuse.OpenResponse{}
var h2 Handle
if n, ok := node.(NodeOpener); ok {
hh, err := n.Open(ctx, r, s)
if err != nil {
return err
}
h2 = hh
} else {
h2 = node
}
s.Handle = c.saveHandle(h2, r.Hdr().Node)
done(s)
r.Respond(s)
return nil
case *fuse.CreateRequest:
n, ok := node.(NodeCreater)
if !ok {
// If we send back ENOSYS, FUSE will try mknod+open.
return fuse.EPERM
}
s := &fuse.CreateResponse{OpenResponse: fuse.OpenResponse{}}
initLookupResponse(&s.LookupResponse)
n2, h2, err := n.Create(ctx, r, s)
if err != nil {
return err
}
if err := c.saveLookup(ctx, &s.LookupResponse, snode, r.Name, n2); err != nil {
return err
}
s.Handle = c.saveHandle(h2, s.Node)
done(s)
r.Respond(s)
return nil
case *fuse.GetxattrRequest:
n, ok := node.(NodeGetxattrer)
if !ok {
return fuse.ENOTSUP
}
s := &fuse.GetxattrResponse{}
err := n.Getxattr(ctx, r, s)
if err != nil {
return err
}
if r.Size != 0 && uint64(len(s.Xattr)) > uint64(r.Size) {
return fuse.ERANGE
}
done(s)
r.Respond(s)
return nil
case *fuse.ListxattrRequest:
n, ok := node.(NodeListxattrer)
if !ok {
return fuse.ENOTSUP
}
s := &fuse.ListxattrResponse{}
err := n.Listxattr(ctx, r, s)
if err != nil {
return err
}
if r.Size != 0 && uint64(len(s.Xattr)) > uint64(r.Size) {
return fuse.ERANGE
}
done(s)
r.Respond(s)
return nil
case *fuse.SetxattrRequest:
log.Println("SetxattrRequest")
n, ok := node.(NodeSetxattrer)
if !ok {
return fuse.ENOTSUP
}
err := n.Setxattr(ctx, r)
if err != nil {
return err
}
done(nil)
r.Respond()
return nil
case *fuse.RemovexattrRequest:
n, ok := node.(NodeRemovexattrer)
if !ok {
return fuse.ENOTSUP
}
err := n.Removexattr(ctx, r)
if err != nil {
return err
}
done(nil)
r.Respond()
return nil
case *fuse.ForgetRequest:
forget := c.dropNode(r.Hdr().Node, r.N)
if forget {
n, ok := node.(NodeForgetter)
if ok {
n.Forget()
}
}
done(nil)
r.Respond()
return nil
// Handle operations.
case *fuse.ReadRequest:
shandle := c.getHandle(r.Handle)
if shandle == nil {
return fuse.ESTALE
}
handle := shandle.handle
s := &fuse.ReadResponse{}
if r.Dir {
s.Data = make([]byte, r.Size)
// detect rewinddir(3) or similar seek and refresh
// contents
if r.Offset == 0 {
shandle.readData = nil
}
if h, ok := handle.(HandleReadDirer); ok {
var noMore bool
for !noMore && ((shandle.readData == nil) || (r.Offset+int64(r.Size) > int64(len(shandle.readData)))) {
dirs, err := h.ReadDir(ctx, r, s)
if err != nil {
if err == io.EOF {
noMore = true
} else {
return err
}
}
for _, dir := range dirs {
if dir.Inode == 0 {
dir.Inode = c.dynamicInode(snode.inode, dir.Name)
}
shandle.readData = fuse.AppendDirent(shandle.readData, dir)
}
}
} else if h, ok := handle.(HandleReadDirAller); ok {
if shandle.readData == nil {
dirs, err := h.ReadDirAll(ctx)
if err != nil {
return err
}
var data []byte
for _, dir := range dirs {
if dir.Inode == 0 {
dir.Inode = c.dynamicInode(snode.inode, dir.Name)
}
data = fuse.AppendDirent(data, dir)
}
shandle.readData = data
}
}
fuseutil.HandleRead(r, s, shandle.readData)
} else {
s.Data = fuse.GetBlockBuf(r.Size)
if h, ok := handle.(HandleReadAller); ok {
if shandle.readData == nil {
data, err := h.ReadAll(ctx)
if err != nil {
return err
}
if data == nil {
data = []byte{}
}
shandle.readData = data
}
fuseutil.HandleRead(r, s, shandle.readData)
done(s)
r.Respond(s)
return nil
}
h, ok := handle.(HandleReader)
if !ok {
err := handleNotReaderError{handle: handle}
return err
}
if err := h.Read(ctx, r, s); err != nil {
return err
}
}
done(s)
r.Respond(s)
return nil
case *fuse.WriteRequest:
shandle := c.getHandle(r.Handle)
if shandle == nil {
return fuse.ESTALE
}
s := &fuse.WriteResponse{}
if h, ok := shandle.handle.(HandleWriter); ok {
if err := h.Write(ctx, r, s); err != nil {
return err
}
done(s)
r.Respond(s)
return nil
}
return fuse.EIO
case *fuse.FlushRequest:
shandle := c.getHandle(r.Handle)
if shandle == nil {
return fuse.ESTALE
}
handle := shandle.handle
if h, ok := handle.(HandleFlusher); ok {
if err := h.Flush(ctx, r); err != nil {
return err
}
}
done(nil)
r.Respond()
return nil
case *fuse.ReleaseRequest:
shandle := c.getHandle(r.Handle)
if shandle == nil {
return fuse.ESTALE
}
handle := shandle.handle
// No matter what, release the handle.
c.dropHandle(r.Handle)
if h, ok := handle.(HandleReleaser); ok {
if err := h.Release(ctx, r); err != nil {
return err
}
}
done(nil)
r.Respond()
return nil
case *fuse.DestroyRequest:
if fs, ok := c.fs.(FSDestroyer); ok {
fs.Destroy()
}
done(nil)
r.Respond()
return nil
case *fuse.RenameRequest:
c.meta.Lock()
var newDirNode *serveNode
if int(r.NewDir) < len(c.node) {
newDirNode = c.node[r.NewDir]
}
c.meta.Unlock()
if newDirNode == nil {
c.debug(renameNewDirNodeNotFound{
Request: r.Hdr(),
In: r,
})
return fuse.EIO
}
n, ok := node.(NodeRenamer)
if !ok {
return fuse.EIO // XXX or EPERM like Mkdir?
}
err := n.Rename(ctx, r, newDirNode.node)
if err != nil {
return err
}
done(nil)
r.Respond()
return nil
case *fuse.MknodRequest:
n, ok := node.(NodeMknoder)
if !ok {
return fuse.EIO
}
n2, err := n.Mknod(ctx, r)
if err != nil {
return err
}
s := &fuse.LookupResponse{}
initLookupResponse(s)
if err := c.saveLookup(ctx, s, snode, r.Name, n2); err != nil {
return err
}
done(s)
r.Respond(s)
return nil
case *fuse.FsyncRequest:
n, ok := node.(NodeFsyncer)
if !ok {
return fuse.EIO
}
err := n.Fsync(ctx, r)
if err != nil {
return err
}
done(nil)
r.Respond()
return nil
case *fuse.InterruptRequest:
c.meta.Lock()
ireq := c.req[r.IntrID]
if ireq != nil && ireq.cancel != nil {
ireq.cancel()
ireq.cancel = nil
}
c.meta.Unlock()
done(nil)
r.Respond()
return nil
/* case *FsyncdirRequest:
return ENOSYS
case *GetlkRequest, *SetlkRequest, *SetlkwRequest:
return ENOSYS
case *BmapRequest:
return ENOSYS
case *SetvolnameRequest, *GetxtimesRequest, *ExchangeRequest:
return ENOSYS
*/
}
panic("not reached")
}
func (c *Server) saveLookup(ctx context.Context, s *fuse.LookupResponse, snode *serveNode, elem string, n2 Node) error {
if err := nodeAttr(ctx, n2, &s.Attr); err != nil {
return err
}
if s.Attr.Inode == 0 {
s.Attr.Inode = c.dynamicInode(snode.inode, elem)
}
s.Node, s.Generation = c.saveNode(s.Attr.Inode, n2)
return nil
}
type invalidateNodeDetail struct {
Off int64
Size int64
}
func (i invalidateNodeDetail) String() string {
return fmt.Sprintf("Off:%d Size:%d", i.Off, i.Size)
}
func errstr(err error) string {
if err == nil {
return ""
}
return err.Error()
}
func (s *Server) invalidateNode(node Node, off int64, size int64) error {
s.meta.Lock()
id, ok := s.nodeRef[node]
if ok {
snode := s.node[id]
snode.wg.Add(1)
defer snode.wg.Done()
}
s.meta.Unlock()
if !ok {
// This is what the kernel would have said, if we had been
// able to send this message; it's not cached.
return fuse.ErrNotCached
}
// Delay logging until after we can record the error too. We
// consider a /dev/fuse write to be instantaneous enough to not
// need separate before and after messages.
err := s.conn.InvalidateNode(id, off, size)
s.debug(notification{
Op: "InvalidateNode",
Node: id,
Out: invalidateNodeDetail{
Off: off,
Size: size,
},
Err: errstr(err),
})
return err
}
// InvalidateNodeAttr invalidates the kernel cache of the attributes
// of node.
//
// Returns fuse.ErrNotCached if the kernel is not currently caching
// the node.
func (s *Server) InvalidateNodeAttr(node Node) error {
return s.invalidateNode(node, 0, 0)
}
// InvalidateNodeData invalidates the kernel cache of the attributes
// and data of node.
//
// Returns fuse.ErrNotCached if the kernel is not currently caching
// the node.
func (s *Server) InvalidateNodeData(node Node) error {
return s.invalidateNode(node, 0, -1)
}
// InvalidateNodeDataRange invalidates the kernel cache of the
// attributes and a range of the data of node.
//
// Returns fuse.ErrNotCached if the kernel is not currently caching
// the node.
func (s *Server) InvalidateNodeDataRange(node Node, off int64, size int64) error {
return s.invalidateNode(node, off, size)
}
type invalidateEntryDetail struct {
Name string
}
func (i invalidateEntryDetail) String() string {
return fmt.Sprintf("%q", i.Name)
}
// InvalidateEntry invalidates the kernel cache of the directory entry
// identified by parent node and entry basename.
//
// Kernel may or may not cache directory listings. To invalidate
// those, use InvalidateNode to invalidate all of the data for a
// directory. (As of 2015-06, Linux FUSE does not cache directory
// listings.)
//
// Returns ErrNotCached if the kernel is not currently caching the
// node.
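//
// A hedged usage sketch; srv, dirNode, and the entry name are
// hypothetical values owned by the caller:
//
//	if err := srv.InvalidateEntry(dirNode, "old.txt"); err != nil && err != fuse.ErrNotCached {
//		log.Printf("invalidate entry: %v", err)
//	}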
func (s *Server) InvalidateEntry(parent Node, name string) error {
s.meta.Lock()
id, ok := s.nodeRef[parent]
if ok {
snode := s.node[id]
snode.wg.Add(1)
defer snode.wg.Done()
}
s.meta.Unlock()
if !ok {
// This is what the kernel would have said, if we had been
// able to send this message; it's not cached.
return fuse.ErrNotCached
}
err := s.conn.InvalidateEntry(id, name)
s.debug(notification{
Op: "InvalidateEntry",
Node: id,
Out: invalidateEntryDetail{
Name: name,
},
Err: errstr(err),
})
return err
}
// DataHandle returns a read-only Handle that satisfies reads
// using the given data.
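//
// A typical (illustrative) use is serving static content from a node's
// Open method; statusNode is a hypothetical type:
//
//	func (n statusNode) Open(ctx context.Context, req *fuse.OpenRequest, resp *fuse.OpenResponse) (Handle, error) {
//		return DataHandle([]byte("ok\n")), nil
//	}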
func DataHandle(data []byte) Handle {
return &dataHandle{data}
}
type dataHandle struct {
data []byte
}
func (d *dataHandle) ReadAll(ctx context.Context) ([]byte, error) {
return d.data, nil
}
// GenerateDynamicInode returns a dynamic inode.
//
// The parent inode and current entry name are used as the criteria
// for choosing a pseudorandom inode. This makes it likely the same
// entry will get the same inode on multiple runs.
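//
// The hash is deterministic, so, with illustrative values:
//
//	a := GenerateDynamicInode(1, "README")
//	b := GenerateDynamicInode(1, "README")
//	// a == b and a != 0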
func GenerateDynamicInode(parent uint64, name string) uint64 {
h := fnv.New64a()
var buf [8]byte
binary.LittleEndian.PutUint64(buf[:], parent)
_, _ = h.Write(buf[:])
_, _ = h.Write([]byte(name))
var inode uint64
for {
inode = h.Sum64()
if inode != 0 {
break
}
// there's a tiny probability that result is zero; change the
// input a little and try again
_, _ = h.Write([]byte{'x'})
}
return inode
}
// FUSE directory tree, for servers that wish to use it with the service loop.
package fs
import (
"os"
pathpkg "path"
"strings"
"golang.org/x/net/context"
)
import (
"github.com/cubefs/cubefs/depends/bazil.org/fuse"
)
// A Tree implements a basic read-only directory tree for FUSE.
// The Nodes contained in it may still be writable.
type Tree struct {
tree
}
func (t *Tree) Root() (Node, error) {
return &t.tree, nil
}
// Add adds the path to the tree, resolving to the given node.
// If path or a prefix of path has already been added to the tree,
// Add panics.
//
// Add is only safe to call before starting to serve requests.
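//
// A small sketch of building and serving a tree; readmeNode and conn are
// hypothetical:
//
//	var t Tree
//	t.Add("docs/readme", readmeNode{})
//	if err := Serve(conn, &t, nil); err != nil {
//		log.Fatal(err)
//	}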
func (t *Tree) Add(path string, node Node) {
path = pathpkg.Clean("/" + path)[1:]
elems := strings.Split(path, "/")
dir := Node(&t.tree)
for i, elem := range elems {
dt, ok := dir.(*tree)
if !ok {
panic("fuse: Tree.Add for " + strings.Join(elems[:i], "/") + " and " + path)
}
n := dt.lookup(elem)
if n != nil {
if i+1 == len(elems) {
panic("fuse: Tree.Add for " + path + " conflicts with " + elem)
}
dir = n
} else {
if i+1 == len(elems) {
dt.add(elem, node)
} else {
dir = &tree{}
dt.add(elem, dir)
}
}
}
}
type treeDir struct {
name string
node Node
}
type tree struct {
dir []treeDir
}
func (t *tree) lookup(name string) Node {
for _, d := range t.dir {
if d.name == name {
return d.node
}
}
return nil
}
func (t *tree) add(name string, n Node) {
t.dir = append(t.dir, treeDir{name, n})
}
func (t *tree) Attr(ctx context.Context, a *fuse.Attr) error {
a.Mode = os.ModeDir | 0555
return nil
}
func (t *tree) Lookup(ctx context.Context, name string) (Node, error) {
n := t.lookup(name)
if n != nil {
return n, nil
}
return nil, fuse.ENOENT
}
func (t *tree) ReadDirAll(ctx context.Context) ([]fuse.Dirent, error) {
var out []fuse.Dirent
for _, d := range t.dir {
out = append(out, fuse.Dirent{Name: d.name})
}
return out, nil
}
// See the file LICENSE for copyright and licensing information.
// Adapted from Plan 9 from User Space's src/cmd/9pfuse/fuse.c,
// which carries this notice:
//
// The files in this directory are subject to the following license.
//
// The author of this software is Russ Cox.
//
// Copyright (c) 2006 Russ Cox
//
// Permission to use, copy, modify, and distribute this software for any
// purpose without fee is hereby granted, provided that this entire notice
// is included in all copies of any software which is or includes a copy
// or modification of this software and in all copies of the supporting
// documentation for such software.
//
// THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
// WARRANTY. IN PARTICULAR, THE AUTHOR MAKES NO REPRESENTATION OR WARRANTY
// OF ANY KIND CONCERNING THE MERCHANTABILITY OF THIS SOFTWARE OR ITS
// FITNESS FOR ANY PARTICULAR PURPOSE.
// Package fuse enables writing FUSE file systems on Linux, OS X, and FreeBSD.
//
// On OS X, it requires OSXFUSE (http://osxfuse.github.com/).
//
// There are two approaches to writing a FUSE file system. The first is to speak
// the low-level message protocol, reading from a Conn using ReadRequest and
// writing using the various Respond methods. This approach is closest to
// the actual interaction with the kernel and can be the simplest one in contexts
// such as protocol translators.
//
// Servers of synthesized file systems tend to share common
// bookkeeping abstracted away by the second approach, which is to
// call fs.Serve to serve the FUSE protocol using an implementation of
// the service methods in the interfaces FS* (file system), Node* (file
// or directory), and Handle* (opened file or directory).
// There are a daunting number of such methods that can be written,
// but few are required.
// The specific methods are described in the documentation for those interfaces.
//
// The hellofs subdirectory contains a simple illustration of the fs.Serve approach.
//
// Service Methods
//
// The required and optional methods for the FS, Node, and Handle interfaces
// have the general form
//
// Op(ctx context.Context, req *OpRequest, resp *OpResponse) error
//
// where Op is the name of a FUSE operation. Op reads request
// parameters from req and writes results to resp. An operation whose
// only result is the error result omits the resp parameter.
//
// Multiple goroutines may call service methods simultaneously; the
// methods being called are responsible for appropriate
// synchronization.
//
// The operation must not hold on to the request or response,
// including any []byte fields such as WriteRequest.Data or
// SetxattrRequest.Xattr.
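//
// As an illustrative sketch (helloFile is a hypothetical type), a
// read-only Getattr method has the form:
//
//	func (f helloFile) Getattr(ctx context.Context, req *fuse.GetattrRequest, resp *fuse.GetattrResponse) error {
//		resp.Attr.Mode = 0444
//		resp.Attr.Size = uint64(len("hello"))
//		return nil
//	}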
//
// Errors
//
// Operations can return errors. The FUSE interface can only
// communicate POSIX errno error numbers to file system clients; the
// error message itself is not visible to them. The returned error
// can implement ErrorNumber to control the errno returned. Without
// ErrorNumber, a generic errno (EIO) is returned.
//
// Error messages will be visible in the debug log as part of the
// response.
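//
// A minimal sketch of a custom error that maps to a specific errno; the
// type name is hypothetical:
//
//	type quotaExceededError struct{}
//
//	func (quotaExceededError) Error() string { return "quota exceeded" }
//
//	func (quotaExceededError) Errno() fuse.Errno { return fuse.Errno(syscall.EDQUOT) }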
//
// Interrupted Operations
//
// In some file systems, some operations
// may take an undetermined amount of time. For example, a Read waiting for
// a network message or a matching Write might wait indefinitely. If the request
// is cancelled and no longer needed, the context will be cancelled.
// Blocking operations should select on a receive from ctx.Done() and attempt to
// abort the operation early if the receive succeeds (meaning the channel is closed).
// To indicate that the operation failed because it was aborted, return fuse.EINTR.
//
// If an operation does not block for an indefinite amount of time, supporting
// cancellation is not necessary.
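//
// A hedged sketch of a cancellation-aware blocking Read; the handle type
// and its incoming channel are hypothetical:
//
//	func (h waitHandle) Read(ctx context.Context, req *fuse.ReadRequest, resp *fuse.ReadResponse) error {
//		select {
//		case data := <-h.incoming:
//			resp.Data = data
//			return nil
//		case <-ctx.Done():
//			return fuse.EINTR
//		}
//	}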
//
// Authentication
//
// All request types embed a Header, meaning that the method can
// inspect req.Pid, req.Uid, and req.Gid as necessary to implement
// permission checking. The kernel FUSE layer normally prevents other
// users from accessing the FUSE file system (to change this, see
// AllowOther, AllowRoot), but does not enforce access modes (to
// change this, see DefaultPermissions).
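//
// For example, a service method might restrict an operation to a single
// owner; ownerUID is a hypothetical value captured at mount time:
//
//	if req.Uid != ownerUID {
//		return fuse.EPERM
//	}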
//
// Mount Options
//
// Behavior and metadata of the mounted file system can be changed by
// passing MountOption values to Mount.
//
package fuse // import "github.com/cubefs/cubefs/depends/bazil.org/fuse"
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"io"
"os"
"sync"
"syscall"
"time"
"unsafe"
)
// A Conn represents a connection to a mounted FUSE file system.
type Conn struct {
// Ready is closed when the mount is complete or has failed.
Ready <-chan struct{}
// MountError stores any error from the mount process. Only valid
// after Ready is closed.
MountError error
// File handle for kernel communication. Only safe to access if
// rio or wio is held.
dev *os.File
wio sync.RWMutex
rio sync.RWMutex
// Protocol version negotiated with InitRequest/InitResponse.
proto Protocol
}
func (c *Conn) GetFuseDevFile() *os.File {
return c.dev
}
func (c *Conn) SetFuseDevFile(fud *os.File) {
c.dev = fud
}
// MountpointDoesNotExistError is an error returned when the
// mountpoint does not exist.
type MountpointDoesNotExistError struct {
Path string
}
var _ error = (*MountpointDoesNotExistError)(nil)
func (e *MountpointDoesNotExistError) Error() string {
return fmt.Sprintf("mountpoint does not exist: %v", e.Path)
}
// Mount mounts a new FUSE connection on the named directory
// and returns a connection for reading and writing FUSE messages.
//
// After a successful return, caller must call Close to free
// resources.
//
// Even on successful return, the new mount is not guaranteed to be
// visible until after Conn.Ready is closed. See Conn.MountError for
// possible errors. Incoming requests on Conn must be served to make
// progress.
func Mount(dir string, needRestoreFuse bool, options ...MountOption) (*Conn, error) {
conf := mountConfig{
options: make(map[string]string),
}
for _, option := range options {
if err := option(&conf); err != nil {
return nil, err
}
}
ready := make(chan struct{}, 1)
c := &Conn{
Ready: ready,
}
if !needRestoreFuse {
f, err := mount(dir, &conf, ready, &c.MountError)
if err != nil {
return nil, err
}
c.dev = f
if err := initMount(c, &conf); err != nil {
c.Close()
if err == ErrClosedWithoutInit {
// see if we can provide a better error
<-c.Ready
if err := c.MountError; err != nil {
return nil, err
}
}
return nil, err
}
} else {
close(ready)
// FIXME: save protocol version when saving context?
c.proto = Protocol{protoVersionMaxMajor, protoVersionMaxMinor}
}
InitReadBlockPool()
return c, nil
}
type OldVersionError struct {
Kernel Protocol
LibraryMin Protocol
}
func (e *OldVersionError) Error() string {
return fmt.Sprintf("kernel FUSE version is too old: %v < %v", e.Kernel, e.LibraryMin)
}
var (
ErrClosedWithoutInit = errors.New("fuse connection closed without init")
)
func initMount(c *Conn, conf *mountConfig) error {
req, err := c.ReadRequest()
if err != nil {
if err == io.EOF {
return ErrClosedWithoutInit
}
return err
}
r, ok := req.(*InitRequest)
if !ok {
return fmt.Errorf("missing init, got: %T", req)
}
min := Protocol{protoVersionMinMajor, protoVersionMinMinor}
if r.Kernel.LT(min) {
req.RespondError(Errno(syscall.EPROTO))
c.Close()
return &OldVersionError{
Kernel: r.Kernel,
LibraryMin: min,
}
}
proto := Protocol{protoVersionMaxMajor, protoVersionMaxMinor}
if r.Kernel.LT(proto) {
// Kernel doesn't support the latest version we have.
proto = r.Kernel
}
c.proto = proto
s := &InitResponse{
Library: proto,
MaxReadahead: conf.maxReadahead,
MaxWrite: maxWrite,
Flags: InitBigWrites | conf.initFlags,
}
r.Respond(s)
return nil
}
// A Request represents a single FUSE request received from the kernel.
// Use a type switch to determine the specific kind.
// A request of unrecognized type will have concrete type *Header.
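//
// A hedged sketch of the low-level loop body described in the package
// documentation; c is a hypothetical *Conn:
//
//	req, err := c.ReadRequest()
//	if err != nil {
//		return err // io.EOF once the connection is closed
//	}
//	switch r := req.(type) {
//	case *fuse.StatfsRequest:
//		r.Respond(&fuse.StatfsResponse{})
//	default:
//		r.RespondError(fuse.ENOSYS)
//	}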
type Request interface {
// Hdr returns the Header associated with this request.
Hdr() *Header
// RespondError responds to the request with the given error.
RespondError(error)
String() string
}
// A RequestID identifies an active FUSE request.
type RequestID uint64
func (r RequestID) String() string {
return fmt.Sprintf("%#x", uint64(r))
}
// A NodeID is a number identifying a directory or file.
// It must be unique among IDs returned in LookupResponses
// that have not yet been forgotten by ForgetRequests.
type NodeID uint64
func (n NodeID) String() string {
return fmt.Sprintf("%#x", uint64(n))
}
// A HandleID is a number identifying an open directory or file.
// It only needs to be unique while the directory or file is open.
type HandleID uint64
func (h HandleID) String() string {
return fmt.Sprintf("%#x", uint64(h))
}
// The RootID identifies the root directory of a FUSE file system.
const RootID NodeID = rootID
// A Header describes the basic information sent in every request.
type Header struct {
Conn *Conn `json:"-"` // connection this request was received on
ID RequestID // unique ID for request
Node NodeID // file or directory the request is about
Uid uint32 // user ID of process making request
Gid uint32 // group ID of process making request
Pid uint32 // process ID of process making request
// for returning to reqPool
msg *message
}
func (h *Header) String() string {
return fmt.Sprintf("ID=%v Node=%v Uid=%d Gid=%d Pid=%d", h.ID, h.Node, h.Uid, h.Gid, h.Pid)
}
func (h *Header) Hdr() *Header {
return h
}
func (h *Header) noResponse() {
putMessage(h.msg)
}
func (h *Header) respond(msg []byte) {
out := (*outHeader)(unsafe.Pointer(&msg[0]))
out.Unique = uint64(h.ID)
h.Conn.respond(msg)
putMessage(h.msg)
}
func (h *Header) respondDoNotReuseMsg(msg []byte) {
out := (*outHeader)(unsafe.Pointer(&msg[0]))
out.Unique = uint64(h.ID)
h.Conn.respond(msg)
}
// An ErrorNumber is an error with a specific error number.
//
// Operations may return an error value that implements ErrorNumber to
// control what specific error number (errno) to return.
type ErrorNumber interface {
// Errno returns the error number (errno) for this error.
Errno() Errno
}
const (
// ENOSYS indicates that the call is not supported.
ENOSYS = Errno(syscall.ENOSYS)
// ESTALE is used by Serve to respond to violations of the FUSE protocol.
ESTALE = Errno(syscall.ESTALE)
ENOENT = Errno(syscall.ENOENT)
EIO = Errno(syscall.EIO)
EPERM = Errno(syscall.EPERM)
// EINTR indicates request was interrupted by an InterruptRequest.
// See also fs.Intr.
EINTR = Errno(syscall.EINTR)
ERANGE = Errno(syscall.ERANGE)
ENOTSUP = Errno(syscall.ENOTSUP)
EEXIST = Errno(syscall.EEXIST)
ETIME = Errno(syscall.ETIME)
ETIMEDOUT = Errno(syscall.ETIMEDOUT)
)
// DefaultErrno is the errno used when error returned does not
// implement ErrorNumber.
const DefaultErrno = EIO
var errnoNames = map[Errno]string{
ENOSYS: "ENOSYS",
ESTALE: "ESTALE",
ENOENT: "ENOENT",
EIO: "EIO",
EPERM: "EPERM",
EINTR: "EINTR",
EEXIST: "EEXIST",
ETIME: "ETIME",
ETIMEDOUT: "ETIMEDOUT",
}
// Errno implements Error and ErrorNumber using a syscall.Errno.
type Errno syscall.Errno
var _ = ErrorNumber(Errno(0))
var _ = error(Errno(0))
func (e Errno) Errno() Errno {
return e
}
func (e Errno) String() string {
return syscall.Errno(e).Error()
}
func (e Errno) Error() string {
return syscall.Errno(e).Error()
}
// ErrnoName returns the short non-numeric identifier for this errno.
// For example, "EIO".
func (e Errno) ErrnoName() string {
s := errnoNames[e]
if s == "" {
s = fmt.Sprint(e.Errno())
}
return s
}
func (e Errno) MarshalText() ([]byte, error) {
s := e.ErrnoName()
return []byte(s), nil
}
func (h *Header) RespondError(err error) {
errno := DefaultErrno
if ferr, ok := err.(ErrorNumber); ok {
errno = ferr.Errno()
}
// FUSE uses negative errors!
// TODO: File bug report against OSXFUSE: positive error causes kernel panic.
buf := newBuffer(0)
hOut := (*outHeader)(unsafe.Pointer(&buf[0]))
hOut.Error = -int32(errno)
h.respondDoNotReuseMsg(buf)
}
// All requests read from the kernel, without data, are shorter than
// this.
var maxRequestSize = syscall.Getpagesize()
var bufSize = maxRequestSize + maxWrite
// reqPool is a pool of messages.
//
// Lifetime of a logical message is from getMessage to putMessage.
// getMessage is called by ReadRequest. putMessage is called by
// Conn.ReadRequest, Request.Respond, or Request.RespondError.
//
// Messages in the pool are guaranteed to have conn and off zeroed,
// buf allocated and len==bufSize, and hdr set.
var reqPool = sync.Pool{
New: allocMessage,
}
func allocMessage() interface{} {
m := &message{buf: make([]byte, bufSize)}
m.hdr = (*inHeader)(unsafe.Pointer(&m.buf[0]))
return m
}
func getMessage(c *Conn) *message {
m := reqPool.Get().(*message)
m.conn = c
return m
}
func putMessage(m *message) {
m.buf = m.buf[:bufSize]
m.conn = nil
m.off = 0
reqPool.Put(m)
}
// a message represents the bytes of a single FUSE message
type message struct {
conn *Conn
buf []byte // all bytes
hdr *inHeader // header
off int // offset for reading additional fields
}
func (m *message) len() uintptr {
return uintptr(len(m.buf) - m.off)
}
func (m *message) data() unsafe.Pointer {
var p unsafe.Pointer
if m.off < len(m.buf) {
p = unsafe.Pointer(&m.buf[m.off])
}
return p
}
func (m *message) bytes() []byte {
return m.buf[m.off:]
}
func (m *message) Header() Header {
h := m.hdr
return Header{
Conn: m.conn,
ID: RequestID(h.Unique),
Node: NodeID(h.Nodeid),
Uid: h.Uid,
Gid: h.Gid,
Pid: h.Pid,
msg: m,
}
}
// fileMode returns a Go os.FileMode from a Unix mode.
func fileMode(unixMode uint32) os.FileMode {
mode := os.FileMode(unixMode & 0777)
switch unixMode & syscall.S_IFMT {
case syscall.S_IFREG:
// nothing
case syscall.S_IFDIR:
mode |= os.ModeDir
case syscall.S_IFCHR:
mode |= os.ModeCharDevice | os.ModeDevice
case syscall.S_IFBLK:
mode |= os.ModeDevice
case syscall.S_IFIFO:
mode |= os.ModeNamedPipe
case syscall.S_IFLNK:
mode |= os.ModeSymlink
case syscall.S_IFSOCK:
mode |= os.ModeSocket
default:
// no idea
mode |= os.ModeDevice
}
if unixMode&syscall.S_ISUID != 0 {
mode |= os.ModeSetuid
}
if unixMode&syscall.S_ISGID != 0 {
mode |= os.ModeSetgid
}
return mode
}
type noOpcode struct {
Opcode uint32
}
func (m noOpcode) String() string {
return fmt.Sprintf("No opcode %v", m.Opcode)
}
type malformedMessage struct {
}
func (malformedMessage) String() string {
return "malformed message"
}
// Close closes the FUSE connection.
func (c *Conn) Close() error {
c.wio.Lock()
defer c.wio.Unlock()
c.rio.Lock()
defer c.rio.Unlock()
return c.dev.Close()
}
// caller must hold wio or rio
func (c *Conn) fd() int {
return int(c.dev.Fd())
}
func (c *Conn) Protocol() Protocol {
return c.proto
}
// ReadRequest returns the next FUSE request from the kernel.
//
// Caller must call either Request.Respond or Request.RespondError in
// a reasonable time. Caller must not retain Request after that call.
func (c *Conn) ReadRequest() (Request, error) {
m := getMessage(c)
loop:
c.rio.RLock()
n, err := syscall.Read(c.fd(), m.buf)
c.rio.RUnlock()
if err == syscall.EINTR {
// OSXFUSE sends EINTR to userspace when a request interrupt
// completed before it got sent to userspace?
goto loop
}
if err != nil && err != syscall.ENODEV {
putMessage(m)
return nil, err
}
if n <= 0 {
putMessage(m)
return nil, io.EOF
}
m.buf = m.buf[:n]
if n < inHeaderSize {
putMessage(m)
return nil, errors.New("fuse: message too short")
}
// FreeBSD FUSE sends a short length in the header
// for FUSE_INIT even though the actual read length is correct.
if n == inHeaderSize+initInSize && m.hdr.Opcode == opInit && m.hdr.Len < uint32(n) {
m.hdr.Len = uint32(n)
}
// OSXFUSE sometimes sends the wrong m.hdr.Len in a FUSE_WRITE message.
if m.hdr.Len < uint32(n) && m.hdr.Len >= uint32(unsafe.Sizeof(writeIn{})) && m.hdr.Opcode == opWrite {
m.hdr.Len = uint32(n)
}
if m.hdr.Len != uint32(n) {
// prepare error message before returning m to pool
err := fmt.Errorf("fuse: read %d opcode %d but expected %d", n, m.hdr.Opcode, m.hdr.Len)
putMessage(m)
return nil, err
}
m.off = inHeaderSize
// Convert to data structures.
// Do not trust kernel to hand us well-formed data.
var req Request
switch m.hdr.Opcode {
default:
Debug(noOpcode{Opcode: m.hdr.Opcode})
goto unrecognized
case opLookup:
buf := m.bytes()
n := len(buf)
if n == 0 || buf[n-1] != '\x00' {
goto corrupt
}
req = &LookupRequest{
Header: m.Header(),
Name: string(buf[:n-1]),
}
case opForget:
in := (*forgetIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
req = &ForgetRequest{
Header: m.Header(),
N: in.Nlookup,
}
case opGetattr:
switch {
case c.proto.LT(Protocol{7, 9}):
req = &GetattrRequest{
Header: m.Header(),
}
default:
in := (*getattrIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
req = &GetattrRequest{
Header: m.Header(),
Flags: GetattrFlags(in.GetattrFlags),
Handle: HandleID(in.Fh),
}
}
case opSetattr:
in := (*setattrIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
req = &SetattrRequest{
Header: m.Header(),
Valid: SetattrValid(in.Valid),
Handle: HandleID(in.Fh),
Size: in.Size,
Atime: time.Unix(int64(in.Atime), int64(in.AtimeNsec)),
Mtime: time.Unix(int64(in.Mtime), int64(in.MtimeNsec)),
Mode: fileMode(in.Mode),
Uid: in.Uid,
Gid: in.Gid,
Bkuptime: in.BkupTime(),
Chgtime: in.Chgtime(),
Flags: in.Flags(),
}
case opReadlink:
if len(m.bytes()) > 0 {
goto corrupt
}
req = &ReadlinkRequest{
Header: m.Header(),
}
case opSymlink:
// m.bytes() is "newName\0target\0"
names := m.bytes()
if len(names) == 0 || names[len(names)-1] != 0 {
goto corrupt
}
i := bytes.IndexByte(names, '\x00')
if i < 0 {
goto corrupt
}
newName, target := names[0:i], names[i+1:len(names)-1]
req = &SymlinkRequest{
Header: m.Header(),
NewName: string(newName),
Target: string(target),
}
case opLink:
in := (*linkIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
newName := m.bytes()[unsafe.Sizeof(*in):]
if len(newName) < 2 || newName[len(newName)-1] != 0 {
goto corrupt
}
newName = newName[:len(newName)-1]
req = &LinkRequest{
Header: m.Header(),
OldNode: NodeID(in.Oldnodeid),
NewName: string(newName),
}
case opMknod:
size := mknodInSize(c.proto)
if m.len() < size {
goto corrupt
}
in := (*mknodIn)(m.data())
name := m.bytes()[size:]
if len(name) < 2 || name[len(name)-1] != '\x00' {
goto corrupt
}
name = name[:len(name)-1]
r := &MknodRequest{
Header: m.Header(),
Mode: fileMode(in.Mode),
Rdev: in.Rdev,
Name: string(name),
}
if c.proto.GE(Protocol{7, 12}) {
r.Umask = fileMode(in.Umask) & os.ModePerm
}
req = r
case opMkdir:
size := mkdirInSize(c.proto)
if m.len() < size {
goto corrupt
}
in := (*mkdirIn)(m.data())
name := m.bytes()[size:]
i := bytes.IndexByte(name, '\x00')
if i < 0 {
goto corrupt
}
r := &MkdirRequest{
Header: m.Header(),
Name: string(name[:i]),
// observed on Linux: mkdirIn.Mode & syscall.S_IFMT == 0,
// and this causes fileMode to go into its "no idea"
// code branch; enforce type to directory
Mode: fileMode((in.Mode &^ syscall.S_IFMT) | syscall.S_IFDIR),
}
if c.proto.GE(Protocol{7, 12}) {
r.Umask = fileMode(in.Umask) & os.ModePerm
}
req = r
case opUnlink, opRmdir:
buf := m.bytes()
n := len(buf)
if n == 0 || buf[n-1] != '\x00' {
goto corrupt
}
req = &RemoveRequest{
Header: m.Header(),
Name: string(buf[:n-1]),
Dir: m.hdr.Opcode == opRmdir,
}
case opRename:
in := (*renameIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
newDirNodeID := NodeID(in.Newdir)
oldNew := m.bytes()[unsafe.Sizeof(*in):]
// oldNew should be "old\x00new\x00"
if len(oldNew) < 4 {
goto corrupt
}
if oldNew[len(oldNew)-1] != '\x00' {
goto corrupt
}
i := bytes.IndexByte(oldNew, '\x00')
if i < 0 {
goto corrupt
}
oldName, newName := string(oldNew[:i]), string(oldNew[i+1:len(oldNew)-1])
req = &RenameRequest{
Header: m.Header(),
NewDir: newDirNodeID,
OldName: oldName,
NewName: newName,
}
case opOpendir, opOpen:
in := (*openIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
req = &OpenRequest{
Header: m.Header(),
Dir: m.hdr.Opcode == opOpendir,
Flags: openFlags(in.Flags),
}
case opRead, opReaddir:
in := (*readIn)(m.data())
if m.len() < readInSize(c.proto) {
goto corrupt
}
r := &ReadRequest{
Header: m.Header(),
Dir: m.hdr.Opcode == opReaddir,
Handle: HandleID(in.Fh),
Offset: int64(in.Offset),
Size: int(in.Size),
}
if c.proto.GE(Protocol{7, 9}) {
r.Flags = ReadFlags(in.ReadFlags)
r.LockOwner = in.LockOwner
r.FileFlags = openFlags(in.Flags)
}
req = r
case opWrite:
in := (*writeIn)(m.data())
if m.len() < writeInSize(c.proto) {
goto corrupt
}
r := &WriteRequest{
Header: m.Header(),
Handle: HandleID(in.Fh),
Offset: int64(in.Offset),
Flags: WriteFlags(in.WriteFlags),
}
if c.proto.GE(Protocol{7, 9}) {
r.LockOwner = in.LockOwner
r.FileFlags = openFlags(in.Flags)
}
buf := m.bytes()[writeInSize(c.proto):]
if uint32(len(buf)) < in.Size {
goto corrupt
}
r.Data = buf
req = r
case opStatfs:
req = &StatfsRequest{
Header: m.Header(),
}
case opRelease, opReleasedir:
in := (*releaseIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
req = &ReleaseRequest{
Header: m.Header(),
Dir: m.hdr.Opcode == opReleasedir,
Handle: HandleID(in.Fh),
Flags: openFlags(in.Flags),
ReleaseFlags: ReleaseFlags(in.ReleaseFlags),
LockOwner: in.LockOwner,
}
case opFsync, opFsyncdir:
in := (*fsyncIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
req = &FsyncRequest{
Dir: m.hdr.Opcode == opFsyncdir,
Header: m.Header(),
Handle: HandleID(in.Fh),
Flags: in.FsyncFlags,
}
case opSetxattr:
in := (*setxattrIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
m.off += int(unsafe.Sizeof(*in))
name := m.bytes()
i := bytes.IndexByte(name, '\x00')
if i < 0 {
goto corrupt
}
xattr := name[i+1:]
if uint32(len(xattr)) < in.Size {
goto corrupt
}
xattr = xattr[:in.Size]
req = &SetxattrRequest{
Header: m.Header(),
Flags: in.Flags,
Position: in.position(),
Name: string(name[:i]),
Xattr: xattr,
}
case opGetxattr:
in := (*getxattrIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
name := m.bytes()[unsafe.Sizeof(*in):]
i := bytes.IndexByte(name, '\x00')
if i < 0 {
goto corrupt
}
req = &GetxattrRequest{
Header: m.Header(),
Name: string(name[:i]),
Size: in.Size,
Position: in.position(),
}
case opListxattr:
in := (*getxattrIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
req = &ListxattrRequest{
Header: m.Header(),
Size: in.Size,
Position: in.position(),
}
case opRemovexattr:
buf := m.bytes()
n := len(buf)
if n == 0 || buf[n-1] != '\x00' {
goto corrupt
}
req = &RemovexattrRequest{
Header: m.Header(),
Name: string(buf[:n-1]),
}
case opFlush:
in := (*flushIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
req = &FlushRequest{
Header: m.Header(),
Handle: HandleID(in.Fh),
Flags: in.FlushFlags,
LockOwner: in.LockOwner,
}
case opInit:
in := (*initIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
req = &InitRequest{
Header: m.Header(),
Kernel: Protocol{in.Major, in.Minor},
MaxReadahead: in.MaxReadahead,
Flags: InitFlags(in.Flags),
}
case opGetlk:
panic("opGetlk")
case opSetlk:
panic("opSetlk")
case opSetlkw:
panic("opSetlkw")
case opAccess:
in := (*accessIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
req = &AccessRequest{
Header: m.Header(),
Mask: in.Mask,
}
case opCreate:
size := createInSize(c.proto)
if m.len() < size {
goto corrupt
}
in := (*createIn)(m.data())
name := m.bytes()[size:]
i := bytes.IndexByte(name, '\x00')
if i < 0 {
goto corrupt
}
r := &CreateRequest{
Header: m.Header(),
Flags: openFlags(in.Flags),
Mode: fileMode(in.Mode),
Name: string(name[:i]),
}
if c.proto.GE(Protocol{7, 12}) {
r.Umask = fileMode(in.Umask) & os.ModePerm
}
req = r
case opInterrupt:
in := (*interruptIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
req = &InterruptRequest{
Header: m.Header(),
IntrID: RequestID(in.Unique),
}
case opBmap:
panic("opBmap")
case opDestroy:
req = &DestroyRequest{
Header: m.Header(),
}
// OS X
case opSetvolname:
panic("opSetvolname")
case opGetxtimes:
panic("opGetxtimes")
case opExchange:
in := (*exchangeIn)(m.data())
if m.len() < unsafe.Sizeof(*in) {
goto corrupt
}
oldDirNodeID := NodeID(in.Olddir)
newDirNodeID := NodeID(in.Newdir)
oldNew := m.bytes()[unsafe.Sizeof(*in):]
// oldNew should be "oldname\x00newname\x00"
if len(oldNew) < 4 {
goto corrupt
}
if oldNew[len(oldNew)-1] != '\x00' {
goto corrupt
}
i := bytes.IndexByte(oldNew, '\x00')
if i < 0 {
goto corrupt
}
oldName, newName := string(oldNew[:i]), string(oldNew[i+1:len(oldNew)-1])
req = &ExchangeDataRequest{
Header: m.Header(),
OldDir: oldDirNodeID,
NewDir: newDirNodeID,
OldName: oldName,
NewName: newName,
// TODO options
}
}
return req, nil
corrupt:
Debug(malformedMessage{})
putMessage(m)
return nil, fmt.Errorf("fuse: malformed message")
unrecognized:
// Unrecognized message.
// Assume higher-level code will send a "no idea what you mean" error.
h := m.Header()
return &h, nil
}
type bugShortKernelWrite struct {
Written int64
Length int64
Error string
Stack string
}
func (b bugShortKernelWrite) String() string {
return fmt.Sprintf("short kernel write: written=%d/%d error=%q stack=\n%s", b.Written, b.Length, b.Error, b.Stack)
}
type bugKernelWriteError struct {
Error string
Stack string
}
func (b bugKernelWriteError) String() string {
return fmt.Sprintf("kernel write error: error=%q stack=\n%s", b.Error, b.Stack)
}
// safe to call even with nil error
func errorString(err error) string {
if err == nil {
return ""
}
return err.Error()
}
func (c *Conn) writeToKernel(msg []byte) error {
out := (*outHeader)(unsafe.Pointer(&msg[0]))
out.Len = uint32(len(msg))
c.wio.RLock()
defer c.wio.RUnlock()
nn, err := syscall.Write(c.fd(), msg)
if err == nil && nn != len(msg) {
Debug(bugShortKernelWrite{
Written: int64(nn),
Length: int64(len(msg)),
Error: errorString(err),
Stack: stack(),
})
}
return err
}
func (c *Conn) respond(msg []byte) {
if err := c.writeToKernel(msg); err != nil {
Debug(bugKernelWriteError{
Error: errorString(err),
Stack: stack(),
})
}
}
type notCachedError struct{}
func (notCachedError) Error() string {
return "node not cached"
}
var _ ErrorNumber = notCachedError{}
func (notCachedError) Errno() Errno {
// Behave just as if the original syscall.ENOENT had been passed
// straight through.
return ENOENT
}
var (
ErrNotCached = notCachedError{}
)
// sendInvalidate sends an invalidate notification to kernel.
//
// A returned ENOENT is translated to a friendlier error.
func (c *Conn) sendInvalidate(msg []byte) error {
switch err := c.writeToKernel(msg); err {
case syscall.ENOENT:
return ErrNotCached
default:
return err
}
}
// InvalidateNode invalidates the kernel cache of the attributes and a
// range of the data of a node.
//
// Giving offset 0 and size -1 means all data. To invalidate just the
// attributes, give offset 0 and size 0.
//
// Returns ErrNotCached if the kernel is not currently caching the
// node.
func (c *Conn) InvalidateNode(nodeID NodeID, off int64, size int64) error {
buf := newBuffer(unsafe.Sizeof(notifyInvalInodeOut{}))
h := (*outHeader)(unsafe.Pointer(&buf[0]))
// h.Unique is 0
h.Error = notifyCodeInvalInode
out := (*notifyInvalInodeOut)(buf.alloc(unsafe.Sizeof(notifyInvalInodeOut{})))
out.Ino = uint64(nodeID)
out.Off = off
out.Len = size
return c.sendInvalidate(buf)
}
// InvalidateEntry invalidates the kernel cache of the directory entry
// identified by parent directory node ID and entry basename.
//
// Kernel may or may not cache directory listings. To invalidate
// those, use InvalidateNode to invalidate all of the data for a
// directory. (As of 2015-06, Linux FUSE does not cache directory
// listings.)
//
// Returns ErrNotCached if the kernel is not currently caching the
// node.
func (c *Conn) InvalidateEntry(parent NodeID, name string) error {
const maxUint32 = ^uint32(0)
if uint64(len(name)) > uint64(maxUint32) {
// very unlikely, but we don't want to silently truncate
return syscall.ENAMETOOLONG
}
buf := newBuffer(unsafe.Sizeof(notifyInvalEntryOut{}) + uintptr(len(name)) + 1)
h := (*outHeader)(unsafe.Pointer(&buf[0]))
// h.Unique is 0
h.Error = notifyCodeInvalEntry
out := (*notifyInvalEntryOut)(buf.alloc(unsafe.Sizeof(notifyInvalEntryOut{})))
out.Parent = uint64(parent)
out.Namelen = uint32(len(name))
buf = append(buf, name...)
buf = append(buf, '\x00')
return c.sendInvalidate(buf)
}
// An InitRequest is the first request sent on a FUSE file system.
type InitRequest struct {
Header `json:"-"`
Kernel Protocol
// Maximum readahead in bytes that the kernel plans to use.
MaxReadahead uint32
Flags InitFlags
}
var _ = Request(&InitRequest{})
func (r *InitRequest) String() string {
return fmt.Sprintf("Init [%v] %v ra=%d fl=%v", &r.Header, r.Kernel, r.MaxReadahead, r.Flags)
}
// An InitResponse is the response to an InitRequest.
type InitResponse struct {
Library Protocol
// Maximum readahead in bytes that the kernel can use. Ignored if
// greater than InitRequest.MaxReadahead.
MaxReadahead uint32
Flags InitFlags
// Maximum size of a single write operation.
// Linux enforces a minimum of 4 KiB.
MaxWrite uint32
}
func (r *InitResponse) String() string {
return fmt.Sprintf("Init %v ra=%d fl=%v w=%d", r.Library, r.MaxReadahead, r.Flags, r.MaxWrite)
}
// Respond replies to the request with the given response.
func (r *InitRequest) Respond(resp *InitResponse) {
buf := newBuffer(unsafe.Sizeof(initOut{}))
out := (*initOut)(buf.alloc(unsafe.Sizeof(initOut{})))
out.Major = resp.Library.Major
out.Minor = resp.Library.Minor
out.MaxReadahead = resp.MaxReadahead
out.Flags = uint32(resp.Flags)
out.MaxWrite = resp.MaxWrite
// MaxWrite larger than our receive buffer would just lead to
// errors on large writes.
if out.MaxWrite > maxWrite {
out.MaxWrite = maxWrite
}
r.respond(buf)
}
// A StatfsRequest requests information about the mounted file system.
type StatfsRequest struct {
Header `json:"-"`
}
var _ = Request(&StatfsRequest{})
func (r *StatfsRequest) String() string {
return fmt.Sprintf("Statfs [%s]", &r.Header)
}
// Respond replies to the request with the given response.
func (r *StatfsRequest) Respond(resp *StatfsResponse) {
buf := newBuffer(unsafe.Sizeof(statfsOut{}))
out := (*statfsOut)(buf.alloc(unsafe.Sizeof(statfsOut{})))
out.St = kstatfs{
Blocks: resp.Blocks,
Bfree: resp.Bfree,
Bavail: resp.Bavail,
Files: resp.Files,
Ffree: resp.Ffree,
Bsize: resp.Bsize,
Namelen: resp.Namelen,
Frsize: resp.Frsize,
}
r.respond(buf)
}
// A StatfsResponse is the response to a StatfsRequest.
type StatfsResponse struct {
Blocks uint64 // Total data blocks in file system.
Bfree uint64 // Free blocks in file system.
Bavail uint64 // Free blocks in file system if you're not root.
Files uint64 // Total files in file system.
Ffree uint64 // Free files in file system.
Bsize uint32 // Block size
Namelen uint32 // Maximum file name length?
Frsize uint32 // Fragment size, smallest addressable data size in the file system.
}
func (r *StatfsResponse) String() string {
return fmt.Sprintf("Statfs blocks=%d/%d/%d files=%d/%d bsize=%d frsize=%d namelen=%d",
r.Bavail, r.Bfree, r.Blocks,
r.Ffree, r.Files,
r.Bsize,
r.Frsize,
r.Namelen,
)
}
// An AccessRequest asks whether the file can be accessed
// for the purpose specified by the mask.
type AccessRequest struct {
Header `json:"-"`
Mask uint32
}
var _ = Request(&AccessRequest{})
func (r *AccessRequest) String() string {
return fmt.Sprintf("Access [%s] mask=%#x", &r.Header, r.Mask)
}
// Respond replies to the request indicating that access is allowed.
// To deny access, use RespondError.
func (r *AccessRequest) Respond() {
buf := newBuffer(0)
r.respond(buf)
}
// An Attr is the metadata for a single file or directory.
type Attr struct {
Valid time.Duration // how long Attr can be cached
Inode uint64 // inode number
Size uint64 // size in bytes
Blocks uint64 // size in 512-byte units
Atime time.Time // time of last access
Mtime time.Time // time of last modification
Ctime time.Time // time of last inode change
Crtime time.Time // time of creation (OS X only)
Mode os.FileMode // file mode
Nlink uint32 // number of links (usually 1)
Uid uint32 // owner uid
Gid uint32 // group gid
Rdev uint32 // device numbers
Flags uint32 // chflags(2) flags (OS X only)
BlockSize uint32 // preferred blocksize for filesystem I/O
ParentIno uint64 // used only by CubeFS files
}
func (a Attr) String() string {
return fmt.Sprintf("valid=%v ino=%v size=%d mode=%v", a.Valid, a.Inode, a.Size, a.Mode)
}
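// Illustrative sketch (hypothetical values, not part of the original
// package): a typical regular-file Attr as a file system implementation
// might return it in a Getattr or Lookup response.
func exampleAttr() Attr {
	return Attr{
		Valid:     time.Minute, // let the kernel cache these attributes for a minute
		Inode:     42,
		Size:      4096,
		Blocks:    8, // in 512-byte units
		Mode:      0644,
		Nlink:     1,
		BlockSize: 4096,
	}
}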
func unix(t time.Time) (sec uint64, nsec uint32) {
nano := t.UnixNano()
sec = uint64(nano / 1e9)
nsec = uint32(nano % 1e9)
return
}
func (a *Attr) attr(out *attr, proto Protocol) {
out.Ino = a.Inode
out.Size = a.Size
out.Blocks = a.Blocks
out.Atime, out.AtimeNsec = unix(a.Atime)
out.Mtime, out.MtimeNsec = unix(a.Mtime)
out.Ctime, out.CtimeNsec = unix(a.Ctime)
out.SetCrtime(unix(a.Crtime))
out.Mode = uint32(a.Mode) & 0777
switch {
default:
out.Mode |= syscall.S_IFREG
case a.Mode&os.ModeDir != 0:
out.Mode |= syscall.S_IFDIR
case a.Mode&os.ModeDevice != 0:
if a.Mode&os.ModeCharDevice != 0 {
out.Mode |= syscall.S_IFCHR
} else {
out.Mode |= syscall.S_IFBLK
}
case a.Mode&os.ModeNamedPipe != 0:
out.Mode |= syscall.S_IFIFO
case a.Mode&os.ModeSymlink != 0:
out.Mode |= syscall.S_IFLNK
case a.Mode&os.ModeSocket != 0:
out.Mode |= syscall.S_IFSOCK
}
if a.Mode&os.ModeSetuid != 0 {
out.Mode |= syscall.S_ISUID
}
if a.Mode&os.ModeSetgid != 0 {
out.Mode |= syscall.S_ISGID
}
out.Nlink = a.Nlink
out.Uid = a.Uid
out.Gid = a.Gid
out.Rdev = a.Rdev
out.SetFlags(a.Flags)
if proto.GE(Protocol{7, 9}) {
out.Blksize = a.BlockSize
}
return
}
// A GetattrRequest asks for the metadata for the file denoted by r.Node.
type GetattrRequest struct {
Header `json:"-"`
Flags GetattrFlags
Handle HandleID
}
var _ = Request(&GetattrRequest{})
func (r *GetattrRequest) String() string {
return fmt.Sprintf("Getattr [%s] %v fl=%v", &r.Header, r.Handle, r.Flags)
}
// Respond replies to the request with the given response.
func (r *GetattrRequest) Respond(resp *GetattrResponse) {
size := attrOutSize(r.Header.Conn.proto)
buf := newBuffer(size)
out := (*attrOut)(buf.alloc(size))
out.AttrValid = uint64(resp.Attr.Valid / time.Second)
out.AttrValidNsec = uint32(resp.Attr.Valid % time.Second / time.Nanosecond)
resp.Attr.attr(&out.Attr, r.Header.Conn.proto)
r.respond(buf)
}
// A GetattrResponse is the response to a GetattrRequest.
type GetattrResponse struct {
Attr Attr // file attributes
}
func (r *GetattrResponse) String() string {
return fmt.Sprintf("Getattr %v", r.Attr)
}
// A GetxattrRequest asks for the extended attributes associated with r.Node.
type GetxattrRequest struct {
Header `json:"-"`
// Maximum size to return.
Size uint32
// Name of the attribute requested.
Name string
// Offset within extended attributes.
//
// Only valid for OS X, and then only with the resource fork
// attribute.
Position uint32
}
var _ = Request(&GetxattrRequest{})
func (r *GetxattrRequest) String() string {
return fmt.Sprintf("Getxattr [%s] %q %d @%d", &r.Header, r.Name, r.Size, r.Position)
}
// Respond replies to the request with the given response.
func (r *GetxattrRequest) Respond(resp *GetxattrResponse) {
if r.Size == 0 {
buf := newBuffer(unsafe.Sizeof(getxattrOut{}))
out := (*getxattrOut)(buf.alloc(unsafe.Sizeof(getxattrOut{})))
out.Size = uint32(len(resp.Xattr))
r.respond(buf)
} else {
buf := newBuffer(uintptr(len(resp.Xattr)))
buf = append(buf, resp.Xattr...)
r.respond(buf)
}
}
// A GetxattrResponse is the response to a GetxattrRequest.
type GetxattrResponse struct {
Xattr []byte
}
func (r *GetxattrResponse) String() string {
return fmt.Sprintf("Getxattr %x", r.Xattr)
}
// A ListxattrRequest asks to list the extended attributes associated with r.Node.
type ListxattrRequest struct {
Header `json:"-"`
Size uint32 // maximum size to return
Position uint32 // offset within attribute list
}
var _ = Request(&ListxattrRequest{})
func (r *ListxattrRequest) String() string {
return fmt.Sprintf("Listxattr [%s] %d @%d", &r.Header, r.Size, r.Position)
}
// Respond replies to the request with the given response.
func (r *ListxattrRequest) Respond(resp *ListxattrResponse) {
if r.Size == 0 {
buf := newBuffer(unsafe.Sizeof(getxattrOut{}))
out := (*getxattrOut)(buf.alloc(unsafe.Sizeof(getxattrOut{})))
out.Size = uint32(len(resp.Xattr))
r.respond(buf)
} else {
buf := newBuffer(uintptr(len(resp.Xattr)))
buf = append(buf, resp.Xattr...)
r.respond(buf)
}
}
// A ListxattrResponse is the response to a ListxattrRequest.
type ListxattrResponse struct {
Xattr []byte
}
func (r *ListxattrResponse) String() string {
return fmt.Sprintf("Listxattr %x", r.Xattr)
}
// Append adds an extended attribute name to the response.
func (r *ListxattrResponse) Append(names ...string) {
for _, name := range names {
r.Xattr = append(r.Xattr, name...)
r.Xattr = append(r.Xattr, '\x00')
}
}
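// Illustrative sketch (hypothetical attribute names, not part of the
// original package): Append encodes each name as a NUL-terminated string,
// which is the layout a listxattr reply carries.
func exampleListxattr(resp *ListxattrResponse) {
	resp.Append("user.mime_type", "security.selinux")
	// resp.Xattr now holds "user.mime_type\x00security.selinux\x00".
}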
// A RemovexattrRequest asks to remove an extended attribute associated with r.Node.
type RemovexattrRequest struct {
Header `json:"-"`
Name string // name of extended attribute
}
var _ = Request(&RemovexattrRequest{})
func (r *RemovexattrRequest) String() string {
return fmt.Sprintf("Removexattr [%s] %q", &r.Header, r.Name)
}
// Respond replies to the request, indicating that the attribute was removed.
func (r *RemovexattrRequest) Respond() {
buf := newBuffer(0)
r.respond(buf)
}
// A SetxattrRequest asks to set an extended attribute associated with a file.
type SetxattrRequest struct {
Header `json:"-"`
// Flags can make the request fail if the attribute does or does not
// already exist. Unfortunately, the constants are platform-specific
// and not exposed by Go 1.2. Look for XATTR_CREATE, XATTR_REPLACE.
//
// TODO improve this later
//
// TODO XATTR_CREATE and exist -> EEXIST
//
// TODO XATTR_REPLACE and not exist -> ENODATA
Flags uint32
// Offset within extended attributes.
//
// Only valid for OS X, and then only with the resource fork
// attribute.
Position uint32
Name string
Xattr []byte
}
var _ = Request(&SetxattrRequest{})
func trunc(b []byte, max int) ([]byte, string) {
if len(b) > max {
return b[:max], "..."
}
return b, ""
}
func (r *SetxattrRequest) String() string {
xattr, tail := trunc(r.Xattr, 16)
return fmt.Sprintf("Setxattr [%s] %q %x%s fl=%v @%#x", &r.Header, r.Name, xattr, tail, r.Flags, r.Position)
}
// Respond replies to the request, indicating that the extended attribute was set.
func (r *SetxattrRequest) Respond() {
buf := newBuffer(0)
r.respond(buf)
}
// A LookupRequest asks to look up the given name in the directory named by r.Node.
type LookupRequest struct {
Header `json:"-"`
Name string
}
var _ = Request(&LookupRequest{})
func (r *LookupRequest) String() string {
return fmt.Sprintf("Lookup [%s] %q", &r.Header, r.Name)
}
// Respond replies to the request with the given response.
func (r *LookupRequest) Respond(resp *LookupResponse) {
size := entryOutSize(r.Header.Conn.proto)
buf := newBuffer(size)
out := (*entryOut)(buf.alloc(size))
out.Nodeid = uint64(resp.Node)
out.Generation = resp.Generation
out.EntryValid = uint64(resp.EntryValid / time.Second)
out.EntryValidNsec = uint32(resp.EntryValid % time.Second / time.Nanosecond)
out.AttrValid = uint64(resp.Attr.Valid / time.Second)
out.AttrValidNsec = uint32(resp.Attr.Valid % time.Second / time.Nanosecond)
resp.Attr.attr(&out.Attr, r.Header.Conn.proto)
r.respond(buf)
}
// A LookupResponse is the response to a LookupRequest.
type LookupResponse struct {
Node NodeID
Generation uint64
EntryValid time.Duration
Attr Attr
}
func (r *LookupResponse) string() string {
return fmt.Sprintf("%v gen=%d valid=%v attr={%v}", r.Node, r.Generation, r.EntryValid, r.Attr)
}
func (r *LookupResponse) String() string {
return fmt.Sprintf("Lookup %s", r.string())
}
// An OpenRequest asks to open a file or directory.
type OpenRequest struct {
Header `json:"-"`
Dir bool // is this Opendir?
Flags OpenFlags
}
var _ = Request(&OpenRequest{})
func (r *OpenRequest) String() string {
return fmt.Sprintf("Open [%s] dir=%v fl=%v", &r.Header, r.Dir, r.Flags)
}
// Respond replies to the request with the given response.
func (r *OpenRequest) Respond(resp *OpenResponse) {
buf := newBuffer(unsafe.Sizeof(openOut{}))
out := (*openOut)(buf.alloc(unsafe.Sizeof(openOut{})))
out.Fh = uint64(resp.Handle)
out.OpenFlags = uint32(resp.Flags)
r.respond(buf)
}
// An OpenResponse is the response to an OpenRequest.
type OpenResponse struct {
Handle HandleID
Flags OpenResponseFlags
}
func (r *OpenResponse) string() string {
return fmt.Sprintf("%v fl=%v", r.Handle, r.Flags)
}
func (r *OpenResponse) String() string {
return fmt.Sprintf("Open %s", r.string())
}
// A CreateRequest asks to create and open a file (not a directory).
type CreateRequest struct {
Header `json:"-"`
Name string
Flags OpenFlags
Mode os.FileMode
// Umask of the request. Not supported on OS X.
Umask os.FileMode
}
var _ = Request(&CreateRequest{})
func (r *CreateRequest) String() string {
return fmt.Sprintf("Create [%s] %q fl=%v mode=%v umask=%v", &r.Header, r.Name, r.Flags, r.Mode, r.Umask)
}
// Respond replies to the request with the given response.
func (r *CreateRequest) Respond(resp *CreateResponse) {
eSize := entryOutSize(r.Header.Conn.proto)
buf := newBuffer(eSize + unsafe.Sizeof(openOut{}))
e := (*entryOut)(buf.alloc(eSize))
e.Nodeid = uint64(resp.Node)
e.Generation = resp.Generation
e.EntryValid = uint64(resp.EntryValid / time.Second)
e.EntryValidNsec = uint32(resp.EntryValid % time.Second / time.Nanosecond)
e.AttrValid = uint64(resp.Attr.Valid / time.Second)
e.AttrValidNsec = uint32(resp.Attr.Valid % time.Second / time.Nanosecond)
resp.Attr.attr(&e.Attr, r.Header.Conn.proto)
o := (*openOut)(buf.alloc(unsafe.Sizeof(openOut{})))
o.Fh = uint64(resp.Handle)
o.OpenFlags = uint32(resp.Flags)
r.respond(buf)
}
// A CreateResponse is the response to a CreateRequest.
// It describes the created node and opened handle.
type CreateResponse struct {
LookupResponse
OpenResponse
}
func (r *CreateResponse) String() string {
return fmt.Sprintf("Create {%s} {%s}", r.LookupResponse.string(), r.OpenResponse.string())
}
// A MkdirRequest asks to create (but not open) a directory.
type MkdirRequest struct {
Header `json:"-"`
Name string
Mode os.FileMode
// Umask of the request. Not supported on OS X.
Umask os.FileMode
}
var _ = Request(&MkdirRequest{})
func (r *MkdirRequest) String() string {
return fmt.Sprintf("Mkdir [%s] %q mode=%v umask=%v", &r.Header, r.Name, r.Mode, r.Umask)
}
// Respond replies to the request with the given response.
func (r *MkdirRequest) Respond(resp *MkdirResponse) {
size := entryOutSize(r.Header.Conn.proto)
buf := newBuffer(size)
out := (*entryOut)(buf.alloc(size))
out.Nodeid = uint64(resp.Node)
out.Generation = resp.Generation
out.EntryValid = uint64(resp.EntryValid / time.Second)
out.EntryValidNsec = uint32(resp.EntryValid % time.Second / time.Nanosecond)
out.AttrValid = uint64(resp.Attr.Valid / time.Second)
out.AttrValidNsec = uint32(resp.Attr.Valid % time.Second / time.Nanosecond)
resp.Attr.attr(&out.Attr, r.Header.Conn.proto)
r.respond(buf)
}
// A MkdirResponse is the response to a MkdirRequest.
type MkdirResponse struct {
LookupResponse
}
func (r *MkdirResponse) String() string {
return fmt.Sprintf("Mkdir %v", r.LookupResponse.string())
}
// A ReadRequest asks to read from an open file.
type ReadRequest struct {
Header `json:"-"`
Dir bool // is this Readdir?
Handle HandleID
Offset int64
Size int
Flags ReadFlags
LockOwner uint64
FileFlags OpenFlags
}
var _ = Request(&ReadRequest{})
func (r *ReadRequest) String() string {
return fmt.Sprintf("Read [%s] %v %d @%#x dir=%v fl=%v lock=%d ffl=%v", &r.Header, r.Handle, r.Size, r.Offset, r.Dir, r.Flags, r.LockOwner, r.FileFlags)
}
// Respond replies to the request with the given response.
func (r *ReadRequest) Respond(resp *ReadResponse) {
if r.Dir {
buf := newBuffer(uintptr(len(resp.Data)))
buf = append(buf, resp.Data...)
r.respond(buf)
} else {
r.respond(resp.Data)
PutBlockBuf(resp.Data)
}
}
// A ReadResponse is the response to a ReadRequest.
type ReadResponse struct {
Data []byte
}
func (r *ReadResponse) String() string {
return fmt.Sprintf("Read %d", len(r.Data))
}
type jsonReadResponse struct {
Len uint64
}
func (r *ReadResponse) MarshalJSON() ([]byte, error) {
j := jsonReadResponse{
Len: uint64(len(r.Data)),
}
return json.Marshal(j)
}
// A ReleaseRequest asks to release (close) an open file handle.
type ReleaseRequest struct {
Header `json:"-"`
Dir bool // is this Releasedir?
Handle HandleID
Flags OpenFlags // flags from OpenRequest
ReleaseFlags ReleaseFlags
LockOwner uint32
}
var _ = Request(&ReleaseRequest{})
func (r *ReleaseRequest) String() string {
return fmt.Sprintf("Release [%s] %v fl=%v rfl=%v owner=%#x", &r.Header, r.Handle, r.Flags, r.ReleaseFlags, r.LockOwner)
}
// Respond replies to the request, indicating that the handle has been released.
func (r *ReleaseRequest) Respond() {
buf := newBuffer(0)
r.respond(buf)
}
// A DestroyRequest is sent by the kernel when unmounting the file system.
// No more requests will be received after this one, but it should still be
// responded to.
type DestroyRequest struct {
Header `json:"-"`
}
var _ = Request(&DestroyRequest{})
func (r *DestroyRequest) String() string {
return fmt.Sprintf("Destroy [%s]", &r.Header)
}
// Respond replies to the request.
func (r *DestroyRequest) Respond() {
buf := newBuffer(0)
r.respond(buf)
}
// A ForgetRequest is sent by the kernel when it is forgetting about
// r.Node, which had been returned by r.N lookup requests.
type ForgetRequest struct {
Header `json:"-"`
N uint64
}
var _ = Request(&ForgetRequest{})
func (r *ForgetRequest) String() string {
return fmt.Sprintf("Forget [%s] %d", &r.Header, r.N)
}
// Respond replies to the request, indicating that the forgetfulness has been recorded.
func (r *ForgetRequest) Respond() {
// Don't reply to forget messages.
r.noResponse()
}
// A Dirent represents a single directory entry.
type Dirent struct {
// Inode this entry names.
Inode uint64
// Type of the entry, for example DT_File.
//
// Setting this is optional. The zero value (DT_Unknown) means
// callers will just need to do a Getattr when the type is
// needed. Providing a type can speed up operations
// significantly.
Type DirentType
// Name of the entry
Name string
}
// Type of an entry in a directory listing.
type DirentType uint32
const (
// These don't quite match os.FileMode; especially there's an
// explicit unknown, instead of zero value meaning file. They
// are also not quite syscall.DT_*; nothing says the FUSE
// protocol follows those, and even if they were, we don't
// want each fs to fiddle with syscall.
// The shift by 12 is hardcoded in the FUSE userspace
// low-level C library, so it's safe here.
DT_Unknown DirentType = 0
DT_Socket DirentType = syscall.S_IFSOCK >> 12
DT_Link DirentType = syscall.S_IFLNK >> 12
DT_File DirentType = syscall.S_IFREG >> 12
DT_Block DirentType = syscall.S_IFBLK >> 12
DT_Dir DirentType = syscall.S_IFDIR >> 12
DT_Char DirentType = syscall.S_IFCHR >> 12
DT_FIFO DirentType = syscall.S_IFIFO >> 12
)
func (t DirentType) String() string {
switch t {
case DT_Unknown:
return "unknown"
case DT_Socket:
return "socket"
case DT_Link:
return "link"
case DT_File:
return "file"
case DT_Block:
return "block"
case DT_Dir:
return "dir"
case DT_Char:
return "char"
case DT_FIFO:
return "fifo"
}
return "invalid"
}
// AppendDirent appends the encoded form of a directory entry to data
// and returns the resulting slice.
func AppendDirent(data []byte, dir Dirent) []byte {
de := dirent{
Ino: dir.Inode,
Namelen: uint32(len(dir.Name)),
Type: uint32(dir.Type),
}
de.Off = uint64(len(data) + direntSize + (len(dir.Name)+7)&^7)
data = append(data, (*[direntSize]byte)(unsafe.Pointer(&de))[:]...)
data = append(data, dir.Name...)
n := direntSize + uintptr(len(dir.Name))
if n%8 != 0 {
var pad [8]byte
data = append(data, pad[:8-n%8]...)
}
return data
}
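// Illustrative sketch (hypothetical inodes and names, not part of the
// original package): building a Readdir reply by appending encoded entries;
// AppendDirent pads each entry to an 8-byte boundary as shown above.
func exampleReaddirData() []byte {
	var data []byte
	data = AppendDirent(data, Dirent{Inode: 1, Type: DT_Dir, Name: "."})
	data = AppendDirent(data, Dirent{Inode: 1, Type: DT_Dir, Name: ".."})
	data = AppendDirent(data, Dirent{Inode: 7, Type: DT_File, Name: "hello.txt"})
	return data
}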
// A WriteRequest asks to write to an open file.
type WriteRequest struct {
Header
Handle HandleID
Offset int64
Data []byte
Flags WriteFlags
LockOwner uint64
FileFlags OpenFlags
}
var _ = Request(&WriteRequest{})
func (r *WriteRequest) String() string {
return fmt.Sprintf("Write [%s] %v %d @%d fl=%v lock=%d ffl=%v", &r.Header, r.Handle, len(r.Data), r.Offset, r.Flags, r.LockOwner, r.FileFlags)
}
type jsonWriteRequest struct {
Handle HandleID
Offset int64
Len uint64
Flags WriteFlags
}
func (r *WriteRequest) MarshalJSON() ([]byte, error) {
j := jsonWriteRequest{
Handle: r.Handle,
Offset: r.Offset,
Len: uint64(len(r.Data)),
Flags: r.Flags,
}
return json.Marshal(j)
}
// Respond replies to the request with the given response.
func (r *WriteRequest) Respond(resp *WriteResponse) {
buf := newBuffer(unsafe.Sizeof(writeOut{}))
out := (*writeOut)(buf.alloc(unsafe.Sizeof(writeOut{})))
out.Size = uint32(resp.Size)
r.respond(buf)
}
// A WriteResponse replies to a write indicating how many bytes were written.
type WriteResponse struct {
Size int
}
func (r *WriteResponse) String() string {
return fmt.Sprintf("Write %d", r.Size)
}
// A SetattrRequest asks to change one or more attributes associated with a file,
// as indicated by Valid.
type SetattrRequest struct {
Header `json:"-"`
Valid SetattrValid
Handle HandleID
Size uint64
Atime time.Time
Mtime time.Time
Mode os.FileMode
Uid uint32
Gid uint32
// OS X only
Bkuptime time.Time
Chgtime time.Time
Crtime time.Time
Flags uint32 // see chflags(2)
}
var _ = Request(&SetattrRequest{})
func (r *SetattrRequest) String() string {
var buf bytes.Buffer
fmt.Fprintf(&buf, "Setattr [%s]", &r.Header)
if r.Valid.Mode() {
fmt.Fprintf(&buf, " mode=%v", r.Mode)
}
if r.Valid.Uid() {
fmt.Fprintf(&buf, " uid=%d", r.Uid)
}
if r.Valid.Gid() {
fmt.Fprintf(&buf, " gid=%d", r.Gid)
}
if r.Valid.Size() {
fmt.Fprintf(&buf, " size=%d", r.Size)
}
if r.Valid.Atime() {
fmt.Fprintf(&buf, " atime=%v", r.Atime)
}
if r.Valid.AtimeNow() {
fmt.Fprintf(&buf, " atime=now")
}
if r.Valid.Mtime() {
fmt.Fprintf(&buf, " mtime=%v", r.Mtime)
}
if r.Valid.MtimeNow() {
fmt.Fprintf(&buf, " mtime=now")
}
if r.Valid.Handle() {
fmt.Fprintf(&buf, " handle=%v", r.Handle)
} else {
fmt.Fprintf(&buf, " handle=INVALID-%v", r.Handle)
}
if r.Valid.LockOwner() {
fmt.Fprintf(&buf, " lockowner")
}
if r.Valid.Crtime() {
fmt.Fprintf(&buf, " crtime=%v", r.Crtime)
}
if r.Valid.Chgtime() {
fmt.Fprintf(&buf, " chgtime=%v", r.Chgtime)
}
if r.Valid.Bkuptime() {
fmt.Fprintf(&buf, " bkuptime=%v", r.Bkuptime)
}
if r.Valid.Flags() {
fmt.Fprintf(&buf, " flags=%v", r.Flags)
}
return buf.String()
}
// Respond replies to the request with the given response,
// giving the updated attributes.
func (r *SetattrRequest) Respond(resp *SetattrResponse) {
size := attrOutSize(r.Header.Conn.proto)
buf := newBuffer(size)
out := (*attrOut)(buf.alloc(size))
out.AttrValid = uint64(resp.Attr.Valid / time.Second)
out.AttrValidNsec = uint32(resp.Attr.Valid % time.Second / time.Nanosecond)
resp.Attr.attr(&out.Attr, r.Header.Conn.proto)
r.respond(buf)
}
// A SetattrResponse is the response to a SetattrRequest.
type SetattrResponse struct {
Attr Attr // file attributes
}
func (r *SetattrResponse) String() string {
return fmt.Sprintf("Setattr %v", r.Attr)
}
// A FlushRequest asks for the current state of an open file to be flushed
// to storage, as when a file descriptor is being closed. A single opened Handle
// may receive multiple FlushRequests over its lifetime.
type FlushRequest struct {
Header `json:"-"`
Handle HandleID
Flags uint32
LockOwner uint64
}
var _ = Request(&FlushRequest{})
func (r *FlushRequest) String() string {
return fmt.Sprintf("Flush [%s] %v fl=%#x lk=%#x", &r.Header, r.Handle, r.Flags, r.LockOwner)
}
// Respond replies to the request, indicating that the flush succeeded.
func (r *FlushRequest) Respond() {
buf := newBuffer(0)
r.respond(buf)
}
// A RemoveRequest asks to remove a file or directory from the
// directory r.Node.
type RemoveRequest struct {
Header `json:"-"`
Name string // name of the entry to remove
Dir bool // is this rmdir?
}
var _ = Request(&RemoveRequest{})
func (r *RemoveRequest) String() string {
return fmt.Sprintf("Remove [%s] %q dir=%v", &r.Header, r.Name, r.Dir)
}
// Respond replies to the request, indicating that the file was removed.
func (r *RemoveRequest) Respond() {
buf := newBuffer(0)
r.respond(buf)
}
// A SymlinkRequest is a request to create a symlink making NewName point to Target.
type SymlinkRequest struct {
Header `json:"-"`
NewName, Target string
}
var _ = Request(&SymlinkRequest{})
func (r *SymlinkRequest) String() string {
return fmt.Sprintf("Symlink [%s] from %q to target %q", &r.Header, r.NewName, r.Target)
}
// Respond replies to the request, indicating that the symlink was created.
func (r *SymlinkRequest) Respond(resp *SymlinkResponse) {
size := entryOutSize(r.Header.Conn.proto)
buf := newBuffer(size)
out := (*entryOut)(buf.alloc(size))
out.Nodeid = uint64(resp.Node)
out.Generation = resp.Generation
out.EntryValid = uint64(resp.EntryValid / time.Second)
out.EntryValidNsec = uint32(resp.EntryValid % time.Second / time.Nanosecond)
out.AttrValid = uint64(resp.Attr.Valid / time.Second)
out.AttrValidNsec = uint32(resp.Attr.Valid % time.Second / time.Nanosecond)
resp.Attr.attr(&out.Attr, r.Header.Conn.proto)
r.respond(buf)
}
// A SymlinkResponse is the response to a SymlinkRequest.
type SymlinkResponse struct {
LookupResponse
}
func (r *SymlinkResponse) String() string {
return fmt.Sprintf("Symlink %v", r.LookupResponse.string())
}
// A ReadlinkRequest is a request to read a symlink's target.
type ReadlinkRequest struct {
Header `json:"-"`
}
var _ = Request(&ReadlinkRequest{})
func (r *ReadlinkRequest) String() string {
return fmt.Sprintf("Readlink [%s]", &r.Header)
}
func (r *ReadlinkRequest) Respond(target string) {
buf := newBuffer(uintptr(len(target)))
buf = append(buf, target...)
r.respond(buf)
}
// A LinkRequest is a request to create a hard link.
type LinkRequest struct {
Header `json:"-"`
OldNode NodeID
NewName string
}
var _ = Request(&LinkRequest{})
func (r *LinkRequest) String() string {
return fmt.Sprintf("Link [%s] node %d to %q", &r.Header, r.OldNode, r.NewName)
}
func (r *LinkRequest) Respond(resp *LookupResponse) {
size := entryOutSize(r.Header.Conn.proto)
buf := newBuffer(size)
out := (*entryOut)(buf.alloc(size))
out.Nodeid = uint64(resp.Node)
out.Generation = resp.Generation
out.EntryValid = uint64(resp.EntryValid / time.Second)
out.EntryValidNsec = uint32(resp.EntryValid % time.Second / time.Nanosecond)
out.AttrValid = uint64(resp.Attr.Valid / time.Second)
out.AttrValidNsec = uint32(resp.Attr.Valid % time.Second / time.Nanosecond)
resp.Attr.attr(&out.Attr, r.Header.Conn.proto)
r.respond(buf)
}
// A RenameRequest is a request to rename a file.
type RenameRequest struct {
Header `json:"-"`
NewDir NodeID
OldName, NewName string
}
var _ = Request(&RenameRequest{})
func (r *RenameRequest) String() string {
return fmt.Sprintf("Rename [%s] from %q to dirnode %v %q", &r.Header, r.OldName, r.NewDir, r.NewName)
}
func (r *RenameRequest) Respond() {
buf := newBuffer(0)
r.respond(buf)
}
// A MknodRequest asks to create a (possibly special) file, such as a
// device node or named pipe.
type MknodRequest struct {
Header `json:"-"`
Name string
Mode os.FileMode
Rdev uint32
// Umask of the request. Not supported on OS X.
Umask os.FileMode
}
var _ = Request(&MknodRequest{})
func (r *MknodRequest) String() string {
return fmt.Sprintf("Mknod [%s] Name %q mode=%v umask=%v rdev=%d", &r.Header, r.Name, r.Mode, r.Umask, r.Rdev)
}
func (r *MknodRequest) Respond(resp *LookupResponse) {
size := entryOutSize(r.Header.Conn.proto)
buf := newBuffer(size)
out := (*entryOut)(buf.alloc(size))
out.Nodeid = uint64(resp.Node)
out.Generation = resp.Generation
out.EntryValid = uint64(resp.EntryValid / time.Second)
out.EntryValidNsec = uint32(resp.EntryValid % time.Second / time.Nanosecond)
out.AttrValid = uint64(resp.Attr.Valid / time.Second)
out.AttrValidNsec = uint32(resp.Attr.Valid % time.Second / time.Nanosecond)
resp.Attr.attr(&out.Attr, r.Header.Conn.proto)
r.respond(buf)
}
// An FsyncRequest asks to flush an open file's (or directory's) contents
// to stable storage.
type FsyncRequest struct {
Header `json:"-"`
Handle HandleID
// TODO bit 1 is datasync, not well documented upstream
Flags uint32
Dir bool
}
var _ = Request(&FsyncRequest{})
func (r *FsyncRequest) String() string {
return fmt.Sprintf("Fsync [%s] Handle %v Flags %v", &r.Header, r.Handle, r.Flags)
}
func (r *FsyncRequest) Respond() {
buf := newBuffer(0)
r.respond(buf)
}
// An InterruptRequest is a request to interrupt another pending request. The
// response to that request should return an error status of EINTR.
type InterruptRequest struct {
Header `json:"-"`
IntrID RequestID // ID of the request to be interrupted.
}
var _ = Request(&InterruptRequest{})
func (r *InterruptRequest) Respond() {
// nothing to do here
r.noResponse()
}
func (r *InterruptRequest) String() string {
return fmt.Sprintf("Interrupt [%s] ID %v", &r.Header, r.IntrID)
}
// An ExchangeDataRequest is a request to exchange the contents of two
// files, while leaving most metadata untouched.
//
// This request comes from OS X exchangedata(2) and represents its
// specific semantics. Crucially, it is very different from Linux
// renameat(2) RENAME_EXCHANGE.
//
// https://developer.apple.com/library/mac/documentation/Darwin/Reference/ManPages/man2/exchangedata.2.html
type ExchangeDataRequest struct {
Header `json:"-"`
OldDir, NewDir NodeID
OldName, NewName string
// TODO options
}
var _ = Request(&ExchangeDataRequest{})
func (r *ExchangeDataRequest) String() string {
// TODO options
return fmt.Sprintf("ExchangeData [%s] %v %q and %v %q", &r.Header, r.OldDir, r.OldName, r.NewDir, r.NewName)
}
func (r *ExchangeDataRequest) Respond() {
buf := newBuffer(0)
r.respond(buf)
}
// See the file LICENSE for copyright and licensing information.
// Derived from FUSE's fuse_kernel.h, which carries this notice:
/*
This file defines the kernel interface of FUSE
Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
This -- and only this -- header file may also be distributed under
the terms of the BSD Licence as follows:
Copyright (C) 2001-2007 Miklos Szeredi. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
*/
package fuse
import (
"fmt"
"syscall"
"unsafe"
)
// The FUSE version implemented by the package.
const (
protoVersionMinMajor = 7
protoVersionMinMinor = 8
protoVersionMaxMajor = 7
protoVersionMaxMinor = 12
)
const (
rootID = 1
)
type kstatfs struct {
Blocks uint64
Bfree uint64
Bavail uint64
Files uint64
Ffree uint64
Bsize uint32
Namelen uint32
Frsize uint32
_ uint32
Spare [6]uint32
}
type fileLock struct {
Start uint64
End uint64
Type uint32
Pid uint32
}
// GetattrFlags are bit flags that can be seen in GetattrRequest.
type GetattrFlags uint32
const (
// Indicates the handle is valid.
GetattrFh GetattrFlags = 1 << 0
)
var getattrFlagsNames = []flagName{
{uint32(GetattrFh), "GetattrFh"},
}
func (fl GetattrFlags) String() string {
return flagString(uint32(fl), getattrFlagsNames)
}
// The SetattrValid are bit flags describing which fields in the SetattrRequest
// are included in the change.
type SetattrValid uint32
const (
SetattrMode SetattrValid = 1 << 0
SetattrUid SetattrValid = 1 << 1
SetattrGid SetattrValid = 1 << 2
SetattrSize SetattrValid = 1 << 3
SetattrAtime SetattrValid = 1 << 4
SetattrMtime SetattrValid = 1 << 5
SetattrHandle SetattrValid = 1 << 6
// Linux only(?)
SetattrAtimeNow SetattrValid = 1 << 7
SetattrMtimeNow SetattrValid = 1 << 8
SetattrLockOwner SetattrValid = 1 << 9 // http://www.mail-archive.com/git-commits-head@vger.kernel.org/msg27852.html
// OS X only
SetattrCrtime SetattrValid = 1 << 28
SetattrChgtime SetattrValid = 1 << 29
SetattrBkuptime SetattrValid = 1 << 30
SetattrFlags SetattrValid = 1 << 31
)
func (fl SetattrValid) Mode() bool { return fl&SetattrMode != 0 }
func (fl SetattrValid) Uid() bool { return fl&SetattrUid != 0 }
func (fl SetattrValid) Gid() bool { return fl&SetattrGid != 0 }
func (fl SetattrValid) Size() bool { return fl&SetattrSize != 0 }
func (fl SetattrValid) Atime() bool { return fl&SetattrAtime != 0 }
func (fl SetattrValid) Mtime() bool { return fl&SetattrMtime != 0 }
func (fl SetattrValid) Handle() bool { return fl&SetattrHandle != 0 }
func (fl SetattrValid) AtimeNow() bool { return fl&SetattrAtimeNow != 0 }
func (fl SetattrValid) MtimeNow() bool { return fl&SetattrMtimeNow != 0 }
func (fl SetattrValid) LockOwner() bool { return fl&SetattrLockOwner != 0 }
func (fl SetattrValid) Crtime() bool { return fl&SetattrCrtime != 0 }
func (fl SetattrValid) Chgtime() bool { return fl&SetattrChgtime != 0 }
func (fl SetattrValid) Bkuptime() bool { return fl&SetattrBkuptime != 0 }
func (fl SetattrValid) Flags() bool { return fl&SetattrFlags != 0 }
func (fl SetattrValid) String() string {
return flagString(uint32(fl), setattrValidNames)
}
var setattrValidNames = []flagName{
{uint32(SetattrMode), "SetattrMode"},
{uint32(SetattrUid), "SetattrUid"},
{uint32(SetattrGid), "SetattrGid"},
{uint32(SetattrSize), "SetattrSize"},
{uint32(SetattrAtime), "SetattrAtime"},
{uint32(SetattrMtime), "SetattrMtime"},
{uint32(SetattrHandle), "SetattrHandle"},
{uint32(SetattrAtimeNow), "SetattrAtimeNow"},
{uint32(SetattrMtimeNow), "SetattrMtimeNow"},
{uint32(SetattrLockOwner), "SetattrLockOwner"},
{uint32(SetattrCrtime), "SetattrCrtime"},
{uint32(SetattrChgtime), "SetattrChgtime"},
{uint32(SetattrBkuptime), "SetattrBkuptime"},
{uint32(SetattrFlags), "SetattrFlags"},
}
// Flags that can be seen in OpenRequest.Flags.
const (
// Access modes. These are not 1-bit flags, but alternatives where
// only one can be chosen. See the IsReadOnly etc convenience
// methods.
OpenReadOnly OpenFlags = syscall.O_RDONLY
OpenWriteOnly OpenFlags = syscall.O_WRONLY
OpenReadWrite OpenFlags = syscall.O_RDWR
// File was opened in append-only mode; all writes will go to the
// end of the file. OS X does not provide this information.
OpenAppend OpenFlags = syscall.O_APPEND
OpenCreate OpenFlags = syscall.O_CREAT
OpenDirectory OpenFlags = syscall.O_DIRECTORY
OpenExclusive OpenFlags = syscall.O_EXCL
OpenNonblock OpenFlags = syscall.O_NONBLOCK
OpenSync OpenFlags = syscall.O_SYNC
OpenTruncate OpenFlags = syscall.O_TRUNC
)
// OpenAccessModeMask is a bitmask that separates the access mode
// from the other flags in OpenFlags.
const OpenAccessModeMask OpenFlags = syscall.O_ACCMODE
// OpenFlags are the O_FOO flags passed to open/create/etc calls. For
// example, os.O_WRONLY | os.O_APPEND.
type OpenFlags uint32
func (fl OpenFlags) String() string {
// O_RDONLY, O_WRONLY, O_RDWR are not flags
s := accModeName(fl & OpenAccessModeMask)
flags := uint32(fl &^ OpenAccessModeMask)
if flags != 0 {
s = s + "+" + flagString(flags, openFlagNames)
}
return s
}
// Return true if OpenReadOnly is set.
func (fl OpenFlags) IsReadOnly() bool {
return fl&OpenAccessModeMask == OpenReadOnly
}
// Return true if OpenWriteOnly is set.
func (fl OpenFlags) IsWriteOnly() bool {
return fl&OpenAccessModeMask == OpenWriteOnly
}
// Return true if OpenReadWrite is set.
func (fl OpenFlags) IsReadWrite() bool {
return fl&OpenAccessModeMask == OpenReadWrite
}
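// Illustrative sketch (hypothetical helper, not part of the original
// package): classifying the access mode of an OpenRequest's flags with the
// convenience methods above.
func exampleAccessMode(fl OpenFlags) string {
	switch {
	case fl.IsReadOnly():
		return "read-only"
	case fl.IsWriteOnly():
		return "write-only"
	case fl.IsReadWrite():
		return "read-write"
	default:
		return "unknown"
	}
}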
func accModeName(flags OpenFlags) string {
switch flags {
case OpenReadOnly:
return "OpenReadOnly"
case OpenWriteOnly:
return "OpenWriteOnly"
case OpenReadWrite:
return "OpenReadWrite"
default:
return ""
}
}
var openFlagNames = []flagName{
{uint32(OpenAppend), "OpenAppend"},
{uint32(OpenCreate), "OpenCreate"},
{uint32(OpenDirectory), "OpenDirectory"},
{uint32(OpenExclusive), "OpenExclusive"},
{uint32(OpenNonblock), "OpenNonblock"},
{uint32(OpenSync), "OpenSync"},
{uint32(OpenTruncate), "OpenTruncate"},
}
// The OpenResponseFlags are returned in the OpenResponse.
type OpenResponseFlags uint32
const (
OpenDirectIO OpenResponseFlags = 1 << 0 // bypass page cache for this open file
OpenKeepCache OpenResponseFlags = 1 << 1 // don't invalidate the data cache on open
OpenNonSeekable OpenResponseFlags = 1 << 2 // mark the file as non-seekable (not supported on OS X)
OpenPurgeAttr OpenResponseFlags = 1 << 30 // OS X
OpenPurgeUBC OpenResponseFlags = 1 << 31 // OS X
)
func (fl OpenResponseFlags) String() string {
return flagString(uint32(fl), openResponseFlagNames)
}
var openResponseFlagNames = []flagName{
{uint32(OpenDirectIO), "OpenDirectIO"},
{uint32(OpenKeepCache), "OpenKeepCache"},
{uint32(OpenNonSeekable), "OpenNonSeekable"},
{uint32(OpenPurgeAttr), "OpenPurgeAttr"},
{uint32(OpenPurgeUBC), "OpenPurgeUBC"},
}
// The InitFlags are used in the Init exchange.
type InitFlags uint32
const (
InitAsyncRead InitFlags = 1 << 0
InitPosixLocks InitFlags = 1 << 1
InitFileOps InitFlags = 1 << 2
InitAtomicTrunc InitFlags = 1 << 3
InitExportSupport InitFlags = 1 << 4
InitBigWrites InitFlags = 1 << 5
// Do not mask file access modes with umask. Not supported on OS X.
InitDontMask InitFlags = 1 << 6
InitSpliceWrite InitFlags = 1 << 7
InitSpliceMove InitFlags = 1 << 8
InitSpliceRead InitFlags = 1 << 9
InitFlockLocks InitFlags = 1 << 10
InitHasIoctlDir InitFlags = 1 << 11
InitAutoInvalData InitFlags = 1 << 12
InitDoReaddirplus InitFlags = 1 << 13
InitReaddirplusAuto InitFlags = 1 << 14
InitAsyncDIO InitFlags = 1 << 15
InitWritebackCache InitFlags = 1 << 16
InitNoOpenSupport InitFlags = 1 << 17
InitPOSIXACL InitFlags = 1 << 20
InitCaseSensitive InitFlags = 1 << 29 // OS X only
InitVolRename InitFlags = 1 << 30 // OS X only
InitXtimes InitFlags = 1 << 31 // OS X only
)
type flagName struct {
bit uint32
name string
}
var initFlagNames = []flagName{
{uint32(InitAsyncRead), "InitAsyncRead"},
{uint32(InitPosixLocks), "InitPosixLocks"},
{uint32(InitFileOps), "InitFileOps"},
{uint32(InitAtomicTrunc), "InitAtomicTrunc"},
{uint32(InitExportSupport), "InitExportSupport"},
{uint32(InitBigWrites), "InitBigWrites"},
{uint32(InitDontMask), "InitDontMask"},
{uint32(InitSpliceWrite), "InitSpliceWrite"},
{uint32(InitSpliceMove), "InitSpliceMove"},
{uint32(InitSpliceRead), "InitSpliceRead"},
{uint32(InitFlockLocks), "InitFlockLocks"},
{uint32(InitHasIoctlDir), "InitHasIoctlDir"},
{uint32(InitAutoInvalData), "InitAutoInvalData"},
{uint32(InitDoReaddirplus), "InitDoReaddirplus"},
{uint32(InitReaddirplusAuto), "InitReaddirplusAuto"},
{uint32(InitAsyncDIO), "InitAsyncDIO"},
{uint32(InitWritebackCache), "InitWritebackCache"},
{uint32(InitNoOpenSupport), "InitNoOpenSupport"},
{uint32(InitPOSIXACL), "InitPOSIXACL"},
{uint32(InitCaseSensitive), "InitCaseSensitive"},
{uint32(InitVolRename), "InitVolRename"},
{uint32(InitXtimes), "InitXtimes"},
}
func (fl InitFlags) String() string {
return flagString(uint32(fl), initFlagNames)
}
func flagString(f uint32, names []flagName) string {
var s string
if f == 0 {
return "0"
}
for _, n := range names {
if f&n.bit != 0 {
s += "+" + n.name
f &^= n.bit
}
}
if f != 0 {
s += fmt.Sprintf("%+#x", f)
}
return s[1:]
}
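// Illustrative sketch (not part of the original package): flagString joins
// the names of the set bits with '+' and appends any leftover unnamed bits
// in hex, so a combined init flag value renders as, for example,
// "InitAsyncRead+InitBigWrites".
func exampleFlagString() string {
	return flagString(uint32(InitAsyncRead|InitBigWrites), initFlagNames)
}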
// The ReleaseFlags are used in the Release exchange.
type ReleaseFlags uint32
const (
ReleaseFlush ReleaseFlags = 1 << 0
)
func (fl ReleaseFlags) String() string {
return flagString(uint32(fl), releaseFlagNames)
}
var releaseFlagNames = []flagName{
{uint32(ReleaseFlush), "ReleaseFlush"},
}
// Opcodes
const (
opLookup = 1
opForget = 2 // no reply
opGetattr = 3
opSetattr = 4
opReadlink = 5
opSymlink = 6
opMknod = 8
opMkdir = 9
opUnlink = 10
opRmdir = 11
opRename = 12
opLink = 13
opOpen = 14
opRead = 15
opWrite = 16
opStatfs = 17
opRelease = 18
opFsync = 20
opSetxattr = 21
opGetxattr = 22
opListxattr = 23
opRemovexattr = 24
opFlush = 25
opInit = 26
opOpendir = 27
opReaddir = 28
opReleasedir = 29
opFsyncdir = 30
opGetlk = 31
opSetlk = 32
opSetlkw = 33
opAccess = 34
opCreate = 35
opInterrupt = 36
opBmap = 37
opDestroy = 38
opIoctl = 39 // Linux?
opPoll = 40 // Linux?
// OS X
opSetvolname = 61
opGetxtimes = 62
opExchange = 63
)
type entryOut struct {
Nodeid uint64 // Inode ID
Generation uint64 // Inode generation
EntryValid uint64 // Cache timeout for the name
AttrValid uint64 // Cache timeout for the attributes
EntryValidNsec uint32
AttrValidNsec uint32
Attr attr
}
func entryOutSize(p Protocol) uintptr {
switch {
case p.LT(Protocol{7, 9}):
return unsafe.Offsetof(entryOut{}.Attr) + unsafe.Offsetof(entryOut{}.Attr.Blksize)
default:
return unsafe.Sizeof(entryOut{})
}
}
type forgetIn struct {
Nlookup uint64
}
type getattrIn struct {
GetattrFlags uint32
_ uint32
Fh uint64
}
type attrOut struct {
AttrValid uint64 // Cache timeout for the attributes
AttrValidNsec uint32
_ uint32
Attr attr
}
func attrOutSize(p Protocol) uintptr {
switch {
case p.LT(Protocol{7, 9}):
return unsafe.Offsetof(attrOut{}.Attr) + unsafe.Offsetof(attrOut{}.Attr.Blksize)
default:
return unsafe.Sizeof(attrOut{})
}
}
// OS X
type getxtimesOut struct {
Bkuptime uint64
Crtime uint64
BkuptimeNsec uint32
CrtimeNsec uint32
}
type mknodIn struct {
Mode uint32
Rdev uint32
Umask uint32
_ uint32
// "filename\x00" follows.
}
func mknodInSize(p Protocol) uintptr {
switch {
case p.LT(Protocol{7, 12}):
return unsafe.Offsetof(mknodIn{}.Umask)
default:
return unsafe.Sizeof(mknodIn{})
}
}
type mkdirIn struct {
Mode uint32
Umask uint32
// filename follows
}
func mkdirInSize(p Protocol) uintptr {
switch {
case p.LT(Protocol{7, 12}):
return unsafe.Offsetof(mkdirIn{}.Umask) + 4
default:
return unsafe.Sizeof(mkdirIn{})
}
}
type renameIn struct {
Newdir uint64
// "oldname\x00newname\x00" follows
}
// OS X
type exchangeIn struct {
Olddir uint64
Newdir uint64
Options uint64
// "oldname\x00newname\x00" follows
}
type linkIn struct {
Oldnodeid uint64
}
type setattrInCommon struct {
Valid uint32
_ uint32
Fh uint64
Size uint64
LockOwner uint64 // unused on OS X?
Atime uint64
Mtime uint64
Unused2 uint64
AtimeNsec uint32
MtimeNsec uint32
Unused3 uint32
Mode uint32
Unused4 uint32
Uid uint32
Gid uint32
Unused5 uint32
}
type openIn struct {
Flags uint32
Unused uint32
}
type openOut struct {
Fh uint64
OpenFlags uint32
_ uint32
}
type createIn struct {
Flags uint32
Mode uint32
Umask uint32
_ uint32
}
func createInSize(p Protocol) uintptr {
switch {
case p.LT(Protocol{7, 12}):
return unsafe.Offsetof(createIn{}.Umask)
default:
return unsafe.Sizeof(createIn{})
}
}
type releaseIn struct {
Fh uint64
Flags uint32
ReleaseFlags uint32
LockOwner uint32
}
type flushIn struct {
Fh uint64
FlushFlags uint32
_ uint32
LockOwner uint64
}
type readIn struct {
Fh uint64
Offset uint64
Size uint32
ReadFlags uint32
LockOwner uint64
Flags uint32
_ uint32
}
func readInSize(p Protocol) uintptr {
switch {
case p.LT(Protocol{7, 9}):
return unsafe.Offsetof(readIn{}.ReadFlags) + 4
default:
return unsafe.Sizeof(readIn{})
}
}
// The ReadFlags are passed in ReadRequest.
type ReadFlags uint32
const (
// LockOwner field is valid.
ReadLockOwner ReadFlags = 1 << 1
)
var readFlagNames = []flagName{
{uint32(ReadLockOwner), "ReadLockOwner"},
}
func (fl ReadFlags) String() string {
return flagString(uint32(fl), readFlagNames)
}
type writeIn struct {
Fh uint64
Offset uint64
Size uint32
WriteFlags uint32
LockOwner uint64
Flags uint32
_ uint32
}
func writeInSize(p Protocol) uintptr {
switch {
case p.LT(Protocol{7, 9}):
return unsafe.Offsetof(writeIn{}.LockOwner)
default:
return unsafe.Sizeof(writeIn{})
}
}
type writeOut struct {
Size uint32
_ uint32
}
// The WriteFlags are passed in WriteRequest.
type WriteFlags uint32
const (
WriteCache WriteFlags = 1 << 0
// LockOwner field is valid.
WriteLockOwner WriteFlags = 1 << 1
)
var writeFlagNames = []flagName{
{uint32(WriteCache), "WriteCache"},
{uint32(WriteLockOwner), "WriteLockOwner"},
}
func (fl WriteFlags) String() string {
return flagString(uint32(fl), writeFlagNames)
}
const compatStatfsSize = 48
type statfsOut struct {
St kstatfs
}
type fsyncIn struct {
Fh uint64
FsyncFlags uint32
_ uint32
}
type setxattrInCommon struct {
Size uint32
Flags uint32
}
func (setxattrInCommon) position() uint32 {
return 0
}
type getxattrInCommon struct {
Size uint32
_ uint32
}
func (getxattrInCommon) position() uint32 {
return 0
}
type getxattrOut struct {
Size uint32
_ uint32
}
type lkIn struct {
Fh uint64
Owner uint64
Lk fileLock
LkFlags uint32
_ uint32
}
func lkInSize(p Protocol) uintptr {
switch {
case p.LT(Protocol{7, 9}):
return unsafe.Offsetof(lkIn{}.LkFlags)
default:
return unsafe.Sizeof(lkIn{})
}
}
type lkOut struct {
Lk fileLock
}
type accessIn struct {
Mask uint32
_ uint32
}
type initIn struct {
Major uint32
Minor uint32
MaxReadahead uint32
Flags uint32
}
const initInSize = int(unsafe.Sizeof(initIn{}))
type initOut struct {
Major uint32
Minor uint32
MaxReadahead uint32
Flags uint32
Unused uint32
MaxWrite uint32
}
type interruptIn struct {
Unique uint64
}
type bmapIn struct {
Block uint64
BlockSize uint32
_ uint32
}
type bmapOut struct {
Block uint64
}
type inHeader struct {
Len uint32
Opcode uint32
Unique uint64
Nodeid uint64
Uid uint32
Gid uint32
Pid uint32
_ uint32
}
const inHeaderSize = int(unsafe.Sizeof(inHeader{}))
type outHeader struct {
Len uint32
Error int32
Unique uint64
}
const OutHeaderSize = int(unsafe.Sizeof(outHeader{}))
type dirent struct {
Ino uint64
Off uint64
Namelen uint32
Type uint32
Name [0]byte
}
const direntSize = 8 + 8 + 4 + 4
const (
notifyCodePoll int32 = 1
notifyCodeInvalInode int32 = 2
notifyCodeInvalEntry int32 = 3
)
type notifyInvalInodeOut struct {
Ino uint64
Off int64
Len int64
}
type notifyInvalEntryOut struct {
Parent uint64
Namelen uint32
_ uint32
}
package fuse
import "time"
type attr struct {
Ino uint64
Size uint64
Blocks uint64
Atime uint64
Mtime uint64
Ctime uint64
AtimeNsec uint32
MtimeNsec uint32
CtimeNsec uint32
Mode uint32
Nlink uint32
Uid uint32
Gid uint32
Rdev uint32
Blksize uint32
padding uint32
}
func (a *attr) Crtime() time.Time {
return time.Time{}
}
func (a *attr) SetCrtime(s uint64, ns uint32) {
// Ignored on Linux.
}
func (a *attr) SetFlags(f uint32) {
// Ignored on Linux.
}
type setattrIn struct {
setattrInCommon
}
func (in *setattrIn) BkupTime() time.Time {
return time.Time{}
}
func (in *setattrIn) Chgtime() time.Time {
return time.Time{}
}
func (in *setattrIn) Flags() uint32 {
return 0
}
func openFlags(flags uint32) OpenFlags {
// on amd64, the 32-bit O_LARGEFILE flag is always seen;
// on i386, the flag probably depends on the app
// requesting, but in any case should be utterly
// uninteresting to us here; our kernel protocol messages
// are not directly related to the client app's kernel
// API/ABI
flags &^= 0x8000
return OpenFlags(flags)
}
type getxattrIn struct {
getxattrInCommon
}
type setxattrIn struct {
setxattrInCommon
}
package fuseutil // import "github.com/cubefs/cubefs/depends/bazil.org/fuse/fuseutil"
import (
"github.com/cubefs/cubefs/depends/bazil.org/fuse"
)
// HandleRead handles a read request assuming that data is the entire file content.
// It adjusts the amount returned in resp according to req.Offset and req.Size.
func HandleRead(req *fuse.ReadRequest, resp *fuse.ReadResponse, data []byte) {
if req.Offset >= int64(len(data)) {
data = nil
} else {
data = data[req.Offset:]
}
if len(data) > req.Size {
data = data[:req.Size]
}
n := copy(resp.Data[:req.Size], data)
resp.Data = resp.Data[:n]
}
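// Illustrative sketch (hypothetical handler, not part of the original
// package): serving a ReadRequest from an in-memory copy of the file.
// resp.Data is assumed to have been allocated with capacity for req.Size
// bytes before the request is dispatched.
func exampleServeRead(req *fuse.ReadRequest, resp *fuse.ReadResponse, content []byte) {
	// HandleRead slices content according to req.Offset and req.Size and
	// copies the result into resp.Data.
	HandleRead(req, resp, content)
}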
package fuse
import (
"bufio"
"errors"
"io"
"log"
"sync"
)
var (
// ErrOSXFUSENotFound is returned from Mount when the OSXFUSE
// installation is not detected.
//
// Only happens on OS X. Make sure OSXFUSE is installed, or see
// OSXFUSELocations for customization.
ErrOSXFUSENotFound = errors.New("cannot locate OSXFUSE")
)
func neverIgnoreLine(line string) bool {
return false
}
func lineLogger(wg *sync.WaitGroup, prefix string, ignore func(line string) bool, r io.ReadCloser) {
defer wg.Done()
scanner := bufio.NewScanner(r)
for scanner.Scan() {
line := scanner.Text()
if ignore(line) {
continue
}
log.Printf("%s: %s", prefix, line)
}
if err := scanner.Err(); err != nil {
log.Printf("%s, error reading: %v", prefix, err)
}
}
package fuse
import (
"fmt"
"log"
"net"
"os"
"os/exec"
"strings"
"sync"
"syscall"
)
func handleFusermountStderr(errCh chan<- error) func(line string) (ignore bool) {
return func(line string) (ignore bool) {
if line == `fusermount: failed to open /etc/fuse.conf: Permission denied` {
// Silence this particular message, it occurs way too
// commonly and isn't very relevant to whether the mount
// succeeds or not.
return true
}
const (
noMountpointPrefix = `fusermount: failed to access mountpoint `
noMountpointSuffix = `: No such file or directory`
)
if strings.HasPrefix(line, noMountpointPrefix) && strings.HasSuffix(line, noMountpointSuffix) {
// re-extract it from the error message in case some layer
// changed the path
mountpoint := line[len(noMountpointPrefix) : len(line)-len(noMountpointSuffix)]
err := &MountpointDoesNotExistError{
Path: mountpoint,
}
select {
case errCh <- err:
return true
default:
// not the first error; fall back to logging it
return false
}
}
return false
}
}
// isBoringFusermountError returns whether the Wait error is
// uninteresting; exit status 1 is.
func isBoringFusermountError(err error) bool {
if err, ok := err.(*exec.ExitError); ok && err.Exited() {
if status, ok := err.Sys().(syscall.WaitStatus); ok && status.ExitStatus() == 1 {
return true
}
}
return false
}
func mount(dir string, conf *mountConfig, ready chan<- struct{}, errp *error) (fusefd *os.File, err error) {
// linux mount is never delayed
close(ready)
fds, err := syscall.Socketpair(syscall.AF_FILE, syscall.SOCK_STREAM, 0)
if err != nil {
return nil, fmt.Errorf("socketpair error: %v", err)
}
writeFile := os.NewFile(uintptr(fds[0]), "fusermount-child-writes")
defer writeFile.Close()
readFile := os.NewFile(uintptr(fds[1]), "fusermount-parent-reads")
defer readFile.Close()
cmd := exec.Command(
"fusermount",
"-o", conf.getOptions(),
"--",
dir,
)
cmd.Env = append(os.Environ(), "_FUSE_COMMFD=3")
cmd.ExtraFiles = []*os.File{writeFile}
var wg sync.WaitGroup
stdout, err := cmd.StdoutPipe()
if err != nil {
return nil, fmt.Errorf("setting up fusermount stderr: %v", err)
}
stderr, err := cmd.StderrPipe()
if err != nil {
return nil, fmt.Errorf("setting up fusermount stderr: %v", err)
}
if err := cmd.Start(); err != nil {
return nil, fmt.Errorf("fusermount: %v", err)
}
helperErrCh := make(chan error, 1)
wg.Add(2)
go lineLogger(&wg, "mount helper output", neverIgnoreLine, stdout)
go lineLogger(&wg, "mount helper error", handleFusermountStderr(helperErrCh), stderr)
wg.Wait()
if err := cmd.Wait(); err != nil {
// see if we have a better error to report
select {
case helperErr := <-helperErrCh:
// log the Wait error if it's not what we expected
if !isBoringFusermountError(err) {
log.Printf("mount helper failed: %v", err)
}
// and now return what we grabbed from stderr as the real
// error
return nil, helperErr
default:
// nope, fall back to generic message
}
return nil, fmt.Errorf("fusermount: %v", err)
}
c, err := net.FileConn(readFile)
if err != nil {
return nil, fmt.Errorf("FileConn from fusermount socket: %v", err)
}
defer c.Close()
uc, ok := c.(*net.UnixConn)
if !ok {
return nil, fmt.Errorf("unexpected FileConn type; expected UnixConn, got %T", c)
}
buf := make([]byte, 32) // expect 1 byte
oob := make([]byte, 32) // expect 24 bytes
_, oobn, _, _, err := uc.ReadMsgUnix(buf, oob)
if err != nil {
return nil, fmt.Errorf("ReadMsgUnix: %v", err)
}
scms, err := syscall.ParseSocketControlMessage(oob[:oobn])
if err != nil {
return nil, fmt.Errorf("ParseSocketControlMessage: %v", err)
}
if len(scms) != 1 {
return nil, fmt.Errorf("expected 1 SocketControlMessage; got scms = %#v", scms)
}
scm := scms[0]
gotFds, err := syscall.ParseUnixRights(&scm)
if err != nil {
return nil, fmt.Errorf("syscall.ParseUnixRights: %v", err)
}
if len(gotFds) != 1 {
return nil, fmt.Errorf("wanted 1 fd; got %#v", gotFds)
}
f := os.NewFile(uintptr(gotFds[0]), "/dev/fuse")
return f, nil
}
package fuse
import (
"errors"
"strings"
)
func dummyOption(conf *mountConfig) error {
return nil
}
// mountConfig holds the configuration for a mount operation.
// Use it by passing MountOption values to Mount.
type mountConfig struct {
options map[string]string
maxReadahead uint32
initFlags InitFlags
osxfuseLocations []OSXFUSEPaths
RequestTimeout int64
}
func escapeComma(s string) string {
s = strings.Replace(s, `\`, `\\`, -1)
s = strings.Replace(s, `,`, `\,`, -1)
return s
}
// getOptions makes a string of options suitable for passing to FUSE
// mount flag `-o`. Returns an empty string if no options were set.
// Any platform specific adjustments should happen before the call.
func (m *mountConfig) getOptions() string {
var opts []string
for k, v := range m.options {
k = escapeComma(k)
if v != "" {
k += "=" + escapeComma(v)
}
opts = append(opts, k)
}
return strings.Join(opts, ",")
}
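// Illustrative sketch (hypothetical option values, not part of the original
// package): how the options map is rendered for fusermount's -o flag. Keys
// and values are escaped by escapeComma, and map iteration order leaves the
// ordering of the joined string unspecified.
func exampleGetOptions() string {
	conf := &mountConfig{options: map[string]string{
		"fsname":  "cubefs",
		"subtype": "cubefs",
		"ro":      "",
	}}
	// Yields something like "fsname=cubefs,ro,subtype=cubefs".
	return conf.getOptions()
}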
type mountOption func(*mountConfig) error
// MountOption is passed to Mount to change the behavior of the mount.
type MountOption mountOption
// FSName sets the file system name (also called source) that is
// visible in the list of mounted file systems.
//
// FreeBSD ignores this option.
func FSName(name string) MountOption {
return func(conf *mountConfig) error {
conf.options["fsname"] = name
return nil
}
}
// Subtype sets the subtype of the mount. The main type is always
// `fuse`. The type in a list of mounted file systems will look like
// `fuse.foo`.
//
// OS X ignores this option.
// FreeBSD ignores this option.
func Subtype(fstype string) MountOption {
return func(conf *mountConfig) error {
conf.options["subtype"] = fstype
return nil
}
}
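// Illustrative sketch (hypothetical values, not part of the original
// package): MountOption values are typically collected into a slice and
// passed to the package's mount entry point. Only the option construction
// is shown here; the actual mount call is outside this sketch.
func exampleMountOptions() []MountOption {
	return []MountOption{
		FSName("cubefs"),
		Subtype("cubefs"),
		MaxReadahead(128 * 1024),
		AllowOther(),
	}
}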
// LocalVolume sets the volume to be local (instead of network),
// changing the behavior of Finder, Spotlight, and such.
//
// OS X only. Others ignore this option.
func LocalVolume() MountOption {
return localVolume
}
// VolumeName sets the volume name shown in Finder.
//
// OS X only. Others ignore this option.
func VolumeName(name string) MountOption {
return volumeName(name)
}
// NoAppleDouble makes OSXFUSE disallow files with names used by OS X
// to store extended attributes on file systems that do not support
// them natively.
//
// Such file names are:
//
// ._*
// .DS_Store
//
// OS X only. Others ignore this option.
func NoAppleDouble() MountOption {
return noAppleDouble
}
// NoAppleXattr makes OSXFUSE disallow extended attributes with the
// prefix "com.apple.". This disables persistent Finder state and
// other such information.
//
// OS X only. Others ignore this option.
func NoAppleXattr() MountOption {
return noAppleXattr
}
// ExclCreate causes O_EXCL flag to be set for only "truly" exclusive creates,
// i.e. create calls for which the initiator explicitly set the O_EXCL flag.
//
// OSXFUSE expects all create calls to return EEXIST in case the file
// already exists, regardless of whether O_EXCL was specified or not.
// To ensure this behavior, it normally sets OpenExclusive for all
// Create calls, regardless of whether the original call had it set.
// For distributed filesystems, that may force every file create to be
// a distributed consensus action, causing undesirable delays.
//
// This option makes the FUSE filesystem see the original flag value,
// and better decide when to ensure global consensus.
//
// Note that returning EEXIST on existing file create is still
// expected with OSXFUSE, regardless of the presence of the
// OpenExclusive flag.
//
// For more information, see
// https://github.com/osxfuse/osxfuse/issues/209
//
// OS X only. Others ignore this option.
// Requires OSXFUSE 3.4.1 or newer.
func ExclCreate() MountOption {
return exclCreate
}
// DaemonTimeout sets the time in seconds between a request and a reply before
// the FUSE mount is declared dead.
//
// OS X and FreeBSD only. Others ignore this option.
func DaemonTimeout(name string) MountOption {
return daemonTimeout(name)
}
var ErrCannotCombineAllowOtherAndAllowRoot = errors.New("cannot combine AllowOther and AllowRoot")
// AllowOther allows other users to access the file system.
//
// Only one of AllowOther or AllowRoot can be used.
func AllowOther() MountOption {
return func(conf *mountConfig) error {
if _, ok := conf.options["allow_root"]; ok {
return ErrCannotCombineAllowOtherAndAllowRoot
}
conf.options["allow_other"] = ""
return nil
}
}
// AllowRoot allows other users to access the file system.
//
// Only one of AllowOther or AllowRoot can be used.
//
// FreeBSD ignores this option.
func AllowRoot() MountOption {
return func(conf *mountConfig) error {
if _, ok := conf.options["allow_other"]; ok {
return ErrCannotCombineAllowOtherAndAllowRoot
}
conf.options["allow_root"] = ""
return nil
}
}
// AllowDev enables interpreting character or block special devices on the
// filesystem.
func AllowDev() MountOption {
return func(conf *mountConfig) error {
conf.options["dev"] = ""
return nil
}
}
// AllowSUID allows set-user-identifier or set-group-identifier bits to take
// effect.
func AllowSUID() MountOption {
return func(conf *mountConfig) error {
conf.options["suid"] = ""
return nil
}
}
// DefaultPermissions makes the kernel enforce access control based on
// the file mode (as in chmod).
//
// Without this option, the Node itself decides what is and is not
// allowed. This is normally ok because FUSE file systems cannot be
// accessed by other users without AllowOther/AllowRoot.
//
// FreeBSD ignores this option.
func DefaultPermissions() MountOption {
return func(conf *mountConfig) error {
conf.options["default_permissions"] = ""
return nil
}
}
// ReadOnly makes the mount read-only.
func ReadOnly() MountOption {
return func(conf *mountConfig) error {
conf.options["ro"] = ""
return nil
}
}
// MaxReadahead sets the number of bytes that can be prefetched for
// sequential reads. The kernel can enforce a maximum value lower than
// this.
//
// This setting makes the kernel perform speculative reads that do not
// originate from any client process. This usually tremendously
// improves read performance.
func MaxReadahead(n uint32) MountOption {
return func(conf *mountConfig) error {
conf.maxReadahead = n
return nil
}
}
// AsyncRead enables multiple outstanding read requests for the same
// handle. Without this, there is at most one request in flight at a
// time.
func AsyncRead() MountOption {
return func(conf *mountConfig) error {
conf.initFlags |= InitAsyncRead
return nil
}
}
// WritebackCache enables the kernel to buffer writes before sending
// them to the FUSE server. Without this, writethrough caching is
// used.
func WritebackCache() MountOption {
return func(conf *mountConfig) error {
conf.initFlags |= InitWritebackCache
return nil
}
}
// AutoInvalData enables automatic kernel cache invalidation (InitAutoInvalData)
// when enable is greater than zero; otherwise the returned option is a no-op.
func AutoInvalData(enable int64) MountOption {
if enable > 0 {
return func(conf *mountConfig) error {
conf.initFlags |= InitAutoInvalData
return nil
}
}
return func(conf *mountConfig) error {
return nil
}
}
// OSXFUSEPaths describes the paths used by an installed OSXFUSE
// version. See OSXFUSELocationV3 for typical values.
type OSXFUSEPaths struct {
// Prefix for the device file. At mount time, an incrementing
// number is suffixed until a free FUSE device is found.
DevicePrefix string
// Path of the load helper, used to load the kernel extension if
// no device files are found.
Load string
// Path of the mount helper, used for the actual mount operation.
Mount string
// Environment variable used to pass the path to the executable
// calling the mount helper.
DaemonVar string
}
// Default paths for OSXFUSE. See OSXFUSELocations.
var (
OSXFUSELocationV3 = OSXFUSEPaths{
DevicePrefix: "/dev/osxfuse",
Load: "/Library/Filesystems/osxfuse.fs/Contents/Resources/load_osxfuse",
Mount: "/Library/Filesystems/osxfuse.fs/Contents/Resources/mount_osxfuse",
DaemonVar: "MOUNT_OSXFUSE_DAEMON_PATH",
}
OSXFUSELocationV2 = OSXFUSEPaths{
DevicePrefix: "/dev/osxfuse",
Load: "/Library/Filesystems/osxfusefs.fs/Support/load_osxfusefs",
Mount: "/Library/Filesystems/osxfusefs.fs/Support/mount_osxfusefs",
DaemonVar: "MOUNT_FUSEFS_DAEMON_PATH",
}
)
// OSXFUSELocations sets where to look for OSXFUSE files. The
// arguments are all the possible locations. The previous locations
// are replaced.
//
// Without this option, OSXFUSELocationV3 and OSXFUSELocationV2 are
// used.
//
// OS X only. Others ignore this option.
func OSXFUSELocations(paths ...OSXFUSEPaths) MountOption {
return func(conf *mountConfig) error {
if len(paths) == 0 {
return errors.New("must specify at least one location for OSXFUSELocations")
}
// replace previous values, but make a copy so there's no
// worries about caller mutating their slice
conf.osxfuseLocations = append(conf.osxfuseLocations[:0], paths...)
return nil
}
}
// AllowNonEmptyMount allows mounting over a non-empty directory.
//
// The files in it will be shadowed by the freshly created mount. By
// default these mounts are rejected to prevent accidental covering up
// of data, which could for example prevent automatic backup.
func AllowNonEmptyMount() MountOption {
return func(conf *mountConfig) error {
conf.options["nonempty"] = ""
return nil
}
}
// PosixACL enables POSIX ACL support.
func PosixACL() MountOption {
return func(conf *mountConfig) error {
conf.initFlags |= InitPOSIXACL
return nil
}
}
// RequestTimeout sets the request timeout.
func RequestTimeout(timeout int64) MountOption {
return func(conf *mountConfig) error {
conf.RequestTimeout = timeout
return nil
}
}
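// The options above are plain functions over *mountConfig, so they compose by
// simple application. The sketch below is illustrative only and not part of the
// upstream API; it assumes mountConfig's options field is a map[string]string,
// as the option bodies in this file suggest.
func exampleApplyMountOptions() error {
    conf := &mountConfig{options: make(map[string]string)}
    opts := []MountOption{
        AllowOther(),
        DefaultPermissions(),
        MaxReadahead(128 * 1024),
        AsyncRead(),
    }
    for _, opt := range opts {
        // each MountOption mutates the shared mountConfig and reports conflicts
        // (e.g. AllowOther vs AllowRoot) as errors
        if err := opt(conf); err != nil {
            return err
        }
    }
    // conf.options now contains "allow_other" and "default_permissions",
    // conf.maxReadahead is 131072, and InitAsyncRead is set in conf.initFlags.
    return nil
}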
package fuse
func localVolume(conf *mountConfig) error {
return nil
}
func volumeName(name string) MountOption {
return dummyOption
}
func daemonTimeout(name string) MountOption {
return dummyOption
}
func noAppleXattr(conf *mountConfig) error {
return nil
}
func noAppleDouble(conf *mountConfig) error {
return nil
}
func exclCreate(conf *mountConfig) error {
return nil
}
package fuse
import (
"fmt"
)
// Protocol is a FUSE protocol version number.
type Protocol struct {
Major uint32
Minor uint32
}
func (p Protocol) String() string {
return fmt.Sprintf("%d.%d", p.Major, p.Minor)
}
// LT returns whether a is less than b.
func (a Protocol) LT(b Protocol) bool {
return a.Major < b.Major ||
(a.Major == b.Major && a.Minor < b.Minor)
}
// GE returns whether a is greater than or equal to b.
func (a Protocol) GE(b Protocol) bool {
return a.Major > b.Major ||
(a.Major == b.Major && a.Minor >= b.Minor)
}
func (a Protocol) is79() bool {
return a.GE(Protocol{7, 9})
}
// HasAttrBlockSize returns whether Attr.BlockSize is respected by the
// kernel.
func (a Protocol) HasAttrBlockSize() bool {
return a.is79()
}
// HasReadWriteFlags returns whether ReadRequest/WriteRequest
// fields Flags and FileFlags are valid.
func (a Protocol) HasReadWriteFlags() bool {
return a.is79()
}
// HasGetattrFlags returns whether GetattrRequest field Flags is
// valid.
func (a Protocol) HasGetattrFlags() bool {
return a.is79()
}
func (a Protocol) is710() bool {
return a.GE(Protocol{7, 10})
}
// HasOpenNonSeekable returns whether OpenResponse field Flags flag
// OpenNonSeekable is supported.
func (a Protocol) HasOpenNonSeekable() bool {
return a.is710()
}
func (a Protocol) is712() bool {
return a.GE(Protocol{7, 12})
}
// HasUmask returns whether CreateRequest/MkdirRequest/MknodRequest
// field Umask is valid.
func (a Protocol) HasUmask() bool {
return a.is712()
}
// HasInvalidate returns whether InvalidateNode/InvalidateEntry are
// supported.
func (a Protocol) HasInvalidate() bool {
return a.is712()
}
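// A minimal sketch of how the helpers above are intended to be used: build the
// negotiated Protocol once, then branch on capability methods rather than on
// raw version numbers. Everything referenced here is defined in this file.
func exampleProtocolChecks() {
    negotiated := Protocol{Major: 7, Minor: 12}
    fmt.Println(negotiated)                     // "7.12"
    fmt.Println(negotiated.GE(Protocol{7, 10})) // true: OpenNonSeekable is usable
    fmt.Println(negotiated.LT(Protocol{7, 9}))  // false
    if negotiated.HasUmask() {
        // safe to read the Umask field on Create/Mkdir/Mknod requests
    }
    if !negotiated.HasInvalidate() {
        // InvalidateNode/InvalidateEntry are unavailable; skip cache invalidation
    }
}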
package fuse
// Unmount tries to unmount the filesystem mounted at dir.
func Unmount(dir string) error {
return unmount(dir)
}
package fuse
import (
"bytes"
"errors"
"os/exec"
)
func unmount(dir string) error {
cmd := exec.Command("fusermount", "-u", dir)
output, err := cmd.CombinedOutput()
if err != nil {
if len(output) > 0 {
output = bytes.TrimRight(output, "\n")
msg := err.Error() + ": " + string(output)
err = errors.New(msg)
}
return err
}
return nil
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"errors"
"strings"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/storage"
)
const (
_ = iota
// KB kilobytes
KB = 1 << (10 * iota)
// MB megabytes
MB
)
const (
defaultTickInterval = time.Millisecond * 2000
defaultHeartbeatTick = 1
defaultElectionTick = 5
defaultInflightMsgs = 128
defaultSizeReqBuffer = 2048
defaultSizeAppBuffer = 2048
defaultRetainLogs = 20000
defaultSizeSendBuffer = 10240
defaultReplConcurrency = 5
defaultSnapConcurrency = 10
defaultSizePerMsg = MB
defaultHeartbeatAddr = ":3016"
defaultReplicateAddr = ":2015"
)
// Config contains the parameters to start a raft server.
// Default: Do not use lease mechanism.
// NOTE: NodeID and Resolver are required. Every other parameter has a default value.
type Config struct {
TransportConfig
// NodeID is the identity of the local node. NodeID cannot be 0.
// This parameter is required.
NodeID uint64
// TickInterval is the interval of the timer that checks heartbeat and election timeouts.
// The default value is 2s.
TickInterval time.Duration
// HeartbeatTick is the heartbeat interval, in ticks. A leader sends heartbeat
// messages to maintain its leadership every HeartbeatTick ticks.
// The default value is 1 tick (2s with the default TickInterval).
HeartbeatTick int
// ElectionTick is the election timeout, in ticks. If a follower receives no message
// from the leader of the current term for ElectionTick ticks, it becomes a candidate and starts an election.
// ElectionTick must be greater than HeartbeatTick.
// We suggest ElectionTick = 10 * HeartbeatTick to avoid unnecessary leader switching.
// The default value is 5 ticks (10s with the default TickInterval).
ElectionTick int
// MaxSizePerMsg limits the max size of each append message.
// The default value is 1M.
MaxSizePerMsg uint64
// MaxInflightMsgs limits the max number of in-flight append messages during optimistic replication phase.
// The application transport layer usually has its own sending buffer over TCP/UDP.
// Set MaxInflightMsgs to avoid overflowing that sending buffer.
// The default value is 128.
MaxInflightMsgs int
// ReqBufferSize limits the capacity of the receive-request channel buffer.
// The default value is 2048.
ReqBufferSize int
// AppBufferSize limits the capacity of the apply channel buffer.
// The default value is 2048.
AppBufferSize int
// RetainLogs controls how many logs we leave after truncate.
// This is used so that we can quickly replay logs on a follower instead of being forced to send an entire snapshot.
// The default value is 20000.
RetainLogs uint64
// LeaseCheck determines whether to use the lease mechanism.
// The default value is false.
// (This is equivalent to etcd raft's CheckQuorum.)
LeaseCheck bool
// PreVote enables the Pre-Vote algorithm described in raft thesis section
// 9.6. This prevents disruption when a node that has been partitioned away
// rejoins the cluster.
PreVote bool
// ReadOnlyOption specifies how the read only request is processed.
//
// ReadOnlySafe guarantees the linearizability of the read only request by
// communicating with the quorum. It is the default and suggested option.
//
// ReadOnlyLeaseBased ensures linearizability of the read only request by
// relying on the leader lease. It can be affected by clock drift.
// If the clock drift is unbounded, leader might keep the lease longer than it
// should (clock can move backward/pause without any bound). ReadIndex is not safe
// in that case.
// LeaseCheck MUST be enabled if ReadOnlyOption is ReadOnlyLeaseBased.
ReadOnlyOption ReadOnlyOption
transport Transport
}
// TransportConfig raft server transport config
type TransportConfig struct {
// HeartbeatAddr is the heartbeat listen address.
// The default value is ":3016".
HeartbeatAddr string
// ReplicateAddr is the replication listen address.
// The default value is ":2015".
ReplicateAddr string
// SendBufferSize is the size of the send queue. The default value is 10240.
SendBufferSize int
// MaxReplConcurrency limits the number of concurrent replications (node->node). The default value is 5.
MaxReplConcurrency int
// MaxSnapConcurrency limits the max number of snapshot concurrency.
// The default value is 10.
MaxSnapConcurrency int
// This parameter is required.
Resolver SocketResolver
}
// RaftConfig contains the parameters to create a raft.
type RaftConfig struct {
ID uint64
Term uint64
Leader uint64
Applied uint64
Peers []proto.Peer
Storage storage.Storage
StateMachine StateMachine
Monitor Monitor
}
// DefaultConfig returns a Config with usable defaults.
func DefaultConfig() *Config {
conf := &Config{
TickInterval: defaultTickInterval,
HeartbeatTick: defaultHeartbeatTick,
ElectionTick: defaultElectionTick,
MaxSizePerMsg: defaultSizePerMsg,
MaxInflightMsgs: defaultInflightMsgs,
ReqBufferSize: defaultSizeReqBuffer,
AppBufferSize: defaultSizeAppBuffer,
RetainLogs: defaultRetainLogs,
LeaseCheck: false,
}
conf.HeartbeatAddr = defaultHeartbeatAddr
conf.ReplicateAddr = defaultReplicateAddr
conf.SendBufferSize = defaultSizeSendBuffer
conf.MaxReplConcurrency = defaultReplConcurrency
conf.MaxSnapConcurrency = defaultSnapConcurrency
return conf
}
// validate returns an error if any required elements of the Config are missing or invalid.
func (c *Config) validate() error {
if c.NodeID == 0 {
return errors.New("NodeID is required")
}
if c.TransportConfig.Resolver == nil {
return errors.New("Resolver is required")
}
if c.MaxSizePerMsg > 4*MB {
return errors.New("MaxSizePerMsg it too high")
}
if c.MaxInflightMsgs > 1024 {
return errors.New("MaxInflightMsgs is too high")
}
if c.MaxSnapConcurrency > 256 {
return errors.New("MaxSnapConcurrency is too high")
}
if c.MaxReplConcurrency > 256 {
return errors.New("MaxReplConcurrency is too high")
}
if c.ReadOnlyOption == ReadOnlyLeaseBased && !c.LeaseCheck {
return errors.New("LeaseCheck MUST be enabled when use ReadOnlyLeaseBased")
}
if strings.TrimSpace(c.TransportConfig.HeartbeatAddr) == "" {
c.TransportConfig.HeartbeatAddr = defaultHeartbeatAddr
}
if strings.TrimSpace(c.TransportConfig.ReplicateAddr) == "" {
c.TransportConfig.ReplicateAddr = defaultReplicateAddr
}
if c.TickInterval < 5*time.Millisecond {
c.TickInterval = defaultTickInterval
}
if c.HeartbeatTick <= 0 {
c.HeartbeatTick = defaultHeartbeatTick
}
if c.ElectionTick <= 0 {
c.ElectionTick = defaultElectionTick
}
if c.MaxSizePerMsg <= 0 {
c.MaxSizePerMsg = defaultSizePerMsg
}
if c.MaxInflightMsgs <= 0 {
c.MaxInflightMsgs = defaultInflightMsgs
}
if c.ReqBufferSize <= 0 {
c.ReqBufferSize = defaultSizeReqBuffer
}
if c.AppBufferSize <= 0 {
c.AppBufferSize = defaultSizeAppBuffer
}
if c.MaxSnapConcurrency <= 0 {
c.MaxSnapConcurrency = defaultSnapConcurrency
}
if c.MaxReplConcurrency <= 0 {
c.MaxReplConcurrency = defaultReplConcurrency
}
if c.SendBufferSize <= 0 {
c.SendBufferSize = defaultSizeSendBuffer
}
return nil
}
// validate returns an error if any required elements of the RaftConfig are missing or invalid.
func (c *RaftConfig) validate() error {
if c.ID == 0 {
return errors.New("ID is required")
}
if len(c.Peers) == 0 {
return errors.New("Peers is required")
}
if c.Storage == nil {
return errors.New("Storage is required")
}
if c.StateMachine == nil {
return errors.New("StateMachine is required")
}
return nil
}
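// A minimal sketch of the construction pattern implied by DefaultConfig and
// validate above: start from the defaults, set the two required fields
// (NodeID and Resolver), override what you need, and let validate backfill
// anything left at its zero value. SocketResolver is defined elsewhere in
// this package; the caller is assumed to provide an implementation.
func exampleConfig(resolver SocketResolver) (*Config, error) {
    c := DefaultConfig()
    c.NodeID = 1          // required
    c.Resolver = resolver // required
    c.HeartbeatTick = 1
    c.ElectionTick = 10 // keep ElectionTick = 10 * HeartbeatTick as suggested above
    c.LeaseCheck = true
    if err := c.validate(); err != nil {
        return nil, err
    }
    return c, nil
}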
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"errors"
)
var (
ErrCompacted = errors.New("requested index is unavailable due to compaction.")
ErrRaftExists = errors.New("raft already exists.")
ErrRaftNotExists = errors.New("raft not exists.")
ErrNotLeader = errors.New("raft is not the leader.")
ErrStopped = errors.New("raft is already shutdown.")
ErrSnapping = errors.New("raft is doing snapshot.")
ErrRetryLater = errors.New("retry later")
)
// FatalError carries an unrecoverable error together with the ID of the raft
// instance that hit it; it is delivered to the state machine via HandleFatalEvent.
type FatalError struct {
ID uint64
Err error
}
// AppPanicError is the panic error raised when the repl encounters a fatal error.
// The server will recover from this panic and stop the shard repl.
type AppPanicError string
func (pe *AppPanicError) Error() string {
return "Occurred application logic panic error: " + string(*pe)
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
type respErr struct {
errCh chan error
}
func (e *respErr) init() {
e.errCh = make(chan error, 1)
}
func (e *respErr) respond(err error) {
e.errCh <- err
close(e.errCh)
}
func (e *respErr) error() <-chan error {
return e.errCh
}
// Future the future
type Future struct {
respErr
respCh chan interface{}
}
func newFuture() *Future {
f := &Future{
respCh: make(chan interface{}, 1),
}
f.init()
return f
}
func (f *Future) respond(resp interface{}, err error) {
if err == nil {
f.respCh <- resp
close(f.respCh)
} else {
f.respErr.respond(err)
}
}
// Response wait response
func (f *Future) Response() (resp interface{}, err error) {
select {
case err = <-f.error():
return
case resp = <-f.respCh:
return
}
}
// AsyncResponse export channels
func (f *Future) AsyncResponse() (respCh <-chan interface{}, errCh <-chan error) {
return f.respCh, f.errCh
}
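// A minimal sketch of the Future contract implemented above: the producer side
// calls respond exactly once with either a result or an error, and the consumer
// either blocks on Response or selects on the channels from AsyncResponse.
func exampleFuture() {
    f := newFuture()
    go func() {
        // stand-in for the raft loop that eventually resolves the future
        f.respond("applied", nil)
    }()
    if resp, err := f.Response(); err == nil {
        _ = resp // "applied"
    }
}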
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package logger
import (
"fmt"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util/log"
)
// Logger encapsulation the log interface.
type Logger interface {
IsEnableDebug() bool
IsEnableInfo() bool
IsEnableWarn() bool
Debug(format string, v ...interface{})
Info(format string, v ...interface{})
Warn(format string, v ...interface{})
Error(format string, v ...interface{})
}
var (
stdLogger = NewDefaultLogger(0)
raftLogger = Logger(stdLogger)
)
// SetLogger replaces the package-level logger used by the raft library.
func SetLogger(l Logger) {
raftLogger = l
}
func IsEnableDebug() bool {
return raftLogger.IsEnableDebug()
}
func IsEnableInfo() bool {
return raftLogger.IsEnableInfo()
}
func IsEnableWarn() bool {
return raftLogger.IsEnableWarn()
}
func Debug(format string, v ...interface{}) {
raftLogger.Debug(format, v...)
}
func Info(format string, v ...interface{}) {
raftLogger.Info(format, v...)
}
func Warn(format string, v ...interface{}) {
raftLogger.Warn(format, v...)
}
func Error(format string, v ...interface{}) {
raftLogger.Error(format, v...)
}
// DefaultLogger is a default implementation of the Logger interface.
type DefaultLogger struct {
*log.Log
debugEnable bool
infoEnable bool
warnEnable bool
}
func NewDefaultLogger(level int) *DefaultLogger {
logger, err := log.NewLog("", "raft", "DEBUG")
if err != nil {
panic(err)
}
return &DefaultLogger{
Log: logger,
debugEnable: level <= log.DebugLevel,
infoEnable: level <= log.InfoLevel,
warnEnable: level <= log.WarnLevel,
}
}
func (l *DefaultLogger) header(lvl, msg string) string {
return fmt.Sprintf("%s: %s", lvl, msg)
}
func (l *DefaultLogger) IsEnableDebug() bool {
return l.debugEnable
}
func (l *DefaultLogger) Debug(format string, v ...interface{}) {
l.Output(4, l.header("DEBUG", fmt.Sprintf(format, v...)), false)
}
func (l *DefaultLogger) IsEnableInfo() bool {
return l.infoEnable
}
func (l *DefaultLogger) Info(format string, v ...interface{}) {
l.Output(4, l.header("INFO", fmt.Sprintf(format, v...)), false)
}
func (l *DefaultLogger) IsEnableWarn() bool {
return l.warnEnable
}
func (l *DefaultLogger) Warn(format string, v ...interface{}) {
l.Output(4, l.header("WARN", fmt.Sprintf(format, v...)), false)
}
func (l *DefaultLogger) Error(format string, v ...interface{}) {
l.Output(4, l.header("ERROR", fmt.Sprintf(format, v...)), false)
}
type FileLogger struct {
*log.Log
debugEnable bool
infoEnable bool
warnEnable bool
}
func NewFileLogger(logger *log.Log, level int) *FileLogger {
return &FileLogger{
Log: logger,
debugEnable: level <= log.DebugLevel,
infoEnable: level <= log.InfoLevel,
warnEnable: level <= log.WarnLevel,
}
}
func (fl *FileLogger) IsEnableDebug() bool {
return fl.debugEnable
}
func (fl *FileLogger) Debug(format string, v ...interface{}) {
// write through the embedded log.Log (same Output pattern as DefaultLogger)
// instead of recursing into this method
fl.Output(4, fmt.Sprintf(format, v...), false)
}
func (fl *FileLogger) IsEnableInfo() bool {
return fl.infoEnable
}
func (fl *FileLogger) Info(format string, v ...interface{}) {
fl.Output(4, fmt.Sprintf(format, v...), false)
}
func (fl *FileLogger) IsEnableWarn() bool {
return fl.warnEnable
}
func (fl *FileLogger) Warn(format string, v ...interface{}) {
fl.Output(4, fmt.Sprintf(format, v...), false)
}
func (fl *FileLogger) Error(format string, v ...interface{}) {
fl.Output(4, fmt.Sprintf(format, v...), false)
}
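// A minimal sketch of how an application installs its own log sink: wrap an
// existing *log.Log (or any Logger implementation) and register it with
// SetLogger, after which the package-level helpers route to it. The level
// constants come from the util/log package imported above.
func exampleSetLogger(l *log.Log) {
    SetLogger(NewFileLogger(l, log.InfoLevel))
    if IsEnableDebug() {
        Debug("debug logging enabled for raft internals")
    }
    Info("raft logger switched to the application-provided sink")
}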
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"sync"
)
var pool = newPoolFactory()
type poolFactory struct {
applyPool *sync.Pool
proposalPool *sync.Pool
}
func newPoolFactory() *poolFactory {
return &poolFactory{
applyPool: &sync.Pool{
New: func() interface{} {
return new(apply)
},
},
proposalPool: &sync.Pool{
New: func() interface{} {
return new(proposal)
},
},
}
}
func (f *poolFactory) getApply() *apply {
a := f.applyPool.Get().(*apply)
a.command = nil
a.future = nil
a.readIndexes = nil
return a
}
func (f *poolFactory) returnApply(a *apply) {
if a != nil {
f.applyPool.Put(a)
}
}
func (f *poolFactory) getProposal() *proposal {
p := f.proposalPool.Get().(*proposal)
p.data = nil
p.future = nil
return p
}
func (f *poolFactory) returnProposal(p *proposal) {
if p != nil {
f.proposalPool.Put(p)
}
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package proto
import (
"encoding/binary"
"fmt"
"io"
"sort"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
const (
version1 byte = 1
peer_size uint64 = 11
entry_header uint64 = 17
snapmeta_header uint64 = 20
message_header uint64 = 68
)
// Peer codec
func (p *Peer) Encode(datas []byte) {
datas[0] = byte(p.Type)
binary.BigEndian.PutUint16(datas[1:], p.Priority)
binary.BigEndian.PutUint64(datas[3:], p.ID)
}
func (p *Peer) Decode(datas []byte) {
p.Type = PeerType(datas[0])
p.Priority = binary.BigEndian.Uint16(datas[1:])
p.ID = binary.BigEndian.Uint64(datas[3:])
}
// HardState codec
func (c *HardState) Encode(datas []byte) {
binary.BigEndian.PutUint64(datas[0:], c.Term)
binary.BigEndian.PutUint64(datas[8:], c.Commit)
binary.BigEndian.PutUint64(datas[16:], c.Vote)
}
func (c *HardState) Decode(datas []byte) {
c.Term = binary.BigEndian.Uint64(datas[0:])
c.Commit = binary.BigEndian.Uint64(datas[8:])
c.Vote = binary.BigEndian.Uint64(datas[16:])
}
func (c *HardState) Size() uint64 {
return 24
}
// ConfChange codec
func (c *ConfChange) Encode() []byte {
datas := make([]byte, 1+peer_size+uint64(len(c.Context)))
datas[0] = byte(c.Type)
c.Peer.Encode(datas[1:])
if len(c.Context) > 0 {
copy(datas[peer_size+1:], c.Context)
}
return datas
}
func (c *ConfChange) Decode(datas []byte) {
c.Type = ConfChangeType(datas[0])
c.Peer.Decode(datas[1:])
if uint64(len(datas)) > peer_size+1 {
c.Context = append([]byte{}, datas[peer_size+1:]...)
}
}
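// A minimal round-trip sketch of the ConfChange/Peer wire format defined above:
// one byte of change type, an 11-byte peer record, then the optional opaque
// context. ConfAddNode, PeerNormal and the String methods used by Println are
// defined elsewhere in this package.
func exampleConfChangeRoundTrip() {
    cc := &ConfChange{
        Type:    ConfAddNode,
        Peer:    Peer{Type: PeerNormal, Priority: 1, ID: 42},
        Context: []byte("rebalance"),
    }
    var got ConfChange
    got.Decode(cc.Encode())
    fmt.Println(got.Type, got.Peer.ID, string(got.Context)) // ConfAddNode 42 rebalance
}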
// SnapshotMeta codec
func (m *SnapshotMeta) Size() uint64 {
return snapmeta_header + peer_size*uint64(len(m.Peers))
}
func (m *SnapshotMeta) Encode(w io.Writer) error {
buf := getByteSlice()
defer returnByteSlice(buf)
binary.BigEndian.PutUint64(buf, m.Index)
binary.BigEndian.PutUint64(buf[8:], m.Term)
binary.BigEndian.PutUint32(buf[16:], uint32(len(m.Peers)))
if _, err := w.Write(buf[0:snapmeta_header]); err != nil {
return err
}
for _, p := range m.Peers {
p.Encode(buf)
if _, err := w.Write(buf[0:peer_size]); err != nil {
return err
}
}
return nil
}
func (m *SnapshotMeta) Decode(datas []byte) {
m.Index = binary.BigEndian.Uint64(datas)
m.Term = binary.BigEndian.Uint64(datas[8:])
size := binary.BigEndian.Uint32(datas[16:])
m.Peers = make([]Peer, size)
start := snapmeta_header
for i := uint32(0); i < size; i++ {
m.Peers[i].Decode(datas[start:])
start = start + peer_size
}
}
// Entry codec
func (e *Entry) Size() uint64 {
return entry_header + uint64(len(e.Data))
}
func (e *Entry) Encode(w io.Writer) error {
buf := getByteSlice()
defer returnByteSlice(buf)
buf[0] = byte(e.Type)
binary.BigEndian.PutUint64(buf[1:], e.Term)
binary.BigEndian.PutUint64(buf[9:], e.Index)
if _, err := w.Write(buf[0:entry_header]); err != nil {
return err
}
if len(e.Data) > 0 {
if _, err := w.Write(e.Data); err != nil {
return err
}
}
return nil
}
func (e *Entry) Decode(datas []byte) {
e.Type = EntryType(datas[0])
e.Term = binary.BigEndian.Uint64(datas[1:])
e.Index = binary.BigEndian.Uint64(datas[9:])
if uint64(len(datas)) > entry_header {
e.Data = datas[entry_header:]
}
}
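// A minimal round-trip sketch of the entry wire format: Encode writes the
// 17-byte header (type, term, index) followed by the raw data, and Decode
// parses the same layout back out of a byte slice. bytes.Buffer stands in for
// the network or storage writer used by real callers and would need an extra
// "bytes" import in this file.
func exampleEntryRoundTrip() error {
    in := &Entry{Type: EntryNormal, Term: 7, Index: 101, Data: []byte("set x=1")}
    var buf bytes.Buffer
    if err := in.Encode(&buf); err != nil {
        return err
    }
    var out Entry
    out.Decode(buf.Bytes())
    fmt.Println(out.Term, out.Index, string(out.Data)) // 7 101 set x=1
    return nil
}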
// Message codec
func (m *Message) Size() uint64 {
if m.Type == ReqMsgSnapShot {
return message_header + m.SnapshotMeta.Size()
}
size := message_header + 4
if len(m.Entries) > 0 {
for _, e := range m.Entries {
size = size + e.Size() + 4
}
}
if len(m.Context) > 0 {
size = size + uint64(len(m.Context))
}
return size
}
func (m *Message) Encode(w io.Writer) error {
buf := getByteSlice()
defer returnByteSlice(buf)
binary.BigEndian.PutUint32(buf, uint32(m.Size()))
buf[4] = version1
buf[5] = byte(m.Type)
if m.ForceVote {
buf[6] = 1
} else {
buf[6] = 0
}
if m.Reject {
buf[7] = 1
} else {
buf[7] = 0
}
binary.BigEndian.PutUint64(buf[8:], m.RejectHint)
binary.BigEndian.PutUint64(buf[16:], m.ID)
binary.BigEndian.PutUint64(buf[24:], m.From)
binary.BigEndian.PutUint64(buf[32:], m.To)
binary.BigEndian.PutUint64(buf[40:], m.Term)
binary.BigEndian.PutUint64(buf[48:], m.LogTerm)
binary.BigEndian.PutUint64(buf[56:], m.Index)
binary.BigEndian.PutUint64(buf[64:], m.Commit)
if _, err := w.Write(buf[0 : message_header+4]); err != nil {
return err
}
if m.Type == ReqMsgSnapShot {
return m.SnapshotMeta.Encode(w)
}
binary.BigEndian.PutUint32(buf, uint32(len(m.Entries)))
if _, err := w.Write(buf[0:4]); err != nil {
return err
}
if len(m.Entries) > 0 {
for _, e := range m.Entries {
binary.BigEndian.PutUint32(buf, uint32(e.Size()))
if _, err := w.Write(buf[0:4]); err != nil {
return err
}
if err := e.Encode(w); err != nil {
return err
}
}
}
if len(m.Context) > 0 {
if _, err := w.Write(m.Context); err != nil {
return err
}
}
return nil
}
func (m *Message) Decode(r *util.BufferReader) error {
var (
datas []byte
err error
)
if datas, err = r.ReadFull(4); err != nil {
return err
}
cnt := int(binary.BigEndian.Uint32(datas))
if cnt > 256*1024*1024 {
return fmt.Errorf("msg len is too big, please check, %d", cnt)
}
if datas, err = r.ReadFull(cnt); err != nil {
return err
}
if len(datas) == 0 {
return nil
}
ver := datas[0]
if ver == version1 {
m.Type = MsgType(datas[1])
m.ForceVote = (datas[2] == 1)
m.Reject = (datas[3] == 1)
m.RejectHint = binary.BigEndian.Uint64(datas[4:])
m.ID = binary.BigEndian.Uint64(datas[12:])
m.From = binary.BigEndian.Uint64(datas[20:])
m.To = binary.BigEndian.Uint64(datas[28:])
m.Term = binary.BigEndian.Uint64(datas[36:])
m.LogTerm = binary.BigEndian.Uint64(datas[44:])
m.Index = binary.BigEndian.Uint64(datas[52:])
m.Commit = binary.BigEndian.Uint64(datas[60:])
if m.Type == ReqMsgSnapShot {
m.SnapshotMeta.Decode(datas[message_header:])
} else {
size := binary.BigEndian.Uint32(datas[message_header:])
start := message_header + 4
if size > 0 {
for i := uint32(0); i < size; i++ {
esize := binary.BigEndian.Uint32(datas[start:])
start = start + 4
end := start + uint64(esize)
entry := new(Entry)
entry.Decode(datas[start:end])
m.Entries = append(m.Entries, entry)
start = end
}
}
if start < uint64(len(datas)) {
m.Context = datas[start:]
}
}
}
return nil
}
func EncodeHBConext(ctx HeartbeatContext) (buf []byte) {
sort.Slice(ctx, func(i, j int) bool {
return ctx[i] < ctx[j]
})
scratch := make([]byte, binary.MaxVarintLen64)
prev := uint64(0)
for _, id := range ctx {
n := binary.PutUvarint(scratch, id-prev)
buf = append(buf, scratch[:n]...)
prev = id
}
return
}
func DecodeHBContext(buf []byte) (ctx HeartbeatContext) {
prev := uint64(0)
for len(buf) > 0 {
id, n := binary.Uvarint(buf)
ctx = append(ctx, id+prev)
prev = id + prev
buf = buf[n:]
}
return
}
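// A minimal sketch of the heartbeat-context encoding above: IDs are sorted and
// stored as varint deltas from their predecessor, so dense ID ranges encode
// into very few bytes. Note that EncodeHBConext sorts the slice in place.
func exampleHeartbeatContext() {
    ids := HeartbeatContext{30, 10, 20, 21}
    buf := EncodeHBConext(ids)
    fmt.Println(DecodeHBContext(buf)) // [10 20 21 30]
}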
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package proto
import (
"sync"
)
var (
msgPool = &sync.Pool{
New: func() interface{} {
return &Message{
Entries: make([]*Entry, 0, 128),
}
},
}
bytePool = &sync.Pool{
New: func() interface{} {
return make([]byte, 128)
},
}
)
func GetMessage() *Message {
msg := msgPool.Get().(*Message)
msg.Reject = false
msg.RejectHint = 0
msg.ID = 0
msg.From = 0
msg.To = 0
msg.Term = 0
msg.LogTerm = 0
msg.Index = 0
msg.Commit = 0
msg.SnapshotMeta.Index = 0
msg.SnapshotMeta.Term = 0
msg.SnapshotMeta.Peers = nil
msg.Snapshot = nil
msg.Context = nil
msg.Entries = msg.Entries[0:0]
return msg
}
func ReturnMessage(msg *Message) {
if msg != nil {
msgPool.Put(msg)
}
}
func getByteSlice() []byte {
return bytePool.Get().([]byte)
}
func returnByteSlice(b []byte) {
bytePool.Put(b)
}
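// A minimal sketch of the borrow/return discipline for the pools above:
// GetMessage hands out a reset *Message (entry slice emptied but capacity
// kept), the caller fills it in, and ReturnMessage puts it back once the
// message has been fully handled.
func exampleMessagePool() {
    msg := GetMessage()
    msg.Type = ReqMsgHeartBeat
    msg.From, msg.To, msg.Term = 1, 2, 5
    // ... hand msg to the transport or state machine ...
    ReturnMessage(msg)
}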
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package proto
import (
"fmt"
)
type (
MsgType byte
EntryType byte
ConfChangeType byte
PeerType byte
)
const (
ReqMsgAppend MsgType = iota
ReqMsgPreVote
ReqMsgHeartBeat
ReqMsgSnapShot
ReqMsgVote
RespMsgAppend
RespMsgPreVote
RespMsgHeartBeat
RespMsgSnapShot
RespMsgVote
LocalMsgHup
LocalMsgProp
LeaseMsgOffline
LeaseMsgTimeout
ReqCheckQuorum
RespCheckQuorum
)
const (
ConfAddNode ConfChangeType = 0
ConfRemoveNode ConfChangeType = 1
ConfUpdateNode ConfChangeType = 2
EntryNormal EntryType = 0
EntryConfChange EntryType = 1
PeerNormal PeerType = 0
PeerArbiter PeerType = 1
)
// The Snapshot interface is supplied by the application to access its snapshot data.
type Snapshot interface {
SnapIterator
ApplyIndex() uint64
Close()
}
type SnapIterator interface {
// Next returns io.EOF when the snapshot data has been fully consumed.
Next() ([]byte, error)
}
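// memSnapshot is a minimal sketch of the Snapshot interface the application
// must supply: it iterates over an in-memory batch of encoded state and
// reports the raft index the data corresponds to. Real implementations
// usually stream from storage instead; this sketch also assumes an extra
// "io" import in this file for io.EOF.
type memSnapshot struct {
    data  [][]byte
    pos   int
    index uint64
}

func (s *memSnapshot) Next() ([]byte, error) {
    if s.pos >= len(s.data) {
        return nil, io.EOF // io.EOF signals the end of the snapshot stream
    }
    b := s.data[s.pos]
    s.pos++
    return b, nil
}

func (s *memSnapshot) ApplyIndex() uint64 { return s.index }

func (s *memSnapshot) Close() {}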
type SnapshotMeta struct {
Index uint64
Term uint64
Peers []Peer
}
type Peer struct {
Type PeerType
Priority uint16
ID uint64 // NodeID
PeerID uint64 // Replica ID, unique over all raft groups and all replicas in the same group
}
// HardState is the repl state that must be persisted to storage.
type HardState struct {
Term uint64
Commit uint64
Vote uint64
}
// Entry is the repl log entry.
type Entry struct {
Type EntryType
Term uint64
Index uint64
Data []byte
}
// Message is the transport message.
type Message struct {
Type MsgType
ForceVote bool
Reject bool
RejectHint uint64
ID uint64
From uint64
To uint64
Term uint64
LogTerm uint64
Index uint64
Commit uint64
SnapshotMeta SnapshotMeta
Entries []*Entry
Context []byte
Snapshot Snapshot // No need for codec
}
func (m *Message) ToString() (mesg string) {
return fmt.Sprintf("Mesg:[%v] type(%v) ForceVote(%v) Reject(%v) RejectHint(%v) "+
"From(%v) To(%v) Term(%v) LogTrem(%v) Index(%v) Commit(%v)", m.ID, m.Type.String(), m.ForceVote,
m.Reject, m.RejectHint, m.From, m.To, m.Term, m.LogTerm, m.Index, m.Commit)
}
type ConfChange struct {
Type ConfChangeType
Peer Peer
Context []byte
}
type HeartbeatContext []uint64
func (t MsgType) String() string {
switch t {
case 0:
return "ReqMsgAppend"
case 1:
return "ReqMsgPreVote"
case 2:
return "ReqMsgHeartBeat"
case 3:
return "ReqMsgSnapShot"
case 4:
return "ReqMsgVote"
case 5:
return "RespMsgAppend"
case 6:
return "RespMsgPreVote"
case 7:
return "RespMsgHeartBeat"
case 8:
return "RespMsgSnapShot"
case 9:
return "RespMsgVote"
case 10:
return "LocalMsgHup"
case 11:
return "LocalMsgProp"
case 12:
return "LeaseMsgOffline"
case 13:
return "LeaseMsgTimeout"
case 14:
return "ReqCheckQuorum"
case 15:
return "RespCheckQuorum"
}
return "unknown"
}
func (t EntryType) String() string {
switch t {
case 0:
return "EntryNormal"
case 1:
return "EntryConfChange"
}
return "unknown"
}
func (t ConfChangeType) String() string {
switch t {
case 0:
return "ConfAddNode"
case 1:
return "ConfRemoveNode"
case 2:
return "ConfUpdateNode"
}
return "unknown"
}
func (t PeerType) String() string {
switch t {
case 0:
return "PeerNormal"
case 1:
return "PeerArbiter"
}
return "unknown"
}
func (p Peer) String() string {
return fmt.Sprintf(`"nodeID":"%v","peerID":"%v","priority":"%v","type":"%v"`,
p.ID, p.PeerID, p.Priority, p.Type.String())
}
func (cc *ConfChange) String() string {
return fmt.Sprintf(`{"type":"%v",%v}`, cc.Type, cc.Peer.String())
}
func (m *Message) IsResponseMsg() bool {
return m.Type == RespMsgAppend || m.Type == RespMsgHeartBeat || m.Type == RespMsgVote ||
m.Type == RespMsgPreVote || m.Type == RespMsgSnapShot || m.Type == RespCheckQuorum
}
func (m *Message) IsElectionMsg() bool {
return m.Type == ReqMsgHeartBeat || m.Type == RespMsgHeartBeat || m.Type == ReqMsgVote || m.Type == RespMsgVote ||
m.Type == ReqMsgPreVote || m.Type == RespMsgPreVote || m.Type == LeaseMsgOffline || m.Type == LeaseMsgTimeout
}
func (m *Message) IsHeartbeatMsg() bool {
return m.Type == ReqMsgHeartBeat || m.Type == RespMsgHeartBeat
}
func (s *HardState) IsEmpty() bool {
return s.Term == 0 && s.Vote == 0 && s.Commit == 0
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
"runtime"
"sync"
"sync/atomic"
"unsafe"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
type proposal struct {
cmdType proto.EntryType
future *Future
data []byte
}
type apply struct {
term uint64
index uint64
future *Future
command interface{}
readIndexes []*Future
}
// entryRequest carries a user request to fetch log entries.
type entryRequest struct {
future *Future
index uint64
maxSize uint64
onlyCommit bool
}
type softState struct {
leader uint64
term uint64
}
type peerState struct {
peers map[uint64]proto.Peer
mu sync.RWMutex
}
type monitorStatus struct {
conErrCount uint8
replicasErrCnt map[uint64]uint8
}
func (s *peerState) change(c *proto.ConfChange) {
s.mu.Lock()
switch c.Type {
case proto.ConfAddNode:
s.peers[c.Peer.ID] = c.Peer
case proto.ConfRemoveNode:
delete(s.peers, c.Peer.ID)
case proto.ConfUpdateNode:
s.peers[c.Peer.ID] = c.Peer
}
s.mu.Unlock()
}
func (s *peerState) replace(peers []proto.Peer) {
s.mu.Lock()
s.peers = nil
s.peers = make(map[uint64]proto.Peer)
for _, p := range peers {
s.peers[p.ID] = p
}
s.mu.Unlock()
}
func (s *peerState) get() (nodes []uint64) {
s.mu.RLock()
for n := range s.peers {
nodes = append(nodes, n)
}
s.mu.RUnlock()
return
}
type raft struct {
raftFsm *raftFsm
config *Config
raftConfig *RaftConfig
restoringSnapshot util.AtomicBool
curApplied util.AtomicUInt64
curSoftSt unsafe.Pointer
prevSoftSt softState
prevHardSt proto.HardState
peerState peerState
pending map[uint64]*Future
snapping map[uint64]*snapshotStatus
mStatus *monitorStatus
propc chan *proposal
applyc chan *apply
recvc chan *proto.Message
snapRecvc chan *snapshotRequest
truncatec chan uint64
readIndexC chan *Future
statusc chan chan *Status
entryRequestC chan *entryRequest
readyc chan struct{}
tickc chan struct{}
electc chan struct{}
stopc chan struct{}
done chan struct{}
mu sync.Mutex
}
func newRaft(config *Config, raftConfig *RaftConfig) (*raft, error) {
defer util.HandleCrash()
if err := raftConfig.validate(); err != nil {
return nil, err
}
r, err := newRaftFsm(config, raftConfig)
if err != nil {
return nil, err
}
mStatus := &monitorStatus{
conErrCount: 0,
replicasErrCnt: make(map[uint64]uint8),
}
raft := &raft{
raftFsm: r,
config: config,
raftConfig: raftConfig,
mStatus: mStatus,
pending: make(map[uint64]*Future),
snapping: make(map[uint64]*snapshotStatus),
recvc: make(chan *proto.Message, config.ReqBufferSize),
applyc: make(chan *apply, config.AppBufferSize),
propc: make(chan *proposal, 256),
snapRecvc: make(chan *snapshotRequest, 1),
truncatec: make(chan uint64, 1),
readIndexC: make(chan *Future, 256),
statusc: make(chan chan *Status, 1),
entryRequestC: make(chan *entryRequest, 16),
tickc: make(chan struct{}, 64),
readyc: make(chan struct{}, 1),
electc: make(chan struct{}, 1),
stopc: make(chan struct{}),
done: make(chan struct{}),
}
raft.curApplied.Set(r.raftLog.applied)
raft.peerState.replace(raftConfig.Peers)
util.RunWorker(raft.runApply, raft.handlePanic)
util.RunWorker(raft.run, raft.handlePanic)
return raft, nil
}
func (s *raft) stop() {
select {
case <-s.done:
return
default:
s.doStop()
}
<-s.done
}
func (s *raft) doStop() {
s.mu.Lock()
defer s.mu.Unlock()
select {
case <-s.stopc:
return
default:
s.raftFsm.StopFsm()
close(s.stopc)
s.restoringSnapshot.Set(false)
}
}
func (s *raft) runApply() {
defer func() {
s.doStop()
s.resetApply()
}()
loopCount := 0
for {
loopCount = loopCount + 1
if loopCount > 16 {
loopCount = 0
runtime.Gosched()
}
select {
case <-s.stopc:
return
case apply := <-s.applyc:
if apply.index <= s.curApplied.Get() {
if len(apply.readIndexes) > 0 {
respondReadIndex(apply.readIndexes, nil)
}
continue
}
var (
err error
resp interface{}
)
switch cmd := apply.command.(type) {
case *proto.ConfChange:
resp, err = s.raftConfig.StateMachine.ApplyMemberChange(cmd, apply.index)
if cmd.Type == proto.ConfRemoveNode && err == nil {
s.raftFsm.mo.RemovePeer(s.raftFsm.id, cmd.Peer)
}
case []byte:
resp, err = s.raftConfig.StateMachine.Apply(cmd, apply.index)
}
if apply.future != nil {
apply.future.respond(resp, err)
}
if len(apply.readIndexes) > 0 {
respondReadIndex(apply.readIndexes, nil)
}
s.curApplied.Set(apply.index)
pool.returnApply(apply)
}
}
}
func (s *raft) run() {
defer func() {
s.doStop()
s.resetPending(ErrStopped)
s.raftFsm.readOnly.reset(ErrStopped)
s.stopSnapping()
s.raftConfig.Storage.Close()
close(s.done)
}()
s.prevHardSt.Term = s.raftFsm.term
s.prevHardSt.Vote = s.raftFsm.vote
s.prevHardSt.Commit = s.raftFsm.raftLog.committed
s.maybeChange(true)
loopCount := 0
var readyc chan struct{}
for {
if readyc == nil && s.containsUpdate() {
readyc = s.readyc
readyc <- struct{}{}
}
select {
case <-s.stopc:
return
case <-s.tickc:
s.raftFsm.tick()
s.maybeChange(true)
case pr := <-s.propc:
if s.raftFsm.leader != s.config.NodeID {
pr.future.respond(nil, ErrNotLeader)
pool.returnProposal(pr)
break
}
msg := proto.GetMessage()
msg.Type = proto.LocalMsgProp
msg.From = s.config.NodeID
starti := s.raftFsm.raftLog.lastIndex() + 1
s.pending[starti] = pr.future
msg.Entries = append(msg.Entries, &proto.Entry{Term: s.raftFsm.term, Index: starti, Type: pr.cmdType, Data: pr.data})
pool.returnProposal(pr)
flag := false
for i := 1; i < 64; i++ {
starti = starti + 1
select {
case pr := <-s.propc:
s.pending[starti] = pr.future
msg.Entries = append(msg.Entries, &proto.Entry{Term: s.raftFsm.term, Index: starti, Type: pr.cmdType, Data: pr.data})
pool.returnProposal(pr)
default:
flag = true
}
if flag {
break
}
}
s.raftFsm.Step(msg)
case m := <-s.recvc:
if _, ok := s.raftFsm.replicas[m.From]; ok || (!m.IsResponseMsg() && m.Type != proto.ReqMsgVote) ||
(m.Type == proto.ReqMsgVote && s.raftFsm.raftLog.isUpToDate(m.Index, m.LogTerm, 0, 0)) {
switch m.Type {
case proto.ReqMsgHeartBeat:
// If s.raftFsm.leader == NoLeader, heartbeat requests still need to be handled;
// otherwise a PreCandidate would never step back to Follower.
// That is why the condition s.raftFsm.leader == m.From was removed.
if m.From != s.config.NodeID {
s.raftFsm.Step(m)
}
case proto.RespMsgHeartBeat:
if s.raftFsm.leader == s.config.NodeID && m.From != s.config.NodeID {
s.raftFsm.Step(m)
}
default:
s.raftFsm.Step(m)
}
var respErr = true
if m.Type == proto.RespMsgAppend && !m.Reject {
respErr = false
}
s.maybeChange(respErr)
} else if logger.IsEnableWarn() && m.Type != proto.RespMsgHeartBeat {
logger.Warn(" [raft] [%v term: %d] raftFm[%p] raftReplicas[%v] ignored a %s message "+
"without the replica from [%v term: %d].",
s.raftFsm.id, s.raftFsm.term, s.raftFsm, s.raftFsm.getReplicas(), m.Type, m.From, m.Term)
}
case snapReq := <-s.snapRecvc:
s.handleSnapshot(snapReq)
case <-readyc:
s.persist()
s.apply()
s.advance()
// Send all messages.
for _, msg := range s.raftFsm.msgs {
if msg.Type == proto.ReqMsgSnapShot {
s.sendSnapshot(msg)
continue
}
s.sendMessage(msg)
}
s.raftFsm.msgs = nil
readyc = nil
loopCount = loopCount + 1
if loopCount >= 2 {
loopCount = 0
runtime.Gosched()
}
case <-s.electc:
msg := proto.GetMessage()
msg.Type = proto.LocalMsgHup
msg.From = s.config.NodeID
msg.ForceVote = true
logger.Debug("raft[%v] node %v try to leader", s.raftFsm.id, s.config.NodeID)
s.raftFsm.Step(msg)
s.maybeChange(true)
case c := <-s.statusc:
c <- s.getStatus()
case truncIndex := <-s.truncatec:
func() {
defer util.HandleCrash()
if lasti, err := s.raftConfig.Storage.LastIndex(); err != nil {
logger.Error("raft[%v] truncate failed to get last index from storage: %v", s.raftFsm.id, err)
} else if lasti > s.config.RetainLogs {
maxIndex := util.Min(truncIndex, lasti-s.config.RetainLogs)
if err = s.raftConfig.Storage.Truncate(maxIndex); err != nil {
logger.Error("raft[%v] truncate failed,error is: %v", s.raftFsm.id, err)
}
}
}()
case future := <-s.readIndexC:
futures := []*Future{future}
// handle in batch
var flag bool
for i := 1; i < 64; i++ {
select {
case f := <-s.readIndexC:
futures = append(futures, f)
default:
flag = true
}
if flag {
break
}
}
s.raftFsm.addReadIndex(futures)
case req := <-s.entryRequestC:
s.getEntriesInLoop(req)
}
}
}
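// drainProposals is a standalone sketch of the batching pattern run() uses
// twice above (for proposals and for read-index futures): after one blocking
// receive, opportunistically pull more items with a non-blocking select so a
// burst is handled as a single batch. It is illustrative only and not called
// by run() itself.
func drainProposals(first *proposal, propc <-chan *proposal, limit int) []*proposal {
    batch := []*proposal{first}
    for len(batch) < limit {
        select {
        case pr := <-propc:
            batch = append(batch, pr)
        default:
            return batch // channel drained; stop without blocking
        }
    }
    return batch
}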
func (s *raft) tick() {
if s.restoringSnapshot.Get() {
return
}
select {
case <-s.stopc:
case s.tickc <- struct{}{}:
default:
return
}
}
func (s *raft) propose(cmd []byte, future *Future) {
if !s.isLeader() {
future.respond(nil, ErrNotLeader)
return
}
pr := pool.getProposal()
pr.cmdType = proto.EntryNormal
pr.data = cmd
pr.future = future
select {
case <-s.stopc:
future.respond(nil, ErrStopped)
case s.propc <- pr:
}
}
func (s *raft) proposeMemberChange(cc *proto.ConfChange, future *Future) {
if !s.isLeader() {
future.respond(nil, ErrNotLeader)
return
}
pr := pool.getProposal()
pr.cmdType = proto.EntryConfChange
pr.future = future
pr.data = cc.Encode()
select {
case <-s.stopc:
future.respond(nil, ErrStopped)
case s.propc <- pr:
}
}
func (s *raft) reciveMessage(m *proto.Message) {
if s.restoringSnapshot.Get() {
return
}
select {
case <-s.stopc:
case s.recvc <- m:
default:
logger.Warn(fmt.Sprintf("raft[%v] discard message(%v)", s.raftConfig.ID, m.ToString()))
return
}
}
func (s *raft) reciveSnapshot(m *snapshotRequest) {
if s.restoringSnapshot.Get() {
m.respond(ErrSnapping)
return
}
select {
case <-s.stopc:
m.respond(ErrStopped)
return
case s.snapRecvc <- m:
}
}
func (s *raft) status() *Status {
if s.restoringSnapshot.Get() {
return &Status{
ID: s.raftFsm.id,
NodeID: s.config.NodeID,
RestoringSnapshot: true,
State: stateFollower.String(),
}
}
c := make(chan *Status, 1)
select {
case <-s.stopc:
return nil
case s.statusc <- c:
return <-c
}
}
func (s *raft) truncate(index uint64) {
logger.Debug("raft[%v] truncate index %v", s.raftFsm.id, index)
if s.restoringSnapshot.Get() {
return
}
select {
case <-s.stopc:
case s.truncatec <- index:
default:
return
}
}
func (s *raft) tryToLeader(future *Future) {
if s.restoringSnapshot.Get() {
future.respond(nil, nil)
return
}
select {
case <-s.stopc:
future.respond(nil, ErrStopped)
case s.electc <- struct{}{}:
future.respond(nil, nil)
}
}
func (s *raft) leaderTerm() (leader, term uint64) {
st := (*softState)(atomic.LoadPointer(&s.curSoftSt))
if st == nil {
return NoLeader, 0
}
return st.leader, st.term
}
func (s *raft) isLeader() bool {
leader, _ := s.leaderTerm()
return leader == s.config.NodeID
}
func (s *raft) applied() uint64 {
return s.curApplied.Get()
}
func (s *raft) committed() uint64 {
return s.raftFsm.raftLog.committed
}
func (s *raft) sendMessage(m *proto.Message) {
s.config.transport.Send(m)
}
func (s *raft) maybeChange(respErr bool) {
updated := false
if s.prevSoftSt.term != s.raftFsm.term {
updated = true
s.prevSoftSt.term = s.raftFsm.term
s.resetTick()
}
preLeader := s.prevSoftSt.leader
if preLeader != s.raftFsm.leader {
updated = true
s.prevSoftSt.leader = s.raftFsm.leader
if s.raftFsm.leader != s.config.NodeID {
if respErr || preLeader != s.config.NodeID {
s.resetPending(ErrNotLeader)
}
s.stopSnapping()
}
if logger.IsEnableWarn() {
if s.raftFsm.leader != NoLeader {
if preLeader == NoLeader {
logger.Warn("raft:[%v] elected leader %v at term %d.", s.raftFsm.id, s.raftFsm.leader, s.raftFsm.term)
} else {
logger.Warn("raft:[%v] changed leader from %v to %v at term %d.", s.raftFsm.id, preLeader, s.raftFsm.leader, s.raftFsm.term)
}
} else {
logger.Warn("raft:[%v] lost leader %v at term %d.", s.raftFsm.id, preLeader, s.raftFsm.term)
}
}
s.raftConfig.StateMachine.HandleLeaderChange(s.raftFsm.leader)
}
if updated {
atomic.StorePointer(&s.curSoftSt, unsafe.Pointer(&softState{leader: s.raftFsm.leader, term: s.raftFsm.term}))
}
}
func (s *raft) persist() {
unstableEntries := s.raftFsm.raftLog.unstableEntries()
if len(unstableEntries) > 0 {
if err := s.raftConfig.Storage.StoreEntries(unstableEntries); err != nil {
panic(AppPanicError(fmt.Sprintf("[raft->persist][%v] storage storeEntries err: [%v].", s.raftFsm.id, err)))
}
}
if s.raftFsm.raftLog.committed != s.prevHardSt.Commit || s.raftFsm.term != s.prevHardSt.Term || s.raftFsm.vote != s.prevHardSt.Vote {
hs := proto.HardState{Term: s.raftFsm.term, Vote: s.raftFsm.vote, Commit: s.raftFsm.raftLog.committed}
if err := s.raftConfig.Storage.StoreHardState(hs); err != nil {
panic(AppPanicError(fmt.Sprintf("[raft->persist][%v] storage storeHardState err: [%v].", s.raftFsm.id, err)))
}
s.prevHardSt = hs
}
}
func (s *raft) apply() {
committedEntries := s.raftFsm.raftLog.nextEnts(noLimit)
// check ready read index
if len(committedEntries) == 0 {
readIndexes := s.raftFsm.readOnly.getReady(s.curApplied.Get())
if len(readIndexes) == 0 {
return
}
apply := pool.getApply()
apply.readIndexes = readIndexes
select {
case <-s.stopc:
respondReadIndex(readIndexes, ErrStopped)
case s.applyc <- apply:
}
return
}
for _, entry := range committedEntries {
apply := pool.getApply()
apply.term = entry.Term
apply.index = entry.Index
if future, ok := s.pending[entry.Index]; ok {
apply.future = future
delete(s.pending, entry.Index)
}
apply.readIndexes = s.raftFsm.readOnly.getReady(entry.Index)
switch entry.Type {
case proto.EntryNormal:
if len(entry.Data) > 0 {
apply.command = entry.Data
}
case proto.EntryConfChange:
cc := new(proto.ConfChange)
cc.Decode(entry.Data)
apply.command = cc
// repl apply
peerChange := cc.Peer
worked := s.raftFsm.applyConfChange(cc)
if cc.Type == proto.ConfRemoveNode && worked {
if _, ok := s.raftFsm.replicas[peerChange.PeerID]; !ok {
if logger.IsEnableWarn() {
logger.Warn("raft[%v] applying configuration peer [%v] be removed and stop snapshot", s.raftFsm.id, peerChange)
}
s.removeSnapping(peerChange.PeerID)
}
}
s.peerState.change(cc)
if logger.IsEnableWarn() {
logger.Warn("raft[%v] applying configuration change %v.", s.raftFsm.id, cc)
}
}
select {
case <-s.stopc:
if apply.future != nil {
apply.future.respond(nil, ErrStopped)
}
if len(apply.readIndexes) > 0 {
respondReadIndex(apply.readIndexes, ErrStopped)
}
case s.applyc <- apply:
}
}
}
func (s *raft) advance() {
s.raftFsm.raftLog.appliedTo(s.raftFsm.raftLog.committed)
entries := s.raftFsm.raftLog.unstableEntries()
if len(entries) > 0 {
s.raftFsm.raftLog.stableTo(entries[len(entries)-1].Index, entries[len(entries)-1].Term)
}
}
func (s *raft) containsUpdate() bool {
return len(s.raftFsm.raftLog.unstableEntries()) > 0 || s.raftFsm.raftLog.committed > s.raftFsm.raftLog.applied || len(s.raftFsm.msgs) > 0 ||
s.raftFsm.raftLog.committed != s.prevHardSt.Commit || s.raftFsm.term != s.prevHardSt.Term || s.raftFsm.vote != s.prevHardSt.Vote ||
s.raftFsm.readOnly.containsUpdate(s.curApplied.Get())
}
func (s *raft) resetPending(err error) {
if len(s.pending) > 0 {
for k, v := range s.pending {
v.respond(nil, err)
delete(s.pending, k)
}
}
}
func (s *raft) resetTick() {
for {
select {
case <-s.tickc:
default:
return
}
}
}
func (s *raft) resetApply() {
for {
select {
case apply := <-s.applyc:
if apply.future != nil {
apply.future.respond(nil, ErrStopped)
}
if len(apply.readIndexes) > 0 {
respondReadIndex(apply.readIndexes, ErrStopped)
}
pool.returnApply(apply)
default:
return
}
}
}
func (s *raft) getStatus() *Status {
stopped := false
select {
case <-s.stopc:
stopped = true
default:
}
st := &Status{
ID: s.raftFsm.id,
NodeID: s.config.NodeID,
Leader: s.raftFsm.leader,
Term: s.raftFsm.term,
Index: s.raftFsm.raftLog.lastIndex(),
Commit: s.raftFsm.raftLog.committed,
Applied: s.curApplied.Get(),
Vote: s.raftFsm.vote,
State: s.raftFsm.state.String(),
RestoringSnapshot: s.restoringSnapshot.Get(),
PendQueue: len(s.pending),
RecvQueue: len(s.recvc),
AppQueue: len(s.applyc),
Stopped: stopped,
}
if s.raftFsm.state == stateLeader {
st.Replicas = make(map[uint64]*ReplicaStatus)
for id, p := range s.raftFsm.replicas {
st.Replicas[id] = &ReplicaStatus{
Match: p.match,
Commit: p.committed,
Next: p.next,
State: p.state.String(),
Snapshoting: p.state == replicaStateSnapshot,
Paused: p.paused,
Active: p.active,
LastActive: p.lastActive,
Inflight: p.count,
}
}
}
return st
}
func (s *raft) handlePanic(err interface{}) {
fatalStopc <- s.raftFsm.id
fatal := &FatalError{
ID: s.raftFsm.id,
Err: fmt.Errorf("raft[%v] occur panic error: [%v]", s.raftFsm.id, err),
}
s.raftConfig.StateMachine.HandleFatalEvent(fatal)
}
func (s *raft) getPeers() (peers []uint64) {
return s.peerState.get()
}
func (s *raft) readIndex(future *Future) {
if !s.isLeader() {
future.respond(nil, ErrNotLeader)
return
}
select {
case <-s.stopc:
future.respond(nil, ErrStopped)
case s.readIndexC <- future:
}
}
func (s *raft) getEntries(future *Future, startIndex uint64, maxSize uint64) {
req := &entryRequest{
future: future,
index: startIndex,
maxSize: maxSize,
}
select {
case <-s.stopc:
future.respond(nil, ErrStopped)
case s.entryRequestC <- req:
}
}
func (s *raft) getEntriesInLoop(req *entryRequest) {
select {
case <-s.stopc:
req.future.respond(nil, ErrStopped)
return
default:
}
if !s.isLeader() {
req.future.respond(nil, ErrNotLeader)
return
}
if req.index > s.raftFsm.raftLog.lastIndex() {
req.future.respond(nil, nil)
return
}
if req.index < s.raftFsm.raftLog.firstIndex() {
req.future.respond(nil, ErrCompacted)
return
}
entries, err := s.raftFsm.raftLog.entries(req.index, req.maxSize)
req.future.respond(entries, err)
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
"math/rand"
"strings"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"time"
)
// CampaignType represents the type of campaigning
// the reason we use the type of string instead of uint64
// is because it's simpler to compare and fill in raft entries
type CampaignType string
// NoLeader is a placeholder nodeID used when there is no leader.
const NoLeader uint64 = 0
// Possible values for CampaignType
const (
// campaignPreElection represents the first phase of a normal election when
// Config.PreVote is true.
campaignPreElection CampaignType = "CampaignPreElection"
// campaignElection represents a normal (time-based) election (the second phase
// of the election when Config.PreVote is true).
campaignElection CampaignType = "CampaignElection"
)
type stepFunc func(r *raftFsm, m *proto.Message)
type raftFsm struct {
id uint64
term uint64
vote uint64
leader uint64
electionElapsed int
heartbeatElapsed int
// randElectionTick is a random number in [electiontick, 2 * electiontick - 1].
// It gets reset when raft changes its state to follower or candidate.
randElectionTick int
// A new configuration is ignored while an unapplied configuration exists.
pendingConf bool
state fsmState
sm StateMachine
config *Config
raftLog *raftLog
rand *rand.Rand
votes map[uint64]bool
acks map[uint64]bool
replicas map[uint64]*replica
readOnly *readOnly
msgs []*proto.Message
step stepFunc
tick func()
stopCh chan struct{}
mo Monitor
// electionFirstBegin marks the start time of a run of consecutive elections.
// It is valid if and only if mo != nil.
electionFirstBegin time.Time
}
func (fsm *raftFsm) getReplicas() (m string) {
for id := range fsm.replicas {
m += fmt.Sprintf(" [%v] ,", id)
}
return m
}
func newRaftFsm(config *Config, raftConfig *RaftConfig) (*raftFsm, error) {
raftlog, err := newRaftLog(raftConfig.Storage)
if err != nil {
return nil, err
}
hs, err := raftConfig.Storage.InitialState()
if err != nil {
return nil, err
}
r := &raftFsm{
id: raftConfig.ID,
sm: raftConfig.StateMachine,
mo: raftConfig.Monitor,
config: config,
leader: NoLeader,
raftLog: raftlog,
replicas: make(map[uint64]*replica),
readOnly: newReadOnly(raftConfig.ID, config.ReadOnlyOption),
}
r.rand = rand.New(rand.NewSource(int64(config.NodeID + r.id)))
for _, p := range raftConfig.Peers {
r.replicas[p.ID] = newReplica(p, 0)
}
if !hs.IsEmpty() {
if raftConfig.Applied > r.raftLog.lastIndex() {
logger.Info("newRaft[%v] update [applied: %d, to lastindex: %d]", r.id, raftConfig.Applied, raftlog.lastIndex())
raftConfig.Applied = r.raftLog.lastIndex()
}
if hs.Commit > r.raftLog.lastIndex() {
logger.Info("newRaft[%v] update [hardState commit: %d, to lastindex: %d]", r.id, hs.Commit, raftlog.lastIndex())
hs.Commit = r.raftLog.lastIndex()
}
if err := r.loadState(hs); err != nil {
return nil, err
}
}
logger.Info("newRaft[%v] [commit: %d, applied: %d, lastindex: %d]", r.id, raftlog.committed, raftConfig.Applied, raftlog.lastIndex())
if raftConfig.Applied > 0 {
lasti := raftlog.lastIndex()
if lasti == 0 {
// If there is application data but no raft log, then restore to initial state.
raftlog.committed = 0
raftConfig.Applied = 0
} else if lasti < raftConfig.Applied {
// If lastIndex<appliedIndex, then the log as the standard.
raftlog.committed = lasti
raftConfig.Applied = lasti
} else if raftlog.committed < raftConfig.Applied {
raftlog.committed = raftConfig.Applied
}
raftlog.appliedTo(raftConfig.Applied)
}
// recover committed
if err := r.recoverCommit(); err != nil {
return nil, err
}
if raftConfig.Leader == config.NodeID {
if raftConfig.Term != 0 && r.term <= raftConfig.Term {
r.term = raftConfig.Term
r.state = stateLeader
r.becomeLeader()
r.bcastAppend()
} else {
r.becomeFollower(r.term, NoLeader)
}
} else {
if raftConfig.Leader == NoLeader {
r.becomeFollower(r.term, NoLeader)
} else {
r.becomeFollower(raftConfig.Term, raftConfig.Leader)
}
}
if logger.IsEnableDebug() {
peerStrs := make([]string, 0)
for _, p := range r.peers() {
peerStrs = append(peerStrs, fmt.Sprintf("%v", p.String()))
}
logger.Debug("newRaft[%v] [peers: [%s], term: %d, commit: %d, applied: %d, lastindex: %d, lastterm: %d]",
r.id, strings.Join(peerStrs, ","), r.term, r.raftLog.committed, r.raftLog.applied, r.raftLog.lastIndex(), r.raftLog.lastTerm())
}
r.stopCh = make(chan struct{}, 1)
go r.doRandomSeed()
return r, nil
}
func (r *raftFsm) doRandomSeed() {
// The reseed interval must be positive: time.Tick with a non-positive duration returns a nil channel and the reseed would never fire.
ticker := time.Tick(time.Duration(rand.Intn(5)+1) * time.Second)
for {
select {
case <-ticker:
r.rand.Seed(time.Now().UnixNano())
case <-r.stopCh:
return
}
}
}
func (r *raftFsm) StopFsm() {
peers := make([]proto.Peer, 0, len(r.replicas))
for _, repl := range r.replicas {
peers = append(peers, repl.peer)
}
if r.mo != nil {
r.mo.RemovePartition(r.id, peers)
}
close(r.stopCh)
}
// raft main method
func (r *raftFsm) Step(m *proto.Message) {
if m.Type == proto.LocalMsgHup {
if r.state != stateLeader && r.promotable() {
ents, err := r.raftLog.slice(r.raftLog.applied+1, r.raftLog.committed+1, noLimit)
if err != nil {
errMsg := fmt.Sprintf("[raft->Step][%v]unexpected error getting unapplied entries:[%v]", r.id, err)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
if n := numOfPendingConf(ents); n != 0 && r.raftLog.committed > r.raftLog.applied {
if logger.IsEnableWarn() {
logger.Warn("[raft->Step][%v] cannot campaign at term %d since there are still %d pending configuration changes to apply.", r.id, r.term, n)
}
return
}
if logger.IsEnableInfo() {
logger.Info("[raft->Step][%v] is starting a new election at term[%d].", r.id, r.term)
}
// Only a leadership transfer sets ForceVote=true.
// Leadership transfers never use pre-vote even if r.preVote is true; we
// know we are not recovering from a partition so there is no need for the
// extra round trip.
if r.config.PreVote && !m.ForceVote {
r.campaign(m.ForceVote, campaignPreElection)
} else {
r.campaign(m.ForceVote, campaignElection)
}
} else if logger.IsEnableDebug() && r.state == stateLeader {
logger.Debug("[raft->Step][%v] ignoring LocalMsgHup because already leader.", r.id)
} else if logger.IsEnableDebug() {
var replicas []uint64
for id := range r.replicas {
replicas = append(replicas, id)
}
logger.Debug("[raft->Step][%v] state %v, replicas %v.", r.id, r.state, replicas)
}
return
}
switch {
case m.Term == 0:
// local message
case m.Term > r.term:
if logger.IsEnableDebug() {
logger.Debug("[raft->Step][%v term: %d] received a [%s] message with higher term from [%v term: %d],ForceVote[%v].",
r.id, r.term, m.Type, m.From, m.Term, m.ForceVote)
}
if m.Type == proto.ReqMsgVote || m.Type == proto.ReqMsgPreVote {
inLease := r.config.LeaseCheck && r.leader != NoLeader
if r.leader != m.From && inLease && !m.ForceVote && r.electionElapsed < r.randElectionTick {
if logger.IsEnableWarn() {
logger.Warn("[raft->Step][%v logterm: %d, index: %d, vote: %v] ignored %v from %v [logterm: %d, index: %d] at term %d: lease is not expired.",
r.id, r.raftLog.lastTerm(), r.raftLog.lastIndex(), r.vote, m.Type, m.From, m.LogTerm, m.Index, r.term)
}
return
}
}
switch {
case m.Type == proto.ReqMsgPreVote:
// Never change our term in response to a PreVote
case m.Type == proto.RespMsgPreVote && !m.Reject:
// We send pre-vote requests with a term in our future. If the
// pre-vote is granted, we will increment our term when we get a
// quorum. If it is not, the term comes from the node that
// rejected our vote so we should become a follower at the new
// term.
default:
if logger.IsEnableDebug() {
logger.Debug("[raft->Step][%x,%d] [term: %d] received a %s message with higher term from %x [term: %d]",
r.id, r.config.ReplicateAddr, r.term, m.Type, m.From, m.Term)
}
if m.Type == proto.ReqMsgAppend || m.Type == proto.ReqMsgHeartBeat || m.Type == proto.ReqMsgSnapShot {
r.becomeFollower(m.Term, m.From)
} else {
r.becomeFollower(m.Term, NoLeader)
}
}
case m.Term < r.term:
if (r.config.LeaseCheck || r.config.PreVote) && (m.Type == proto.ReqMsgHeartBeat || m.Type == proto.ReqMsgAppend) {
// We have received messages from a leader at a lower term. It is possible
// that these messages were simply delayed in the network, but this could
// also mean that this node has advanced its term number during a network
// partition, and it is now unable to either win an election or to rejoin
// the majority on the old term. If checkQuorum is false, this will be
// handled by incrementing term numbers in response to MsgVote with a
// higher term, but if checkQuorum is true we may not advance the term on
// MsgVote and must generate other messages to advance the term. The net
// result of these two features is to minimize the disruption caused by
// nodes that have been removed from the cluster's configuration: a
// removed node will send MsgVotes (or MsgPreVotes) which will be ignored,
// but it will not receive MsgApp or MsgHeartbeat, so it will not create
// disruptive term increases, by notifying the leader of this node's activeness.
// The above also holds for Pre-Vote.
//
// When follower gets isolated, it soon starts an election ending
// up with a higher term than leader, although it won't receive enough
// votes to win the election. When it regains connectivity, this response
// with "proto.MsgAppResp" of higher term would force leader to step down.
// However, this disruption is inevitable to free this stuck node with
// fresh election. This can be prevented with Pre-Vote phase.
r.send(&proto.Message{To: m.From, Term: r.term, Type: proto.RespMsgAppend})
} else if m.Type == proto.ReqMsgPreVote {
// Before Pre-Vote was enabled, a candidate could exist with a higher term but
// a shorter log. After upgrading to Pre-Vote, the cluster may deadlock if
// we drop messages with a lower term.
if logger.IsEnableInfo() {
logger.Info("%x [logterm: %d, index: %d, vote: %x] rejected %s from %x [logterm: %d, index: %d] at term %d",
r.id, r.raftLog.lastTerm(), r.raftLog.lastIndex(), r.vote, m.Type, m.From, m.LogTerm, m.Index, r.term)
}
r.send(&proto.Message{To: m.From, Term: r.term, Type: proto.RespMsgPreVote, Reject: true})
} else {
// ignore other cases
if logger.IsEnableInfo() {
logger.Info("%x [term: %d] ignored a %s message with lower term from %x [term: %d]",
r.id, r.term, m.Type, m.From, m.Term)
}
}
return
}
if m.Type == proto.ReqMsgPreVote || m.Type == proto.ReqMsgVote {
// We can vote if this is a repeat of a vote we've already cast...
canVote := r.vote == m.From ||
// ...we haven't voted and we don't think there's a leader yet in this term...
(r.vote == NoLeader && r.leader == NoLeader) ||
// ...or this is a PreVote for a future term...
(m.Type == proto.ReqMsgPreVote && m.Term > r.term)
// ...and we believe the candidate is up to date.
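// For example, a node that already granted its vote to this candidate in the
// current term may grant a repeated ReqMsgVote, while a request from any other
// candidate for the same term is rejected.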
var respType proto.MsgType
if m.Type == proto.ReqMsgPreVote {
respType = proto.RespMsgPreVote
} else {
respType = proto.RespMsgVote
}
if canVote && r.raftLog.isUpToDate(m.Index, m.LogTerm, 0, 0) {
// Note: it turns out that learners must be allowed to cast votes.
// This seems counter-intuitive but is necessary in the situation in which
// a learner has been promoted (i.e. is now a voter) but has not learned
// about this yet.
// For example, consider a group in which id=1 is a learner and id=2 and
// id=3 are voters. A configuration change promoting 1 can be committed on
// the quorum `{2,3}` without the config change being appended to the
// learner's log. If the leader (say 2) fails, there are de facto two
// voters remaining. Only 3 can win an election (due to its log containing
// all committed entries), but to do so it will need 1 to vote. But 1
// considers itself a learner and will continue to do so until 3 has
// stepped up as leader, replicates the conf change to 1, and 1 applies it.
// Ultimately, by receiving a request to vote, the learner realizes that
// the candidate believes it to be a voter, and that it should act
// accordingly. The candidate's config may be stale, too; but in that case
// it won't win the election, at least in the absence of the bug discussed
// in:
// https://github.com/etcd-io/etcd/issues/7625#issuecomment-488798263.
if logger.IsEnableDebug() {
logger.Info("%x [logterm: %d, index: %d, vote: %x] cast %s for %x [logterm: %d, index: %d] at term %d",
r.id, r.raftLog.lastTerm(), r.raftLog.lastIndex(), r.vote, m.Type, m.From, m.LogTerm, m.Index, r.term)
}
// When responding to Msg{Pre,}Vote messages we include the term
// from the message, not the local term. To see why, consider the
// case where a single node was previously partitioned away and
// its local term is now out of date. If we include the local term
// (recall that for pre-votes we don't update the local term), the
// (pre-)campaigning node on the other end will proceed to ignore
// the message (it ignores all out of date messages).
// The term in the original message and current local term are the
// same in the case of regular votes, but different for pre-votes.
r.send(&proto.Message{To: m.From, Term: m.Term, Type: respType})
if m.Type == proto.ReqMsgVote {
// Only record real votes.
r.electionElapsed = 0
r.vote = m.From
}
} else {
if logger.IsEnableDebug() {
logger.Info("%x [logterm: %d, index: %d, vote: %x] rejected %s from %x [logterm: %d, index: %d] at term %d",
r.id, r.raftLog.lastTerm(), r.raftLog.lastIndex(), r.vote, m.Type, m.From, m.LogTerm, m.Index, r.term)
}
r.send(&proto.Message{To: m.From, Term: r.term, Type: respType, Reject: true})
}
return
}
r.step(r, m)
}
func (r *raftFsm) loadState(state proto.HardState) error {
if state.Commit < r.raftLog.committed || state.Commit > r.raftLog.lastIndex() {
return fmt.Errorf("[raft->loadState][%v] state.commit %d is out of range [%d, %d]", r.id, state.Commit, r.raftLog.committed, r.raftLog.lastIndex())
}
r.term = state.Term
r.vote = state.Vote
r.raftLog.committed = state.Commit
return nil
}
func (r *raftFsm) recoverCommit() error {
for r.raftLog.applied <= r.raftLog.committed {
committedEntries := r.raftLog.nextEnts(64 * MB)
for _, entry := range committedEntries {
r.raftLog.appliedTo(entry.Index)
switch entry.Type {
case proto.EntryNormal:
if len(entry.Data) == 0 {
continue
}
if _, err := r.sm.Apply(entry.Data, entry.Index); err != nil {
return err
}
case proto.EntryConfChange:
cc := new(proto.ConfChange)
cc.Decode(entry.Data)
if _, err := r.sm.ApplyMemberChange(cc, entry.Index); err != nil {
return err
}
r.applyConfChange(cc)
}
}
if r.raftLog.applied == r.raftLog.committed {
break
}
}
return nil
}
func (r *raftFsm) applyConfChange(cc *proto.ConfChange) (ok bool) {
if cc.Peer.ID == NoLeader {
r.pendingConf = false
return
}
switch cc.Type {
case proto.ConfAddNode:
r.addPeer(cc.Peer)
case proto.ConfRemoveNode:
return r.removePeer(cc.Peer)
case proto.ConfUpdateNode:
r.updatePeer(cc.Peer)
}
return
}
func (r *raftFsm) addPeer(peer proto.Peer) {
r.pendingConf = false
if _, ok := r.replicas[peer.ID]; !ok {
if r.state == stateLeader {
r.replicas[peer.ID] = newReplica(peer, r.config.MaxInflightMsgs)
r.replicas[peer.ID].next = r.raftLog.lastIndex() + 1
} else {
r.replicas[peer.ID] = newReplica(peer, 0)
}
}
}
func (r *raftFsm) removePeer(peer proto.Peer) (ok bool) {
r.pendingConf = false
replica, ok := r.replicas[peer.ID]
if !ok {
return
} else if replica.peer.PeerID != peer.PeerID {
if logger.IsEnableInfo() {
logger.Info("raft[%v] ignore remove peer[%v], current[%v]", r.id, peer.String(), replica.peer.String())
}
return
}
delete(r.replicas, peer.ID)
ok = true
if peer.ID == r.config.NodeID {
r.becomeFollower(r.term, NoLeader)
} else if r.state == stateLeader && len(r.replicas) > 0 {
if r.maybeCommit() {
r.bcastAppend()
}
}
return
}
func (r *raftFsm) updatePeer(peer proto.Peer) {
r.pendingConf = false
if _, ok := r.replicas[peer.ID]; ok {
r.replicas[peer.ID].peer = peer
}
}
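// quorum returns the number of votes (or acks) required for a majority of the
// current configuration: len(replicas)/2 + 1. For example: 1 of 1, 2 of 3,
// 3 of 4, 3 of 5.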
func (r *raftFsm) quorum() int {
return len(r.replicas)/2 + 1
}
func (r *raftFsm) send(m *proto.Message) {
m.ID = r.id
m.From = r.config.NodeID
// Proposals keep the caller-set term, and pre-vote requests carry term+1 set in campaign(); everything else is stamped with the current term.
if m.Type != proto.LocalMsgProp && m.Type != proto.ReqMsgPreVote {
m.Term = r.term
}
r.msgs = append(r.msgs, m)
}
func (r *raftFsm) reset(term, lasti uint64, isLeader bool) {
if r.term != term {
r.term = term
r.vote = NoLeader
}
r.leader = NoLeader
r.electionElapsed = 0
r.heartbeatElapsed = 0
r.votes = make(map[uint64]bool)
r.pendingConf = false
r.readOnly.reset(ErrNotLeader)
if isLeader {
r.randElectionTick = r.config.ElectionTick - 1
for id, p := range r.replicas {
r.replicas[id] = newReplica(p.peer, r.config.MaxInflightMsgs)
r.replicas[id].next = lasti + 1
if id == r.config.NodeID {
r.replicas[id].match = lasti
r.replicas[id].committed = r.raftLog.committed
}
}
} else {
r.resetRandomizedElectionTimeout()
for id, p := range r.replicas {
r.replicas[id] = newReplica(p.peer, 0)
}
}
}
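// resetRandomizedElectionTimeout picks randElectionTick uniformly from
// [ElectionTick, 2*ElectionTick-1]. For example, with ElectionTick=5 the
// timeout is 5..9 ticks, so peers rarely time out at the same tick.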
func (r *raftFsm) resetRandomizedElectionTimeout() {
randTick := r.rand.Intn(r.config.ElectionTick)
r.randElectionTick = r.config.ElectionTick + randTick
logger.Debug("raft[%v,%v] random election timeout randElectionTick=%v, config.ElectionTick=%v, randTick=%v",
r.id, r.config.ReplicateAddr, r.randElectionTick, r.config.ElectionTick, randTick)
}
func (r *raftFsm) pastElectionTimeout() bool {
return r.electionElapsed >= r.randElectionTick
}
func (r *raftFsm) peers() []proto.Peer {
peers := make([]proto.Peer, 0, len(r.replicas))
for _, p := range r.replicas {
peers = append(peers, p.peer)
}
return peers
}
func (r *raftFsm) checkSnapshot(meta proto.SnapshotMeta) bool {
if meta.Index <= r.raftLog.committed {
return false
}
if r.raftLog.matchTerm(meta.Index, meta.Term) {
r.raftLog.commitTo(meta.Index)
return false
}
return true
}
func (r *raftFsm) restore(meta proto.SnapshotMeta) {
if logger.IsEnableWarn() {
logger.Warn("raft [%v, commit: %d, lastindex: %d, lastterm: %d] starts to restore snapshot [index: %d,term:%d]",
r.id, r.raftLog.committed, r.raftLog.lastIndex(), r.raftLog.lastTerm(), meta.Index, meta.Term)
}
r.raftLog.restore(meta.Index)
r.replicas = make(map[uint64]*replica)
for _, p := range meta.Peers {
r.replicas[p.ID] = newReplica(p, 0)
}
}
func (r *raftFsm) addReadIndex(futures []*Future) {
// not leader
if r.leader != r.config.NodeID {
respondReadIndex(futures, ErrNotLeader)
return
}
// check leader commit in current term
if !r.readOnly.committed {
if r.raftLog.zeroTermOnErrCompacted(r.raftLog.term(r.raftLog.committed)) == r.term {
r.readOnly.commit(r.raftLog.committed)
}
}
r.readOnly.add(r.raftLog.committed, futures)
r.bcastReadOnly()
}
func numOfPendingConf(ents []*proto.Entry) int {
n := 0
for i := range ents {
if ents[i].Type == proto.EntryConfChange {
n++
}
}
return n
}
func (r *raftFsm) monitorElection() {
if r.mo == nil {
return
}
now := time.Now()
if r.electionFirstBegin.IsZero() || r.state != stateCandidate {
// Record the time when the leader was most recently lost.
r.electionFirstBegin = now
return
}
// Report to r.mo.MonitorElection while the node keeps campaigning without electing a leader.
r.mo.MonitorElection(r.id, r.getReplicas(), now.Sub(r.electionFirstBegin))
}
func (r *raftFsm) monitorZombie(peer *replica) {
if r.mo == nil {
return
}
now := time.Now()
if peer.lastZombie.Before(peer.lastActive) {
peer.lastZombie = now
}
if du := now.Sub(peer.lastZombie); du > 2*r.config.TickInterval {
r.mo.MonitorZombie(r.id, peer.peer, r.getReplicas(), du)
}
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
)
// becomeCandidate transitions to candidate state; when PreVote is enabled the node first becomes a preCandidate (see becomePreCandidate).
func (r *raftFsm) becomeCandidate() {
if r.state == stateLeader {
panic(AppPanicError(fmt.Sprintf("[raft->becomeCandidate][%v] invalid transition [leader -> candidate].", r.id)))
}
r.monitorElection()
r.step = stepCandidate
r.reset(r.term+1, 0, false)
r.tick = r.tickElection
r.vote = r.config.NodeID
r.state = stateCandidate
if logger.IsEnableDebug() {
logger.Debug("raft[%v] became candidate at term %d.", r.id, r.config.TransportConfig.ReplicateAddr, r.term)
}
}
func stepCandidate(r *raftFsm, m *proto.Message) {
switch m.Type {
case proto.LocalMsgProp:
if logger.IsEnableDebug() {
logger.Debug("raft[%v] no leader at term %d; dropping proposal.", r.id, r.term)
}
proto.ReturnMessage(m)
return
case proto.ReqMsgAppend:
r.becomeFollower(r.term, m.From)
r.handleAppendEntries(m)
proto.ReturnMessage(m)
return
case proto.ReqMsgHeartBeat:
r.becomeFollower(r.term, m.From)
return
case proto.ReqMsgPreVote:
r.becomeFollower(r.term, m.From)
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgPreVote
nmsg.To = m.From
r.send(nmsg)
proto.ReturnMessage(m)
return
case proto.ReqMsgVote:
if logger.IsEnableDebug() {
logger.Debug("raft[%v] [logterm: %d, index: %d, vote: %v] rejected vote from %v [logterm: %d, index: %d] at term %d.", r.id, r.raftLog.lastTerm(), r.raftLog.lastIndex(), r.vote, m.From, m.LogTerm, m.Index, r.term)
}
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgVote
nmsg.To = m.From
nmsg.Reject = true
r.send(nmsg)
proto.ReturnMessage(m)
return
case proto.RespMsgVote:
gr := r.poll(m.From, !m.Reject)
if logger.IsEnableDebug() {
logger.Debug("raft[%v] [q:%d] has received %d votes and %d vote rejections.", r.id, r.quorum(), gr, len(r.votes)-gr)
}
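// Win the election as soon as granted votes reach quorum; concede and become
// follower as soon as rejections reach quorum.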
switch r.quorum() {
case gr:
r.becomeLeader()
r.bcastAppend()
case len(r.votes) - gr:
r.becomeFollower(r.term, NoLeader)
}
}
}
func (r *raftFsm) campaign(force bool, t CampaignType) {
var msgType proto.MsgType
var term uint64
if t == campaignPreElection {
r.becomePreCandidate()
msgType = proto.ReqMsgPreVote
term = r.term + 1
} else {
r.becomeCandidate()
msgType = proto.ReqMsgVote
}
if r.quorum() == r.poll(r.config.NodeID, true) {
if t == campaignPreElection {
r.campaign(force, campaignElection)
} else {
r.becomeLeader()
}
return
}
for id := range r.replicas {
if id == r.config.NodeID {
continue
}
li, lt := r.raftLog.lastIndexAndTerm()
if logger.IsEnableDebug() {
logger.Debug("[raft->campaign][%v,%v logterm: %d, index: %d] sent "+
"%v request to %v at term %d. raftFSM[%p]", msgType, r.id, r.config.ReplicateAddr, lt, li, id, r.term, r)
}
m := proto.GetMessage()
m.To = id
m.Type = msgType
m.ForceVote = force
m.Index = li
m.LogTerm = lt
m.Term = term
r.send(m)
}
}
func (r *raftFsm) poll(id uint64, v bool) (granted int) {
if logger.IsEnableDebug() {
if v {
logger.Debug("raft[%v,%v] received vote from %v at term %d.", r.id, r.config.ReplicateAddr, id, r.term)
} else {
logger.Debug("raft[%v,%v] received vote rejection from %v at term %d.", r.id, r.config.ReplicateAddr, id, r.term)
}
}
if _, ok := r.votes[id]; !ok {
r.votes[id] = v
}
for _, vv := range r.votes {
if vv {
granted++
}
}
return granted
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
"math"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
func (r *raftFsm) becomeFollower(term, lead uint64) {
r.step = stepFollower
r.reset(term, 0, false)
r.tick = r.tickElection
r.leader = lead
r.state = stateFollower
if logger.IsEnableDebug() {
logger.Debug("[raft][%v,%v] became follower at term[%d] leader[%d].", r.id, r.config.ReplicateAddr, r.term, r.leader)
}
}
func stepFollower(r *raftFsm, m *proto.Message) {
switch m.Type {
case proto.LocalMsgProp:
if r.leader == NoLeader {
if logger.IsEnableWarn() {
logger.Warn("raft[%v] no leader at term %d; dropping proposal.", r.id, r.term)
}
return
}
m.To = r.leader
r.send(m)
return
case proto.ReqMsgAppend:
r.electionElapsed = 0
r.leader = m.From
r.handleAppendEntries(m)
proto.ReturnMessage(m)
return
case proto.ReqMsgHeartBeat:
r.electionElapsed = 0
r.leader = m.From
return
case proto.ReqMsgPreVote:
r.electionElapsed = 0
r.leader = m.From
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgPreVote
nmsg.To = m.From
r.send(nmsg)
proto.ReturnMessage(m)
return
case proto.ReqCheckQuorum:
// TODO: remove this
if logger.IsEnableDebug() {
logger.Debug("raft[%d] recv check quorum from %d, index=%d", r.id, m.From, m.Index)
}
r.electionElapsed = 0
r.leader = m.From
nmsg := proto.GetMessage()
nmsg.Type = proto.RespCheckQuorum
nmsg.Index = m.Index
nmsg.To = m.From
r.send(nmsg)
proto.ReturnMessage(m)
return
case proto.ReqMsgVote:
fpri, lpri := uint16(math.MaxUint16), uint16(0)
if pr, ok := r.replicas[m.From]; ok {
fpri = pr.peer.Priority
}
if pr, ok := r.replicas[r.config.NodeID]; ok {
lpri = pr.peer.Priority
}
if (!r.config.LeaseCheck || r.leader == NoLeader) && (r.vote == NoLeader || r.vote == m.From) && r.raftLog.isUpToDate(m.Index, m.LogTerm, fpri, lpri) {
r.electionElapsed = 0
if logger.IsEnableDebug() {
logger.Debug("raft[%v] [logterm: %d, index: %d, vote: %v] voted for %v [logterm: %d, index: %d] at term %d.", r.id, r.raftLog.lastTerm(), r.raftLog.lastIndex(), r.vote, m.From, m.LogTerm, m.Index, r.term)
}
r.vote = m.From
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgVote
nmsg.To = m.From
r.send(nmsg)
} else {
if logger.IsEnableDebug() {
logger.Debug("raf[%v] [logterm: %d, index: %d, vote: %v] rejected vote from %v [logterm: %d, index: %d] at term %d.", r.id, r.raftLog.lastTerm(), r.raftLog.lastIndex(), r.vote, m.From, m.LogTerm, m.Index, r.term)
}
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgVote
nmsg.To = m.From
nmsg.Reject = true
r.send(nmsg)
}
proto.ReturnMessage(m)
return
case proto.LeaseMsgTimeout:
if r.leader == m.From {
r.electionElapsed = 0
nmsg := proto.GetMessage()
nmsg.Type = proto.LocalMsgHup
nmsg.From = r.config.NodeID
r.Step(nmsg)
}
proto.ReturnMessage(m)
return
}
}
func (r *raftFsm) tickElection() {
if !r.promotable() {
r.electionElapsed = 0
return
}
r.electionElapsed++
timeout := false
// check follower lease (2 * electiontimeout)
if r.config.LeaseCheck && r.leader != NoLeader && r.state == stateFollower {
timeout = (r.electionElapsed >= (r.config.ElectionTick << 1))
} else {
timeout = r.pastElectionTimeout()
}
if timeout {
r.electionElapsed = 0
m := proto.GetMessage()
m.Type = proto.LocalMsgHup
m.From = r.config.NodeID
r.Step(m)
}
}
func (r *raftFsm) handleAppendEntries(m *proto.Message) {
if m.Index < r.raftLog.committed {
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgAppend
nmsg.To = m.From
nmsg.Index = r.raftLog.committed
nmsg.Commit = r.raftLog.committed
r.send(nmsg)
return
}
if mlastIndex, ok := r.raftLog.maybeAppend(m.Index, m.LogTerm, m.Commit, m.Entries...); ok {
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgAppend
nmsg.To = m.From
nmsg.Index = mlastIndex
nmsg.Commit = r.raftLog.committed
r.send(nmsg)
} else {
if logger.IsEnableDebug() {
logger.Debug("raft[%v logterm: %d, index: %d] rejected msgApp [logterm: %d, index: %d] from %v",
r.id, r.raftLog.zeroTermOnErrCompacted(r.raftLog.term(m.Index)), m.Index, m.LogTerm, m.Index, m.From)
}
// Return a hint to the leader about the maximum index and term that the
// two logs could be divergent at. Do this by searching through the
// follower's log for the maximum (index, term) pair with a term <= the
// MsgApp's LogTerm and an index <= the MsgApp's Index. This can help
// skip all indexes in the follower's uncommitted tail with terms
// greater than the MsgApp's LogTerm.
//
// See the other caller for findConflictByTerm (in stepLeader) for a much
// more detailed explanation of this mechanism.
hintIndex := util.Min(m.Index, r.raftLog.lastIndex())
hintIndex = r.raftLog.findConflictByTerm(hintIndex, m.LogTerm)
hintTerm, err := r.raftLog.term(hintIndex)
if err != nil {
panic(fmt.Sprintf("term(%d) must be valid, but got %v", hintIndex, err))
}
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgAppend
nmsg.To = m.From
nmsg.Index = m.Index
nmsg.Commit = r.raftLog.committed
nmsg.Reject = true
nmsg.LogTerm = hintTerm
nmsg.RejectHint = hintIndex
r.send(nmsg)
}
}
func (r *raftFsm) promotable() bool {
// todo check snapshot
pr, ok := r.replicas[r.config.NodeID]
return ok && pr.state != replicaStateSnapshot
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
"sort"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
func (r *raftFsm) becomeLeader() {
if r.state == stateFollower {
panic(AppPanicError(fmt.Sprintf("[raft->becomeLeader][%v] invalid transition [follower -> leader].", r.id)))
}
r.recoverCommit()
lasti := r.raftLog.lastIndex()
r.step = stepLeader
r.reset(r.term, lasti, true)
r.tick = r.tickHeartbeat
r.leader = r.config.NodeID
r.state = stateLeader
r.acks = nil
if pr, ok := r.replicas[r.config.NodeID]; ok {
pr.active = true
}
ents, err := r.raftLog.entries(r.raftLog.committed+1, noLimit)
if err != nil {
errMsg := fmt.Sprintf("[raft->becomeLeader][%v] unexpected error getting uncommitted entries (%v).", r.id, err)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
nconf := numOfPendingConf(ents)
if nconf > 1 {
panic(AppPanicError(fmt.Sprintf("[raft->becomeLeader][%v] unexpected double uncommitted config entry.", r.id)))
}
if nconf == 1 {
r.pendingConf = true
}
r.appendEntry(&proto.Entry{Term: r.term, Index: lasti + 1, Data: nil})
if logger.IsEnableDebug() {
logger.Debug("raft[%v,%v] became leader at term %d.index:%d", r.id, r.config.ReplicateAddr, r.term, lasti+1)
}
}
func stepLeader(r *raftFsm, m *proto.Message) {
// These message types do not require any progress for m.From.
switch m.Type {
case proto.LocalMsgProp:
if _, ok := r.replicas[r.config.NodeID]; !ok || len(m.Entries) == 0 {
return
}
for i, e := range m.Entries {
if e.Type == proto.EntryConfChange {
if r.pendingConf {
m.Entries[i] = &proto.Entry{Term: e.Term, Index: e.Index, Type: proto.EntryNormal}
}
r.pendingConf = true
}
}
r.appendEntry(m.Entries...)
r.bcastAppend()
proto.ReturnMessage(m)
return
case proto.ReqMsgVote:
if logger.IsEnableDebug() {
logger.Debug("[raft->stepLeader][%v logterm: %d, index: %d, vote: %v] rejected vote from %v [logterm: %d, index: %d] at term %d",
r.id, r.raftLog.lastTerm(), r.raftLog.lastIndex(), r.vote, m.From, m.LogTerm, m.Index, r.term)
}
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgVote
nmsg.To = m.From
nmsg.Reject = true
r.send(nmsg)
proto.ReturnMessage(m)
return
}
// All other message types require a progress for m.From (pr).
pr, prOk := r.replicas[m.From]
if !prOk {
if logger.IsEnableDebug() {
logger.Debug("[raft->stepLeader][%v] no progress available for %v.", r.id, m.From)
}
return
}
switch m.Type {
case proto.RespMsgAppend:
pr.active = true
pr.lastActive = time.Now()
if m.Reject {
if logger.IsEnableDebug() {
logger.Debug("raft[%v, %v, %v, %v] received msgApp rejection(lastindex: %d) from %v for index %d commit %v. replica info [%v,%v,%v,%v]",
r.id, r.raftLog.firstIndex(), r.raftLog.lastIndex(), r.raftLog.committed, m.RejectHint, m.From, m.Index, m.Commit, pr.state, pr.next, pr.committed, pr.match)
}
nextProbeIdx := m.RejectHint
if m.LogTerm > 0 {
// If the follower has an uncommitted log tail, we would end up
// probing one by one until we hit the common prefix.
//
// For example, if the leader has:
//
// idx 1 2 3 4 5 6 7 8 9
// -----------------
// term (L) 1 3 3 3 5 5 5 5 5
// term (F) 1 1 1 1 2 2
//
// Then, after sending an append anchored at (idx=9,term=5) we
// would receive a RejectHint of 6 and LogTerm of 2. Without the
// code below, we would try an append at index 6, which would
// fail again.
//
// However, looking only at what the leader knows about its own
// log and the rejection hint, it is clear that a probe at index
// 6, 5, 4, 3, and 2 must fail as well:
//
// For all of these indexes, the leader's log term is larger than
// the rejection's log term. If a probe at one of these indexes
// succeeded, its log term at that index would match the leader's,
// i.e. 3 or 5 in this example. But the follower already told the
// leader that it is still at term 2 at index 9, and since the
// log term only ever goes up (within a log), this is a contradiction.
//
// At index 1, however, the leader can draw no such conclusion,
// as its term 1 is not larger than the term 2 from the
// follower's rejection. We thus probe at 1, which will succeed
// in this example. In general, with this approach we probe at
// most once per term found in the leader's log.
//
// There is a similar mechanism on the follower (implemented in
// handleAppendEntries via a call to findConflictByTerm) that is
// useful if the follower has a large divergent uncommitted log
// tail[1], as in this example:
//
// idx 1 2 3 4 5 6 7 8 9
// -----------------
// term (L) 1 3 3 3 3 3 3 3 7
// term (F) 1 3 3 4 4 5 5 5 6
//
// Naively, the leader would probe at idx=9, receive a rejection
// revealing the log term of 6 at the follower. Since the leader's
// term at the previous index is already smaller than 6, the leader-
// side optimization discussed above is ineffective. The leader thus
// probes at index 8 and, naively, receives a rejection for the same
// index and log term 5. Again, the leader optimization does not improve
// over linear probing as term 5 is above the leader's term 3 for that
// and many preceding indexes; the leader would have to probe linearly
// until it would finally hit index 3, where the probe would succeed.
//
// Instead, we apply a similar optimization on the follower. When the
// follower receives the probe at index 8 (log term 3), it concludes
// that all of the leader's log preceding that index has log terms of
// 3 or below. The largest index in the follower's log with a log term
// of 3 or below is index 3. The follower will thus return a rejection
// for index=3, log term=3 instead. The leader's next probe will then
// succeed at that index.
//
// [1]: more precisely, if the log terms in the large uncommitted
// tail on the follower are larger than the leader's. At first,
// it may seem unintuitive that a follower could even have such
// a large tail, but it can happen:
//
// 1. Leader appends (but does not commit) entries 2 and 3, crashes.
// idx 1 2 3 4 5 6 7 8 9
// -----------------
// term (L) 1 2 2 [crashes]
// term (F) 1
// term (F) 1
//
// 2. a follower becomes leader and appends entries at term 3.
// -----------------
// term (x) 1 2 2 [down]
// term (F) 1 3 3 3 3
// term (F) 1
//
// 3. term 3 leader goes down, term 2 leader returns as term 4
// leader. It commits the log & entries at term 4.
//
// -----------------
// term (L) 1 2 2 2
// term (x) 1 3 3 3 3 [down]
// term (F) 1
// -----------------
// term (L) 1 2 2 2 4 4 4
// term (F) 1 3 3 3 3 [gets probed]
// term (F) 1 2 2 2 4 4 4
//
// 4. the leader will now probe the returning follower at index
// 7, the rejection points it at the end of the follower's log
// which is at a higher log term than the actually committed
// log.
nextProbeIdx = r.raftLog.findConflictByTerm(m.RejectHint, m.LogTerm)
}
if pr.maybeDecrTo(m.Index, nextProbeIdx, m.Commit) {
if logger.IsEnableDebug() {
logger.Debug("[%v] decreased progress of [%v] to [%s]", r.id, m.From, pr)
}
if pr.state == replicaStateReplicate {
pr.becomeProbe()
}
r.sendAppend(m.From)
}
} else {
oldPaused := pr.isPaused()
if pr.maybeUpdate(m.Index, m.Commit) {
switch {
case pr.state == replicaStateProbe:
pr.becomeReplicate()
case pr.state == replicaStateSnapshot && pr.needSnapshotAbort():
if logger.IsEnableWarn() {
logger.Warn("raft[%v] snapshot aborted, resumed sending replication messages to %v.", r.id, m.From)
}
pr.becomeProbe()
case pr.state == replicaStateReplicate:
pr.inflight.freeTo(m.Index)
}
if r.maybeCommit() {
r.bcastAppend()
} else if oldPaused {
r.sendAppend(m.From)
}
}
}
proto.ReturnMessage(m)
return
case proto.RespMsgHeartBeat:
if pr.state == replicaStateReplicate && pr.inflight.full() {
pr.inflight.freeFirstOne()
}
if !pr.pending && (pr.match < r.raftLog.lastIndex() || pr.committed < r.raftLog.committed) {
r.sendAppend(m.From)
}
pr.active = true
pr.lastActive = time.Now()
if pr.state != replicaStateSnapshot {
pr.pending = false
}
return
case proto.LeaseMsgOffline:
for id := range r.replicas {
if id == r.config.NodeID {
continue
}
nmsg := proto.GetMessage()
nmsg.Type = proto.LeaseMsgTimeout
nmsg.To = id
r.send(nmsg)
}
logger.Debug("[raft][%v] LeaseMsgOffline at term[%d] leader[%d].", r.id, r.term, r.leader)
r.becomeFollower(r.term, NoLeader)
proto.ReturnMessage(m)
return
case proto.RespMsgSnapShot:
if pr.state != replicaStateSnapshot {
return
}
if m.Reject {
if logger.IsEnableWarn() {
logger.Warn("raft[%v] send snapshot to [%v] failed.", r.id, m.From)
}
pr.snapshotFailure()
pr.becomeProbe()
} else {
pr.active = true
pr.lastActive = time.Now()
pr.becomeProbe()
if logger.IsEnableWarn() {
logger.Warn("raft[%v] send snapshot to [%v] succeeded, resumed replication [%s]", r.id, m.From, pr)
}
}
// If the snapshot finished, wait for the RespMsgAppend from the remote node before sending the next ReqMsgAppend.
// If the snapshot failed, wait a heartbeat interval before the next try.
pr.pause()
proto.ReturnMessage(m)
return
case proto.RespCheckQuorum:
// TODO: remove this when stable
if logger.IsEnableDebug() {
logger.Debug("raft[%d] recv check quorum resp from %d, index=%d", r.id, m.From, m.Index)
}
r.readOnly.recvAck(m.Index, m.From, r.quorum())
proto.ReturnMessage(m)
return
}
}
func (r *raftFsm) becomePreCandidate() {
r.acks = make(map[uint64]bool)
r.acks[r.config.NodeID] = true
logger.Debug("raft[%v] became preCandidate at term %d.", r.id, r.term)
r.step = stepPreCandidate
r.reset(r.term, 0, false)
r.tick = r.tickElectionAck
r.state = statePreCandidate
}
func stepPreCandidate(r *raftFsm, m *proto.Message) {
switch m.Type {
case proto.LocalMsgProp:
if logger.IsEnableDebug() {
logger.Debug("raft[%v] no leader at term %d; dropping proposal", r.id, r.term)
}
proto.ReturnMessage(m)
return
case proto.ReqMsgAppend:
if logger.IsEnableDebug() {
logger.Debug("raft[%v] PreCandidate receive append in term %d; become follower.", r.id, r.term)
}
r.becomeFollower(r.term, m.From)
r.handleAppendEntries(m)
proto.ReturnMessage(m)
return
case proto.ReqMsgHeartBeat:
if logger.IsEnableDebug() {
logger.Debug("raft[%v] PreCandidate receive heartbeat in term %d; become follower.", r.id, r.term)
}
r.becomeFollower(r.term, m.From)
return
case proto.ReqMsgPreVote:
r.becomeFollower(r.term, m.From)
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgPreVote
nmsg.To = m.From
r.send(nmsg)
proto.ReturnMessage(m)
return
case proto.RespCheckQuorum:
// TODO: remove this when stable
if logger.IsEnableDebug() {
logger.Debug("raft[%d] recv check quorum resp from %d, index=%d", r.id, m.From, m.Index)
}
r.readOnly.recvAck(m.Index, m.From, r.quorum())
proto.ReturnMessage(m)
return
case proto.ReqMsgVote:
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgVote
nmsg.To = m.From
nmsg.Reject = true
r.send(nmsg)
proto.ReturnMessage(m)
return
case proto.RespMsgPreVote:
gr := r.poll(m.From, !m.Reject)
if logger.IsEnableDebug() {
logger.Debug("raft[%v] [q:%d] stepPreCandidate has received %d votes and %d vote rejections.", r.id, r.quorum(), gr, len(r.votes)-gr)
}
switch r.quorum() {
case gr:
r.campaign(false, campaignElection)
case len(r.votes) - gr:
r.becomeFollower(r.term, NoLeader)
}
return
}
}
func (r *raftFsm) tickHeartbeat() {
r.heartbeatElapsed++
r.electionElapsed++
if r.pastElectionTimeout() {
r.electionElapsed = 0
if r.config.LeaseCheck && !r.checkLeaderLease() {
if logger.IsEnableWarn() {
logger.Warn("raft[%v] stepped down to follower since quorum is not active.", r.id)
}
logger.Debug("[raft][%v] heartbeat election timeout at term[%d] leader[%d].", r.id, r.term, r.leader)
r.becomeFollower(r.term, NoLeader)
}
}
if r.state != stateLeader {
return
}
if r.heartbeatElapsed >= r.config.HeartbeatTick {
r.heartbeatElapsed = 0
for id := range r.replicas {
if id == r.config.NodeID {
continue
}
if r.replicas[id].state != replicaStateSnapshot {
r.replicas[id].resume()
}
}
r.bcastReadOnly()
}
}
func (r *raftFsm) tickElectionAck() {
r.electionElapsed++
if r.electionElapsed >= r.config.ElectionTick {
r.electionElapsed = 0
m := proto.GetMessage()
m.Type = proto.LocalMsgHup
m.From = r.config.NodeID
r.Step(m)
}
}
func (r *raftFsm) checkLeaderLease() bool {
var act int
for id, peer := range r.replicas {
if id == r.config.NodeID || peer.state == replicaStateSnapshot {
act++
continue
}
if peer.active {
peer.active = false
act++
} else {
r.monitorZombie(peer)
}
}
return act >= r.quorum()
}
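// maybeCommit sorts the match indexes of all replicas in descending order and
// takes the quorum()-th largest as the candidate commit index. For example,
// with matches {9, 7, 5, 5, 3} in a 5-node group the candidate index is 5,
// since it is replicated on at least 3 nodes; the commit only advances if the
// entry at that index belongs to the current term.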
func (r *raftFsm) maybeCommit() bool {
mis := make(util.Uint64Slice, 0, len(r.replicas))
for _, rp := range r.replicas {
mis = append(mis, rp.match)
}
sort.Sort(sort.Reverse(mis))
mci := mis[r.quorum()-1]
isCommit := r.raftLog.maybeCommit(mci, r.term)
if r.state == stateLeader && r.replicas[r.config.NodeID] != nil {
r.replicas[r.config.NodeID].committed = r.raftLog.committed
}
if r.state == stateLeader && !r.readOnly.committed && isCommit {
if r.raftLog.zeroTermOnErrCompacted(r.raftLog.term(r.raftLog.committed)) == r.term {
r.readOnly.commit(r.raftLog.committed)
}
r.bcastReadOnly()
}
return isCommit
}
func (r *raftFsm) bcastAppend() {
for id := range r.replicas {
if id == r.config.NodeID {
continue
}
r.sendAppend(id)
}
}
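// sendAppend replicates to one follower. If the follower's next index has
// already been compacted out of the log (pr.next < firstIndex) or the needed
// term/entries cannot be read, it falls back to sending a snapshot; otherwise
// it sends the entries starting at pr.next, limited by MaxSizePerMsg.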
func (r *raftFsm) sendAppend(to uint64) {
pr := r.replicas[to]
if pr.isPaused() {
return
}
var (
term uint64
ents []*proto.Entry
errt, erre error
m *proto.Message
)
fi := r.raftLog.firstIndex()
if pr.next >= fi {
term, errt = r.raftLog.term(pr.next - 1)
ents, erre = r.raftLog.entries(pr.next, r.config.MaxSizePerMsg)
}
if pr.next < fi || errt != nil || erre != nil {
if !pr.active {
if logger.IsEnableDebug() {
logger.Debug("[raft->sendAppend][%v]ignore sending snapshot to %v since it is not recently active.", r.id, to)
}
return
}
snapshot, err := r.sm.Snapshot()
if err != nil || snapshot.ApplyIndex() < fi-1 {
panic(AppPanicError(fmt.Sprintf("[raft->sendAppend][%v]failed to send snapshot[%d] to %v because snapshot is unavailable, error is: \r\n%v", r.id, snapshot.ApplyIndex(), to, err)))
}
m = proto.GetMessage()
m.Type = proto.ReqMsgSnapShot
m.To = to
m.Snapshot = snapshot
snapMeta := proto.SnapshotMeta{Index: snapshot.ApplyIndex(), Peers: make([]proto.Peer, 0, len(r.replicas))}
if snapTerm, err := r.raftLog.term(snapMeta.Index); err != nil {
panic(AppPanicError(fmt.Sprintf("[raft->sendAppend][%v]failed to send snapshot to %v because snapshot is unavailable, error is: \r\n%v", r.id, to, err)))
} else {
snapMeta.Term = snapTerm
}
for _, p := range r.replicas {
snapMeta.Peers = append(snapMeta.Peers, p.peer)
}
m.SnapshotMeta = snapMeta
pr.becomeSnapshot(snapMeta.Index)
logger.Debug("[raft->sendAppend][%v][firstindex: %d, commit: %d] sent snapshot[index: %d, term: %d] to [%v][%s]",
r.id, fi, r.raftLog.committed, snapMeta.Index, snapMeta.Term, to, pr)
} else {
m = proto.GetMessage()
m.Type = proto.ReqMsgAppend
m.To = to
m.Index = pr.next - 1
m.LogTerm = term
m.Commit = r.raftLog.committed
m.Entries = append(m.Entries, ents...)
if n := len(m.Entries); n != 0 {
switch pr.state {
case replicaStateReplicate:
last := m.Entries[n-1].Index
pr.update(last)
pr.inflight.add(last)
case replicaStateProbe:
pr.pause()
default:
errMsg := fmt.Sprintf("[repl->sendAppend][%v] is sending append in unhandled state %s.", r.id, pr.state)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
}
}
pr.pending = true
r.send(m)
}
func (r *raftFsm) appendEntry(es ...*proto.Entry) {
r.raftLog.append(es...)
r.replicas[r.config.NodeID].maybeUpdate(r.raftLog.lastIndex(), r.raftLog.committed)
r.maybeCommit()
}
func (r *raftFsm) bcastReadOnly() {
index := r.readOnly.lastPending()
if index == 0 {
return
}
if logger.IsEnableDebug() {
logger.Debug("raft[%d] bcast readonly index: %d", r.id, index)
}
for id := range r.replicas {
if id == r.config.NodeID {
continue
}
msg := proto.GetMessage()
msg.Type = proto.ReqCheckQuorum
msg.To = id
msg.Index = index
r.send(msg)
}
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
type (
fsmState byte
replicaState byte
)
const (
stateFollower fsmState = 0
stateCandidate fsmState = 1
stateLeader fsmState = 2
statePreCandidate fsmState = 3
replicaStateProbe replicaState = 0
replicaStateReplicate replicaState = 1
replicaStateSnapshot replicaState = 2
)
func (st fsmState) String() string {
switch st {
case 0:
return "StateFollower"
case 1:
return "StateCandidate"
case 2:
return "StateLeader"
case 3:
return "statePreCandidate"
}
return ""
}
func (st replicaState) String() string {
switch st {
case 1:
return "ReplicaStateReplicate"
case 2:
return "ReplicaStateSnapshot"
default:
return "ReplicaStateProbe"
}
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
"math"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/storage"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
const noLimit = math.MaxUint64
// raftLog is responsible for the operation of the log.
type raftLog struct {
unstable unstable
storage storage.Storage
committed, applied uint64
}
func newRaftLog(storage storage.Storage) (*raftLog, error) {
log := &raftLog{
storage: storage,
}
firstIndex, err := storage.FirstIndex()
if err != nil {
return nil, err
}
lastIndex, err := storage.LastIndex()
if err != nil {
return nil, err
}
log.unstable.offset = lastIndex + 1
log.unstable.entries = make([]*proto.Entry, 0, 256)
log.committed = firstIndex - 1
log.applied = firstIndex - 1
return log, nil
}
func (l *raftLog) String() string {
return fmt.Sprintf("committed=%d, applied=%d, unstable.offset=%d, len(unstable.Entries)=%d", l.committed, l.applied, l.unstable.offset, len(l.unstable.entries))
}
func (l *raftLog) firstIndex() uint64 {
index, err := l.storage.FirstIndex()
if err != nil {
errMsg := fmt.Sprintf("[raftLog->firstIndex]get firstindex from storage err:[%v].", err)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
return index
}
func (l *raftLog) lastIndex() uint64 {
if i, ok := l.unstable.maybeLastIndex(); ok {
return i
}
i, err := l.storage.LastIndex()
if err != nil {
errMsg := fmt.Sprintf("[raftLog->lastIndex]get lastIndex from storage err:[%v]", err)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
return i
}
func (l *raftLog) term(i uint64) (uint64, error) {
dummyIndex := l.firstIndex() - 1
if i < dummyIndex || i > l.lastIndex() {
return 0, nil
}
if t, ok := l.unstable.maybeTerm(i); ok {
return t, nil
}
t, c, err := l.storage.Term(i)
if c {
return 0, ErrCompacted
}
if err == nil {
return t, nil
}
errMsg := fmt.Sprintf("[raftLog->term]get term[%d] from storage err:[%v].", i, err)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
func (l *raftLog) lastTerm() uint64 {
t, err := l.term(l.lastIndex())
if err != nil {
errMsg := fmt.Sprintf("[raftLog->lastTerm]unexpected error when getting the last term (%v)", err)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
return t
}
func (l *raftLog) lastIndexAndTerm() (uint64, uint64) {
li := l.lastIndex()
t, err := l.term(li)
if err != nil {
errMsg := fmt.Sprintf("[raftLog->lastIndexAndTerm]unexpected error when getting the last term (%v)", err)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
return li, t
}
func (l *raftLog) matchTerm(i, term uint64) bool {
t, err := l.term(i)
if err != nil {
return false
}
return t == term
}
func (l *raftLog) findConflict(ents []*proto.Entry) uint64 {
for _, ne := range ents {
if !l.matchTerm(ne.Index, ne.Term) {
if ne.Index <= l.lastIndex() && logger.IsEnableDebug() {
logger.Debug("[raftLog->findConflict]found conflict at index %d [existing term: %d, conflicting term: %d]", ne.Index, l.zeroTermOnErrCompacted(l.term(ne.Index)), ne.Term)
}
return ne.Index
}
}
return 0
}
func (l *raftLog) maybeAppend(index, logTerm, committed uint64, ents ...*proto.Entry) (lastnewi uint64, ok bool) {
if l.matchTerm(index, logTerm) {
lastnewi = index + uint64(len(ents))
ci := l.findConflict(ents)
switch {
case ci == 0:
case ci <= l.committed:
errMsg := fmt.Sprintf("[raftLog->maybeAppend]entry %d conflict with committed entry [committed(%d)]", ci, l.committed)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
default:
l.append(ents[ci-(index+1):]...)
}
l.commitTo(util.Min(committed, lastnewi))
return lastnewi, true
}
return 0, false
}
func (l *raftLog) append(ents ...*proto.Entry) uint64 {
if len(ents) == 0 {
return l.lastIndex()
}
if after := ents[0].Index - 1; after < l.committed {
errMsg := fmt.Sprintf("[raftLog->append]after(%d) is out of range [committed(%d)]", after, l.committed)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
l.unstable.truncateAndAppend(ents)
return l.lastIndex()
}
func (l *raftLog) unstableEntries() []*proto.Entry {
if len(l.unstable.entries) == 0 {
return nil
}
return l.unstable.entries
}
func (l *raftLog) nextEnts(maxSize uint64) (ents []*proto.Entry) {
off := util.Max(l.applied+1, l.firstIndex())
hi := l.committed + 1
if hi > off {
ents, err := l.slice(off, hi, maxSize)
if err != nil {
errMsg := fmt.Sprintf("[raftLog->nextEnts]unexpected error when getting unapplied[%d,%d) entries (%v)", off, hi, err)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
return ents
}
return nil
}
func (l *raftLog) entries(i uint64, maxsize uint64) ([]*proto.Entry, error) {
if i > l.lastIndex() {
return nil, nil
}
return l.slice(i, l.lastIndex()+1, maxsize)
}
func (l *raftLog) maybeCommit(maxIndex, term uint64) bool {
if maxIndex > l.committed && l.zeroTermOnErrCompacted(l.term(maxIndex)) == term {
l.commitTo(maxIndex)
return true
}
return false
}
func (l *raftLog) commitTo(tocommit uint64) {
if l.committed < tocommit {
if l.lastIndex() < tocommit {
errMsg := fmt.Sprintf("[raftLog->commitTo]tocommit(%d) is out of range [lastIndex(%d)]", tocommit, l.lastIndex())
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
l.committed = tocommit
}
}
func (l *raftLog) appliedTo(i uint64) {
if i == 0 {
return
}
if l.committed < i || i < l.applied {
errMsg := fmt.Sprintf("[raftLog->appliedTo]applied(%d) is out of range [prevApplied(%d), committed(%d)]", i, l.applied, l.committed)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
l.applied = i
}
func (l *raftLog) stableTo(i, t uint64) { l.unstable.stableTo(i, t) }
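// isUpToDate reports whether a candidate log with last entry (lasti, term) is
// at least as up-to-date as ours: a higher last term wins, equal terms compare
// last indexes, and an exact tie is broken by peer priority (fpri >= lpri).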
func (l *raftLog) isUpToDate(lasti, term uint64, fpri, lpri uint16) bool {
li, lt := l.lastIndexAndTerm()
return term > lt || (term == lt && lasti > li) || (term == lt && lasti == li && fpri >= lpri)
}
func (l *raftLog) restore(index uint64) {
if logger.IsEnableDebug() {
logger.Debug("[raftLog->restore]log [%s] starts to restore snapshot [index: %d]", l.String(), index)
}
l.committed = index
l.applied = index
l.unstable.restore(index)
}
func (l *raftLog) slice(lo, hi uint64, maxSize uint64) ([]*proto.Entry, error) {
if lo == hi {
return nil, nil
}
err := l.mustCheckOutOfBounds(lo, hi)
if err != nil {
return nil, err
}
var ents []*proto.Entry
if lo < l.unstable.offset {
storedhi := util.Min(hi, l.unstable.offset)
storedEnts, cmp, err := l.storage.Entries(lo, storedhi, maxSize)
if cmp {
return nil, ErrCompacted
} else if err != nil {
errMsg := fmt.Sprintf("[raftLog->slice]get entries[%d:%d) from storage err:[%v].", lo, storedhi, err)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
// check if ents has reached the size limitation
if uint64(len(storedEnts)) < storedhi-lo {
return storedEnts, nil
}
ents = storedEnts
}
if hi > l.unstable.offset {
unstable := l.unstable.slice(util.Max(lo, l.unstable.offset), hi)
if len(ents) > 0 {
ents = append([]*proto.Entry{}, ents...)
ents = append(ents, unstable...)
} else {
ents = unstable
}
}
if maxSize == noLimit {
return ents, nil
}
return limitSize(ents, maxSize), nil
}
// l.firstIndex() <= lo <= hi <= l.lastIndex()+1
func (l *raftLog) mustCheckOutOfBounds(lo, hi uint64) error {
if lo > hi {
errMsg := fmt.Sprintf("[raftLog->mustCheckOutOfBounds]invalid slice %d > %d", lo, hi)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
fi := l.firstIndex()
if lo < fi {
return ErrCompacted
}
li := l.lastIndex()
length := li - fi + 1
if lo < fi || hi > fi+length {
errMsg := fmt.Sprintf("[raftLog->mustCheckOutOfBounds]slice[%d,%d) out of bound [%d,%d]", lo, hi, fi, li)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
return nil
}
func (l *raftLog) zeroTermOnErrCompacted(t uint64, err error) uint64 {
if err == nil {
return t
}
if err == ErrCompacted {
return 0
}
errMsg := fmt.Sprintf("[raftLog->zeroTermOnErrCompacted]unexpected error (%v)", err)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
func (l *raftLog) allEntries() []*proto.Entry {
ents, err := l.entries(l.firstIndex(), noLimit)
if err == nil {
return ents
}
if err == ErrCompacted { // try again if there was a racing compaction
return l.allEntries()
}
errMsg := fmt.Sprintf("[log->allEntries]get all entries err:[%v]", err)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
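// limitSize returns the longest prefix of ents whose cumulative size stays
// within maxSize, but always at least one entry. For example, entry sizes
// {3, 4, 5} with maxSize=8 yield the first two entries (3+4=7).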
func limitSize(ents []*proto.Entry, maxSize uint64) []*proto.Entry {
if len(ents) == 0 || maxSize == noLimit {
return ents
}
size := ents[0].Size()
limit := 1
for l := len(ents); limit < l; limit++ {
size += ents[limit].Size()
if size > maxSize {
break
}
}
return ents[:limit]
}
// findConflictByTerm takes an (index, term) pair (indicating a conflicting log
// entry on a leader/follower during an append) and finds the largest index in
// log l with a term <= `term` and an index <= `index`. If no such index exists
// in the log, the log's first index is returned.
//
// The index provided MUST be equal to or less than l.lastIndex(). Invalid
// inputs log a warning and the input index is returned.
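//
// For example, with log terms [1, 1, 3, 3, 5] at indexes 1..5, a call
// findConflictByTerm(5, 2) walks back past the entries with terms 5 and 3 and
// returns index 2, the largest index whose term (1) is <= 2.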
func (l *raftLog) findConflictByTerm(index uint64, term uint64) uint64 {
if li := l.lastIndex(); index > li {
// NB: such calls should not exist, but since there is a straightforward
// way to recover, do it.
//
// It is tempting to also check something about the first index, but
// there is odd behavior with peers that have no log, in which case
// lastIndex will return zero and firstIndex will return one, which
// leads to calls with an index of zero into this method.
logger.Warn("index(%d) is out of range [0, lastIndex(%d)] in findConflictByTerm",
index, li)
return index
}
for {
logTerm, err := l.term(index)
if logTerm <= term || err != nil {
break
}
index--
}
return index
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
)
// unstable temporarily stores the log entries that have not been persisted yet.
// The entry at slice position i has log position i+unstable.offset.
// unstable supports group commit.
// Note that unstable.offset may be less than the highest log position in storage;
// this means that the next write to storage might need to truncate the log before persisting unstable.entries.
type unstable struct {
offset uint64
// all entries that have not yet been written to storage.
entries []*proto.Entry
}
// maybeLastIndex returns the last index if it has at least one unstable entry.
func (u *unstable) maybeLastIndex() (uint64, bool) {
if l := len(u.entries); l != 0 {
return u.offset + uint64(l) - 1, true
}
return 0, false
}
// maybeTerm returns the term of the entry at index i, if there is any.
func (u *unstable) maybeTerm(i uint64) (uint64, bool) {
if i < u.offset {
return 0, false
}
last, ok := u.maybeLastIndex()
if !ok || i > last {
return 0, false
}
return u.entries[i-u.offset].Term, true
}
func (u *unstable) stableTo(i, t uint64) {
gt, ok := u.maybeTerm(i)
if !ok {
return
}
if gt == t && i >= u.offset {
l := uint64(len(u.entries))
diff := l - (i + 1 - u.offset)
if diff > 0 {
copy(u.entries, u.entries[i+1-u.offset:l])
}
for k := diff; k < l; k++ {
u.entries[k] = nil
}
u.entries = u.entries[0:diff]
u.offset = i + 1
}
}
func (u *unstable) restore(index uint64) {
for i, l := 0, len(u.entries); i < l; i++ {
u.entries[i] = nil
}
u.entries = u.entries[0:0]
u.offset = index + 1
}
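// truncateAndAppend handles three cases. With u.offset=5 and u.entries holding
// indexes 5..7: appending entries starting at 8 is a plain append; entries
// starting at or before 5 replace everything and reset the offset; entries
// starting at 6 or 7 keep the prefix up to after-1 and then append.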
func (u *unstable) truncateAndAppend(ents []*proto.Entry) {
after := ents[0].Index
switch {
case after == u.offset+uint64(len(u.entries)):
// after is the next index in u.entries; append directly
u.entries = append(u.entries, ents...)
case after <= u.offset:
// The log is being truncated to before our current offset portion, so set the offset and replace the entries
for i, l := 0, len(u.entries); i < l; i++ {
u.entries[i] = nil
}
u.entries = append(u.entries[0:0], ents...)
u.offset = after
default:
// truncate to after and copy to u.entries then append
u.entries = append(u.entries[0:0], u.slice(u.offset, after)...)
u.entries = append(u.entries, ents...)
}
}
func (u *unstable) slice(lo uint64, hi uint64) []*proto.Entry {
u.mustCheckOutOfBounds(lo, hi)
return u.entries[lo-u.offset : hi-u.offset]
}
// u.offset <= lo <= hi <= u.offset+len(u.entries)
func (u *unstable) mustCheckOutOfBounds(lo, hi uint64) {
if lo > hi {
errMsg := fmt.Sprintf("unstable.slice[%d,%d) is invalid.", lo, hi)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
upper := u.offset + uint64(len(u.entries))
if lo < u.offset || hi > upper {
errMsg := fmt.Sprintf("unstable.slice[%d,%d) out of bound [%d,%d].", lo, hi, u.offset, upper)
logger.Error(errMsg)
panic(AppPanicError(errMsg))
}
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
// replica represents a follower's replication progress from the leader's point of view.
// The leader maintains the progress of all followers and sends entries to each follower based on its progress.
type replica struct {
inflight
peer proto.Peer
state replicaState
paused, active, pending bool
match, next, committed, pendingSnap uint64
lastActive time.Time
lastZombie time.Time
}
func newReplica(peer proto.Peer, maxInflight int) *replica {
repl := &replica{
peer: peer,
state: replicaStateProbe,
lastActive: time.Now(),
}
if maxInflight > 0 {
repl.inflight.size = maxInflight
repl.inflight.buffer = make([]uint64, maxInflight)
}
return repl
}
func (r *replica) resetState(state replicaState) {
logger.Debug("raft resetState from [%v]", r)
r.paused = false
r.pendingSnap = 0
r.state = state
logger.Debug("raft resetState to [%v]", r)
r.reset()
}
func (r *replica) becomeProbe() {
if r.state == replicaStateSnapshot {
pendingSnap := r.pendingSnap
r.resetState(replicaStateProbe)
r.next = util.Max(r.match+1, pendingSnap+1)
} else {
r.resetState(replicaStateProbe)
r.next = r.match + 1
}
}
func (r *replica) becomeReplicate() {
r.resetState(replicaStateReplicate)
r.next = r.match + 1
}
func (r *replica) becomeSnapshot(index uint64) {
r.resetState(replicaStateSnapshot)
r.pendingSnap = index
}
func (r *replica) update(index uint64) {
r.next = index + 1
}
func (r *replica) maybeUpdate(index, commit uint64) bool {
updated := false
if r.committed < commit {
r.committed = commit
}
if r.match < index {
r.match = index
updated = true
r.resume()
}
next := index + 1
if r.next < next {
r.next = next
}
return updated
}
func (r *replica) maybeDecrTo(rejected, last, commit uint64) bool {
if r.state == replicaStateReplicate {
if r.committed < commit {
r.committed = commit
}
if rejected <= r.match {
return false
}
r.next = r.match + 1
return true
}
//Probe State
if r.next-1 != rejected {
return false
}
if r.next = util.Min(rejected, last+1); r.next < 1 {
r.next = 1
}
r.committed = commit
r.resume()
return true
}
func (r *replica) snapshotFailure() { r.pendingSnap = 0 }
func (r *replica) needSnapshotAbort() bool {
return r.state == replicaStateSnapshot && r.match >= r.pendingSnap
}
func (r *replica) pause() { r.paused = true }
func (r *replica) resume() { r.paused = false }
func (r *replica) isPaused() bool {
switch r.state {
case replicaStateProbe:
return r.paused
case replicaStateSnapshot:
return true
default:
return r.full()
}
}
func (r *replica) String() string {
return fmt.Sprintf("next = %d, match = %d, commit = %d, state = %s, waiting = %v, pendingSnapshot = %d", r.next, r.match, r.committed, r.state, r.isPaused(), r.pendingSnap)
}
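// Illustrative sketch (not part of the package; exampleReplicaProgress is a hypothetical
// helper): a follower's progress starts in the probe state, switches to replicate once an
// append is acknowledged, and maybeDecrTo backs next off after a rejection.
func exampleReplicaProgress() {
	r := newReplica(proto.Peer{}, 128)
	if r.maybeUpdate(10, 8) { // follower acknowledged entries up to index 10
		r.becomeReplicate() // stream optimistically from next = match + 1 = 11
	}
	_ = r.maybeDecrTo(11, 10, 8) // a rejection in replicate state resets next to match+1
}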
// inflight is the replication sliding window, used to avoid overflowing the sending buffer.
type inflight struct {
start int
count int
size int
buffer []uint64
}
func (in *inflight) add(index uint64) {
if in.full() {
panic(AppPanicError(fmt.Sprint("inflight.add cannot add into a full inflights.")))
}
next := in.start + in.count
if next >= in.size {
next = next - in.size
}
in.buffer[next] = index
in.count = in.count + 1
}
func (in *inflight) freeTo(index uint64) {
if in.count == 0 || index < in.buffer[in.start] {
return
}
i, idx := 0, in.start
for ; i < in.count; i++ {
if index < in.buffer[idx] {
break
}
if idx = idx + 1; idx >= in.size {
idx = idx - in.size
}
}
in.count = in.count - i
in.start = idx
}
func (in *inflight) freeFirstOne() {
in.freeTo(in.buffer[in.start])
}
func (in *inflight) full() bool {
return in.count == in.size
}
func (in *inflight) reset() {
in.count = 0
in.start = 0
}
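// Illustrative sketch (not part of the package; exampleInflightWindow is a hypothetical
// helper): indexes are added as entries are sent and freed once the follower acknowledges
// them; the sender should pause while the window is full.
func exampleInflightWindow() {
	in := &inflight{size: 3, buffer: make([]uint64, 3)}
	in.add(10)
	in.add(11)
	in.add(12)
	_ = in.full() // true: three indexes are in flight
	in.freeTo(11) // follower acknowledged up to index 11
	_ = in.full() // false: only index 12 remains in flight
}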
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"encoding/binary"
"fmt"
"io"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
type snapshotStatus struct {
respErr
stopCh chan struct{}
}
func newSnapshotStatus() *snapshotStatus {
f := &snapshotStatus{
stopCh: make(chan struct{}),
}
f.init()
return f
}
type snapshotRequest struct {
respErr
snapshotReader
header *proto.Message
}
func newSnapshotRequest(m *proto.Message, r *util.BufferReader) *snapshotRequest {
f := &snapshotRequest{
header: m,
snapshotReader: snapshotReader{reader: r},
}
f.init()
return f
}
func (r *snapshotRequest) response() error {
return <-r.error()
}
type snapshotReader struct {
reader *util.BufferReader
err error
}
func (r *snapshotReader) Next() ([]byte, error) {
if r.err != nil {
return nil, r.err
}
// read size header
// r.reader.Reset()
var buf []byte
if buf, r.err = r.reader.ReadFull(4); r.err != nil {
return nil, r.err
}
size := uint64(binary.BigEndian.Uint32(buf))
if size == 0 {
r.err = io.EOF
return nil, r.err
}
// read data
// r.reader.Reset()
if buf, r.err = r.reader.ReadFull(int(size)); r.err != nil {
return nil, r.err
}
return buf, nil
}
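// Illustrative sketch of the framing snapshotReader.Next expects (writeSnapshotChunk is a
// hypothetical helper, not part of the package): every chunk is a 4-byte big-endian length
// prefix followed by the payload, and a zero-length chunk terminates the stream.
func writeSnapshotChunk(w io.Writer, data []byte) error {
	var hdr [4]byte
	binary.BigEndian.PutUint32(hdr[:], uint32(len(data)))
	if _, err := w.Write(hdr[:]); err != nil {
		return err
	}
	if len(data) == 0 {
		return nil // a zero size marks the end of the snapshot stream
	}
	_, err := w.Write(data)
	return err
}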
func (s *raft) addSnapping(nodeID uint64, rs *snapshotStatus) {
s.mu.Lock()
defer s.mu.Unlock()
if snap, ok := s.snapping[nodeID]; ok {
close(snap.stopCh)
}
s.snapping[nodeID] = rs
}
func (s *raft) removeSnapping(nodeID uint64) {
s.mu.Lock()
defer s.mu.Unlock()
if snap, ok := s.snapping[nodeID]; ok {
close(snap.stopCh)
delete(s.snapping, nodeID)
}
}
func (s *raft) stopSnapping() {
s.mu.Lock()
defer s.mu.Unlock()
for id, snap := range s.snapping {
close(snap.stopCh)
delete(s.snapping, id)
}
}
func (s *raft) sendSnapshot(m *proto.Message) {
util.RunWorker(func() {
defer func() {
logger.Debug(" [raft] [%v term: %d] raftFm[%p] raftReplicas[%v] stop send snapshot "+
"without the replica from [%v]. to [%v]",
s.raftFsm.id, s.raftFsm.term, s.raftFsm, s.raftFsm.getReplicas(), m.Type, m.From, m.To)
s.removeSnapping(m.To)
m.Snapshot.Close()
proto.ReturnMessage(m)
}()
logger.Debug(" [raft] [%v term: %d] raftFm[%p] raftReplicas[%v] send snapshot "+
"without the replica from [%v ] to [%v].",
s.raftFsm.id, s.raftFsm.term, s.raftFsm, s.raftFsm.getReplicas(), m.Type, m.From, m.To)
// send snapshot
rs := newSnapshotStatus()
s.addSnapping(m.To, rs)
s.config.transport.SendSnapshot(m, rs)
select {
case <-s.stopc:
return
case <-rs.stopCh:
return
case err := <-rs.error():
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgSnapShot
nmsg.ID = m.ID
nmsg.From = m.To
nmsg.Reject = (err != nil)
s.recvc <- nmsg
}
}, func(err interface{}) {
s.doStop()
s.handlePanic(err)
})
}
func (s *raft) handleSnapshot(req *snapshotRequest) {
s.restoringSnapshot.Set(true)
var err error
defer func() {
req.respond(err)
s.resetTick()
s.restoringSnapshot.Set(false)
proto.ReturnMessage(req.header)
}()
// validate snapshot
if req.header.Term < s.raftFsm.term {
err = fmt.Errorf("raft %v [term: %d] ignored a snapshot message with lower term from %v [term: %d]", s.raftFsm.id, s.raftFsm.term, req.header.From, req.header.Term)
return
}
if req.header.Term > s.raftFsm.term || s.raftFsm.state != stateFollower {
s.raftFsm.becomeFollower(req.header.Term, req.header.From)
s.maybeChange(true)
}
if !s.raftFsm.checkSnapshot(req.header.SnapshotMeta) {
logger.Warn("raft %v [commit: %d] ignored snapshot [index: %d, term: %d].", s.raftFsm.id, s.raftFsm.raftLog.committed, req.header.SnapshotMeta.Index, req.header.SnapshotMeta.Term)
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgAppend
nmsg.To = req.header.From
nmsg.Index = s.raftFsm.raftLog.committed
nmsg.Commit = s.raftFsm.raftLog.committed
s.raftFsm.send(nmsg)
return
}
// restore snapshot
s.raftConfig.Storage.ApplySnapshot(proto.SnapshotMeta{})
if err = s.raftConfig.StateMachine.ApplySnapshot(req.header.SnapshotMeta.Peers, req); err != nil {
return
}
if err = s.raftConfig.Storage.ApplySnapshot(req.header.SnapshotMeta); err != nil {
return
}
s.raftFsm.restore(req.header.SnapshotMeta)
s.peerState.replace(req.header.SnapshotMeta.Peers)
s.curApplied.Set(req.header.SnapshotMeta.Index)
// send snapshot response message
logger.Warn("raft %v [commit: %d] restored snapshot [index: %d, term: %d]",
s.raftFsm.id, s.raftFsm.raftLog.committed, req.header.SnapshotMeta.Index, req.header.SnapshotMeta.Term)
nmsg := proto.GetMessage()
nmsg.Type = proto.RespMsgAppend
nmsg.To = req.header.From
nmsg.Index = s.raftFsm.raftLog.lastIndex()
nmsg.Commit = s.raftFsm.raftLog.committed
s.raftFsm.send(nmsg)
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
)
// ReadOnlyOption read only option
type ReadOnlyOption int
const (
// ReadOnlySafe guarantees the linearizability of the read only request by
// communicating with the quorum. It is the default and suggested option.
ReadOnlySafe ReadOnlyOption = iota
// ReadOnlyLeaseBased ensures linearizability of the read only request by
// relying on the leader lease. It can be affected by clock drift.
// If the clock drift is unbounded, leader might keep the lease longer than it
// should (clock can move backward/pause without any bound). ReadIndex is not safe
// in that case.
ReadOnlyLeaseBased
)
type readIndexStatus struct {
index uint64
futures []*Future
acks map[uint64]struct{}
}
type readIndexReady struct {
index uint64
futures []*Future
}
type readOnly struct {
id uint64 // raft id
option ReadOnlyOption
// wait leader to commit an entry in current term
committed bool
// ReadIndex requests before leader commit entry in current term
scratch []*Future
// wait quorum ack
pendings map[uint64]*readIndexStatus
pendingQueue []uint64
// quorum acked, wait apply
readys map[uint64]*readIndexReady
readyQueue []uint64
}
func newReadOnly(id uint64, option ReadOnlyOption) *readOnly {
return &readOnly{
id: id,
option: option,
pendings: make(map[uint64]*readIndexStatus),
readys: make(map[uint64]*readIndexReady),
}
}
func (r *readOnly) addPending(index uint64, futures []*Future) {
if status, ok := r.pendings[index]; ok {
status.futures = append(status.futures, futures...)
return
}
// check index valid
if index <= r.lastPending() {
panic(AppPanicError(fmt.Sprintf("[raft->addReadOnly][%v] invalid index[%d]: less than last[%d]", r.id, index, r.lastPending())))
}
r.pendingQueue = append(r.pendingQueue, index)
r.pendings[index] = &readIndexStatus{
index: index,
futures: futures,
acks: make(map[uint64]struct{}),
}
}
func (r *readOnly) addReady(index uint64, futures []*Future) {
if status, ok := r.readys[index]; ok {
status.futures = append(status.futures, futures...)
return
}
r.readyQueue = append(r.readyQueue, index)
r.readys[index] = &readIndexReady{
index: index,
futures: futures,
}
}
func (r *readOnly) add(index uint64, futures []*Future) {
if !r.committed {
r.scratch = append(r.scratch, futures...)
return
}
if r.option == ReadOnlyLeaseBased {
r.addReady(index, futures)
} else {
r.addPending(index, futures)
}
}
func (r *readOnly) commit(index uint64) {
if !r.committed {
r.committed = true
if len(r.scratch) > 0 {
r.add(index, r.scratch)
r.scratch = nil
}
}
}
func (r *readOnly) lastPending() uint64 {
if len(r.pendingQueue) > 0 {
return r.pendingQueue[len(r.pendingQueue)-1]
}
return 0
}
func (r *readOnly) recvAck(index uint64, from uint64, quorum int) {
status, ok := r.pendings[index]
if !ok {
return
}
status.acks[from] = struct{}{}
// add one to include an ack from local node
if len(status.acks)+1 >= quorum {
r.advance(index)
}
}
func (r *readOnly) advance(index uint64) {
var i int
for _, idx := range r.pendingQueue {
if idx > index {
break
}
if rs, ok := r.pendings[idx]; ok {
r.addReady(idx, rs.futures)
delete(r.pendings, idx)
}
i++
}
r.pendingQueue = r.pendingQueue[i:]
}
func (r *readOnly) getReady(applied uint64) (futures []*Future) {
if len(r.readyQueue) == 0 {
return nil
}
var i int
for _, idx := range r.readyQueue {
if idx > applied {
break
}
if rs, ok := r.readys[idx]; ok {
futures = append(futures, rs.futures...)
delete(r.readys, idx)
}
i++
}
r.readyQueue = r.readyQueue[i:]
// TODO: remove this when stable
if logger.IsEnableDebug() {
logger.Debug("raft[%d] get ready index %d, futures len: %d", r.id, applied, len(futures))
}
return
}
func (r *readOnly) containsUpdate(applied uint64) bool {
return len(r.readyQueue) > 0 && applied >= r.readyQueue[0]
}
func (r *readOnly) reset(err error) {
respondReadIndex(r.scratch, err)
for _, status := range r.pendings {
respondReadIndex(status.futures, err)
}
for _, ready := range r.readys {
respondReadIndex(ready.futures, err)
}
r.committed = false
r.scratch = nil
r.pendings = make(map[uint64]*readIndexStatus)
r.pendingQueue = nil
r.readys = make(map[uint64]*readIndexReady)
}
func respondReadIndex(future []*Future, err error) {
for _, f := range future {
f.respond(nil, err)
}
}
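// Illustrative sketch of the leader-side ReadIndex flow (exampleReadIndexFlow is a
// hypothetical helper, not part of the package): a read is registered at the current
// commit index, quorum acks move it to the ready queue, and the futures are answered
// once the applied index catches up.
func exampleReadIndexFlow(f *Future) {
	ro := newReadOnly(1, ReadOnlySafe)
	ro.commit(8)            // the leader has committed an entry (index 8) in its own term
	ro.add(8, []*Future{f}) // register a pending read at the current commit index
	ro.recvAck(8, 2, 2)     // ack from node 2; with quorum 2 the read becomes ready
	respondReadIndex(ro.getReady(8), nil) // answer the futures once the applied index reaches 8
}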
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"errors"
"sync"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
var (
fatalStopc = make(chan uint64)
)
type RaftServer struct {
config *Config
ticker *time.Ticker
heartc chan *proto.Message
stopc chan struct{}
mu sync.RWMutex
rafts map[uint64]*raft
}
func (rs *RaftServer) RemoveRaftForce(raftId uint64, cc *proto.ConfChange) {
var s *raft
var ok bool
if s, ok = rs.rafts[raftId]; !ok {
return
}
// log the current replicas before applying the removal
peerChange := cc.Peer
for _, replica := range s.raftFsm.replicas {
logger.Info("raft[%v] replias [%v]", s.raftFsm.id, replica.peer.String())
}
s.raftFsm.removePeer(cc.Peer)
if _, ok := s.raftFsm.replicas[peerChange.PeerID]; !ok {
if logger.IsEnableWarn() {
logger.Warn("raft[%v] applying configuration peer [%v] be removed and stop snapshot", s.raftFsm.id, peerChange)
}
s.removeSnapping(peerChange.PeerID)
s.peerState.change(cc)
if logger.IsEnableWarn() {
logger.Warn("raft[%v] applying configuration change %v.", s.raftFsm.id, cc)
}
}
}
func NewRaftServer(config *Config) (*RaftServer, error) {
if err := config.validate(); err != nil {
return nil, err
}
rs := &RaftServer{
config: config,
ticker: time.NewTicker(config.TickInterval),
rafts: make(map[uint64]*raft),
heartc: make(chan *proto.Message, 512),
stopc: make(chan struct{}),
}
if transport, err := NewMultiTransport(rs, &config.TransportConfig); err != nil {
return nil, err
} else {
rs.config.transport = transport
}
util.RunWorkerUtilStop(rs.run, rs.stopc)
return rs, nil
}
func (rs *RaftServer) run() {
ticks := 0
for {
select {
case <-rs.stopc:
return
case id := <-fatalStopc:
rs.mu.Lock()
delete(rs.rafts, id)
rs.mu.Unlock()
case m := <-rs.heartc:
switch m.Type {
case proto.ReqMsgHeartBeat:
rs.handleHeartbeat(m)
case proto.RespMsgHeartBeat:
rs.handleHeartbeatResp(m)
}
case <-rs.ticker.C:
ticks++
if ticks >= rs.config.HeartbeatTick {
ticks = 0
rs.sendHeartbeat()
}
rs.mu.RLock()
for _, raft := range rs.rafts {
raft.tick()
}
rs.mu.RUnlock()
}
}
}
func (rs *RaftServer) Stop() {
rs.mu.Lock()
defer rs.mu.Unlock()
select {
case <-rs.stopc:
return
default:
close(rs.stopc)
rs.ticker.Stop()
wg := new(sync.WaitGroup)
for id, s := range rs.rafts {
delete(rs.rafts, id)
wg.Add(1)
go func(r *raft) {
defer wg.Done()
r.stop()
}(s)
}
wg.Wait()
rs.config.transport.Stop()
}
}
func (rs *RaftServer) CreateRaft(raftConfig *RaftConfig) error {
var (
raft *raft
err error
)
defer func() {
if err != nil {
logger.Error("CreateRaft [%v] failed, error is:\r\n %s", raftConfig.ID, err.Error())
return
}
logger.Info("Create Raft success, id:%d", raftConfig.ID)
}()
if raft, err = newRaft(rs.config, raftConfig); err != nil {
return err
}
if raft == nil {
err = errors.New("CreateRaft return nil, maybe occur panic.")
return err
}
rs.mu.Lock()
defer rs.mu.Unlock()
if _, ok := rs.rafts[raftConfig.ID]; ok {
raft.stop()
err = ErrRaftExists
return err
}
rs.rafts[raftConfig.ID] = raft
return nil
}
func (rs *RaftServer) RemoveRaft(id uint64) error {
rs.mu.Lock()
raft, ok := rs.rafts[id]
delete(rs.rafts, id)
rs.mu.Unlock()
if ok {
raft.stop()
}
return nil
}
func (rs *RaftServer) Submit(id uint64, cmd []byte) (future *Future) {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
future = newFuture()
if !ok {
future.respond(nil, ErrRaftNotExists)
return
}
raft.propose(cmd, future)
return
}
func (rs *RaftServer) ChangeMember(id uint64, changeType proto.ConfChangeType, peer proto.Peer, context []byte) (future *Future) {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
future = newFuture()
if !ok {
future.respond(nil, ErrRaftNotExists)
return
}
raft.proposeMemberChange(&proto.ConfChange{Type: changeType, Peer: peer, Context: context}, future)
return
}
func (rs *RaftServer) IsRestoring(id uint64) bool {
rs.mu.RLock()
defer rs.mu.RUnlock()
if raft, ok := rs.rafts[id]; ok {
return raft.restoringSnapshot.Get() && raft.applied() == 0
}
return true
}
func (rs *RaftServer) Status(id uint64) (status *Status) {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
if ok {
status = raft.status()
}
if status == nil {
status = &Status{
ID: id,
NodeID: rs.config.NodeID,
Stopped: true,
}
}
return
}
func (rs *RaftServer) LeaderTerm(id uint64) (leader, term uint64) {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
if ok {
return raft.leaderTerm()
}
return NoLeader, 0
}
func (rs *RaftServer) IsLeader(id uint64) bool {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
if ok {
return raft.isLeader()
}
return false
}
func (rs *RaftServer) AppliedIndex(id uint64) uint64 {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
if ok {
return raft.applied()
}
return 0
}
func (rs *RaftServer) CommittedIndex(id uint64) uint64 {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
if ok {
return raft.committed()
}
return 0
}
func (rs *RaftServer) FirstCommittedIndex(id uint64) uint64 {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
if ok {
return raft.raftFsm.raftLog.firstIndex()
}
return 0
}
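// Illustrative sketch (not part of the package; exampleQueryRaftGroup is a hypothetical
// helper): the per-group accessors above return zero values when the raft group id is not
// registered on this node.
func exampleQueryRaftGroup(rs *RaftServer, id uint64) {
	if rs.IsLeader(id) {
		leader, term := rs.LeaderTerm(id)
		_, _ = leader, term
	}
	_ = rs.AppliedIndex(id)   // 0 if the group does not exist
	_ = rs.CommittedIndex(id) // 0 if the group does not exist
}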
func (rs *RaftServer) TryToLeader(id uint64) (future *Future) {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
future = newFuture()
if !ok {
future.respond(nil, ErrRaftNotExists)
return
}
raft.tryToLeader(future)
return
}
func (rs *RaftServer) Truncate(id uint64, index uint64) {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
if !ok {
return
}
raft.truncate(index)
}
func (rs *RaftServer) GetUnreachable(id uint64) (nodes []uint64) {
downReplicas := rs.GetDownReplicas(id)
for _, r := range downReplicas {
nodes = append(nodes, r.NodeID)
}
return
}
// GetDownReplicas returns the down replicas.
func (rs *RaftServer) GetDownReplicas(id uint64) (downReplicas []DownReplica) {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
if !ok {
return nil
}
status := raft.status()
if status != nil && len(status.Replicas) > 0 {
for n, r := range status.Replicas {
if n == rs.config.NodeID {
continue
}
since := time.Since(r.LastActive)
// a replica is considered down if it has not been active within two heartbeat intervals
downDuration := since - time.Duration(2*rs.config.HeartbeatTick)*rs.config.TickInterval
if downDuration > 0 {
downReplicas = append(downReplicas, DownReplica{
NodeID: n,
DownSeconds: int(downDuration / time.Second),
})
}
}
}
return
}
// GetPendingReplica get snapshot pending followers
func (rs *RaftServer) GetPendingReplica(id uint64) (peers []uint64) {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
if !ok {
return nil
}
status := raft.status()
if status != nil && len(status.Replicas) > 0 {
for n, r := range status.Replicas {
if n == rs.config.NodeID {
continue
}
if r.Snapshoting {
peers = append(peers, n)
}
}
}
return
}
// ReadIndex read index
func (rs *RaftServer) ReadIndex(id uint64) (future *Future) {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
future = newFuture()
if !ok {
future.respond(nil, ErrRaftNotExists)
return
}
raft.readIndex(future)
return
}
// GetEntries get raft log entries
func (rs *RaftServer) GetEntries(id uint64, startIndex uint64, maxSize uint64) (future *Future) {
rs.mu.RLock()
raft, ok := rs.rafts[id]
rs.mu.RUnlock()
future = newFuture()
if !ok {
future.respond(nil, ErrRaftNotExists)
return
}
raft.getEntries(future, startIndex, maxSize)
return
}
func (rs *RaftServer) sendHeartbeat() {
// key: destination nodeID; value: ids of the raft groups led by this node that have a replica there
nodes := make(map[uint64]proto.HeartbeatContext)
rs.mu.RLock()
for id, raft := range rs.rafts {
if !raft.isLeader() {
continue
}
peers := raft.getPeers()
for _, p := range peers {
nodes[p] = append(nodes[p], id)
}
}
rs.mu.RUnlock()
for to, ctx := range nodes {
if to == rs.config.NodeID {
continue
}
msg := proto.GetMessage()
msg.Type = proto.ReqMsgHeartBeat
msg.From = rs.config.NodeID
msg.To = to
msg.Context = proto.EncodeHBConext(ctx)
rs.config.transport.Send(msg)
}
}
func (rs *RaftServer) handleHeartbeat(m *proto.Message) {
ctx := proto.DecodeHBContext(m.Context)
var respCtx proto.HeartbeatContext
rs.mu.RLock()
for _, id := range ctx {
if raft, ok := rs.rafts[id]; ok {
raft.reciveMessage(m)
respCtx = append(respCtx, id)
}
}
rs.mu.RUnlock()
msg := proto.GetMessage()
msg.Type = proto.RespMsgHeartBeat
msg.From = rs.config.NodeID
msg.To = m.From
msg.Context = proto.EncodeHBConext(respCtx)
rs.config.transport.Send(msg)
}
func (rs *RaftServer) handleHeartbeatResp(m *proto.Message) {
ctx := proto.DecodeHBContext(m.Context)
rs.mu.RLock()
defer rs.mu.RUnlock()
for _, id := range ctx {
if raft, ok := rs.rafts[id]; ok {
raft.reciveMessage(m)
}
}
}
func (rs *RaftServer) reciveMessage(m *proto.Message) {
if m.Type == proto.ReqMsgHeartBeat || m.Type == proto.RespMsgHeartBeat {
rs.heartc <- m
return
}
rs.mu.RLock()
raft, ok := rs.rafts[m.ID]
rs.mu.RUnlock()
if ok {
raft.reciveMessage(m)
}
}
func (rs *RaftServer) reciveSnapshot(req *snapshotRequest) {
rs.mu.RLock()
raft, ok := rs.rafts[req.header.ID]
rs.mu.RUnlock()
if !ok {
req.respond(ErrRaftNotExists)
return
}
raft.reciveSnapshot(req)
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
)
// The StateMachine interface is supplied by the application to apply commands and to persist/snapshot application data.
type StateMachine interface {
Apply(command []byte, index uint64) (interface{}, error)
ApplyMemberChange(confChange *proto.ConfChange, index uint64) (interface{}, error)
Snapshot() (proto.Snapshot, error)
ApplySnapshot(peers []proto.Peer, iter proto.SnapIterator) error
HandleFatalEvent(err *FatalError)
HandleLeaderChange(leader uint64)
}
type SocketType byte
const (
HeartBeat SocketType = 0
Replicate SocketType = 1
)
func (t SocketType) String() string {
switch t {
case 0:
return "HeartBeat"
case 1:
return "Replicate"
}
return "unkown"
}
// The SocketResolver interface is supplied by the application to resolve a NodeID to a network address.
type SocketResolver interface {
NodeAddress(nodeID uint64, stype SocketType) (addr string, err error)
}
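// Illustrative sketch (not part of the package): a minimal map-backed SocketResolver,
// assuming one address per node for both heartbeat and replicate traffic. mapResolver is
// a hypothetical type, and the error path assumes "fmt" is imported.
type mapResolver struct {
	addrs map[uint64]string // nodeID -> "host:port"
}

func (r *mapResolver) NodeAddress(nodeID uint64, stype SocketType) (string, error) {
	if addr, ok := r.addrs[nodeID]; ok {
		return addr, nil
	}
	return "", fmt.Errorf("no address for node %d (%s socket)", nodeID, stype)
}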
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
"time"
)
// DownReplica down replica
type DownReplica struct {
NodeID uint64
DownSeconds int
}
// ReplicaStatus replica status
type ReplicaStatus struct {
Match uint64 // replication progress (highest matched index)
Commit uint64 // commit position
Next uint64
State string
Snapshoting bool
Paused bool
Active bool
LastActive time.Time
Inflight int
}
// Status raft status
type Status struct {
ID uint64
NodeID uint64
Leader uint64
Term uint64
Index uint64
Commit uint64
Applied uint64
Vote uint64
PendQueue int
RecvQueue int
AppQueue int
Stopped bool
RestoringSnapshot bool
State string // leader, follower, candidate
Replicas map[uint64]*ReplicaStatus
}
func (s *Status) String() string {
st := "running"
if s.Stopped {
st = "stopped"
} else if s.RestoringSnapshot {
st = "snapshot"
}
j := fmt.Sprintf(`{"id":"%v","nodeID":"%v","state":"%v","leader":"%v","term":"%v","index":"%v","commit":"%v","applied":"%v","vote":"%v","pendingQueue":"%v",
"recvQueue":"%v","applyQueue":"%v","status":"%v","replication":{`, s.ID, s.NodeID, s.State, s.Leader, s.Term, s.Index, s.Commit, s.Applied, s.Vote, s.PendQueue, s.RecvQueue, s.AppQueue, st)
if len(s.Replicas) == 0 {
j += "}}"
} else {
for k, v := range s.Replicas {
p := "false"
if v.Paused {
p = "true"
}
subj := fmt.Sprintf(`"%v":{"match":"%v","commit":"%v","next":"%v","state":"%v","paused":"%v","inflight":"%v","active":"%v"},`, k, v.Match, v.Commit, v.Next, v.State, p, v.Inflight, v.Active)
j += subj
}
j = j[:len(j)-1] + "}}"
}
return j
}
// Copyright 2015 The etcd Authors
// Modified work copyright 2018 The tiglabs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package storage
import (
"errors"
"fmt"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
type fsm interface {
AppliedIndex(id uint64) uint64
}
// MemoryStorage is a circular in-memory storage that truncates entries once it grows over capacity,
// while trying to keep the capacity high.
type MemoryStorage struct {
fsm fsm
id uint64
// the truncation threshold
capacity uint64
// the index of the last truncation
truncIndex uint64
truncTerm uint64
// the starting offset within ents
start uint64
// the number of log entries currently held in ents
count uint64
// the allocated length of ents
size uint64
// the log entries, stored as a circular buffer
ents []*proto.Entry
hardState proto.HardState
}
func NewMemoryStorage(fsm fsm, id, capacity uint64) *MemoryStorage {
if logger.IsEnableWarn() {
logger.Warn("Memory Storage capacity is: %v.", capacity)
}
return &MemoryStorage{
fsm: fsm,
id: id,
capacity: capacity,
size: capacity,
ents: make([]*proto.Entry, capacity),
}
}
func DefaultMemoryStorage() *MemoryStorage {
return NewMemoryStorage(nil, 0, 4096)
}
func (ms *MemoryStorage) InitialState() (proto.HardState, error) {
return ms.hardState, nil
}
func (ms *MemoryStorage) FirstIndex() (uint64, error) {
return ms.truncIndex + 1, nil
}
func (ms *MemoryStorage) LastIndex() (uint64, error) {
return ms.lastIndex(), nil
}
func (ms *MemoryStorage) lastIndex() uint64 {
return ms.truncIndex + ms.count
}
func (ms *MemoryStorage) Term(index uint64) (term uint64, isCompact bool, err error) {
switch {
case index < ms.truncIndex:
return 0, true, nil
case index == ms.truncIndex:
return ms.truncTerm, false, nil
default:
return ms.ents[ms.locatePosition(index)].Term, false, nil
}
}
func (ms *MemoryStorage) Entries(lo, hi uint64, maxSize uint64) (entries []*proto.Entry, isCompact bool, err error) {
if lo <= ms.truncIndex {
return nil, true, nil
}
if hi > ms.lastIndex()+1 {
return nil, false, fmt.Errorf("[MemoryStorage->Entries]entries' hi(%d) is out of bound lastindex(%d)", hi, ms.lastIndex())
}
// the log holds no entries
if ms.count == 0 {
return nil, false, errors.New("requested entry at index is unavailable")
}
count := hi - lo
if count <= 0 {
return []*proto.Entry{}, false, nil
}
retEnts := make([]*proto.Entry, count)
pos := ms.locatePosition(lo)
retEnts[0] = ms.ents[pos]
size := ms.ents[pos].Size()
limit := uint64(1)
for ; limit < count; limit++ {
pos = pos + 1
if pos >= ms.size {
pos = pos - ms.size
}
size = size + ms.ents[pos].Size()
if uint64(size) > maxSize {
break
}
retEnts[limit] = ms.ents[pos]
}
return retEnts[:limit], false, nil
}
// StoreEntries is the equivalent of append in etcd raft
func (ms *MemoryStorage) StoreEntries(entries []*proto.Entry) error {
if len(entries) == 0 {
return nil
}
appIndex := uint64(0)
if ms.fsm != nil {
appIndex = ms.fsm.AppliedIndex(ms.id)
}
first := appIndex + 1
last := entries[0].Index + uint64(len(entries)) - 1
if last < first {
// shortcut if there is no new entry.
return nil
}
if first > entries[0].Index {
// truncate compacted entries
entries = entries[first-entries[0].Index:]
}
offset := entries[0].Index - ms.truncIndex - 1
if ms.count < offset {
logger.Error("missing log entry [last: %d, append at: %d]", ms.lastIndex(), entries[0].Index)
return nil
}
// resize and truncate compacted ents
entriesSize := uint64(len(entries))
maxSize := offset + entriesSize
minSize := maxSize - (appIndex - ms.truncIndex)
switch {
case minSize > ms.capacity:
// truncate compacted ents
if ms.truncIndex < appIndex {
ms.truncateTo(appIndex)
}
// grow ents
if minSize > ms.size {
ms.resize(ms.capacity+minSize, minSize)
}
default:
// truncate compacted ents
if maxSize > ms.capacity {
cmpIdx := util.Min(appIndex, maxSize-ms.capacity+ms.truncIndex)
if ms.truncIndex < cmpIdx {
ms.truncateTo(cmpIdx)
}
}
// shrink ents back to capacity
if ms.size > ms.capacity {
ms.resize(ms.capacity, maxSize)
}
}
// append new entries
start := ms.locatePosition(entries[0].Index)
next := start + entriesSize
if next <= ms.size {
copy(ms.ents[start:], entries)
if ms.start <= start {
ms.count = next - ms.start
} else {
ms.count = (ms.size - ms.start) + (next - 0)
}
} else {
count := ms.size - start
copy(ms.ents[start:], entries[0:count])
copy(ms.ents[0:], entries[count:])
ms.count = (ms.size - ms.start) + (entriesSize - count)
}
return nil
}
func (ms *MemoryStorage) StoreHardState(st proto.HardState) error {
ms.hardState = st
return nil
}
func (ms *MemoryStorage) ApplySnapshot(meta proto.SnapshotMeta) error {
ms.truncIndex = meta.Index
ms.truncTerm = meta.Term
ms.start = 0
ms.count = 0
ms.size = ms.capacity
ms.ents = make([]*proto.Entry, ms.capacity)
return nil
}
func (ms *MemoryStorage) Truncate(index uint64) error {
if index == 0 || index <= ms.truncIndex {
return errors.New("requested index is unavailable due to compaction")
}
if index > ms.lastIndex() {
return fmt.Errorf("compact %d is out of bound lastindex(%d)", index, ms.lastIndex())
}
ms.truncateTo(index)
return nil
}
func (ms *MemoryStorage) Close() {
}
func (ms *MemoryStorage) truncateTo(index uint64) {
ms.truncTerm = ms.ents[ms.locatePosition(index)].Term
ms.start = ms.locatePosition(index + 1)
ms.count = ms.count - (index - ms.truncIndex)
ms.truncIndex = index
}
func (ms *MemoryStorage) resize(capacity, needSize uint64) {
ents := make([]*proto.Entry, capacity)
count := util.Min(util.Min(capacity, ms.count), needSize)
next := ms.start + count
if next <= ms.size {
copy(ents, ms.ents[ms.start:next])
} else {
next = next - ms.size
copy(ents, ms.ents[ms.start:])
copy(ents[ms.size-ms.start:], ms.ents[0:next])
}
ms.start = 0
ms.count = count
ms.size = capacity
ms.ents = ents
}
func (ms *MemoryStorage) locatePosition(index uint64) uint64 {
position := ms.start + (index - ms.truncIndex - 1)
if position >= ms.size {
position = position - ms.size
}
return position
}
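// Illustrative sketch (not part of the package; exampleMemoryStorage is a hypothetical
// helper): entries are appended with StoreEntries, served with Entries, and dropped from
// the front with Truncate once they are no longer needed.
func exampleMemoryStorage() error {
	ms := DefaultMemoryStorage()
	if err := ms.StoreEntries([]*proto.Entry{{Index: 1, Term: 1}, {Index: 2, Term: 1}}); err != nil {
		return err
	}
	ents, isCompact, err := ms.Entries(1, 3, 1<<20) // read [1, 3) with a 1MB size budget
	if err != nil {
		return err
	}
	_, _ = ents, isCompact // two entries, isCompact is false here
	return ms.Truncate(1)  // drop index 1; FirstIndex() becomes 2
}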
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import "github.com/cubefs/cubefs/depends/tiglabs/raft/util"
const (
DefaultFileCacheCapacity = 2
DefaultFileSize = 32 * util.MB
MinFileSize = 1 * util.MB
MaxRotateInterval = 86400
DefaultSync = false
)
// Config wal config
type Config struct {
// FileCacheCapacity is the number of open log files to cache (including their indexes, etc.)
FileCacheCapacity int
// FileSize is the size of each log file
FileSize int
Sync bool
// TruncateFirstDummy appends one dummy log entry at initialization and then truncates it
TruncateFirstDummy bool
}
func (c *Config) GetFileCacheCapacity() int {
if c == nil || c.FileCacheCapacity <= 0 {
return DefaultFileCacheCapacity
}
return c.FileCacheCapacity
}
func (c *Config) GetFileSize() int {
if c == nil || c.FileSize <= 0 {
return DefaultFileSize
}
return c.FileSize
}
func (c *Config) GetSync() bool {
if c == nil {
return DefaultSync
}
return c.Sync
}
func (c *Config) GetTruncateFirstDummy() bool {
if c == nil {
return false
}
return c.TruncateFirstDummy
}
func (c *Config) dup() *Config {
if c != nil {
dc := *c
return &dc
} else {
return nil
}
}
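// Illustrative sketch (not part of the package; exampleConfigDefaults is a hypothetical
// helper): a nil *Config is valid, and the getters above fall back to the package defaults.
func exampleConfigDefaults() (int, int, bool) {
	var c *Config
	return c.GetFileCacheCapacity(), c.GetFileSize(), c.GetSync()
	// => DefaultFileCacheCapacity, DefaultFileSize, DefaultSync
}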
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/google/btree"
)
type cacheItem proto.Entry
func (c *cacheItem) Less(than btree.Item) bool {
return c.Index < than.(*cacheItem).Index
}
// the cache keeps only the newest (largest-index) entries
type entryCache struct {
capacity int
ents *btree.BTree
key *cacheItem
}
func newEntryCache(capacity int) *entryCache {
return &entryCache{
capacity: capacity,
ents: btree.New(4),
key: new(cacheItem),
}
}
func (c *entryCache) Get(index uint64) *proto.Entry {
c.key.Index = index
ent := c.ents.Get(c.key)
if ent != nil {
return (*proto.Entry)(ent.(*cacheItem))
} else {
return nil
}
}
func (c *entryCache) Append(ent *proto.Entry) {
// truncate conflicting entries
for c.ents.Len() > 0 && c.ents.Max().(*cacheItem).Index >= ent.Index {
c.ents.DeleteMax()
}
c.ents.ReplaceOrInsert((*cacheItem)(ent))
// keep capacity
for c.ents.Len() > c.capacity {
c.ents.DeleteMin()
}
}
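// Illustrative sketch (not part of the package; exampleEntryCache is a hypothetical
// helper): the cache keeps only the newest `capacity` entries, and appending an entry
// whose index conflicts with cached ones truncates the conflicting suffix first.
func exampleEntryCache() *proto.Entry {
	c := newEntryCache(2)
	c.Append(&proto.Entry{Index: 10, Term: 1})
	c.Append(&proto.Entry{Index: 11, Term: 1})
	c.Append(&proto.Entry{Index: 12, Term: 1}) // capacity 2: index 10 is evicted
	return c.Get(10)                           // nil, the entry was evicted
}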
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import "container/list"
type openFunc func(logFileName) (*logEntryFile, error)
type logFileCache struct {
capacity int
l *list.List
m map[logFileName]*list.Element // keyed by log file name
f openFunc
}
func newLogFileCache(capacity int, f openFunc) *logFileCache {
return &logFileCache{
capacity: capacity,
l: list.New(),
m: make(map[logFileName]*list.Element, capacity),
f: f,
}
}
func (lc *logFileCache) Get(name logFileName) (lf *logEntryFile, err error) {
e, ok := lc.m[name]
if ok {
lf = (e.Value).(*logEntryFile)
lc.l.MoveToFront(e)
return
}
// not cached, open a new one
lf, err = lc.f(name)
if err != nil {
return
}
// cache it
e = lc.l.PushFront(lf)
lc.m[name] = e
// keep capacity
for lc.l.Len() > lc.capacity {
e = lc.l.Back()
df := (e.Value).(*logEntryFile)
if err = lc.Delete(df.Name(), true); err != nil {
return nil, err
}
}
return
}
func (lc *logFileCache) Delete(name logFileName, close bool) error {
e, ok := lc.m[name]
if !ok {
return nil
}
lf := e.Value.(*logEntryFile)
if close {
if err := lf.Close(); err != nil {
return err
}
}
delete(lc.m, lf.Name())
lc.l.Remove(e)
return nil
}
func (lc *logFileCache) Close() (err error) {
for _, e := range lc.m {
f := (e.Value).(*logEntryFile)
err = f.Close()
}
return
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"errors"
"fmt"
"io"
"os"
"sort"
)
// initDir creates the directory if it does not exist; if it exists, it checks that the path is a directory.
func initDir(dir string) error {
info, err := os.Stat(dir)
if err != nil {
if pathErr, ok := err.(*os.PathError); ok {
if os.IsNotExist(pathErr) {
return os.MkdirAll(dir, 0755)
}
}
return err
}
if !info.IsDir() {
return errors.New("fbase/raftstore: path is not directory")
}
return nil
}
// log file names have the form seq-index.log
type logFileName struct {
seq uint64 // file sequence number
index uint64 // index of the first log entry in this file
}
func (l *logFileName) String() string {
return fmt.Sprintf("%016x-%016x.log", l.seq, l.index)
}
func (l *logFileName) ParseFrom(s string) bool {
_, err := fmt.Sscanf(s, "%016x-%016x.log", &l.seq, &l.index)
return err == nil
}
type nameSlice []logFileName
func (s nameSlice) Len() int { return len(s) }
func (s nameSlice) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s nameSlice) Less(i, j int) bool { return s[i].seq < s[j].seq }
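// Illustrative sketch (not part of the package; exampleLogFileName is a hypothetical
// helper): file names round-trip through String and ParseFrom, e.g. seq 3 with start
// index 128 maps to "0000000000000003-0000000000000080.log".
func exampleLogFileName() bool {
	n := logFileName{seq: 3, index: 128}
	var parsed logFileName
	return parsed.ParseFrom(n.String()) && parsed == n
}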
// listLogEntryFiles enumerates all log files in the directory and sorts them by sequence number
func listLogEntryFiles(path string) (fnames []logFileName, err error) {
dir, err := os.Open(path)
if err != nil {
return nil, err
}
defer dir.Close()
names, err := dir.Readdirnames(0)
if err != nil {
return nil, err
}
for _, name := range names {
var n logFileName
if n.ParseFrom(name) {
fnames = append(fnames, n)
}
}
sort.Sort(nameSlice(fnames))
return
}
// fallocDegraded is a degraded fallback for preallocating file space
func fallocDegraded(f *os.File, sizeInBytes int64) error {
curOff, err := f.Seek(0, io.SeekCurrent)
if err != nil {
return err
}
size, err := f.Seek(sizeInBytes, io.SeekEnd)
if err != nil {
return err
}
if _, err = f.Seek(curOff, io.SeekStart); err != nil {
return err
}
if sizeInBytes > size {
return nil
}
return f.Truncate(sizeInBytes)
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build linux
// +build linux
package wal
import (
"os"
"syscall"
)
const (
fallocateModeDefault uint32 = 0 // in default mode the preallocated space is zero-filled
fallocateModeKeepSize uint32 = 1 // keep the original file size after preallocation, do not zero-fill
)
func fdatasync(f *os.File) error {
return syscall.Fdatasync(int(f.Fd()))
}
// preallocate space and zero-fill it
func fallocate(f *os.File, sizeInBytes int64) error {
err := syscall.Fallocate(int(f.Fd()), fallocateModeDefault, 0, sizeInBytes)
if err != nil {
errno, ok := err.(syscall.Errno)
if ok && (errno == syscall.ENOTSUP || errno == syscall.EINTR) {
return fallocDegraded(f, sizeInBytes)
}
}
return err
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"bytes"
"io"
"os"
"path"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util/log"
)
type logEntryFile struct {
dir string
name logFileName
f *os.File
r recordReadAt
w *recordWriter
index logEntryIndex
}
func openLogEntryFile(dir string, name logFileName, isLastOne bool) (*logEntryFile, error) {
p := path.Join(dir, name.String())
f, err := os.OpenFile(p, os.O_RDWR|os.O_APPEND, 0600)
if err != nil {
return nil, err
}
lf := &logEntryFile{
dir: dir,
name: name,
f: f,
r: newRecordReader(f),
}
if !isLastOne {
// read the index data
if err = lf.ReadIndex(); err != nil {
return nil, err
}
} else {
// rebuild the index
toffset, err := lf.ReBuildIndex()
if err != nil && err != io.ErrUnexpectedEOF && !IsErrCorrupt(err) {
return nil, err
}
// open for writing
if err = lf.OpenWrite(); err != nil {
return nil, err
}
// truncate the index record and everything after it
if toffset >= 0 {
log.Warn("truncate last logfile's N@%d index at: %d", lf.name.seq, toffset)
if err := lf.w.Truncate(toffset); err != nil {
return nil, err
}
}
}
return lf, nil
}
func createLogEntryFile(dir string, name logFileName) (*logEntryFile, error) {
p := path.Join(dir, name.String())
f, err := os.OpenFile(p, os.O_RDWR|os.O_CREATE|os.O_TRUNC|os.O_APPEND, 0600)
if err != nil {
return nil, err
}
lf := &logEntryFile{
dir: dir,
name: name,
f: f,
r: newRecordReader(f),
}
if err := lf.OpenWrite(); err != nil {
return nil, err
}
return lf, nil
}
func (lf *logEntryFile) ReadIndex() error {
info, err := lf.f.Stat()
if err != nil {
return err
}
// read footer
var footer footerRecord
if info.Size() < int64(footer.Size()) {
return NewCorruptError(lf.f.Name(), 0, "too small footer")
}
offset := info.Size() - int64(recordSize(footer))
rec, err := lf.r.ReadAt(offset)
if err != nil {
return err
}
if rec.recType != recTypeFooter {
return NewCorruptError(lf.f.Name(), offset, "wrong footer record type")
}
if rec.dataLen != footer.Size() {
return NewCorruptError(lf.f.Name(), offset, "wrong footer size")
}
footer.Decode(rec.data)
if !bytes.Equal(footer.magic, footerMagic) {
return NewCorruptError(lf.f.Name(), offset, "wrong footer magic")
}
// read index data
offset = int64(footer.indexOffset)
rec, err = lf.r.ReadAt(offset)
if err != nil {
return err
}
if rec.recType != recTypeIndex {
return NewCorruptError(lf.f.Name(), offset, "wrong index record type")
}
lf.index = decodeLogIndex(rec.data)
return nil
}
func (lf *logEntryFile) ReBuildIndex() (truncateOffset int64, err error) {
lf.index = nil
// get the file size
info, err := lf.f.Stat()
if err != nil {
return 0, err
}
filesize := info.Size()
var (
rec record
offset int64
nextRecordOffset int64
)
r := newRecordReader(lf.f)
for {
offset, rec, err = r.Read()
if err != nil {
break
}
nextRecordOffset = r.offset
// for log entry records, update the index
if rec.recType == recTypeLogEntry {
ent := &proto.Entry{}
ent.Decode(rec.data)
lf.index = lf.index.Append(uint32(offset), ent)
} else {
// All valid log entries have been loaded
return offset, nil
}
}
if err == io.EOF {
err = nil
}
if filesize != nextRecordOffset {
log.Warn("logName[%v],fileSize[%v],corrupt data after offset[%v]", lf.name, filesize, nextRecordOffset)
}
return offset, err
}
func (lf *logEntryFile) Name() logFileName {
return lf.name
}
func (lf *logEntryFile) Seq() uint64 {
return lf.name.seq
}
func (lf *logEntryFile) Len() int {
return lf.index.Len()
}
func (lf *logEntryFile) FirstIndex() uint64 {
return lf.index.First()
}
func (lf *logEntryFile) LastIndex() uint64 {
return lf.index.Last()
}
// Get get log entry
func (lf *logEntryFile) Get(i uint64) (*proto.Entry, error) {
item, err := lf.index.Get(i)
if err != nil {
return nil, err
}
rec, err := lf.r.ReadAt(int64(item.offset))
if err != nil {
return nil, err
}
ent := &proto.Entry{}
ent.Decode(rec.data)
return ent, nil
}
// Term get log's term
func (lf *logEntryFile) Term(i uint64) (uint64, error) {
item, err := lf.index.Get(i)
if err != nil {
return 0, err
}
return item.logterm, nil
}
// Truncate truncates the log from the given index onward
func (lf *logEntryFile) Truncate(index uint64) error {
if lf.Len() == 0 {
return nil
}
item, err := lf.index.Get(index)
if err != nil {
return err
}
// truncate the file
offset := int64(item.offset)
if err = lf.w.Truncate(offset); err != nil {
return err
}
// truncate the index
lf.index, err = lf.index.Truncate(index)
return err
}
func (lf *logEntryFile) Save(ent *proto.Entry) error {
// write to the file
offset := lf.w.Offset()
if err := lf.w.Write(recTypeLogEntry, ent); err != nil {
return err
}
// update the index
lf.index = lf.index.Append(uint32(offset), ent)
return nil
}
func (lf *logEntryFile) OpenWrite() error {
if lf.w != nil {
return nil
}
lf.w = newRecordWriter(lf.f)
return nil
}
func (lf *logEntryFile) WriteOffset() int64 {
return lf.w.Offset()
}
func (lf *logEntryFile) Flush() error {
return lf.w.Flush()
}
// Sync flush write buffer and sync to disk
func (lf *logEntryFile) Sync() error {
return lf.w.Sync()
}
func (lf *logEntryFile) FinishWrite() error {
var err error
// write log index data
recOffset := lf.w.Offset()
if err = lf.w.Write(recTypeIndex, lf.index); err != nil {
return err
}
// write log file footer
footer := &footerRecord{
indexOffset: uint64(recOffset),
}
if err = lf.w.Write(recTypeFooter, footer); err != nil {
return err
}
if err := lf.w.Close(); err != nil {
return err
}
lf.w = nil
return nil
}
// Close closes the writer (if open) and the underlying file
func (lf *logEntryFile) Close() error {
if lf.w != nil {
if err := lf.w.Close(); err != nil {
return err
}
lf.w = nil
}
return lf.f.Close()
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"encoding/binary"
"fmt"
"io"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
)
const indexItemSize = 8 + 8 + 4
type indexItem struct {
logindex uint64 // index of the log entry
logterm uint64 // term of the log entry
offset uint32 // offset of the entry within the file
}
type logEntryIndex []indexItem
func (li logEntryIndex) First() uint64 {
if len(li) == 0 {
return 0
}
return li[0].logindex
}
func (li logEntryIndex) Last() uint64 {
size := len(li)
if size == 0 {
return 0
}
return li[size-1].logindex
}
func (li logEntryIndex) Get(i uint64) (item indexItem, err error) {
size := len(li)
if size == 0 {
err = fmt.Errorf("maybe index(%d) is out of bound lastindex(%d)", i, li.Last())
return
}
ibegin := li[0].logindex
iend := li[size-1].logindex
if i < ibegin || i > iend {
err = fmt.Errorf("maybe index(%d) is out of bound lastindex(%d)", i, li.Last())
return
}
return li[i-ibegin], nil
}
func (li logEntryIndex) Append(offset uint32, entry *proto.Entry) logEntryIndex {
return append(li, indexItem{
logindex: entry.Index,
logterm: entry.Term,
offset: offset,
})
}
func (li logEntryIndex) Truncate(i uint64) (logEntryIndex, error) {
if _, err := li.Get(i); err != nil {
return nil, err
}
return li[:i-li[0].logindex], nil
}
func (li logEntryIndex) Len() int {
return len(li)
}
// Encode implements the recordData interface's Encode method
func (li logEntryIndex) Encode(w io.Writer) (err error) {
u32Buf := make([]byte, 4)
u64Buf := make([]byte, 8)
// write index items count
binary.BigEndian.PutUint32(u32Buf, uint32(li.Len()))
if _, err = w.Write(u32Buf); err != nil {
return
}
// write index items
for _, item := range li {
// logindex
binary.BigEndian.PutUint64(u64Buf, item.logindex)
if _, err = w.Write(u64Buf); err != nil {
return
}
// logterm
binary.BigEndian.PutUint64(u64Buf, item.logterm)
if _, err = w.Write(u64Buf); err != nil {
return
}
// logoffset
binary.BigEndian.PutUint32(u32Buf, item.offset)
if _, err = w.Write(u32Buf); err != nil {
return
}
}
return
}
// Size implements the recordData interface's Size method
func (li logEntryIndex) Size() uint64 {
return uint64(4 + li.Len()*indexItemSize)
}
func decodeLogIndex(data []byte) logEntryIndex {
offset := 0
nItems := binary.BigEndian.Uint32(data[offset:])
offset += 4
li := make([]indexItem, nItems)
for i := 0; i < int(nItems); i++ {
li[i].logindex = binary.BigEndian.Uint64(data[offset:])
offset += 8
li[i].logterm = binary.BigEndian.Uint64(data[offset:])
offset += 8
li[i].offset = binary.BigEndian.Uint32(data[offset:])
offset += 4
}
return li
}
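// Illustrative sketch (not part of the package; exampleIndexRoundTrip is a hypothetical
// helper and assumes "bytes" is imported): Encode and decodeLogIndex are inverses, each
// item being 8 bytes of log index, 8 bytes of term and 4 bytes of file offset, preceded
// by a 4-byte item count.
func exampleIndexRoundTrip() (logEntryIndex, error) {
	var li logEntryIndex
	li = li.Append(0, &proto.Entry{Index: 100, Term: 7})
	li = li.Append(53, &proto.Entry{Index: 101, Term: 7})
	var buf bytes.Buffer
	if err := li.Encode(&buf); err != nil {
		return nil, err
	}
	return decodeLogIndex(buf.Bytes()), nil // equal to li
}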
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"fmt"
"os"
"path"
"sort"
"math"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util/log"
"github.com/cubefs/cubefs/util/timeutil"
)
type logEntryStorage struct {
s *Storage
dir string
filesize int
rotateTime int64
logfiles []logFileName // names of all log files
last *logEntryFile
nextFileSeq uint64
cache *logFileCache
}
func openLogStorage(dir string, s *Storage) (*logEntryStorage, error) {
ls := &logEntryStorage{
s: s,
dir: dir,
filesize: s.c.GetFileSize(),
rotateTime: timeutil.GetCurrentTimeUnix(),
nextFileSeq: 1,
}
// cache
ls.cache = newLogFileCache(s.c.GetFileCacheCapacity(),
func(name logFileName) (*logEntryFile, error) {
return openLogEntryFile(ls.dir, name, false)
})
// open
if err := ls.open(); err != nil {
return nil, err
}
return ls, nil
}
func (ls *logEntryStorage) open() error {
names, err := listLogEntryFiles(ls.dir)
if err != nil {
return err
}
// no existing log files, create the first one
if len(names) == 0 {
f, err := ls.createNew(1)
if err != nil {
return err
}
ls.logfiles = append(ls.logfiles, f.Name())
ls.last = f
return nil
}
nlen := len(names)
ls.nextFileSeq = names[nlen-1].seq + 1 // set nextFileSeq to the largest existing seq plus one
ls.logfiles = append(ls.logfiles, names...)
f, err := openLogEntryFile(ls.dir, ls.logfiles[nlen-1], true) // open the last file
if err != nil {
return err
}
ls.last = f
return nil
}
func (ls *logEntryStorage) Term(i uint64) (term uint64, isCompact bool, err error) {
lf, err := ls.locateFile(i)
if err != nil {
return
}
term, err = lf.Term(i)
return
}
func (ls *logEntryStorage) LastIndex() uint64 {
// the last log file is empty
if ls.last.Len() == 0 {
if len(ls.logfiles) > 1 { // the previous file's last index is this file's start index minus one
return ls.last.name.index - 1
}
return 0
}
return ls.last.LastIndex()
}
func (ls *logEntryStorage) Entries(lo, hi uint64, maxSize uint64) (entries []*proto.Entry, isCompact bool, err error) {
if lo > ls.LastIndex() {
err = fmt.Errorf("entries's hi(%d) is out of bound lastindex(%d)", hi, ls.LastIndex())
return
}
si := ls.locate(lo)
lfs := ls.logfiles[si:]
var ent *proto.Entry
var lf *logEntryFile
i := lo
var size uint64
// read entries from the log files
for _, fn := range lfs {
if fn.index >= hi {
return
}
lf, err = ls.get(fn)
if err != nil {
return
}
for i <= lf.LastIndex() {
ent, err = lf.Get(i)
if err != nil {
return
}
if i >= hi {
return
}
size += ent.Size()
entries = append(entries, ent)
i++
if size > maxSize {
return
}
}
}
return
}
func (ls *logEntryStorage) SaveEntries(ents []*proto.Entry) error {
if len(ents) == 0 {
return nil
}
if err := ls.truncateBack(ents[0].Index); err != nil {
return err
}
for _, ent := range ents {
if err := ls.saveEntry(ent); err != nil {
return err
}
}
// flush the application-level buffer and write it to the file
if err := ls.last.Flush(); err != nil {
return err
}
return nil
}
func (ls *logEntryStorage) Sync() error {
return ls.last.Sync()
}
// TruncateFront truncates from the front and is used to delete old data; a file is removed only when all of its entries are old.
func (ls *logEntryStorage) TruncateFront(index uint64) error {
truncFIndex := -1
for i := 0; i < len(ls.logfiles)-1; i++ {
if ls.logfiles[i+1].index-1 <= index {
truncFIndex = i
} else {
break
}
}
for i := 0; i <= truncFIndex; i++ {
if err := ls.remove(ls.logfiles[i]); err != nil {
return err
}
}
if truncFIndex >= 0 {
ls.logfiles = ls.logfiles[truncFIndex+1:]
}
return nil
}
// TruncateAll removes all log files.
func (ls *logEntryStorage) TruncateAll() error {
for _, f := range ls.logfiles {
if err := ls.remove(f); err != nil {
return err
}
}
ls.nextFileSeq = 1
ls.logfiles = nil
lf, err := ls.createNew(1)
if err != nil {
return err
}
ls.last = lf
ls.logfiles = append(ls.logfiles, lf.Name())
return nil
}
// truncateBack truncates from the back and is used to delete conflicting entries.
func (ls *logEntryStorage) truncateBack(index uint64) error {
if ls.LastIndex() < index {
return nil
}
if ls.logfiles[0].index >= index {
return ls.TruncateAll()
}
idx := ls.locate(index)
if idx == len(ls.logfiles)-1 { // the conflict is in the last file
if err := ls.last.Truncate(index); err != nil {
return err
}
} else {
for i := idx + 1; i < len(ls.logfiles); i++ {
if err := ls.remove(ls.logfiles[i]); err != nil {
return err
}
}
n := ls.logfiles[idx]
lf, err := ls.get(n)
if err != nil {
return err
}
ls.cache.Delete(n, false)
ls.last = lf
if err := ls.last.OpenWrite(); err != nil {
return err
}
if err := ls.last.Truncate(index); err != nil {
return err
}
ls.logfiles = ls.logfiles[:idx+1]
ls.nextFileSeq = n.seq + 1
}
return nil
}
func (ls *logEntryStorage) createNew(index uint64) (*logEntryFile, error) {
name := logFileName{seq: ls.nextFileSeq, index: index}
f, err := createLogEntryFile(ls.dir, name)
if err != nil {
return nil, err
}
ls.nextFileSeq++
return f, nil
}
func (ls *logEntryStorage) get(name logFileName) (*logEntryFile, error) {
if name.seq == ls.last.Seq() {
return ls.last, nil
}
return ls.cache.Get(name)
}
func (ls *logEntryStorage) remove(name logFileName) error {
_ = ls.cache.Delete(name, true)
return os.Remove(path.Join(ls.dir, name.String()))
}
// rotate creates a new log file once the current one is full
func (ls *logEntryStorage) rotate() error {
prevLast := ls.last.LastIndex()
if err := ls.last.FinishWrite(); err != nil {
return err
}
if err := ls.last.Close(); err != nil {
return err
}
lf, err := ls.createNew(prevLast + 1)
if err != nil {
return err
}
ls.last = lf
ls.logfiles = append(ls.logfiles, lf.Name())
return nil
}
func (ls *logEntryStorage) size() int {
return len(ls.logfiles)
}
func (ls *logEntryStorage) locate(logindex uint64) int {
fi := sort.Search(len(ls.logfiles), func(i int) bool {
var nextIndex uint64
if i == len(ls.logfiles)-1 {
nextIndex = math.MaxUint64
} else {
nextIndex = ls.logfiles[i+1].index
}
return logindex < nextIndex
})
return fi
}
func (ls *logEntryStorage) locateFile(logindex uint64) (*logEntryFile, error) {
i := ls.locate(logindex)
if i >= len(ls.logfiles) {
panic("could not find log file")
}
return ls.get(ls.logfiles[i])
}
func (ls *logEntryStorage) saveEntry(ent *proto.Entry) error {
// check that the appended entry is contiguous with the existing log
prevIndex := ls.LastIndex()
if prevIndex != 0 {
if prevIndex+1 != ent.Index {
return fmt.Errorf("append discontinuous log. prev index: %d, current: %d", prevIndex, ent.Index)
}
}
// check whether the current file is full;
// also force a rotate when the file exceeds MinFileSize and was created more than MaxRotateInterval ago,
// which greatly reduces the memory overhead of the log index
woffset := ls.last.WriteOffset()
if uint64(woffset)+uint64(recordSize(ent)) > uint64(ls.filesize) ||
(woffset > MinFileSize && timeutil.GetCurrentTimeUnix()-ls.rotateTime > MaxRotateInterval) {
ls.rotateTime = timeutil.GetCurrentTimeUnix()
if err := ls.rotate(); err != nil {
return err
}
}
if err := ls.last.Save(ent); err != nil {
return err
}
return nil
}
func (ls *logEntryStorage) Close() {
if err := ls.cache.Close(); err != nil {
log.Warn("close log file cache error: %v", err)
}
if err := ls.last.Close(); err != nil {
log.Warn("close log file %s error: %v", ls.last.Name(), err)
}
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"encoding/binary"
"io"
"os"
"path"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util/bufalloc"
)
type truncateMeta struct {
truncIndex uint64
truncTerm uint64
}
func (m truncateMeta) Size() uint64 {
return 16
}
func (m truncateMeta) Encode(b []byte) {
binary.BigEndian.PutUint64(b, m.truncIndex)
binary.BigEndian.PutUint64(b[8:], m.truncTerm)
}
func (m *truncateMeta) Decode(b []byte) {
m.truncIndex = binary.BigEndian.Uint64(b)
m.truncTerm = binary.BigEndian.Uint64(b[8:])
}
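// Encode/Decode round-trip sketch (illustrative): truncateMeta is a fixed
// 16-byte big-endian record, so a scratch buffer of Size() bytes is enough.
//
//	in := truncateMeta{truncIndex: 100, truncTerm: 7}
//	buf := make([]byte, in.Size()) // 16 bytes: 8 for the index, 8 for the term
//	in.Encode(buf)
//	var out truncateMeta
//	out.Decode(buf) // out == in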
// metaFile stores the HardState and the truncateMeta record
type metaFile struct {
f *os.File
truncOffset int64
}
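// META file layout implied by the write offsets used below (a sketch, assuming
// proto.HardState.Size() is constant for the zero value):
//
//	offset 0:          HardState    (hs.Size() bytes)
//	offset hs.Size():  truncateMeta (16 bytes)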
func openMetaFile(dir string) (mf *metaFile, hs proto.HardState, meta truncateMeta, err error) {
f, err := os.OpenFile(path.Join(dir, "META"), os.O_RDWR|os.O_CREATE, 0600)
if err != nil {
return
}
mf = &metaFile{
f: f,
truncOffset: int64(hs.Size()),
}
hs, meta, err = mf.load()
return mf, hs, meta, err
}
func (mf *metaFile) load() (hs proto.HardState, meta truncateMeta, err error) {
// load hardstate
hs_size := int(hs.Size())
buffer := bufalloc.AllocBuffer(hs_size)
defer bufalloc.FreeBuffer(buffer)
buf := buffer.Alloc(hs_size)
n, err := mf.f.Read(buf)
if err != nil {
if err == io.EOF {
err = nil
return
}
return
}
if n != hs_size {
err = NewCorruptError("META", 0, "wrong hardstate data size")
return
}
hs.Decode(buf)
// load trunc meta
buffer.Reset()
mt_size := int(meta.Size())
buf = buffer.Alloc(mt_size)
n, err = mf.f.Read(buf)
if err != nil {
if err == io.EOF {
err = nil
return
}
return
}
if n != mt_size {
err = NewCorruptError("META", 0, "wrong truncmeta data size")
return
}
meta.Decode(buf)
return
}
func (mf *metaFile) Close() error {
return mf.f.Close()
}
func (mf *metaFile) SaveTruncateMeta(meta truncateMeta) error {
mt_size := int(meta.Size())
buffer := bufalloc.AllocBuffer(mt_size)
defer bufalloc.FreeBuffer(buffer)
b := buffer.Alloc(mt_size)
meta.Encode(b)
_, err := mf.f.WriteAt(b, mf.truncOffset)
return err
}
func (mf *metaFile) SaveHardState(hs proto.HardState) error {
hs_size := int(hs.Size())
buffer := bufalloc.AllocBuffer(hs_size)
defer bufalloc.FreeBuffer(buffer)
b := buffer.Alloc(hs_size)
hs.Encode(b)
_, err := mf.f.WriteAt(b, 0)
return err
}
func (mf *metaFile) Sync() error {
return mf.f.Sync()
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"io"
"encoding/binary"
"fmt"
)
// Log file ({seq}.log) layout:
// [log record]
// ...
// [log record]
// [index record]
// [footer record]
// ErrCorrupt error
type ErrCorrupt struct {
filename string
offset int64
reason string
}
func (e *ErrCorrupt) Error() string {
return fmt.Sprintf("corrput data at %s:%d (%v)", e.filename, e.offset, e.reason)
}
// NewCorruptError new
func NewCorruptError(filename string, offset int64, reason string) *ErrCorrupt {
return &ErrCorrupt{
filename: filename,
offset: offset,
reason: reason,
}
}
func IsErrCorrupt(err error) (is bool) {
if err == nil {
return
}
_, is = err.(*ErrCorrupt)
return
}
type recordType uint8
const (
recTypeLogEntry recordType = 1
recTypeIndex recordType = 2
recTypeFooter recordType = 3
)
func (rt recordType) Valid() bool {
switch rt {
case recTypeLogEntry, recTypeIndex, recTypeFooter:
return true
default:
}
return false
}
func (rt recordType) String() string {
switch rt {
case recTypeLogEntry:
return "type-log"
case recTypeIndex:
return "type-index"
case recTypeFooter:
return "type-footer"
default:
return fmt.Sprintf("type-unknown(%d)", uint8(rt))
}
}
var footerMagic = []byte{'\xf9', '\xbf', '\x3e', '\x0a', '\xd3', '\xc5', '\xcc', '\x3f'}
// on-disk record layout
type record struct {
recType recordType // one byte: record type
dataLen uint64 // eight bytes, big-endian: payload length
data []byte // payload bytes: recordData.Encode()
crc uint32 // four bytes: CRC over the payload
}
// recordSize returns the maximum number of bytes needed to write a record
func recordSize(data recordData) int {
return 1 + 8 + int(data.Size()) + 4
}
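// On-disk framing sketch: [1B type][8B big-endian dataLen][dataLen bytes payload][4B crc].
// For example, a footer record occupies:
//
//	recordSize(footerRecord{}) // 1 + 8 + 16 + 4 = 29 bytes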
type recordData interface {
Encode(w io.Writer) error
Size() uint64
}
type footerRecord struct {
indexOffset uint64
magic []byte
}
func (fr footerRecord) Encode(w io.Writer) (err error) {
buf := make([]byte, 8)
binary.BigEndian.PutUint64(buf, fr.indexOffset)
if _, err = w.Write(buf); err != nil {
return
}
if _, err = w.Write(footerMagic); err != nil {
return
}
return nil
}
func (fr footerRecord) Size() uint64 {
return 16
}
func (fr *footerRecord) Decode(data []byte) {
fr.indexOffset = binary.BigEndian.Uint64(data)
fr.magic = data[8 : 8+len(footerMagic)]
}
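// Decoding sketch (illustrative; data here stands for the 16-byte payload of a
// recTypeFooter record):
//
//	var fr footerRecord
//	fr.Decode(data)
//	// fr.indexOffset is the file offset of the index record;
//	// bytes.Equal(fr.magic, footerMagic) should hold for a well-formed footer.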
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"bufio"
"encoding/binary"
"io"
"math"
"os"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
// After initialization is complete, records may only be read via ReadAt.
type recordReadAt interface {
ReadAt(offset int64) (rec record, err error)
}
const defaultReadBufferedSize = 512
type bufferedReader struct {
r *bufio.Reader
}
func newBufferedReader(f *os.File) *bufferedReader {
return &bufferedReader{
r: bufio.NewReaderSize(f, defaultReadBufferedSize),
}
}
func (br *bufferedReader) Read(p []byte) (total int, err error) {
n := 0
for {
n, err = br.r.Read(p)
if err != nil {
return
}
total += n
switch {
case n == len(p):
return
case n < len(p):
p = p[n:]
default:
panic("invalid read buffer")
}
}
}
type recordReader struct {
br *bufferedReader
offset int64 // start offset of the current record
sr io.ReaderAt // random-access reads
filename string
typeLenBuf []byte
}
func newRecordReader(f *os.File) *recordReader {
return &recordReader{
br: newBufferedReader(f),
sr: f,
filename: f.Name(),
typeLenBuf: make([]byte, 9), // 1-byte type + 8-byte dataLen
}
}
// Read reads the next record sequentially.
func (r *recordReader) Read() (recStartOffset int64, rec record, err error) {
recStartOffset = r.offset
// read record type and data len
n, err := r.br.Read(r.typeLenBuf)
if err != nil {
return
}
if n != len(r.typeLenBuf) {
if n < 1 {
err = NewCorruptError(r.filename, recStartOffset, "too small record type")
} else {
err = NewCorruptError(r.filename, recStartOffset, "too small record datalen")
}
return
}
// Decode and validate record type
rec.recType = recordType(r.typeLenBuf[0])
if !rec.recType.Valid() {
err = NewCorruptError(r.filename, recStartOffset, "illegal record type")
return
}
// Decode and validate record data length
rec.dataLen = binary.BigEndian.Uint64(r.typeLenBuf[1:])
if rec.dataLen+4 <= 0 || rec.dataLen > math.MaxUint32 {
err = NewCorruptError(r.filename, recStartOffset, "illegal data length")
return
}
// read data and crc
// WARN: a buffer pool must not be used here, because log entries are decoded without copying the data
rec.data = make([]byte, rec.dataLen+4)
n, err = r.br.Read(rec.data)
if err != nil {
return
}
if uint64(n) != rec.dataLen+4 {
err = NewCorruptError(r.filename, recStartOffset, "data size unmatch or too small crc")
return
}
// decode crc
rec.crc = binary.BigEndian.Uint32(rec.data[len(rec.data)-4:])
// truncate crc
rec.data = rec.data[:len(rec.data)-4]
// checksum
crc := util.NewCRC(rec.data)
if rec.crc != crc.Value() {
err = NewCorruptError(r.filename, recStartOffset, "crc mismatch")
return
}
r.offset += (1 + 8 + int64(rec.dataLen) + 4)
return
}
// ReadAt reads the record at the given offset.
func (r *recordReader) ReadAt(offset int64) (rec record, err error) {
defer func() {
if err == io.EOF {
err = NewCorruptError(r.filename, offset, "unexpected eof")
}
}()
// read record type and data len
n, err := r.sr.ReadAt(r.typeLenBuf, offset)
if err != nil {
return
}
if n != len(r.typeLenBuf) {
if n < 1 {
err = NewCorruptError(r.filename, offset, "too small record type")
} else {
err = NewCorruptError(r.filename, offset, "too small record datalen")
}
return
}
rec.recType = recordType(r.typeLenBuf[0])
rec.dataLen = binary.BigEndian.Uint64(r.typeLenBuf[1:])
// read data and crc
rec.data = make([]byte, rec.dataLen+4)
n, err = r.sr.ReadAt(rec.data, offset+int64(n))
if err != nil {
return
}
if uint64(n) != rec.dataLen+4 {
err = NewCorruptError(r.filename, offset, "data size unmatch or too small crc")
return
}
// decode crc
rec.crc = binary.BigEndian.Uint32(rec.data[len(rec.data)-4:])
// truncate crc
rec.data = rec.data[:len(rec.data)-4]
// checksum
crc := util.NewCRC(rec.data)
if rec.crc != crc.Value() {
err = NewCorruptError(r.filename, offset, "crc mismatch")
return
}
return
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"encoding/binary"
"io"
"os"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util/bufalloc"
)
const initialBufferSize = 1024 * 32
const flushTriggerSize = 1024 * 1024
type recordWriter struct {
f *os.File
buf bufalloc.Buffer
u64Buf []byte
u32Buf []byte
offset int64
}
func newRecordWriter(f *os.File) *recordWriter {
return &recordWriter{
f: f,
u64Buf: make([]byte, 8),
u32Buf: make([]byte, 4),
}
}
func (w *recordWriter) Write(recType recordType, data recordData) error {
if w.buf == nil {
w.buf = bufalloc.AllocBuffer(initialBufferSize)
}
w.buf.Grow(recordSize(data))
// write record type
w.buf.WriteByte(byte(recType))
// write data size
binary.BigEndian.PutUint64(w.u64Buf, data.Size())
w.buf.Write(w.u64Buf)
// write data
prevLen := w.buf.Len()
data.Encode(w.buf)
if uint64(w.buf.Len()-prevLen) != data.Size() {
panic("fbase/raft/logstorage: unexpected data size when decode " + recType.String())
}
// write crc
crc := util.NewCRC(w.buf.Bytes()[w.buf.Len()-int(data.Size()):])
binary.BigEndian.PutUint32(w.u32Buf, crc.Value())
w.buf.Write(w.u32Buf)
w.offset += int64(recordSize(data))
if err := w.tryToFlush(); err != nil {
return err
}
return nil
}
func (w *recordWriter) tryToFlush() error {
if w.buf != nil && w.buf.Len() >= flushTriggerSize {
return w.Flush()
}
return nil
}
func (w *recordWriter) Offset() int64 {
return w.offset
}
func (w *recordWriter) Truncate(offset int64) error {
if err := w.f.Truncate(offset); err != nil {
return err
}
w.offset = offset
_, err := w.f.Seek(offset, io.SeekStart)
return err
}
func (w *recordWriter) Flush() error {
if w.buf != nil && w.buf.Len() > 0 {
_, err := w.buf.WriteTo(w.f)
if err != nil {
return err
}
}
return nil
}
func (w *recordWriter) Sync() error {
if err := w.Flush(); err != nil {
return err
}
return w.f.Sync()
}
// Close flushes and syncs pending data, then releases the write buffer
func (w *recordWriter) Close() error {
if err := w.Sync(); err != nil {
return err
}
if w.buf != nil {
bufalloc.FreeBuffer(w.buf)
}
return nil
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"errors"
"fmt"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util/log"
)
// Storage the storage
type Storage struct {
c *Config
// Log Entry
ls *logEntryStorage
truncIndex uint64
truncTerm uint64
hardState proto.HardState
metafile *metaFile
prevCommit uint64 // sync when the commit index changes
closed bool
}
// NewStorage new
func NewStorage(dir string, c *Config) (*Storage, error) {
if err := initDir(dir); err != nil {
return nil, err
}
// load HardState and truncate meta
mf, hardState, meta, err := openMetaFile(dir)
if err != nil {
return nil, err
}
s := &Storage{
c: c.dup(),
truncIndex: meta.truncIndex,
truncTerm: meta.truncTerm,
hardState: hardState,
metafile: mf,
prevCommit: hardState.Commit,
}
// load the log files
ls, err := openLogStorage(dir, s)
if err != nil {
return nil, err
}
s.ls = ls
if c.GetTruncateFirstDummy() {
if err := s.truncateFirstDummy(); err != nil {
return nil, err
}
}
return s, nil
}
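// Typical lifecycle sketch (illustrative only; the zero Config and the variable
// names are placeholders, not the authoritative way to configure the storage):
//
//	s, err := NewStorage("/path/to/wal", &Config{})
//	if err != nil {
//		return err
//	}
//	defer s.Close()
//	_ = s.StoreHardState(hs)
//	_ = s.StoreEntries(ents)
//	entries, isCompact, err := s.Entries(lo, hi, maxSize)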
func (s *Storage) truncateFirstDummy() error {
// only allowed at initialization time (no log entries may already exist)
li, err := s.LastIndex()
if err != nil {
return err
}
if li != 0 {
return errors.New("truncate first dummy forbidden")
}
meta := truncateMeta{
truncIndex: 1,
truncTerm: 1,
}
if err = s.metafile.SaveTruncateMeta(meta); err != nil {
return err
}
if err = s.metafile.Sync(); err != nil {
return err
}
s.truncIndex = meta.truncIndex
s.truncTerm = meta.truncTerm
return nil
}
// InitialState returns the saved HardState information to init the repl state.
func (s *Storage) InitialState() (proto.HardState, error) {
return s.hardState, nil
}
// Entries returns a slice of log entries in the range [lo, hi); hi is not inclusive.
// maxSize limits the total size of the returned entries, but Entries returns at least one entry if any exist.
// If lo <= CompactIndex, isCompact is returned as true.
// If there are no entries, entries is nil.
// Note: maxSize == math.MaxUint32 means no limit.
func (s *Storage) Entries(lo, hi uint64, maxSize uint64) (entries []*proto.Entry, isCompact bool, err error) {
if lo <= s.truncIndex {
return nil, true, nil
}
entries, isCompact, err = s.ls.Entries(lo, hi, maxSize)
return
}
// Term returns the term of the entry at index, which must be in the range [FirstIndex()-1, LastIndex()].
// The term of the entry before FirstIndex is retained for matching purposes even though the
// rest of that entry may not be available.
// If index < CompactIndex, isCompact is returned as true.
func (s *Storage) Term(index uint64) (term uint64, isCompact bool, err error) {
switch {
case index < s.truncIndex:
return 0, true, nil
case index == s.truncIndex:
term = s.truncTerm
return
default:
term, isCompact, err = s.ls.Term(index)
return
}
}
// FirstIndex returns the index of the first log entry that is possibly available via Entries (older entries have been incorporated
// into the latest Snapshot; if storage only contains the dummy entry the first log entry is not available).
func (s *Storage) FirstIndex() (index uint64, err error) {
index = s.truncIndex + 1
return
}
// LastIndex returns the index of the last entry in the log.
func (s *Storage) LastIndex() (index uint64, err error) {
index = s.ls.LastIndex()
if index < s.truncIndex {
index = s.truncIndex
}
return
}
// StoreEntries stores the log entries to the repository.
// If the first index of entries > LastIndex, all entries are appended;
// otherwise entries are written starting at their first index and the conflicting suffix is truncated.
func (s *Storage) StoreEntries(entries []*proto.Entry) error {
if err := s.ls.SaveEntries(entries); err != nil {
return err
}
return nil
}
// StoreHardState store the raft state to the repository.
func (s *Storage) StoreHardState(st proto.HardState) error {
if err := s.metafile.SaveHardState(st); err != nil {
return err
}
s.hardState = st
if s.c.GetSync() {
sync := false
if st.Commit != s.prevCommit {
sync = true
s.prevCommit = st.Commit
}
if sync {
if err := s.metafile.Sync(); err != nil {
return err
}
if err := s.ls.Sync(); err != nil {
return err
}
}
}
return nil
}
// Truncate discards log entries up to index; the index is inclusive.
func (s *Storage) Truncate(index uint64) error {
if index <= s.truncIndex {
log.Warn("already truncated. index=%d", index)
return nil
}
term, isCompact, err := s.ls.Term(index)
if err != nil {
return err
}
if isCompact {
return fmt.Errorf("expected compacted term. index:%d", index)
}
// update the truncate meta
meta := truncateMeta{
truncIndex: index,
truncTerm: term,
}
logger.Debug("Storage truncate index %v term %v", index, term)
if err = s.metafile.SaveTruncateMeta(meta); err != nil {
return err
}
if err = s.metafile.Sync(); err != nil {
return err
}
// truncate the log files
if err = s.ls.TruncateFront(index); err != nil {
return err
}
s.truncIndex = index
s.truncTerm = term
return nil
}
// ApplySnapshot Sync snapshot status.
func (s *Storage) ApplySnapshot(meta proto.SnapshotMeta) error {
tMeta := truncateMeta{
truncIndex: meta.Index,
truncTerm: meta.Term,
}
var err error
// update the commit position
s.hardState.Commit = meta.Index
if err := s.metafile.SaveHardState(s.hardState); err != nil {
return err
}
if err = s.metafile.SaveTruncateMeta(tMeta); err != nil {
return err
}
if err = s.metafile.Sync(); err != nil {
return err
}
if err = s.ls.TruncateAll(); err != nil {
return err
}
s.truncIndex = meta.Index
s.truncTerm = meta.Term
return nil
}
// Close the storage.
func (s *Storage) Close() {
if !s.closed {
s.ls.Close()
s.metafile.Close()
s.closed = true
}
}
type metricReporter struct {
ID string
}
func newReporterWithID(id string) *metricReporter {
return &metricReporter{
ID: id,
}
}
func (r *metricReporter) ReportInterval() time.Duration {
return time.Minute
}
func (r *metricReporter) Report(data []byte) error {
logger.Info("wal [%s] metrics: %s", r.ID, string(data))
return nil
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package wal
import (
"bytes"
"fmt"
"math/rand"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
)
func compapreEntry(le, re *proto.Entry) error {
if le.Index != re.Index {
return fmt.Errorf("index mismatch: %d != %d", le.Index, re.Index)
}
if le.Type != re.Type {
return fmt.Errorf("type mismatch: %d != %d", le.Type, re.Type)
}
if le.Term != re.Term {
return fmt.Errorf("term mismatch: %d != %d", le.Term, re.Term)
}
if !bytes.Equal(le.Data, re.Data) {
return fmt.Errorf("data mismatch: %s != %s", string(le.Data), string(re.Data))
}
return nil
}
func compareEntries(lh, rh []*proto.Entry) error {
if len(lh) != len(rh) {
return fmt.Errorf("unmatch size: %d != %d", len(lh), len(rh))
}
for i := 0; i < len(lh); i++ {
le := lh[i]
re := rh[i]
if err := compapreEntry(le, re); err != nil {
return fmt.Errorf("%v at %d", err, i)
}
}
return nil
}
func genLogEntry(rnd *rand.Rand, i uint64) *proto.Entry {
randType := func() proto.EntryType {
switch rnd.Int() % 2 {
case 0:
return proto.EntryNormal
default:
return proto.EntryConfChange
}
}
randTerm := func() uint64 {
return uint64(rnd.Uint32())
}
randData := func() []byte {
const letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
length := 10 + rnd.Int()%100
buf := make([]byte, length)
for i := 0; i < length; i++ {
buf[i] = letters[rnd.Int()%len(letters)]
}
return buf
}
ent := &proto.Entry{
Index: i,
Type: randType(),
Term: randTerm(),
Data: randData(),
}
return ent
}
func genLogEntries(lo, hi uint64) (ents []*proto.Entry) {
rnd := rand.New(rand.NewSource(time.Now().UnixNano()))
for i := lo; i < hi; i++ {
ents = append(ents, genLogEntry(rnd, i))
}
return
}
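// Usage sketch (illustrative; t stands for a hypothetical *testing.T): generate
// random entries and verify a store/read round trip.
//
//	ents := genLogEntries(1, 101) // indexes [1, 101)
//	// ... store ents, then read them back as got ...
//	if err := compareEntries(ents, got); err != nil {
//		t.Fatal(err)
//	}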
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"fmt"
"net"
"sync"
//"fmt"
//"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
type heartbeatTransport struct {
config *TransportConfig
raftServer *RaftServer
listener net.Listener
mu sync.RWMutex
senders map[uint64]*transportSender
stopc chan struct{}
}
func newHeartbeatTransport(raftServer *RaftServer, config *TransportConfig) (*heartbeatTransport, error) {
var (
listener net.Listener
err error
)
if listener, err = net.Listen("tcp", config.HeartbeatAddr); err != nil {
return nil, err
}
t := &heartbeatTransport{
config: config,
raftServer: raftServer,
listener: listener,
senders: make(map[uint64]*transportSender),
stopc: make(chan struct{}),
}
return t, nil
}
func (t *heartbeatTransport) stop() {
t.mu.Lock()
defer t.mu.Unlock()
select {
case <-t.stopc:
return
default:
close(t.stopc)
t.listener.Close()
for _, s := range t.senders {
s.stop()
}
}
}
func (t *heartbeatTransport) start() {
util.RunWorkerUtilStop(func() {
for {
select {
case <-t.stopc:
return
default:
conn, err := t.listener.Accept()
if err != nil {
continue
}
t.handleConn(util.NewConnTimeout(conn))
}
}
}, t.stopc)
}
func (t *heartbeatTransport) handleConn(conn *util.ConnTimeout) {
util.RunWorker(func() {
defer conn.Close()
bufRd := util.NewBufferReader(conn, 16*KB)
for {
select {
case <-t.stopc:
return
default:
if msg, err := reciveMessage(bufRd); err != nil {
logger.Error(fmt.Sprintf("[heartbeatTransport] recive message from conn error, %s", err.Error()))
return
} else {
//logger.Debug(fmt.Sprintf("Recive %v from (%v)", msg.ToString(), conn.RemoteAddr()))
t.raftServer.reciveMessage(msg)
}
}
}
})
}
func (t *heartbeatTransport) send(msg *proto.Message) {
s := t.getSender(msg.To)
s.send(msg)
}
func (t *heartbeatTransport) getSender(nodeId uint64) *transportSender {
t.mu.RLock()
sender, ok := t.senders[nodeId]
t.mu.RUnlock()
if ok {
return sender
}
t.mu.Lock()
defer t.mu.Unlock()
if sender, ok = t.senders[nodeId]; !ok {
sender = newTransportSender(nodeId, 1, 64, HeartBeat, t.config.Resolver)
t.senders[nodeId] = sender
}
return sender
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
type MultiTransport struct {
heartbeat *heartbeatTransport
replicate *replicateTransport
}
func NewMultiTransport(raft *RaftServer, config *TransportConfig) (Transport, error) {
mt := new(MultiTransport)
if ht, err := newHeartbeatTransport(raft, config); err != nil {
return nil, err
} else {
mt.heartbeat = ht
}
if rt, err := newReplicateTransport(raft, config); err != nil {
return nil, err
} else {
mt.replicate = rt
}
mt.heartbeat.start()
mt.replicate.start()
return mt, nil
}
func (t *MultiTransport) Stop() {
t.heartbeat.stop()
t.replicate.stop()
}
func (t *MultiTransport) Send(m *proto.Message) {
// if m.IsElectionMsg() {
if m.IsHeartbeatMsg() {
t.heartbeat.send(m)
} else {
t.replicate.send(m)
}
}
func (t *MultiTransport) SendSnapshot(m *proto.Message, rs *snapshotStatus) {
t.replicate.sendSnapshot(m, rs)
}
func reciveMessage(r *util.BufferReader) (msg *proto.Message, err error) {
msg = proto.GetMessage()
if err = msg.Decode(r); err != nil {
proto.ReturnMessage(msg)
}
return
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"encoding/binary"
"fmt"
"io"
"net"
"runtime"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
type replicateTransport struct {
config *TransportConfig
raftServer *RaftServer
listener net.Listener
curSnapshot int32
mu sync.RWMutex
senders map[uint64]*transportSender
stopc chan struct{}
}
func newReplicateTransport(raftServer *RaftServer, config *TransportConfig) (*replicateTransport, error) {
var (
listener net.Listener
err error
)
if listener, err = net.Listen("tcp", config.ReplicateAddr); err != nil {
return nil, err
}
t := &replicateTransport{
config: config,
raftServer: raftServer,
listener: listener,
senders: make(map[uint64]*transportSender),
stopc: make(chan struct{}),
}
return t, nil
}
func (t *replicateTransport) stop() {
t.mu.Lock()
defer t.mu.Unlock()
select {
case <-t.stopc:
return
default:
close(t.stopc)
t.listener.Close()
for _, s := range t.senders {
s.stop()
}
}
}
func (t *replicateTransport) send(m *proto.Message) {
s := t.getSender(m.To)
s.send(m)
}
func (t *replicateTransport) getSender(nodeId uint64) *transportSender {
t.mu.RLock()
sender, ok := t.senders[nodeId]
t.mu.RUnlock()
if ok {
return sender
}
t.mu.Lock()
defer t.mu.Unlock()
if sender, ok = t.senders[nodeId]; !ok {
sender = newTransportSender(nodeId, uint64(t.config.MaxReplConcurrency), t.config.SendBufferSize, Replicate, t.config.Resolver)
t.senders[nodeId] = sender
}
return sender
}
func (t *replicateTransport) sendSnapshot(m *proto.Message, rs *snapshotStatus) {
var (
conn *util.ConnTimeout
err error
)
defer func() {
atomic.AddInt32(&t.curSnapshot, -1)
rs.respond(err)
if conn != nil {
conn.Close()
}
if err != nil {
logger.Error("[Transport] %v send snapshot to %v failed error is: %v.", m.ID, m.To, err)
} else if logger.IsEnableWarn() {
logger.Warn("[Transport] %v send snapshot to %v successful.", m.ID, m.To)
}
}()
if atomic.AddInt32(&t.curSnapshot, 1) > int32(t.config.MaxSnapConcurrency) {
err = fmt.Errorf("snapshot concurrency exceed the limit %v, now %d", t.config.MaxSnapConcurrency, t.curSnapshot)
return
}
if conn = getConn(m.To, Replicate, t.config.Resolver, 10*time.Minute, 1*time.Minute); conn == nil {
err = fmt.Errorf("can't get connection to %v.", m.To)
return
}
// send snapshot header message
bufWr := util.NewBufferWriter(conn, 1*MB)
if err = m.Encode(bufWr); err != nil {
return
}
if err = bufWr.Flush(); err != nil {
return
}
// send snapshot data
var (
data []byte
loopCount = 0
sizeBuf = make([]byte, 4)
)
for err == nil {
loopCount = loopCount + 1
if loopCount > 16 {
loopCount = 0
runtime.Gosched()
}
select {
case <-rs.stopCh:
err = fmt.Errorf("raft has shutdown.")
default:
data, err = m.Snapshot.Next()
if len(data) > 0 {
// write block size
binary.BigEndian.PutUint32(sizeBuf, uint32(len(data)))
if _, err = bufWr.Write(sizeBuf); err == nil {
_, err = bufWr.Write(data)
}
}
}
}
// write end flag and flush
if err != nil && err != io.EOF {
return
}
binary.BigEndian.PutUint32(sizeBuf, 0)
if _, err = bufWr.Write(sizeBuf); err != nil {
return
}
if err = bufWr.Flush(); err != nil {
return
}
// wait response
err = nil
resp := make([]byte, 1)
io.ReadFull(conn, resp)
if resp[0] != 1 {
err = fmt.Errorf("follower response failed.")
}
}
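// Snapshot wire format implied by sendSnapshot/handleSnapshot (a summary, not an
// authoritative spec):
//
//	[snapshot header message]
//	[4-byte big-endian block size][block data]  (repeated per Snapshot.Next block)
//	[4-byte zero]                               (end-of-snapshot marker)
//	<- [1 byte]                                 (ack from the follower: 1 on success)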
func (t *replicateTransport) start() {
util.RunWorkerUtilStop(func() {
for {
select {
case <-t.stopc:
return
default:
conn, err := t.listener.Accept()
if err != nil {
continue
}
t.handleConn(util.NewConnTimeout(conn))
}
}
}, t.stopc)
}
func (t *replicateTransport) handleConn(conn *util.ConnTimeout) {
util.RunWorker(func() {
defer conn.Close()
loopCount := 0
bufRd := util.NewBufferReader(conn, 16*KB)
for {
loopCount = loopCount + 1
if loopCount > 16 {
loopCount = 0
runtime.Gosched()
}
select {
case <-t.stopc:
return
default:
if msg, err := reciveMessage(bufRd); err != nil {
return
} else {
//logger.Debug(fmt.Sprintf("Recive %v from (%v)", msg.ToString(), conn.RemoteAddr()))
if msg.Type == proto.ReqMsgSnapShot {
if err := t.handleSnapshot(msg, conn, bufRd); err != nil {
return
}
} else {
t.raftServer.reciveMessage(msg)
}
}
}
}
})
}
var snap_ack = []byte{1}
func (t *replicateTransport) handleSnapshot(m *proto.Message, conn *util.ConnTimeout, bufRd *util.BufferReader) error {
conn.SetReadTimeout(time.Minute)
conn.SetWriteTimeout(15 * time.Second)
bufRd.Grow(1 * MB)
req := newSnapshotRequest(m, bufRd)
t.raftServer.reciveSnapshot(req)
// wait snapshot result
if err := req.response(); err != nil {
logger.Error("[Transport] handle snapshot request from %v error: %v.", m.From, err)
return err
}
_, err := conn.Write(snap_ack)
return err
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package raft
import (
"runtime"
"sync"
"time"
//"fmt"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
type unreachableReporter func(uint64)
type transportSender struct {
nodeID uint64
concurrency uint64
senderType SocketType
resolver SocketResolver
inputc []chan *proto.Message
send func(msg *proto.Message)
mu sync.Mutex
stopc chan struct{}
}
func newTransportSender(nodeID, concurrency uint64, buffSize int, senderType SocketType, resolver SocketResolver) *transportSender {
sender := &transportSender{
nodeID: nodeID,
concurrency: concurrency,
senderType: senderType,
resolver: resolver,
inputc: make([]chan *proto.Message, concurrency),
stopc: make(chan struct{}),
}
for i := uint64(0); i < concurrency; i++ {
sender.inputc[i] = make(chan *proto.Message, buffSize)
sender.loopSend(sender.inputc[i])
}
if (concurrency & (concurrency - 1)) == 0 {
sender.send = func(msg *proto.Message) {
idx := 0
if concurrency > 1 {
idx = int(msg.ID&concurrency - 1)
}
sender.inputc[idx] <- msg
}
} else {
sender.send = func(msg *proto.Message) {
idx := 0
if concurrency > 1 {
idx = int(msg.ID % concurrency)
}
sender.inputc[idx] <- msg
}
}
return sender
}
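// Sharding note (illustrative): when concurrency is a power of two, the fast path
// msg.ID & (concurrency-1) selects the same channel as msg.ID % concurrency while
// avoiding the modulo, e.g. with concurrency = 4:
//
//	msg.ID = 10 -> 10 & 3 = 2 // same as 10 % 4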
func (s *transportSender) stop() {
s.mu.Lock()
defer s.mu.Unlock()
select {
case <-s.stopc:
return
default:
close(s.stopc)
}
}
func (s *transportSender) loopSend(recvc chan *proto.Message) {
util.RunWorkerUtilStop(func() {
conn := getConn(s.nodeID, s.senderType, s.resolver, 0, 2*time.Second)
bufWr := util.NewBufferWriter(conn, 16*KB)
defer func() {
if conn != nil {
conn.Close()
}
}()
loopCount := 0
var err error
for {
loopCount = loopCount + 1
if loopCount > 8 {
loopCount = 0
runtime.Gosched()
}
select {
case <-s.stopc:
return
case msg := <-recvc:
if conn == nil {
conn = getConn(s.nodeID, s.senderType, s.resolver, 0, 2*time.Second)
if conn == nil {
proto.ReturnMessage(msg)
// reset chan
for {
select {
case msg := <-recvc:
proto.ReturnMessage(msg)
continue
default:
}
break
}
time.Sleep(50 * time.Millisecond)
continue
}
bufWr.Reset(conn)
}
err = msg.Encode(bufWr)
proto.ReturnMessage(msg)
if err != nil {
goto flush
}
// group send message
flag := false
for i := 0; i < 16; i++ {
select {
case msg := <-recvc:
err = msg.Encode(bufWr)
//logger.Debug(fmt.Sprintf("SendMesg %v to (%v) ", msg.ToString(), conn.RemoteAddr()))
proto.ReturnMessage(msg)
if err != nil {
goto flush
}
default:
flag = true
}
if flag {
break
}
}
}
flush:
// flush write
if err == nil {
err = bufWr.Flush()
}
if err != nil {
logger.Error("[Transport]send message[%s] to %v[%s] error:[%v].", s.senderType, s.nodeID, conn.RemoteAddr(), err)
conn.Close()
conn = nil
}
}
}, s.stopc)
}
func getConn(nodeID uint64, socketType SocketType, resolver SocketResolver, rdTime, wrTime time.Duration) (conn *util.ConnTimeout) {
var (
addr string
err error
)
if addr, err = resolver.NodeAddress(nodeID, socketType); err == nil {
if conn, err = util.DialTimeout(addr, 2*time.Second); err == nil {
conn.SetReadTimeout(rdTime)
conn.SetWriteTimeout(wrTime)
}
}
if err != nil {
conn = nil
if logger.IsEnableDebug() {
logger.Debug("[Transport] get connection[%s] to %v[%s] failed,error is: %s", socketType, nodeID, addr, err)
}
}
return
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package util
import "sync/atomic"
type AtomicBool struct {
v int32
}
func (b *AtomicBool) Get() bool {
return atomic.LoadInt32(&b.v) != 0
}
func (b *AtomicBool) Set(newValue bool) {
atomic.StoreInt32(&b.v, boolToInt(newValue))
}
func (b *AtomicBool) CompareAndSet(expect, update bool) bool {
return atomic.CompareAndSwapInt32(&b.v, boolToInt(expect), boolToInt(update))
}
func boolToInt(v bool) int32 {
if v {
return 1
} else {
return 0
}
}
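// Usage sketch (illustrative):
//
//	var closed AtomicBool
//	if closed.CompareAndSet(false, true) {
//		// only the first caller gets here; later calls return false
//	}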
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package util
import "sync/atomic"
type AtomicUInt64 struct {
v uint64
}
func (a *AtomicUInt64) Get() uint64 {
return atomic.LoadUint64(&a.v)
}
func (a *AtomicUInt64) Set(v uint64) {
atomic.StoreUint64(&a.v, v)
}
func (a *AtomicUInt64) Add(v uint64) uint64 {
return atomic.AddUint64(&a.v, v)
}
func (a *AtomicUInt64) Incr() uint64 {
return atomic.AddUint64(&a.v, 1)
}
func (a *AtomicUInt64) CompareAndSwap(o, n uint64) bool {
return atomic.CompareAndSwapUint64(&a.v, o, n)
}
// Copyright 2009 The Go Authors. All rights reserved.
// Modified work copyright 2018 The tiglabs Authors.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bufalloc
import (
"io"
)
// A Buffer is a variable-sized buffer of bytes with Read and Write methods.
type Buffer interface {
// Alloc allocs n bytes of slice from the buffer, growing the buffer as needed.
// If n is negative, Alloc will panic. If the buffer can't grow it will panic with bytes.ErrTooLarge.
Alloc(n int) []byte
// Truncate discards all but the first n unread bytes from the buffer.
// It panics if n is negative or greater than the length of the buffer.
Truncate(n int)
// Grow grows the buffer's capacity, if necessary, to guarantee space for n bytes.
// If n is negative, Grow will panic. If the buffer can't grow it will panic with bytes.ErrTooLarge.
Grow(n int)
// Write appends the contents of p to the buffer, growing the buffer as needed.
// The return value n is the length of p; err is always nil.
// If the buffer becomes too large, Write will panic with bytes.ErrTooLarge.
Write(p []byte) (n int, err error)
// WriteByte appends the byte c to the buffer, growing the buffer as needed.
// If the buffer becomes too large, WriteByte will panic with bytes.ErrTooLarge.
WriteByte(c byte) error
// WriteTo writes data to w until the buffer is drained or an error occurs.
// The return value n is the number of bytes written;
// Any error encountered during the write is also returned.
WriteTo(w io.Writer) (n int64, err error)
// Read reads the next len(p) bytes from the buffer or until the buffer is drained.
// The return value n is the number of bytes read.
// If the buffer has no data to return, err is io.EOF (unless len(p) is zero); otherwise it is nil.
Read(p []byte) (n int, err error)
// ReadByte reads and returns the next byte from the buffer.
// If no byte is available, it returns error io.EOF.
ReadByte() (c byte, err error)
// ReadBytes reads until the first occurrence of delim in the input,
// returning a slice containing the data up to and including the delimiter.
// If ReadBytes encounters an error before finding a delimiter, it returns the data read before the error and the error itself (often io.EOF).
// ReadBytes returns err != nil if and only if the returned data does not end in delim.
ReadBytes(delim byte) (line []byte, err error)
// ReadFrom reads data from r until EOF and appends it to the buffer, growing the buffer as needed.
// The return value n is the number of bytes read. Any error except io.EOF encountered during the read is also returned.
// If the buffer becomes too large, ReadFrom will panic with bytes.ErrTooLarge.
ReadFrom(r io.Reader) (n int64, err error)
// Bytes returns a slice of the contents of the unread portion of the buffer;
// If the caller changes the contents of the returned slice, the contents of the buffer will change,
// provided there are no intervening method calls on the Buffer.
Bytes() []byte
// Next returns a slice containing the next n bytes from the buffer, advancing the buffer as if the bytes had been returned by Read.
// If there are fewer than n bytes in the buffer, Next returns the entire buffer.
// The slice is only valid until the next call to a read or write method.
Next(n int) []byte
// Reset resets the buffer so it has no content.
// b.Reset() is the same as b.Truncate(0).
Reset()
// String returns the contents of the unread portion of the buffer as a string.
// If the Buffer is a nil pointer, it returns "<nil>".
String() string
// Len returns the number of bytes of the unread portion of the buffer;
Len() int
// Cap returns the capacity of the buffer.
Cap() int
}
func AllocBuffer(n int) Buffer {
return buffPool.getBuffer(n)
}
func FreeBuffer(buf Buffer) {
buffPool.putBuffer(buf)
}
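// Usage sketch (illustrative): borrow a pooled buffer, use it, and return it.
//
//	buf := AllocBuffer(1024)
//	defer FreeBuffer(buf)
//	b := buf.Alloc(16) // 16-byte scratch slice backed by the pooled buffer
//	_ = b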
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package bufalloc
import (
"sync"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
)
const (
baseSize = 15
bigSize = 64 * util.KB
)
var buffPool *bufferPool
func init() {
buffPool = &bufferPool{
baseline: [...]int{64, 128, 256, 512, util.KB, 2 * util.KB, 4 * util.KB, 8 * util.KB, 16 * util.KB, 32 * util.KB, 64 * util.KB, 128 * util.KB, 256 * util.KB, 512 * util.KB, util.MB},
}
for i, n := range buffPool.baseline {
buffPool.pool[i] = createPool(n)
}
buffPool.pool[baseSize] = createPool(0)
}
func createPool(n int) *sync.Pool {
return &sync.Pool{
New: func() interface{} {
if n == 0 || n > bigSize {
return &ibuffer{}
}
return &ibuffer{buf: makeSlice(n)}
},
}
}
type bufferPool struct {
baseline [baseSize]int
pool [baseSize + 1]*sync.Pool
}
func (p *bufferPool) getPoolNum(n int) int {
for i, x := range p.baseline {
if n <= x {
return i
}
}
return baseSize
}
func (p *bufferPool) getBuffer(n int) Buffer {
num := p.getPoolNum(n)
pool := p.pool[num]
buf := pool.Get().(Buffer)
if buf.Cap() < n {
// return old buffer to pool
buffPool.putBuffer(buf)
buf = &ibuffer{buf: makeSlice(n)}
}
buf.Reset()
return buf
}
func (p *bufferPool) putBuffer(buf Buffer) {
num := p.getPoolNum(buf.Cap())
pool := p.pool[num]
pool.Put(buf)
}
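// Size-class selection sketch: getPoolNum picks the smallest baseline >= n and
// falls back to the unsized pool for anything larger than 1MB, e.g.:
//
//	buffPool.getPoolNum(100)          // 1  -> 128-byte class
//	buffPool.getPoolNum(64 * util.KB) // 10 -> 64KB class
//	buffPool.getPoolNum(2 * util.MB)  // 15 -> unsized pool, allocated on demand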
// Copyright 2009 The Go Authors. All rights reserved.
// Modified work copyright 2018 The tiglabs Authors.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bufalloc
import (
"bytes"
"errors"
"io"
)
const minRead = 512
var (
ErrTooLarge = errors.New("bufalloc.Buffer: too large.")
)
type ibuffer struct {
off int
buf []byte
}
func makeSlice(n int) []byte {
defer func() {
if recover() != nil {
panic(ErrTooLarge)
}
}()
return make([]byte, n)
}
func (b *ibuffer) Bytes() []byte { return b.buf[b.off:] }
func (b *ibuffer) String() string {
if b == nil {
return "<nil>"
}
return string(b.buf[b.off:])
}
func (b *ibuffer) Len() int { return len(b.buf) - b.off }
func (b *ibuffer) Cap() int { return cap(b.buf) }
func (b *ibuffer) Reset() { b.Truncate(0) }
func (b *ibuffer) Truncate(n int) {
switch {
case n < 0 || n > b.Len():
panic("bufalloc.Buffer: truncation out of range")
case n == 0:
b.off = 0
}
b.buf = b.buf[0 : b.off+n]
}
func (b *ibuffer) grow(n int) int {
if b.buf == nil {
b.buf = makeSlice(n)
return 0
}
m := b.Len()
if m == 0 && b.off != 0 {
b.Truncate(0)
}
if len(b.buf)+n > cap(b.buf) {
var buf []byte
if m+n <= cap(b.buf)/2 {
copy(b.buf[:], b.buf[b.off:])
buf = b.buf[:m]
} else {
buf = makeSlice(2*cap(b.buf) + n)
copy(buf, b.buf[b.off:])
}
b.buf = buf
b.off = 0
}
b.buf = b.buf[0 : b.off+m+n]
return b.off + m
}
func (b *ibuffer) Alloc(n int) []byte {
if n < 0 {
panic("bufalloc.Buffer: negative count")
}
m := b.grow(n)
return b.buf[m:]
}
func (b *ibuffer) Grow(n int) {
if n < 0 {
panic("bufalloc.Buffer: negative count")
}
m := b.grow(n)
b.buf = b.buf[0:m]
}
func (b *ibuffer) Write(p []byte) (n int, err error) {
m := b.grow(len(p))
return copy(b.buf[m:], p), nil
}
func (b *ibuffer) ReadFrom(r io.Reader) (n int64, err error) {
if b.off >= len(b.buf) {
b.Truncate(0)
}
for {
if free := cap(b.buf) - len(b.buf); free < minRead {
// not enough space at end
newBuf := b.buf
if b.off+free < minRead {
newBuf = makeSlice(2*cap(b.buf) + minRead)
}
copy(newBuf, b.buf[b.off:])
b.buf = newBuf[:len(b.buf)-b.off]
b.off = 0
}
m, e := r.Read(b.buf[len(b.buf):cap(b.buf)])
b.buf = b.buf[0 : len(b.buf)+m]
n += int64(m)
if e == io.EOF {
break
}
if e != nil {
return n, e
}
}
return n, nil // err is EOF, so return nil explicitly
}
func (b *ibuffer) WriteTo(w io.Writer) (n int64, err error) {
if b.off < len(b.buf) {
nBytes := b.Len()
m, e := w.Write(b.buf[b.off:])
if m > nBytes {
panic("bufalloc.Buffer: invalid Write count")
}
b.off += m
n = int64(m)
if e != nil {
return n, e
}
if m != nBytes {
return n, io.ErrShortWrite
}
}
b.Truncate(0)
return
}
func (b *ibuffer) WriteByte(c byte) error {
m := b.grow(1)
b.buf[m] = c
return nil
}
func (b *ibuffer) Read(p []byte) (n int, err error) {
if b.off >= len(b.buf) {
b.Truncate(0)
if len(p) == 0 {
return
}
return 0, io.EOF
}
n = copy(p, b.buf[b.off:])
b.off += n
return
}
func (b *ibuffer) Next(n int) []byte {
m := b.Len()
if n > m {
n = m
}
data := b.buf[b.off : b.off+n]
b.off += n
return data
}
func (b *ibuffer) ReadByte() (c byte, err error) {
if b.off >= len(b.buf) {
b.Truncate(0)
return 0, io.EOF
}
c = b.buf[b.off]
b.off++
return c, nil
}
func (b *ibuffer) ReadBytes(delim byte) (line []byte, err error) {
slice, err := b.readSlice(delim)
line = append(line, slice...)
return
}
func (b *ibuffer) readSlice(delim byte) (line []byte, err error) {
i := bytes.IndexByte(b.buf[b.off:], delim)
end := b.off + i + 1
if i < 0 {
end = len(b.buf)
err = io.EOF
}
line = b.buf[b.off:end]
b.off = end
return line, err
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package util
import (
"net"
"time"
)
type ConnTimeout struct {
addr string
conn net.Conn
readTime time.Duration
writeTime time.Duration
}
func DialTimeout(addr string, connTime time.Duration) (*ConnTimeout, error) {
conn, err := net.DialTimeout("tcp", addr, connTime)
if err != nil {
return nil, err
}
conn.(*net.TCPConn).SetNoDelay(true)
conn.(*net.TCPConn).SetLinger(0)
conn.(*net.TCPConn).SetKeepAlive(true)
return &ConnTimeout{conn: conn, addr: addr}, nil
}
func NewConnTimeout(conn net.Conn) *ConnTimeout {
if conn == nil {
return nil
}
conn.(*net.TCPConn).SetNoDelay(true)
conn.(*net.TCPConn).SetLinger(0)
conn.(*net.TCPConn).SetKeepAlive(true)
return &ConnTimeout{conn: conn, addr: conn.RemoteAddr().String()}
}
func (c *ConnTimeout) SetReadTimeout(timeout time.Duration) {
c.readTime = timeout
}
func (c *ConnTimeout) SetWriteTimeout(timeout time.Duration) {
c.writeTime = timeout
}
func (c *ConnTimeout) Read(p []byte) (n int, err error) {
if c.readTime.Nanoseconds() > 0 {
err = c.conn.SetReadDeadline(time.Now().Add(c.readTime))
if err != nil {
return
}
}
n, err = c.conn.Read(p)
return
}
func (c *ConnTimeout) Write(p []byte) (n int, err error) {
if c.writeTime.Nanoseconds() > 0 {
err = c.conn.SetWriteDeadline(time.Now().Add(c.writeTime))
if err != nil {
return
}
}
n, err = c.conn.Write(p)
return
}
func (c *ConnTimeout) RemoteAddr() string {
return c.addr
}
func (c *ConnTimeout) Close() error {
return c.conn.Close()
}
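// Usage sketch (illustrative; the address is a placeholder):
//
//	conn, err := DialTimeout("127.0.0.1:9000", 2*time.Second)
//	if err != nil {
//		return err
//	}
//	defer conn.Close()
//	conn.SetReadTimeout(time.Minute)
//	conn.SetWriteTimeout(15 * time.Second)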
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package util
import (
"hash/crc32"
)
var table = crc32.MakeTable(crc32.Castagnoli)
// CRC is a CRC-32 checksum computed using Castagnoli's polynomial.
type CRC uint32
// NewCRC creates a new crc based on the given bytes.
func NewCRC(b []byte) CRC {
return CRC(0).Update(b)
}
// Update updates the crc with the given bytes.
func (c CRC) Update(b []byte) CRC {
return CRC(crc32.Update(uint32(c), table, b))
}
// Value returns a masked crc.
func (c CRC) Value() uint32 {
return uint32(c>>15|c<<17) + 0xa282ead8
}
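// Masking sketch: Value rotates the raw CRC-32C right by 15 bits and adds a
// constant (the masking scheme used by LevelDB), so that storing a CRC inside
// CRC-protected data does not yield degenerate checksums.
//
//	sum := NewCRC([]byte("hello")).Value() // masked CRC-32C of "hello"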
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package util
import (
"bufio"
"errors"
"io"
)
var (
maxEmptyReads = 100
err_reader_isnil = errors.New("BufferReader: reader is nil!")
err_negative_count = errors.New("BufferReader: read return negative count!")
err_no_progress = errors.New("BufferReader: multiple Read calls return no data or error!")
err_too_large = errors.New("BufferReader: make byte slice too large!")
)
type BufferReader struct {
buf []byte
reader io.Reader
size int
r, w int
err error
}
func NewBufferReader(reader io.Reader, size int) *BufferReader {
return &BufferReader{
reader: reader,
size: size,
buf: make([]byte, size),
}
}
func (br *BufferReader) Reset() {
if br.w > br.r {
copy(br.buf, br.buf[br.r:br.w])
}
br.w = br.w - br.r
br.r = 0
}
func (br *BufferReader) ReadFull(min int) (data []byte, err error) {
if br.reader == nil {
return nil, err_reader_isnil
}
if min == 0 {
err = br.err
br.err = nil
return make([]byte, 0, 0), err
}
if min > (cap(br.buf) - br.r) {
br.Grow(min)
}
for (br.w-br.r) < min && err == nil {
br.fill()
err = br.err
}
if (br.w - br.r) >= min {
data = br.buf[br.r : br.r+min]
br.r = br.r + min
err = nil
} else {
data = br.buf[br.r:br.w]
br.r = br.w
err = br.err
br.err = nil
}
return
}
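// Usage sketch (illustrative; conn is a placeholder io.Reader such as a network
// connection):
//
//	br := NewBufferReader(conn, 16*KB)
//	header, err := br.ReadFull(9) // e.g. 1-byte type + 8-byte length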
func (br *BufferReader) fill() {
if br.w >= cap(br.buf) {
br.Grow(br.w - br.r)
}
for i := maxEmptyReads; i > 0; i-- {
n, err := br.reader.Read(br.buf[br.w:])
if n < 0 {
panic(err_negative_count)
}
br.w = br.w + n
if err != nil {
br.err = err
return
}
if n > 0 {
return
}
}
br.err = err_no_progress
}
func (br *BufferReader) Grow(n int) {
defer func() {
if recover() != nil {
panic(err_too_large)
}
}()
var buf []byte
if n > br.size {
buf = make([]byte, n)
} else {
buf = make([]byte, br.size)
}
if br.w > br.r {
copy(buf, br.buf[br.r:br.w])
}
br.w = br.w - br.r
br.r = 0
br.buf = buf
}
type BufferWriter struct {
*bufio.Writer
}
func NewBufferWriter(wr io.Writer, size int) *BufferWriter {
return &BufferWriter{
Writer: bufio.NewWriterSize(wr, size),
}
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package log
import (
"bytes"
"errors"
"fmt"
"io"
"io/ioutil"
"math"
"os"
"path"
"runtime"
"sort"
"strings"
"sync"
"syscall"
"time"
)
const (
// A colon appears after these items: 2009/01/23 01:23:23.123123 /a/b/c/d.go:23: message
Ldate = 1 << iota // the date: 2009/01/23
Ltime // the time: 01:23:23
Lmicroseconds // microsecond resolution: 01:23:23.123123. assumes Ltime.
Llongfile // full file name and line number: /a/b/c/d.go:23
Lshortfile // final file name element and line number: d.go:23. overrides Llongfile
LstdFlags = Ldate | Ltime // initial values for the standard logger
LogFileNameDateFormat = "200601021504"
LogMaxReservedDays = 7 * 24 * time.Hour
// DefaultRollingSize specifies the size at which to roll the output log, in MB
DefaultRollingSize = 5 * 1024
DefaultMinRollingSize = 200
// DefaultHeadRoom is the tolerance for the log space limit, in MB
DefaultHeadRoom = 50 * 1024
// DefaultHeadRatio is the reserved disk space ratio
DefaultHeadRatio = 0.2
)
var (
errLogFileName = "_err.log"
warnLogFileName = "_warn.log"
infoLogFileName = "_info.log"
debugLogFileName = "_debug.log"
)
type logWriter struct {
mu sync.Mutex // ensures atomic writes; protects the following fields
prefix string // prefix to write at beginning of each line
flag int // properties
out io.WriteCloser // destination for output
buf []byte // for accumulating text to write
}
func newLogWriter(out io.WriteCloser, prefix string, flag int) *logWriter {
return &logWriter{out: out, prefix: prefix, flag: flag}
}
type RolledFile []os.FileInfo
func (f RolledFile) Less(i, j int) bool {
return f[i].ModTime().Before(f[j].ModTime())
}
func (f RolledFile) Len() int {
return len(f)
}
func (f RolledFile) Swap(i, j int) {
f[i], f[j] = f[j], f[i]
}
func itoa(buf *[]byte, i int, wid int) {
var u uint = uint(i)
if u == 0 && wid <= 1 {
*buf = append(*buf, '0')
return
}
// Assemble decimal in reverse order.
var b [32]byte
bp := len(b)
for ; u > 0 || wid > 0; u /= 10 {
bp--
wid--
b[bp] = byte(u%10) + '0'
}
*buf = append(*buf, b[bp:]...)
}
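// Padding behavior sketch: itoa appends the decimal form of i, left-padded with
// zeros to the requested width (wid <= 0 means no padding), e.g.:
//
//	var buf []byte
//	itoa(&buf, 7, 2)    // buf == []byte("07")
//	itoa(&buf, 123, -1) // appends "123" with no padding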
func (l *logWriter) formatHeader(buf *[]byte, t time.Time, file string, line int) {
*buf = append(*buf, l.prefix...)
if l.flag&(Ldate|Ltime|Lmicroseconds) != 0 {
if l.flag&Ldate != 0 {
year, month, day := t.Date()
itoa(buf, year, 4)
*buf = append(*buf, '-')
itoa(buf, int(month), 2)
*buf = append(*buf, '-')
itoa(buf, day, 2)
*buf = append(*buf, ' ')
}
if l.flag&(Ltime|Lmicroseconds) != 0 {
hour, min, sec := t.Clock()
itoa(buf, hour, 2)
*buf = append(*buf, ':')
itoa(buf, min, 2)
*buf = append(*buf, ':')
itoa(buf, sec, 2)
if l.flag&Lmicroseconds != 0 {
*buf = append(*buf, ',')
itoa(buf, t.Nanosecond()/1e6, 3)
}
*buf = append(*buf, ' ')
}
}
if l.flag&(Lshortfile|Llongfile) != 0 {
if l.flag&Lshortfile != 0 {
short := file
for i := len(file) - 1; i > 0; i-- {
if file[i] == '/' {
short = file[i+1:]
break
}
}
file = short
}
*buf = append(*buf, file...)
*buf = append(*buf, ':')
itoa(buf, line, -1)
*buf = append(*buf, ": "...)
}
}
func (l *logWriter) output(s string, file string, line int, now time.Time) error {
l.mu.Lock()
defer l.mu.Unlock()
l.buf = l.buf[:0]
l.formatHeader(&l.buf, now, file, line)
l.buf = append(l.buf, s...)
if len(s) > 0 && s[len(s)-1] != '\n' {
l.buf = append(l.buf, '\n')
}
_, err := l.out.Write(l.buf)
return err
}
func (lw *logWriter) rotateFile(logDir, logFile, module string, rotate bool) {
lw.mu.Lock()
defer lw.mu.Unlock()
if lw.out != nil {
lw.out.Close()
}
file, err := lw.createFile(logDir, logFile, module, rotate)
if err != nil {
file = os.Stdout
}
lw.out = file
if err == nil && logFile == errLogFileName {
if f, e := file.Stat(); e == nil && f.Size() == 0 {
// Write header.
var buf bytes.Buffer
fmt.Fprintf(&buf, "Log file created at: %s\n", time.Now().Format("2006/01/02 15:04:05"))
fmt.Fprintf(&buf, "Log line format: yyyy-mm-dd hh:mm:ss.uuuuuu[DIWE] file:line: msg\n")
fmt.Fprintf(&buf, "####################################################################\n\n")
lw.out.Write(buf.Bytes())
}
}
}
func (lw *logWriter) createFile(logDir, logFile, module string, rotate bool) (*os.File, error) {
if _, err := os.Stat(logDir); err != nil && os.IsNotExist(err) {
if err = os.MkdirAll(logDir, os.ModePerm); err != nil {
fmt.Printf("[Util.Logger]Create logger dir[%s] err: [%s]\r\n", logDir, err)
}
}
logFileOpt := os.O_RDWR | os.O_CREATE | os.O_APPEND
logFilePath := logDir + "/" + module + logFile
if rotate {
yesterday := time.Now().AddDate(0, 0, -1)
os.Rename(logFilePath, logFilePath+"."+yesterday.Format(LogFileNameDateFormat))
}
file, err := os.OpenFile(logFilePath, logFileOpt, os.ModePerm)
if err != nil {
fmt.Printf("[Util.Logger]Create logger file[%s] err: [%s]\r\n", logFilePath, err)
}
return file, err
}
func (lw *logWriter) checkRollingSize(logDir, logFile, module string, rollingSizeMB int64) {
logFilePath := path.Join(logDir, module+logFile)
fInfo, err := os.Stat(logFilePath)
if err == nil {
if fInfo.Size() >= rollingSizeMB*1024*1024 {
lw.rotateFile(logDir, logFile, module, true)
}
}
}
const (
TraceLevel = 0
DebugLevel = 1
InfoLevel = 2
WarnLevel = 3
ErrorLevel = 4
FatalLevel = 5
)
var levels = []string{
"[TRACE]",
"[DEBUG]",
"[INFO.]",
"[WARN.]",
"[ERROR]",
"[FATAL]",
}
type entity struct {
msg string
now time.Time
file string
line int
}
type Log struct {
dir string
module string
level int
startTime time.Time
flag int
err *logWriter
warn *logWriter
info *logWriter
debug *logWriter
entityCh chan *entity
rollingSizeMB int64 // the size of the rotated log, unit: MB
headRoomMB int64 // capacity reserved for writing the next log on the disk, unit: MB
}
var glog *Log = NewDefaultLog()
func NewDefaultLog() *Log {
log, err := NewLog("", "", "DEBUG")
if err != nil {
panic(err)
}
return log
}
func NewLog(dir, module, level string) (*Log, error) {
lg := new(Log)
lg.dir = dir
lg.module = module
lg.SetLevel(level)
if err := lg.initLog(dir, module); err != nil {
return nil, err
}
lg.startTime = time.Now()
lg.entityCh = make(chan *entity, 204800)
if dir != "" {
if err := lg.SetRotate(dir); err != nil {
return nil, err
}
go lg.checkLogRotation(dir, module)
go lg.checkCleanLog(dir, module)
}
go lg.loopMsg()
return lg, nil
}
func InitFileLog(dir, module, level string) {
log, err := NewLog(dir, module, level)
if err != nil {
panic(err)
}
glog = log
}
func GetFileLogger() *Log {
return glog
}
func (l *Log) initLog(logDir, module string) error {
logOpt := Lshortfile | LstdFlags | Lmicroseconds
if logDir == "" {
l.debug = newLogWriter(os.Stdout, "", logOpt)
l.info = newLogWriter(os.Stdout, "", logOpt)
l.warn = newLogWriter(os.Stdout, "", logOpt)
l.err = newLogWriter(os.Stdout, "", logOpt)
return nil
}
if fi, err := os.Stat(logDir); err != nil {
return err
} else if !fi.IsDir() {
return errors.New(logDir + " is not a directory")
}
l.flag = logOpt
l.debug = newLogWriter(nil, "", logOpt)
l.info = newLogWriter(nil, "", logOpt)
l.warn = newLogWriter(nil, "", logOpt)
l.err = newLogWriter(nil, "", logOpt)
l.debug.rotateFile(logDir, debugLogFileName, module, false)
l.info.rotateFile(logDir, infoLogFileName, module, false)
l.warn.rotateFile(logDir, warnLogFileName, module, false)
l.err.rotateFile(logDir, errLogFileName, module, false)
return nil
}
func (l *Log) SetLevel(level string) {
switch level {
case "TRACE", "trace", "Trace":
l.level = TraceLevel
case "", "debug", "Debug", "DEBUG":
l.level = DebugLevel
case "info", "Info", "INFO":
l.level = InfoLevel
case "warn", "Warn", "WARN":
l.level = WarnLevel
case "error", "Error", "ERROR":
l.level = ErrorLevel
default:
l.level = InfoLevel
}
}
func (l *Log) SetPrefix(s, level string) string {
return level + " " + s
}
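// SetRotate derives the rotation limits from the filesystem holding logDir:
// headRoomMB is DefaultHeadRatio of the disk space (of the available space when it is
// scarcer than that ratio of the total, of the total space otherwise), capped at
// DefaultHeadRoom; rollingSizeMB is a quarter of the available space (one share per
// level file), clamped to [DefaultMinRollingSize, DefaultRollingSize].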
func (l *Log) SetRotate(logDir string) error {
fs := syscall.Statfs_t{}
if err := syscall.Statfs(logDir, &fs); err != nil {
return fmt.Errorf("[InitLog] stats disk space: %s", err.Error())
}
var minRatio float64
if float64(fs.Bavail*uint64(fs.Bsize)) < float64(fs.Blocks*uint64(fs.Bsize))*DefaultHeadRatio {
minRatio = float64(fs.Bavail*uint64(fs.Bsize)) * DefaultHeadRatio / 1024 / 1024
} else {
minRatio = float64(fs.Blocks*uint64(fs.Bsize)) * DefaultHeadRatio / 1024 / 1024
}
l.headRoomMB = int64(math.Min(minRatio, DefaultHeadRoom))
minRollingSize := int64(fs.Bavail*uint64(fs.Bsize)/4) / 1024 / 1024 // split the available space across the 4 level files
if minRollingSize < DefaultMinRollingSize {
minRollingSize = DefaultMinRollingSize
}
l.rollingSizeMB = int64(math.Min(float64(minRollingSize), float64(DefaultRollingSize)))
return nil
}
func (l *Log) IsEnableDebug() bool {
return l.level <= DebugLevel
}
func (l *Log) IsEnableInfo() bool {
return l.level <= InfoLevel
}
func (l *Log) IsEnableWarn() bool {
return l.level <= WarnLevel
}
func (l *Log) IsEnableError() bool {
return l.level <= ErrorLevel
}
func (l *Log) IsEnableTrace() bool {
return l.level <= TraceLevel
}
func (l *Log) Output(calldepth int, s string, sync bool) {
now := time.Now()
var file string
var line int
var ok bool
if l.flag&(Lshortfile|Llongfile) != 0 {
_, file, line, ok = runtime.Caller(calldepth)
if !ok {
file = "???"
line = 0
}
}
if sync {
l.printMsg(s, file, line, now)
} else {
l.putMsg(s, file, line, now)
}
}
func (l *Log) putMsg(msg string, file string, line int, now time.Time) {
l.entityCh <- &entity{msg: msg, file: file, line: line, now: now}
}
func (l *Log) loopMsg() {
for entity := range l.entityCh {
l.printMsg(entity.msg, entity.file, entity.line, entity.now)
}
}
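// printMsg fans a formatted message out to the per-level writers. The second byte of msg
// is the level letter from the "[DEBUG]"-style prefix added by SetPrefix: the second switch
// always routes a message to its own level's file (T/D -> debug, I -> info, W -> warn,
// E/F -> err), while the first switch additionally copies higher-severity messages into the
// lower-level files when the configured level permits, so the debug file holds a complete
// record at DEBUG or TRACE level.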
func (l *Log) printMsg(msg string, file string, line int, now time.Time) {
switch l.level {
case TraceLevel:
switch msg[1] {
case 'I', 'W', 'E', 'F':
l.debug.output(msg, file, line, now)
}
case DebugLevel:
switch msg[1] {
case 'I', 'W', 'E', 'F':
l.debug.output(msg, file, line, now)
}
case InfoLevel:
switch msg[1] {
case 'W', 'E', 'F':
l.info.output(msg, file, line, now)
}
case WarnLevel:
switch msg[1] {
case 'E', 'F':
l.warn.output(msg, file, line, now)
}
}
switch msg[1] {
case 'T':
l.debug.output(msg, file, line, now)
case 'D':
l.debug.output(msg, file, line, now)
case 'I':
l.info.output(msg, file, line, now)
case 'W':
l.warn.output(msg, file, line, now)
case 'E':
l.err.output(msg, file, line, now)
case 'F':
l.err.output(msg, file, line, now)
}
}
func (l *Log) checkLogRotation(logDir, module string) {
// handle panic
defer func() {
if r := recover(); r != nil {
fmt.Printf("[Util.Logger]Check logger rotation panic: [%s]\r\n", r)
}
}()
for {
yesterday := time.Now().AddDate(0, 0, -1)
_, err := os.Stat(logDir + "/" + module + errLogFileName + "." + yesterday.Format(LogFileNameDateFormat))
if err == nil || time.Now().Day() == l.startTime.Day() {
l.debug.checkRollingSize(logDir, debugLogFileName, module, l.rollingSizeMB)
l.info.checkRollingSize(logDir, infoLogFileName, module, l.rollingSizeMB)
l.warn.checkRollingSize(logDir, warnLogFileName, module, l.rollingSizeMB)
l.err.checkRollingSize(logDir, errLogFileName, module, l.rollingSizeMB)
time.Sleep(time.Second * 600)
continue
}
// rotate the log files
l.debug.rotateFile(logDir, debugLogFileName, module, true)
l.info.rotateFile(logDir, infoLogFileName, module, true)
l.warn.rotateFile(logDir, warnLogFileName, module, true)
l.err.rotateFile(logDir, errLogFileName, module, true)
l.startTime = time.Now()
time.Sleep(time.Second * 600)
}
}
func (l *Log) checkCleanLog(logDir, module string) {
// handle panic
defer func() {
if r := recover(); r != nil {
fmt.Printf("[Util.Logger]Check clean logger file panic: [%s]\r\n", r)
}
}()
for {
// check disk space
fs := syscall.Statfs_t{}
if err := syscall.Statfs(logDir, &fs); err != nil {
fmt.Printf("[Util.Logger]Check disk space of dir[%s] err: [%s]\r\n", logDir, err)
time.Sleep(time.Second * 600)
continue
}
diskSpaceLeft := int64(fs.Bavail * uint64(fs.Bsize))
diskSpaceLeft -= l.headRoomMB * 1024 * 1024
fInfos, err := ioutil.ReadDir(logDir)
if err != nil || len(fInfos) == 0 {
time.Sleep(time.Second * 600)
continue
}
var needDelFiles RolledFile
for _, info := range fInfos {
if deleteFileFilter(module, info, diskSpaceLeft) {
needDelFiles = append(needDelFiles, info)
}
}
sort.Sort(needDelFiles)
for _, info := range needDelFiles {
if err = os.Remove(path.Join(logDir, info.Name())); err != nil {
fmt.Printf("[Util.Logger]Remove logger file[%s] err: [%s]\r\n", info.Name(), err)
continue
}
diskSpaceLeft += info.Size()
if diskSpaceLeft > 0 && time.Since(info.ModTime()) < LogMaxReservedDays {
break
}
}
time.Sleep(time.Second * 600)
}
}
func deleteFileFilter(module string, info os.FileInfo, diskSpaceLeft int64) bool {
if diskSpaceLeft <= 0 {
return info.Mode().IsRegular() && isExpiredRaftLog(module, info.Name())
}
return time.Since(info.ModTime()) > LogMaxReservedDays && isExpiredRaftLog(module, info.Name())
}
func isExpiredRaftLog(module, name string) bool {
if strings.HasSuffix(name, ".log") {
return false
}
if strings.HasPrefix(name, module+infoLogFileName) || strings.HasPrefix(name, module+debugLogFileName) ||
strings.HasPrefix(name, module+warnLogFileName) || strings.HasPrefix(name, module+errLogFileName) {
return true
}
return false
}
func (l *Log) Debug(format string, v ...interface{}) {
if l.IsEnableDebug() {
l.Output(3, l.SetPrefix(fmt.Sprintf(format+"\r\n", v...), levels[DebugLevel]), false)
}
}
func (l *Log) Info(format string, v ...interface{}) {
if l.IsEnableInfo() {
l.Output(3, l.SetPrefix(fmt.Sprintf(format+"\r\n", v...), levels[InfoLevel]), false)
}
}
func (l *Log) Warn(format string, v ...interface{}) {
if l.IsEnableWarn() {
l.Output(3, l.SetPrefix(fmt.Sprintf(format+"\r\n", v...), levels[WarnLevel]), false)
}
}
func (l *Log) Error(format string, v ...interface{}) {
l.Output(3, l.SetPrefix(fmt.Sprintf(format+"\r\n", v...), levels[ErrorLevel]), false)
}
func (l *Log) Fatal(format string, v ...interface{}) {
l.Output(3, l.SetPrefix(fmt.Sprintf(format+"\r\n", v...), levels[FatalLevel]), true)
os.Exit(1)
}
func (l *Log) Panic(format string, v ...interface{}) {
s := fmt.Sprintf(format+"\r\n", v...)
l.Output(3, l.SetPrefix(s, levels[FatalLevel]), true)
panic(s)
}
func Debug(format string, v ...interface{}) {
glog.Debug(format, v...)
}
func Info(format string, v ...interface{}) {
glog.Info(format, v...)
}
func Warn(format string, v ...interface{}) {
glog.Warn(format, v...)
}
func Error(format string, v ...interface{}) {
glog.Error(format, v...)
}
func Fatal(format string, v ...interface{}) {
glog.Fatal(format, v...)
}
func Panic(format string, v ...interface{}) {
glog.Panic(format, v...)
}
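// The sketch below (not part of the original sources; the directory, module name, and
// messages are illustrative) shows how a caller is expected to wire up the file logger.
// NewLog/InitFileLog require the log directory to exist, so it is created first.
func exampleInitAndLog() {
	dir := "/tmp/cubefs-log-demo"
	if err := os.MkdirAll(dir, os.ModePerm); err != nil {
		panic(err)
	}
	InitFileLog(dir, "demo", "INFO")

	logger := GetFileLogger()
	logger.Info("service started, pid=%d", os.Getpid())
	Warn("disk usage above %d%%", 80) // package-level wrappers use the same global logger
	Error("open %s: %v", "/tmp/missing", os.ErrNotExist)
}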
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package log
import (
"os"
"syscall"
)
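// logCrash redirects file descriptor 2 (stderr) onto f via dup3, so that unrecovered
// panic traces the runtime writes to stderr end up in the crash log file. Linux-only.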
func logCrash(f *os.File) error {
return syscall.Dup3(int(f.Fd()), 2, 0)
}
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package util
import (
"fmt"
"runtime"
"runtime/debug"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
)
func HandleCrash(handlers ...func(interface{})) {
if r := recover(); r != nil {
debug.PrintStack()
logPanic(r)
for _, fn := range handlers {
fn(r)
}
}
}
func logPanic(r interface{}) {
callers := ""
for i := 0; true; i++ {
_, file, line, ok := runtime.Caller(i)
if !ok {
break
}
callers = callers + fmt.Sprintf("%v:%v\n", file, line)
}
logger.Error("Recovered from panic: %#v (%v)\n%v", r, r, callers)
}
func RunWorker(f func(), handlers ...func(interface{})) {
go func() {
defer HandleCrash(handlers...)
f()
}()
}
func RunWorkerUtilStop(f func(), stopCh <-chan struct{}, handlers ...func(interface{})) {
go func() {
for {
select {
case <-stopCh:
return
default:
func() {
defer HandleCrash(handlers...)
f()
}()
}
}
}()
}
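// Sketch (illustrative, not part of the original sources): running workers through the
// crash-handling helpers above. RunWorker recovers and logs a panic once; the stoppable
// variant keeps re-invoking f until stopCh is closed, so f itself should block or sleep.
func exampleRunWorkers(doWork func()) {
	stopCh := make(chan struct{})

	// One-shot worker: a panic inside f is recovered, its stack logged, and the value
	// handed to the extra handler instead of killing the process.
	RunWorker(func() {
		panic("boom")
	}, func(r interface{}) {
		fmt.Printf("worker crashed: %v\n", r)
	})

	// Looping worker: doWork is re-run until stopCh is closed.
	RunWorkerUtilStop(doWork, stopCh)

	close(stopCh) // ask the looping worker to stop after its current iteration
}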
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package util
import (
"time"
)
const (
_ = iota
KB = 1 << (10 * iota)
MB
GB
)
const time_format = "2006-01-02 15:04:05.000"
type Uint64Slice []uint64
func (p Uint64Slice) Len() int { return len(p) }
func (p Uint64Slice) Less(i, j int) bool { return p[i] < p[j] }
func (p Uint64Slice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
func Min(a, b uint64) uint64 {
if a > b {
return b
}
return a
}
func Max(a, b uint64) uint64 {
if a > b {
return a
}
return b
}
func FormatDate(t time.Time) string {
return t.Format(time_format)
}
func FormatTimestamp(t int64) string {
if t <= 0 {
return ""
}
return time.Unix(0, t).Format(time_format)
}
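// Sketch (not part of the original sources): FormatTimestamp expects a Unix timestamp in
// nanoseconds, so pairing it with time.Now().UnixNano() yields the same string as
// FormatDate(time.Now()); non-positive timestamps format as "".
func exampleFormatNow() (byTime, byNanos string) {
	now := time.Now()
	return FormatDate(now), FormatTimestamp(now.UnixNano())
}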
// Copyright 2018 The tiglabs raft Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package util
func Uvarint64(buf []byte) (uint64, int) {
if buf[0] <= 0xF0 {
return uint64(buf[0]), 1
}
if buf[0] <= 0xF8 {
return 240 + 256*(uint64(buf[0])-241) + uint64(buf[1]), 2
}
if buf[0] == 0xF9 {
return 2288 + 256*uint64(buf[1]) + uint64(buf[2]), 3
}
if buf[0] == 0xFA {
return uint64(buf[1])<<16 | uint64(buf[2])<<8 | uint64(buf[3]), 4
}
if buf[0] == 0xFB {
return uint64(buf[1])<<24 | uint64(buf[2])<<16 | uint64(buf[3])<<8 | uint64(buf[4]), 5
}
if buf[0] == 0xFC {
return uint64(buf[1])<<32 | uint64(buf[2])<<24 | uint64(buf[3])<<16 | uint64(buf[4])<<8 | uint64(buf[5]), 6
}
if buf[0] == 0xFD {
return uint64(buf[1])<<40 | uint64(buf[2])<<32 | uint64(buf[3])<<24 | uint64(buf[4])<<16 | uint64(buf[5])<<8 | uint64(buf[6]), 7
}
if buf[0] == 0xFE {
return uint64(buf[1])<<48 | uint64(buf[2])<<40 | uint64(buf[3])<<32 | uint64(buf[4])<<24 | uint64(buf[5])<<16 | uint64(buf[6])<<8 | uint64(buf[7]), 8
}
return uint64(buf[1])<<56 | uint64(buf[2])<<48 | uint64(buf[3])<<40 | uint64(buf[4])<<32 | uint64(buf[5])<<24 | uint64(buf[6])<<16 | uint64(buf[7])<<8 | uint64(buf[8]), 9
}
func PutUvarint64(buf []byte, x uint64) int {
if x < 241 {
buf[0] = byte(x)
return 1
}
if x < 2288 {
buf[0] = byte((x-240)/256 + 241)
buf[1] = byte((x - 240) % 256)
return 2
}
if x < 67824 {
buf[0] = 0xF9
buf[1] = byte((x - 2288) / 256)
buf[2] = byte((x - 2288) % 256)
return 3
}
if x < 1<<24 {
buf[0] = 0xFA
buf[1] = byte(x >> 16)
buf[2] = byte(x >> 8)
buf[3] = byte(x)
return 4
}
if x < 1<<32 {
buf[0] = 0xFB
buf[1] = byte(x >> 24)
buf[2] = byte(x >> 16)
buf[3] = byte(x >> 8)
buf[4] = byte(x)
return 5
}
if x < 1<<40 {
buf[0] = 0xFC
buf[1] = byte(x >> 32)
buf[2] = byte(x >> 24)
buf[3] = byte(x >> 16)
buf[4] = byte(x >> 8)
buf[5] = byte(x)
return 6
}
if x < 1<<48 {
buf[0] = 0xFD
buf[1] = byte(x >> 40)
buf[2] = byte(x >> 32)
buf[3] = byte(x >> 24)
buf[4] = byte(x >> 16)
buf[5] = byte(x >> 8)
buf[6] = byte(x)
return 7
}
if x < 1<<56 {
buf[0] = 0xFE
buf[1] = byte(x >> 48)
buf[2] = byte(x >> 40)
buf[3] = byte(x >> 32)
buf[4] = byte(x >> 24)
buf[5] = byte(x >> 16)
buf[6] = byte(x >> 8)
buf[7] = byte(x)
return 8
}
buf[0] = 0xFF
buf[1] = byte(x >> 56)
buf[2] = byte(x >> 48)
buf[3] = byte(x >> 40)
buf[4] = byte(x >> 32)
buf[5] = byte(x >> 24)
buf[6] = byte(x >> 16)
buf[7] = byte(x >> 8)
buf[8] = byte(x)
return 9
}
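// Sketch (not part of the original sources): round-tripping a value through the
// variable-length encoding above. A 9-byte buffer always suffices, since PutUvarint64
// never writes more than 9 bytes.
func exampleVarintRoundTrip() bool {
	var buf [9]byte
	x := uint64(300000)          // encodes as 0xFA plus 3 payload bytes
	n := PutUvarint64(buf[:], x) // n == 4
	got, m := Uvarint64(buf[:n]) // decodes the same 4 bytes
	return got == x && m == n    // true: the round trip is lossless
}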
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"encoding/json"
"fmt"
"net"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
// const
const (
// the maximum number of tasks that can be handled each time
MaxTaskNum = 30
TaskWorkerInterval = time.Second * time.Duration(2)
idleConnTimeout = 90 // seconds
connectTimeout = 10 // seconds
)
// AdminTaskManager sends administration commands to the metaNode or dataNode.
type AdminTaskManager struct {
clusterID string
targetAddr string
TaskMap map[string]*proto.AdminTask
sync.RWMutex
exitCh chan struct{}
connPool *util.ConnectPool
}
func newAdminTaskManager(targetAddr, clusterID string) (sender *AdminTaskManager) {
proto.InitBufferPool(int64(32768))
sender = &AdminTaskManager{
targetAddr: targetAddr,
clusterID: clusterID,
TaskMap: make(map[string]*proto.AdminTask),
exitCh: make(chan struct{}, 1),
connPool: util.NewConnectPoolWithTimeout(idleConnTimeout, connectTimeout),
}
go sender.process()
return
}
func (sender *AdminTaskManager) process() {
ticker := time.NewTicker(TaskWorkerInterval)
defer func() {
ticker.Stop()
Warn(sender.clusterID, fmt.Sprintf("clusterID[%v] %v sender stop", sender.clusterID, sender.targetAddr))
}()
for {
select {
case <-sender.exitCh:
return
case <-ticker.C:
sender.doDeleteTasks()
sender.doSendTasks()
}
}
}
func (sender *AdminTaskManager) doDeleteTasks() {
delTasks := sender.getToBeDeletedTasks()
for _, t := range delTasks {
sender.DelTask(t)
}
return
}
func (sender *AdminTaskManager) getToBeDeletedTasks() (delTasks []*proto.AdminTask) {
sender.RLock()
defer sender.RUnlock()
delTasks = make([]*proto.AdminTask, 0)
for _, task := range sender.TaskMap {
if task.CheckTaskTimeOut() {
log.LogWarnf(fmt.Sprintf("clusterID[%v] %v has no response until time out",
sender.clusterID, task.ID))
if task.SendTime > 0 {
Warn(sender.clusterID, fmt.Sprintf("clusterID[%v] %v has no response until time out",
sender.clusterID, task.ID))
}
// timed-out tasks will be deleted
delTasks = append(delTasks, task)
}
}
return
}
func (sender *AdminTaskManager) doSendTasks() {
tasks := sender.getToDoTasks()
if len(tasks) == 0 {
return
}
sender.sendTasks(tasks)
}
func (sender *AdminTaskManager) getConn() (conn *net.TCPConn, err error) {
if useConnPool {
return sender.connPool.GetConnect(sender.targetAddr)
}
var connect net.Conn
connect, err = net.Dial("tcp", sender.targetAddr)
if err == nil {
conn = connect.(*net.TCPConn)
conn.SetKeepAlive(true)
conn.SetNoDelay(true)
}
return
}
func (sender *AdminTaskManager) putConn(conn *net.TCPConn, forceClose bool) {
if useConnPool {
sender.connPool.PutConnect(conn, forceClose)
}
}
func (sender *AdminTaskManager) sendTasks(tasks []*proto.AdminTask) {
for _, task := range tasks {
if task.OpCode == proto.OpVersionOperation {
log.LogInfof("action[sendTasks] get task to addr [%v]", task.OperatorAddr)
}
conn, err := sender.getConn()
if err != nil {
msg := fmt.Sprintf("clusterID[%v] get connection to %v,err,%v", sender.clusterID, sender.targetAddr, errors.Stack(err))
WarnBySpecialKey(fmt.Sprintf("%v_%v_sendTask", sender.clusterID, ModuleName), msg)
sender.putConn(conn, true)
sender.updateTaskInfo(task, false)
break
}
if err = sender.sendAdminTask(task, conn); err != nil {
log.LogError(fmt.Sprintf("send task %v to %v err %v,errStack,%v", task.ID, sender.targetAddr, err, errors.Stack(err)))
sender.putConn(conn, true)
sender.updateTaskInfo(task, true)
continue
}
sender.putConn(conn, false)
}
}
func (sender *AdminTaskManager) updateTaskInfo(task *proto.AdminTask, connSuccess bool) {
task.SendCount++
if connSuccess {
task.SendTime = time.Now().Unix()
task.Status = proto.TaskRunning
}
}
func (sender *AdminTaskManager) buildPacket(task *proto.AdminTask) (packet *proto.Packet, err error) {
packet = proto.NewPacket()
packet.Opcode = task.OpCode
packet.ReqID = proto.GenerateRequestID()
packet.PartitionID = task.PartitionID
body, err := json.Marshal(task)
if err != nil {
return nil, err
}
packet.Size = uint32(len(body))
packet.Data = body
return packet, nil
}
func (sender *AdminTaskManager) sendAdminTask(task *proto.AdminTask, conn net.Conn) (err error) {
packet, err := sender.buildPacket(task)
if err != nil {
return errors.Trace(err, "action[sendAdminTask build packet failed,task:%v]", task.ID)
}
if err = packet.WriteToConn(conn); err != nil {
return errors.Trace(err, "action[sendAdminTask],WriteToConn failed,task:%v", task.ID)
}
if err = packet.ReadFromConnWithVer(conn, proto.ReadDeadlineTime); err != nil {
return errors.Trace(err, "action[sendAdminTask],ReadFromConn failed task:%v", task.ID)
}
log.LogDebugf(fmt.Sprintf("action[sendAdminTask] sender task:%v success", task.ToString()))
sender.updateTaskInfo(task, true)
return nil
}
func (sender *AdminTaskManager) syncSendAdminTask(task *proto.AdminTask) (packet *proto.Packet, err error) {
packet, err = sender.buildPacket(task)
if err != nil {
return nil, errors.Trace(err, "action[syncSendAdminTask build packet failed,task:%v]", task.ID)
}
log.LogInfof("action[syncSendAdminTask],task[%s], op %s, reqId %d", task.ToString(), packet.GetOpMsg(), packet.GetReqID())
conn, err := sender.getConn()
if err != nil {
return nil, errors.Trace(err, "action[syncSendAdminTask get conn failed,task:%v]", task.ID)
}
defer func() {
if err == nil {
sender.putConn(conn, false)
} else {
sender.putConn(conn, true)
}
}()
if err = packet.WriteToConn(conn); err != nil {
return nil, errors.Trace(err, "action[syncSendAdminTask],WriteToConn failed,task:%v,reqID[%v]", task.ID, packet.ReqID)
}
if err = packet.ReadFromConnWithVer(conn, proto.SyncSendTaskDeadlineTime); err != nil {
return nil, errors.Trace(err, "action[syncSendAdminTask],ReadFromConn failed task:%v,reqID[%v]", task.ID, packet.ReqID)
}
if packet.ResultCode != proto.OpOk {
err = fmt.Errorf("result code[%v],msg[%v]", packet.ResultCode, string(packet.Data))
log.LogErrorf("action[syncSendAdminTask],task:%v,reqID[%v],err[%v],", task.ID, packet.ReqID, err)
return
}
return packet, nil
}
// DelTask deletes the to-be-deleted tasks.
func (sender *AdminTaskManager) DelTask(t *proto.AdminTask) {
sender.Lock()
defer sender.Unlock()
_, ok := sender.TaskMap[t.ID]
if !ok {
return
}
if t.OpCode != proto.OpMetaNodeHeartbeat && t.OpCode != proto.OpDataNodeHeartbeat && t.OpCode != proto.OpLcNodeHeartbeat {
log.LogDebugf("action[DelTask] delete task[%v]", t.ToString())
}
delete(sender.TaskMap, t.ID)
}
// AddTask adds a new task to the task map.
func (sender *AdminTaskManager) AddTask(t *proto.AdminTask) {
sender.Lock()
defer sender.Unlock()
_, ok := sender.TaskMap[t.ID]
if !ok {
sender.TaskMap[t.ID] = t
}
}
func (sender *AdminTaskManager) getToDoTasks() (tasks []*proto.AdminTask) {
sender.RLock()
defer sender.RUnlock()
tasks = make([]*proto.AdminTask, 0)
// send heartbeat task first
for _, t := range sender.TaskMap {
if t.IsHeartbeatTask() && t.CheckTaskNeedSend() {
tasks = append(tasks, t)
t.SendTime = time.Now().Unix()
}
}
// send urgent task immediately
for _, t := range sender.TaskMap {
if t.IsUrgentTask() && t.CheckTaskNeedSend() {
tasks = append(tasks, t)
t.SendTime = time.Now().Unix()
}
}
for _, task := range sender.TaskMap {
if !task.IsHeartbeatTask() && !task.IsUrgentTask() && task.CheckTaskNeedSend() {
tasks = append(tasks, task)
task.SendTime = time.Now().Unix()
if task.OpCode == proto.OpVersionOperation {
log.LogInfof("action[getToDoTasks] get task to addr [%v]", task.OperatorAddr)
continue
}
}
if len(tasks) > MaxTaskNum {
break
}
}
return
}
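// Sketch (illustrative; the address, cluster name, and task fields are assumptions, not
// values used elsewhere): enqueueing a task with the sender above. AddTask only registers
// the task; the background process() loop picks it up and sends it on its next tick.
func exampleEnqueueTask() {
	sender := newAdminTaskManager("192.168.0.10:17310", "demo-cluster")
	task := &proto.AdminTask{
		ID:           "demo-task-1",
		OpCode:       proto.OpDataNodeHeartbeat,
		OperatorAddr: "192.168.0.10:17310",
	}
	sender.AddTask(task)
}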
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"io"
"math"
"net/http"
"strconv"
"strings"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/compressor"
"github.com/cubefs/cubefs/util/cryptoutil"
"github.com/cubefs/cubefs/util/log"
)
// Parse the request that adds/deletes a raft node.
func parseRequestForRaftNode(r *http.Request) (id uint64, host string, err error) {
if err = r.ParseForm(); err != nil {
return
}
var idStr string
if idStr = r.FormValue(idKey); idStr == "" {
err = keyNotFound(idKey)
return
}
if id, err = strconv.ParseUint(idStr, 10, 64); err != nil {
return
}
if host = r.FormValue(addrKey); host == "" {
err = keyNotFound(addrKey)
return
}
if arr := strings.Split(host, colonSplit); len(arr) < 2 {
err = unmatchedKey(addrKey)
return
}
return
}
func extractTxTimeout(r *http.Request) (timeout int64, err error) {
var txTimeout uint64
if txTimeout, err = extractUint64WithDefault(r, txTimeoutKey, proto.DefaultTransactionTimeout); err != nil {
return
}
if txTimeout == 0 || txTimeout > proto.MaxTransactionTimeout {
return timeout, fmt.Errorf("txTimeout(%d) value range [1-%v] minutes", txTimeout, proto.MaxTransactionTimeout)
}
timeout = int64(txTimeout)
return timeout, nil
}
func extractTxConflictRetryNum(r *http.Request) (retryNum int64, err error) {
var txRetryNum uint64
if txRetryNum, err = extractUint64WithDefault(r, txConflictRetryNumKey, proto.DefaultTxConflictRetryNum); err != nil {
return
}
if txRetryNum == 0 || txRetryNum > proto.MaxTxConflictRetryNum {
return retryNum, fmt.Errorf("txRetryNum(%d) value range [1-%v]", txRetryNum, proto.MaxTxConflictRetryNum)
}
retryNum = int64(txRetryNum)
return retryNum, nil
}
func extractTxConflictRetryInterval(r *http.Request) (interval int64, err error) {
var txInterval uint64
if txInterval, err = extractUint64WithDefault(r, txConflictRetryIntervalKey, proto.DefaultTxConflictRetryInterval); err != nil {
return
}
if txInterval < proto.MinTxConflictRetryInterval || txInterval > proto.MaxTxConflictRetryInterval {
return interval, fmt.Errorf("txInterval(%d) value range [%v-%v] ms",
txInterval, proto.MinTxConflictRetryInterval, proto.MaxTxConflictRetryInterval)
}
interval = int64(txInterval)
return interval, nil
}
func extractTxOpLimitInterval(r *http.Request, volLimit int) (limit int, err error) {
var txLimit int
if txLimit, err = extractUintWithDefault(r, txOpLimitKey, volLimit); err != nil {
return
}
limit = txLimit
return
}
func hasTxParams(r *http.Request) bool {
var (
maskStr string
timeoutStr string
)
if maskStr = r.FormValue(enableTxMaskKey); maskStr != "" {
return true
}
if timeoutStr = r.FormValue(txTimeoutKey); timeoutStr != "" {
return true
}
return false
}
func parseTxMask(r *http.Request, oldMask proto.TxOpMask) (mask proto.TxOpMask, err error) {
var maskStr string
if maskStr = r.FormValue(enableTxMaskKey); maskStr == "" {
mask = oldMask
return
}
var reset bool
reset, err = extractBoolWithDefault(r, txForceResetKey, false)
if err != nil {
return
}
mask, err = proto.GetMaskFromString(maskStr)
if err != nil {
return
}
if reset {
return
}
if mask != proto.TxOpMaskOff {
mask = mask | oldMask
}
return
}
func parseRequestForUpdateMetaNode(r *http.Request) (nodeAddr string, id uint64, err error) {
if err = r.ParseForm(); err != nil {
return
}
if nodeAddr, err = extractNodeAddr(r); err != nil {
return
}
if id, err = extractNodeID(r); err != nil {
return
}
return
}
func parseRequestForAddNode(r *http.Request) (nodeAddr, zoneName string, err error) {
if err = r.ParseForm(); err != nil {
return
}
if nodeAddr, err = extractNodeAddr(r); err != nil {
return
}
if zoneName = r.FormValue(zoneNameKey); zoneName == "" {
zoneName = DefaultZoneName
}
return
}
func parseDecomNodeReq(r *http.Request) (nodeAddr string, limit int, err error) {
nodeAddr, err = parseAndExtractNodeAddr(r)
if err != nil {
return
}
limit, err = parseUintParam(r, countKey)
if err != nil {
return
}
return
}
func parseDecomDataNodeReq(r *http.Request) (nodeAddr string, err error) {
nodeAddr, err = parseAndExtractNodeAddr(r)
if err != nil {
return
}
return
}
func parseAndExtractNodeAddr(r *http.Request) (nodeAddr string, err error) {
if err = r.ParseForm(); err != nil {
return
}
return extractNodeAddr(r)
}
func parseRequestToDecommissionNode(r *http.Request) (nodeAddr, diskPath string, err error) {
if err = r.ParseForm(); err != nil {
return
}
nodeAddr, err = extractNodeAddr(r)
if err != nil {
return
}
diskPath, err = extractDiskPath(r)
return
}
func parseRequestToGetTaskResponse(r *http.Request) (tr *proto.AdminTask, err error) {
var body []byte
if err = r.ParseForm(); err != nil {
return
}
if body, err = io.ReadAll(r.Body); err != nil {
return
}
tr = &proto.AdminTask{}
decoder := json.NewDecoder(bytes.NewBuffer(body))
decoder.UseNumber()
err = decoder.Decode(tr)
return
}
func parseVolName(r *http.Request) (name string, err error) {
if err = r.ParseForm(); err != nil {
return
}
if name, err = extractName(r); err != nil {
return
}
return
}
func parseVolVerStrategy(r *http.Request) (strategy proto.VolumeVerStrategy, isForce bool, err error) {
var value string
if value = r.FormValue(enableKey); value == "" {
strategy.Enable = true
} else {
if strategy.Enable, err = strconv.ParseBool(value); err != nil {
log.LogErrorf("parseVolVerStrategy. strategy.Enable %v strategy %v", strategy.Enable, strategy)
return
}
}
strategy.KeepVerCnt, err = parseUintParam(r, countKey)
if strategy.Enable && err != nil {
log.LogErrorf("parseVolVerStrategy. strategy.Enable %v strategy %v", strategy.Enable, strategy)
return
}
strategy.Periodic, err = parseUintParam(r, Periodic)
if strategy.Enable && err != nil {
log.LogErrorf("parseVolVerStrategy. strategy.Enable %v strategy %v", strategy.Enable, strategy)
return
}
if value = r.FormValue(forceKey); value != "" {
isForce = true
strategy.ForceUpdate, _ = strconv.ParseBool(value)
}
log.LogDebugf("parseVolVerStrategy. strategy %v", strategy)
return
}
func parseGetVolParameter(r *http.Request) (p *getVolParameter, err error) {
p = &getVolParameter{}
skipOwnerValidationVal := r.Header.Get(proto.SkipOwnerValidation)
if len(skipOwnerValidationVal) > 0 {
if p.skipOwnerValidation, err = strconv.ParseBool(skipOwnerValidationVal); err != nil {
return
}
}
if p.name = r.FormValue(nameKey); p.name == "" {
err = keyNotFound(nameKey)
return
}
if !volNameRegexp.MatchString(p.name) {
err = errors.New("name can only be number and letters")
return
}
if p.authKey = r.FormValue(volAuthKey); !p.skipOwnerValidation && len(p.authKey) == 0 {
err = keyNotFound(volAuthKey)
return
}
return
}
func parseRequestToDeleteVol(r *http.Request) (name, authKey string, force bool, err error) {
if err = r.ParseForm(); err != nil {
return
}
if name, err = extractName(r); err != nil {
return
}
if authKey, err = extractAuthKey(r); err != nil {
return
}
force, err = extractBoolWithDefault(r, forceDelVolKey, false)
if err != nil {
return
}
return
}
func extractUintWithDefault(r *http.Request, key string, def int) (val int, err error) {
var str string
if str = r.FormValue(key); str == "" {
return def, nil
}
if val, err = strconv.Atoi(str); err != nil || val < 0 {
return 0, fmt.Errorf("parse [%s] is not valid int [%d], err %v", key, val, err)
}
return val, nil
}
func extractUint64WithDefault(r *http.Request, key string, def uint64) (val uint64, err error) {
var str string
if str = r.FormValue(key); str == "" {
return def, nil
}
if val, err = strconv.ParseUint(str, 10, 64); err != nil {
return 0, fmt.Errorf("parse [%s] is not valid uint [%d], err %v", key, val, err)
}
return val, nil
}
func extractInt64WithDefault(r *http.Request, key string, def int64) (val int64, err error) {
var str string
if str = r.FormValue(key); str == "" {
return def, nil
}
if val, err = strconv.ParseInt(str, 10, 64); err != nil || val < 0 {
return 0, fmt.Errorf("parse [%s] is not valid int [%d], err %v", key, val, err)
}
return val, nil
}
func extractStrWithDefault(r *http.Request, key string, def string) (val string) {
if val = r.FormValue(key); val == "" {
return def
}
return val
}
func extractBoolWithDefault(r *http.Request, key string, def bool) (val bool, err error) {
var str string
if str = r.FormValue(key); str == "" {
return def, nil
}
if val, err = strconv.ParseBool(str); err != nil {
return false, fmt.Errorf("parse [%s] is not a bool val [%t]", key, val)
}
return val, nil
}
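// Sketch (illustrative; the "capacity" and "force" parameter names are hypothetical, real
// handlers use the key constants defined in this package): combining the extract*WithDefault
// helpers above so that missing form values fall back to defaults instead of failing.
func exampleParseTunables(r *http.Request) (capacity uint64, force bool, err error) {
	if err = r.ParseForm(); err != nil {
		return
	}
	if capacity, err = extractUint64WithDefault(r, "capacity", 100); err != nil {
		return
	}
	force, err = extractBoolWithDefault(r, "force", false)
	return
}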
type updateVolReq struct {
name string
authKey string
capacity uint64
deleteLockTime int64
followerRead bool
authenticate bool
enablePosixAcl bool
enableTransaction proto.TxOpMask
txTimeout int64
txConflictRetryNum int64
txConflictRetryInterval int64
txOpLimit int
zoneName string
description string
dpSelectorName string
dpSelectorParm string
replicaNum int
coldArgs *coldVolArgs
dpReadOnlyWhenVolFull bool
enableQuota bool
}
func parseColdVolUpdateArgs(r *http.Request, vol *Vol) (args *coldVolArgs, err error) {
args = &coldVolArgs{}
if args.objBlockSize, err = extractUintWithDefault(r, ebsBlkSizeKey, vol.EbsBlkSize); err != nil {
return
}
if args.cacheCap, err = extractUint64WithDefault(r, cacheCapacity, vol.CacheCapacity); err != nil {
return
}
if args.cacheAction, err = extractUintWithDefault(r, cacheActionKey, vol.CacheAction); err != nil {
return
}
if args.cacheThreshold, err = extractUintWithDefault(r, cacheThresholdKey, vol.CacheThreshold); err != nil {
return
}
if args.cacheTtl, err = extractUintWithDefault(r, cacheTTLKey, vol.CacheTTL); err != nil {
return
}
if args.cacheHighWater, err = extractUintWithDefault(r, cacheHighWaterKey, vol.CacheHighWater); err != nil {
return
}
if args.cacheLowWater, err = extractUintWithDefault(r, cacheLowWaterKey, vol.CacheLowWater); err != nil {
return
}
if args.cacheLRUInterval, err = extractUintWithDefault(r, cacheLRUIntervalKey, vol.CacheLRUInterval); err != nil {
return
}
if args.cacheLRUInterval < 2 {
return nil, fmt.Errorf("cacheLruInterval(%d) muster be bigger than 2 minute", args.cacheLRUInterval)
}
args.cacheRule = extractStrWithDefault(r, cacheRuleKey, vol.CacheRule)
emptyCacheRule, err := extractBoolWithDefault(r, emptyCacheRuleKey, false)
if err != nil {
return
}
if emptyCacheRule {
args.cacheRule = ""
}
// do some check
if args.cacheLowWater >= args.cacheHighWater {
return nil, fmt.Errorf("low water(%d) must be less than high water(%d)", args.cacheLowWater, args.cacheHighWater)
}
if args.cacheHighWater >= 90 || args.cacheLowWater >= 90 {
return nil, fmt.Errorf("low(%d) or high water(%d) can't be large than 90, low than 0", args.cacheLowWater, args.cacheHighWater)
}
if args.cacheAction < proto.NoCache || args.cacheAction > proto.RWCache {
return nil, fmt.Errorf("cache action is illegal (%d)", args.cacheAction)
}
return
}
func parseVolUpdateReq(r *http.Request, vol *Vol, req *updateVolReq) (err error) {
if err = r.ParseForm(); err != nil {
return
}
req.authKey = extractStr(r, volAuthKey)
req.description = extractStrWithDefault(r, descriptionKey, vol.description)
req.zoneName = extractStrWithDefault(r, zoneNameKey, vol.zoneName)
if req.capacity, err = extractUint64WithDefault(r, volCapacityKey, vol.Capacity); err != nil {
return
}
if req.deleteLockTime, err = extractInt64WithDefault(r, volDeleteLockTimeKey, vol.DeleteLockTime); err != nil {
return
}
if req.enablePosixAcl, err = extractBoolWithDefault(r, enablePosixAclKey, vol.enablePosixAcl); err != nil {
return
}
var txMask proto.TxOpMask
if txMask, err = parseTxMask(r, vol.enableTransaction); err != nil {
return
}
req.enableTransaction = txMask
if req.enableQuota, err = extractBoolWithDefault(r, enableQuota, vol.enableQuota); err != nil {
return
}
var txTimeout int64
if txTimeout, err = extractTxTimeout(r); err != nil {
return
}
req.txTimeout = txTimeout
var txConflictRetryNum int64
if txConflictRetryNum, err = extractTxConflictRetryNum(r); err != nil {
return
}
req.txConflictRetryNum = txConflictRetryNum
var txConflictRetryInterval int64
if txConflictRetryInterval, err = extractTxConflictRetryInterval(r); err != nil {
return
}
req.txConflictRetryInterval = txConflictRetryInterval
if req.txOpLimit, err = extractTxOpLimitInterval(r, vol.txOpLimit); err != nil {
return
}
if req.authenticate, err = extractBoolWithDefault(r, authenticateKey, vol.authenticate); err != nil {
return
}
if req.followerRead, err = extractBoolWithDefault(r, followerReadKey, vol.FollowerRead); err != nil {
return
}
if req.dpReadOnlyWhenVolFull, err = extractBoolWithDefault(r, dpReadOnlyWhenVolFull, vol.DpReadOnlyWhenVolFull); err != nil {
return
}
req.dpSelectorName = r.FormValue(dpSelectorNameKey)
req.dpSelectorParm = r.FormValue(dpSelectorParmKey)
if (req.dpSelectorName == "" && req.dpSelectorParm != "") || (req.dpSelectorName != "" && req.dpSelectorParm == "") {
err = keyNotFound(dpSelectorNameKey + " or " + dpSelectorParmKey)
return
} else if req.dpSelectorParm == "" && req.dpSelectorName == "" {
req.dpSelectorName = vol.dpSelectorName
req.dpSelectorParm = vol.dpSelectorParm
}
if proto.IsCold(vol.VolType) {
req.followerRead = true
req.coldArgs, err = parseColdVolUpdateArgs(r, vol)
if err != nil {
return
}
}
return
}
func parseBoolFieldToUpdateVol(r *http.Request, vol *Vol) (followerRead, authenticate bool, err error) {
if followerReadStr := r.FormValue(followerReadKey); followerReadStr != "" {
if followerRead, err = strconv.ParseBool(followerReadStr); err != nil {
err = unmatchedKey(followerReadKey)
return
}
} else {
followerRead = vol.FollowerRead
}
if authenticateStr := r.FormValue(authenticateKey); authenticateStr != "" {
if authenticate, err = strconv.ParseBool(authenticateStr); err != nil {
err = unmatchedKey(authenticateKey)
return
}
} else {
authenticate = vol.authenticate
}
return
}
func parseRequestToSetApiQpsLimit(r *http.Request) (name string, limit uint32, timeout uint32, err error) {
if err = r.ParseForm(); err != nil {
return
}
if name, err = extractName(r); err != nil {
return
}
if limit, err = extractUint32(r, Limit); err != nil {
return
}
if timeout, err = extractUint32(r, TimeOut); err != nil {
return
}
if timeout == 0 {
err = fmt.Errorf("timeout(seconds) args must be larger than 0")
}
return
}
func parseRequestToSetVolCapacity(r *http.Request) (name, authKey string, capacity int, err error) {
if err = r.ParseForm(); err != nil {
return
}
if name, err = extractName(r); err != nil {
return
}
if authKey, err = extractAuthKey(r); err != nil {
return
}
if capacity, err = extractUint(r, volCapacityKey); err != nil {
return
}
return
}
type qosArgs struct {
qosEnable bool
diskQosEnable bool
iopsRVal uint64
iopsWVal uint64
flowRVal uint64
flowWVal uint64
}
func (qos *qosArgs) isArgsWork() bool {
return (qos.iopsRVal | qos.iopsWVal | qos.flowRVal | qos.flowWVal) > 0
}
type coldVolArgs struct {
objBlockSize int
cacheCap uint64
cacheAction int
cacheThreshold int
cacheTtl int
cacheHighWater int
cacheLowWater int
cacheLRUInterval int
cacheRule string
}
type createVolReq struct {
name string
owner string
dpSize int
mpCount int
dpCount int
dpReplicaNum uint8
capacity int
deleteLockTime int64
followerRead bool
authenticate bool
crossZone bool
normalZonesFirst bool
domainId uint64
zoneName string
description string
volType int
enablePosixAcl bool
DpReadOnlyWhenVolFull bool
enableTransaction proto.TxOpMask
enableQuota bool
txTimeout int64
txConflictRetryNum int64
txConflictRetryInterval int64
qosLimitArgs *qosArgs
clientReqPeriod, clientHitTriggerCnt uint32
// cold vol args
coldArgs coldVolArgs
}
func checkCacheAction(action int) error {
if action != proto.NoCache && action != proto.RCache && action != proto.RWCache {
return fmt.Errorf("cache action is not legal, action [%d]", action)
}
return nil
}
func parseColdArgs(r *http.Request) (args coldVolArgs, err error) {
args.cacheRule = extractStr(r, cacheRuleKey)
if args.objBlockSize, err = extractUint(r, ebsBlkSizeKey); err != nil {
return
}
if args.cacheCap, err = extractUint64(r, cacheCapacity); err != nil {
return
}
if args.cacheAction, err = extractUint(r, cacheActionKey); err != nil {
return
}
if args.cacheThreshold, err = extractUint(r, cacheThresholdKey); err != nil {
return
}
if args.cacheTtl, err = extractUint(r, cacheTTLKey); err != nil {
return
}
if args.cacheHighWater, err = extractUint(r, cacheHighWaterKey); err != nil {
return
}
if args.cacheLowWater, err = extractUint(r, cacheLowWaterKey); err != nil {
return
}
if args.cacheLRUInterval, err = extractUint(r, cacheLRUIntervalKey); err != nil {
return
}
return
}
func parseRequestToCreateVol(r *http.Request, req *createVolReq) (err error) {
if err = r.ParseForm(); err != nil {
return
}
if req.name, err = extractName(r); err != nil {
return
}
if req.owner, err = extractOwner(r); err != nil {
return
}
if req.coldArgs, err = parseColdArgs(r); err != nil {
return
}
if req.mpCount, err = extractUintWithDefault(r, metaPartitionCountKey, defaultInitMetaPartitionCount); err != nil {
return
}
if req.dpCount, err = extractUintWithDefault(r, dataPartitionCountKey, defaultInitDataPartitionCnt); err != nil {
return
}
var parsedDpReplicaNum int
if parsedDpReplicaNum, err = extractUint(r, replicaNumKey); err != nil {
return
}
if parsedDpReplicaNum < 0 || parsedDpReplicaNum > math.MaxUint8 {
return fmt.Errorf("invalid arg dpReplicaNum: %v", parsedDpReplicaNum)
}
req.dpReplicaNum = uint8(parsedDpReplicaNum)
if req.dpSize, err = extractUintWithDefault(r, dataPartitionSizeKey, 120); err != nil {
return
}
// default capacity 120
if req.capacity, err = extractUint(r, volCapacityKey); err != nil {
return
}
if req.deleteLockTime, err = extractInt64WithDefault(r, volDeleteLockTimeKey, 0); err != nil {
return
}
if req.volType, err = extractUint(r, volTypeKey); err != nil {
return
}
followerRead, followerExist, err := extractFollowerRead(r)
if err != nil {
return
}
if followerExist && !followerRead && proto.IsHot(req.volType) &&
(req.dpReplicaNum == 1 || req.dpReplicaNum == 2) {
return fmt.Errorf("vol with 1 or 2 replicas should enable followerRead")
}
req.followerRead = followerRead
if proto.IsHot(req.volType) && (req.dpReplicaNum == 1 || req.dpReplicaNum == 2) {
req.followerRead = true
}
if req.authenticate, err = extractBoolWithDefault(r, authenticateKey, false); err != nil {
return
}
if req.crossZone, err = extractBoolWithDefault(r, crossZoneKey, false); err != nil {
return
}
if req.normalZonesFirst, err = extractBoolWithDefault(r, normalZonesFirstKey, false); err != nil {
return
}
if req.qosLimitArgs, err = parseRequestQos(r, false, false); err != nil {
return err
}
req.zoneName = extractStr(r, zoneNameKey)
req.description = extractStr(r, descriptionKey)
req.domainId, err = extractUint64WithDefault(r, domainIdKey, 0)
if err != nil {
return
}
req.enablePosixAcl, err = extractPosixAcl(r)
if req.DpReadOnlyWhenVolFull, err = extractBoolWithDefault(r, dpReadOnlyWhenVolFull, false); err != nil {
return
}
var txMask proto.TxOpMask
if txMask, err = parseTxMask(r, proto.TxOpMaskOff); err != nil {
return
}
req.enableTransaction = txMask
var txTimeout int64
if txTimeout, err = extractTxTimeout(r); err != nil {
return
}
req.txTimeout = txTimeout
var txConflictRetryNum int64
if txConflictRetryNum, err = extractTxConflictRetryNum(r); err != nil {
return
}
req.txConflictRetryNum = txConflictRetryNum
var txConflictRetryInterval int64
if txConflictRetryInterval, err = extractTxConflictRetryInterval(r); err != nil {
return
}
req.txConflictRetryInterval = txConflictRetryInterval
if req.enableQuota, err = extractBoolWithDefault(r, enableQuota, false); err != nil {
return
}
return
}
func parseRequestToCreateDataPartition(r *http.Request) (count int, name string, err error) {
if err = r.ParseForm(); err != nil {
return
}
if countStr := r.FormValue(countKey); countStr == "" {
err = keyNotFound(countKey)
return
} else if count, err = strconv.Atoi(countStr); err != nil || count == 0 {
err = unmatchedKey(countKey)
return
}
if name, err = extractName(r); err != nil {
return
}
return
}
func parseRequestToGetConcurrentLcNode(r *http.Request) (count uint64, err error) {
if err = r.ParseForm(); err != nil {
return
}
if count, err = extractUint64(r, countKey); err != nil || count == 0 {
err = unmatchedKey(countKey)
return
}
return
}
func parseRequestToGetDataPartition(r *http.Request) (ID uint64, volName string, err error) {
if err = r.ParseForm(); err != nil {
return
}
if ID, err = extractDataPartitionID(r); err != nil {
return
}
volName = r.FormValue(nameKey)
return
}
func parseRequestToBalanceMetaPartition(r *http.Request) (zones string, nodeSetIds string, err error) {
if err = r.ParseForm(); err != nil {
return
}
zones = r.FormValue(zoneNameKey)
nodeSetIds = r.FormValue(nodesetIdKey)
return
}
func parseRequestToLoadDataPartition(r *http.Request) (ID uint64, err error) {
if err = r.ParseForm(); err != nil {
return
}
if ID, err = extractDataPartitionID(r); err != nil {
return
}
return
}
func parseRequestToAddMetaReplica(r *http.Request) (ID uint64, addr string, err error) {
return extractMetaPartitionIDAndAddr(r)
}
func parseRequestToRemoveMetaReplica(r *http.Request) (ID uint64, addr string, err error) {
return extractMetaPartitionIDAndAddr(r)
}
func extractMetaPartitionIDAndAddr(r *http.Request) (ID uint64, addr string, err error) {
if err = r.ParseForm(); err != nil {
return
}
if ID, err = extractMetaPartitionID(r); err != nil {
return
}
if addr, err = extractNodeAddr(r); err != nil {
return
}
return
}
func parseRequestToAddDataReplica(r *http.Request) (ID uint64, addr string, err error) {
return extractDataPartitionIDAndAddr(r)
}
func parseRequestToRemoveDataReplica(r *http.Request) (ID uint64, addr string, err error) {
return extractDataPartitionIDAndAddr(r)
}
func extractDataPartitionIDAndAddr(r *http.Request) (ID uint64, addr string, err error) {
if err = r.ParseForm(); err != nil {
return
}
if ID, err = extractDataPartitionID(r); err != nil {
return
}
if addr, err = extractNodeAddr(r); err != nil {
return
}
return
}
func extractDataPartitionID(r *http.Request) (ID uint64, err error) {
var value string
if value = r.FormValue(idKey); value == "" {
err = keyNotFound(idKey)
return
}
return strconv.ParseUint(value, 10, 64)
}
func parseRequestToDecommissionDataPartition(r *http.Request) (ID uint64, nodeAddr string, err error) {
return extractDataPartitionIDAndAddr(r)
}
func extractNodeAddr(r *http.Request) (nodeAddr string, err error) {
if nodeAddr = r.FormValue(addrKey); nodeAddr == "" {
err = keyNotFound(addrKey)
return
}
if ipAddr, ok := util.ParseAddrToIpAddr(nodeAddr); ok {
nodeAddr = ipAddr
}
return
}
func extractNodeID(r *http.Request) (ID uint64, err error) {
var value string
if value = r.FormValue(idKey); value == "" {
err = keyNotFound(idKey)
return
}
return strconv.ParseUint(value, 10, 64)
}
func extractNodesetID(r *http.Request) (ID uint64, err error) {
// nodeset id use same form key with node id
return extractNodeID(r)
}
func extractDiskPath(r *http.Request) (diskPath string, err error) {
if diskPath = r.FormValue(diskPathKey); diskPath == "" {
err = keyNotFound(diskPathKey)
return
}
return
}
func extractDiskDisable(r *http.Request) (diskDisable bool, err error) {
var value string
if value = r.FormValue(DiskDisableKey); value == "" {
diskDisable = true
return
}
return strconv.ParseBool(value)
}
func parseRequestToLoadMetaPartition(r *http.Request) (partitionID uint64, err error) {
if err = r.ParseForm(); err != nil {
return
}
if partitionID, err = extractMetaPartitionID(r); err != nil {
return
}
return
}
func parseRequestToDecommissionMetaPartition(r *http.Request) (partitionID uint64, nodeAddr string, err error) {
return extractMetaPartitionIDAndAddr(r)
}
func parseAndExtractStatus(r *http.Request) (status bool, err error) {
if err = r.ParseForm(); err != nil {
return
}
return extractStatus(r)
}
func parseAndExtractForbidden(r *http.Request) (forbidden bool, err error) {
if err = r.ParseForm(); err != nil {
return
}
return extractForbidden(r)
}
func extractStatus(r *http.Request) (status bool, err error) {
var value string
if value = r.FormValue(enableKey); value == "" {
err = keyNotFound(enableKey)
return
}
if status, err = strconv.ParseBool(value); err != nil {
return
}
return
}
func extractForbidden(r *http.Request) (forbidden bool, err error) {
var value string
if value = r.FormValue(forbiddenKey); value == "" {
err = keyNotFound(forbiddenKey)
return
}
if forbidden, err = strconv.ParseBool(value); err != nil {
return
}
return
}
func extractDataNodesetSelector(r *http.Request) string {
return r.FormValue(dataNodesetSelectorKey)
}
func extractMetaNodesetSelector(r *http.Request) string {
return r.FormValue(metaNodesetSelectorKey)
}
func extractDataNodeSelector(r *http.Request) string {
return r.FormValue(dataNodeSelectorKey)
}
func extractMetaNodeSelector(r *http.Request) string {
return r.FormValue(metaNodeSelectorKey)
}
func extractFollowerRead(r *http.Request) (followerRead bool, exist bool, err error) {
var value string
if value = r.FormValue(followerReadKey); value == "" {
followerRead = false
return
}
exist = true
if followerRead, err = strconv.ParseBool(value); err != nil {
return
}
return
}
func extractAuthenticate(r *http.Request) (authenticate bool, err error) {
var value string
if value = r.FormValue(authenticateKey); value == "" {
authenticate = false
return
}
if authenticate, err = strconv.ParseBool(value); err != nil {
return
}
return
}
func extractCrossZone(r *http.Request) (crossZone bool, err error) {
var value string
if value = r.FormValue(crossZoneKey); value == "" {
crossZone = false
return
}
if crossZone, err = strconv.ParseBool(value); err != nil {
return
}
return
}
func parseAndExtractDirLimit(r *http.Request) (limit uint32, err error) {
if err = r.ParseForm(); err != nil {
return
}
var value string
value = r.FormValue(dirLimitKey)
if value == "" {
value = r.FormValue(dirQuotaKey)
if value == "" {
err = keyNotFound(dirLimitKey)
return
}
}
var tmpLimit uint64
if tmpLimit, err = strconv.ParseUint(value, 10, 32); err != nil {
return
}
limit = uint32(tmpLimit)
return
}
func parseAndExtractThreshold(r *http.Request) (threshold float64, err error) {
if err = r.ParseForm(); err != nil {
return
}
var value string
if value = r.FormValue(thresholdKey); value == "" {
err = keyNotFound(thresholdKey)
return
}
if threshold, err = strconv.ParseFloat(value, 64); err != nil {
return
}
return
}
func parseAndExtractSetNodeSetInfoParams(r *http.Request) (params map[string]interface{}, err error) {
if err = r.ParseForm(); err != nil {
return
}
var value string
params = make(map[string]interface{})
if value = r.FormValue(countKey); value != "" {
count := uint64(0)
count, err = strconv.ParseUint(value, 10, 64)
if err != nil {
err = unmatchedKey(countKey)
return
}
params[countKey] = count
} else {
return nil, fmt.Errorf("not found %v", countKey)
}
var zoneName string
if zoneName = r.FormValue(zoneNameKey); zoneName == "" {
zoneName = DefaultZoneName
}
params[zoneNameKey] = zoneName
if value = r.FormValue(idKey); value != "" {
nodesetId := uint64(0)
nodesetId, err = strconv.ParseUint(value, 10, 64)
if err != nil {
err = unmatchedKey(idKey)
return
}
params[idKey] = nodesetId
} else {
return nil, fmt.Errorf("not found %v", idKey)
}
log.LogInfof("action[parseAndExtractSetNodeSetInfoParams]%v,%v,%v", params[zoneNameKey], params[idKey], params[countKey])
return
}
func parseAndExtractSetNodeInfoParams(r *http.Request) (params map[string]interface{}, err error) {
if err = r.ParseForm(); err != nil {
return
}
var value string
noParams := true
params = make(map[string]interface{})
if value = r.FormValue(nodeDeleteBatchCountKey); value != "" {
noParams = false
batchCount := uint64(0)
batchCount, err = strconv.ParseUint(value, 10, 64)
if err != nil {
err = unmatchedKey(nodeDeleteBatchCountKey)
return
}
params[nodeDeleteBatchCountKey] = batchCount
}
if value = r.FormValue(nodeMarkDeleteRateKey); value != "" {
noParams = false
val := uint64(0)
val, err = strconv.ParseUint(value, 10, 64)
if err != nil {
err = unmatchedKey(nodeMarkDeleteRateKey)
return
}
params[nodeMarkDeleteRateKey] = val
}
if value = r.FormValue(nodeAutoRepairRateKey); value != "" {
noParams = false
val := uint64(0)
val, err = strconv.ParseUint(value, 10, 64)
if err != nil {
err = unmatchedKey(nodeAutoRepairRateKey)
return
}
params[nodeAutoRepairRateKey] = val
}
if value = r.FormValue(nodeDeleteWorkerSleepMs); value != "" {
noParams = false
val := uint64(0)
val, err = strconv.ParseUint(value, 10, 64)
if err != nil {
err = unmatchedKey(nodeDeleteWorkerSleepMs)
return
}
params[nodeDeleteWorkerSleepMs] = val
}
if value = r.FormValue(clusterLoadFactorKey); value != "" {
noParams = false
valF, err := strconv.ParseFloat(value, 64)
if err != nil || valF < 0 {
err = unmatchedKey(clusterLoadFactorKey)
return params, err
}
params[clusterLoadFactorKey] = float32(valF)
}
if value = r.FormValue(maxDpCntLimitKey); value != "" {
noParams = false
val := uint64(0)
val, err = strconv.ParseUint(value, 10, 64)
if err != nil {
err = unmatchedKey(maxDpCntLimitKey)
return
}
params[maxDpCntLimitKey] = val
}
if value = r.FormValue(nodeDpRepairTimeOutKey); value != "" {
noParams = false
val := uint64(0)
val, err = strconv.ParseUint(value, 10, 64)
if err != nil {
err = unmatchedKey(nodeDpRepairTimeOutKey)
return
}
params[nodeDpRepairTimeOutKey] = val
}
if value = r.FormValue(nodeDpMaxRepairErrCntKey); value != "" {
noParams = false
val := uint64(0)
val, err = strconv.ParseUint(value, 10, 64)
if err != nil {
err = unmatchedKey(nodeDpMaxRepairErrCntKey)
return
}
params[nodeDpMaxRepairErrCntKey] = val
}
if value = r.FormValue(clusterCreateTimeKey); value != "" {
noParams = false
params[clusterCreateTimeKey] = value
}
if value = extractDataNodesetSelector(r); value != "" {
noParams = false
params[dataNodesetSelectorKey] = value
}
if value = extractMetaNodesetSelector(r); value != "" {
noParams = false
params[metaNodesetSelectorKey] = value
}
if value = extractDataNodeSelector(r); value != "" {
noParams = false
params[dataNodeSelectorKey] = value
}
if value = extractMetaNodeSelector(r); value != "" {
noParams = false
params[metaNodeSelectorKey] = value
}
if noParams {
err = keyNotFound(nodeDeleteBatchCountKey)
return
}
return
}
func validateRequestToCreateMetaPartition(r *http.Request) (volName string, count int, err error) {
if err = r.ParseForm(); err != nil {
return
}
if countStr := r.FormValue(countKey); countStr == "" {
err = keyNotFound(countKey)
return
} else if count, err = strconv.Atoi(countStr); err != nil || count == 0 {
err = unmatchedKey(countKey)
return
}
if volName, err = extractName(r); err != nil {
return
}
return
}
func parseAndExtractPartitionInfo(r *http.Request) (partitionID uint64, err error) {
if err = r.ParseForm(); err != nil {
return
}
if partitionID, err = extractMetaPartitionID(r); err != nil {
return
}
return
}
func extractMetaPartitionID(r *http.Request) (partitionID uint64, err error) {
var value string
if value = r.FormValue(idKey); value == "" {
err = keyNotFound(idKey)
return
}
return strconv.ParseUint(value, 10, 64)
}
func extractAuthKey(r *http.Request) (authKey string, err error) {
if authKey = r.FormValue(volAuthKey); authKey == "" {
err = keyNotFound(volAuthKey)
return
}
return
}
func extractClientIDKey(r *http.Request) (clientIDKey string, err error) {
if clientIDKey = r.FormValue(ClientIDKey); clientIDKey == "" {
err = keyNotFound(ClientIDKey)
return
}
return
}
func parseVolStatReq(r *http.Request) (name string, ver int, byMeta bool, err error) {
if err = r.ParseForm(); err != nil {
return
}
name, err = extractName(r)
if err != nil {
return
}
ver, err = extractUint(r, clientVersion)
if err != nil {
return
}
byMeta, err = extractBoolWithDefault(r, CountByMeta, false)
if err != nil {
return
}
return
}
func parseQosInfo(r *http.Request) (info *proto.ClientReportLimitInfo, err error) {
info = proto.NewClientReportLimitInfo()
var body []byte
if body, err = io.ReadAll(r.Body); err != nil {
return
}
// log.LogInfof("action[parseQosInfo] body len:[%v],crc:[%v]", len(body), crc32.ChecksumIEEE(body))
err = json.Unmarshal(body, info)
return
}
func parseAndExtractName(r *http.Request) (name string, err error) {
if err = r.ParseForm(); err != nil {
return
}
return extractName(r)
}
func extractName(r *http.Request) (name string, err error) {
if name = r.FormValue(nameKey); name == "" {
err = keyNotFound(nameKey)
return
}
if !volNameRegexp.MatchString(name) {
return "", errors.New("name can only be number and letters")
}
return
}
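// For illustration: a plain alphanumeric name such as "vol01" passes the
// volNameRegexp check in extractName above, while a value containing other
// characters (for example "vol/01") is rejected with "name can only contain
// numbers and letters". The exact pattern is defined elsewhere in this package.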
func extractUint(r *http.Request, key string) (val int, err error) {
var str string
var valParsed int64
if str = r.FormValue(key); str == "" {
return 0, nil
}
if valParsed, err = strconv.ParseInt(str, 10, 32); err != nil || valParsed < 0 {
return 0, fmt.Errorf("args [%s] is not legal, val %s", key, str)
}
val = int(valParsed)
return val, nil
}
func extractPositiveUint(r *http.Request, key string) (val int, err error) {
var str string
if str = r.FormValue(key); str == "" {
return 0, fmt.Errorf("args [%s] is not legal", key)
}
if val, err = strconv.Atoi(str); err != nil || val <= 0 {
return 0, fmt.Errorf("args [%s] is not legal, val %s", key, str)
}
return val, nil
}
func extractUint64(r *http.Request, key string) (val uint64, err error) {
var str string
if str = r.FormValue(key); str == "" {
return 0, nil
}
if val, err = strconv.ParseUint(str, 10, 64); err != nil {
return 0, fmt.Errorf("args [%s] is not legal, val %s", key, str)
}
return val, nil
}
func extractUint32(r *http.Request, key string) (val uint32, err error) {
var str string
if str = r.FormValue(key); str == "" {
return 0, nil
}
var tmp uint64
if tmp, err = strconv.ParseUint(str, 10, 32); err != nil {
return 0, fmt.Errorf("args [%s] is not legal, val %s", key, str)
}
return uint32(tmp), nil
}
func extractPositiveUint64(r *http.Request, key string) (val uint64, err error) {
var str string
if str = r.FormValue(key); str == "" {
return 0, fmt.Errorf("args [%s] is not legal", key)
}
if val, err = strconv.ParseUint(str, 10, 64); err != nil || val <= 0 {
return 0, fmt.Errorf("args [%s] is not legal, val %s", key, str)
}
return val, nil
}
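// Summary of the extract* helpers above: extractUint, extractUint32 and
// extractUint64 treat a missing form value as zero and return no error, while
// extractPositiveUint and extractPositiveUint64 treat a missing or
// non-positive value as an error. A sketch (the key "count" is illustrative):
//
//	n, _ := extractUint(r, "count")            // "" -> 0, no error
//	n, err := extractPositiveUint(r, "count")  // "" -> error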
func extractStr(r *http.Request, key string) (val string) {
return r.FormValue(key)
}
func extractOwner(r *http.Request) (owner string, err error) {
if owner = r.FormValue(volOwnerKey); owner == "" {
err = keyNotFound(volOwnerKey)
return
}
if !ownerRegexp.MatchString(owner) {
return "", errors.New("owner can only be number and letters")
}
return
}
func parseAndCheckTicket(r *http.Request, key []byte, volName string) (jobj proto.APIAccessReq, ticket cryptoutil.Ticket, ts int64, err error) {
var plaintext []byte
if err = r.ParseForm(); err != nil {
return
}
if plaintext, err = extractClientReqInfo(r); err != nil {
return
}
if err = json.Unmarshal([]byte(plaintext), &jobj); err != nil {
return
}
if err = proto.VerifyAPIAccessReqIDs(&jobj); err != nil {
return
}
ticket, ts, err = extractTicketMess(&jobj, key, volName)
return
}
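// Typical call sequence for parseAndCheckTicket above (sketch; masterKey and
// volName are supplied by the caller):
//
//	jobj, ticket, ts, err := parseAndCheckTicket(r, masterKey, volName)
//
// It decodes the base64 ClientMessage form value, unmarshals it into an
// APIAccessReq, verifies the request IDs, and then validates the ticket,
// verifier and capabilities via extractTicketMess below.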
func extractClientReqInfo(r *http.Request) (plaintext []byte, err error) {
var message string
if err = r.ParseForm(); err != nil {
return
}
if message = r.FormValue(proto.ClientMessage); message == "" {
err = keyNotFound(proto.ClientMessage)
return
}
if plaintext, err = cryptoutil.Base64Decode(message); err != nil {
return
}
return
}
func extractTicketMess(req *proto.APIAccessReq, key []byte, volName string) (ticket cryptoutil.Ticket, ts int64, err error) {
if ticket, err = proto.ExtractTicket(req.Ticket, key); err != nil {
err = fmt.Errorf("extractTicket failed: %s", err.Error())
return
}
if time.Now().Unix() >= ticket.Exp {
err = proto.ErrExpiredTicket
return
}
if ts, err = proto.ParseVerifier(req.Verifier, ticket.SessionKey.Key); err != nil {
err = fmt.Errorf("parseVerifier failed: %s", err.Error())
return
}
if err = proto.CheckAPIAccessCaps(&ticket, proto.APIRsc, req.Type, proto.APIAccess); err != nil {
err = fmt.Errorf("CheckAPIAccessCaps failed: %s", err.Error())
return
}
if err = proto.CheckVOLAccessCaps(&ticket, volName, proto.VOLAccess, proto.MasterNode); err != nil {
err = fmt.Errorf("CheckVOLAccessCaps failed: %s", err.Error())
return
}
return
}
func checkTicket(encodedTicket string, key []byte, Type proto.MsgType) (ticket cryptoutil.Ticket, err error) {
if ticket, err = proto.ExtractTicket(encodedTicket, key); err != nil {
err = fmt.Errorf("extractTicket failed: %s", err.Error())
return
}
if time.Now().Unix() >= ticket.Exp {
err = proto.ErrExpiredTicket
return
}
if err = proto.CheckAPIAccessCaps(&ticket, proto.APIRsc, Type, proto.APIAccess); err != nil {
err = fmt.Errorf("CheckAPIAccessCaps failed: %s", err.Error())
return
}
return
}
func newSuccessHTTPReply(data interface{}) *proto.HTTPReply {
return &proto.HTTPReply{Code: proto.ErrCodeSuccess, Msg: proto.ErrSuc.Error(), Data: data}
}
func newErrHTTPReply(err error) *proto.HTTPReply {
if err == nil {
return newSuccessHTTPReply("")
}
code, ok := proto.Err2CodeMap[err]
if ok {
return &proto.HTTPReply{Code: code, Msg: err.Error()}
}
return &proto.HTTPReply{Code: proto.ErrCodeInternalError, Msg: err.Error()}
}
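// Handlers below typically pair these constructors with sendOkReply and
// sendErrReply, e.g. (sketch):
//
//	sendOkReply(w, r, newSuccessHTTPReply("success"))
//	sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
//
// Errors found in proto.Err2CodeMap keep their mapped code; any other error
// falls back to proto.ErrCodeInternalError.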
func sendOkReply(w http.ResponseWriter, r *http.Request, httpReply *proto.HTTPReply) (err error) {
// Hold the read lock of lockable payloads while they are being marshalled.
switch data := httpReply.Data.(type) {
case *DataPartition:
data.RLock()
defer data.RUnlock()
case *MetaPartition:
data.RLock()
defer data.RUnlock()
case *MetaNode:
data.RLock()
defer data.RUnlock()
case *DataNode:
data.RLock()
defer data.RUnlock()
default:
// do nothing
}
reply, err := json.Marshal(httpReply)
if err != nil {
log.LogErrorf("fail to marshal http reply. URL[%v],remoteAddr[%v] err:[%v]", r.URL, r.RemoteAddr, err)
http.Error(w, "fail to marshal http reply", http.StatusBadRequest)
return
}
if acceptEncoding := r.Header.Get(proto.HeaderAcceptEncoding); acceptEncoding != "" {
if compressed, errx := compressor.New(acceptEncoding).Compress(reply); errx == nil {
w.Header().Set(proto.HeaderContentEncoding, acceptEncoding)
reply = compressed
}
}
send(w, r, reply)
return
}
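// Note on sendOkReply above: a request carrying "Accept-Encoding: gzip" gets a
// compressed body with "Content-Encoding: gzip" when the compressor supports
// that encoding; if compression fails, the uncompressed JSON reply is sent as is.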
func send(w http.ResponseWriter, r *http.Request, reply []byte) {
w.Header().Set("content-type", "application/json")
w.Header().Set("Content-Length", strconv.Itoa(len(reply)))
if _, err := w.Write(reply); err != nil {
log.LogErrorf("fail to write http len[%d].URL[%v],remoteAddr[%v] err:[%v]", len(reply), r.URL, r.RemoteAddr, err)
return
}
log.LogInfof("URL[%v],remoteAddr[%v],response ok", r.URL, r.RemoteAddr)
return
}
func sendErrReply(w http.ResponseWriter, r *http.Request, httpReply *proto.HTTPReply) {
log.LogInfof("URL[%v],remoteAddr[%v],response", r.URL, r.RemoteAddr)
reply, err := json.Marshal(httpReply)
if err != nil {
log.LogErrorf("fail to marshal http reply. URL[%v],remoteAddr[%v] err:[%v]", r.URL, r.RemoteAddr, err)
http.Error(w, "fail to marshal http reply", http.StatusBadRequest)
return
}
w.Header().Set("content-type", "application/json")
w.Header().Set("Content-Length", strconv.Itoa(len(reply)))
if _, err = w.Write(reply); err != nil {
log.LogErrorf("fail to write http len[%d].URL[%v],remoteAddr[%v] err:[%v]", len(reply), r.URL, r.RemoteAddr, err)
}
return
}
func parseRequestToUpdateDecommissionLimit(r *http.Request) (limit uint64, err error) {
if err = r.ParseForm(); err != nil {
return
}
var value string
if value = r.FormValue(decommissionLimit); value == "" {
err = keyNotFound(decommissionLimit)
return
}
limit, err = strconv.ParseUint(value, 10, 32)
if err != nil {
return
}
return
}
func parseSetConfigParam(r *http.Request) (key string, value string, err error) {
if err = r.ParseForm(); err != nil {
return
}
if value = r.FormValue(cfgmetaPartitionInodeIdStep); value == "" {
err = keyNotFound("config")
return
}
key = cfgmetaPartitionInodeIdStep
log.LogInfo("parseSetConfigParam success.")
return
}
func parseGetConfigParam(r *http.Request) (key string, err error) {
if err = r.ParseForm(); err != nil {
return
}
if key = r.FormValue(configKey); key == "" {
err = keyNotFound("config")
return
}
log.LogInfo("parseGetConfigParam success.")
return
}
func parserSetQuotaParam(r *http.Request, req *proto.SetMasterQuotaReuqest) (err error) {
if err = r.ParseForm(); err != nil {
return
}
if req.VolName, err = extractName(r); err != nil {
return
}
if req.MaxFiles, err = extractUint64WithDefault(r, MaxFilesKey, math.MaxUint64); err != nil {
return
}
if req.MaxBytes, err = extractUint64WithDefault(r, MaxBytesKey, math.MaxUint64); err != nil {
return
}
var body []byte
if body, err = io.ReadAll(r.Body); err != nil {
return
}
if err = json.Unmarshal(body, &req.PathInfos); err != nil {
return
}
log.LogInfo("parserSetQuotaParam success.")
return
}
func parserUpdateQuotaParam(r *http.Request, req *proto.UpdateMasterQuotaReuqest) (err error) {
if err = r.ParseForm(); err != nil {
return
}
if req.VolName, err = extractName(r); err != nil {
return
}
if req.QuotaId, err = extractQuotaId(r); err != nil {
return
}
if req.MaxFiles, err = extractUint64WithDefault(r, MaxFilesKey, math.MaxUint64); err != nil {
return
}
if req.MaxBytes, err = extractUint64WithDefault(r, MaxBytesKey, math.MaxUint64); err != nil {
return
}
log.LogInfo("parserUpdateQuotaParam success.")
return
}
func parseDeleteQuotaParam(r *http.Request) (volName string, quotaId uint32, err error) {
if err = r.ParseForm(); err != nil {
return
}
if volName, err = extractName(r); err != nil {
return
}
if quotaId, err = extractQuotaId(r); err != nil {
return
}
return
}
func parseGetQuotaParam(r *http.Request) (volName string, quotaId uint32, err error) {
if err = r.ParseForm(); err != nil {
return
}
if volName, err = extractName(r); err != nil {
return
}
if quotaId, err = extractQuotaId(r); err != nil {
return
}
return
}
func extractPath(r *http.Request) (fullPath string, err error) {
if fullPath = r.FormValue(fullPathKey); fullPath == "" {
err = keyNotFound(fullPathKey)
return
}
return
}
func extractQuotaId(r *http.Request) (quotaId uint32, err error) {
var value string
if value = r.FormValue(quotaKey); value == "" {
err = keyNotFound(quotaKey)
return
}
tmp, err := strconv.ParseUint(value, 10, 32)
quotaId = uint32(tmp)
return
}
func extractInodeId(r *http.Request) (inode uint64, err error) {
var value string
if value = r.FormValue(inodeKey); value == "" {
err = keyNotFound(inodeKey)
return
}
return strconv.ParseUint(value, 10, 64)
}
func parseRequestToUpdateDecommissionDiskFactor(r *http.Request) (factor float64, err error) {
if err = r.ParseForm(); err != nil {
return
}
var value string
if value = r.FormValue(decommissionDiskFactor); value == "" {
err = keyNotFound(decommissionDiskFactor)
return
}
return strconv.ParseFloat(value, 64)
}
func parseS3QosReq(r *http.Request, req *proto.S3QosRequest) (err error) {
var body []byte
if body, err = io.ReadAll(r.Body); err != nil {
return
}
if err = json.Unmarshal(body, &req); err != nil {
return
}
log.LogInfo("parseS3QosReq success.")
return
}
package master
import (
"context"
"encoding/json"
"fmt"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
"golang.org/x/time/rate"
)
const (
defaultApiLimitBurst = 1
)
type ApiLimitInfo struct {
ApiName string `json:"api_name"`
QueryPath string `json:"query_path"`
Limit uint32 `json:"limit"` // qps
LimiterTimeout uint32 `json:"limiter_timeout"`
Limiter *rate.Limiter `json:"-"`
}
func (li *ApiLimitInfo) InitLimiter() {
li.Limiter = rate.NewLimiter(rate.Limit(li.Limit), defaultApiLimitBurst)
}
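// With defaultApiLimitBurst == 1, a Limit of 100 produces a limiter that
// admits roughly 100 requests per second with no bursting; Wait (below)
// blocks until a token is available or its per-api timeout expires.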
type ApiLimiter struct {
m sync.RWMutex
limiterInfos map[string]*ApiLimitInfo
}
func newApiLimiter() *ApiLimiter {
return &ApiLimiter{
limiterInfos: make(map[string]*ApiLimitInfo),
}
}
func (l *ApiLimiter) clear() {
for k := range l.limiterInfos {
delete(l.limiterInfos, k)
}
}
func (l *ApiLimiter) Clear() {
l.m.Lock()
defer l.m.Unlock()
l.clear()
}
func (l *ApiLimiter) Replace(limiterInfos map[string]*ApiLimitInfo) {
l.m.Lock()
defer l.m.Unlock()
l.clear()
for k, v := range limiterInfos {
l.limiterInfos[k] = v
}
}
func (l *ApiLimiter) SetLimiter(apiName string, limit uint32, limiterTimeout uint32) (err error) {
var normalizedName string
var qPath string
if err, normalizedName, qPath = l.IsApiNameValid(apiName); err != nil {
return err
}
lInfo := &ApiLimitInfo{
ApiName: normalizedName,
QueryPath: qPath,
Limit: limit,
LimiterTimeout: limiterTimeout,
}
lInfo.InitLimiter()
l.m.Lock()
l.limiterInfos[qPath] = lInfo
l.m.Unlock()
return nil
}
func (l *ApiLimiter) RmLimiter(apiName string) (err error) {
var qPath string
if err, _, qPath = l.IsApiNameValid(apiName); err != nil {
return err
}
l.m.Lock()
delete(l.limiterInfos, qPath)
l.m.Unlock()
return nil
}
func (l *ApiLimiter) Wait(qPath string) (err error) {
var lInfo *ApiLimitInfo
var ok bool
l.m.RLock()
if lInfo, ok = l.limiterInfos[qPath]; !ok {
l.m.RUnlock()
log.LogDebugf("no api limiter for api[%v]", qPath)
return nil
}
l.m.RUnlock()
ctx, cancel := context.WithTimeout(context.Background(), time.Second*time.Duration(lInfo.LimiterTimeout))
defer cancel()
err = lInfo.Limiter.Wait(ctx)
if err != nil {
log.LogErrorf("wait api limiter for api[%v] failed: %v", qPath, err)
return err
}
log.LogDebugf("wait api limiter for api[%v]", qPath)
return nil
}
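// Usage sketch (the api name "getcluster" is only illustrative; valid names
// are the keys of proto.GApiInfo):
//
//	limiter := newApiLimiter()
//	if err := limiter.SetLimiter("getcluster", 100, 5); err != nil {
//		// unknown api name
//	}
//	// in the request path, block until a token is available or the
//	// limiter timeout (5s here) expires:
//	_ = limiter.Wait(qPath)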
func (l *ApiLimiter) IsApiNameValid(name string) (err error, normalizedName, qPath string) {
normalizedName = strings.ToLower(name)
var ok bool
if qPath, ok = proto.GApiInfo[normalizedName]; ok {
return nil, normalizedName, qPath
}
return fmt.Errorf("api name [%v] is not valid", name), normalizedName, qPath
}
func (l *ApiLimiter) IsFollowerLimiter(qPath string) bool {
if qPath == proto.AdminGetIP || qPath == proto.ClientDataPartitions {
return true
}
return false
}
func (l *ApiLimiter) updateLimiterInfoFromLeader(value []byte) {
limiterInfos := make(map[string]*ApiLimitInfo)
if err := json.Unmarshal(value, &limiterInfos); err != nil {
log.LogErrorf("action[updateLimiterInfoFromLeader], unmarshal err:%v", err.Error())
return
}
for _, v := range limiterInfos {
v.InitLimiter()
}
l.m.Lock()
l.limiterInfos = limiterInfos
l.m.Unlock()
log.LogInfof("action[updateLimiterInfoFromLeader], limiter info[%v]", value)
}
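// The value handled by updateLimiterInfoFromLeader above is the JSON-encoded
// limiterInfos map, e.g. (sketch; the query-path key is illustrative):
//
//	{"/admin/getCluster":{"api_name":"getcluster","query_path":"/admin/getCluster","limit":100,"limiter_timeout":5}}
//
// Limiter is tagged `json:"-"`, so each entry's rate.Limiter is rebuilt via
// InitLimiter after unmarshalling.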
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"context"
"encoding/json"
"fmt"
"io"
"math"
"net/http"
"regexp"
"strconv"
"strings"
"sync/atomic"
"time"
"golang.org/x/time/rate"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/compressor"
"github.com/cubefs/cubefs/util/cryptoutil"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/iputil"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
)
func apiToMetricsName(api string) (reqMetricName string) {
var builder strings.Builder
builder.WriteString("req")
// Prometheus metric names do not allow '/', so replace it with '_'.
builder.WriteString(strings.Replace(api, "/", "_", -1))
return builder.String()
}
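// For example, apiToMetricsName("/admin/getCluster") returns
// "req_admin_getCluster": the "req" prefix plus the path with every '/'
// replaced by '_'.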
func doStatAndMetric(statName string, metric *exporter.TimePointCount, err error, metricLabels map[string]string) {
if metric == nil {
return
}
if metricLabels == nil {
metric.Set(err)
} else {
metric.SetWithLabels(err, metricLabels)
}
startTime := metric.GetStartTime()
stat.EndStat(statName, err, &startTime, 1)
}
// NodeView provides the view of the data or meta node.
type NodeView struct {
Addr string
Status bool
ID uint64
IsWritable bool
}
// InvalidNodeView provides the view of an invalid data or meta node.
type InvalidNodeView struct {
Addr string
ID uint64
OldID uint64
NodeType string
}
// TopologyView provides the topology view of the cluster.
type TopologyView struct {
Zones []*ZoneView
}
type NodeSetView struct {
DataNodeLen int
MetaNodeLen int
MetaNodes []proto.NodeView
DataNodes []proto.NodeView
}
func newNodeSetView(dataNodeLen, metaNodeLen int) *NodeSetView {
return &NodeSetView{DataNodes: make([]proto.NodeView, 0), MetaNodes: make([]proto.NodeView, 0), DataNodeLen: dataNodeLen, MetaNodeLen: metaNodeLen}
}
// ZoneView define the view of zone
type ZoneView struct {
Name string
Status string
DataNodesetSelector string
MetaNodesetSelector string
NodeSet map[uint64]*NodeSetView
}
func newZoneView(name string) *ZoneView {
return &ZoneView{NodeSet: make(map[uint64]*NodeSetView, 0), Name: name}
}
type badPartitionView = proto.BadPartitionView
func (m *Server) setClusterInfo(w http.ResponseWriter, r *http.Request) {
var (
dirLimit uint32
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminSetClusterInfo))
defer func() {
doStatAndMetric(proto.AdminSetClusterInfo, metric, err, nil)
}()
if dirLimit, err = parseAndExtractDirLimit(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if dirLimit < proto.MinDirChildrenNumLimit {
dirLimit = proto.MinDirChildrenNumLimit
}
if err = m.cluster.setClusterInfo(dirLimit); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("set dir limit(min:%v, max:%v) to %v successfully",
proto.MinDirChildrenNumLimit, math.MaxUint32, dirLimit)))
}
func (m *Server) getMonitorPushAddr(w http.ResponseWriter, r *http.Request) {
var (
addr string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetMonitorPushAddr))
defer func() {
doStatAndMetric(proto.AdminGetMonitorPushAddr, metric, err, nil)
}()
addr = m.cluster.getMonitorPushAddr()
sendOkReply(w, r, newSuccessHTTPReply(addr))
}
// Set the threshold of the memory usage on each meta node.
// If the memory usage reaches this threshold, all the meta partitions on the node will be marked as readOnly.
func (m *Server) setMetaNodeThreshold(w http.ResponseWriter, r *http.Request) {
var (
threshold float64
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminSetMetaNodeThreshold))
defer func() {
doStatAndMetric(proto.AdminSetMetaNodeThreshold, metric, err, nil)
}()
if threshold, err = parseAndExtractThreshold(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.cluster.setMetaNodeThreshold(float32(threshold)); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("set threshold to %v successfully", threshold)))
}
// Turn on or off the automatic allocation of the data partitions.
// If DisableAutoAllocate == off, then we WILL automatically allocate new data partitions for the volume when:
// 1. the used space is below the max capacity,
// 2. and the number of r&w data partitions is less than 20.
//
// If DisableAutoAllocate == on, then we WILL NOT automatically allocate new data partitions for the volume.
func (m *Server) setupAutoAllocation(w http.ResponseWriter, r *http.Request) {
var (
status bool
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminClusterFreeze))
defer func() {
doStatAndMetric(proto.AdminClusterFreeze, metric, err, nil)
}()
if status, err = parseAndExtractStatus(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.cluster.setDisableAutoAllocate(status); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("set DisableAutoAllocate to %v successfully", status)))
}
func (m *Server) forbidVolume(w http.ResponseWriter, r *http.Request) {
var (
status bool
name string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminVolForbidden))
defer func() {
doStatAndMetric(proto.AdminVolForbidden, metric, err, nil)
if err != nil {
log.LogErrorf("set volume forbidden failed, error: %v", err)
} else {
log.LogInfof("set volume forbidden to (%v) success", status)
}
}()
if name, err = parseAndExtractName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if status, err = parseAndExtractForbidden(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
vol, err := m.cluster.getVol(name)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVolNotExists, Msg: err.Error()})
return
}
oldForbidden := vol.Forbidden
vol.Forbidden = status
defer func() {
if err != nil {
vol.Forbidden = oldForbidden
}
}()
if err = m.cluster.syncUpdateVol(vol); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if status {
// set data partition status to read only
vol.setDpRdOnly()
// set meta partition status to read only
vol.setMpRdOnly()
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("set volume forbidden to (%v) success", status)))
}
func (m *Server) setEnableAuditLogForVolume(w http.ResponseWriter, r *http.Request) {
var (
status bool
name string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminVolEnableAuditLog))
defer func() {
doStatAndMetric(proto.AdminVolEnableAuditLog, metric, err, nil)
if err != nil {
log.LogErrorf("set volume aduit log failed, error: %v", err)
} else {
log.LogInfof("set volume aduit log to (%v) success", status)
}
}()
if name, err = parseAndExtractName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if status, err = parseAndExtractStatus(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
vol, err := m.cluster.getVol(name)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVolNotExists, Msg: err.Error()})
return
}
oldEnable := vol.EnableAuditLog
vol.EnableAuditLog = status
defer func() {
if err != nil {
vol.EnableAuditLog = oldEnable
}
}()
if err = m.cluster.syncUpdateVol(vol); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("set volume audit log to (%v) success", status)))
}
func (m *Server) setupForbidMetaPartitionDecommission(w http.ResponseWriter, r *http.Request) {
var (
status bool
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminClusterForbidMpDecommission))
defer func() {
doStatAndMetric(proto.AdminClusterForbidMpDecommission, metric, err, nil)
if err != nil {
log.LogErrorf("set ForbidMpDecommission failed, error: %v", err)
} else {
log.LogInfof("set ForbidMpDecommission to (%v) success", status)
}
}()
if status, err = parseAndExtractStatus(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.cluster.setForbidMpDecommission(status); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("set ForbidMpDecommission to %v successfully", status)))
}
// View the topology of the cluster.
func (m *Server) getTopology(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.GetTopologyView))
defer func() {
doStatAndMetric(proto.GetTopologyView, metric, nil, nil)
}()
tv := &TopologyView{
Zones: make([]*ZoneView, 0),
}
zones := m.cluster.t.getAllZones()
for _, zone := range zones {
cv := newZoneView(zone.name)
cv.Status = zone.getStatusToString()
cv.DataNodesetSelector = zone.GetDataNodesetSelector()
cv.MetaNodesetSelector = zone.GetMetaNodesetSelector()
tv.Zones = append(tv.Zones, cv)
nsc := zone.getAllNodeSet()
for _, ns := range nsc {
nsView := newNodeSetView(ns.dataNodeLen(), ns.metaNodeLen())
cv.NodeSet[ns.ID] = nsView
ns.dataNodes.Range(func(key, value interface{}) bool {
dataNode := value.(*DataNode)
nsView.DataNodes = append(nsView.DataNodes, proto.NodeView{
ID: dataNode.ID, Addr: dataNode.Addr,
DomainAddr: dataNode.DomainAddr, IsActive: dataNode.isActive, IsWritable: dataNode.isWriteAble(),
})
return true
})
ns.metaNodes.Range(func(key, value interface{}) bool {
metaNode := value.(*MetaNode)
nsView.MetaNodes = append(nsView.MetaNodes, proto.NodeView{
ID: metaNode.ID, Addr: metaNode.Addr,
DomainAddr: metaNode.DomainAddr, IsActive: metaNode.IsActive, IsWritable: metaNode.isWritable(),
})
return true
})
}
}
sendOkReply(w, r, newSuccessHTTPReply(tv))
}
func (m *Server) updateZone(w http.ResponseWriter, r *http.Request) {
var (
name string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.UpdateZone))
defer func() {
doStatAndMetric(proto.UpdateZone, metric, err, nil)
}()
if name = r.FormValue(nameKey); name == "" {
err = keyNotFound(nameKey)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
status, err := extractStatus(r)
dataNodesetSelector := extractDataNodesetSelector(r)
metaNodesetSelector := extractMetaNodesetSelector(r)
dataNodeSelector := extractDataNodeSelector(r)
metaNodeSelector := extractMetaNodeSelector(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
zone, err := m.cluster.t.getZone(name)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeZoneNotExists, Msg: err.Error()})
return
}
if status {
zone.setStatus(normalZone)
} else {
zone.setStatus(unavailableZone)
}
err = zone.updateNodesetSelector(m.cluster, dataNodesetSelector, metaNodesetSelector)
if err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
err = m.updateZoneNodeSelector(zone.name, dataNodeSelector, metaNodeSelector)
if err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("update zone status to [%v] successfully", status)))
}
func (m *Server) listZone(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.GetAllZones))
defer func() {
doStatAndMetric(proto.GetAllZones, metric, nil, nil)
}()
zones := m.cluster.t.getAllZones()
zoneViews := make([]*ZoneView, 0)
for _, zone := range zones {
cv := newZoneView(zone.name)
cv.Status = zone.getStatusToString()
cv.DataNodesetSelector = zone.GetDataNodesetSelector()
cv.MetaNodesetSelector = zone.GetMetaNodesetSelector()
zoneViews = append(zoneViews, cv)
}
sendOkReply(w, r, newSuccessHTTPReply(zoneViews))
}
func (m *Server) listNodeSets(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.GetAllNodeSets))
defer func() {
doStatAndMetric(proto.GetAllNodeSets, metric, nil, nil)
}()
var zones []*Zone
// if zoneName is empty, list all nodeSets, otherwise list node sets in the specified zone
zoneName := r.FormValue(zoneNameKey)
if zoneName == "" {
zones = m.cluster.t.getAllZones()
} else {
zone, err := m.cluster.t.getZone(zoneName)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeZoneNotExists, Msg: err.Error()})
return
}
zones = []*Zone{zone}
}
nodeSetStats := make([]*proto.NodeSetStat, 0)
for _, zone := range zones {
nsc := zone.getAllNodeSet()
for _, ns := range nsc {
nsStat := &proto.NodeSetStat{
ID: ns.ID,
Capacity: ns.Capacity,
Zone: zone.name,
DataNodeNum: ns.dataNodeLen(),
MetaNodeNum: ns.metaNodeLen(),
}
nodeSetStats = append(nodeSetStats, nsStat)
}
}
sendOkReply(w, r, newSuccessHTTPReply(nodeSetStats))
}
func (m *Server) getNodeSet(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.GetNodeSet))
defer func() {
doStatAndMetric(proto.GetNodeSet, metric, nil, nil)
}()
nodeSetStr := r.FormValue(nodesetIdKey)
if nodeSetStr == "" {
err := keyNotFound(nodesetIdKey)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
nodeSetId, err := strconv.ParseUint(nodeSetStr, 10, 64)
if err != nil {
err = fmt.Errorf("invalid nodeSetId: %v", nodeSetStr)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
ns, err := m.cluster.t.getNodeSetByNodeSetId(nodeSetId)
if err != nil {
err := nodeSetNotFound(nodeSetId)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeNodeSetNotExists, Msg: err.Error()})
return
}
nsStat := &proto.NodeSetStatInfo{
ID: ns.ID,
Capacity: ns.Capacity,
Zone: ns.zoneName,
DataNodeSelector: ns.GetDataNodeSelector(),
MetaNodeSelector: ns.GetMetaNodeSelector(),
}
ns.dataNodes.Range(func(key, value interface{}) bool {
dn := value.(*DataNode)
nsStat.DataNodes = append(nsStat.DataNodes, &proto.NodeStatView{
Addr: dn.Addr,
Status: dn.isActive,
DomainAddr: dn.DomainAddr,
ID: dn.ID,
IsWritable: dn.isWriteAble(),
Total: dn.Total,
Used: dn.Used,
Avail: dn.Total - dn.Used,
})
return true
})
ns.metaNodes.Range(func(key, value interface{}) bool {
mn := value.(*MetaNode)
nsStat.MetaNodes = append(nsStat.MetaNodes, &proto.NodeStatView{
Addr: mn.Addr,
Status: mn.IsActive,
DomainAddr: mn.DomainAddr,
ID: mn.ID,
IsWritable: mn.isWritable(),
Total: mn.Total,
Used: mn.Used,
Avail: mn.Total - mn.Used,
})
return true
})
sendOkReply(w, r, newSuccessHTTPReply(nsStat))
}
func (m *Server) updateNodeSet(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.UpdateNodeSet))
defer func() {
doStatAndMetric(proto.UpdateNodeSet, metric, nil, nil)
}()
nodeSetStr := r.FormValue(nodesetIdKey)
if nodeSetStr == "" {
err := keyNotFound(nodesetIdKey)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
nodeSetId, err := strconv.ParseUint(nodeSetStr, 10, 64)
if err != nil {
err = fmt.Errorf("invalid nodeSetId: %v", nodeSetStr)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
ns, err := m.cluster.t.getNodeSetByNodeSetId(nodeSetId)
if err != nil {
err := nodeSetNotFound(nodeSetId)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeNodeSetNotExists, Msg: err.Error()})
return
}
dataNodeSelector := extractDataNodeSelector(r)
metaNodeSelector := extractMetaNodeSelector(r)
needSync := false
if dataNodeSelector != "" && dataNodeSelector != ns.GetDataNodeSelector() {
ns.SetDataNodeSelector(dataNodeSelector)
needSync = true
}
if metaNodeSelector != "" && metaNodeSelector != ns.GetMetaNodeSelector() {
ns.SetMetaNodeSelector(metaNodeSelector)
needSync = true
}
if needSync {
err = m.cluster.syncUpdateNodeSet(ns)
if err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
sendOkReply(w, r, newSuccessHTTPReply("success"))
}
func (m *Server) clusterStat(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminClusterStat))
defer func() {
doStatAndMetric(proto.AdminClusterStat, metric, nil, nil)
}()
cs := &proto.ClusterStatInfo{
DataNodeStatInfo: m.cluster.dataNodeStatInfo,
MetaNodeStatInfo: m.cluster.metaNodeStatInfo,
ZoneStatInfo: make(map[string]*proto.ZoneStat, 0),
}
for zoneName, zoneStat := range m.cluster.zoneStatInfos {
cs.ZoneStatInfo[zoneName] = zoneStat
}
sendOkReply(w, r, newSuccessHTTPReply(cs))
}
func (m *Server) UidOperate(w http.ResponseWriter, r *http.Request) {
var (
uid uint32
err error
volName string
vol *Vol
op uint64
value string
capSize uint64
uidList []*proto.UidSpaceInfo
uidInfo *proto.UidSpaceInfo
ok bool
)
if volName, err = extractName(r); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if value = r.FormValue(OperateKey); value == "" {
err = keyNotFound(OperateKey)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
op, err = strconv.ParseUint(value, 10, 64)
if err != nil {
err = fmt.Errorf("parseUintParam %s-%s is not legal, err %s", OperateKey, value, err.Error())
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if op != util.UidLimitList {
if uid, err = extractUint32(r, UIDKey); err != nil {
err = keyNotFound(UIDKey)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
if op == util.UidAddLimit {
if capSize, err = extractPositiveUint64(r, CapacityKey); err != nil {
err = keyNotFound(CapacityKey)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
log.LogDebugf("uidOperate. name %v op %v uid %v", volName, op, uid)
if vol, err = m.cluster.getVol(volName); err != nil {
log.LogDebugf("aclOperate. name %v not found", volName)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
ok = true
switch op {
case util.UidGetLimit:
ok, uidInfo = vol.uidSpaceManager.checkUid(uid)
uidList = append(uidList, uidInfo)
case util.AclAddIP:
ok = vol.uidSpaceManager.addUid(uid, capSize)
case util.AclDelIP:
ok = vol.uidSpaceManager.removeUid(uid)
case util.AclListIP:
uidList = vol.uidSpaceManager.listAll()
default:
// do nothing
}
rsp := &proto.UidSpaceRsp{
OK: ok,
UidSpaceArr: uidList,
}
_ = sendOkReply(w, r, newSuccessHTTPReply(rsp))
}
func (m *Server) aclOperate(w http.ResponseWriter, r *http.Request) {
var (
ip string
err error
volName string
vol *Vol
op uint64
value string
ok, res bool
ipList []*proto.AclIpInfo
)
if volName, err = extractName(r); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if value = r.FormValue(OperateKey); value == "" {
err = keyNotFound(OperateKey)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
op, err = strconv.ParseUint(value, 10, 64)
if err != nil {
err = fmt.Errorf("parseUintParam %s-%s is not legal, err %s", OperateKey, value, err.Error())
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if op != util.AclListIP {
if ip = r.FormValue(IPKey); ip == "" {
err = keyNotFound(IPKey)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
log.LogDebugf("aclOperate. name %v op %v ip %v", volName, op, ip)
if vol, err = m.cluster.getVol(volName); err != nil {
log.LogDebugf("aclOperate. name %v not found", volName)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
ok = true
opAclRes := vol.aclMgr.aclOperate(op, ip)
switch op {
case util.AclCheckIP:
if ipList, res = opAclRes.([]*proto.AclIpInfo); !res {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("inner error")))
return
}
if len(ipList) > 0 {
ok = false
}
case util.AclAddIP, util.AclDelIP:
if opAclRes != nil {
if err, res = opAclRes.(error); !res {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("inner error")))
return
}
}
case util.AclListIP:
if ipList, res = opAclRes.([]*proto.AclIpInfo); !res {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("inner error")))
return
}
default:
// do nothing
}
rsp := &proto.AclRsp{
OK: ok,
List: ipList,
}
_ = sendOkReply(w, r, newSuccessHTTPReply(rsp))
}
func (m *Server) getCluster(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetCluster))
defer func() {
doStatAndMetric(proto.AdminGetCluster, metric, nil, nil)
}()
cv := &proto.ClusterView{
Name: m.cluster.Name,
CreateTime: time.Unix(m.cluster.CreateTime, 0).Format(proto.TimeFormat),
LeaderAddr: m.leaderInfo.addr,
DisableAutoAlloc: m.cluster.DisableAutoAllocate,
ForbidMpDecommission: m.cluster.ForbidMpDecommission,
MetaNodeThreshold: m.cluster.cfg.MetaNodeThreshold,
Applied: m.fsm.applied,
MaxDataPartitionID: m.cluster.idAlloc.dataPartitionID,
MaxMetaNodeID: m.cluster.idAlloc.commonID,
MaxMetaPartitionID: m.cluster.idAlloc.metaPartitionID,
MasterNodes: make([]proto.NodeView, 0),
MetaNodes: make([]proto.NodeView, 0),
DataNodes: make([]proto.NodeView, 0),
VolStatInfo: make([]*proto.VolStatInfo, 0),
BadPartitionIDs: make([]proto.BadPartitionView, 0),
BadMetaPartitionIDs: make([]proto.BadPartitionView, 0),
}
vols := m.cluster.allVolNames()
cv.MasterNodes = m.cluster.allMasterNodes()
cv.MetaNodes = m.cluster.allMetaNodes()
cv.DataNodes = m.cluster.allDataNodes()
cv.DataNodeStatInfo = m.cluster.dataNodeStatInfo
cv.MetaNodeStatInfo = m.cluster.metaNodeStatInfo
for _, name := range vols {
stat, ok := m.cluster.volStatInfo.Load(name)
if !ok {
cv.VolStatInfo = append(cv.VolStatInfo, newVolStatInfo(name, 0, 0, 0, 0, 0))
continue
}
cv.VolStatInfo = append(cv.VolStatInfo, stat.(*volStatInfo))
}
cv.BadPartitionIDs = m.cluster.getBadDataPartitionsView()
cv.BadMetaPartitionIDs = m.cluster.getBadMetaPartitionsView()
sendOkReply(w, r, newSuccessHTTPReply(cv))
}
func (m *Server) getApiList(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetMasterApiList))
defer func() {
doStatAndMetric(proto.AdminGetMasterApiList, metric, nil, nil)
}()
sendOkReply(w, r, newSuccessHTTPReply(proto.GApiInfo))
}
func (m *Server) setApiQpsLimit(w http.ResponseWriter, r *http.Request) {
var (
name string
limit uint32
timeout uint32
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminSetApiQpsLimit))
defer func() {
doStatAndMetric(proto.AdminSetApiQpsLimit, metric, err, nil)
}()
if name, limit, timeout, err = parseRequestToSetApiQpsLimit(r); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if err = m.cluster.apiLimiter.SetLimiter(name, limit, timeout); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
// persist to rocksdb
var qPath string
if err, _, qPath = m.cluster.apiLimiter.IsApiNameValid(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if err = m.cluster.syncPutApiLimiterInfo(m.cluster.apiLimiter.IsFollowerLimiter(qPath)); err != nil {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("set api qps limit failed: %v", err)))
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("set api qps limit success: name: %v, limit: %v, timeout: %v",
name, limit, timeout)))
return
}
func (m *Server) getApiQpsLimit(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetMasterApiList))
defer func() {
doStatAndMetric(proto.AdminGetMasterApiList, metric, nil, nil)
}()
m.cluster.apiLimiter.m.RLock()
v, err := json.Marshal(m.cluster.apiLimiter.limiterInfos)
m.cluster.apiLimiter.m.RUnlock()
if err != nil {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("get api qps limit failed: %v", err)))
return
}
limiterInfos := make(map[string]*ApiLimitInfo)
if err = json.Unmarshal(v, &limiterInfos); err != nil {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("get api qps limit failed: %v", err)))
return
}
sendOkReply(w, r, newSuccessHTTPReply(limiterInfos))
}
func (m *Server) rmApiQpsLimit(w http.ResponseWriter, r *http.Request) {
var (
name string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminRemoveApiQpsLimit))
defer func() {
doStatAndMetric(proto.AdminRemoveApiQpsLimit, metric, err, nil)
}()
if name, err = parseAndExtractName(r); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if err = m.cluster.apiLimiter.RmLimiter(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
// persist to rocksdb
var qPath string
if err, _, qPath = m.cluster.apiLimiter.IsApiNameValid(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if err = m.cluster.syncPutApiLimiterInfo(m.cluster.apiLimiter.IsFollowerLimiter(qPath)); err != nil {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("set api qps limit failed: %v", err)))
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("rm api qps limit success: name: %v",
name)))
}
func (m *Server) getIPAddr(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetIP))
defer func() {
doStatAndMetric(proto.AdminGetIP, metric, nil, nil)
}()
m.cluster.loadClusterValue()
batchCount := atomic.LoadUint64(&m.cluster.cfg.MetaNodeDeleteBatchCount)
limitRate := atomic.LoadUint64(&m.cluster.cfg.DataNodeDeleteLimitRate)
deleteSleepMs := atomic.LoadUint64(&m.cluster.cfg.MetaNodeDeleteWorkerSleepMs)
autoRepairRate := atomic.LoadUint64(&m.cluster.cfg.DataNodeAutoRepairLimitRate)
dirChildrenNumLimit := atomic.LoadUint32(&m.cluster.cfg.DirChildrenNumLimit)
dpMaxRepairErrCnt := atomic.LoadUint64(&m.cluster.cfg.DpMaxRepairErrCnt)
cInfo := &proto.ClusterInfo{
Cluster: m.cluster.Name,
MetaNodeDeleteBatchCount: batchCount,
MetaNodeDeleteWorkerSleepMs: deleteSleepMs,
DataNodeDeleteLimitRate: limitRate,
DataNodeAutoRepairLimitRate: autoRepairRate,
DpMaxRepairErrCnt: dpMaxRepairErrCnt,
DirChildrenNumLimit: dirChildrenNumLimit,
// Ip: strings.Split(r.RemoteAddr, ":")[0],
Ip: iputil.RealIP(r),
EbsAddr: m.bStoreAddr,
ServicePath: m.servicePath,
ClusterUuid: m.cluster.clusterUuid,
ClusterUuidEnable: m.cluster.clusterUuidEnable,
}
sendOkReply(w, r, newSuccessHTTPReply(cInfo))
}
func (m *Server) createMetaPartition(w http.ResponseWriter, r *http.Request) {
var (
vol *Vol
volName string
count int
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminCreateMetaPartition))
defer func() {
doStatAndMetric(proto.AdminCreateMetaPartition, metric, err, map[string]string{exporter.Vol: volName})
}()
if volName, count, err = validateRequestToCreateMetaPartition(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(volName); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if vol.status() == proto.VolStatusMarkDelete {
log.LogErrorf("action[createMetaPartition] vol[%s] is marked delete", vol.Name)
err = fmt.Errorf("volume [%v] is marked delete", vol.Name)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if vol.Forbidden {
log.LogErrorf("action[createMetaPartition] vol[%s] is forbidden", vol.Name)
err = fmt.Errorf("volume [%v] is forbidden", vol.Name)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if err = vol.addMetaPartitions(m.cluster, count); err != nil {
log.LogErrorf("create meta partition fail: volume(%v) err(%v)", volName, err)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply("create meta partition successfully"))
}
func parsePreloadDpReq(r *http.Request, preload *DataPartitionPreLoad) (err error) {
if err = r.ParseForm(); err != nil {
return
}
preload.preloadZoneName = r.FormValue(zoneNameKey)
if preload.PreloadCacheTTL, err = extractPositiveUint64(r, cacheTTLKey); err != nil {
return
}
if preload.preloadCacheCapacity, err = extractPositiveUint(r, volCapacityKey); err != nil {
return
}
if preload.preloadReplicaNum, err = extractUintWithDefault(r, replicaNumKey, 1); err != nil {
return
}
if preload.preloadReplicaNum < 1 || preload.preloadReplicaNum > 16 {
return fmt.Errorf("preload replicaNum must be between [%d] to [%d], now[%d]", 1, 16, preload.preloadReplicaNum)
}
return
}
func (m *Server) createPreLoadDataPartition(w http.ResponseWriter, r *http.Request) {
var (
volName string
vol *Vol
err error
dps []*DataPartition
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminCreatePreLoadDataPartition))
defer func() {
doStatAndMetric(proto.AdminCreatePreLoadDataPartition, metric, err, map[string]string{exporter.Vol: volName})
}()
if volName, err = parseAndExtractName(r); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
log.LogInfof("action[createPreLoadDataPartition]")
if vol, err = m.cluster.getVol(volName); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if !proto.IsCold(vol.VolType) {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("only low frequency volume can create preloadDp")))
return
}
preload := new(DataPartitionPreLoad)
err = parsePreloadDpReq(r, preload)
if err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
total := vol.CalculatePreloadCapacity() + uint64(preload.preloadCacheCapacity)
if total > vol.CacheCapacity {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("preload total capacity[%d] can't be bigger than cache capacity [%d]",
total, vol.CacheCapacity)))
return
}
log.LogInfof("[createPreLoadDataPartition] start create preload dataPartition, vol(%s), req(%s)", volName, preload.toString())
err, dps = m.cluster.batchCreatePreLoadDataPartition(vol, preload)
if err != nil {
log.LogErrorf("create data partition fail: volume(%v), req(%v) err(%v)", volName, preload.toString(), err)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if len(dps) == 0 {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("create zero datapartition")))
return
}
cv := proto.NewDataPartitionsView()
dpResps := make([]*proto.DataPartitionResponse, 0)
for _, dp := range dps {
dpResp := dp.convertToDataPartitionResponse()
dpResps = append(dpResps, dpResp)
}
log.LogDebugf("action[createPreLoadDataPartition] dps cnt[%v] content[%v]", len(dps), dpResps)
cv.DataPartitions = dpResps
sendOkReply(w, r, newSuccessHTTPReply(cv))
}
func (m *Server) getQosStatus(w http.ResponseWriter, r *http.Request) {
var (
volName string
err error
vol *Vol
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QosGetStatus))
defer func() {
doStatAndMetric(proto.QosGetStatus, metric, err, map[string]string{exporter.Vol: volName})
}()
if volName, err = extractName(r); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if vol, err = m.cluster.getVol(volName); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
sendOkReply(w, r, newSuccessHTTPReply(vol.getQosStatus(m.cluster)))
}
func (m *Server) getClientQosInfo(w http.ResponseWriter, r *http.Request) {
var (
volName string
err error
vol *Vol
host string
id uint64
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QosGetClientsLimitInfo))
defer func() {
doStatAndMetric(proto.QosGetClientsLimitInfo, metric, err, map[string]string{exporter.Vol: volName})
}()
if volName, err = extractName(r); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if vol, err = m.cluster.getVol(volName); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if host = r.FormValue(addrKey); host != "" {
log.LogInfof("action[getClientQosInfo] host %v", host)
if !checkIp(host) {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: fmt.Errorf("addr not legal").Error()})
return
}
}
if value := r.FormValue(idKey); value != "" {
if id, err = strconv.ParseUint(value, 10, 64); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
var rsp interface{}
if rsp, err = vol.getClientLimitInfo(id, host); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
} else {
sendOkReply(w, r, newSuccessHTTPReply(rsp))
}
}
func (m *Server) getQosUpdateMasterLimit(w http.ResponseWriter, r *http.Request) {
var (
err error
value string
limit uint64
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QosUpdateMasterLimit))
defer func() {
doStatAndMetric(proto.QosUpdateMasterLimit, metric, err, nil)
}()
if value = r.FormValue(QosMasterLimit); value != "" {
if limit, err = strconv.ParseUint(value, 10, 64); err != nil {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("wrong param of limit")))
return
}
if limit < QosMasterAcceptCnt {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("limit too less than %v", QosMasterAcceptCnt)))
return
}
m.cluster.cfg.QosMasterAcceptLimit = limit
m.cluster.QosAcceptLimit.SetLimit(rate.Limit(limit))
if err = m.cluster.syncPutCluster(); err != nil {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("set master not worked %v", err)))
return
}
sendOkReply(w, r, newSuccessHTTPReply("success"))
return
}
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("no param of limit")))
}
func (m *Server) QosUpdateClientParam(w http.ResponseWriter, r *http.Request) {
var (
volName string
value string
parsed uint64
period, triggerCnt uint32
err error
vol *Vol
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QosUpdateClientParam))
defer func() {
doStatAndMetric(proto.QosUpdateClientParam, metric, err, map[string]string{exporter.Vol: volName})
}()
if volName, err = extractName(r); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if vol, err = m.cluster.getVol(volName); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if value = r.FormValue(ClientReqPeriod); value != "" {
if parsed, err = strconv.ParseUint(value, 10, 32); err != nil || parsed == 0 {
log.LogErrorf("hytemp error %v", err)
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("wrong param of peroid")))
return
}
period = uint32(parsed)
}
if value = r.FormValue(ClientTriggerCnt); value != "" {
if parsed, err = strconv.ParseUint(value, 10, 32); err != nil || parsed == 0 {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("wrong param of triggerCnt")))
return
}
triggerCnt = uint32(parsed)
}
if err = vol.updateClientParam(m.cluster, period, triggerCnt); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply("success"))
}
func parseRequestQos(r *http.Request, isMagnify bool, isEnableIops bool) (qosParam *qosArgs, err error) {
qosParam = &qosArgs{}
var value int
var flowFmt int
if isMagnify {
flowFmt = 1
} else {
flowFmt = util.MB
}
if qosEnableStr := r.FormValue(QosEnableKey); qosEnableStr != "" {
qosParam.qosEnable, _ = strconv.ParseBool(qosEnableStr)
}
if isEnableIops {
if iopsRLimitStr := r.FormValue(IopsRKey); iopsRLimitStr != "" {
log.LogInfof("actin[parseRequestQos] iopsRLimitStr %v", iopsRLimitStr)
if value, err = strconv.Atoi(iopsRLimitStr); err == nil {
qosParam.iopsRVal = uint64(value)
if !isMagnify && qosParam.iopsRVal < MinIoLimit {
err = fmt.Errorf("iops read %v need larger than 100", value)
return
}
if isMagnify && (qosParam.iopsRVal < MinMagnify || qosParam.iopsRVal > MaxMagnify) {
err = fmt.Errorf("iops read magnify %v must between %v and %v", value, MinMagnify, MaxMagnify)
log.LogErrorf("acttion[parseRequestQos] %v", err.Error())
return
}
}
}
if iopsWLimitStr := r.FormValue(IopsWKey); iopsWLimitStr != "" {
log.LogInfof("actin[parseRequestQos] iopsWLimitStr %v", iopsWLimitStr)
if value, err = strconv.Atoi(iopsWLimitStr); err == nil {
qosParam.iopsWVal = uint64(value)
if !isMagnify && qosParam.iopsWVal < MinIoLimit {
err = fmt.Errorf("iops %v write write io larger than 100", value)
return
}
if isMagnify && (qosParam.iopsWVal < MinMagnify || qosParam.iopsWVal > MaxMagnify) {
err = fmt.Errorf("iops write magnify %v must between %v and %v", value, MinMagnify, MaxMagnify)
log.LogErrorf("acttion[parseRequestQos] %v", err.Error())
return
}
}
}
}
if flowRLimitStr := r.FormValue(FlowRKey); flowRLimitStr != "" {
log.LogInfof("actin[parseRequestQos] flowRLimitStr %v", flowRLimitStr)
if value, err = strconv.Atoi(flowRLimitStr); err == nil {
qosParam.flowRVal = uint64(value * flowFmt)
if !isMagnify && (qosParam.flowRVal < MinFlowLimit || qosParam.flowRVal > MaxFlowLimit) {
err = fmt.Errorf("flow read %v should be between 100M and 10TB ", value)
return
}
if isMagnify && (qosParam.flowRVal < MinMagnify || qosParam.flowRVal > MaxMagnify) {
err = fmt.Errorf("flow read magnify %v must between %v and %v", value, MinMagnify, MaxMagnify)
log.LogErrorf("acttion[parseRequestQos] %v", err.Error())
return
}
}
}
if flowWLimitStr := r.FormValue(FlowWKey); flowWLimitStr != "" {
log.LogInfof("actin[parseRequestQos] flowWLimitStr %v", flowWLimitStr)
if value, err = strconv.Atoi(flowWLimitStr); err == nil {
qosParam.flowWVal = uint64(value * flowFmt)
if !isMagnify && (qosParam.flowWVal < MinFlowLimit || qosParam.flowWVal > MaxFlowLimit) {
err = fmt.Errorf("flow write %v should be between 100M and 10TB", value)
log.LogErrorf("acttion[parseRequestQos] %v", err.Error())
return
}
if isMagnify && (qosParam.flowWVal < MinMagnify || qosParam.flowWVal > MaxMagnify) {
err = fmt.Errorf("flow write magnify %v must between %v and %v", value, MinMagnify, MaxMagnify)
log.LogErrorf("acttion[parseRequestQos] %v", err.Error())
return
}
}
}
log.LogInfof("action[parseRequestQos] result %v", qosParam)
return
}
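// Unit note for parseRequestQos above: with isMagnify == false, flow values
// are multiplied by util.MB, so a flow-write form value of 120 is stored as
// 120*util.MB in qosParam.flowWVal; with isMagnify == true the raw value is
// kept and checked against the magnify bounds instead.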
// flowRVal, flowWVal take MB as unit
func (m *Server) QosUpdateZoneLimit(w http.ResponseWriter, r *http.Request) {
var (
value interface{}
ok bool
err error
qosParam *qosArgs
enable bool
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QosUpdateZoneLimit))
defer func() {
doStatAndMetric(proto.QosUpdateZoneLimit, metric, err, nil)
}()
var zoneName string
if zoneName = r.FormValue(zoneNameKey); zoneName == "" {
zoneName = DefaultZoneName
}
if qosParam, err = parseRequestQos(r, false, true); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if sVal := r.FormValue(DiskEnableKey); sVal != "" {
if enable, err = strconv.ParseBool(sVal); err == nil {
log.LogInfof("action[DiskQosUpdate] enable be set [%v]", enable)
m.cluster.diskQosEnable = enable
err = m.cluster.syncPutCluster()
}
}
if value, ok = m.cluster.t.zoneMap.Load(zoneName); !ok {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("zonename [%v] not found", zoneName)))
return
}
zone := value.(*Zone)
zone.updateDataNodeQosLimit(m.cluster, qosParam)
sendOkReply(w, r, newSuccessHTTPReply("success"))
}
// flowRVal, flowWVal take MB as unit
func (m *Server) QosGetZoneLimit(w http.ResponseWriter, r *http.Request) {
var (
value interface{}
ok bool
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QosGetZoneLimitInfo))
defer func() {
doStatAndMetric(proto.QosGetZoneLimitInfo, metric, nil, nil)
}()
var zoneName string
if zoneName = r.FormValue(zoneNameKey); zoneName == "" {
zoneName = DefaultZoneName
}
if value, ok = m.cluster.t.zoneMap.Load(zoneName); !ok {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("zonename [%v] not found", zoneName)))
return
}
zone := value.(*Zone)
type qosZoneStatus struct {
Zone string
DiskLimitEnable bool
IopsRVal uint64
IopsWVal uint64
FlowRVal uint64
FlowWVal uint64
}
zoneSt := &qosZoneStatus{
Zone: zoneName,
DiskLimitEnable: m.cluster.diskQosEnable,
IopsRVal: zone.QosIopsRLimit,
IopsWVal: zone.QosIopsWLimit,
FlowRVal: zone.QosFlowRLimit,
FlowWVal: zone.QosFlowWLimit,
}
sendOkReply(w, r, newSuccessHTTPReply(zoneSt))
}
func (m *Server) QosUpdate(w http.ResponseWriter, r *http.Request) {
var (
volName string
err error
vol *Vol
enable bool
value string
limitArgs *qosArgs
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QosUpdate))
defer func() {
doStatAndMetric(proto.QosUpdate, metric, err, map[string]string{exporter.Vol: volName})
}()
if volName, err = extractName(r); err == nil {
if vol, err = m.cluster.getVol(volName); err != nil {
goto RET
}
if value = r.FormValue(QosEnableKey); value != "" {
if enable, err = strconv.ParseBool(value); err != nil {
goto RET
}
if err = vol.volQosEnable(m.cluster, enable); err != nil {
goto RET
}
log.LogInfof("action[DiskQosUpdate] update qos eanble [%v]", enable)
}
if limitArgs, err = parseRequestQos(r, false, false); err == nil && limitArgs.isArgsWork() {
if err = vol.volQosUpdateLimit(m.cluster, limitArgs); err != nil {
goto RET
}
log.LogInfof("action[DiskQosUpdate] update qos limit [%v] [%v] [%v] [%v] [%v]", enable,
limitArgs.iopsRVal, limitArgs.iopsWVal, limitArgs.flowRVal, limitArgs.flowWVal)
}
}
RET:
if err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
_ = sendOkReply(w, r, newSuccessHTTPReply("success"))
return
}
func (m *Server) createDataPartition(w http.ResponseWriter, r *http.Request) {
var (
rstMsg string
volName string
vol *Vol
reqCreateCount int
lastTotalDataPartitions int
clusterTotalDataPartitions int
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminCreateDataPartition))
defer func() {
doStatAndMetric(proto.AdminCreateDataPartition, metric, err, map[string]string{exporter.Vol: volName})
}()
if reqCreateCount, volName, err = parseRequestToCreateDataPartition(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(volName); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if proto.IsCold(vol.VolType) {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("low frequency vol can't create dp")))
return
}
lastTotalDataPartitions = len(vol.dataPartitions.partitions)
clusterTotalDataPartitions = m.cluster.getDataPartitionCount()
err = m.cluster.batchCreateDataPartition(vol, reqCreateCount, false)
rstMsg = fmt.Sprintf(" createDataPartition succeeeds. "+
"clusterLastTotalDataPartitions[%v],vol[%v] has %v data partitions previously and %v data partitions now",
clusterTotalDataPartitions, volName, lastTotalDataPartitions, len(vol.dataPartitions.partitions))
if err != nil {
log.LogErrorf("create data partition fail: volume(%v) err(%v)", volName, err)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
_ = sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
func (m *Server) changeDataPartitionLeader(w http.ResponseWriter, r *http.Request) {
var (
dp *DataPartition
partitionID uint64
err error
host string
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminDataPartitionChangeLeader))
defer func() {
doStatAndMetric(proto.AdminDataPartitionChangeLeader, metric, err, nil)
}()
if partitionID, _, err = parseRequestToGetDataPartition(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if dp, err = m.cluster.getDataPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataPartitionNotExists))
return
}
if host = r.FormValue(addrKey); host == "" {
err = keyNotFound(addrKey)
return
}
if !checkIp(host) {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: fmt.Errorf("addr not legal").Error()})
return
}
if err = dp.tryToChangeLeaderByHost(host); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
rstMsg := fmt.Sprintf(" changeDataPartitionLeader command success send to dest host but need check. ")
_ = sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
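// getDataPartition returns the detail of a data partition, looked up through the volume when a volume name is supplied, otherwise directly by partition ID.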
func (m *Server) getDataPartition(w http.ResponseWriter, r *http.Request) {
var (
dp *DataPartition
partitionID uint64
volName string
vol *Vol
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetDataPartition))
defer func() {
doStatAndMetric(proto.AdminGetDataPartition, metric, err, map[string]string{exporter.Vol: volName})
}()
if partitionID, volName, err = parseRequestToGetDataPartition(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if volName != "" {
if vol, err = m.cluster.getVol(volName); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataPartitionNotExists))
return
}
if dp, err = vol.getDataPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataPartitionNotExists))
return
}
} else {
if dp, err = m.cluster.getDataPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataPartitionNotExists))
return
}
}
sendOkReply(w, r, newSuccessHTTPReply(dp.buildDpInfo(m.cluster)))
}
// Load the data partition.
func (m *Server) loadDataPartition(w http.ResponseWriter, r *http.Request) {
var (
msg string
dp *DataPartition
partitionID uint64
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminLoadDataPartition))
defer func() {
doStatAndMetric(proto.AdminLoadDataPartition, metric, err, nil)
}()
if partitionID, err = parseRequestToLoadDataPartition(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if dp, err = m.cluster.getDataPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataPartitionNotExists))
return
}
m.cluster.loadDataPartition(dp)
msg = fmt.Sprintf(proto.AdminLoadDataPartition+" partitionID :%v load data partition successfully", partitionID)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
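// addDataReplica adds a replica of the data partition on the given address, then marks the partition read-only and recovering so the repair can be tracked through the bad-partition list.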
func (m *Server) addDataReplica(w http.ResponseWriter, r *http.Request) {
var (
msg string
addr string
dp *DataPartition
partitionID uint64
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminAddDataReplica))
defer func() {
doStatAndMetric(proto.AdminAddDataReplica, metric, err, nil)
}()
if partitionID, addr, err = parseRequestToAddDataReplica(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if dp, err = m.cluster.getDataPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataPartitionNotExists))
return
}
if err = m.cluster.addDataReplica(dp, addr); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
dp.Status = proto.ReadOnly
dp.isRecover = true
m.cluster.putBadDataPartitionIDs(nil, addr, dp.PartitionID)
msg = fmt.Sprintf("data partitionID :%v add replica [%v] successfully", partitionID, addr)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
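// deleteDataReplica removes a replica of the data partition from the given address; force and raftForce are only meant for a two-replica partition that has lost its leader. The volume's dp replica number is re-evaluated afterwards.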
func (m *Server) deleteDataReplica(w http.ResponseWriter, r *http.Request) {
var (
msg string
addr string
dp *DataPartition
partitionID uint64
err error
force bool // currently only used for a two-replica partition with no leader
raftForce bool
vol *Vol
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminDeleteDataReplica))
defer func() {
doStatAndMetric(proto.AdminDeleteDataReplica, metric, err, nil)
}()
if partitionID, addr, err = parseRequestToRemoveDataReplica(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if dp, err = m.cluster.getDataPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataPartitionNotExists))
return
}
// force is only used when a data partition of a two-replica volume has no leader because one replica crashed
raftForce, err = parseRaftForce(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
force, err = pareseBoolWithDefault(r, forceKey, false)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.cluster.removeDataReplica(dp, addr, !force, raftForce); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if vol, err = m.cluster.getVol(dp.VolName); err != nil {
log.LogErrorf("action[updateVol] err[%v]", err)
err = proto.ErrVolNotExists
sendErrReply(w, r, newErrHTTPReply(err))
return
}
_ = vol.tryUpdateDpReplicaNum(m.cluster, dp)
msg = fmt.Sprintf("data partitionID :%v delete replica [%v] successfully", partitionID, addr)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
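// addMetaReplica adds a replica of the meta partition on the given address and marks the partition as recovering.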
func (m *Server) addMetaReplica(w http.ResponseWriter, r *http.Request) {
var (
msg string
addr string
mp *MetaPartition
partitionID uint64
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminAddMetaReplica))
defer func() {
doStatAndMetric(proto.AdminAddMetaReplica, metric, err, nil)
}()
if partitionID, addr, err = parseRequestToAddMetaReplica(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if mp, err = m.cluster.getMetaPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrMetaPartitionNotExists))
return
}
if err = m.cluster.addMetaReplica(mp, addr); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
mp.IsRecover = true
m.cluster.putBadMetaPartitions(addr, mp.PartitionID)
msg = fmt.Sprintf("meta partitionID :%v add replica [%v] successfully", partitionID, addr)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
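// deleteMetaReplica removes a replica of the meta partition from the given address; the optional force flag is forwarded to the cluster-level delete.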
func (m *Server) deleteMetaReplica(w http.ResponseWriter, r *http.Request) {
var (
msg string
addr string
mp *MetaPartition
partitionID uint64
err error
force bool
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminDeleteMetaReplica))
defer func() {
doStatAndMetric(proto.AdminDeleteMetaReplica, metric, err, nil)
}()
if partitionID, addr, err = parseRequestToRemoveMetaReplica(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if mp, err = m.cluster.getMetaPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrMetaPartitionNotExists))
return
}
var value string
if value = r.FormValue(forceKey); value != "" {
force, _ = strconv.ParseBool(value)
}
if err = m.cluster.deleteMetaReplica(mp, addr, true, force); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
msg = fmt.Sprintf("meta partitionID :%v delete replica [%v] successfully", partitionID, addr)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
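// changeMetaPartitionLeader tries to transfer the raft leader of a meta partition to the host given by the addr parameter; the result needs to be checked separately.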
func (m *Server) changeMetaPartitionLeader(w http.ResponseWriter, r *http.Request) {
var (
mp *MetaPartition
partitionID uint64
err error
host string
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminChangeMetaPartitionLeader))
defer func() {
doStatAndMetric(proto.AdminChangeMetaPartitionLeader, metric, err, nil)
}()
if partitionID, _, err = parseRequestToGetDataPartition(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
log.LogErrorf("changeMetaPartitionLeader.err %v", err)
return
}
if mp, err = m.cluster.getMetaPartitionByID(partitionID); err != nil {
log.LogErrorf("changeMetaPartitionLeader.err %v", proto.ErrMetaPartitionNotExists)
sendErrReply(w, r, newErrHTTPReply(proto.ErrMetaPartitionNotExists))
return
}
if host = r.FormValue(addrKey); host == "" {
err = keyNotFound(addrKey)
log.LogErrorf("changeMetaPartitionLeader.err %v", err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if !checkIp(host) {
log.LogErrorf("changeMetaPartitionLeader.err addr not legal")
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: fmt.Errorf("addr not legal").Error()})
return
}
if err = mp.tryToChangeLeaderByHost(host); err != nil {
log.LogErrorf("changeMetaPartitionLeader.err %v", err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
rstMsg := fmt.Sprintf(" changeMetaPartitionLeader command success send to dest host but need check. ")
_ = sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
// balanceMetaPartitionLeader balances the leaders of meta partitions across metaNodes; the scope can be the whole cluster, selected zones, or selected nodeSets.
func (m *Server) balanceMetaPartitionLeader(w http.ResponseWriter, r *http.Request) {
var (
zonesKey string
nodesetIdKey string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminBalanceMetaPartitionLeader))
defer func() {
doStatAndMetric(proto.AdminBalanceMetaPartitionLeader, metric, err, nil)
}()
if zonesKey, nodesetIdKey, err = parseRequestToBalanceMetaPartition(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
log.LogErrorf("balanceMetaPartitionLeader.err %v", err)
return
}
log.LogInfof("zone:%v,nodesetId:%v", zonesKey, nodesetIdKey)
zonesM := make(map[string]struct{})
if zonesKey != "" {
zones := strings.Split(zonesKey, commaSplit)
for _, zone := range zones {
zonesM[zone] = struct{}{}
}
}
nodesetIdM := make(map[uint64]struct{})
if nodesetIdKey != "" {
nodesetIds := strings.Split(nodesetIdKey, commaSplit)
for _, nodeSetId := range nodesetIds {
id, err := strconv.ParseUint(nodeSetId, 10, 64)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
log.LogErrorf("balanceMetaPartitionLeader.err %v", err)
return
}
nodesetIdM[id] = struct{}{}
}
}
log.LogInfof("balanceMetaPartitionLeader zones[%v] length[%d], nodesetIds[%v] length[%d]", zonesKey, len(zonesM), nodesetIdKey, len(nodesetIdM))
err = m.cluster.balanceMetaPartitionLeader(zonesM, nodesetIdM)
if err != nil {
log.LogErrorf("balanceMetaPartitionLeader.err %v", err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
rstMsg := fmt.Sprintf("balanceMetaPartitionLeader command sucess")
_ = sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
// Decommission a data partition. This usually happens when disk error has been reported.
// This function needs to be called manually by the admin.
func (m *Server) decommissionDataPartition(w http.ResponseWriter, r *http.Request) {
var (
rstMsg string
dp *DataPartition
addr string
partitionID uint64
raftForce bool
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminDecommissionDataPartition))
defer func() {
doStatAndMetric(proto.AdminDecommissionDataPartition, metric, err, nil)
}()
if partitionID, addr, err = parseRequestToDecommissionDataPartition(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if dp, err = m.cluster.getDataPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataPartitionNotExists))
return
}
raftForce, err = parseRaftForce(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if !dp.IsDecommissionInitial() {
rstMsg = fmt.Sprintf(" dataPartitionID :%v status %v not support decommission",
partitionID, dp.GetDecommissionStatus())
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: rstMsg})
return
}
if dp.isSpecialReplicaCnt() {
rstMsg = fmt.Sprintf(proto.AdminDecommissionDataPartition+" dataPartitionID :%v has special replica cnt %v; decommission on node:%v runs asynchronously, check later",
partitionID, dp.ReplicaNum, addr)
go m.cluster.decommissionDataPartition(addr, dp, raftForce, handleDataPartitionOfflineErr)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
return
}
if err = m.cluster.decommissionDataPartition(addr, dp, raftForce, handleDataPartitionOfflineErr); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if !dp.isSpecialReplicaCnt() {
rstMsg = fmt.Sprintf(proto.AdminDecommissionDataPartition+" dataPartitionID :%v on node:%v successfully", partitionID, addr)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
}
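// diagnoseDataPartition reports inactive data nodes as well as data partitions that are corrupt, lack replicas, have bad or excess replicas, or whose replicas differ in file count or used size.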
func (m *Server) diagnoseDataPartition(w http.ResponseWriter, r *http.Request) {
var (
err error
rstMsg *proto.DataPartitionDiagnosis
inactiveNodes []string
corruptDps []*DataPartition
lackReplicaDps []*DataPartition
badReplicaDps []*DataPartition
repFileCountDifferDps []*DataPartition
repUsedSizeDifferDps []*DataPartition
excessReplicaDPs []*DataPartition
corruptDpIDs []uint64
lackReplicaDpIDs []uint64
badReplicaDpIDs []uint64
repFileCountDifferDpIDs []uint64
repUsedSizeDifferDpIDs []uint64
excessReplicaDpIDs []uint64
badDataPartitionInfos []proto.BadPartitionRepairView
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminDiagnoseDataPartition))
defer func() {
doStatAndMetric(proto.AdminDiagnoseDataPartition, metric, err, nil)
}()
ignoreDiscardDp, err := pareseBoolWithDefault(r, ignoreDiscardKey, false)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
corruptDpIDs = make([]uint64, 0)
lackReplicaDpIDs = make([]uint64, 0)
badReplicaDpIDs = make([]uint64, 0)
repFileCountDifferDpIDs = make([]uint64, 0)
repUsedSizeDifferDpIDs = make([]uint64, 0)
excessReplicaDpIDs = make([]uint64, 0)
if inactiveNodes, err = m.cluster.checkInactiveDataNodes(); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if lackReplicaDps, badReplicaDps, repFileCountDifferDps, repUsedSizeDifferDps, excessReplicaDPs, corruptDps, err = m.cluster.checkReplicaOfDataPartitions(ignoreDiscardDp); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
for _, dp := range corruptDps {
corruptDpIDs = append(corruptDpIDs, dp.PartitionID)
}
for _, dp := range lackReplicaDps {
lackReplicaDpIDs = append(lackReplicaDpIDs, dp.PartitionID)
}
for _, dp := range badReplicaDps {
badReplicaDpIDs = append(badReplicaDpIDs, dp.PartitionID)
}
for _, dp := range repFileCountDifferDps {
repFileCountDifferDpIDs = append(repFileCountDifferDpIDs, dp.PartitionID)
}
for _, dp := range repUsedSizeDifferDps {
repUsedSizeDifferDpIDs = append(repUsedSizeDifferDpIDs, dp.PartitionID)
}
for _, dp := range excessReplicaDPs {
excessReplicaDpIDs = append(excessReplicaDpIDs, dp.PartitionID)
}
// badDataPartitions = m.cluster.getBadDataPartitionsView()
badDataPartitionInfos = m.cluster.getBadDataPartitionsRepairView()
rstMsg = &proto.DataPartitionDiagnosis{
InactiveDataNodes: inactiveNodes,
CorruptDataPartitionIDs: corruptDpIDs,
LackReplicaDataPartitionIDs: lackReplicaDpIDs,
BadDataPartitionInfos: badDataPartitionInfos,
BadReplicaDataPartitionIDs: badReplicaDpIDs,
RepFileCountDifferDpIDs: repFileCountDifferDpIDs,
RepUsedSizeDifferDpIDs: repUsedSizeDifferDpIDs,
ExcessReplicaDpIDs: excessReplicaDpIDs,
}
log.LogInfof("diagnose dataPartition[%v] inactiveNodes:[%v], corruptDpIDs:[%v], "+
"lackReplicaDpIDs:[%v], BadReplicaDataPartitionIDs[%v], "+
"repFileCountDifferDpIDs:[%v], RepUsedSizeDifferDpIDs[%v], excessReplicaDpIDs[%v]",
m.cluster.Name, inactiveNodes, corruptDpIDs,
lackReplicaDpIDs, badReplicaDpIDs,
repFileCountDifferDpIDs, repUsedSizeDifferDpIDs, excessReplicaDpIDs)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
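// resetDataPartitionDecommissionStatus clears the decommission state of a data partition.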
func (m *Server) resetDataPartitionDecommissionStatus(w http.ResponseWriter, r *http.Request) {
var (
msg string
dp *DataPartition
partitionID uint64
err error
)
if partitionID, err = parseRequestToLoadDataPartition(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if dp, err = m.cluster.getDataPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataPartitionNotExists))
return
}
dp.ResetDecommissionStatus()
msg = fmt.Sprintf("partitionID :%v reset decommission status successfully", partitionID)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
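// queryDataPartitionDecommissionStatus returns a human-readable summary of a data partition's decommission progress, including source/destination nodes and current replicas.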
func (m *Server) queryDataPartitionDecommissionStatus(w http.ResponseWriter, r *http.Request) {
var (
msg string
dp *DataPartition
partitionID uint64
err error
replicas []string
)
if partitionID, err = parseRequestToLoadDataPartition(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if dp, err = m.cluster.getDataPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataPartitionNotExists))
return
}
for _, replica := range dp.Replicas {
replicas = append(replicas, replica.Addr)
}
msg = fmt.Sprintf("partitionID:%v status[%v] specialStep[%v] retry [%v] raftForce[%v] recover [%v] "+
"decommission src dataNode[%v] disk[%v] dst dataNode[%v] term[%v] replicas[%v] DecommissionWaitTimes[%v]",
partitionID, dp.GetDecommissionStatus(), dp.GetSpecialReplicaDecommissionStep(), dp.DecommissionRetry, dp.DecommissionRaftForce, dp.isRecover,
dp.DecommissionSrcAddr, dp.DecommissionSrcDiskPath, dp.DecommissionDstAddr, dp.DecommissionTerm, replicas, dp.DecommissionWaitTimes)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
// Mark the volume as deleted, which will then be deleted later.
func (m *Server) markDeleteVol(w http.ResponseWriter, r *http.Request) {
var (
name string
authKey string
// force bool
err error
msg string
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminDeleteVol))
defer func() {
doStatAndMetric(proto.AdminDeleteVol, metric, err, map[string]string{exporter.Vol: name})
}()
if name, authKey, _, err = parseRequestToDeleteVol(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.cluster.markDeleteVol(name, authKey, false); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if err = m.user.deleteVolPolicy(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
msg = fmt.Sprintf("delete vol[%v] successfully,from[%v]", name, r.RemoteAddr)
log.LogWarn(msg)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
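// checkReplicaNum validates a replicaNum change in an update-volume request: the value may only be reduced by one at a time, only hot volumes may change it, and followerRead must be consistent with the new value.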
func (m *Server) checkReplicaNum(r *http.Request, vol *Vol, req *updateVolReq) (err error) {
var (
replicaNumInt64 int64
replicaNum int
)
if replicaNumStr := r.FormValue(replicaNumKey); replicaNumStr != "" {
if replicaNumInt64, err = strconv.ParseInt(replicaNumStr, 10, 8); err != nil {
err = unmatchedKey(replicaNumKey)
return
}
replicaNum = int(replicaNumInt64)
} else {
replicaNum = int(vol.dpReplicaNum)
}
req.replicaNum = replicaNum
if replicaNum != 0 && replicaNum != int(vol.dpReplicaNum) {
if replicaNum != int(vol.dpReplicaNum)-1 {
err = fmt.Errorf("replicaNum only need be reduced one replica one time")
return
}
if !proto.IsHot(vol.VolType) {
err = fmt.Errorf("vol type(%v) replicaNum cann't be changed", vol.VolType)
return
}
if ok, dpArry := vol.isOkUpdateRepCnt(); !ok {
err = fmt.Errorf("vol have dataPartitions[%v] with inconsistent dataPartitions cnt to volume's ", dpArry)
return
}
}
if proto.IsHot(vol.VolType) {
if req.replicaNum == 0 ||
((req.replicaNum == 1 || req.replicaNum == 2) && !req.followerRead) {
err = fmt.Errorf("replica or follower read status error")
return
}
} else {
if req.replicaNum == 0 && req.coldArgs.cacheCap > 0 {
req.replicaNum = 1
}
if (req.replicaNum == 0 && req.replicaNum != int(vol.dpReplicaNum)) || !req.followerRead {
err = fmt.Errorf("replica or follower read status error")
return
}
}
return
}
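// updateVol validates an update-volume request, merges it into the volume's current settings and persists the result through the cluster.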
func (m *Server) updateVol(w http.ResponseWriter, r *http.Request) {
var (
req = &updateVolReq{}
vol *Vol
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminUpdateVol))
defer func() {
doStatAndMetric(proto.AdminUpdateVol, metric, err, map[string]string{exporter.Vol: req.name})
}()
if req.name, err = parseVolName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(req.name); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVolNotExists, Msg: err.Error()})
return
}
if err = parseVolUpdateReq(r, vol, req); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if req.followerRead, req.authenticate, err = parseBoolFieldToUpdateVol(r, vol); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.checkReplicaNum(r, vol, req); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
newArgs := getVolVarargs(vol)
newArgs.zoneName = req.zoneName
newArgs.description = req.description
newArgs.capacity = req.capacity
newArgs.deleteLockTime = req.deleteLockTime
newArgs.followerRead = req.followerRead
newArgs.authenticate = req.authenticate
newArgs.dpSelectorName = req.dpSelectorName
newArgs.dpSelectorParm = req.dpSelectorParm
newArgs.enablePosixAcl = req.enablePosixAcl
newArgs.enableTransaction = req.enableTransaction
newArgs.txTimeout = req.txTimeout
newArgs.txConflictRetryNum = req.txConflictRetryNum
newArgs.txConflictRetryInterval = req.txConflictRetryInterval
newArgs.txOpLimit = req.txOpLimit
newArgs.enableQuota = req.enableQuota
if req.coldArgs != nil {
newArgs.coldArgs = req.coldArgs
}
newArgs.dpReplicaNum = uint8(req.replicaNum)
newArgs.dpReadOnlyWhenVolFull = req.dpReadOnlyWhenVolFull
log.LogWarnf("[updateVolOut] name [%s], z1 [%s], z2[%s] replicaNum[%v]", req.name, req.zoneName, vol.Name, req.replicaNum)
if err = m.cluster.updateVol(req.name, req.authKey, newArgs); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
var response string
if hasTxParams(r) {
response = fmt.Sprintf("update vol[%v] successfully, txTimeout[%v] enableTransaction[%v]",
req.name, newArgs.txTimeout, proto.GetMaskString(newArgs.enableTransaction))
} else {
response = fmt.Sprintf("update vol[%v] successfully", req.name)
}
sendOkReply(w, r, newSuccessHTTPReply(response))
}
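// volExpand increases the capacity of a volume; the new capacity must be larger than the current one.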
func (m *Server) volExpand(w http.ResponseWriter, r *http.Request) {
var (
name string
authKey string
err error
msg string
capacity int
vol *Vol
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminVolExpand))
defer func() {
doStatAndMetric(proto.AdminVolExpand, metric, err, map[string]string{exporter.Vol: name})
}()
if name, authKey, capacity, err = parseRequestToSetVolCapacity(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVolNotExists, Msg: err.Error()})
return
}
if uint64(capacity) <= vol.Capacity {
err = fmt.Errorf("expand capacity[%v] should be larger than the old capacity[%v]", capacity, vol.Capacity)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
newArgs := getVolVarargs(vol)
newArgs.capacity = uint64(capacity)
if err = m.cluster.updateVol(name, authKey, newArgs); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
msg = fmt.Sprintf("update vol[%v] successfully\n", name)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
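// volShrink decreases the capacity of a volume; the new capacity must be smaller than the current one.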
func (m *Server) volShrink(w http.ResponseWriter, r *http.Request) {
var (
name string
authKey string
err error
msg string
capacity int
vol *Vol
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminVolShrink))
defer func() {
doStatAndMetric(proto.AdminVolShrink, metric, err, map[string]string{exporter.Vol: name})
}()
if name, authKey, capacity, err = parseRequestToSetVolCapacity(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVolNotExists, Msg: err.Error()})
return
}
if uint64(capacity) >= vol.Capacity {
err = fmt.Errorf("shrink capacity[%v] should be less than the old capacity[%v]", capacity, vol.Capacity)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
newArgs := getVolVarargs(vol)
newArgs.capacity = uint64(capacity)
if err = m.cluster.updateVol(name, authKey, newArgs); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
msg = fmt.Sprintf("update vol[%v] successfully\n", name)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
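// checkCreateReq validates a create-volume request and fills in defaults; for cold volumes it forces followerRead and completes the EBS block size and cache related arguments.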
func (m *Server) checkCreateReq(req *createVolReq) (err error) {
if !proto.IsHot(req.volType) && !proto.IsCold(req.volType) {
return fmt.Errorf("vol type %d is illegal", req.volType)
}
if req.capacity == 0 {
return fmt.Errorf("vol capacity can't be zero, %d", req.capacity)
}
if req.dpSize != 0 && req.dpSize <= 10 {
return fmt.Errorf("datapartition dpSize must be bigger than 10 G")
}
if req.dpCount > maxInitDataPartitionCnt {
return fmt.Errorf("dpCount[%d] exceeds maximum limit[%d]", req.dpCount, maxInitDataPartitionCnt)
}
if proto.IsHot(req.volType) {
if req.dpReplicaNum == 0 {
req.dpReplicaNum = defaultReplicaNum
}
if req.dpReplicaNum > 3 {
return fmt.Errorf("hot vol's replicaNum should be 1 to 3, received replicaNum is[%v]", req.dpReplicaNum)
}
return nil
} else if proto.IsCold(req.volType) {
if req.dpReplicaNum > 16 {
return fmt.Errorf("cold vol's replicaNum should less then 17, received replicaNum is[%v]", req.dpReplicaNum)
}
}
if req.dpReplicaNum == 0 && req.coldArgs.cacheCap > 0 {
req.dpReplicaNum = 1
}
req.followerRead = true
args := req.coldArgs
if args.objBlockSize == 0 {
args.objBlockSize = defaultEbsBlkSize
}
if err = checkCacheAction(args.cacheAction); err != nil {
return
}
if args.cacheTtl == 0 {
args.cacheTtl = defaultCacheTtl
}
if args.cacheThreshold == 0 {
args.cacheThreshold = defaultCacheThreshold
}
if args.cacheHighWater == 0 {
args.cacheHighWater = defaultCacheHighWater
}
if args.cacheLowWater == 0 {
args.cacheLowWater = defaultCacheLowWater
}
if args.cacheLRUInterval != 0 && args.cacheLRUInterval < 2 {
return fmt.Errorf("cache lruInterval(%d) must bigger than 2 minutes", args.cacheLRUInterval)
}
if args.cacheLRUInterval == 0 {
args.cacheLRUInterval = defaultCacheLruInterval
}
if args.cacheLowWater >= args.cacheHighWater {
return fmt.Errorf("low water(%d) must be less than high water(%d)", args.cacheLowWater, args.cacheHighWater)
}
if args.cacheCap >= uint64(req.capacity) {
return fmt.Errorf("cache capacity(%d) must be less than capacity(%d)", args.cacheCap, req.capacity)
}
if proto.IsCold(req.volType) && req.dpReplicaNum == 0 && args.cacheCap > 0 {
return fmt.Errorf("cache capacity(%d) not zero,replicaNum should not be zero", args.cacheCap)
}
if args.cacheHighWater >= 90 || args.cacheLowWater >= 90 {
return fmt.Errorf("low(%d) or high water(%d) can't be large than 90, low than 0", args.cacheLowWater, args.cacheHighWater)
}
if int(req.dpReplicaNum) > m.cluster.dataNodeCount() {
return fmt.Errorf("dp replicaNum %d can't be large than dataNodeCnt %d", req.dpReplicaNum, m.cluster.dataNodeCount())
}
req.coldArgs = args
return nil
}
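// createVol creates a volume after validating the request and then associates the new volume with its owner user.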
func (m *Server) createVol(w http.ResponseWriter, r *http.Request) {
req := &createVolReq{}
vol := &Vol{}
var err error
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminCreateVol))
defer func() {
doStatAndMetric(proto.AdminCreateVol, metric, err, map[string]string{exporter.Vol: req.name})
}()
if err = parseRequestToCreateVol(r, req); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.checkCreateReq(req); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if proto.IsHot(req.volType) && (req.dpReplicaNum == 1 || req.dpReplicaNum == 2) && !req.followerRead {
err = fmt.Errorf("hot volume replicaNum be 2 and 3,followerRead must set true")
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.createVol(req); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if err = m.associateVolWithUser(req.owner, req.name); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
msg := fmt.Sprintf("create vol[%v] successfully, has allocate [%v] data partitions", req.name, len(vol.dataPartitions.partitions))
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
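// qosUpload handles QoS reports uploaded by clients. When qosEnable is set, the client is registered with the volume's qos manager (an ID is assigned on first contact) and the computed limits are returned.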
func (m *Server) qosUpload(w http.ResponseWriter, r *http.Request) {
var (
err error
name string
vol *Vol
limit *proto.LimitRsp2Client
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QosUpload))
defer func() {
doStatAndMetric(proto.QosUpload, metric, err, map[string]string{exporter.Vol: name})
}()
ctx := context.Background()
m.cluster.QosAcceptLimit.WaitN(ctx, 1)
log.LogInfof("action[qosUpload] limit %v", m.cluster.QosAcceptLimit.Limit())
if name, err = parseAndExtractName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
qosEnableStr := r.FormValue(QosEnableKey)
if qosEnableStr == "" {
sendErrReply(w, r, newErrHTTPReply(proto.ErrParamError))
return
}
// qos upload may be called during client init, so the qosEnable param identifies whether the master needs to do the calculation
var clientInfo *proto.ClientReportLimitInfo
if qosEnable, _ := strconv.ParseBool(qosEnableStr); qosEnable {
if clientInfo, err = parseQosInfo(r); err == nil {
log.LogDebugf("action[qosUpload] cliInfoMgrMap [%v],clientInfo id[%v] clientInfo.Host %v, enable %v", clientInfo.ID, clientInfo.Host, r.RemoteAddr, qosEnable)
if clientInfo.ID == 0 {
if limit, err = vol.qosManager.init(m.cluster, clientInfo.Host); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
clientInfo.ID = limit.ID
}
if limit, err = vol.qosManager.HandleClientQosReq(clientInfo, clientInfo.ID); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
} else {
log.LogInfof("action[qosUpload] qosEnableStr:[%v] err [%v]", qosEnableStr, err)
}
}
sendOkReply(w, r, newSuccessHTTPReply(limit))
}
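// getVolSimpleInfo returns the simplified volume view built by newSimpleView.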
func (m *Server) getVolSimpleInfo(w http.ResponseWriter, r *http.Request) {
var (
err error
name string
vol *Vol
)
metric := exporter.NewTPCnt("req" + strings.Replace(proto.AdminGetVol, "/", "_", -1))
defer func() {
doStatAndMetric(proto.AdminGetVol, metric, err, map[string]string{exporter.Vol: name})
}()
if name, err = parseAndExtractName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
volView := newSimpleView(vol)
sendOkReply(w, r, newSuccessHTTPReply(volView))
}
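// newSimpleView builds a SimpleVolView snapshot of a volume, aggregating inode and dentry counts from its meta partitions and copying the volume's current configuration.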
func newSimpleView(vol *Vol) (view *proto.SimpleVolView) {
var (
volInodeCount uint64
volDentryCount uint64
)
vol.mpsLock.RLock()
for _, mp := range vol.MetaPartitions {
volDentryCount = volDentryCount + mp.DentryCount
volInodeCount = volInodeCount + mp.InodeCount
}
vol.mpsLock.RUnlock()
maxPartitionID := vol.maxPartitionID()
view = &proto.SimpleVolView{
ID: vol.ID,
Name: vol.Name,
Owner: vol.Owner,
ZoneName: vol.zoneName,
DpReplicaNum: vol.dpReplicaNum,
MpReplicaNum: vol.mpReplicaNum,
InodeCount: volInodeCount,
DentryCount: volDentryCount,
MaxMetaPartitionID: maxPartitionID,
Status: vol.Status,
Capacity: vol.Capacity,
FollowerRead: vol.FollowerRead,
EnablePosixAcl: vol.enablePosixAcl,
EnableQuota: vol.enableQuota,
EnableTransaction: proto.GetMaskString(vol.enableTransaction),
TxTimeout: vol.txTimeout,
TxConflictRetryNum: vol.txConflictRetryNum,
TxConflictRetryInterval: vol.txConflictRetryInterval,
TxOpLimit: vol.txOpLimit,
NeedToLowerReplica: vol.NeedToLowerReplica,
Authenticate: vol.authenticate,
CrossZone: vol.crossZone,
DefaultPriority: vol.defaultPriority,
DomainOn: vol.domainOn,
RwDpCnt: vol.dataPartitions.readableAndWritableCnt,
MpCnt: len(vol.MetaPartitions),
DpCnt: len(vol.dataPartitions.partitionMap),
CreateTime: time.Unix(vol.createTime, 0).Format(proto.TimeFormat),
DeleteLockTime: vol.DeleteLockTime,
Description: vol.description,
DpSelectorName: vol.dpSelectorName,
DpSelectorParm: vol.dpSelectorParm,
DpReadOnlyWhenVolFull: vol.DpReadOnlyWhenVolFull,
VolType: vol.VolType,
ObjBlockSize: vol.EbsBlkSize,
CacheCapacity: vol.CacheCapacity,
CacheAction: vol.CacheAction,
CacheThreshold: vol.CacheThreshold,
CacheLruInterval: vol.CacheLRUInterval,
CacheTtl: vol.CacheTTL,
CacheLowWater: vol.CacheLowWater,
CacheHighWater: vol.CacheHighWater,
CacheRule: vol.CacheRule,
PreloadCapacity: vol.getPreloadCapacity(),
LatestVer: vol.VersionMgr.getLatestVer(),
Forbidden: vol.Forbidden,
EnableAuditLog: vol.EnableAuditLog,
}
vol.uidSpaceManager.RLock()
defer vol.uidSpaceManager.RUnlock()
for _, uid := range vol.uidSpaceManager.uidInfo {
view.Uids = append(view.Uids, proto.UidSimpleInfo{
UID: uid.Uid,
Limited: uid.Limited,
})
}
return
}
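// checkIp reports whether addr begins with a dotted-decimal IPv4 address.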
func checkIp(addr string) bool {
ip := strings.Trim(addr, " ")
regStr := `^(([1-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.)(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){2}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])`
if match, _ := regexp.MatchString(regStr, ip); match {
return true
}
return false
}
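// checkIpPort reports whether addr looks like an IPv4 address followed by a port between 1024 and 65535.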
func checkIpPort(addr string) bool {
var arr []string
if arr = strings.Split(addr, ":"); len(arr) < 2 {
return false
}
if id, err := strconv.ParseUint(arr[1], 10, 64); err != nil || id > 65535 || id < 1024 {
return false
}
ip := strings.Trim(addr, " ")
regStr := `^(([1-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.)(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){2}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])`
if match, _ := regexp.MatchString(regStr, ip); match {
return true
}
return false
}
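// addDataNode registers a new data node with the cluster, optionally placing it into the nodeset given by the id parameter.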
func (m *Server) addDataNode(w http.ResponseWriter, r *http.Request) {
var (
nodeAddr string
zoneName string
id uint64
err error
nodesetId uint64
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AddDataNode))
defer func() {
doStatAndMetric(proto.AddDataNode, metric, err, nil)
}()
if nodeAddr, zoneName, err = parseRequestForAddNode(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if !checkIpPort(nodeAddr) {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: fmt.Errorf("addr not legal").Error()})
return
}
var value string
if value = r.FormValue(idKey); value == "" {
nodesetId = 0
} else {
if nodesetId, err = strconv.ParseUint(value, 10, 64); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
}
if id, err = m.cluster.addDataNode(nodeAddr, zoneName, nodesetId); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(id))
}
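// getDataNode returns the detail of a data node, including the data partitions persisted on it.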
func (m *Server) getDataNode(w http.ResponseWriter, r *http.Request) {
var (
nodeAddr string
dataNode *DataNode
dataNodeInfo *proto.DataNodeInfo
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.GetDataNode))
defer func() {
doStatAndMetric(proto.GetDataNode, metric, err, nil)
}()
if nodeAddr, err = parseAndExtractNodeAddr(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if dataNode, err = m.cluster.dataNode(nodeAddr); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataNodeNotExists))
return
}
log.LogDebugf("getDataNode. addr %v Total %v used %v", nodeAddr, dataNode.Total, dataNode.Used)
dataNode.PersistenceDataPartitions = m.cluster.getAllDataPartitionIDByDatanode(nodeAddr)
// some dps may have been removed from this node even though their decommission failed
dataNodeInfo = &proto.DataNodeInfo{
Total: dataNode.Total,
Used: dataNode.Used,
AvailableSpace: dataNode.AvailableSpace,
ID: dataNode.ID,
ZoneName: dataNode.ZoneName,
Addr: dataNode.Addr,
DomainAddr: dataNode.DomainAddr,
ReportTime: dataNode.ReportTime,
IsActive: dataNode.isActive,
IsWriteAble: dataNode.isWriteAble(),
UsageRatio: dataNode.UsageRatio,
SelectedTimes: dataNode.SelectedTimes,
DataPartitionReports: dataNode.DataPartitionReports,
DataPartitionCount: dataNode.DataPartitionCount,
NodeSetID: dataNode.NodeSetID,
PersistenceDataPartitions: dataNode.PersistenceDataPartitions,
BadDisks: dataNode.BadDisks,
RdOnly: dataNode.RdOnly,
MaxDpCntLimit: dataNode.GetDpCntLimit(),
CpuUtil: dataNode.CpuUtil.Load(),
IoUtils: dataNode.GetIoUtils(),
}
sendOkReply(w, r, newSuccessHTTPReply(dataNodeInfo))
}
// Decommission a data node. This will decommission all the data partition on that node.
func (m *Server) decommissionDataNode(w http.ResponseWriter, r *http.Request) {
var (
rstMsg string
offLineAddr string
raftForce bool
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.DecommissionDataNode))
defer func() {
doStatAndMetric(proto.DecommissionDataNode, metric, err, nil)
}()
if offLineAddr, err = parseDecomDataNodeReq(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
raftForce, err = parseRaftForce(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if _, err = m.cluster.dataNode(offLineAddr); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataNodeNotExists))
return
}
if err = m.cluster.migrateDataNode(offLineAddr, "", raftForce, 0); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
rstMsg = fmt.Sprintf("decommission data node [%v] submited!need check status later!", offLineAddr)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
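// migrateDataNodeHandler migrates data partitions from a source data node to a writable target node in the same nodeset; the migration is submitted and runs asynchronously.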
func (m *Server) migrateDataNodeHandler(w http.ResponseWriter, r *http.Request) {
var (
srcAddr, targetAddr string
limit int
raftForce bool
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.MigrateDataNode))
defer func() {
doStatAndMetric(proto.MigrateDataNode, metric, err, nil)
}()
srcAddr, targetAddr, limit, err = parseMigrateNodeParam(r)
if err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
raftForce, err = parseRaftForce(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
srcNode, err := m.cluster.dataNode(srcAddr)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeDataNodeNotExists, Msg: err.Error()})
return
}
targetNode, err := m.cluster.dataNode(targetAddr)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeDataNodeNotExists, Msg: err.Error()})
return
}
if srcNode.NodeSetID != targetNode.NodeSetID {
err = fmt.Errorf("src %s and target %s must exist in the same nodeSet when migrate", srcAddr, targetAddr)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if !targetNode.isWriteAble() || !targetNode.dpCntInLimit() {
err = fmt.Errorf("[%s] is not writable, can't used as target addr for migrate", targetAddr)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if err = m.cluster.migrateDataNode(srcAddr, targetAddr, raftForce, limit); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
rstMsg := fmt.Sprintf("migrateDataNodeHandler from src [%v] to target[%v] has submited and run in asyn ways,need check laster!", srcAddr, targetAddr)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
// Cancel the in-progress decommission of a data node.
func (m *Server) cancelDecommissionDataNode(w http.ResponseWriter, r *http.Request) {
var (
node *DataNode
rstMsg string
offLineAddr string
err error
dps []uint64
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.CancelDecommissionDataNode))
defer func() {
doStatAndMetric(proto.CancelDecommissionDataNode, metric, err, nil)
}()
if offLineAddr, err = parseAndExtractNodeAddr(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if node, err = m.cluster.dataNode(offLineAddr); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataNodeNotExists))
return
}
if err, dps = m.cluster.decommissionDataNodeCancel(node); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
rstMsg = fmt.Sprintf("cancel decommission data node [%v] with paused failed[%v]", offLineAddr, dps)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
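// setNodeInfoHandler updates cluster-wide tunables such as delete batch count, load factor, delete and repair rate limits, the max dp count limit, the cluster create time, and node/nodeset selectors.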
func (m *Server) setNodeInfoHandler(w http.ResponseWriter, r *http.Request) {
var (
params map[string]interface{}
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminSetNodeInfo))
defer func() {
doStatAndMetric(proto.AdminSetNodeInfo, metric, err, nil)
}()
if params, err = parseAndExtractSetNodeInfoParams(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if batchCount, ok := params[nodeDeleteBatchCountKey]; ok {
if bc, ok := batchCount.(uint64); ok {
if err = m.cluster.setMetaNodeDeleteBatchCount(bc); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
}
if val, ok := params[clusterLoadFactorKey]; ok {
if factor, ok := val.(float32); ok {
if err = m.cluster.setClusterLoadFactor(factor); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
}
if val, ok := params[nodeMarkDeleteRateKey]; ok {
if v, ok := val.(uint64); ok {
if err = m.cluster.setDataNodeDeleteLimitRate(v); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
}
if val, ok := params[nodeAutoRepairRateKey]; ok {
if v, ok := val.(uint64); ok {
if err = m.cluster.setDataNodeAutoRepairLimitRate(v); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
}
if val, ok := params[nodeDpRepairTimeOutKey]; ok {
if v, ok := val.(uint64); ok {
if err = m.cluster.setDataPartitionRepairTimeOut(v); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
}
if val, ok := params[nodeDpMaxRepairErrCntKey]; ok {
if v, ok := val.(uint64); ok {
if err = m.cluster.setDataPartitionMaxRepairErrCnt(v); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
}
if val, ok := params[nodeDeleteWorkerSleepMs]; ok {
if v, ok := val.(uint64); ok {
if err = m.cluster.setMetaNodeDeleteWorkerSleepMs(v); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
}
if val, ok := params[maxDpCntLimitKey]; ok {
if v, ok := val.(uint64); ok {
if err = m.cluster.setMaxDpCntLimit(v); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
}
if val, ok := params[clusterCreateTimeKey]; ok {
if createTimeParam, ok := val.(string); ok {
var createTime time.Time
var err error
if createTime, err = time.ParseInLocation(proto.TimeFormat, createTimeParam, time.Local); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if err = m.cluster.setClusterCreateTime(createTime.Unix()); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
}
}
dataNodesetSelector := extractDataNodesetSelector(r)
metaNodesetSelector := extractMetaNodesetSelector(r)
dataNodeSelector := extractDataNodeSelector(r)
metaNodeSelector := extractMetaNodeSelector(r)
if err = m.updateClusterSelector(dataNodesetSelector, metaNodesetSelector, dataNodeSelector, metaNodeSelector); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("set nodeinfo params %v successfully", params)))
}
func (m *Server) updateDataUseRatio(ratio float64) (err error) {
m.cluster.domainManager.dataRatioLimit = ratio
err = m.cluster.putZoneDomain(false)
return
}
func (m *Server) updateExcludeZoneUseRatio(ratio float64) (err error) {
m.cluster.domainManager.excludeZoneUseRatio = ratio
err = m.cluster.putZoneDomain(false)
return
}
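// updateNodesetId moves a data node or meta node into the destination nodeset of the same zone, locking both nodesets in ID order and persisting the change.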
func (m *Server) updateNodesetId(zoneName string, destNodesetId uint64, nodeType uint64, addr string) (err error) {
var (
nsId uint64
dstNs *nodeSet
srcNs *nodeSet
ok bool
value interface{}
metaNode *MetaNode
dataNode *DataNode
nodeTypeUint32 uint32
)
defer func() {
log.LogInfof("action[updateNodesetId] step out")
}()
log.LogWarnf("action[updateNodesetId] zonename[%v] destNodesetId[%v] nodeType[%v] addr[%v]",
zoneName, destNodesetId, nodeType, addr)
if value, ok = m.cluster.t.zoneMap.Load(zoneName); !ok {
return fmt.Errorf("zonename [%v] not found", zoneName)
}
zone := value.(*Zone)
if dstNs, ok = zone.nodeSetMap[destNodesetId]; !ok {
return fmt.Errorf("%v destNodesetId not found", destNodesetId)
}
if nodeType == uint64(TypeDataPartition) {
value, ok = zone.dataNodes.Load(addr)
if !ok {
return fmt.Errorf("addr %v not found", addr)
}
nsId = value.(*DataNode).NodeSetID
} else if nodeType == uint64(TypeMetaPartition) {
value, ok = zone.metaNodes.Load(addr)
if !ok {
return fmt.Errorf("addr %v not found", addr)
}
nsId = value.(*MetaNode).NodeSetID
} else {
return fmt.Errorf("%v wrong type", nodeType)
}
log.LogInfof("action[updateNodesetId] zonename[%v] destNodesetId[%v] nodeType[%v] addr[%v] get destid[%v]",
zoneName, destNodesetId, nodeType, addr, dstNs.ID)
srcNs = zone.nodeSetMap[nsId]
if srcNs.ID == dstNs.ID {
return fmt.Errorf("addr belong to same nodeset")
} else if srcNs.ID < dstNs.ID {
// acquire the locks in nodeset ID order to avoid deadlock with concurrent updateNodesetId calls
srcNs.Lock()
dstNs.Lock()
defer srcNs.Unlock()
defer dstNs.Unlock()
} else {
// acquire the locks in nodeset ID order to avoid deadlock with concurrent updateNodesetId calls
dstNs.Lock()
srcNs.Lock()
defer dstNs.Unlock()
defer srcNs.Unlock()
}
// the nodeset capacity is not enlarged when a node is added; it can be adjusted via
// AdminUpdateNodeSetCapcity
if nodeType <= math.MaxUint32 {
nodeTypeUint32 = uint32(nodeType)
} else {
nodeTypeUint32 = math.MaxUint32
}
if nodeTypeUint32 == TypeDataPartition {
if value, ok = srcNs.dataNodes.Load(addr); !ok {
return fmt.Errorf("addr not found in srcNs.dataNodes")
}
dataNode = value.(*DataNode)
dataNode.NodeSetID = dstNs.ID
dstNs.putDataNode(dataNode)
srcNs.deleteDataNode(dataNode)
if err = m.cluster.syncUpdateDataNode(dataNode); err != nil {
dataNode.NodeSetID = srcNs.ID
return
}
} else {
if value, ok = srcNs.metaNodes.Load(addr); !ok {
return fmt.Errorf("ddr not found in srcNs.metaNodes")
}
metaNode = value.(*MetaNode)
metaNode.NodeSetID = dstNs.ID
dstNs.putMetaNode(metaNode)
srcNs.deleteMetaNode(metaNode)
if err = m.cluster.syncUpdateMetaNode(metaNode); err != nil {
metaNode.NodeSetID = srcNs.ID
return
}
}
if err = m.cluster.syncUpdateNodeSet(dstNs); err != nil {
return fmt.Errorf("warn:syncUpdateNodeSet dst srcNs [%v] failed", dstNs.ID)
}
if err = m.cluster.syncUpdateNodeSet(srcNs); err != nil {
return fmt.Errorf("warn:syncUpdateNodeSet src srcNs [%v] failed", srcNs.ID)
}
return
}
func (m *Server) updateZoneNodeSelector(zoneName string, dataNodeSelector string, metaNodeSelector string) (err error) {
var ok bool
var value interface{}
if value, ok = m.cluster.t.zoneMap.Load(zoneName); !ok {
err = fmt.Errorf("zonename [%v] not found", zoneName)
return
}
zone := value.(*Zone)
zone.nsLock.RLock()
defer zone.nsLock.RUnlock()
for _, ns := range zone.nodeSetMap {
needSync := false
if dataNodeSelector != "" && dataNodeSelector != ns.GetDataNodeSelector() {
ns.SetDataNodeSelector(dataNodeSelector)
needSync = true
}
if metaNodeSelector != "" && metaNodeSelector != ns.GetMetaNodeSelector() {
ns.SetMetaNodeSelector(metaNodeSelector)
needSync = true
}
if needSync {
err = m.cluster.syncUpdateNodeSet(ns)
if err != nil {
return
}
}
}
return
}
func (m *Server) updateZoneNodesetNodeSelector(zoneName string, nodesetId uint64, dataNodesetSelector string, metaNodesetSelector string) (err error) {
var ns *nodeSet
var ok bool
var value interface{}
if value, ok = m.cluster.t.zoneMap.Load(zoneName); !ok {
err = fmt.Errorf("zonename [%v] not found", zoneName)
return
}
zone := value.(*Zone)
if ns, ok = zone.nodeSetMap[nodesetId]; !ok {
err = fmt.Errorf("nodesetId [%v] not found", nodesetId)
return
}
needSync := false
if dataNodesetSelector != "" && dataNodesetSelector != ns.GetDataNodeSelector() {
ns.SetDataNodeSelector(dataNodesetSelector)
needSync = true
}
if metaNodesetSelector != "" && metaNodesetSelector != ns.GetMetaNodeSelector() {
ns.SetMetaNodeSelector(metaNodesetSelector)
needSync = true
}
if needSync {
err = m.cluster.syncUpdateNodeSet(ns)
if err != nil {
return
}
}
log.LogInfof("action[updateNodesetNodeSelector] zonename %v nodeset %v dataNodeSelector %v metaNodeSelector %v", zoneName, nodesetId, dataNodesetSelector, metaNodesetSelector)
return
}
func (m *Server) updateClusterSelector(dataNodesetSelector string, metaNodesetSelector string, dataNodeSelector string, metaNodeSelector string) (err error) {
m.cluster.t.zoneMap.Range(func(key, value interface{}) bool {
zone := value.(*Zone)
err = zone.updateNodesetSelector(m.cluster, dataNodesetSelector, metaNodesetSelector)
if err != nil {
return false
}
err = m.updateZoneNodeSelector(zone.name, dataNodeSelector, metaNodeSelector)
if err != nil {
return false
}
return true
})
return
}
func (m *Server) setDpRdOnly(partitionID uint64, rdOnly bool) (err error) {
var dp *DataPartition
if dp, err = m.cluster.getDataPartitionByID(partitionID); err != nil {
return fmt.Errorf("[setPartitionRdOnly] getDataPartitionByID err(%s)", err.Error())
}
dp.RLock()
dp.RdOnly = rdOnly
m.cluster.syncUpdateDataPartition(dp)
dp.RUnlock()
return
}
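// setNodeRdOnly marks a data node or meta node read-only (or clears the flag) and persists the change, rolling back the in-memory flag if persistence fails.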
func (m *Server) setNodeRdOnly(addr string, nodeType uint32, rdOnly bool) (err error) {
if nodeType == TypeDataPartition {
m.cluster.dnMutex.Lock()
defer m.cluster.dnMutex.Unlock()
value, ok := m.cluster.dataNodes.Load(addr)
if !ok {
return fmt.Errorf("[setNodeRdOnly] data node %s is not exist", addr)
}
dataNode := value.(*DataNode)
oldRdOnly := dataNode.RdOnly
dataNode.RdOnly = rdOnly
if err = m.cluster.syncUpdateDataNode(dataNode); err != nil {
dataNode.RdOnly = oldRdOnly
return fmt.Errorf("[setNodeRdOnly] syncUpdateDataNode err(%s)", err.Error())
}
return
}
m.cluster.mnMutex.Lock()
defer m.cluster.mnMutex.Unlock()
value, ok := m.cluster.metaNodes.Load(addr)
if !ok {
return fmt.Errorf("[setNodeRdOnly] meta node %s is not exist", addr)
}
metaNode := value.(*MetaNode)
oldRdOnly := metaNode.RdOnly
metaNode.RdOnly = rdOnly
if err = m.cluster.syncUpdateMetaNode(metaNode); err != nil {
metaNode.RdOnly = oldRdOnly
return fmt.Errorf("[setNodeRdOnly] syncUpdateMetaNode err(%s)", err.Error())
}
return
}
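// updateNodesetCapcity sets the capacity of a nodeset; the value must be between the default replica number and 100.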
func (m *Server) updateNodesetCapcity(zoneName string, nodesetId uint64, capcity uint64) (err error) {
var ns *nodeSet
var ok bool
var value interface{}
if capcity < defaultReplicaNum || capcity > 100 {
err = fmt.Errorf("capcity [%v] value out of scope", capcity)
return
}
if value, ok = m.cluster.t.zoneMap.Load(zoneName); !ok {
err = fmt.Errorf("zonename [%v] not found", zoneName)
return
}
zone := value.(*Zone)
if ns, ok = zone.nodeSetMap[nodesetId]; !ok {
err = fmt.Errorf("nodesetId [%v] not found", nodesetId)
return
}
ns.Capacity = int(capcity)
m.cluster.syncUpdateNodeSet(ns)
log.LogInfof("action[updateNodesetCapcity] zonename %v nodeset %v capcity %v", zoneName, nodesetId, capcity)
return
}
func (m *Server) buildNodeSetGrpInfoByID(domainId, grpId uint64) (*proto.SimpleNodeSetGrpInfo, error) {
domainIndex := m.cluster.domainManager.domainId2IndexMap[domainId]
nsgm := m.cluster.domainManager.domainNodeSetGrpVec[domainIndex]
var index int
for index = 0; index < len(nsgm.nodeSetGrpMap); index++ {
if nsgm.nodeSetGrpMap[index].ID == grpId {
break
}
if nsgm.nodeSetGrpMap[index].ID > grpId {
return nil, fmt.Errorf("id not found")
}
}
if index == len(nsgm.nodeSetGrpMap) {
return nil, fmt.Errorf("id not found")
}
return m.buildNodeSetGrpInfo(nsgm.nodeSetGrpMap[index]), nil
}
func (m *Server) buildNodeSetGrpInfo(nsg *nodeSetGroup) *proto.SimpleNodeSetGrpInfo {
nsgStat := new(proto.SimpleNodeSetGrpInfo)
nsgStat.ID = nsg.ID
nsgStat.Status = nsg.status
for i := 0; i < len(nsg.nodeSets); i++ {
var nsStat proto.NodeSetInfo
nsStat.ID = nsg.nodeSets[i].ID
nsStat.Capacity = nsg.nodeSets[i].Capacity
nsStat.ZoneName = nsg.nodeSets[i].zoneName
nsg.nodeSets[i].dataNodes.Range(func(key, value interface{}) bool {
node := value.(*DataNode)
nsStat.DataTotal += node.Total
if node.isWriteAble() {
nsStat.DataUsed += node.Used
} else {
nsStat.DataUsed += node.Total
}
log.LogInfof("nodeset index[%v], datanode nodeset id[%v],zonename[%v], addr[%v] inner nodesetid[%v]",
i, nsStat.ID, node.ZoneName, node.Addr, node.NodeSetID)
dataNodeInfo := &proto.DataNodeInfo{
Total: node.Total,
Used: node.Used,
AvailableSpace: node.AvailableSpace,
ID: node.ID,
ZoneName: node.ZoneName,
Addr: node.Addr,
ReportTime: node.ReportTime,
IsActive: node.isActive,
IsWriteAble: node.isWriteAble(),
UsageRatio: node.UsageRatio,
SelectedTimes: node.SelectedTimes,
DataPartitionCount: node.DataPartitionCount,
NodeSetID: node.NodeSetID,
}
nsStat.DataNodes = append(nsStat.DataNodes, dataNodeInfo)
return true
})
nsStat.DataUseRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", float64(nsStat.DataUsed)/float64(nsStat.DataTotal)), 64)
nsg.nodeSets[i].metaNodes.Range(func(key, value interface{}) bool {
node := value.(*MetaNode)
nsStat.MetaTotal += node.Total
nsStat.MetaUsed += node.Used
log.LogInfof("nodeset index[%v], metanode nodeset id[%v],zonename[%v], addr[%v] inner nodesetid[%v]",
i, nsStat.ID, node.ZoneName, node.Addr, node.NodeSetID)
metaNodeInfo := &proto.MetaNodeInfo{
ID: node.ID,
Addr: node.Addr,
IsActive: node.IsActive,
IsWriteAble: node.isWritable(),
ZoneName: node.ZoneName,
MaxMemAvailWeight: node.MaxMemAvailWeight,
Total: node.Total,
Used: node.Used,
Ratio: node.Ratio,
SelectCount: node.SelectCount,
Threshold: node.Threshold,
ReportTime: node.ReportTime,
MetaPartitionCount: node.MetaPartitionCount,
NodeSetID: node.NodeSetID,
}
nsStat.MetaNodes = append(nsStat.MetaNodes, metaNodeInfo)
return true
})
nsStat.MetaUseRatio, _ = strconv.ParseFloat(fmt.Sprintf("%.2f", float64(nsStat.MetaUsed)/float64(nsStat.MetaTotal)), 64)
nsgStat.NodeSetInfo = append(nsgStat.NodeSetInfo, nsStat)
log.LogInfof("nodeset index[%v], nodeset id[%v],capacity[%v], datatotal[%v] dataused[%v] metatotal[%v] metaused[%v], metanode[%v], datanodes[%v]",
i, nsStat.ID, nsStat.Capacity, nsStat.DataTotal, nsStat.DataUsed, nsStat.MetaTotal, nsStat.MetaUsed, nsStat.MetaNodes, nsStat.DataNodes)
}
return nsgStat
}
func parseSetNodeRdOnlyParam(r *http.Request) (addr string, nodeType uint32, rdOnly bool, err error) {
if err = r.ParseForm(); err != nil {
return
}
if addr = r.FormValue(addrKey); addr == "" {
err = fmt.Errorf("parseSetNodeRdOnlyParam %s is empty", addrKey)
return
}
if nodeType, err = parseNodeType(r); err != nil {
return
}
val := r.FormValue(rdOnlyKey)
if val == "" {
err = fmt.Errorf("parseSetNodeRdOnlyParam %s is empty", rdOnlyKey)
return
}
if rdOnly, err = strconv.ParseBool(val); err != nil {
err = fmt.Errorf("parseSetNodeRdOnlyParam %s is not bool value %s", rdOnlyKey, val)
return
}
return
}
func parseSetDpRdOnlyParam(r *http.Request) (dpId uint64, rdOnly bool, err error) {
if err = r.ParseForm(); err != nil {
return
}
if dpId, err = extractDataPartitionID(r); err != nil {
err = fmt.Errorf("parseSetDpRdOnlyParam get dpid error %v", err)
return
}
val := r.FormValue(rdOnlyKey)
if val == "" {
err = fmt.Errorf("parseSetDpRdOnlyParam %s is empty", rdOnlyKey)
return
}
if rdOnly, err = strconv.ParseBool(val); err != nil {
err = fmt.Errorf("parseSetDpRdOnlyParam %s is not bool value %s", rdOnlyKey, val)
return
}
return
}
func parseNodeType(r *http.Request) (nodeType uint32, err error) {
var val string
var nodeTypeUint64 uint64
if val = r.FormValue(nodeTypeKey); val == "" {
err = fmt.Errorf("parseSetNodeRdOnlyParam %s is empty", nodeTypeKey)
return
}
if nodeTypeUint64, err = strconv.ParseUint(val, 10, 32); err != nil {
err = fmt.Errorf("parseSetNodeRdOnlyParam %s is not number, err %s", nodeTypeKey, err.Error())
return
}
nodeType = uint32(nodeTypeUint64)
if nodeType != TypeDataPartition && nodeType != TypeMetaPartition {
err = fmt.Errorf("parseSetNodeRdOnlyParam %s is not legal, must be %d or %d", nodeTypeKey, TypeDataPartition, TypeMetaPartition)
return
}
return
}
func (m *Server) setNodeRdOnlyHandler(w http.ResponseWriter, r *http.Request) {
var (
addr string
nodeType uint32
rdOnly bool
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminSetNodeRdOnly))
defer func() {
doStatAndMetric(proto.AdminSetNodeRdOnly, metric, err, nil)
}()
addr, nodeType, rdOnly, err = parseSetNodeRdOnlyParam(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
log.LogInfof("[setNodeRdOnlyHandler] set node %s to rdOnly(%v)", addr, rdOnly)
err = m.setNodeRdOnly(addr, nodeType, rdOnly)
if err != nil {
log.LogErrorf("[setNodeRdOnlyHandler] set node %s to rdOnly %v, err (%s)", addr, rdOnly, err.Error())
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("[setNodeRdOnlyHandler] set node %s to rdOnly(%v) success", addr, rdOnly)))
return
}
func (m *Server) setDpRdOnlyHandler(w http.ResponseWriter, r *http.Request) {
var (
dpId uint64
rdOnly bool
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminSetDpRdOnly))
defer func() {
doStatAndMetric(proto.AdminSetDpRdOnly, metric, err, nil)
}()
dpId, rdOnly, err = parseSetDpRdOnlyParam(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
log.LogInfof("[setNodeRdOnlyHandler] set dp %v to rdOnly(%v)", dpId, rdOnly)
err = m.setDpRdOnly(dpId, rdOnly)
if err != nil {
log.LogErrorf("[setNodeRdOnlyHandler] set dp %v to rdOnly %v, err (%s)", dpId, rdOnly, err.Error())
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("[setNodeRdOnlyHandler] set dpid %v to rdOnly(%v) success", dpId, rdOnly)))
return
}
func (m *Server) updateNodeSetCapacityHandler(w http.ResponseWriter, r *http.Request) {
var (
params map[string]interface{}
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminUpdateNodeSetCapcity))
defer func() {
doStatAndMetric(proto.AdminUpdateNodeSetCapcity, metric, err, nil)
}()
if params, err = parseAndExtractSetNodeSetInfoParams(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err := m.updateNodesetCapcity(params[zoneNameKey].(string), params[idKey].(uint64), params[countKey].(uint64)); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply("set nodesetinfo successfully"))
}
func (m *Server) updateDataUseRatioHandler(w http.ResponseWriter, r *http.Request) {
	var err error
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminUpdateDomainDataUseRatio))
defer func() {
doStatAndMetric(proto.AdminUpdateDomainDataUseRatio, metric, err, nil)
}()
var value string
if value = r.FormValue(ratio); value == "" {
err = keyNotFound(ratio)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
var ratioVal float64
if ratioVal, err = strconv.ParseFloat(value, 64); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
	if ratioVal <= 0 || ratioVal > 1 {
		sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: "ratio must be in the range (0, 1]"})
return
}
if err = m.updateDataUseRatio(ratioVal); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("set nodesetinfo params %v successfully", params)))
}
func (m *Server) updateZoneExcludeRatioHandler(w http.ResponseWriter, r *http.Request) {
	var err error
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminUpdateZoneExcludeRatio))
defer func() {
doStatAndMetric(proto.AdminUpdateZoneExcludeRatio, metric, err, nil)
}()
var value string
if value = r.FormValue(ratio); value == "" {
err = keyNotFound(ratio)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
var ratioVal float64
if ratioVal, err = strconv.ParseFloat(value, 64); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.updateExcludeZoneUseRatio(ratioVal); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("set nodesetinfo params %v successfully", params)))
}
func (m *Server) updateNodeSetIdHandler(w http.ResponseWriter, r *http.Request) {
var (
nodeAddr string
id uint64
zoneName string
err error
nodeType uint64
value string
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminUpdateNodeSetId))
defer func() {
doStatAndMetric(proto.AdminUpdateNodeSetId, metric, err, nil)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
}
}()
if zoneName = r.FormValue(zoneNameKey); zoneName == "" {
zoneName = DefaultZoneName
}
if err = r.ParseForm(); err != nil {
return
}
if nodeAddr, err = extractNodeAddr(r); err != nil {
return
}
if id, err = extractNodeID(r); err != nil {
return
}
if value = r.FormValue(nodeTypeKey); value == "" {
err = fmt.Errorf("need param nodeType")
return
}
if nodeType, err = strconv.ParseUint(value, 10, 64); err != nil {
return
}
if err = m.updateNodesetId(zoneName, id, nodeType, nodeAddr); err != nil {
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("update node setid successfully")))
}
func (m *Server) updateNodeSetNodeSelector(w http.ResponseWriter, r *http.Request) {
var (
id uint64
zoneName string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminUpdateNodeSetNodeSelector))
defer func() {
doStatAndMetric(proto.AdminUpdateNodeSetNodeSelector, metric, err, nil)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
}
}()
if zoneName = r.FormValue(zoneNameKey); zoneName == "" {
zoneName = DefaultZoneName
}
if err = r.ParseForm(); err != nil {
return
}
if id, err = extractNodesetID(r); err != nil {
return
}
dataNodeSelector := extractDataNodeSelector(r)
metaNodeSelector := r.FormValue(metaNodeSelectorKey)
if err = m.updateZoneNodesetNodeSelector(zoneName, id, dataNodeSelector, metaNodeSelector); err != nil {
return
}
sendOkReply(w, r, newSuccessHTTPReply("update nodeset selector successfully"))
}
// get nodeset group info of a fault domain by id
func (m *Server) getNodeSetGrpInfoHandler(w http.ResponseWriter, r *http.Request) {
var err error
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetNodeSetGrpInfo))
defer func() {
doStatAndMetric(proto.AdminGetNodeSetGrpInfo, metric, err, nil)
}()
if err = r.ParseForm(); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
var value string
var id uint64
if value = r.FormValue(idKey); value != "" {
id, err = strconv.ParseUint(value, 10, 64)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
}
var domainId uint64
if value = r.FormValue(domainIdKey); value != "" {
domainId, err = strconv.ParseUint(value, 10, 64)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
}
log.LogInfof("action[getNodeSetGrpInfoHandler] id [%v]", id)
var info *proto.SimpleNodeSetGrpInfo
if info, err = m.buildNodeSetGrpInfoByID(domainId, id); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(info))
}
func (m *Server) getIsDomainOn(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetIsDomainOn))
defer func() {
doStatAndMetric(proto.AdminGetIsDomainOn, metric, nil, nil)
}()
type SimpleDomainInfo struct {
DomainOn bool
}
nsglStat := new(SimpleDomainInfo)
nsglStat.DomainOn = m.cluster.FaultDomain
sendOkReply(w, r, newSuccessHTTPReply(nsglStat))
}
func (m *Server) createDomainHandler(w http.ResponseWriter, r *http.Request) {
nsgm := m.cluster.domainManager
var (
zoneName string
err error
)
if zoneName = r.FormValue(zoneNameKey); zoneName == "" {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: fmt.Errorf("zonename null").Error()})
return
}
if err = nsgm.createDomain(zoneName); err != nil {
log.LogErrorf("action[createDomainHandler] err [%v]", err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("successful")))
}
// get nodeset group info of all fault domains
func (m *Server) getAllNodeSetGrpInfoHandler(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetAllNodeSetGrpInfo))
defer func() {
doStatAndMetric(proto.AdminGetAllNodeSetGrpInfo, metric, nil, nil)
}()
nsgm := m.cluster.domainManager
nsglStat := new(proto.DomainNodeSetGrpInfoList)
nsglStat.DomainOn = m.cluster.FaultDomain
nsglStat.NeedDomain = m.cluster.needFaultDomain
nsglStat.DataRatioLimit = nsgm.dataRatioLimit
nsglStat.ZoneExcludeRatioLimit = nsgm.excludeZoneUseRatio
nsglStat.ExcludeZones = nsgm.c.t.domainExcludeZones
for i := 0; i < len(nsgm.domainNodeSetGrpVec); i++ {
nodeSetGrpInfoList := &proto.SimpleNodeSetGrpInfoList{}
nodeSetGrpInfoList.DomainId = nsgm.domainNodeSetGrpVec[i].domainId
nodeSetGrpInfoList.Status = nsgm.domainNodeSetGrpVec[i].status
nsglStat.DomainNodeSetGrpInfo = append(nsglStat.DomainNodeSetGrpInfo, nodeSetGrpInfoList)
log.LogInfof("action[getAllNodeSetGrpInfoHandler] start build domain id [%v]", nsgm.domainNodeSetGrpVec[i].domainId)
for j := 0; j < len(nsgm.domainNodeSetGrpVec[i].nodeSetGrpMap); j++ {
log.LogInfof("action[getAllNodeSetGrpInfoHandler] build domain id [%v] nodeset group index [%v] Print inner nodeset now!",
nsgm.domainNodeSetGrpVec[i].domainId, j)
nodeSetGrpInfoList.SimpleNodeSetGrpInfo = append(nodeSetGrpInfoList.SimpleNodeSetGrpInfo,
m.buildNodeSetGrpInfo(nsgm.domainNodeSetGrpVec[i].nodeSetGrpMap[j]))
}
}
sendOkReply(w, r, newSuccessHTTPReply(nsglStat))
}
// get cluster-level node tuning parameters (delete batch count, repair rate limits, etc.)
func (m *Server) getNodeInfoHandler(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetNodeInfo))
defer func() {
doStatAndMetric(proto.AdminGetNodeInfo, metric, nil, nil)
}()
resp := make(map[string]string)
resp[nodeDeleteBatchCountKey] = fmt.Sprintf("%v", m.cluster.cfg.MetaNodeDeleteBatchCount)
resp[nodeMarkDeleteRateKey] = fmt.Sprintf("%v", m.cluster.cfg.DataNodeDeleteLimitRate)
resp[nodeDeleteWorkerSleepMs] = fmt.Sprintf("%v", m.cluster.cfg.MetaNodeDeleteWorkerSleepMs)
resp[nodeAutoRepairRateKey] = fmt.Sprintf("%v", m.cluster.cfg.DataNodeAutoRepairLimitRate)
resp[nodeDpRepairTimeOutKey] = fmt.Sprintf("%v", m.cluster.cfg.DpRepairTimeOut)
resp[nodeDpMaxRepairErrCntKey] = fmt.Sprintf("%v", m.cluster.cfg.DpMaxRepairErrCnt)
resp[clusterLoadFactorKey] = fmt.Sprintf("%v", m.cluster.cfg.ClusterLoadFactor)
resp[maxDpCntLimitKey] = fmt.Sprintf("%v", m.cluster.cfg.MaxDpCntLimit)
sendOkReply(w, r, newSuccessHTTPReply(resp))
}
func (m *Server) diagnoseMetaPartition(w http.ResponseWriter, r *http.Request) {
var (
err error
rstMsg *proto.MetaPartitionDiagnosis
inactiveNodes []string
noLeaderMps []*MetaPartition
lackReplicaMps []*MetaPartition
badReplicaMps []*MetaPartition
excessReplicaMPs []*MetaPartition
inodeCountNotEqualReplicaMps []*MetaPartition
maxInodeNotEqualMPs []*MetaPartition
dentryCountNotEqualReplicaMps []*MetaPartition
corruptMpIDs []uint64
lackReplicaMpIDs []uint64
badReplicaMpIDs []uint64
excessReplicaMpIDs []uint64
inodeCountNotEqualReplicaMpIDs []uint64
maxInodeNotEqualReplicaMpIDs []uint64
dentryCountNotEqualReplicaMpIDs []uint64
badMetaPartitions []badPartitionView
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminDiagnoseMetaPartition))
defer func() {
doStatAndMetric(proto.AdminDiagnoseMetaPartition, metric, err, nil)
}()
corruptMpIDs = make([]uint64, 0)
lackReplicaMpIDs = make([]uint64, 0)
badReplicaMpIDs = make([]uint64, 0)
excessReplicaMpIDs = make([]uint64, 0)
if inactiveNodes, err = m.cluster.checkInactiveMetaNodes(); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if lackReplicaMps, noLeaderMps, badReplicaMps, excessReplicaMPs,
inodeCountNotEqualReplicaMps, maxInodeNotEqualMPs, dentryCountNotEqualReplicaMps, err = m.cluster.checkReplicaMetaPartitions(); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
for _, mp := range noLeaderMps {
corruptMpIDs = append(corruptMpIDs, mp.PartitionID)
}
for _, mp := range lackReplicaMps {
lackReplicaMpIDs = append(lackReplicaMpIDs, mp.PartitionID)
}
for _, mp := range badReplicaMps {
badReplicaMpIDs = append(badReplicaMpIDs, mp.PartitionID)
}
for _, mp := range excessReplicaMPs {
excessReplicaMpIDs = append(excessReplicaMpIDs, mp.PartitionID)
}
for _, mp := range inodeCountNotEqualReplicaMps {
inodeCountNotEqualReplicaMpIDs = append(inodeCountNotEqualReplicaMpIDs, mp.PartitionID)
}
for _, mp := range maxInodeNotEqualMPs {
maxInodeNotEqualReplicaMpIDs = append(maxInodeNotEqualReplicaMpIDs, mp.PartitionID)
}
for _, mp := range dentryCountNotEqualReplicaMps {
dentryCountNotEqualReplicaMpIDs = append(dentryCountNotEqualReplicaMpIDs, mp.PartitionID)
}
badMetaPartitions = m.cluster.getBadMetaPartitionsView()
rstMsg = &proto.MetaPartitionDiagnosis{
InactiveMetaNodes: inactiveNodes,
CorruptMetaPartitionIDs: corruptMpIDs,
LackReplicaMetaPartitionIDs: lackReplicaMpIDs,
BadMetaPartitionIDs: badMetaPartitions,
BadReplicaMetaPartitionIDs: badReplicaMpIDs,
ExcessReplicaMetaPartitionIDs: excessReplicaMpIDs,
InodeCountNotEqualReplicaMetaPartitionIDs: inodeCountNotEqualReplicaMpIDs,
MaxInodeNotEqualReplicaMetaPartitionIDs: maxInodeNotEqualReplicaMpIDs,
DentryCountNotEqualReplicaMetaPartitionIDs: dentryCountNotEqualReplicaMpIDs,
}
log.LogInfof("diagnose metaPartition cluster[%v], inactiveNodes:[%v], corruptMpIDs:[%v], "+
"lackReplicaMpIDs:[%v], badReplicaMpIDs:[%v], excessReplicaDpIDs[%v] "+
"inodeCountNotEqualReplicaMpIDs[%v] dentryCountNotEqualReplicaMpIDs[%v]",
m.cluster.Name, inactiveNodes, corruptMpIDs, lackReplicaMpIDs, badReplicaMpIDs, excessReplicaMpIDs,
inodeCountNotEqualReplicaMpIDs, dentryCountNotEqualReplicaMpIDs)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
// Decommission a disk. This will decommission all the data partitions on this disk.
// If parameter diskDisable is true, creating data partitions on this disk will not be allowed.
func (m *Server) decommissionDisk(w http.ResponseWriter, r *http.Request) {
var (
rstMsg string
offLineAddr, diskPath string
diskDisable bool
err error
raftForce bool
limit int
decommissionType int
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.DecommissionDisk))
defer func() {
doStatAndMetric(proto.DecommissionDisk, metric, err, nil)
}()
// default diskDisable is true
if offLineAddr, diskPath, diskDisable, limit, decommissionType, err = parseReqToDecoDisk(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
raftForce, err = parseRaftForce(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.cluster.migrateDisk(offLineAddr, diskPath, "", raftForce, limit, diskDisable, uint32(decommissionType)); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
rstMsg = fmt.Sprintf("decommission disk [%v:%v] submited!need check status later!", offLineAddr, diskPath)
Warn(m.clusterName, rstMsg)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
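// Note (an assumption inferred from the query handlers below, not a documented API
// contract): the decommission task created by decommissionDisk is tracked in
// m.cluster.DecommissionDisks under a key of the form "<nodeAddr>_<diskPath>", so its
// progress can later be polled with the same addr/disk parameters, e.g.:
//
//	key := fmt.Sprintf("%s_%s", offLineAddr, diskPath) // e.g. "192.168.0.11:17310_/cfs/disk1" (hypothetical values)
//	value, ok := m.cluster.DecommissionDisks.Load(key)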
// Recommission a disk that was disabled during decommissioning. This will allow creating data partitions on this disk again.
func (m *Server) recommissionDisk(w http.ResponseWriter, r *http.Request) {
var (
node *DataNode
rstMsg string
onLineAddr, diskPath string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.RecommissionDisk))
defer func() {
doStatAndMetric(proto.RecommissionDisk, metric, err, nil)
}()
if onLineAddr, diskPath, err = parseReqToRecoDisk(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if node, err = m.cluster.dataNode(onLineAddr); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataNodeNotExists))
return
}
if err = m.cluster.deleteAndSyncDecommissionedDisk(node, diskPath); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
rstMsg = fmt.Sprintf("receive recommissionDisk node[%v] disk[%v], and recommission successfully",
node.Addr, diskPath)
Warn(m.clusterName, rstMsg)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
func (m *Server) restoreStoppedAutoDecommissionDisk(w http.ResponseWriter, r *http.Request) {
var (
rstMsg string
offLineAddr, diskPath string
err error
)
metric := exporter.NewTPCnt("req_restoreStoppedAutoDecommissionDisk")
defer func() {
metric.Set(err)
}()
if offLineAddr, diskPath, _, _, _, err = parseReqToDecoDisk(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.cluster.restoreStoppedAutoDecommissionDisk(offLineAddr, diskPath); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
rstMsg = fmt.Sprintf("restoreStoppedAutoDecommissionDisk node[%v] disk[%v] submited!need check status later!",
offLineAddr, diskPath)
Warn(m.clusterName, rstMsg)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
func (m *Server) queryDiskDecoProgress(w http.ResponseWriter, r *http.Request) {
var (
offLineAddr, diskPath string
err error
)
metric := exporter.NewTPCnt("req_queryDiskDecoProgress")
defer func() {
metric.Set(err)
}()
if offLineAddr, diskPath, _, _, _, err = parseReqToDecoDisk(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
key := fmt.Sprintf("%s_%s", offLineAddr, diskPath)
value, ok := m.cluster.DecommissionDisks.Load(key)
if !ok {
ret := fmt.Sprintf("action[queryDiskDecoProgress]cannot found decommission task for node[%v] disk[%v], "+
"may be already offline", offLineAddr, diskPath)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: ret})
return
}
disk := value.(*DecommissionDisk)
status, progress := disk.updateDecommissionStatus(m.cluster, true)
progress, _ = FormatFloatFloor(progress, 4)
resp := &proto.DecommissionProgress{
Status: status,
Progress: fmt.Sprintf("%.2f%%", progress*float64(100)),
StatusMessage: GetDecommissionStatusMessage(status),
}
if status == DecommissionFail {
dps := disk.GetLatestDecommissionDP(m.cluster)
dpIds := make([]uint64, 0)
for _, dp := range dps {
if dp.IsDecommissionFailed() {
dpIds = append(dpIds, dp.PartitionID)
}
}
resp.FailedDps = dpIds
}
sendOkReply(w, r, newSuccessHTTPReply(resp))
}
func (m *Server) queryDecommissionDiskDecoFailedDps(w http.ResponseWriter, r *http.Request) {
var (
offLineAddr, diskPath string
err error
)
metric := exporter.NewTPCnt("req_queryDecommissionDiskDecoFailedDps")
defer func() {
metric.Set(err)
}()
if offLineAddr, diskPath, _, _, _, err = parseReqToDecoDisk(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
key := fmt.Sprintf("%s_%s", offLineAddr, diskPath)
value, ok := m.cluster.DecommissionDisks.Load(key)
if !ok {
ret := fmt.Sprintf("action[queryDiskDecoProgress]cannot found decommission task for node[%v] disk[%v], "+
"may be already offline", offLineAddr, diskPath)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: ret})
return
}
disk := value.(*DecommissionDisk)
err, dps := disk.GetDecommissionFailedDP(m.cluster)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(dps))
}
func (m *Server) queryAllDecommissionDisk(w http.ResponseWriter, r *http.Request) {
var err error
metric := exporter.NewTPCnt("req_queryAllDecommissionDisk")
defer func() {
metric.Set(err)
}()
resp := &proto.DecommissionDisksResponse{}
m.cluster.DecommissionDisks.Range(func(key, value interface{}) bool {
disk := value.(*DecommissionDisk)
info := proto.DecommissionDiskInfo{
SrcAddr: disk.SrcAddr,
DiskPath: disk.DiskPath,
DecommissionStatus: disk.GetDecommissionStatus(),
DecommissionRaftForce: disk.DecommissionRaftForce,
DecommissionRetry: disk.DecommissionRetry,
DecommissionDpTotal: disk.DecommissionDpTotal,
DecommissionTerm: disk.DecommissionTerm,
DecommissionLimit: disk.DecommissionDpCount,
Type: disk.Type,
DecommissionCompleteTime: disk.DecommissionCompleteTime,
}
_, info.Progress = disk.updateDecommissionStatus(m.cluster, true)
resp.Infos = append(resp.Infos, info)
return true
})
sendOkReply(w, r, newSuccessHTTPReply(resp))
}
func (m *Server) markDecoDiskFixed(w http.ResponseWriter, r *http.Request) {
var (
offLineAddr, diskPath string
err error
)
metric := exporter.NewTPCnt("req_markDecoDiskFixed")
defer func() {
metric.Set(err)
}()
if offLineAddr, diskPath, _, _, _, err = parseReqToDecoDisk(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
key := fmt.Sprintf("%s_%s", offLineAddr, diskPath)
value, ok := m.cluster.DecommissionDisks.Load(key)
if !ok {
ret := fmt.Sprintf("action[queryDiskDecoProgress]cannot found decommission task for node[%v] disk[%v], "+
"may be already offline", offLineAddr, diskPath)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: ret})
return
}
disk := value.(*DecommissionDisk)
err = m.cluster.syncDeleteDecommissionDisk(disk)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeInternalError, Msg: err.Error()})
return
}
m.cluster.DecommissionDisks.Delete(disk.GenerateKey())
sendOkReply(w, r, newSuccessHTTPReply("success"))
}
func (m *Server) cancelDecommissionDisk(w http.ResponseWriter, r *http.Request) {
var (
offLineAddr, diskPath string
err error
)
metric := exporter.NewTPCnt("req_cancelDecommissionDisk")
defer func() {
metric.Set(err)
}()
if offLineAddr, diskPath, _, _, _, err = parseReqToDecoDisk(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
key := fmt.Sprintf("%s_%s", offLineAddr, diskPath)
value, ok := m.cluster.DecommissionDisks.Load(key)
if !ok {
ret := fmt.Sprintf("action[queryDiskDecoProgress]cannot found decommission task for node[%v] disk[%v], "+
"may be already offline", offLineAddr, diskPath)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: ret})
return
}
disk := value.(*DecommissionDisk)
err, dps := m.cluster.decommissionDiskCancel(disk)
if err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
rstMsg := fmt.Sprintf("cancel decommission data node [%s] disk[%s] successfully with failed dp %v",
offLineAddr, diskPath, dps)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
// handle tasks such as heartbeat, loadDataPartition, deleteDataPartition, etc.
func (m *Server) handleDataNodeTaskResponse(w http.ResponseWriter, r *http.Request) {
var (
tr *proto.AdminTask
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.GetDataNodeTaskResponse))
defer func() {
doStatAndMetric(proto.GetDataNodeTaskResponse, metric, err, nil)
}()
tr, err = parseRequestToGetTaskResponse(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("%v", http.StatusOK)))
m.cluster.handleDataNodeTaskResponse(tr.OperatorAddr, tr)
}
func (m *Server) addMetaNode(w http.ResponseWriter, r *http.Request) {
var (
nodeAddr string
zoneName string
id uint64
err error
nodesetId uint64
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AddMetaNode))
defer func() {
doStatAndMetric(proto.AddMetaNode, metric, err, nil)
}()
if nodeAddr, zoneName, err = parseRequestForAddNode(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if !checkIpPort(nodeAddr) {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: fmt.Errorf("addr not legal").Error()})
return
}
var value string
if value = r.FormValue(idKey); value == "" {
nodesetId = 0
} else {
if nodesetId, err = strconv.ParseUint(value, 10, 64); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
}
if id, err = m.cluster.addMetaNode(nodeAddr, zoneName, nodesetId); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(id))
}
func (m *Server) checkInvalidIDNodes(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetInvalidNodes))
defer func() {
doStatAndMetric(proto.AdminGetInvalidNodes, metric, nil, nil)
}()
nodes := m.cluster.getInvalidIDNodes()
sendOkReply(w, r, newSuccessHTTPReply(nodes))
}
func (m *Server) updateDataNode(w http.ResponseWriter, r *http.Request) {
var (
nodeAddr string
id uint64
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminUpdateDataNode))
defer func() {
doStatAndMetric(proto.AdminUpdateDataNode, metric, err, nil)
}()
if nodeAddr, id, err = parseRequestForUpdateMetaNode(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.cluster.updateDataNodeBaseInfo(nodeAddr, id); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(id))
}
func (m *Server) updateMetaNode(w http.ResponseWriter, r *http.Request) {
var (
nodeAddr string
id uint64
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminUpdateMetaNode))
defer func() {
doStatAndMetric(proto.AdminUpdateMetaNode, metric, err, nil)
}()
if nodeAddr, id, err = parseRequestForUpdateMetaNode(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.cluster.updateMetaNodeBaseInfo(nodeAddr, id); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(id))
}
func (m *Server) getMetaNode(w http.ResponseWriter, r *http.Request) {
var (
nodeAddr string
metaNode *MetaNode
metaNodeInfo *proto.MetaNodeInfo
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.GetMetaNode))
defer func() {
doStatAndMetric(proto.GetMetaNode, metric, err, nil)
}()
if nodeAddr, err = parseAndExtractNodeAddr(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if metaNode, err = m.cluster.metaNode(nodeAddr); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrMetaNodeNotExists))
return
}
metaNode.PersistenceMetaPartitions = m.cluster.getAllMetaPartitionIDByMetaNode(nodeAddr)
metaNodeInfo = &proto.MetaNodeInfo{
ID: metaNode.ID,
Addr: metaNode.Addr,
DomainAddr: metaNode.DomainAddr,
IsActive: metaNode.IsActive,
IsWriteAble: metaNode.isWritable(),
ZoneName: metaNode.ZoneName,
MaxMemAvailWeight: metaNode.MaxMemAvailWeight,
Total: metaNode.Total,
Used: metaNode.Used,
Ratio: metaNode.Ratio,
SelectCount: metaNode.SelectCount,
Threshold: metaNode.Threshold,
ReportTime: metaNode.ReportTime,
MetaPartitionCount: metaNode.MetaPartitionCount,
NodeSetID: metaNode.NodeSetID,
PersistenceMetaPartitions: metaNode.PersistenceMetaPartitions,
CpuUtil: metaNode.CpuUtil.Load(),
}
sendOkReply(w, r, newSuccessHTTPReply(metaNodeInfo))
}
func (m *Server) decommissionMetaPartition(w http.ResponseWriter, r *http.Request) {
var (
partitionID uint64
nodeAddr string
mp *MetaPartition
msg string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminDecommissionMetaPartition))
defer func() {
doStatAndMetric(proto.AdminDecommissionMetaPartition, metric, err, nil)
}()
if partitionID, nodeAddr, err = parseRequestToDecommissionMetaPartition(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if mp, err = m.cluster.getMetaPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrMetaPartitionNotExists))
return
}
if err = m.cluster.decommissionMetaPartition(nodeAddr, mp); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
msg = fmt.Sprintf(proto.AdminDecommissionMetaPartition+" partitionID :%v decommissionMetaPartition successfully", partitionID)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
func parseMigrateNodeParam(r *http.Request) (srcAddr, targetAddr string, limit int, err error) {
if err = r.ParseForm(); err != nil {
return
}
srcAddr = r.FormValue(srcAddrKey)
if srcAddr == "" {
err = fmt.Errorf("parseMigrateNodeParam %s can't be empty", srcAddrKey)
return
}
if ipAddr, ok := util.ParseAddrToIpAddr(srcAddr); ok {
srcAddr = ipAddr
}
targetAddr = r.FormValue(targetAddrKey)
if targetAddr == "" {
err = fmt.Errorf("parseMigrateNodeParam %s can't be empty when migrate", targetAddrKey)
return
}
if ipAddr, ok := util.ParseAddrToIpAddr(targetAddr); ok {
targetAddr = ipAddr
}
if srcAddr == targetAddr {
err = fmt.Errorf("parseMigrateNodeParam srcAddr %s can't be equal to targetAddr %s", srcAddr, targetAddr)
return
}
limit, err = parseUintParam(r, countKey)
if err != nil {
return
}
return
}
func parseUintParam(r *http.Request, key string) (num int, err error) {
val := r.FormValue(key)
if val == "" {
num = 0
return
}
numVal, err := strconv.ParseInt(val, 10, 32)
if err != nil {
err = fmt.Errorf("parseUintParam %s-%s is not legal, err %s", key, val, err.Error())
return
}
num = int(numVal)
return
}
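// Hedged examples of parseUintParam behaviour (derived from the code above rather than
// from separate documentation): an absent form value is treated as 0 instead of an
// error, while a non-numeric or overflowing value is rejected:
//
//	limit, _ := parseUintParam(r, countKey) // 0 when the form value is empty
//	_, err := parseUintParam(r, countKey)   // err != nil for values such as "abc" or numbers overflowing int32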
func (m *Server) loadMetaPartition(w http.ResponseWriter, r *http.Request) {
var (
msg string
mp *MetaPartition
partitionID uint64
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminLoadMetaPartition))
defer func() {
doStatAndMetric(proto.AdminLoadMetaPartition, metric, err, nil)
}()
if partitionID, err = parseRequestToLoadMetaPartition(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if mp, err = m.cluster.getMetaPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrMetaPartitionNotExists))
return
}
m.cluster.loadMetaPartitionAndCheckResponse(mp)
msg = fmt.Sprintf(proto.AdminLoadMetaPartition+" partitionID :%v Load successfully", partitionID)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
func (m *Server) migrateMetaNodeHandler(w http.ResponseWriter, r *http.Request) {
var (
srcAddr string
targetAddr string
limit int
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.MigrateMetaNode))
defer func() {
doStatAndMetric(proto.MigrateMetaNode, metric, err, nil)
}()
srcAddr, targetAddr, limit, err = parseMigrateNodeParam(r)
if err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if limit > defaultMigrateMpCnt {
err = fmt.Errorf("limit %d can't be bigger than %d", limit, defaultMigrateMpCnt)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
srcNode, err := m.cluster.metaNode(srcAddr)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeMetaNodeNotExists, Msg: err.Error()})
return
}
targetNode, err := m.cluster.metaNode(targetAddr)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeMetaNodeNotExists, Msg: err.Error()})
return
}
if srcNode.NodeSetID != targetNode.NodeSetID {
err = fmt.Errorf("src %s and target %s must exist in the same nodeSet when migrate", srcAddr, targetAddr)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if !targetNode.isWritable() {
err = fmt.Errorf("[%s] is not writable, can't used as target addr for migrate", targetAddr)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if err = m.cluster.migrateMetaNode(srcAddr, targetAddr, limit); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
rstMsg := fmt.Sprintf("migrateMetaNodeHandler from src [%v] to targaet[%s] has migrate successfully", srcAddr, targetAddr)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
func (m *Server) decommissionMetaNode(w http.ResponseWriter, r *http.Request) {
var (
rstMsg string
offLineAddr string
limit int
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.DecommissionMetaNode))
defer func() {
doStatAndMetric(proto.DecommissionMetaNode, metric, err, nil)
}()
if offLineAddr, limit, err = parseDecomNodeReq(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if _, err = m.cluster.metaNode(offLineAddr); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrMetaNodeNotExists))
return
}
if err = m.cluster.migrateMetaNode(offLineAddr, "", limit); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
rstMsg = fmt.Sprintf("decommissionMetaNode metaNode [%v] limit %d has offline successfully", offLineAddr, limit)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
func (m *Server) handleMetaNodeTaskResponse(w http.ResponseWriter, r *http.Request) {
var (
tr *proto.AdminTask
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.GetMetaNodeTaskResponse))
defer func() {
doStatAndMetric(proto.GetMetaNodeTaskResponse, metric, err, nil)
}()
tr, err = parseRequestToGetTaskResponse(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("%v", http.StatusOK)))
m.cluster.handleMetaNodeTaskResponse(tr.OperatorAddr, tr)
}
// Dynamically add a raft node (replica) for the master.
// By using this function, there is no need to stop all the master services. Adding a new raft node is performed online.
func (m *Server) addRaftNode(w http.ResponseWriter, r *http.Request) {
var (
id uint64
addr string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AddRaftNode))
defer func() {
doStatAndMetric(proto.AddRaftNode, metric, err, nil)
}()
var msg string
id, addr, err = parseRequestForRaftNode(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.cluster.addRaftNode(id, addr); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
msg = fmt.Sprintf("add raft node id :%v, addr:%v successfully \n", id, addr)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
// Dynamically remove a master node. Similar to addRaftNode, this operation is performed online.
func (m *Server) removeRaftNode(w http.ResponseWriter, r *http.Request) {
var (
id uint64
addr string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.RemoveRaftNode))
defer func() {
doStatAndMetric(proto.RemoveRaftNode, metric, err, nil)
}()
var msg string
id, addr, err = parseRequestForRaftNode(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
err = m.cluster.removeRaftNode(id, addr)
if err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
msg = fmt.Sprintf("remove raft node id :%v,adr:%v successfully\n", id, addr)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
// get master's raft status
func (m *Server) getRaftStatus(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.RaftStatus))
defer func() {
doStatAndMetric(proto.RaftStatus, metric, nil, nil)
}()
data := m.raftStore.RaftStatus(GroupID)
log.LogInfof("get raft status, %s", data.String())
sendOkReply(w, r, newSuccessHTTPReply(data))
}
func parseReqToDecoDisk(r *http.Request) (nodeAddr, diskPath string, diskDisable bool, limit, decommissionType int, err error) {
if err = r.ParseForm(); err != nil {
return
}
nodeAddr, err = extractNodeAddr(r)
if err != nil {
return
}
diskPath, err = extractDiskPath(r)
if err != nil {
return
}
diskDisable, err = extractDiskDisable(r)
if err != nil {
return
}
decommissionType, err = parseUintParam(r, DecommissionType)
if err != nil {
return
}
limit, err = parseUintParam(r, countKey)
if err != nil {
return
}
return
}
func parseReqToRecoDisk(r *http.Request) (nodeAddr, diskPath string, err error) {
if err = r.ParseForm(); err != nil {
return
}
nodeAddr, err = extractNodeAddr(r)
if err != nil {
return
}
diskPath, err = extractDiskPath(r)
if err != nil {
return
}
return
}
type getVolParameter struct {
name string
authKey string
skipOwnerValidation bool
}
func pareseBoolWithDefault(r *http.Request, key string, old bool) (bool, error) {
val := r.FormValue(key)
if val == "" {
return old, nil
}
newVal, err := strconv.ParseBool(val)
if err != nil {
return false, fmt.Errorf("parse %s bool val err, err %s", key, err.Error())
}
return newVal, nil
}
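// Hedged sketch of the fallback semantics above: a missing key keeps the caller's
// previous value, and any present value must satisfy strconv.ParseBool:
//
//	v, _ := pareseBoolWithDefault(r, raftForceDelKey, false)   // false when the key is absent
//	v2, err := pareseBoolWithDefault(r, raftForceDelKey, true) // true when absent; err != nil for values like "maybe"
//	_, _, _ = v, v2, err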
func parseRaftForce(r *http.Request) (bool, error) {
return pareseBoolWithDefault(r, raftForceDelKey, false)
}
func extractPosixAcl(r *http.Request) (enablePosix bool, err error) {
var value string
if value = r.FormValue(enablePosixAclKey); value == "" {
return
}
status, err := strconv.ParseBool(value)
if err != nil {
return false, fmt.Errorf("parse %s failed, val %s", enablePosixAclKey, value)
}
return status, nil
}
func (m *Server) getMetaPartitions(w http.ResponseWriter, r *http.Request) {
var (
name string
vol *Vol
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.ClientMetaPartitions))
defer func() {
doStatAndMetric(proto.ClientMetaPartitions, metric, err, map[string]string{exporter.Vol: name})
}()
if name, err = parseAndExtractName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
mpsCache := vol.getMpsCache()
if len(mpsCache) == 0 {
vol.updateViewCache(m.cluster)
mpsCache = vol.getMpsCache()
}
send(w, r, mpsCache)
return
}
func (m *Server) putDataPartitions(w http.ResponseWriter, r *http.Request) {
var (
body []byte
name string
err error
)
defer func() {
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
}
}()
if name, err = parseAndExtractName(r); err != nil {
return
}
if err = r.ParseForm(); err != nil {
return
}
if body, err = io.ReadAll(r.Body); err != nil {
return
}
if !m.cluster.partition.IsRaftLeader() {
view := &proto.DataPartitionsView{}
if err = proto.UnmarshalHTTPReply(body, view); err != nil {
log.LogErrorf("putDataPartitions. umarshal reply.Data error volName %v", name)
return
}
m.cluster.followerReadManager.updateVolViewFromLeader(name, view)
sendOkReply(w, r, newSuccessHTTPReply("success"))
return
} else {
err = fmt.Errorf("raft leader cann't be grant dps info")
log.LogErrorf("putDataPartitions. err %v", err)
}
}
// Obtain all the data partitions in a volume.
func (m *Server) getDataPartitions(w http.ResponseWriter, r *http.Request) {
var (
body []byte
name string
compress bool
vol *Vol
err error
)
compress = r.Header.Get(proto.HeaderAcceptEncoding) == compressor.EncodingGzip
metric := exporter.NewTPCnt(apiToMetricsName(proto.ClientDataPartitions))
defer func() {
doStatAndMetric(proto.ClientDataPartitions, metric, err, map[string]string{exporter.Vol: name})
}()
if name, err = parseAndExtractName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
log.LogInfof("action[getDataPartitions] current is leader[%v], compress[%v]",
m.cluster.partition.IsRaftLeader(), compress)
if !m.cluster.partition.IsRaftLeader() {
var ok bool
if body, ok = m.cluster.followerReadManager.getVolViewAsFollower(name, compress); !ok {
log.LogErrorf("action[getDataPartitions] volume [%v] not get partitions info", name)
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("follower volume info not found")))
return
}
if compress {
w.Header().Add(proto.HeaderContentEncoding, compressor.EncodingGzip)
}
send(w, r, body)
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if compress {
body, err = vol.getDataPartitionViewCompress()
} else {
body, err = vol.getDataPartitionsView()
}
if err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if compress {
w.Header().Add(proto.HeaderContentEncoding, compressor.EncodingGzip)
}
send(w, r, body)
}
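// Hedged client-side sketch: the gzip branch above is driven purely by the request's
// Accept-Encoding header and mirrored in the response's Content-Encoding, so a caller
// that opts in must decompress the body itself (the URL, port, and volume name below
// are hypothetical):
//
//	req, _ := http.NewRequest(http.MethodGet, "http://master:17010/client/partitions?name=vol1", nil)
//	req.Header.Set("Accept-Encoding", "gzip")
//	resp, _ := http.DefaultClient.Do(req)
//	defer resp.Body.Close()
//	var body io.Reader = resp.Body
//	if resp.Header.Get("Content-Encoding") == "gzip" {
//		body, _ = gzip.NewReader(resp.Body)
//	}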
func (m *Server) getVol(w http.ResponseWriter, r *http.Request) {
var (
err error
vol *Vol
message string
jobj proto.APIAccessReq
ticket cryptoutil.Ticket
ts int64
param *getVolParameter
volName string
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.ClientVol))
defer func() {
doStatAndMetric(proto.ClientVol, metric, err, map[string]string{exporter.Vol: volName})
}()
if param, err = parseGetVolParameter(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
volName = param.name
if vol, err = m.cluster.getVol(param.name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if !param.skipOwnerValidation && !matchKey(vol.Owner, param.authKey) {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolAuthKeyNotMatch))
return
}
viewCache := vol.getViewCache()
if len(viewCache) == 0 {
vol.updateViewCache(m.cluster)
viewCache = vol.getViewCache()
}
if !param.skipOwnerValidation && vol.authenticate {
if jobj, ticket, ts, err = parseAndCheckTicket(r, m.cluster.MasterSecretKey, param.name); err != nil {
if err == proto.ErrExpiredTicket {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeInvalidTicket, Msg: err.Error()})
return
}
if message, err = genRespMessage(viewCache, &jobj, ts, ticket.SessionKey.Key); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeMasterAPIGenRespError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(message))
} else {
send(w, r, viewCache)
}
}
// Obtain the volume information such as total capacity and used space, etc.
func (m *Server) getVolStatInfo(w http.ResponseWriter, r *http.Request) {
var (
err error
name string
ver int
vol *Vol
byMeta bool
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.ClientVolStat))
defer func() {
doStatAndMetric(proto.ClientVolStat, metric, err, map[string]string{exporter.Vol: name})
}()
if name, ver, byMeta, err = parseVolStatReq(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if proto.IsCold(vol.VolType) && ver != proto.LFClient {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: "ec-vol is supported by LF client only"})
return
}
sendOkReply(w, r, newSuccessHTTPReply(volStat(vol, byMeta)))
}
func volStat(vol *Vol, countByMeta bool) (stat *proto.VolStatInfo) {
stat = new(proto.VolStatInfo)
stat.Name = vol.Name
stat.TotalSize = vol.Capacity * util.GB
stat.UsedSize = vol.totalUsedSpaceByMeta(countByMeta)
if stat.UsedSize > stat.TotalSize {
log.LogWarnf("vol(%v) useSize(%v) is larger than capacity(%v)", vol.Name, stat.UsedSize, stat.TotalSize)
}
stat.UsedRatio = strconv.FormatFloat(float64(stat.UsedSize)/float64(stat.TotalSize), 'f', 2, 32)
stat.DpReadOnlyWhenVolFull = vol.DpReadOnlyWhenVolFull
vol.mpsLock.RLock()
for _, mp := range vol.MetaPartitions {
stat.InodeCount += mp.InodeCount
stat.TxCnt += mp.TxCnt
stat.TxRbInoCnt += mp.TxRbInoCnt
stat.TxRbDenCnt += mp.TxRbDenCnt
}
vol.mpsLock.RUnlock()
log.LogDebugf("total[%v],usedSize[%v]", stat.TotalSize, stat.UsedSize)
if proto.IsHot(vol.VolType) {
return
}
stat.CacheTotalSize = vol.CacheCapacity * util.GB
stat.CacheUsedSize = vol.cfsUsedSpace()
stat.CacheUsedRatio = strconv.FormatFloat(float64(stat.CacheUsedSize)/float64(stat.CacheTotalSize), 'f', 2, 32)
log.LogDebugf("ebsTotal[%v],ebsUsedSize[%v]", stat.CacheTotalSize, stat.CacheUsedSize)
return
}
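// Worked example for the ratio formatting above (illustrative numbers only): with
// UsedSize = 3*util.GB and TotalSize = 10*util.GB,
//
//	strconv.FormatFloat(float64(3*util.GB)/float64(10*util.GB), 'f', 2, 32) // "0.30"
//
// i.e. UsedRatio is a two-decimal string ratio, not a percentage.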
func getMetaPartitionView(mp *MetaPartition) (mpView *proto.MetaPartitionView) {
mpView = proto.NewMetaPartitionView(mp.PartitionID, mp.Start, mp.End, mp.Status)
mp.RLock()
defer mp.RUnlock()
for _, host := range mp.Hosts {
mpView.Members = append(mpView.Members, host)
}
mr, err := mp.getMetaReplicaLeader()
if err != nil {
return
}
mpView.LeaderAddr = mr.Addr
mpView.MaxInodeID = mp.MaxInodeID
mpView.InodeCount = mp.InodeCount
mpView.DentryCount = mp.DentryCount
mpView.FreeListLen = mp.FreeListLen
mpView.TxCnt = mp.TxCnt
mpView.TxRbInoCnt = mp.TxRbInoCnt
mpView.TxRbDenCnt = mp.TxRbDenCnt
mpView.IsRecover = mp.IsRecover
return
}
func (m *Server) getMetaPartition(w http.ResponseWriter, r *http.Request) {
var (
err error
partitionID uint64
mp *MetaPartition
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.ClientMetaPartition))
defer func() {
doStatAndMetric(proto.ClientMetaPartition, metric, err, nil)
}()
if partitionID, err = parseAndExtractPartitionInfo(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if mp, err = m.cluster.getMetaPartitionByID(partitionID); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrMetaPartitionNotExists))
return
}
toInfo := func(mp *MetaPartition) *proto.MetaPartitionInfo {
mp.RLock()
defer mp.RUnlock()
replicas := make([]*proto.MetaReplicaInfo, len(mp.Replicas))
zones := make([]string, len(mp.Hosts))
nodeSets := make([]uint64, len(mp.Hosts))
for idx, host := range mp.Hosts {
metaNode, err := m.cluster.metaNode(host)
if err == nil {
zones[idx] = metaNode.ZoneName
nodeSets[idx] = metaNode.NodeSetID
}
}
for i := 0; i < len(replicas); i++ {
replicas[i] = &proto.MetaReplicaInfo{
Addr: mp.Replicas[i].Addr,
DomainAddr: mp.Replicas[i].metaNode.DomainAddr,
MaxInodeID: mp.Replicas[i].MaxInodeID,
ReportTime: mp.Replicas[i].ReportTime,
Status: mp.Replicas[i].Status,
IsLeader: mp.Replicas[i].IsLeader,
InodeCount: mp.Replicas[i].InodeCount,
DentryCount: mp.Replicas[i].DentryCount,
MaxInode: mp.Replicas[i].MaxInodeID,
}
}
forbidden := true
vol, err := m.cluster.getVol(mp.volName)
if err == nil {
forbidden = vol.Forbidden
} else {
log.LogErrorf("action[getMetaPartition]failed to get volume %v, err %v", mp.volName, err)
}
mpInfo := &proto.MetaPartitionInfo{
PartitionID: mp.PartitionID,
Start: mp.Start,
End: mp.End,
VolName: mp.volName,
MaxInodeID: mp.MaxInodeID,
InodeCount: mp.InodeCount,
DentryCount: mp.DentryCount,
Replicas: replicas,
ReplicaNum: mp.ReplicaNum,
Status: mp.Status,
IsRecover: mp.IsRecover,
Hosts: mp.Hosts,
Peers: mp.Peers,
Zones: zones,
NodeSets: nodeSets,
MissNodes: mp.MissNodes,
OfflinePeerID: mp.OfflinePeerID,
LoadResponse: mp.LoadResponse,
Forbidden: forbidden,
}
return mpInfo
}
sendOkReply(w, r, newSuccessHTTPReply(toInfo(mp)))
}
func (m *Server) listVols(w http.ResponseWriter, r *http.Request) {
var (
err error
keywords string
vol *Vol
volsInfo []*proto.VolInfo
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminListVols))
defer func() {
doStatAndMetric(proto.AdminListVols, metric, err, nil)
}()
if keywords, err = parseKeywords(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
volsInfo = make([]*proto.VolInfo, 0)
for _, name := range m.cluster.allVolNames() {
if strings.Contains(name, keywords) {
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
stat := volStat(vol, false)
volInfo := proto.NewVolInfo(vol.Name, vol.Owner, vol.createTime, vol.status(), stat.TotalSize,
stat.UsedSize, stat.DpReadOnlyWhenVolFull)
volsInfo = append(volsInfo, volInfo)
}
}
sendOkReply(w, r, newSuccessHTTPReply(volsInfo))
}
func (m *Server) changeMasterLeader(w http.ResponseWriter, r *http.Request) {
var err error
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminChangeMasterLeader))
defer func() {
doStatAndMetric(proto.AdminChangeMasterLeader, metric, err, nil)
}()
if err = m.cluster.tryToChangeLeaderByHost(); err != nil {
log.LogErrorf("changeMasterLeader.err %v", err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
rstMsg := fmt.Sprintf(" changeMasterLeader. command success send to dest host but need check. ")
_ = sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
func (m *Server) OpFollowerPartitionsRead(w http.ResponseWriter, r *http.Request) {
var (
err error
enableFollower bool
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminOpFollowerPartitionsRead))
defer func() {
doStatAndMetric(proto.AdminOpFollowerPartitionsRead, metric, err, nil)
}()
log.LogDebugf("OpFollowerPartitionsRead.")
if enableFollower, err = extractStatus(r); err != nil {
log.LogErrorf("OpFollowerPartitionsRead.err %v", err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
m.cluster.followerReadManager.needCheck = enableFollower
rstMsg := fmt.Sprintf(" OpFollowerPartitionsRead. set needCheck %v command success. ", enableFollower)
_ = sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
func (m *Server) CreateVersion(w http.ResponseWriter, r *http.Request) {
var (
err error
vol *Vol
name string
ver *proto.VolVersionInfo
value string
force bool
)
log.LogInfof("action[CreateVersion]")
if err = r.ParseForm(); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrParamError))
return
}
if name, err = extractName(r); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrParamError))
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if value = r.FormValue(forceKey); value != "" {
force, _ = strconv.ParseBool(value)
}
if ver, err = vol.VersionMgr.createVer2PhaseTask(m.cluster, uint64(time.Now().UnixMicro()), proto.CreateVersion, force); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVersionOpError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(ver))
}
func (m *Server) DelVersion(w http.ResponseWriter, r *http.Request) {
var (
err error
vol *Vol
name string
verSeq uint64
value string
force bool
)
if err = r.ParseForm(); err != nil {
return
}
if name, err = extractName(r); err != nil {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("volName %v not exist", name)))
return
}
if value = r.FormValue(verSeqKey); value == "" {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("verSeq not exist")))
return
}
	if verSeq, err = extractUint64(r, verSeqKey); err != nil {
		sendErrReply(w, r, newErrHTTPReply(err))
		return
	}
	log.LogDebugf("action[DelVersion] vol %v verSeq %v", name, verSeq)
if value = r.FormValue(forceKey); value != "" {
force, _ = strconv.ParseBool(value)
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if _, err = vol.VersionMgr.createVer2PhaseTask(m.cluster, verSeq, proto.DeleteVersion, force); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVersionOpError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply("success!"))
}
func (m *Server) GetVersionInfo(w http.ResponseWriter, r *http.Request) {
var (
err error
vol *Vol
name string
verSeq uint64
verInfo *proto.VolVersionInfo
)
if err = r.ParseForm(); err != nil {
return
}
if name, err = extractName(r); err != nil {
return
}
if verSeq, err = extractUint64(r, verSeqKey); err != nil {
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if verInfo, err = vol.VersionMgr.getVersionInfo(verSeq); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVersionOpError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(verInfo))
}
func (m *Server) GetAllVersionInfo(w http.ResponseWriter, r *http.Request) {
var (
err error
vol *Vol
name string
verList *proto.VolVersionInfoList
)
if err = r.ParseForm(); err != nil {
return
}
if name, err = extractName(r); err != nil {
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
//if !proto.IsHot(vol.VolType) {
// sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVersionOpError, Msg: "vol need be hot one"})
// return
//}
verList = vol.VersionMgr.getVersionList()
sendOkReply(w, r, newSuccessHTTPReply(verList))
}
func (m *Server) SetVerStrategy(w http.ResponseWriter, r *http.Request) {
var (
err error
name string
strategy proto.VolumeVerStrategy
isForce bool
)
if name, err = parseVolName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if strategy, isForce, err = parseVolVerStrategy(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.cluster.SetVerStrategy(name, strategy, isForce); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVolNotExists, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply("success"))
}
func (m *Server) getVolVer(w http.ResponseWriter, r *http.Request) {
var (
err error
name string
info *proto.VolumeVerInfo
)
if name, err = parseVolName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if info, err = m.cluster.getVolVer(name); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVolNotExists, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(info))
}
func genRespMessage(data []byte, req *proto.APIAccessReq, ts int64, key []byte) (message string, err error) {
var (
jresp []byte
resp proto.MasterAPIAccessResp
)
resp.Data = data
resp.APIResp.Type = req.Type + 1
resp.APIResp.ClientID = req.ClientID
resp.APIResp.ServiceID = req.ServiceID
	resp.APIResp.Verifier = ts + 1 // increase ts by one so the client can verify the server
if jresp, err = json.Marshal(resp); err != nil {
err = fmt.Errorf("json marshal for response failed %s", err.Error())
return
}
if message, err = cryptoutil.EncodeMessage(jresp, key); err != nil {
err = fmt.Errorf("encdoe message for response failed %s", err.Error())
return
}
return
}
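// Hedged sketch of the verifier convention above (the client side is an assumption
// inferred from the "+1" comment, it is not shown in this file): after decoding the
// reply, a client would accept it only if the server echoed its timestamp plus one:
//
//	if resp.APIResp.Verifier != ts+1 {
//		// reject: the reply was not produced for this request
//	}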
func (m *Server) associateVolWithUser(userID, volName string) error {
var err error
var userInfo *proto.UserInfo
if userInfo, err = m.user.getUserInfo(userID); err != nil && err != proto.ErrUserNotExists {
return err
}
if err == proto.ErrUserNotExists {
param := proto.UserCreateParam{
ID: userID,
Password: DefaultUserPassword,
Type: proto.UserTypeNormal,
}
		if userInfo, err = m.user.createKey(&param); err != nil {
return err
}
}
if _, err = m.user.addOwnVol(userInfo.UserID, volName); err != nil {
return err
}
return nil
}
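// Minimal usage sketch (the user and volume IDs are hypothetical): associateVolWithUser
// is create-if-missing, so a brand-new owner gets an account with DefaultUserPassword
// before the volume is attached to it:
//
//	if err := m.associateVolWithUser("alice", "vol-demo"); err != nil {
//		log.LogErrorf("associate vol with user failed: %v", err)
//	}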
func (m *Server) updateDecommissionLimit(w http.ResponseWriter, r *http.Request) {
var (
limit uint64
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminUpdateDecommissionLimit))
defer func() {
doStatAndMetric(proto.AdminUpdateDecommissionLimit, metric, err, nil)
}()
if limit, err = parseRequestToUpdateDecommissionLimit(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
zones := m.cluster.t.getAllZones()
for _, zone := range zones {
err = zone.updateDecommissionLimit(int32(limit), m.cluster)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeInternalError, Msg: err.Error()})
return
}
}
m.cluster.DecommissionLimit = limit
if err = m.cluster.syncPutCluster(); err != nil {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("set master not worked %v", err)))
return
}
rstMsg := fmt.Sprintf("set decommission limit to %v successfully", limit)
log.LogDebugf("action[updateDecommissionLimit] %v", rstMsg)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
func (m *Server) updateDecommissionDiskFactor(w http.ResponseWriter, r *http.Request) {
var (
factor float64
err error
)
metric := exporter.NewTPCnt("req_updateDecommissionDiskFactor")
defer func() {
metric.Set(err)
}()
if factor, err = parseRequestToUpdateDecommissionDiskFactor(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
zones := m.cluster.t.getAllZones()
for _, zone := range zones {
err = zone.updateDecommissionDiskFactor(factor, m.cluster)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeInternalError, Msg: err.Error()})
return
}
}
m.cluster.DecommissionDiskFactor = factor
if err = m.cluster.syncPutCluster(); err != nil {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("set master not worked %v", err)))
return
}
rstMsg := fmt.Sprintf("set decommission factor to %v successfully", factor)
log.LogDebugf("action[updateDecommissionDiskFactor] %v", rstMsg)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
func (m *Server) queryDecommissionToken(w http.ResponseWriter, r *http.Request) {
var err error
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminQueryDecommissionToken))
defer func() {
doStatAndMetric(proto.AdminQueryDecommissionToken, metric, err, nil)
}()
var stats []nodeSetDecommissionParallelStatus
zones := m.cluster.t.getAllZones()
for _, zone := range zones {
err, zoneStats := zone.queryDecommissionParallelStatus()
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeInternalError, Msg: err.Error()})
return
}
stats = append(stats, zoneStats...)
}
log.LogDebugf("action[queryDecommissionToken] %v", stats)
sendOkReply(w, r, newSuccessHTTPReply(stats))
}
func (m *Server) queryDecommissionLimit(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminQueryDecommissionLimit))
defer func() {
doStatAndMetric(proto.AdminQueryDecommissionLimit, metric, nil, nil)
}()
limit := m.cluster.DecommissionLimit
rstMsg := fmt.Sprintf("decommission limit is %v", limit)
log.LogDebugf("action[queryDecommissionLimit] %v", rstMsg)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
func (m *Server) queryDecommissionDiskLimit(w http.ResponseWriter, r *http.Request) {
var resp proto.DecommissionDiskLimit
metric := exporter.NewTPCnt("req_queryDecommissionDiskLimit")
defer func() {
metric.Set(nil)
}()
zones := m.cluster.t.getAllZones()
for _, zone := range zones {
err, diskLimit := zone.queryDecommissionDiskLimit()
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeInternalError, Msg: err.Error()})
return
}
resp.Details = append(resp.Details, diskLimit...)
}
log.LogDebugf("action[queryDecommissionDiskLimit] %v", resp)
sendOkReply(w, r, newSuccessHTTPReply(resp))
}
func (m *Server) queryDataNodeDecoProgress(w http.ResponseWriter, r *http.Request) {
var (
offLineAddr string
err error
dn *DataNode
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QueryDataNodeDecoProgress))
defer func() {
doStatAndMetric(proto.QueryDataNodeDecoProgress, metric, err, nil)
}()
if offLineAddr, err = parseReqToDecoDataNodeProgress(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if dn, err = m.cluster.dataNode(offLineAddr); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataNodeNotExists))
return
}
status, progress := dn.updateDecommissionStatus(m.cluster, true)
progress, _ = FormatFloatFloor(progress, 4)
resp := &proto.DecommissionProgress{
Status: status,
Progress: fmt.Sprintf("%.2f%%", progress*float64(100)),
StatusMessage: GetDecommissionStatusMessage(status),
}
if status == DecommissionFail {
err, dps := dn.GetDecommissionFailedDPByTerm(m.cluster)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
resp.FailedDps = dps
}
sendOkReply(w, r, newSuccessHTTPReply(resp))
}
func (m *Server) queryDataNodeDecoFailedDps(w http.ResponseWriter, r *http.Request) {
var (
offLineAddr string
err error
dn *DataNode
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QueryDataNodeDecoFailedDps))
defer func() {
doStatAndMetric(proto.QueryDataNodeDecoFailedDps, metric, err, nil)
}()
if offLineAddr, err = parseReqToDecoDataNodeProgress(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if dn, err = m.cluster.dataNode(offLineAddr); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataNodeNotExists))
return
}
err, dps := dn.GetDecommissionFailedDP(m.cluster)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(dps))
}
func (m *Server) enableAutoDecommissionDisk(w http.ResponseWriter, r *http.Request) {
var (
enable bool
err error
)
metric := exporter.NewTPCnt("req_enableAutoDecommissionDisk")
defer func() {
metric.Set(err)
}()
if enable, err = parseAndExtractStatus(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
m.cluster.SetAutoDecommissionDisk(enable)
if err = m.cluster.syncPutCluster(); err != nil {
sendErrReply(w, r, newErrHTTPReply(fmt.Errorf("sync cluster config failed: %v", err)))
return
}
rstMsg := fmt.Sprintf("set auto decommission disk to %v successfully", enable)
log.LogDebugf("action[enableAutoDecommissionDisk] %v", rstMsg)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
func (m *Server) queryAutoDecommissionDisk(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt("req_queryAutoDecommissionDisk")
defer func() {
metric.Set(nil)
}()
enable := m.cluster.AutoDecommissionDiskIsEnabled()
rstMsg := fmt.Sprintf("auto decommission disk is %v", enable)
log.LogDebugf("action[queryAutoDecommissionDisk] %v", rstMsg)
sendOkReply(w, r, newSuccessHTTPReply(enable))
}
func (m *Server) queryDisableDisk(w http.ResponseWriter, r *http.Request) {
var (
node *DataNode
rstMsg string
nodeAddr string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.RecommissionDisk))
defer func() {
doStatAndMetric(proto.RecommissionDisk, metric, err, nil)
}()
if nodeAddr, err = parseAndExtractNodeAddr(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if node, err = m.cluster.dataNode(nodeAddr); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrDataNodeNotExists))
return
}
disks := node.getDecommissionedDisks()
rstMsg = fmt.Sprintf("datanode[%v] disable disk[%v]",
nodeAddr, disks)
Warn(m.clusterName, rstMsg)
sendOkReply(w, r, newSuccessHTTPReply(rstMsg))
}
func parseReqToDecoDataNodeProgress(r *http.Request) (nodeAddr string, err error) {
if err = r.ParseForm(); err != nil {
return
}
nodeAddr, err = extractNodeAddr(r)
if err != nil {
return
}
return
}
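// FormatFloatFloor floors num to the given number of decimal places,
// e.g. FormatFloatFloor(0.98765, 4) returns 0.9876.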
func FormatFloatFloor(num float64, decimal int) (float64, error) {
d := float64(1)
if decimal > 0 {
d = math.Pow10(decimal)
}
res := strconv.FormatFloat(math.Floor(num*d)/d, 'f', -1, 64)
return strconv.ParseFloat(res, 64)
}
func (m *Server) setCheckDataReplicasEnable(w http.ResponseWriter, r *http.Request) {
var (
err error
enable bool
)
if enable, err = parseAndExtractStatus(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
oldValue := m.cluster.checkDataReplicasEnable
if oldValue != enable {
m.cluster.checkDataReplicasEnable = enable
if err = m.cluster.syncPutCluster(); err != nil {
m.cluster.checkDataReplicasEnable = oldValue
log.LogErrorf("action[setCheckDataReplicasEnable] syncPutCluster failed %v", err)
sendErrReply(w, r, newErrHTTPReply(proto.ErrPersistenceByRaft))
return
}
}
log.LogInfof("action[setCheckDataReplicasEnable] enable is set to [%v]", enable)
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf(
"set checkDataReplicasEnable to [%v] successfully", enable)))
}
func (m *Server) setFileStats(w http.ResponseWriter, r *http.Request) {
var (
err error
enable bool
)
if enable, err = parseAndExtractStatus(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
oldValue := m.cluster.fileStatsEnable
m.cluster.fileStatsEnable = enable
if err = m.cluster.syncPutCluster(); err != nil {
m.cluster.fileStatsEnable = oldValue
log.LogErrorf("action[setFileStats] syncPutCluster failed %v", err)
sendErrReply(w, r, newErrHTTPReply(proto.ErrPersistenceByRaft))
return
}
log.LogInfof("action[setFileStats] enable is set to [%v]", enable)
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf(
"set fileStatsEnable to [%v] successfully", enable)))
}
func (m *Server) getFileStats(w http.ResponseWriter, r *http.Request) {
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf(
"getFileStats enable value [%v]", m.cluster.fileStatsEnable)))
}
func (m *Server) GetClusterValue(w http.ResponseWriter, r *http.Request) {
result, err := m.cluster.fsm.store.SeekForPrefix([]byte(clusterPrefix))
if err != nil {
log.LogErrorf("action[GetClusterValue],err:%v", err.Error())
sendErrReply(w, r, newErrHTTPReply(proto.ErrInternalError))
return
}
for _, value := range result {
cv := &clusterValue{}
if err = json.Unmarshal(value, cv); err != nil {
log.LogErrorf("action[GetClusterValue], unmarshal err:%v", err.Error())
sendErrReply(w, r, newErrHTTPReply(proto.ErrUnmarshalData))
return
}
sendOkReply(w, r, newSuccessHTTPReply(cv))
}
}
func (m *Server) setClusterUuidEnable(w http.ResponseWriter, r *http.Request) {
var (
err error
enable bool
)
if m.cluster.clusterUuid == "" {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeInternalError, Msg: "no ClusterUuid, generate it first"})
return
}
if enable, err = parseAndExtractStatus(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
oldValue := m.cluster.clusterUuidEnable
m.cluster.clusterUuidEnable = enable
if err = m.cluster.syncPutCluster(); err != nil {
m.cluster.clusterUuidEnable = oldValue
log.LogErrorf("action[setClusterUuidEnable] syncPutCluster failed %v", err)
sendErrReply(w, r, newErrHTTPReply(proto.ErrPersistenceByRaft))
return
}
log.LogInfof("action[setClusterUuidEnable] enable is set to [%v]", enable)
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf(
"set clusterUuidEnable to [%v] successfully", enable)))
}
func (m *Server) generateClusterUuid(w http.ResponseWriter, r *http.Request) {
if m.cluster.clusterUuid != "" {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeInternalError, Msg: "The cluster already has a ClusterUuid"})
return
}
if err := m.cluster.generateClusterUuid(); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrInternalError))
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf(
"generate ClusterUUID [%v] successfully", m.cluster.clusterUuid)))
}
func (m *Server) getClusterUuid(w http.ResponseWriter, r *http.Request) {
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf(
"ClusterUUID [%v], enable value [%v]", m.cluster.clusterUuid, m.cluster.clusterUuidEnable)))
}
func (m *Server) setConfigHandler(w http.ResponseWriter, r *http.Request) {
var err error
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminSetConfig))
defer func() {
doStatAndMetric(proto.AdminSetConfig, metric, err, nil)
}()
key, value, err := parseSetConfigParam(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
log.LogInfof("[setConfigHandler] set config key[%v], value[%v]", key, value)
err = m.setConfig(key, value)
if err != nil {
log.LogErrorf("[setConfigHandler] set config key[%v], value[%v], err (%s)", key, value, err.Error())
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("set config key[%v], value[%v] success", key, value)))
return
}
func (m *Server) getConfigHandler(w http.ResponseWriter, r *http.Request) {
var err error
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetConfig))
defer func() {
doStatAndMetric(proto.AdminGetConfig, metric, err, nil)
}()
key, err := parseGetConfigParam(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
log.LogInfof("[getConfigHandler] get config key[%v]", key)
value, err := m.getConfig(key)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(value))
}
func (m *Server) setConfig(key string, value string) (err error) {
var metaPartitionInodeIdStep uint64
if key == cfgmetaPartitionInodeIdStep {
if metaPartitionInodeIdStep, err = strconv.ParseUint(value, 10, 64); err != nil {
return err
}
oldValue := m.config.MetaPartitionInodeIdStep
m.config.MetaPartitionInodeIdStep = metaPartitionInodeIdStep
if err = m.cluster.syncPutCluster(); err != nil {
m.config.MetaPartitionInodeIdStep = oldValue
log.LogErrorf("setConfig syncPutCluster fail err %v", err)
return err
}
} else {
err = keyNotFound("config")
}
return err
}
func (m *Server) getConfig(key string) (value string, err error) {
if key == cfgmetaPartitionInodeIdStep {
v := m.config.MetaPartitionInodeIdStep
value = strconv.FormatUint(v, 10)
} else {
err = keyNotFound("config")
}
return value, err
}
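// CreateQuota creates a directory quota for a volume: it parses the quota
// request, verifies the volume exists and has quota enabled, and then asks
// the volume's quotaManager to allocate a new quota id.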
func (m *Server) CreateQuota(w http.ResponseWriter, r *http.Request) {
req := &proto.SetMasterQuotaReuqest{}
var (
err error
vol *Vol
quotaId uint32
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QuotaCreate))
defer func() {
doStatAndMetric(proto.QuotaCreate, metric, err, map[string]string{exporter.Vol: req.VolName})
}()
if err = parserSetQuotaParam(r, req); err != nil {
log.LogErrorf("[CreateQuota] set quota fail err [%v]", err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(req.VolName); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if !vol.enableQuota {
err = errors.NewErrorf("quota is disabled for vol %v", vol.Name)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if quotaId, err = vol.quotaManager.createQuota(req); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(&quotaId))
}
func (m *Server) UpdateQuota(w http.ResponseWriter, r *http.Request) {
req := &proto.UpdateMasterQuotaReuqest{}
var (
err error
vol *Vol
)
if err = parserUpdateQuotaParam(r, req); err != nil {
log.LogErrorf("[UpdateQuota] update quota fail err [%v]", err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(req.VolName); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if !vol.enableQuota {
err = errors.NewErrorf("quota is disabled for vol %v", vol.Name)
sendErrReply(w, r, newErrHTTPReply(err))
return
}
if err = vol.quotaManager.updateQuota(req); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
msg := fmt.Sprintf("update quota successfully, req %v", req)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
func (m *Server) DeleteQuota(w http.ResponseWriter, r *http.Request) {
var (
err error
vol *Vol
quotaId uint32
name string
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QuotaDelete))
defer func() {
doStatAndMetric(proto.QuotaDelete, metric, err, map[string]string{exporter.Vol: name})
}()
if name, quotaId, err = parseDeleteQuotaParam(r); err != nil {
log.LogErrorf("[DeleteQuota] del quota fail err [%v]", err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if err = vol.quotaManager.deleteQuota(quotaId); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
msg := fmt.Sprintf("delete quota successfully, vol [%v] quotaId [%v]", name, quotaId)
sendOkReply(w, r, newSuccessHTTPReply(msg))
return
}
func (m *Server) ListQuota(w http.ResponseWriter, r *http.Request) {
var (
err error
vol *Vol
resp *proto.ListMasterQuotaResponse
name string
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QuotaList))
defer func() {
doStatAndMetric(proto.QuotaList, metric, err, map[string]string{exporter.Vol: name})
}()
if name, err = parseAndExtractName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
resp = vol.quotaManager.listQuota()
log.LogInfof("list quota vol [%v] resp [%v] success.", name, *resp)
sendOkReply(w, r, newSuccessHTTPReply(resp))
return
}
func (m *Server) ListQuotaAll(w http.ResponseWriter, r *http.Request) {
metric := exporter.NewTPCnt(apiToMetricsName(proto.QuotaListAll))
defer func() {
doStatAndMetric(proto.QuotaListAll, metric, nil, nil)
}()
volsInfo := m.cluster.listQuotaAll()
log.LogInfof("list all vol has quota [%v]", volsInfo)
sendOkReply(w, r, newSuccessHTTPReply(volsInfo))
return
}
func (m *Server) GetQuota(w http.ResponseWriter, r *http.Request) {
var (
err error
vol *Vol
name string
quotaId uint32
quotaInfo *proto.QuotaInfo
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.QuotaGet))
defer func() {
doStatAndMetric(proto.QuotaGet, metric, err, map[string]string{exporter.Vol: name})
}()
if name, quotaId, err = parseGetQuotaParam(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if vol, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
if quotaInfo, err = vol.quotaManager.getQuota(quotaId); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
log.LogInfof("get quota vol [%v] quotaInfo [%v] success.", name, *quotaInfo)
sendOkReply(w, r, newSuccessHTTPReply(quotaInfo))
return
}
// func (m *Server) BatchModifyQuotaFullPath(w http.ResponseWriter, r *http.Request) {
// var (
// name string
// body []byte
// changeFullPathMap map[uint32]string
// err error
// vol *Vol
// )
// metric := exporter.NewTPCnt(apiToMetricsName(proto.QuotaGet))
// defer func() {
// doStatAndMetric(proto.QuotaBatchModifyPath, metric, err, map[string]string{exporter.Vol: name})
// }()
// if name, err = parseAndExtractName(r); err != nil {
// sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
// return
// }
// if body, err = io.ReadAll(r.Body); err != nil {
// sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
// return
// }
// changeFullPathMap = make(map[uint32]string)
// if err = json.Unmarshal(body, &changeFullPathMap); err != nil {
// sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
// return
// }
// if vol, err = m.cluster.getVol(name); err != nil {
// sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
// return
// }
// vol.quotaManager.batchModifyQuotaFullPath(changeFullPathMap)
// log.LogInfof("BatchModifyQuotaFullPath vol [%v] changeFullPathMap [%v] success.", name, changeFullPathMap)
// msg := fmt.Sprintf("BatchModifyQuotaFullPath successfully, vol [%v]", name)
// sendOkReply(w, r, newSuccessHTTPReply(msg))
// }
func parseSetDpDiscardParam(r *http.Request) (dpId uint64, rdOnly bool, err error) {
if err = r.ParseForm(); err != nil {
return
}
if dpId, err = extractDataPartitionID(r); err != nil {
err = fmt.Errorf("parseSetDpDiscardParam get dpid error %v", err)
return
}
val := r.FormValue(dpDiscardKey)
if val == "" {
err = fmt.Errorf("parseSetDpDiscardParam %s is empty", dpDiscardKey)
return
}
if rdOnly, err = strconv.ParseBool(val); err != nil {
err = fmt.Errorf("parseSetDpDiscardParam %s is not bool value %s", dpDiscardKey, val)
return
}
return
}
func (m *Server) setDpDiscard(partitionID uint64, isDiscard bool) (err error) {
var dp *DataPartition
if dp, err = m.cluster.getDataPartitionByID(partitionID); err != nil {
return fmt.Errorf("[setDpDiscard] getDataPartitionByID err(%s)", err.Error())
}
dp.Lock()
defer dp.Unlock()
if dp.IsDiscard && !isDiscard {
log.LogWarnf("[setDpDiscard] unset dp discard flag may cause some junk data")
}
dp.IsDiscard = isDiscard
m.cluster.syncUpdateDataPartition(dp)
return
}
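// setDpDiscardHandler parses the data partition id and the discard flag from
// the request and persists the new discard state of that partition.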
func (m *Server) setDpDiscardHandler(w http.ResponseWriter, r *http.Request) {
var (
dpId uint64
discard bool
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminSetDpDiscard))
defer func() {
doStatAndMetric(proto.AdminSetDpDiscard, metric, err, nil)
}()
dpId, discard, err = parseSetDpDiscardParam(r)
if err != nil {
log.LogErrorf("[setDpDiscardHandler] parse discard param err(%v)", err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
err = m.setDpDiscard(dpId, discard)
if err != nil {
log.LogErrorf("[setDpDiscardHandler] set dp %v to discard %v, err (%s)", dpId, discard, err.Error())
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
msg := fmt.Sprintf("[setDpDiscardHandler] set dpid %v to discard(%v) success", dpId, discard)
log.LogInfo(msg)
sendOkReply(w, r, newSuccessHTTPReply(msg))
return
}
func (m *Server) getDiscardDpHandler(w http.ResponseWriter, r *http.Request) {
discardDpInfos := proto.DiscardDataPartitionInfos{}
metric := exporter.NewTPCnt(apiToMetricsName(proto.AdminGetDiscardDp))
defer func() {
doStatAndMetric(proto.AdminGetDiscardDp, metric, nil, nil)
}()
vols := m.cluster.copyVols()
for _, vol := range vols {
dps := vol.dataPartitions
for _, dp := range dps.partitions {
if dp.IsDiscard {
discardDpInfos.DiscardDps = append(discardDpInfos.DiscardDps, *dp.buildDpInfo(m.cluster))
}
}
}
msg := fmt.Sprintf("[getDiscardDpHandler] discard dp num:%v", len(discardDpInfos.DiscardDps))
log.LogInfo(msg)
sendOkReply(w, r, newSuccessHTTPReply(discardDpInfos))
return
}
func (m *Server) queryBadDisks(w http.ResponseWriter, r *http.Request) {
var (
err error
infos proto.BadDiskInfos
)
metric := exporter.NewTPCnt("req_queryBadDisks")
defer func() {
metric.Set(err)
}()
m.cluster.dataNodes.Range(func(addr, node interface{}) bool {
dataNode, ok := node.(*DataNode)
if !ok {
return true
}
for _, bds := range dataNode.BadDiskStats {
info := proto.BadDiskInfo{
Address: dataNode.Addr,
Path: bds.DiskPath,
TotalPartitionCnt: bds.TotalPartitionCnt,
DiskErrPartitionList: bds.DiskErrPartitionList,
}
infos.BadDisks = append(infos.BadDisks, info)
}
return true
})
sendOkReply(w, r, newSuccessHTTPReply(infos))
}
func (m *Server) addLcNode(w http.ResponseWriter, r *http.Request) {
var (
nodeAddr string
id uint64
err error
)
if nodeAddr, err = parseAndExtractNodeAddr(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if !checkIp(nodeAddr) {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: "addr not legal"})
return
}
if id, err = m.cluster.addLcNode(nodeAddr); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(id))
}
// handleLcNodeTaskResponse handles lcnode task responses, such as heartbeat and expiration scanning results.
func (m *Server) handleLcNodeTaskResponse(w http.ResponseWriter, r *http.Request) {
tr, err := parseRequestToGetTaskResponse(r)
if err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
sendOkReply(w, r, newSuccessHTTPReply(fmt.Sprintf("%v", http.StatusOK)))
m.cluster.handleLcNodeTaskResponse(tr.OperatorAddr, tr)
}
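// SetBucketLifecycle reads an LcConfiguration from the request body, verifies
// that the referenced volume exists, and stores the lifecycle configuration.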
func (m *Server) SetBucketLifecycle(w http.ResponseWriter, r *http.Request) {
var (
bytes []byte
err error
)
if bytes, err = io.ReadAll(r.Body); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
req := proto.LcConfiguration{}
if err = json.Unmarshal(bytes, &req); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if _, err = m.cluster.getVol(req.VolName); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVolNotExists, Msg: err.Error()})
return
}
_ = m.cluster.SetBucketLifecycle(&req)
sendOkReply(w, r, newSuccessHTTPReply("PutBucketLifecycleConfiguration successful"))
}
func (m *Server) GetBucketLifecycle(w http.ResponseWriter, r *http.Request) {
var (
err error
name string
lcConf *proto.LcConfiguration
)
if name, err = parseAndExtractName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if _, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
lcConf = m.cluster.GetBucketLifecycle(name)
if lcConf == nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrNoSuchLifecycleConfiguration))
return
}
sendOkReply(w, r, newSuccessHTTPReply(lcConf))
}
func (m *Server) DelBucketLifecycle(w http.ResponseWriter, r *http.Request) {
var (
err error
name string
)
if name, err = parseAndExtractName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if _, err = m.cluster.getVol(name); err != nil {
sendErrReply(w, r, newErrHTTPReply(proto.ErrVolNotExists))
return
}
m.cluster.DelBucketLifecycle(name)
msg := fmt.Sprintf("delete vol[%v] lifecycle successfully", name)
log.LogWarn(msg)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
func (m *Server) lcnodeInfo(w http.ResponseWriter, r *http.Request) {
if err := r.ParseForm(); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
switch r.FormValue("op") {
case "info":
var (
rsp *LcNodeInfoResponse
err error
)
if rsp, err = m.cluster.getAllLcNodeInfo(); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(rsp))
case "start":
if m.cluster.partition != nil && m.cluster.partition.IsRaftLeader() {
m.cluster.startLcScan()
sendOkReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeSuccess})
} else {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: "not leader"})
}
default:
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: "invalid op"})
}
}
func (m *Server) S3QosSet(w http.ResponseWriter, r *http.Request) {
var (
param = &proto.S3QosRequest{}
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.S3QoSSet))
defer func() {
doStatAndMetric(proto.S3QoSSet, metric, err, nil)
}()
if err = parseS3QosReq(r, param); err != nil {
log.LogErrorf("[S3QosSet] parse fail err [%v]", err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if !isS3QosConfigValid(param) {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: "s3 qos param err"})
return
}
// set s3 qos quota
if param.Quota != 0 {
if strings.ToLower(param.Uid) == proto.DefaultUid {
param.Uid = proto.DefaultUid
}
param.Api = strings.ToLower(param.Api)
metadata := new(RaftCmd)
metadata.Op = opSyncS3QosSet
key := param.Api + keySeparator + param.Uid + keySeparator + param.Type
metadata.K = S3QoSPrefix + key
metadata.V = []byte(strconv.FormatUint(param.Quota, 10))
// raft sync
if err = m.cluster.submit(metadata); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
// memory cache
m.cluster.S3ApiQosQuota.Store(metadata.K, param.Quota)
}
// set s3 node num
if param.Nodes != 0 {
metadata := new(RaftCmd)
metadata.Op = opSyncS3QosSet
key := proto.S3Nodes
metadata.K = S3QoSPrefix + key
metadata.V = []byte(strconv.FormatUint(param.Nodes, 10))
// raft sync
if err = m.cluster.submit(metadata); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
// memory cache
m.cluster.S3ApiQosQuota.Store(metadata.K, param.Nodes)
}
sendOkReply(w, r, newSuccessHTTPReply("success"))
}
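// S3QosGet rebuilds the S3 QoS view from the in-memory S3ApiQosQuota cache,
// grouping bandwidth, QPS and concurrency quotas per api and uid, and also
// returns the configured S3 node number.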
func (m *Server) S3QosGet(w http.ResponseWriter, r *http.Request) {
var err error
metric := exporter.NewTPCnt(apiToMetricsName(proto.S3QoSGet))
defer func() {
doStatAndMetric(proto.S3QoSGet, metric, err, nil)
}()
apiLimitConf := make(map[string]*proto.UserLimitConf, 0)
s3QosResponse := proto.S3QoSResponse{
ApiLimitConf: apiLimitConf,
}
// memory cache
m.cluster.S3ApiQosQuota.Range(func(key, value interface{}) bool {
k := key.(string)
v := value.(uint64)
api, uid, limitType, nodeNumKey, err := parseS3QoSKey(k)
if err != nil {
log.LogErrorf("[S3QosGet] parseS3QoSKey err [%v]", err)
return true
}
if nodeNumKey != "" {
s3QosResponse.Nodes = v
return true
}
if _, ok := apiLimitConf[api]; !ok {
bandWidthQuota := make(map[string]uint64, 0)
qpsQuota := make(map[string]uint64, 0)
concurrentQuota := make(map[string]uint64, 0)
userLimitConf := &proto.UserLimitConf{
BandWidthQuota: bandWidthQuota,
QPSQuota: qpsQuota,
ConcurrentQuota: concurrentQuota,
}
apiLimitConf[api] = userLimitConf
}
switch limitType {
case proto.FlowLimit:
apiLimitConf[api].BandWidthQuota[uid] = v
case proto.QPSLimit:
apiLimitConf[api].QPSQuota[uid] = v
case proto.ConcurrentLimit:
apiLimitConf[api].ConcurrentQuota[uid] = v
default:
// do nothing
}
return true
})
log.LogDebugf("[S3QosGet] s3qosInfoMap %+v", s3QosResponse)
sendOkReply(w, r, newSuccessHTTPReply(s3QosResponse))
}
func (m *Server) S3QosDelete(w http.ResponseWriter, r *http.Request) {
var (
param = &proto.S3QosRequest{}
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.S3QoSDelete))
defer func() {
doStatAndMetric(proto.S3QoSDelete, metric, err, nil)
}()
if err = parseS3QosReq(r, param); err != nil {
log.LogErrorf("[S3QosDelete] parse fail err [%v]", err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if !isS3QosConfigValid(param) {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: "s3 qos param err"})
return
}
if strings.ToLower(param.Uid) == proto.DefaultUid {
param.Uid = proto.DefaultUid
}
param.Api = strings.ToLower(param.Api)
metadata := new(RaftCmd)
metadata.Op = opSyncS3QosDelete
key := param.Api + keySeparator + param.Uid + keySeparator + param.Type
metadata.K = S3QoSPrefix + key
// raft sync
if err = m.cluster.submit(metadata); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
// memory cache
m.cluster.S3ApiQosQuota.Delete(metadata.K)
sendOkReply(w, r, newSuccessHTTPReply("success"))
}
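// parseS3QoSKey decodes an S3ApiQosQuota key. After trimming S3QoSPrefix the
// remainder is either "api<sep>uid<sep>limitType" (a per-user quota) or the
// single token proto.S3Nodes (the s3 node number), where <sep> is keySeparator.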
func parseS3QoSKey(key string) (api, uid, limitType, nodes string, err error) {
s3qosInfo := strings.TrimPrefix(key, S3QoSPrefix)
strs := strings.Split(s3qosInfo, keySeparator)
if len(strs) == 3 {
return strs[0], strs[1], strs[2], "", nil
}
if len(strs) == 1 && strs[0] == proto.S3Nodes {
return "", "", "", strs[0], nil
}
return "", "", "", "", errors.New("unexpected key")
}
func isS3QosConfigValid(param *proto.S3QosRequest) bool {
if param.Type != proto.FlowLimit && param.Type != proto.QPSLimit && param.Type != proto.ConcurrentLimit {
return false
}
if proto.IsS3PutApi(param.Api) {
return false
}
return true
}
package master
import (
"encoding/json"
"fmt"
"io"
"net/http"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
func (m *Server) createUser(w http.ResponseWriter, r *http.Request) {
var (
userInfo *proto.UserInfo
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.UserCreate))
defer func() {
doStatAndMetric(proto.UserCreate, metric, err, nil)
}()
var bytes []byte
if bytes, err = io.ReadAll(r.Body); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
param := proto.UserCreateParam{}
if err = json.Unmarshal(bytes, &param); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if !ownerRegexp.MatchString(param.ID) {
sendErrReply(w, r, newErrHTTPReply(proto.ErrInvalidUserID))
return
}
if param.Type == proto.UserTypeRoot {
sendErrReply(w, r, newErrHTTPReply(proto.ErrInvalidUserType))
return
}
if userInfo, err = m.user.createKey(&param); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
_ = sendOkReply(w, r, newSuccessHTTPReply(userInfo))
}
func (m *Server) deleteUser(w http.ResponseWriter, r *http.Request) {
var (
userID string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.UserDelete))
defer func() {
doStatAndMetric(proto.UserDelete, metric, err, nil)
}()
if userID, err = parseUser(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.user.deleteKey(userID); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
msg := fmt.Sprintf("delete user[%v] successfully", userID)
log.LogWarn(msg)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
func (m *Server) updateUser(w http.ResponseWriter, r *http.Request) {
var (
userInfo *proto.UserInfo
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.UserUpdate))
defer func() {
doStatAndMetric(proto.UserUpdate, metric, err, nil)
}()
var bytes []byte
if bytes, err = io.ReadAll(r.Body); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
param := proto.UserUpdateParam{}
if err = json.Unmarshal(bytes, &param); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if param.Type == proto.UserTypeRoot {
sendErrReply(w, r, newErrHTTPReply(proto.ErrInvalidUserType))
return
}
if userInfo, err = m.user.updateKey(&param); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
_ = sendOkReply(w, r, newSuccessHTTPReply(userInfo))
}
func (m *Server) getUserAKInfo(w http.ResponseWriter, r *http.Request) {
var (
ak string
userInfo *proto.UserInfo
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.UserGetAKInfo))
defer func() {
doStatAndMetric(proto.UserGetAKInfo, metric, err, nil)
}()
if ak, err = parseAccessKey(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if userInfo, err = m.user.getKeyInfo(ak); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(userInfo))
}
func (m *Server) getUserInfo(w http.ResponseWriter, r *http.Request) {
var (
userID string
userInfo *proto.UserInfo
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.UserGetInfo))
defer func() {
doStatAndMetric(proto.UserGetInfo, metric, err, nil)
}()
if userID, err = parseUser(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if userInfo, err = m.user.getUserInfo(userID); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(userInfo))
}
func (m *Server) updateUserPolicy(w http.ResponseWriter, r *http.Request) {
var (
userInfo *proto.UserInfo
bytes []byte
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.UserUpdatePolicy))
defer func() {
doStatAndMetric(proto.UserUpdatePolicy, metric, err, nil)
}()
if bytes, err = io.ReadAll(r.Body); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
param := proto.UserPermUpdateParam{}
if err = json.Unmarshal(bytes, &param); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if _, err = m.cluster.getVol(param.Volume); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVolNotExists, Msg: err.Error()})
return
}
if userInfo, err = m.user.updatePolicy(&param); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(userInfo))
}
func (m *Server) removeUserPolicy(w http.ResponseWriter, r *http.Request) {
var (
userInfo *proto.UserInfo
bytes []byte
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.UserRemovePolicy))
defer func() {
doStatAndMetric(proto.UserRemovePolicy, metric, err, nil)
}()
if bytes, err = io.ReadAll(r.Body); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
param := proto.UserPermRemoveParam{}
if err = json.Unmarshal(bytes, &param); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if _, err = m.cluster.getVol(param.Volume); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVolNotExists, Msg: err.Error()})
return
}
if userInfo, err = m.user.removePolicy(&param); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(userInfo))
}
func (m *Server) deleteUserVolPolicy(w http.ResponseWriter, r *http.Request) {
var (
vol string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.UserDeleteVolPolicy))
defer func() {
doStatAndMetric(proto.UserDeleteVolPolicy, metric, err, map[string]string{exporter.Vol: vol})
}()
if vol, err = parseVolName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if err = m.user.deleteVolPolicy(vol); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
msg := fmt.Sprintf("delete vol[%v] policy successfully", vol)
log.LogWarn(msg)
sendOkReply(w, r, newSuccessHTTPReply(msg))
}
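// transferUserVol transfers the ownership of a volume to another user. Unless
// Force is set, the current owner must match UserSrc; the user store is
// updated first and the new owner is then persisted via raft, rolling back on
// failure.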
func (m *Server) transferUserVol(w http.ResponseWriter, r *http.Request) {
var (
bytes []byte
vol *Vol
volName string
userInfo *proto.UserInfo
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.UserTransferVol))
defer func() {
doStatAndMetric(proto.UserTransferVol, metric, err, map[string]string{exporter.Vol: volName})
}()
if bytes, err = io.ReadAll(r.Body); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
param := proto.UserTransferVolParam{}
if err = json.Unmarshal(bytes, &param); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
volName = param.Volume
if vol, err = m.cluster.getVol(param.Volume); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeVolNotExists, Msg: err.Error()})
return
}
if !param.Force && vol.Owner != param.UserSrc {
sendErrReply(w, r, newErrHTTPReply(proto.ErrHaveNoPolicy))
return
}
if userInfo, err = m.user.transferVol(&param); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
owner := vol.Owner
vol.Owner = userInfo.UserID
if err = m.cluster.syncUpdateVol(vol); err != nil {
vol.Owner = owner
err = proto.ErrPersistenceByRaft
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(userInfo))
}
func (m *Server) getAllUsers(w http.ResponseWriter, r *http.Request) {
var (
keywords string
users []*proto.UserInfo
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.UserList))
defer func() {
doStatAndMetric(proto.UserList, metric, err, nil)
}()
if keywords, err = parseKeywords(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
users = m.user.getAllUserInfo(keywords)
sendOkReply(w, r, newSuccessHTTPReply(users))
}
func (m *Server) getUsersOfVol(w http.ResponseWriter, r *http.Request) {
var (
volName string
users []string
err error
)
metric := exporter.NewTPCnt(apiToMetricsName(proto.UsersOfVol))
defer func() {
doStatAndMetric(proto.UsersOfVol, metric, err, map[string]string{exporter.Vol: volName})
}()
if volName, err = parseVolName(r); err != nil {
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeParamError, Msg: err.Error()})
return
}
if users, err = m.user.getUsersOfVol(volName); err != nil {
sendErrReply(w, r, newErrHTTPReply(err))
return
}
sendOkReply(w, r, newSuccessHTTPReply(users))
}
func parseUser(r *http.Request) (userID string, err error) {
if err = r.ParseForm(); err != nil {
return
}
if userID, err = extractUser(r); err != nil {
return
}
return
}
func extractUser(r *http.Request) (user string, err error) {
if user = r.FormValue(userKey); user == "" {
err = keyNotFound(userKey)
return
}
return
}
func parseAccessKey(r *http.Request) (ak string, err error) {
if err = r.ParseForm(); err != nil {
return
}
if ak, err = extractAccessKey(r); err != nil {
return
}
return
}
func parseKeywords(r *http.Request) (keywords string, err error) {
if err = r.ParseForm(); err != nil {
return
}
keywords = extractKeywords(r)
return
}
func extractAccessKey(r *http.Request) (ak string, err error) {
if ak = r.FormValue(akKey); ak == "" {
err = keyNotFound(akKey)
return
}
if !proto.AKRegexp.MatchString(ak) {
return "", errors.New("accesskey can only be number and letters")
}
return
}
func extractKeywords(r *http.Request) (keywords string) {
keywords = r.FormValue(keywordsKey)
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"encoding/json"
"fmt"
"math"
"net/http"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/google/uuid"
"golang.org/x/time/rate"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/raftstore"
authSDK "github.com/cubefs/cubefs/sdk/auth"
masterSDK "github.com/cubefs/cubefs/sdk/master"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/compressor"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
// Cluster stores all the cluster-level information.
type Cluster struct {
Name string
CreateTime int64
vols map[string]*Vol
dataNodes sync.Map
metaNodes sync.Map
volMutex sync.RWMutex // volume mutex
createVolMutex sync.RWMutex // create volume mutex
mnMutex sync.RWMutex // meta node mutex
dnMutex sync.RWMutex // data node mutex
nsMutex sync.RWMutex // nodeset mutex
badPartitionMutex sync.RWMutex // BadDataPartitionIds and BadMetaPartitionIds operate mutex
leaderInfo *LeaderInfo
cfg *clusterConfig
metaReady bool
retainLogs uint64
idAlloc *IDAllocator
t *topology
dataNodeStatInfo *nodeStatInfo
metaNodeStatInfo *nodeStatInfo
zoneStatInfos map[string]*proto.ZoneStat
volStatInfo sync.Map
domainManager *DomainManager
BadDataPartitionIds *sync.Map
BadMetaPartitionIds *sync.Map
DisableAutoAllocate bool
ForbidMpDecommission bool
FaultDomain bool
needFaultDomain bool // FaultDomain is true and normal zone already used up
fsm *MetadataFsm
partition raftstore.Partition
MasterSecretKey []byte
lastZoneIdxForNode int
zoneIdxMux sync.Mutex
zoneList []string
followerReadManager *followerReadManager
diskQosEnable bool
QosAcceptLimit *rate.Limiter
apiLimiter *ApiLimiter
DecommissionDisks sync.Map
DecommissionLimit uint64
EnableAutoDecommissionDisk bool
AutoDecommissionDiskMux sync.Mutex
checkAutoCreateDataPartition bool
masterClient *masterSDK.MasterClient
checkDataReplicasEnable bool
fileStatsEnable bool
clusterUuid string
clusterUuidEnable bool
inodeCountNotEqualMP *sync.Map
maxInodeNotEqualMP *sync.Map
dentryCountNotEqualMP *sync.Map
ac *authSDK.AuthClient
authenticate bool
lcNodes sync.Map
lcMgr *lifecycleManager
snapshotMgr *snapshotDelManager
DecommissionDiskFactor float64
S3ApiQosQuota *sync.Map // (api,uid,limtType) -> limitQuota
}
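// followerReadManager caches the data partition view of every volume on
// follower masters, both as raw JSON and as gzip-compressed bytes, so that
// data-partition view requests can be answered from the local cache instead
// of being forwarded to the leader.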
type followerReadManager struct {
volDataPartitionsView map[string][]byte
volDataPartitionsCompress map[string][]byte
status map[string]bool
lastUpdateTick map[string]time.Time
needCheck bool
c *Cluster
volViewMap map[string]*volValue
rwMutex sync.RWMutex
}
func newFollowerReadManager(c *Cluster) (mgr *followerReadManager) {
mgr = new(followerReadManager)
mgr.volDataPartitionsView = make(map[string][]byte)
mgr.volDataPartitionsCompress = make(map[string][]byte)
mgr.status = make(map[string]bool)
mgr.lastUpdateTick = make(map[string]time.Time)
mgr.c = c
return
}
func (mgr *followerReadManager) reSet() {
mgr.rwMutex.Lock()
defer mgr.rwMutex.Unlock()
mgr.volDataPartitionsView = make(map[string][]byte)
mgr.volDataPartitionsCompress = make(map[string][]byte)
mgr.status = make(map[string]bool)
mgr.lastUpdateTick = make(map[string]time.Time)
}
func (mgr *followerReadManager) getVolumeDpView() {
var (
err error
volViews []*volValue
view *proto.DataPartitionsView
)
if err, volViews = mgr.c.loadVolsViews(); err != nil {
panic(err)
}
mgr.rwMutex.Lock()
mgr.volViewMap = make(map[string]*volValue)
for _, vv := range volViews {
mgr.volViewMap[vv.Name] = vv
if _, ok := mgr.lastUpdateTick[vv.Name]; !ok {
// record when the volume is first discovered
mgr.lastUpdateTick[vv.Name] = time.Now()
mgr.status[vv.Name] = false
}
}
mgr.rwMutex.Unlock()
if mgr.c.masterClient.Leader() == "" {
log.LogErrorf("followerReadManager.getVolumeDpView but master leader not ready")
return
}
for _, vv := range volViews {
if vv.Status == proto.VolStatusMarkDelete {
mgr.rwMutex.Lock()
mgr.lastUpdateTick[vv.Name] = time.Now()
mgr.status[vv.Name] = false
mgr.rwMutex.Unlock()
continue
}
log.LogDebugf("followerReadManager.getVolumeDpView %v", vv.Name)
if view, err = mgr.c.masterClient.ClientAPI().GetDataPartitions(vv.Name); err != nil {
log.LogErrorf("followerReadManager.getVolumeDpView %v GetDataPartitions err %v", vv.Name, err)
continue
}
mgr.updateVolViewFromLeader(vv.Name, view)
}
}
func (mgr *followerReadManager) sendFollowerVolumeDpView() {
var err error
vols := mgr.c.copyVols()
for _, vol := range vols {
log.LogDebugf("followerReadManager.getVolumeDpView %v", vol.Name)
if vol.Status == proto.VolStatusMarkDelete {
continue
}
var body []byte
if body, err = vol.getDataPartitionsView(); err != nil {
log.LogErrorf("followerReadManager.sendFollowerVolumeDpView err %v", err)
continue
}
for _, addr := range AddrDatabase {
if addr == mgr.c.leaderInfo.addr {
continue
}
mgr.c.masterClient.SetLeader(addr)
if err = mgr.c.masterClient.AdminAPI().PutDataPartitions(vol.Name, body); err != nil {
mgr.c.masterClient.SetLeader("")
log.LogErrorf("followerReadManager.sendFollowerVolumeDpView PutDataPartitions name %v addr %v err %v", vol.Name, addr, err)
continue
}
mgr.c.masterClient.SetLeader("")
log.LogDebugf("followerReadManager.sendFollowerVolumeDpView PutDataPartitions name %v addr %v err %v", vol.Name, addr, err)
}
}
}
// NOTICE: caller must correctly use mgr.rwMutex
func (mgr *followerReadManager) isVolRecordObsolete(volName string) bool {
volView, ok := mgr.volViewMap[volName]
if !ok {
// vol has been completely deleted
return true
}
if volView.Status == proto.VolStatusMarkDelete {
return true
}
return false
}
func (mgr *followerReadManager) DelObsoleteVolRecord(obsoleteVolNames map[string]struct{}) {
mgr.rwMutex.Lock()
defer mgr.rwMutex.Unlock()
for volName := range obsoleteVolNames {
log.LogDebugf("followerReadManager.DelObsoleteVolRecord, delete obsolete vol: %v", volName)
delete(mgr.volDataPartitionsView, volName)
delete(mgr.volDataPartitionsCompress, volName)
delete(mgr.status, volName)
delete(mgr.lastUpdateTick, volName)
}
}
func (mgr *followerReadManager) checkStatus() {
mgr.rwMutex.Lock()
defer mgr.rwMutex.Unlock()
timeNow := time.Now()
for volNm, lastTime := range mgr.lastUpdateTick {
if mgr.isVolRecordObsolete(volNm) {
log.LogDebugf("action[checkStatus] volume %v is obsolete, skip it", volNm)
continue
}
if lastTime.Before(timeNow.Add(-5 * time.Minute)) {
mgr.status[volNm] = false
log.LogWarnf("action[checkStatus] volume %v expired last time %v, now %v", volNm, lastTime, timeNow)
}
}
}
func (mgr *followerReadManager) updateVolViewFromLeader(key string, view *proto.DataPartitionsView) {
if !mgr.checkViewContent(key, view, true) {
log.LogErrorf("updateVolViewFromLeader. key %v checkViewContent failed status %v", key, mgr.status[key])
return
}
reply := newSuccessHTTPReply(view)
if body, err := json.Marshal(reply); err != nil {
log.LogErrorf("action[updateDpResponseCache] marshal error %v", err)
return
} else {
mgr.rwMutex.Lock()
defer mgr.rwMutex.Unlock()
mgr.volDataPartitionsView[key] = body
gzipData, err := compressor.New(compressor.EncodingGzip).Compress(body)
if err != nil {
log.LogErrorf("action[updateDpResponseCache] compress error:%+v", err)
return
}
mgr.volDataPartitionsCompress[key] = gzipData
}
mgr.status[key] = true
mgr.lastUpdateTick[key] = time.Now()
}
func (mgr *followerReadManager) checkViewContent(volName string, view *proto.DataPartitionsView, isUpdate bool) (ok bool) {
if !isUpdate && !mgr.needCheck {
return true
}
if len(view.DataPartitions) == 0 {
return true
}
for i := 0; i < len(view.DataPartitions); i++ {
dp := view.DataPartitions[i]
if len(dp.Hosts) == 0 {
log.LogErrorf("checkViewContent. dp id %v, leader %v, status %v", dp.PartitionID, dp.LeaderAddr, dp.Status)
}
}
return true
}
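// getVolViewAsFollower returns the cached data partition view of a volume,
// choosing the gzip-compressed copy when compress is true.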
func (mgr *followerReadManager) getVolViewAsFollower(key string, compress bool) (value []byte, ok bool) {
mgr.rwMutex.RLock()
defer mgr.rwMutex.RUnlock()
ok = true
if compress {
value = mgr.volDataPartitionsCompress[key]
} else {
value = mgr.volDataPartitionsView[key]
}
log.LogDebugf("getVolViewAsFollower. volume %v return!", key)
return
}
func (mgr *followerReadManager) IsVolViewReady(volName string) bool {
mgr.rwMutex.RLock()
defer mgr.rwMutex.RUnlock()
if status, ok := mgr.status[volName]; ok {
return status
}
return false
}
func newCluster(name string, leaderInfo *LeaderInfo, fsm *MetadataFsm, partition raftstore.Partition, cfg *clusterConfig) (c *Cluster) {
c = new(Cluster)
c.Name = name
c.leaderInfo = leaderInfo
c.vols = make(map[string]*Vol, 0)
c.cfg = cfg
if c.cfg.MaxDpCntLimit == 0 {
c.cfg.MaxDpCntLimit = defaultMaxDpCntLimit
}
c.t = newTopology()
c.BadDataPartitionIds = new(sync.Map)
c.BadMetaPartitionIds = new(sync.Map)
c.dataNodeStatInfo = new(nodeStatInfo)
c.metaNodeStatInfo = new(nodeStatInfo)
c.FaultDomain = cfg.faultDomain
c.zoneStatInfos = make(map[string]*proto.ZoneStat)
c.followerReadManager = newFollowerReadManager(c)
c.fsm = fsm
c.partition = partition
c.idAlloc = newIDAllocator(c.fsm.store, c.partition)
c.domainManager = newDomainManager(c)
c.QosAcceptLimit = rate.NewLimiter(rate.Limit(c.cfg.QosMasterAcceptLimit), proto.QosDefaultBurst)
c.apiLimiter = newApiLimiter()
c.DecommissionLimit = defaultDecommissionParallelLimit
c.checkAutoCreateDataPartition = false
c.masterClient = masterSDK.NewMasterClient(nil, false)
c.inodeCountNotEqualMP = new(sync.Map)
c.maxInodeNotEqualMP = new(sync.Map)
c.dentryCountNotEqualMP = new(sync.Map)
c.lcMgr = newLifecycleManager()
c.lcMgr.cluster = c
c.snapshotMgr = newSnapshotManager()
c.snapshotMgr.cluster = c
c.S3ApiQosQuota = new(sync.Map)
return
}
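// scheduleTask starts all background jobs of the cluster, such as partition
// checks, heartbeat checks, statistics updates and decommission progress
// checks; each job runs in its own goroutine and most of them only act when
// this node is the raft leader.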
func (c *Cluster) scheduleTask() {
c.scheduleToCheckDataPartitions()
c.scheduleToLoadDataPartitions()
c.scheduleToCheckReleaseDataPartitions()
c.scheduleToCheckHeartbeat()
c.scheduleToCheckMetaPartitions()
c.scheduleToUpdateStatInfo()
c.scheduleToManageDp()
c.scheduleToCheckVolStatus()
c.scheduleToCheckVolQos()
c.scheduleToCheckDiskRecoveryProgress()
c.scheduleToCheckMetaPartitionRecoveryProgress()
c.scheduleToLoadMetaPartitions()
c.scheduleToReduceReplicaNum()
c.scheduleToCheckNodeSetGrpManagerStatus()
c.scheduleToCheckFollowerReadCache()
c.scheduleToCheckDecommissionDataNode()
c.scheduleToCheckDecommissionDisk()
c.scheduleToCheckDataReplicas()
c.scheduleToLcScan()
c.scheduleToSnapshotDelVerScan()
c.scheduleToBadDisk()
}
func (c *Cluster) masterAddr() (addr string) {
return c.leaderInfo.addr
}
func (c *Cluster) tryToChangeLeaderByHost() error {
return c.partition.TryToLeader(1)
}
func (c *Cluster) scheduleToUpdateStatInfo() {
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
c.updateStatInfo()
}
time.Sleep(2 * time.Minute)
}
}()
}
func (c *Cluster) addNodeSetGrp(ns *nodeSet, load bool) (err error) {
log.LogWarnf("addNodeSetGrp nodeSet id[%v] zonename[%v] load[%v] grpManager init[%v]",
ns.ID, ns.zoneName, load, c.domainManager.init)
if c.domainManager.init {
err = c.domainManager.putNodeSet(ns, load)
c.putZoneDomain(false)
}
return
}
const (
TypeMetaPartition uint32 = 0x01
TypeDataPartition uint32 = 0x02
)
func (c *Cluster) getHostFromDomainZone(domainId uint64, createType uint32, replicaNum uint8) (hosts []string, peers []proto.Peer, err error) {
hosts, peers, err = c.domainManager.getHostFromNodeSetGrp(domainId, replicaNum, createType)
return
}
func (c *Cluster) IsLeader() bool {
if c.partition != nil {
return c.partition.IsRaftLeader()
}
return false
}
func (c *Cluster) scheduleToManageDp() {
go func() {
// enable auto data partition creation two minutes after becoming leader
time.Sleep(2 * time.Minute)
c.checkAutoCreateDataPartition = true
}()
// schedule delete dataPartition
go func() {
time.Sleep(2 * time.Minute)
for {
if c.partition != nil && c.partition.IsRaftLeader() {
vols := c.copyVols()
for _, vol := range vols {
if proto.IsHot(vol.VolType) {
continue
}
vol.autoDeleteDp(c)
}
}
time.Sleep(2 * time.Minute)
}
}()
}
func (c *Cluster) scheduleToCheckDataPartitions() {
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
c.checkDataPartitions()
}
time.Sleep(time.Second * time.Duration(c.cfg.IntervalToCheckDataPartition))
}
}()
}
func (c *Cluster) scheduleToCheckVolStatus() {
go func() {
// check vols after switching leader two minutes
for {
if c.partition.IsRaftLeader() {
vols := c.copyVols()
for _, vol := range vols {
vol.checkStatus(c)
vol.CheckStrategy(c)
}
}
time.Sleep(time.Second * time.Duration(c.cfg.IntervalToCheckDataPartition))
}
}()
}
func (c *Cluster) scheduleToCheckFollowerReadCache() {
go func() {
for {
if !c.partition.IsRaftLeader() {
c.followerReadManager.getVolumeDpView()
c.followerReadManager.checkStatus()
} else {
c.followerReadManager.sendFollowerVolumeDpView()
}
time.Sleep(5 * time.Second)
}
}()
}
func (c *Cluster) scheduleToCheckVolQos() {
go func() {
// check vols after switching leader two minutes
for {
if c.partition.IsRaftLeader() {
vols := c.copyVols()
for _, vol := range vols {
vol.checkQos()
}
}
// time.Sleep(time.Second * time.Duration(c.cfg.IntervalToCheckQos))
time.Sleep(time.Duration(float32(time.Second) * 0.5))
}
}()
}
func (c *Cluster) scheduleToCheckNodeSetGrpManagerStatus() {
go func() {
for {
if !c.FaultDomain || !c.partition.IsRaftLeader() {
time.Sleep(time.Minute)
continue
}
c.domainManager.checkAllGrpState()
c.domainManager.checkExcludeZoneState()
time.Sleep(5 * time.Second)
}
}()
}
func (c *Cluster) scheduleToLoadDataPartitions() {
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
c.doLoadDataPartitions()
}
time.Sleep(time.Second * 5)
}
}()
}
// Check the replica status of each data partition.
func (c *Cluster) checkDataPartitions() {
defer func() {
if r := recover(); r != nil {
log.LogWarnf("checkDataPartitions occurred panic,err[%v]", r)
WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
"checkDataPartitions occurred panic")
}
}()
vols := c.allVols()
for _, vol := range vols {
readWrites := vol.checkDataPartitions(c)
vol.dataPartitions.setReadWriteDataPartitions(readWrites, c.Name)
if c.metaReady {
vol.dataPartitions.updateResponseCache(true, 0, vol.VolType)
vol.dataPartitions.updateCompressCache(true, 0, vol.VolType)
}
msg := fmt.Sprintf("action[checkDataPartitions],vol[%v] can readWrite partitions:%v ",
vol.Name, vol.dataPartitions.readableAndWritableCnt)
log.LogInfo(msg)
if c.checkAutoCreateDataPartition {
vol.checkAutoDataPartitionCreation(c)
}
}
}
func (c *Cluster) doLoadDataPartitions() {
defer func() {
if r := recover(); r != nil {
log.LogWarnf("doLoadDataPartitions occurred panic,err[%v]", r)
WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
"doLoadDataPartitions occurred panic")
}
}()
vols := c.allVols()
for _, vol := range vols {
if vol.Status == proto.VolStatusMarkDelete {
continue
}
vol.loadDataPartition(c)
}
}
func (c *Cluster) scheduleToCheckReleaseDataPartitions() {
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
c.releaseDataPartitionAfterLoad()
}
time.Sleep(time.Second * defaultIntervalToFreeDataPartition)
}
}()
}
// Release the memory used for loading the data partition.
func (c *Cluster) releaseDataPartitionAfterLoad() {
defer func() {
if r := recover(); r != nil {
log.LogWarnf("releaseDataPartitionAfterLoad occurred panic,err[%v]", r)
WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
"releaseDataPartitionAfterLoad occurred panic")
}
}()
vols := c.copyVols()
for _, vol := range vols {
vol.releaseDataPartitions(c.cfg.numberOfDataPartitionsToFree, c.cfg.secondsToFreeDataPartitionAfterLoad)
}
}
func (c *Cluster) scheduleToCheckHeartbeat() {
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
c.checkLeaderAddr()
c.checkDataNodeHeartbeat()
// update load factor
setOverSoldFactor(c.cfg.ClusterLoadFactor)
}
time.Sleep(time.Second * defaultIntervalToCheckHeartbeat)
}
}()
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
c.checkMetaNodeHeartbeat()
}
time.Sleep(time.Second * defaultIntervalToCheckHeartbeat)
}
}()
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
c.checkLcNodeHeartbeat()
}
time.Sleep(time.Second * defaultIntervalToCheckHeartbeat)
}
}()
}
func (c *Cluster) passAclCheck(ip string) {
// do nothing
}
func (c *Cluster) checkLeaderAddr() {
leaderID, _ := c.partition.LeaderTerm()
c.leaderInfo.addr = AddrDatabase[leaderID]
}
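// checkDataNodeHeartbeat builds a heartbeat task for every data node,
// attaching the names of forbidden volumes, and dispatches the tasks to the
// nodes.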
func (c *Cluster) checkDataNodeHeartbeat() {
tasks := make([]*proto.AdminTask, 0)
c.dataNodes.Range(func(addr, dataNode interface{}) bool {
node := dataNode.(*DataNode)
node.checkLiveness()
task := node.createHeartbeatTask(c.masterAddr(), c.diskQosEnable)
hbReq := task.Request.(*proto.HeartBeatRequest)
c.volMutex.RLock()
defer c.volMutex.RUnlock()
for _, vol := range c.vols {
if vol.Forbidden {
hbReq.ForbiddenVols = append(hbReq.ForbiddenVols, vol.Name)
}
}
tasks = append(tasks, task)
return true
})
c.addDataNodeTasks(tasks)
}
func (c *Cluster) checkMetaNodeHeartbeat() {
tasks := make([]*proto.AdminTask, 0)
c.volMutex.RLock()
defer c.volMutex.RUnlock()
c.metaNodes.Range(func(addr, metaNode interface{}) bool {
node := metaNode.(*MetaNode)
node.checkHeartbeat()
task := node.createHeartbeatTask(c.masterAddr(), c.fileStatsEnable)
hbReq := task.Request.(*proto.HeartBeatRequest)
for _, vol := range c.vols {
if vol.FollowerRead {
hbReq.FLReadVols = append(hbReq.FLReadVols, vol.Name)
}
if vol.Forbidden {
hbReq.ForbiddenVols = append(hbReq.ForbiddenVols, vol.Name)
}
if !vol.EnableAuditLog {
hbReq.DisableAuditVols = append(hbReq.DisableAuditVols, vol.Name)
}
spaceInfo := vol.uidSpaceManager.getSpaceOp()
hbReq.UidLimitInfo = append(hbReq.UidLimitInfo, spaceInfo...)
if vol.quotaManager != nil {
quotaHbInfos := vol.quotaManager.getQuotaHbInfos()
if len(quotaHbInfos) != 0 {
hbReq.QuotaHbInfos = append(hbReq.QuotaHbInfos, quotaHbInfos...)
}
}
hbReq.TxInfo = append(hbReq.TxInfo, &proto.TxInfo{
Volume: vol.Name,
Mask: vol.enableTransaction,
OpLimitVal: vol.txOpLimit,
})
}
log.LogDebugf("checkMetaNodeHeartbeat start")
for _, info := range hbReq.QuotaHbInfos {
log.LogDebugf("checkMetaNodeHeartbeat info [%v]", info)
}
tasks = append(tasks, task)
return true
})
c.addMetaNodeTasks(tasks)
}
func (c *Cluster) checkLcNodeHeartbeat() {
tasks := make([]*proto.AdminTask, 0)
diedNodes := make([]string, 0)
c.lcNodes.Range(func(addr, lcNode interface{}) bool {
node := lcNode.(*LcNode)
node.checkLiveness()
if !node.IsActive {
log.LogInfof("checkLcNodeHeartbeat: lcnode(%v) is inactive", node.Addr)
diedNodes = append(diedNodes, node.Addr)
return true
}
task := node.createHeartbeatTask(c.masterAddr())
tasks = append(tasks, task)
return true
})
c.addLcNodeTasks(tasks)
for _, node := range diedNodes {
log.LogInfof("checkLcNodeHeartbeat: deregister node(%v)", node)
_ = c.delLcNode(node)
}
return
}
func (c *Cluster) scheduleToCheckMetaPartitions() {
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
c.checkMetaPartitions()
}
time.Sleep(time.Second * time.Duration(c.cfg.IntervalToCheckDataPartition))
}
}()
}
func (c *Cluster) checkMetaPartitions() {
defer func() {
if r := recover(); r != nil {
log.LogWarnf("checkMetaPartitions occurred panic,err[%v]", r)
WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
"checkMetaPartitions occurred panic")
}
}()
vols := c.allVols()
for _, vol := range vols {
vol.checkMetaPartitions(c)
}
}
func (c *Cluster) scheduleToReduceReplicaNum() {
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
c.checkVolReduceReplicaNum()
}
time.Sleep(5 * time.Minute)
}
}()
}
func (c *Cluster) checkVolReduceReplicaNum() {
defer func() {
if r := recover(); r != nil {
log.LogWarnf("checkVolReduceReplicaNum occurred panic,err[%v]", r)
WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
"checkVolReduceReplicaNum occurred panic")
}
}()
vols := c.allVols()
for _, vol := range vols {
vol.checkReplicaNum(c)
}
}
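// getInvalidIDNodes returns the meta nodes and data nodes whose registered IDs
// do not match the peer IDs recorded in their partitions.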
func (c *Cluster) getInvalidIDNodes() (nodes []*InvalidNodeView) {
metaNodes := c.getNotConsistentIDMetaNodes()
nodes = append(nodes, metaNodes...)
dataNodes := c.getNotConsistentIDDataNodes()
nodes = append(nodes, dataNodes...)
return
}
func (c *Cluster) scheduleToCheckDataReplicas() {
go func() {
for {
if c.checkDataReplicasEnable {
if c.partition != nil && c.partition.IsRaftLeader() {
c.checkDataReplicas()
}
}
time.Sleep(1 * time.Minute)
}
}()
}
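// checkDataReplicas tries to automatically add a replica for every data partition
// whose host count is below its replica number, and logs how many attempts succeeded.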
func (c *Cluster) checkDataReplicas() {
lackReplicaDataPartitions, _ := c.checkLackReplicaAndHostDataPartitions()
if len(lackReplicaDataPartitions) == 0 {
return
}
successCnt := 0
for _, dp := range lackReplicaDataPartitions {
if success, _ := c.autoAddDataReplica(dp); success {
successCnt += 1
}
}
failCnt := len(lackReplicaDataPartitions) - successCnt
log.LogInfof("action[checkDataReplicas] autoAddDataReplica successCnt[%v], failedCnt[%v]", successCnt, failCnt)
}
func (c *Cluster) getNotConsistentIDMetaNodes() (metaNodes []*InvalidNodeView) {
metaNodes = make([]*InvalidNodeView, 0)
c.metaNodes.Range(func(key, value interface{}) bool {
metanode, ok := value.(*MetaNode)
if !ok {
return true
}
notConsistent, oldID := c.hasNotConsistentIDMetaPartitions(metanode)
if notConsistent {
metaNodes = append(metaNodes, &InvalidNodeView{Addr: metanode.Addr, ID: metanode.ID, OldID: oldID, NodeType: "meta"})
}
return true
})
return
}
func (c *Cluster) hasNotConsistentIDMetaPartitions(metanode *MetaNode) (notConsistent bool, oldID uint64) {
safeVols := c.allVols()
for _, vol := range safeVols {
vol.mpsLock.RLock()
for _, mp := range vol.MetaPartitions {
for _, peer := range mp.Peers {
if peer.Addr == metanode.Addr && peer.ID != metanode.ID {
vol.mpsLock.RUnlock()
return true, peer.ID
}
}
}
vol.mpsLock.RUnlock()
}
return
}
func (c *Cluster) getNotConsistentIDDataNodes() (dataNodes []*InvalidNodeView) {
dataNodes = make([]*InvalidNodeView, 0)
c.dataNodes.Range(func(key, value interface{}) bool {
datanode, ok := value.(*DataNode)
if !ok {
return true
}
notConsistent, oldID := c.hasNotConsistentIDDataPartitions(datanode)
if notConsistent {
dataNodes = append(dataNodes, &InvalidNodeView{Addr: datanode.Addr, ID: datanode.ID, OldID: oldID, NodeType: "data"})
}
return true
})
return
}
func (c *Cluster) hasNotConsistentIDDataPartitions(datanode *DataNode) (notConsistent bool, oldID uint64) {
safeVols := c.allVols()
for _, vol := range safeVols {
for _, mp := range vol.dataPartitions.partitions {
for _, peer := range mp.Peers {
if peer.Addr == datanode.Addr && peer.ID != datanode.ID {
return true, peer.ID
}
}
}
}
return
}
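// updateDataNodeBaseInfo rewrites the persisted record of a data node with a new ID
// by committing a delete command and an update command in a single raft batch.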
func (c *Cluster) updateDataNodeBaseInfo(nodeAddr string, id uint64) (err error) {
c.dnMutex.Lock()
defer c.dnMutex.Unlock()
value, ok := c.dataNodes.Load(nodeAddr)
if !ok {
err = fmt.Errorf("node %v is not exist", nodeAddr)
return
}
dataNode := value.(*DataNode)
if dataNode.ID == id {
return
}
cmds := make(map[string]*RaftCmd)
metadata, err := c.buildDeleteDataNodeCmd(dataNode)
if err != nil {
return
}
cmds[metadata.K] = metadata
dataNode.ID = id
metadata, err = c.buildUpdateDataNodeCmd(dataNode)
if err != nil {
return
}
cmds[metadata.K] = metadata
if err = c.syncBatchCommitCmd(cmds); err != nil {
return
}
// partitions := c.getAllMetaPartitionsByMetaNode(nodeAddr)
return
}
func (c *Cluster) updateMetaNodeBaseInfo(nodeAddr string, id uint64) (err error) {
c.mnMutex.Lock()
defer c.mnMutex.Unlock()
value, ok := c.metaNodes.Load(nodeAddr)
if !ok {
err = fmt.Errorf("node %v is not exist", nodeAddr)
return
}
metaNode := value.(*MetaNode)
if metaNode.ID == id {
return
}
cmds := make(map[string]*RaftCmd)
metadata, err := c.buildDeleteMetaNodeCmd(metaNode)
if err != nil {
return
}
cmds[metadata.K] = metadata
metaNode.ID = id
metadata, err = c.buildUpdateMetaNodeCmd(metaNode)
if err != nil {
return
}
cmds[metadata.K] = metadata
if err = c.syncBatchCommitCmd(cmds); err != nil {
return
}
// partitions := c.getAllMetaPartitionsByMetaNode(nodeAddr)
return
}
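// addMetaNode registers a meta node. If the address is already known the existing ID is
// returned; otherwise a node set in the zone is picked (or created), a new ID is allocated,
// and the node is persisted through raft and added to the topology.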
func (c *Cluster) addMetaNode(nodeAddr, zoneName string, nodesetId uint64) (id uint64, err error) {
c.mnMutex.Lock()
defer c.mnMutex.Unlock()
var metaNode *MetaNode
if value, ok := c.metaNodes.Load(nodeAddr); ok {
metaNode = value.(*MetaNode)
if nodesetId > 0 && nodesetId != metaNode.NodeSetID {
return metaNode.ID, fmt.Errorf("addr already in nodeset [%v]", nodeAddr)
}
return metaNode.ID, nil
}
metaNode = newMetaNode(nodeAddr, zoneName, c.Name)
zone, err := c.t.getZone(zoneName)
if err != nil {
zone = c.t.putZoneIfAbsent(newZone(zoneName))
}
var ns *nodeSet
if nodesetId > 0 {
if ns, err = zone.getNodeSet(nodesetId); err != nil {
return nodesetId, err
}
} else {
c.nsMutex.Lock()
ns = zone.getAvailNodeSetForMetaNode()
if ns == nil {
if ns, err = zone.createNodeSet(c); err != nil {
c.nsMutex.Unlock()
goto errHandler
}
}
c.nsMutex.Unlock()
}
if id, err = c.idAlloc.allocateCommonID(); err != nil {
goto errHandler
}
metaNode.ID = id
metaNode.NodeSetID = ns.ID
log.LogInfof("action[addMetaNode] metanode id[%v] zonename [%v] add meta node to nodesetid[%v]", id, zoneName, ns.ID)
if err = c.syncAddMetaNode(metaNode); err != nil {
goto errHandler
}
if err = c.syncUpdateNodeSet(ns); err != nil {
goto errHandler
}
c.t.putMetaNode(metaNode)
// a node set that becomes available for the first time can be put into nodesetGrp
c.addNodeSetGrp(ns, false)
c.metaNodes.Store(nodeAddr, metaNode)
log.LogInfof("action[addMetaNode],clusterID[%v] metaNodeAddr:%v,nodeSetId[%v],capacity[%v]",
c.Name, nodeAddr, ns.ID, ns.Capacity)
return
errHandler:
err = fmt.Errorf("action[addMetaNode],clusterID[%v] metaNodeAddr:%v err:%v ",
c.Name, nodeAddr, err.Error())
log.LogError(errors.Stack(err))
Warn(c.Name, err.Error())
return
}
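// addDataNode registers a data node. If the address is already known the existing ID is
// returned; otherwise a node set in the zone is picked (or created), a new ID is allocated,
// and the node is persisted through raft and added to the topology.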
func (c *Cluster) addDataNode(nodeAddr, zoneName string, nodesetId uint64) (id uint64, err error) {
c.dnMutex.Lock()
defer c.dnMutex.Unlock()
var dataNode *DataNode
if node, ok := c.dataNodes.Load(nodeAddr); ok {
dataNode = node.(*DataNode)
if nodesetId > 0 && nodesetId != dataNode.NodeSetID {
return dataNode.ID, fmt.Errorf("addr already in nodeset [%v]", nodeAddr)
}
return dataNode.ID, nil
}
dataNode = newDataNode(nodeAddr, zoneName, c.Name)
dataNode.DpCntLimit = newDpCountLimiter(&c.cfg.MaxDpCntLimit)
zone, err := c.t.getZone(zoneName)
if err != nil {
zone = c.t.putZoneIfAbsent(newZone(zoneName))
}
var ns *nodeSet
if nodesetId > 0 {
if ns, err = zone.getNodeSet(nodesetId); err != nil {
return nodesetId, err
}
} else {
c.nsMutex.Lock()
ns = zone.getAvailNodeSetForDataNode()
if ns == nil {
if ns, err = zone.createNodeSet(c); err != nil {
c.nsMutex.Unlock()
goto errHandler
}
}
c.nsMutex.Unlock()
}
// allocate dataNode id
if id, err = c.idAlloc.allocateCommonID(); err != nil {
goto errHandler
}
dataNode.ID = id
dataNode.NodeSetID = ns.ID
log.LogInfof("action[addDataNode] datanode id[%v] zonename [%v] add node to nodesetid[%v]", id, zoneName, ns.ID)
if err = c.syncAddDataNode(dataNode); err != nil {
goto errHandler
}
if err = c.syncUpdateNodeSet(ns); err != nil {
goto errHandler
}
c.t.putDataNode(dataNode)
// a node set that becomes available for the first time can be put into nodesetGrp
c.addNodeSetGrp(ns, false)
c.dataNodes.Store(nodeAddr, dataNode)
log.LogInfof("action[addDataNode],clusterID[%v] dataNodeAddr:%v,nodeSetId[%v],capacity[%v]",
c.Name, nodeAddr, ns.ID, ns.Capacity)
return
errHandler:
err = fmt.Errorf("action[addDataNode],clusterID[%v] dataNodeAddr:%v err:%v ", c.Name, nodeAddr, err.Error())
log.LogError(errors.Stack(err))
Warn(c.Name, err.Error())
return
}
func (c *Cluster) checkInactiveDataNodes() (inactiveDataNodes []string, err error) {
inactiveDataNodes = make([]string, 0)
c.dataNodes.Range(func(addr, node interface{}) bool {
dataNode := node.(*DataNode)
if !dataNode.isActive {
inactiveDataNodes = append(inactiveDataNodes, dataNode.Addr)
}
return true
})
log.LogInfof("clusterID[%v] inactiveDataNodes:%v", c.Name, inactiveDataNodes)
return
}
func (c *Cluster) checkLackReplicaAndHostDataPartitions() (lackReplicaDataPartitions []*DataPartition, err error) {
lackReplicaDataPartitions = make([]*DataPartition, 0)
vols := c.copyVols()
for _, vol := range vols {
dps := vol.dataPartitions
for _, dp := range dps.partitions {
if dp.ReplicaNum > uint8(len(dp.Hosts)) && len(dp.Hosts) == len(dp.Replicas) && dp.IsDecommissionInitial() {
lackReplicaDataPartitions = append(lackReplicaDataPartitions, dp)
}
}
}
log.LogInfof("clusterID[%v] checkLackReplicaAndHostDataPartitions count:[%v]", c.Name, len(lackReplicaDataPartitions))
return
}
func (c *Cluster) checkLackReplicaDataPartitions() (lackReplicaDataPartitions []*DataPartition, err error) {
lackReplicaDataPartitions = make([]*DataPartition, 0)
vols := c.copyVols()
for _, vol := range vols {
dps := vol.dataPartitions
for _, dp := range dps.partitions {
if dp.ReplicaNum > uint8(len(dp.Hosts)) {
lackReplicaDataPartitions = append(lackReplicaDataPartitions, dp)
}
}
}
log.LogInfof("clusterID[%v] lackReplicaDataPartitions count:[%v]", c.Name, len(lackReplicaDataPartitions))
return
}
func (c *Cluster) checkReplicaOfDataPartitions(ignoreDiscardDp bool) (
lackReplicaDPs []*DataPartition, unavailableReplicaDPs []*DataPartition, repFileCountDifferDps []*DataPartition,
repUsedSizeDifferDps []*DataPartition, excessReplicaDPs []*DataPartition, noLeaderDPs []*DataPartition, err error) {
noLeaderDPs = make([]*DataPartition, 0)
lackReplicaDPs = make([]*DataPartition, 0)
unavailableReplicaDPs = make([]*DataPartition, 0)
excessReplicaDPs = make([]*DataPartition, 0)
vols := c.copyVols()
for _, vol := range vols {
dps := vol.dataPartitions
for _, dp := range dps.partitions {
if ignoreDiscardDp && dp.IsDiscard {
continue
}
if vol.Status == proto.VolStatusMarkDelete {
continue
}
if proto.IsHot(vol.VolType) {
if dp.getLeaderAddr() == "" && (time.Now().Unix()-dp.LeaderReportTime > c.cfg.DpNoLeaderReportIntervalSec) {
noLeaderDPs = append(noLeaderDPs, dp)
}
}
if dp.ReplicaNum > uint8(len(dp.Hosts)) || dp.ReplicaNum > uint8(len(dp.Replicas)) {
lackReplicaDPs = append(lackReplicaDPs, dp)
}
if (dp.GetDecommissionStatus() == DecommissionInitial || dp.GetDecommissionStatus() == DecommissionFail) &&
(uint8(len(dp.Hosts)) > dp.ReplicaNum || uint8(len(dp.Replicas)) > dp.ReplicaNum) {
excessReplicaDPs = append(excessReplicaDPs, dp)
}
repSizeDiff := 0.0
repSizeSentry := 0.0
repFileCountDiff := uint32(0)
repFileCountSentry := uint32(0)
if len(dp.Replicas) != 0 {
repSizeSentry = float64(dp.Replicas[0].Used)
repFileCountSentry = dp.Replicas[0].FileCount
}
recordReplicaUnavailable := false
for _, replica := range dp.Replicas {
if !recordReplicaUnavailable && replica.Status == proto.Unavailable {
unavailableReplicaDPs = append(unavailableReplicaDPs, dp)
recordReplicaUnavailable = true
}
if dp.IsDoingDecommission() {
continue
}
tempSizeDiff := math.Abs(float64(replica.Used) - repSizeSentry)
if tempSizeDiff > repSizeDiff {
repSizeDiff = tempSizeDiff
}
// take the absolute difference to avoid uint32 underflow when the sentry replica has more files
tempFileCountDiff := replica.FileCount - repFileCountSentry
if replica.FileCount < repFileCountSentry {
tempFileCountDiff = repFileCountSentry - replica.FileCount
}
if tempFileCountDiff > repFileCountDiff {
repFileCountDiff = tempFileCountDiff
}
}
if repSizeDiff > float64(c.cfg.diffReplicaSpaceUsage) {
repUsedSizeDifferDps = append(repUsedSizeDifferDps, dp)
}
if repFileCountDiff > c.cfg.diffReplicaFileCount {
repFileCountDifferDps = append(repFileCountDifferDps, dp)
}
}
}
log.LogInfof("clusterID[%v] lackReplicaDp count:[%v], unavailableReplicaDp count:[%v], "+
"repFileCountDifferDps count[%v], repUsedSizeDifferDps count[%v], "+
"excessReplicaDPs count[%v], noLeaderDPs count[%v]",
c.Name, len(lackReplicaDPs), len(unavailableReplicaDPs),
len(repFileCountDifferDps), len(repUsedSizeDifferDps),
len(excessReplicaDPs), len(noLeaderDPs))
return
}
func (c *Cluster) getDataPartitionByID(partitionID uint64) (dp *DataPartition, err error) {
vols := c.copyVols()
for _, vol := range vols {
if dp, err = vol.getDataPartitionByID(partitionID); err == nil {
return
}
}
err = dataPartitionNotFound(partitionID)
return
}
func (c *Cluster) getMetaPartitionByID(id uint64) (mp *MetaPartition, err error) {
vols := c.copyVols()
for _, vol := range vols {
if mp, err = vol.metaPartition(id); err == nil {
return
}
}
err = metaPartitionNotFound(id)
return
}
func (c *Cluster) putVol(vol *Vol) {
c.volMutex.Lock()
defer c.volMutex.Unlock()
if _, ok := c.vols[vol.Name]; !ok {
c.vols[vol.Name] = vol
}
}
func (c *Cluster) SetVerStrategy(volName string, strategy proto.VolumeVerStrategy, isForce bool) (err error) {
c.volMutex.RLock()
defer c.volMutex.RUnlock()
vol, ok := c.vols[volName]
if !ok {
err = proto.ErrVolNotExists
return
}
if !proto.IsHot(vol.VolType) {
err = fmt.Errorf("vol need be hot one")
return
}
return vol.VersionMgr.SetVerStrategy(strategy, isForce)
}
func (c *Cluster) getVolVer(volName string) (info *proto.VolumeVerInfo, err error) {
c.volMutex.RLock()
defer c.volMutex.RUnlock()
var verSeqPrepare uint64
vol, ok := c.vols[volName]
if !ok {
err = proto.ErrVolNotExists
return
}
if !proto.IsHot(vol.VolType) {
err = fmt.Errorf("vol need be hot one")
return
}
if vol.VersionMgr.enabled && vol.VersionMgr.prepareCommit.prepareInfo != nil {
verSeqPrepare = vol.VersionMgr.prepareCommit.prepareInfo.Ver
}
var pStatus uint8
if vol.VersionMgr.prepareCommit.prepareInfo != nil {
pStatus = vol.VersionMgr.prepareCommit.prepareInfo.Status
}
info = &proto.VolumeVerInfo{
Name: volName,
VerSeq: vol.VersionMgr.verSeq,
VerSeqPrepare: verSeqPrepare,
VerPrepareStatus: pStatus,
Enabled: vol.VersionMgr.enabled,
}
return
}
func (c *Cluster) getVol(volName string) (vol *Vol, err error) {
c.volMutex.RLock()
defer c.volMutex.RUnlock()
vol, ok := c.vols[volName]
if !ok {
err = proto.ErrVolNotExists
}
return
}
func (c *Cluster) deleteVol(name string) {
c.volMutex.Lock()
defer c.volMutex.Unlock()
delete(c.vols, name)
return
}
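// markDeleteVol marks a volume for deletion after verifying the auth key, checking the
// dentry-count threshold (unless force deletion is enabled in the config), and, for cold
// volumes, requiring the used space to be zero unless force is set; the new status is
// persisted through raft.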
func (c *Cluster) markDeleteVol(name, authKey string, force bool) (err error) {
var (
vol *Vol
serverAuthKey string
)
if vol, err = c.getVol(name); err != nil {
log.LogErrorf("action[markDeleteVol] err[%v]", err)
return proto.ErrVolNotExists
}
if !c.cfg.volForceDeletion {
volDentryCount := uint64(0)
mpsCopy := vol.cloneMetaPartitionMap()
for _, mp := range mpsCopy {
// to avoid latency, fetch latest mp dentry count from metanode
c.doLoadMetaPartition(mp)
mpDentryCount := uint64(0)
for _, response := range mp.LoadResponse {
if response.DentryCount > mpDentryCount {
mpDentryCount = response.DentryCount
}
}
volDentryCount += mpDentryCount
}
if volDentryCount > c.cfg.volDeletionDentryThreshold {
return fmt.Errorf("vol %s is not empty ! it's dentry count %d > dentry count deletion threshold %d, deletion not permitted ! ",
vol.Name, volDentryCount, c.cfg.volDeletionDentryThreshold)
}
}
if proto.IsCold(vol.VolType) && vol.totalUsedSpace() > 0 && !force {
return fmt.Errorf("ec-vol can't be deleted if ec used size not equal 0, now(%d)", vol.totalUsedSpace())
}
serverAuthKey = vol.Owner
if !matchKey(serverAuthKey, authKey) {
return proto.ErrVolAuthKeyNotMatch
}
vol.Status = proto.VolStatusMarkDelete
if err = c.syncUpdateVol(vol); err != nil {
vol.Status = proto.VolStatusNormal
return proto.ErrPersistenceByRaft
}
return
}
func (c *Cluster) batchCreatePreLoadDataPartition(vol *Vol, preload *DataPartitionPreLoad) (err error, dps []*DataPartition) {
if proto.IsHot(vol.VolType) {
return fmt.Errorf("vol type is not warm"), nil
}
total := overSoldCap(uint64(preload.preloadCacheCapacity))
reqCreateCount := (total-1)/(util.DefaultDataPartitionSize/util.GB) + 1
for i := 0; i < int(reqCreateCount); i++ {
log.LogInfof("create preload data partition (%v) total (%v)", i, reqCreateCount)
var dp *DataPartition
if dp, err = c.createDataPartition(vol.Name, preload); err != nil {
log.LogErrorf("create preload data partition fail: volume(%v) err(%v)", vol.Name, err)
return err, nil
}
dps = append(dps, dp)
}
return
}
func (c *Cluster) batchCreateDataPartition(vol *Vol, reqCount int, init bool) (err error) {
if !init {
if _, err = vol.needCreateDataPartition(); err != nil {
log.LogWarnf("action[batchCreateDataPartition] create data partition failed, err[%v]", err)
return
}
}
for i := 0; i < reqCount; i++ {
if c.DisableAutoAllocate {
log.LogWarn("disable auto allocate dataPartition")
return fmt.Errorf("cluster is disable auto allocate dataPartition")
}
if vol.Forbidden {
log.LogWarn("disable auto allocate dataPartition by forbidden volume")
return fmt.Errorf("volume is forbidden")
}
if _, err = c.createDataPartition(vol.Name, nil); err != nil {
log.LogErrorf("action[batchCreateDataPartition] after create [%v] data partition,occurred error,err[%v]", i, err)
break
}
}
return
}
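// isFaultDomain decides whether a volume should be placed through the fault domain and,
// the first time domainOn is turned on for the volume, persists the change.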
func (c *Cluster) isFaultDomain(vol *Vol) bool {
var specifyZoneNeedDomain bool
if c.FaultDomain && !vol.crossZone && !c.needFaultDomain {
if value, ok := c.t.zoneMap.Load(vol.zoneName); ok {
if value.(*Zone).status == unavailableZone {
specifyZoneNeedDomain = true
}
}
}
log.LogInfof("action[isFaultDomain] vol [%v] zoname [%v] FaultDomain[%v] need fault domain[%v] vol crosszone[%v] default[%v] specifyZoneNeedDomain[%v] domainOn[%v]",
vol.Name, vol.zoneName, c.FaultDomain, c.needFaultDomain, vol.crossZone, vol.defaultPriority, specifyZoneNeedDomain, vol.domainOn)
domainOn := c.FaultDomain &&
(vol.domainOn ||
(!vol.crossZone && c.needFaultDomain) || specifyZoneNeedDomain ||
(vol.crossZone && (!vol.defaultPriority ||
(vol.defaultPriority && (c.needFaultDomain || len(c.t.domainExcludeZones) <= 1)))))
if !vol.domainOn && domainOn {
vol.domainOn = domainOn
// todo:(leonchang). updateView is used to update the domainOn status in viewCache; a channel may be better here, otherwise a locking issue may happen
// vol.updateViewCache(c)
c.syncUpdateVol(vol)
log.LogInfof("action[isFaultDomain] vol [%v] set domainOn", vol.Name)
}
return vol.domainOn
}
// Synchronously create a data partition.
// 1. Choose one of the available data nodes.
// 2. Assign it a partition ID.
// 3. Communicate with the data node to synchronously create a data partition.
// - If succeeded, replicate the data through raft and persist it to RocksDB.
// - Otherwise, return an error.
func (c *Cluster) createDataPartition(volName string, preload *DataPartitionPreLoad) (dp *DataPartition, err error) {
log.LogInfof("action[createDataPartition] preload [%v]", preload)
var (
vol *Vol
partitionID uint64
targetHosts []string
targetPeers []proto.Peer
wg sync.WaitGroup
isPreload bool
partitionTTL int64
ok bool
)
c.volMutex.RLock()
if vol, ok = c.vols[volName]; !ok {
err = fmt.Errorf("vol %v not exist", volName)
log.LogWarnf("createDataPartition volName %v not found", volName)
c.volMutex.RUnlock()
return
}
c.volMutex.RUnlock()
dpReplicaNum := vol.dpReplicaNum
zoneName := vol.zoneName
if preload != nil {
dpReplicaNum = uint8(preload.preloadReplicaNum)
zoneName = preload.preloadZoneName
isPreload = true
partitionTTL = int64(preload.PreloadCacheTTL)*util.OneDaySec() + time.Now().Unix()
}
if vol, err = c.getVol(volName); err != nil {
return
}
vol.createDpMutex.Lock()
defer vol.createDpMutex.Unlock()
errChannel := make(chan error, dpReplicaNum)
if c.isFaultDomain(vol) {
if targetHosts, targetPeers, err = c.getHostFromDomainZone(vol.domainId, TypeDataPartition, dpReplicaNum); err != nil {
goto errHandler
}
} else {
zoneNum := c.decideZoneNum(vol.crossZone)
if targetHosts, targetPeers, err = c.getHostFromNormalZone(TypeDataPartition, nil, nil, nil,
int(dpReplicaNum), zoneNum, zoneName); err != nil {
goto errHandler
}
}
if partitionID, err = c.idAlloc.allocateDataPartitionID(); err != nil {
goto errHandler
}
dp = newDataPartition(partitionID, dpReplicaNum, volName, vol.ID, proto.GetDpType(vol.VolType, isPreload), partitionTTL)
dp.Hosts = targetHosts
dp.Peers = targetPeers
log.LogInfof("action[createDataPartition] partitionID [%v] get host [%v]", partitionID, targetHosts)
for _, host := range targetHosts {
wg.Add(1)
go func(host string) {
defer func() {
wg.Done()
}()
var diskPath string
if diskPath, err = c.syncCreateDataPartitionToDataNode(host, vol.dataPartitionSize,
dp, dp.Peers, dp.Hosts, proto.NormalCreateDataPartition, dp.PartitionType, false); err != nil {
errChannel <- err
return
}
dp.Lock()
defer dp.Unlock()
if err = dp.afterCreation(host, diskPath, c); err != nil {
errChannel <- err
}
}(host)
}
wg.Wait()
select {
case err = <-errChannel:
for _, host := range targetHosts {
wg.Add(1)
go func(host string) {
defer func() {
wg.Done()
}()
_, err := dp.getReplica(host)
if err != nil {
return
}
task := dp.createTaskToDeleteDataPartition(host)
tasks := make([]*proto.AdminTask, 0)
tasks = append(tasks, task)
c.addDataNodeTasks(tasks)
}(host)
}
wg.Wait()
goto errHandler
default:
dp.total = vol.dataPartitionSize
dp.setReadWrite()
}
if err = c.syncAddDataPartition(dp); err != nil {
goto errHandler
}
vol.dataPartitions.put(dp)
log.LogInfof("action[createDataPartition] success,volName[%v],partitionId[%v], count[%d]", volName, partitionID, len(vol.dataPartitions.partitions))
return
errHandler:
err = fmt.Errorf("action[createDataPartition],clusterID[%v] vol[%v] Err:%v ", c.Name, volName, err.Error())
log.LogError(errors.Stack(err))
Warn(c.Name, err.Error())
return
}
func (c *Cluster) syncCreateDataPartitionToDataNode(host string, size uint64, dp *DataPartition,
peers []proto.Peer, hosts []string, createType int, partitionType int, needRollBack bool) (diskPath string, err error) {
log.LogInfof("action[syncCreateDataPartitionToDataNode] dp [%v] createtype[%v], partitionType[%v]", dp.PartitionID, createType, partitionType)
dataNode, err := c.dataNode(host)
if err != nil {
return
}
task := dp.createTaskToCreateDataPartition(host, size, peers, hosts, createType, partitionType, dataNode.getDecommissionedDisks())
var resp *proto.Packet
if resp, err = dataNode.TaskManager.syncSendAdminTask(task); err != nil {
// data node is not alive or other process error
if needRollBack {
dp.DecommissionNeedRollback = true
c.syncUpdateDataPartition(dp)
}
return
}
return string(resp.Data), nil
}
func (c *Cluster) syncCreateMetaPartitionToMetaNode(host string, mp *MetaPartition) (err error) {
hosts := make([]string, 0)
hosts = append(hosts, host)
tasks := mp.buildNewMetaPartitionTasks(hosts, mp.Peers, mp.volName)
metaNode, err := c.metaNode(host)
if err != nil {
return
}
if _, err = metaNode.Sender.syncSendAdminTask(tasks[0]); err != nil {
return
}
return
}
// decideZoneNum
// if the vol is not cross zone, return 1
// if the vol enables cross zone and the cluster has fewer zones than defaultReplicaNum, return 2
// otherwise, return defaultReplicaNum
func (c *Cluster) decideZoneNum(crossZone bool) (zoneNum int) {
if !crossZone {
return 1
}
var zoneLen int
if c.FaultDomain {
zoneLen = len(c.t.domainExcludeZones)
} else {
zoneLen = c.t.zoneLen()
}
if zoneLen < defaultReplicaNum {
zoneNum = 2
} else {
zoneNum = defaultReplicaNum
}
return zoneNum
}
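// chooseZone2Plus1 picks hosts for a 2- or 3-replica partition from the two candidate zones
// with the most free space: one host from the zone with less space and the remaining replicas
// from the zone with more space.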
func (c *Cluster) chooseZone2Plus1(zones []*Zone, excludeNodeSets []uint64, excludeHosts []string,
nodeType uint32, replicaNum int) (hosts []string, peers []proto.Peer, err error,
) {
if replicaNum < 2 || replicaNum > 3 {
return nil, nil, fmt.Errorf("action[chooseZone2Plus1] replicaNum [%v]", replicaNum)
}
zoneList := make([]*Zone, 2)
if zones[0].getSpaceLeft(nodeType) < zones[1].getSpaceLeft(nodeType) {
zoneList[0] = zones[0]
zoneList[1] = zones[1]
} else {
zoneList[0] = zones[1]
zoneList[1] = zones[0]
}
for i := 2; i < len(zones); i++ {
spaceLeft := zones[i].getSpaceLeft(nodeType)
if spaceLeft > zoneList[0].getSpaceLeft(nodeType) {
if spaceLeft > zoneList[1].getSpaceLeft(nodeType) {
zoneList[1] = zones[i]
} else {
zoneList[0] = zones[i]
}
}
}
log.LogInfof("action[chooseZone2Plus1] type [%v] after check,zone0 [%v] left [%v] zone1 [%v] left [%v]",
nodeType, zoneList[0].name, zoneList[0].getSpaceLeft(nodeType), zoneList[1].name, zoneList[1].getSpaceLeft(nodeType))
num := 1
for _, zone := range zoneList {
selectedHosts, selectedPeers, e := zone.getAvailNodeHosts(nodeType, excludeNodeSets, excludeHosts, num)
if e != nil {
log.LogErrorf("action[getHostFromNormalZone] error [%v]", e)
return nil, nil, e
}
hosts = append(hosts, selectedHosts...)
peers = append(peers, selectedPeers...)
log.LogInfof("action[chooseZone2Plus1] zone [%v] left [%v] get hosts[%v]",
zone.name, zone.getSpaceLeft(nodeType), selectedHosts)
num = replicaNum - num
}
log.LogInfof("action[chooseZone2Plus1] finally get hosts[%v]", hosts)
return hosts, peers, nil
}
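// chooseZoneNormal picks one host per replica, walking the candidate zones round-robin
// starting from the cluster's last used zone index.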
func (c *Cluster) chooseZoneNormal(zones []*Zone, excludeNodeSets []uint64, excludeHosts []string,
nodeType uint32, replicaNum int) (hosts []string, peers []proto.Peer, err error) {
log.LogInfof("action[chooseZoneNormal] zones[%s] nodeType[%d] replicaNum[%d]", printZonesName(zones), nodeType, replicaNum)
c.zoneIdxMux.Lock()
defer c.zoneIdxMux.Unlock()
for i := 0; i < replicaNum; i++ {
zone := zones[c.lastZoneIdxForNode]
c.lastZoneIdxForNode = (c.lastZoneIdxForNode + 1) % len(zones)
selectedHosts, selectedPeers, err := zone.getAvailNodeHosts(nodeType, excludeNodeSets, excludeHosts, 1)
if err != nil {
log.LogErrorf("action[chooseZoneNormal] error [%v]", err)
return nil, nil, err
}
hosts = append(hosts, selectedHosts...)
peers = append(peers, selectedPeers...)
}
return
}
func (c *Cluster) getHostFromNormalZone(nodeType uint32, excludeZones []string, excludeNodeSets []uint64,
excludeHosts []string, replicaNum int,
zoneNum int, specifiedZone string) (hosts []string, peers []proto.Peer, err error,
) {
var zones []*Zone
zones = make([]*Zone, 0)
if replicaNum <= zoneNum {
zoneNum = replicaNum
}
// when creating a vol, if the user specified a zone, we reset zoneNum to 1 so the partition is created in that zone;
// if the specified zone is not writable, we choose a zone randomly
if specifiedZone != "" {
if err = c.checkNormalZoneName(specifiedZone); err != nil {
Warn(c.Name, fmt.Sprintf("cluster[%v],specified zone[%v]is found", c.Name, specifiedZone))
return
}
zoneList := strings.Split(specifiedZone, ",")
for i := 0; i < len(zoneList); i++ {
var zone *Zone
if zone, err = c.t.getZone(zoneList[i]); err != nil {
Warn(c.Name, fmt.Sprintf("cluster[%v],specified zone[%v]is found", c.Name, specifiedZone))
return
}
zones = append(zones, zone)
}
} else {
if nodeType == TypeDataPartition {
if zones, err = c.t.allocZonesForDataNode(zoneNum, replicaNum, excludeZones); err != nil {
return
}
} else {
if zones, err = c.t.allocZonesForMetaNode(zoneNum, replicaNum, excludeZones); err != nil {
return
}
}
}
if len(zones) == 1 {
log.LogInfof("action[getHostFromNormalZone] zones [%v]", zones[0].name)
if hosts, peers, err = zones[0].getAvailNodeHosts(nodeType, excludeNodeSets, excludeHosts, replicaNum); err != nil {
log.LogErrorf("action[getHostFromNormalZone],err[%v]", err)
return
}
goto result
}
hosts = make([]string, 0)
peers = make([]proto.Peer, 0)
if excludeHosts == nil {
excludeHosts = make([]string, 0)
}
if c.cfg.DefaultNormalZoneCnt == defaultNormalCrossZoneCnt && len(zones) >= defaultNormalCrossZoneCnt {
if hosts, peers, err = c.chooseZoneNormal(zones, excludeNodeSets, excludeHosts, nodeType, replicaNum); err != nil {
return
}
} else {
if hosts, peers, err = c.chooseZone2Plus1(zones, excludeNodeSets, excludeHosts, nodeType, replicaNum); err != nil {
return
}
}
result:
log.LogInfof("action[getHostFromNormalZone] replicaNum[%v],zoneNum[%v],selectedZones[%v],hosts[%v]", replicaNum, zoneNum, len(zones), hosts)
if len(hosts) != replicaNum {
log.LogErrorf("action[getHostFromNormalZone] replicaNum[%v],zoneNum[%v],selectedZones[%v],hosts[%v]", replicaNum, zoneNum, len(zones), hosts)
return nil, nil, errors.Trace(proto.ErrNoDataNodeToCreateDataPartition, "hosts len[%v],replicaNum[%v],zoneNum[%v],selectedZones[%v]",
len(hosts), replicaNum, zoneNum, len(zones))
}
return
}
func (c *Cluster) dataNode(addr string) (dataNode *DataNode, err error) {
value, ok := c.dataNodes.Load(addr)
if !ok {
if !c.IsLeader() {
err = errors.New("meta data for data nodes is cleared due to leader change!")
} else {
err = errors.Trace(dataNodeNotFound(addr), "%v not found", addr)
}
return
}
dataNode = value.(*DataNode)
return
}
func (c *Cluster) metaNode(addr string) (metaNode *MetaNode, err error) {
value, ok := c.metaNodes.Load(addr)
if !ok {
if !c.IsLeader() {
err = errors.New("meta data for meta nodes is cleared due to leader change!")
} else {
err = errors.Trace(metaNodeNotFound(addr), "%v not found", addr)
}
return
}
metaNode = value.(*MetaNode)
return
}
func (c *Cluster) lcNode(addr string) (lcNode *LcNode, err error) {
value, ok := c.lcNodes.Load(addr)
if !ok {
err = errors.Trace(lcNodeNotFound(addr), "%v not found", addr)
return
}
lcNode = value.(*LcNode)
return
}
func (c *Cluster) getAllDataPartitionByDataNode(addr string) (partitions []*DataPartition) {
partitions = make([]*DataPartition, 0)
safeVols := c.allVols()
for _, vol := range safeVols {
for _, dp := range vol.dataPartitions.partitions {
for _, host := range dp.Hosts {
if host == addr {
partitions = append(partitions, dp)
break
}
}
}
}
return
}
func (c *Cluster) getAllMetaPartitionByMetaNode(addr string) (partitions []*MetaPartition) {
partitions = make([]*MetaPartition, 0)
safeVols := c.allVols()
for _, vol := range safeVols {
vol.mpsLock.RLock()
for _, mp := range vol.MetaPartitions {
for _, host := range mp.Hosts {
if host == addr {
partitions = append(partitions, mp)
break
}
}
}
vol.mpsLock.RUnlock()
}
return
}
func (c *Cluster) getAllDataPartitionIDByDatanode(addr string) (partitionIDs []uint64) {
partitionIDs = make([]uint64, 0)
safeVols := c.allVols()
for _, vol := range safeVols {
for _, dp := range vol.dataPartitions.partitions {
for _, host := range dp.Hosts {
if host == addr {
partitionIDs = append(partitionIDs, dp.PartitionID)
break
}
}
}
}
return
}
func (c *Cluster) getAllMetaPartitionIDByMetaNode(addr string) (partitionIDs []uint64) {
partitionIDs = make([]uint64, 0)
safeVols := c.allVols()
for _, vol := range safeVols {
for _, mp := range vol.MetaPartitions {
vol.mpsLock.RLock()
for _, host := range mp.Hosts {
if host == addr {
partitionIDs = append(partitionIDs, mp.PartitionID)
break
}
}
vol.mpsLock.RUnlock()
}
}
return
}
func (c *Cluster) getAllMetaPartitionsByMetaNode(addr string) (partitions []*MetaPartition) {
partitions = make([]*MetaPartition, 0)
safeVols := c.allVols()
for _, vol := range safeVols {
for _, mp := range vol.MetaPartitions {
vol.mpsLock.RLock()
for _, host := range mp.Hosts {
if host == addr {
partitions = append(partitions, mp)
break
}
}
vol.mpsLock.RUnlock()
}
}
return
}
func (c *Cluster) decommissionDataNodeCancel(dataNode *DataNode) (err error, failed []uint64) {
if !dataNode.CanBePaused() {
err = fmt.Errorf("action[decommissionDataNodeCancel] dataNode[%v] status[%v] donot support cancel",
dataNode.Addr, dataNode.GetDecommissionStatus())
return
}
dataNode.SetDecommissionStatus(DecommissionPause)
// may confuse the progress of newly allocated dps
dataNode.ToBeOffline = false
dataNode.DecommissionCompleteTime = time.Now().Unix()
if err = c.syncUpdateDataNode(dataNode); err != nil {
log.LogErrorf("action[decommissionDataNodeCancel] dataNode[%v] sync update failed[ %v]",
dataNode.Addr, err.Error())
return
}
for _, disk := range dataNode.DecommissionDiskList {
key := fmt.Sprintf("%s_%s", dataNode.Addr, disk)
if value, ok := c.DecommissionDisks.Load(key); ok {
dd := value.(*DecommissionDisk)
_, dps := c.decommissionDiskCancel(dd)
log.LogInfof("action[decommissionDataNodeCancel] dataNode [%s] pause disk %v with failed dp[%v]",
dataNode.Addr, dd.GenerateKey(), dps)
failed = append(failed, dps...)
}
}
log.LogDebugf("action[decommissionDataNodeCancel] dataNode[%v] cancel decommission, offline %v with failed dp[%v]",
dataNode.Addr, dataNode.ToBeOffline, failed)
return
}
func (c *Cluster) decommissionDiskCancel(disk *DecommissionDisk) (err error, failed []uint64) {
if !disk.CanBePaused() {
err = fmt.Errorf("action[decommissionDiskCancel] dataNode[%v] disk[%s] status[%v] donot support cancel",
disk.SrcAddr, disk.SrcAddr, disk.GetDecommissionStatus())
return
}
disk.SetDecommissionStatus(DecommissionPause)
// disk.DecommissionDpTotal = 0
if err = c.syncUpdateDecommissionDisk(disk); err != nil {
log.LogErrorf("action[decommissionDiskCancel] dataNode[%v] disk[%s] sync update failed[ %v]",
disk.SrcAddr, disk.SrcAddr, err.Error())
return
}
partitions := disk.GetLatestDecommissionDP(c)
dpIds := make([]uint64, 0)
for _, dp := range partitions {
if !dp.PauseDecommission(c) {
failed = append(failed, dp.PartitionID)
}
dpIds = append(dpIds, dp.PartitionID)
}
log.LogDebugf("action[decommissionDiskCancel] dataNode[%v] disk[%s] cancel decommission dps[%v] with failed [%v]",
disk.SrcAddr, disk.SrcAddr, dpIds, failed)
return
}
func (c *Cluster) migrateDataNode(srcAddr, targetAddr string, raftForce bool, limit int) (err error) {
msg := fmt.Sprintf("action[migrateDataNode], src(%s) migrate to target(%s) raftForcs(%v) limit(%v)",
srcAddr, targetAddr, raftForce, limit)
log.LogWarn(msg)
srcNode, err := c.dataNode(srcAddr)
if err != nil {
return
}
if !srcNode.canMarkDecommission() {
err = fmt.Errorf("migrate src(%v) is still on working, please wait,check or cancel if abnormal:%v",
srcAddr, srcNode.GetDecommissionStatus())
log.LogWarnf("action[migrateDataNode] %v", err)
return
}
srcNode.markDecommission(targetAddr, raftForce, limit)
c.syncUpdateDataNode(srcNode)
log.LogInfof("action[migrateDataNode] %v return now", srcAddr)
return
}
func (c *Cluster) decommissionDataNode(dataNode *DataNode, force bool) (err error) {
return c.migrateDataNode(dataNode.Addr, "", false, 0)
}
func (c *Cluster) delDataNodeFromCache(dataNode *DataNode) {
c.dataNodes.Delete(dataNode.Addr)
c.t.deleteDataNode(dataNode)
go dataNode.clean()
}
func (c *Cluster) delDecommissionDiskFromCache(dd *DecommissionDisk) {
c.DecommissionDisks.Delete(dd.GenerateKey())
}
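// decommissionSingleDp migrates a special (one- or two-replica) data partition step by step:
// add the new replica, wait for it to finish repairing, make sure a leader is elected, then
// remove the old replica. Each step is persisted so the process can be paused or resumed.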
func (c *Cluster) decommissionSingleDp(dp *DataPartition, newAddr, offlineAddr string) (err error) {
var (
dataNode *DataNode
decommContinue = false
newReplica *DataReplica
)
ticker := time.NewTicker(time.Second * time.Duration(c.cfg.IntervalToCheckDataPartition))
defer func() {
ticker.Stop()
}()
// 1. add new replica first
if dp.GetSpecialReplicaDecommissionStep() == SpecialDecommissionEnter {
if err = c.addDataReplica(dp, newAddr); err != nil {
err = fmt.Errorf("action[decommissionSingleDp] dp %v addDataReplica fail err %v", dp.PartitionID, err)
goto ERR
}
// if addDataReplica succeeds, the dp can be added to BadDataPartitionIds
dp.SetSpecialReplicaDecommissionStep(SpecialDecommissionWaitAddRes)
dp.SetDecommissionStatus(DecommissionRunning)
dp.isRecover = true
dp.Status = proto.ReadOnly
dp.RecoverStartTime = time.Now()
c.syncUpdateDataPartition(dp)
c.putBadDataPartitionIDsByDiskPath(dp.DecommissionSrcDiskPath, dp.DecommissionSrcAddr, dp.PartitionID)
log.LogWarnf("action[decommissionSingleDp] dp %v start wait add replica %v", dp.PartitionID, newAddr)
}
// 2. wait for repair
if dp.GetSpecialReplicaDecommissionStep() == SpecialDecommissionWaitAddRes {
for {
select {
case decommContinue = <-dp.SpecialReplicaDecommissionStop: //
if !decommContinue {
err = fmt.Errorf("action[decommissionSingleDp] dp %v wait addDataReplica is stopped", dp.PartitionID)
dp.SetDecommissionStatus(DecommissionPause)
log.LogWarnf("action[decommissionSingleDp] dp %v err:%v", dp.PartitionID, err)
goto ERR
}
case <-ticker.C:
if !c.partition.IsRaftLeader() {
err = fmt.Errorf("action[decommissionSingleDp] dp %v wait addDataReplica result addr %v master leader changed", dp.PartitionID, newAddr)
log.LogWarnf("action[decommissionSingleDp] dp %v err:%v", dp.PartitionID, err)
goto ERR
}
}
// check new replica status
liveReplicas := dp.getLiveReplicasFromHosts(c.cfg.DataPartitionTimeOutSec)
newReplica, err = dp.getReplica(newAddr)
if err != nil {
err = fmt.Errorf("action[decommissionSingleDp] dp %v replica %v not found",
dp.PartitionID, newAddr)
log.LogWarnf("action[decommissionSingleDp] dp %v err:%v", dp.PartitionID, err)
goto ERR
}
if len(liveReplicas) == int(dp.ReplicaNum+1) {
log.LogInfof("action[decommissionSingleDp] dp %v replica[%v] status %v",
dp.PartitionID, newReplica.Addr, newReplica.Status)
if newReplica.isRepairing() { // wait for repair
if time.Since(dp.RecoverStartTime) > c.GetDecommissionDataPartitionRecoverTimeOut() {
err = fmt.Errorf("action[decommissionSingleDp] dp %v new replica %v repair time out",
dp.PartitionID, newAddr)
dp.DecommissionNeedRollback = true
newReplica.Status = proto.Unavailable // remove from data partition check
log.LogWarnf("action[decommissionSingleDp] dp %v err:%v", dp.PartitionID, err)
goto ERR
}
continue
} else if newReplica.isUnavailable() { // repair failed,need rollback
err = fmt.Errorf("action[decommissionSingleDp] dp %v new replica %v is Unavailable",
dp.PartitionID, newAddr)
dp.DecommissionNeedRollback = true
log.LogWarnf("action[decommissionSingleDp] dp %v err:%v", dp.PartitionID, err)
goto ERR
} else {
dp.SetSpecialReplicaDecommissionStep(SpecialDecommissionWaitAddResFin)
c.syncUpdateDataPartition(dp)
log.LogInfof("action[decommissionSingleDp] dp %v add replica success", dp.PartitionID)
break
}
}
}
}
// 3. wait for the leader to be elected
if dp.GetSpecialReplicaDecommissionStep() == SpecialDecommissionWaitAddResFin {
if !c.partition.IsRaftLeader() {
err = fmt.Errorf("action[decommissionSingleDp] dp %v wait addDataReplica result addr %v master leader changed", dp.PartitionID, newAddr)
goto ERR
}
if dataNode, err = c.dataNode(newAddr); err != nil {
err = fmt.Errorf("action[decommissionSingleDp] dp %v get offlineAddr %v err %v", dp.PartitionID, newAddr, err)
goto ERR
}
times := 0
for {
// if leader is selected
if dp.getLeaderAddr() != "" {
break
}
log.LogInfof("action[decommissionSingleDp] dp %v try tryToChangeLeader addr %v", dp.PartitionID, newAddr)
if err = dp.tryToChangeLeader(c, dataNode); err != nil {
log.LogWarnf("action[decommissionSingleDp] dp %v ChangeLeader to addr %v err %v", dp.PartitionID, newAddr, err)
}
select {
case <-ticker.C:
if !c.partition.IsRaftLeader() {
err = fmt.Errorf("action[decommissionSingleDp] dp %v wait tryToChangeLeader addr %v master leader changed", dp.PartitionID, newAddr)
goto ERR
}
times++
if times == 60 {
err = fmt.Errorf("action[decommissionSingleDp] dp %v wait leader selection new addr %v timeout", dp.PartitionID, newAddr)
goto ERR
}
case decommContinue = <-dp.SpecialReplicaDecommissionStop:
if !decommContinue {
err = fmt.Errorf("action[decommissionSingleDp] dp %v wait for leader selection is stopped", dp.PartitionID)
dp.SetDecommissionStatus(DecommissionPause)
goto ERR
}
}
}
log.LogInfof("action[decommissionSingleDp] dp %v try removeDataReplica %v", dp.PartitionID, offlineAddr)
dp.SetSpecialReplicaDecommissionStep(SpecialDecommissionRemoveOld)
c.syncUpdateDataPartition(dp)
}
// 4. delete the offline replica
if dp.GetSpecialReplicaDecommissionStep() == SpecialDecommissionRemoveOld {
if err = c.removeDataReplica(dp, offlineAddr, false, false); err != nil {
err = fmt.Errorf("action[decommissionSingleDp] dp %v err %v", dp.PartitionID, err)
goto ERR
}
dp.SetSpecialReplicaDecommissionStep(SpecialDecommissionInitial)
dp.SetDecommissionStatus(DecommissionSuccess)
c.syncUpdateDataPartition(dp)
log.LogInfof("action[decommissionSingleDp] dp %v success", dp.PartitionID)
return
}
log.LogWarnf("action[decommissionSingleDp] dp %v unexpect end: %v", dp.PartitionID, dp.GetSpecialReplicaDecommissionStep())
return nil
ERR:
log.LogWarnf("action[decommissionSingleDp] dp %v err:%v", dp.PartitionID, err)
return err
}
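// autoAddDataReplica adds one replica to a data partition that is short of hosts. Special-replica,
// non-normal and fault-domain partitions are skipped; the target host is chosen from another zone
// for cross-zone volumes, otherwise from the partition's own node set.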
func (c *Cluster) autoAddDataReplica(dp *DataPartition) (success bool, err error) {
var (
targetHosts []string
newAddr string
vol *Vol
zone *Zone
ns *nodeSet
)
success = false
dp.RLock()
// special replica count is not supported
if dp.isSpecialReplicaCnt() {
dp.RUnlock()
return
}
dp.RUnlock()
// non-normal data partitions are not supported
if !proto.IsNormalDp(dp.PartitionType) {
return
}
var ok bool
if vol, ok = c.vols[dp.VolName]; !ok {
log.LogWarnf("action[autoAddDataReplica] clusterID[%v] vol[%v] partitionID[%v] vol not exist, PersistenceHosts:[%v]",
c.Name, dp.VolName, dp.PartitionID, dp.Hosts)
return
}
// fault-domain volumes are not supported
if c.isFaultDomain(vol) {
return
}
if vol.crossZone {
zones := dp.getZones()
if targetHosts, _, err = c.getHostFromNormalZone(TypeDataPartition, zones, nil, dp.Hosts, 1, 1, ""); err != nil {
goto errHandler
}
} else {
if zone, err = c.t.getZone(vol.zoneName); err != nil {
log.LogWarnf("action[autoAddDataReplica] clusterID[%v] vol[%v] partitionID[%v] zone not exist, PersistenceHosts:[%v]",
c.Name, dp.VolName, dp.PartitionID, dp.Hosts)
return
}
nodeSets := dp.getNodeSets()
if len(nodeSets) != 1 {
log.LogWarnf("action[autoAddDataReplica] clusterID[%v] vol[%v] partitionID[%v] the number of nodeSets is not one, PersistenceHosts:[%v]",
c.Name, dp.VolName, dp.PartitionID, dp.Hosts)
return
}
if ns, err = zone.getNodeSet(nodeSets[0]); err != nil {
goto errHandler
}
if targetHosts, _, err = ns.getAvailDataNodeHosts(dp.Hosts, 1); err != nil {
goto errHandler
}
}
newAddr = targetHosts[0]
if err = c.addDataReplica(dp, newAddr); err != nil {
goto errHandler
}
dp.Status = proto.ReadOnly
dp.isRecover = true
c.putBadDataPartitionIDs(nil, newAddr, dp.PartitionID)
dp.RLock()
c.syncUpdateDataPartition(dp)
dp.RUnlock()
log.LogInfof("action[autoAddDataReplica] clusterID[%v] vol[%v] partitionID[%v] auto add data replica success, newReplicaHost[%v], PersistenceHosts:[%v]",
c.Name, dp.VolName, dp.PartitionID, newAddr, dp.Hosts)
success = true
return
errHandler:
if err != nil {
err = fmt.Errorf("clusterID[%v] vol[%v] partitionID[%v], err[%v]", c.Name, dp.VolName, dp.PartitionID, err)
log.LogErrorf("action[autoAddDataReplica] err %v", err)
}
return
}
// Decommission a data partition.
// 1. Check if we can decommission a data partition. In the following cases, we are not allowed to do so:
// - (a) a replica is not in the latest host list;
// - (b) a replica has already been taken offline;
// - (c) the remaining number of replicas is less than the majority
// 2. Choose a new data node.
// 3. Synchronously decommission the data partition.
// 4. Synchronously create a new data partition.
// 5. Set the data partition as readOnly.
// 6. Persist the new host list.
func (c *Cluster) migrateDataPartition(srcAddr, targetAddr string, dp *DataPartition, raftForce bool, errMsg string) (err error) {
var (
targetHosts []string
newAddr string
msg string
dataNode *DataNode
zone *Zone
replica *DataReplica
ns *nodeSet
excludeNodeSets []uint64
zones []string
)
log.LogDebugf("[migrateDataPartition] src %v target %v raftForce %v", srcAddr, targetAddr, raftForce)
dp.RLock()
if ok := dp.hasHost(srcAddr); !ok {
dp.RUnlock()
return
}
if dp.isSpecialReplicaCnt() {
if dp.GetSpecialReplicaDecommissionStep() >= SpecialDecommissionInitial {
err = fmt.Errorf("volume [%v] dp [%v] is on decommission", dp.VolName, dp.PartitionID)
log.LogErrorf("action[decommissionDataPartition][%v] ", err)
dp.RUnlock()
return
}
dp.SetSpecialReplicaDecommissionStep(SpecialDecommissionInitial)
}
replica, _ = dp.getReplica(srcAddr)
dp.RUnlock()
// delete it directly if it is not a normal data partition
if !proto.IsNormalDp(dp.PartitionType) {
c.vols[dp.VolName].deleteDataPartition(c, dp)
return
}
if err = c.validateDecommissionDataPartition(dp, srcAddr); err != nil {
goto errHandler
}
if dataNode, err = c.dataNode(srcAddr); err != nil {
goto errHandler
}
if dataNode.ZoneName == "" {
err = fmt.Errorf("dataNode[%v] zone is nil", dataNode.Addr)
goto errHandler
}
if zone, err = c.t.getZone(dataNode.ZoneName); err != nil {
goto errHandler
}
if ns, err = zone.getNodeSet(dataNode.NodeSetID); err != nil {
goto errHandler
}
if targetAddr != "" {
targetHosts = []string{targetAddr}
} else if targetHosts, _, err = ns.getAvailDataNodeHosts(dp.Hosts, 1); err != nil {
if _, ok := c.vols[dp.VolName]; !ok {
log.LogWarnf("clusterID[%v] partitionID:%v on node:%v offline failed,PersistenceHosts:[%v]",
c.Name, dp.PartitionID, srcAddr, dp.Hosts)
goto errHandler
}
if c.isFaultDomain(c.vols[dp.VolName]) {
log.LogErrorf("clusterID[%v] partitionID:%v on node:%v is banlance zone,PersistenceHosts:[%v]",
c.Name, dp.PartitionID, srcAddr, dp.Hosts)
goto errHandler
}
// select data nodes from another node set in the same zone
excludeNodeSets = append(excludeNodeSets, ns.ID)
if targetHosts, _, err = zone.getAvailNodeHosts(TypeDataPartition, excludeNodeSets, dp.Hosts, 1); err != nil {
// select data nodes from another zone
zones = dp.getLiveZones(srcAddr)
var excludeZone []string
if len(zones) == 0 {
excludeZone = append(excludeZone, zone.name)
} else {
excludeZone = append(excludeZone, zones[0])
}
if targetHosts, _, err = c.getHostFromNormalZone(TypeDataPartition, excludeZone, excludeNodeSets, dp.Hosts, 1, 1, ""); err != nil {
goto errHandler
}
}
}
newAddr = targetHosts[0]
err = c.updateDataNodeSize(newAddr, dp)
if err != nil {
log.LogErrorf("action[migrateDataPartition] target addr can't be writable, add %s %s", newAddr, err.Error())
return
}
defer func() {
if err != nil {
c.returnDataSize(newAddr, dp)
}
}()
// for special replica partitions, add the new replica and wait for repair before removing the old one
if dp.ReplicaNum == 1 || (dp.ReplicaNum == 2 && (dp.ReplicaNum == c.vols[dp.VolName].dpReplicaNum) && !raftForce) {
dp.Status = proto.ReadOnly
dp.isRecover = true
c.putBadDataPartitionIDs(replica, srcAddr, dp.PartitionID)
if err = c.decommissionSingleDp(dp, newAddr, srcAddr); err != nil {
goto errHandler
}
} else {
if err = c.removeDataReplica(dp, srcAddr, false, raftForce); err != nil {
goto errHandler
}
if err = c.addDataReplica(dp, newAddr); err != nil {
goto errHandler
}
dp.Status = proto.ReadOnly
dp.isRecover = true
c.putBadDataPartitionIDs(replica, srcAddr, dp.PartitionID)
}
log.LogDebugf("[migrateDataPartition] src %v target %v raftForce %v", srcAddr, targetAddr, raftForce)
dp.RLock()
c.syncUpdateDataPartition(dp)
dp.RUnlock()
log.LogWarnf("[migrateDataPartition] clusterID[%v] partitionID:%v on node:%v offline success,newHost[%v],PersistenceHosts:[%v]",
c.Name, dp.PartitionID, srcAddr, newAddr, dp.Hosts)
dp.SetSpecialReplicaDecommissionStep(SpecialDecommissionInitial)
return
errHandler:
if dp.isSpecialReplicaCnt() {
if dp.GetSpecialReplicaDecommissionStep() == SpecialDecommissionEnter {
dp.SetSpecialReplicaDecommissionStep(SpecialDecommissionInitial)
}
}
msg = fmt.Sprintf(errMsg+" clusterID[%v] partitionID:%v on Node:%v "+
"Then Fix It on newHost:%v Err:%v , PersistenceHosts:%v ",
c.Name, dp.PartitionID, srcAddr, newAddr, err, dp.Hosts)
if err != nil {
Warn(c.Name, msg)
err = fmt.Errorf("vol[%v],partition[%v],err[%v]", dp.VolName, dp.PartitionID, err)
log.LogErrorf("actin[decommissionDataPartition] err %v", err)
}
return
}
// Decommission a data partition.
// 1. Check if we can decommission a data partition. In the following cases, we are not allowed to do so:
// - (a) a replica is not in the latest host list;
// - (b) a replica has already been taken offline;
// - (c) the remaining number of replicas is less than the majority
// 2. Choose a new data node.
// 3. Synchronously decommission the data partition.
// 4. Synchronously create a new data partition.
// 5. Set the data partition as readOnly.
// 6. Persist the new host list.
func (c *Cluster) decommissionDataPartition(offlineAddr string, dp *DataPartition, raftForce bool, errMsg string) (err error) {
return c.migrateDataPartition(offlineAddr, "", dp, raftForce, errMsg)
}
func (c *Cluster) validateDecommissionDataPartition(dp *DataPartition, offlineAddr string) (err error) {
dp.RLock()
defer dp.RUnlock()
var vol *Vol
if vol, err = c.getVol(dp.VolName); err != nil {
log.LogInfof("action[validateDecommissionDataPartition] dp vol %v dp %v err %v", dp.VolName, dp.PartitionID, err)
return
}
if err = dp.hasMissingOneReplica(offlineAddr, int(vol.dpReplicaNum)); err != nil {
log.LogInfof("action[validateDecommissionDataPartition] dp vol %v dp %v err %v", dp.VolName, dp.PartitionID, err)
return
}
// check whether the partition can be taken offline
if err = dp.canBeOffLine(offlineAddr); err != nil {
log.LogInfof("action[validateDecommissionDataPartition] dp vol %v dp %v err %v", dp.VolName, dp.PartitionID, err)
return
}
if dp.isRecover && !dp.activeUsedSimilar() {
err = fmt.Errorf("vol[%v],data partition[%v] is recovering,[%v] can't be decommissioned", vol.Name, dp.PartitionID, offlineAddr)
log.LogInfof("action[validateDecommissionDataPartition] dp vol %v dp %v err %v", dp.VolName, dp.PartitionID, err)
return
}
log.LogInfof("action[validateDecommissionDataPartition] dp vol %v dp %v looks fine!", dp.VolName, dp.PartitionID)
return
}
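// addDataReplica adds a replica of the data partition on addr: the node is first added as a
// raft member of the partition, and the data replica is then created on that node.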
func (c *Cluster) addDataReplica(dp *DataPartition, addr string) (err error) {
defer func() {
if err != nil {
log.LogErrorf("action[addDataReplica],vol[%v],dp %v ,err[%v]", dp.VolName, dp.PartitionID, err)
} else {
log.LogInfof("action[addDataReplica] dp %v add replica dst addr %v success!", dp.PartitionID, addr)
}
}()
log.LogInfof("action[addDataReplica] dp %v try add replica dst addr %v try add raft member", dp.PartitionID, addr)
dp.addReplicaMutex.Lock()
defer dp.addReplicaMutex.Unlock()
dataNode, err := c.dataNode(addr)
if err != nil {
return
}
addPeer := proto.Peer{ID: dataNode.ID, Addr: addr}
if !proto.IsNormalDp(dp.PartitionType) {
return fmt.Errorf("action[addDataReplica] [%d] is not normal dp, not support add or delete replica", dp.PartitionID)
}
log.LogInfof("action[addDataReplica] dp %v dst addr %v try add raft member, node id %v", dp.PartitionID, addr, dataNode.ID)
if err = c.addDataPartitionRaftMember(dp, addPeer); err != nil {
log.LogWarnf("action[addDataReplica] dp %v addr %v try add raft member err [%v]", dp.PartitionID, addr, err)
return
}
log.LogInfof("action[addDataReplica] dp %v addr %v try create data replica", dp.PartitionID, addr)
if err = c.createDataReplica(dp, addPeer); err != nil {
log.LogWarnf("action[addDataReplica] dp %v addr %v createDataReplica err [%v]", dp.PartitionID, addr, err)
return
}
return
}
// deduct the leader replica's used size from the datanode's available space
func (c *Cluster) updateDataNodeSize(addr string, dp *DataPartition) error {
leaderSize := dp.Replicas[0].Used
dataNode, err := c.dataNode(addr)
if err != nil {
return err
}
dataNode.Lock()
defer dataNode.Unlock()
if dataNode.AvailableSpace < 10*util.GB {
return fmt.Errorf("new datanode %s is not writable %d", addr, dataNode.AvailableSpace)
}
dataNode.LastUpdateTime = time.Now()
if dataNode.AvailableSpace < leaderSize {
dataNode.AvailableSpace = 0
return nil
}
dataNode.AvailableSpace -= leaderSize
return nil
}
func (c *Cluster) returnDataSize(addr string, dp *DataPartition) {
leaderSize := dp.Replicas[0].Used
dataNode, err := c.dataNode(addr)
if err != nil {
return
}
dataNode.Lock()
defer dataNode.Unlock()
log.LogWarnf("returnDataSize after error, addr %s, ava %d, leader %d", addr, dataNode.AvailableSpace, leaderSize)
dataNode.LastUpdateTime = time.Now()
dataNode.AvailableSpace += leaderSize
}
func (c *Cluster) buildAddDataPartitionRaftMemberTaskAndSyncSendTask(dp *DataPartition, addPeer proto.Peer, leaderAddr string) (resp *proto.Packet, err error) {
log.LogInfof("action[buildAddDataPartitionRaftMemberTaskAndSyncSendTask] add peer [%v] start", addPeer)
defer func() {
var resultCode uint8
if resp != nil {
resultCode = resp.ResultCode
}
if err != nil {
log.LogErrorf("vol[%v],data partition[%v],resultCode[%v],err[%v]", dp.VolName, dp.PartitionID, resultCode, err)
} else {
log.LogWarnf("vol[%v],data partition[%v],resultCode[%v],err[%v]", dp.VolName, dp.PartitionID, resultCode, err)
}
}()
task, err := dp.createTaskToAddRaftMember(addPeer, leaderAddr)
if err != nil {
return
}
leaderDataNode, err := c.dataNode(leaderAddr)
if err != nil {
return
}
if resp, err = leaderDataNode.TaskManager.syncSendAdminTask(task); err != nil {
return
}
log.LogInfof("action[buildAddDataPartitionRaftMemberTaskAndSyncSendTask] add peer [%v] finished", addPeer)
return
}
func (c *Cluster) addDataPartitionRaftMember(dp *DataPartition, addPeer proto.Peer) (err error) {
var (
candidateAddrs []string
leaderAddr string
)
if leaderAddr, candidateAddrs, err = dp.prepareAddRaftMember(addPeer); err != nil {
// the member may have already been added before (the master has updated the hosts)
return nil
}
dp.Lock()
oldHosts := make([]string, len(dp.Hosts))
copy(oldHosts, dp.Hosts)
oldPeers := make([]proto.Peer, len(dp.Peers))
copy(oldPeers, dp.Peers)
dp.Hosts = append(dp.Hosts, addPeer.Addr)
dp.Peers = append(dp.Peers, addPeer)
dp.Unlock()
// send the task to the leader addr first; if a retry is needed, send it to the other addrs
for index, host := range candidateAddrs {
if leaderAddr == "" && len(candidateAddrs) < int(dp.ReplicaNum) {
time.Sleep(retrySendSyncTaskInternal)
}
_, err = c.buildAddDataPartitionRaftMemberTaskAndSyncSendTask(dp, addPeer, host)
if err == nil {
break
}
if index < len(candidateAddrs)-1 {
time.Sleep(retrySendSyncTaskInternal)
}
}
dp.Lock()
defer dp.Unlock()
if err != nil {
dp.Hosts = oldHosts
dp.Peers = oldPeers
return
}
log.LogInfof("action[addDataPartitionRaftMember] try host [%v] to [%v] peers [%v] to [%v]",
dp.Hosts, dp.Hosts, dp.Peers, dp.Peers)
if err = dp.update("addDataPartitionRaftMember", dp.VolName, dp.Peers, dp.Hosts, c); err != nil {
dp.Hosts = oldHosts
dp.Peers = oldPeers
return
}
return
}
func (c *Cluster) createDataReplica(dp *DataPartition, addPeer proto.Peer) (err error) {
vol, err := c.getVol(dp.VolName)
if err != nil {
return
}
dp.RLock()
hosts := make([]string, len(dp.Hosts))
copy(hosts, dp.Hosts)
peers := make([]proto.Peer, len(dp.Peers))
copy(peers, dp.Peers)
dp.RUnlock()
diskPath, err := c.syncCreateDataPartitionToDataNode(addPeer.Addr, vol.dataPartitionSize,
dp, peers, hosts, proto.DecommissionedCreateDataPartition, dp.PartitionType, true)
if err != nil {
return
}
dp.Lock()
defer dp.Unlock()
if err = dp.afterCreation(addPeer.Addr, diskPath, c); err != nil {
return
}
if err = dp.update("createDataReplica", dp.VolName, dp.Peers, dp.Hosts, c); err != nil {
return
}
return
}
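// removeDataReplica removes the replica on addr from a normal data partition: it removes the
// raft member, drops the host from the partition's metadata, deletes the replica on the data
// node, and transfers leadership if the removed node was still the leader.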
func (c *Cluster) removeDataReplica(dp *DataPartition, addr string, validate bool, raftForceDel bool) (err error) {
defer func() {
if err != nil {
log.LogErrorf("action[removeDataReplica],vol[%v],data partition[%v],err[%v]", dp.VolName, dp.PartitionID, err)
}
}()
log.LogInfof("action[removeDataReplica] dp %v try remove replica addr [%v]", dp.PartitionID, addr)
// validate is set to true only in API calls
if validate && !raftForceDel {
if err = c.validateDecommissionDataPartition(dp, addr); err != nil {
return
}
}
dataNode, err := c.dataNode(addr)
if err != nil {
return
}
if !proto.IsNormalDp(dp.PartitionType) {
return fmt.Errorf("[%d] is not normal dp, not support add or delete replica", dp.PartitionID)
}
removePeer := proto.Peer{ID: dataNode.ID, Addr: addr}
if err = c.removeDataPartitionRaftMember(dp, removePeer, raftForceDel); err != nil {
return
}
if err = c.removeHostMember(dp, removePeer); err != nil {
return
}
if err = c.deleteDataReplica(dp, dataNode); err != nil {
return
}
// the leader may already have changed during the last decommission
leaderAddr := dp.getLeaderAddrWithLock()
if leaderAddr != addr {
return
}
if dataNode, err = c.dataNode(dp.Hosts[0]); err != nil {
return
}
if err = dp.tryToChangeLeader(c, dataNode); err != nil {
return
}
return
}
func (c *Cluster) isRecovering(dp *DataPartition, addr string) (isRecover bool) {
var key string
dp.RLock()
defer dp.RUnlock()
replica, _ := dp.getReplica(addr)
if replica != nil {
key = fmt.Sprintf("%s:%s", addr, replica.DiskPath)
} else {
key = fmt.Sprintf("%s:%s", addr, "")
}
c.badPartitionMutex.RLock()
defer c.badPartitionMutex.RUnlock()
var badPartitionIDs []uint64
badPartitions, ok := c.BadDataPartitionIds.Load(key)
if ok {
badPartitionIDs = badPartitions.([]uint64)
}
for _, id := range badPartitionIDs {
if id == dp.PartitionID {
isRecover = true
}
}
return
}
func (c *Cluster) removeHostMember(dp *DataPartition, removePeer proto.Peer) (err error) {
newHosts := make([]string, 0, len(dp.Hosts)-1)
for _, host := range dp.Hosts {
if host == removePeer.Addr {
continue
}
newHosts = append(newHosts, host)
}
newPeers := make([]proto.Peer, 0, len(dp.Peers)-1)
for _, peer := range dp.Peers {
if peer.ID == removePeer.ID && peer.Addr == removePeer.Addr {
continue
}
newPeers = append(newPeers, peer)
}
dp.Lock()
defer dp.Unlock()
if err = dp.update("removeDataPartitionRaftMember", dp.VolName, newPeers, newHosts, c); err != nil {
return
}
return
}
func (c *Cluster) removeDataPartitionRaftMember(dp *DataPartition, removePeer proto.Peer, force bool) (err error) {
dp.offlineMutex.Lock()
defer dp.offlineMutex.Unlock()
defer func() {
if err1 := c.updateDataPartitionOfflinePeerIDWithLock(dp, 0); err1 != nil {
err = errors.Trace(err, "updateDataPartitionOfflinePeerIDWithLock failed, err[%v]", err1)
}
}()
if err = c.updateDataPartitionOfflinePeerIDWithLock(dp, removePeer.ID); err != nil {
log.LogErrorf("action[removeDataPartitionRaftMember] vol[%v],data partition[%v],err[%v]", dp.VolName, dp.PartitionID, err)
return
}
return dp.createTaskToRemoveRaftMember(c, removePeer, force)
}
// call from remove raft member
func (c *Cluster) updateDataPartitionOfflinePeerIDWithLock(dp *DataPartition, peerID uint64) (err error) {
dp.Lock()
defer dp.Unlock()
dp.OfflinePeerID = peerID
if err = dp.update("updateDataPartitionOfflinePeerIDWithLock", dp.VolName, dp.Peers, dp.Hosts, c); err != nil {
return
}
return
}
func (c *Cluster) deleteDataReplica(dp *DataPartition, dataNode *DataNode) (err error) {
dp.Lock()
// in case the dataNode is unreachable, update the metadata first
dp.removeReplicaByAddr(dataNode.Addr)
dp.checkAndRemoveMissReplica(dataNode.Addr)
if err = dp.update("deleteDataReplica", dp.VolName, dp.Peers, dp.Hosts, c); err != nil {
dp.Unlock()
return
}
task := dp.createTaskToDeleteDataPartition(dataNode.Addr)
dp.Unlock()
_, err = dataNode.TaskManager.syncSendAdminTask(task)
if err != nil {
log.LogErrorf("action[deleteDataReplica] vol[%v],data partition[%v],err[%v]", dp.VolName, dp.PartitionID, err)
}
return nil
}
func (c *Cluster) putBadMetaPartitions(addr string, partitionID uint64) {
c.badPartitionMutex.Lock()
defer c.badPartitionMutex.Unlock()
newBadPartitionIDs := make([]uint64, 0)
badPartitionIDs, ok := c.BadMetaPartitionIds.Load(addr)
if ok {
newBadPartitionIDs = badPartitionIDs.([]uint64)
}
newBadPartitionIDs = append(newBadPartitionIDs, partitionID)
c.BadMetaPartitionIds.Store(addr, newBadPartitionIDs)
}
func (c *Cluster) getBadMetaPartitionsView() (bmpvs []badPartitionView) {
c.badPartitionMutex.RLock()
defer c.badPartitionMutex.RUnlock()
bmpvs = make([]badPartitionView, 0)
c.BadMetaPartitionIds.Range(func(key, value interface{}) bool {
badPartitionIds := value.([]uint64)
path := key.(string)
bpv := badPartitionView{Path: path, PartitionIDs: badPartitionIds}
bmpvs = append(bmpvs, bpv)
return true
})
return
}
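// putBadDataPartitionIDs records a bad data partition under the "addr:diskPath" key so it can be tracked until recovery completes.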
func (c *Cluster) putBadDataPartitionIDs(replica *DataReplica, addr string, partitionID uint64) {
c.badPartitionMutex.Lock()
defer c.badPartitionMutex.Unlock()
var key string
newBadPartitionIDs := make([]uint64, 0)
if replica != nil {
key = fmt.Sprintf("%s:%s", addr, replica.DiskPath)
} else {
key = fmt.Sprintf("%s:%s", addr, "")
}
badPartitionIDs, ok := c.BadDataPartitionIds.Load(key)
if ok {
newBadPartitionIDs = badPartitionIDs.([]uint64)
}
newBadPartitionIDs = append(newBadPartitionIDs, partitionID)
c.BadDataPartitionIds.Store(key, newBadPartitionIDs)
}
func (c *Cluster) putBadDataPartitionIDsByDiskPath(disk, addr string, partitionID uint64) {
c.badPartitionMutex.Lock()
defer c.badPartitionMutex.Unlock()
var key string
newBadPartitionIDs := make([]uint64, 0)
key = fmt.Sprintf("%s:%s", addr, disk)
badPartitionIDs, ok := c.BadDataPartitionIds.Load(key)
if ok {
newBadPartitionIDs = badPartitionIDs.([]uint64)
}
if in(partitionID, newBadPartitionIDs) {
return
}
newBadPartitionIDs = append(newBadPartitionIDs, partitionID)
c.BadDataPartitionIds.Store(key, newBadPartitionIDs)
}
func in(target uint64, arr []uint64) bool {
for _, element := range arr {
if target == element {
return true
}
}
return false
}
func (c *Cluster) getBadDataPartitionsView() (bpvs []badPartitionView) {
c.badPartitionMutex.Lock()
defer c.badPartitionMutex.Unlock()
bpvs = make([]badPartitionView, 0)
c.BadDataPartitionIds.Range(func(key, value interface{}) bool {
badDataPartitionIds := value.([]uint64)
path := key.(string)
bpv := badPartitionView{Path: path, PartitionIDs: badDataPartitionIds}
bpvs = append(bpvs, bpv)
return true
})
return
}
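// getBadDataPartitionsRepairView reports, for every bad data partition, the repair progress of the replica
// on its decommission destination address.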
func (c *Cluster) getBadDataPartitionsRepairView() (bprvs []proto.BadPartitionRepairView) {
c.badPartitionMutex.Lock()
defer c.badPartitionMutex.Unlock()
bprvs = make([]proto.BadPartitionRepairView, 0)
c.BadDataPartitionIds.Range(func(key, value interface{}) bool {
badDataPartitionIds := value.([]uint64)
dpRepairInfos := make([]proto.DpRepairInfo, 0)
path := key.(string)
for _, partitionID := range badDataPartitionIds {
partition, err := c.getDataPartitionByID(partitionID)
if err != nil {
continue
}
replica, err := partition.getReplica(partition.DecommissionDstAddr)
if err != nil {
log.LogDebugf("getBadDataPartitionsRepairView: replica for partitionID[%v] addr[%v] is empty", partitionID, partition.DecommissionDstAddr)
continue
}
dpRepairInfo := proto.DpRepairInfo{PartitionID: partitionID, DecommissionRepairProgress: replica.DecommissionRepairProgress}
dpRepairInfos = append(dpRepairInfos, dpRepairInfo)
log.LogDebugf("getBadDataPartitionsRepairView: partitionID[%v], addr[%v], dpRepairInfo[%v]",
partitionID, partition.DecommissionDstAddr, dpRepairInfo)
}
bprv := proto.BadPartitionRepairView{Path: path, PartitionInfos: dpRepairInfos}
bprvs = append(bprvs, bprv)
return true
})
return
}
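// migrateMetaNode migrates up to limit meta partitions from srcAddr to targetAddr concurrently
// (partitions that already have a replica on targetAddr are skipped).
// When all partitions on the node have been migrated, the metaNode itself is removed from the cluster.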
func (c *Cluster) migrateMetaNode(srcAddr, targetAddr string, limit int) (err error) {
var toBeOfflineMps []*MetaPartition
if c.ForbidMpDecommission {
err = fmt.Errorf("cluster mataPartition decommission switch is disabled")
return
}
msg := fmt.Sprintf("action[migrateMetaNode],clusterID[%v] migrate from node[%v] to [%s] begin", c.Name, srcAddr, targetAddr)
log.LogWarn(msg)
metaNode, err := c.metaNode(srcAddr)
if err != nil {
return err
}
metaNode.MigrateLock.Lock()
defer metaNode.MigrateLock.Unlock()
partitions := c.getAllMetaPartitionByMetaNode(srcAddr)
if targetAddr != "" {
toBeOfflineMps = make([]*MetaPartition, 0)
for _, mp := range partitions {
if contains(mp.Hosts, targetAddr) {
continue
}
toBeOfflineMps = append(toBeOfflineMps, mp)
}
} else {
toBeOfflineMps = partitions
}
if len(toBeOfflineMps) <= 0 && len(partitions) != 0 {
return fmt.Errorf("migrateMataNode no partition can migrate from [%s] to [%s] limit [%v]", srcAddr, targetAddr, limit)
}
if limit <= 0 {
limit = defaultMigrateMpCnt
}
if limit > len(toBeOfflineMps) {
limit = len(toBeOfflineMps)
}
var wg sync.WaitGroup
metaNode.ToBeOffline = true
metaNode.MaxMemAvailWeight = 1
errChannel := make(chan error, limit)
defer func() {
metaNode.ToBeOffline = false
close(errChannel)
}()
for idx := 0; idx < limit; idx++ {
wg.Add(1)
go func(mp *MetaPartition) {
defer wg.Done()
if err1 := c.migrateMetaPartition(srcAddr, targetAddr, mp); err1 != nil {
errChannel <- err1
}
}(toBeOfflineMps[idx])
}
wg.Wait()
select {
case err = <-errChannel:
log.LogErrorf("action[migrateMetaNode] clusterID[%v] migrate node[%s] to [%s] faild, err(%s)",
c.Name, srcAddr, targetAddr, err.Error())
return
default:
}
if limit < len(partitions) {
log.LogWarnf("action[migrateMetaNode] clusterID[%v] migrate from [%s] to [%s] cnt[%d] success",
c.Name, srcAddr, targetAddr, limit)
return
}
if err = c.syncDeleteMetaNode(metaNode); err != nil {
msg = fmt.Sprintf("action[migrateMetaNode], clusterID[%v] node[%v] synDelMetaNode failed,err[%s]",
c.Name, srcAddr, err.Error())
Warn(c.Name, msg)
return
}
c.deleteMetaNodeFromCache(metaNode)
msg = fmt.Sprintf("action[migrateMetaNode],clusterID[%v] migrate from node[%v] to node(%s) success", c.Name, srcAddr, targetAddr)
Warn(c.Name, msg)
return
}
func (c *Cluster) decommissionMetaNode(metaNode *MetaNode) (err error) {
return c.migrateMetaNode(metaNode.Addr, "", 0)
}
func (c *Cluster) deleteMetaNodeFromCache(metaNode *MetaNode) {
c.metaNodes.Delete(metaNode.Addr)
c.t.deleteMetaNode(metaNode)
go metaNode.clean()
}
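// updateVol validates the auth key and the new arguments (capacity, zone name, cache capacity),
// applies them to the volume, and persists the change; on persistence failure the previous settings are restored.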
func (c *Cluster) updateVol(name, authKey string, newArgs *VolVarargs) (err error) {
var (
vol *Vol
serverAuthKey string
volUsedSpace uint64
oldArgs *VolVarargs
)
if vol, err = c.getVol(name); err != nil {
log.LogErrorf("action[updateVol] err[%v]", err)
err = proto.ErrVolNotExists
goto errHandler
}
if vol.status() == proto.VolStatusMarkDelete {
log.LogErrorf("action[updateVol] vol is already deleted, name(%s)", name)
err = proto.ErrVolNotExists
goto errHandler
}
vol.volLock.Lock()
defer vol.volLock.Unlock()
serverAuthKey = vol.Owner
if !matchKey(serverAuthKey, authKey) {
return proto.ErrVolAuthKeyNotMatch
}
volUsedSpace = vol.totalUsedSpace()
if float64(newArgs.capacity*util.GB) < float64(volUsedSpace)*1.01 && newArgs.capacity != vol.Capacity {
err = fmt.Errorf("capacity[%v] has to be 1 percent larger than the used space[%v]", newArgs.capacity,
volUsedSpace/util.GB)
goto errHandler
}
log.LogInfof("[checkZoneName] name [%s], zone [%s]", name, newArgs.zoneName)
if newArgs.zoneName, err = c.checkZoneName(name, vol.crossZone, vol.defaultPriority, newArgs.zoneName, vol.domainId); err != nil {
goto errHandler
}
if newArgs.coldArgs.cacheCap >= newArgs.capacity {
err = fmt.Errorf("capacity must be large than cache capacity, newCap(%d), newCacheCap(%d)", newArgs.capacity, newArgs.coldArgs.cacheCap)
goto errHandler
}
oldArgs = getVolVarargs(vol)
setVolFromArgs(newArgs, vol)
if err = c.syncUpdateVol(vol); err != nil {
setVolFromArgs(oldArgs, vol)
log.LogErrorf("action[updateVol] vol[%v] err[%v]", name, err)
err = proto.ErrPersistenceByRaft
goto errHandler
}
return
errHandler:
err = fmt.Errorf("action[updateVol], clusterID[%v] name:%v, err:%v ", c.Name, name, err.Error())
log.LogError(errors.Stack(err))
Warn(c.Name, err.Error())
return
}
func (c *Cluster) checkNormalZoneName(zoneName string) (err error) {
var zones []string
if c.needFaultDomain {
zones = c.t.domainExcludeZones
} else {
zones = c.t.getZoneNameList()
}
zoneList := strings.Split(zoneName, ",")
for i := 0; i < len(zoneList); i++ {
var isZone bool
for j := 0; j < len(zones); j++ {
if zoneList[i] == zones[j] {
isZone = true
break
}
}
if !isZone {
return fmt.Errorf("action[checkZoneName] the zonename[%s] not found", zoneList[i])
}
}
return
}
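// checkZoneName validates the requested zone name against the cross-zone flag, fault-domain settings,
// and the zones known to the topology, and returns the zone name to use (a non-cross-zone volume without
// a name falls back to the default zone when no fault domain is needed).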
func (c *Cluster) checkZoneName(name string,
crossZone bool,
defaultPriority bool,
zoneName string,
domainId uint64) (newZoneName string, err error,
) {
zoneList := strings.Split(zoneName, ",")
newZoneName = zoneName
if crossZone {
if newZoneName != "" {
if len(zoneList) == 1 {
return newZoneName, fmt.Errorf("action[checkZoneName] vol use specified single zoneName conflit with cross zone flag")
} else {
if err = c.checkNormalZoneName(newZoneName); err != nil {
return newZoneName, err
}
}
}
if c.FaultDomain {
if newZoneName != "" {
if !defaultPriority || domainId > 0 {
return newZoneName, fmt.Errorf("action[checkZoneName] vol need FaultDomain but set zone name")
}
} else {
if domainId > 0 {
if _, ok := c.domainManager.domainId2IndexMap[domainId]; !ok {
return newZoneName, fmt.Errorf("action[checkZoneName] cluster can't find oomainId [%v]", domainId)
}
}
}
} else {
if c.t.zoneLen() <= 1 {
return newZoneName, fmt.Errorf("action[checkZoneName] cluster has one zone,can't cross zone")
}
}
} else { // cross zone disabled means the domain is not used when the vol is created
if newZoneName == "" {
if !c.needFaultDomain {
if _, err = c.t.getZone(DefaultZoneName); err != nil {
return newZoneName, fmt.Errorf("action[checkZoneName] the vol is not cross zone and didn't set zone name,but there's no default zone")
}
log.LogInfof("action[checkZoneName] vol [%v] use default zone", name)
newZoneName = DefaultZoneName
}
} else {
if len(zoneList) > 1 {
return newZoneName, fmt.Errorf("action[checkZoneName] vol specified zoneName need cross zone")
}
if err = c.checkNormalZoneName(newZoneName); err != nil {
return newZoneName, err
}
}
}
return
}
// Create a new volume.
// By default we create 3 meta partitions and 10 data partitions during initialization.
func (c *Cluster) createVol(req *createVolReq) (vol *Vol, err error) {
if c.DisableAutoAllocate {
log.LogWarn("the cluster is frozen")
return nil, fmt.Errorf("the cluster is frozen, can not create volume")
}
var readWriteDataPartitions int
if req.zoneName, err = c.checkZoneName(req.name, req.crossZone, req.normalZonesFirst, req.zoneName, req.domainId); err != nil {
return
}
if vol, err = c.doCreateVol(req); err != nil {
goto errHandler
}
vol.aclMgr.init(c, vol)
vol.initUidSpaceManager(c)
vol.initQuotaManager(c)
if err = vol.VersionMgr.init(c); err != nil {
log.LogError("init dataPartition error in verMgr init", err.Error())
}
if err = vol.initMetaPartitions(c, req.mpCount); err != nil {
vol.Status = proto.VolStatusMarkDelete
if e := vol.deleteVolFromStore(c); e != nil {
log.LogErrorf("action[createVol] deleteVolFromStore failed, vol[%v] err[%v]", vol.Name, e)
}
c.deleteVol(req.name)
err = fmt.Errorf("action[createVol] initMetaPartitions failed, vol[%v] err[%v]", vol.Name, err)
goto errHandler
}
if vol.CacheCapacity > 0 || (proto.IsHot(vol.VolType) && vol.Capacity > 0) {
if req.dpCount > maxInitDataPartitionCnt {
err = fmt.Errorf("action[createVol] initDataPartitions failed, vol[%v], dpCount[%d] exceeds maximum limit[%d]",
req.name, req.dpCount, maxInitDataPartitionCnt)
goto errHandler
}
for retryCount := 0; readWriteDataPartitions < defaultInitMetaPartitionCount && retryCount < 3; retryCount++ {
err = vol.initDataPartitions(c, req.dpCount)
if err != nil {
log.LogError("action[createVol] init dataPartition error ",
err.Error(), retryCount, len(vol.dataPartitions.partitionMap))
}
readWriteDataPartitions = len(vol.dataPartitions.partitionMap)
}
if len(vol.dataPartitions.partitionMap) < defaultInitMetaPartitionCount {
err = fmt.Errorf("action[createVol] vol[%v] initDataPartitions failed, less than %d",
vol.Name, defaultInitMetaPartitionCount)
oldVolStatus := vol.Status
vol.Status = proto.VolStatusMarkDelete
if errSync := c.syncUpdateVol(vol); errSync != nil {
log.LogErrorf("action[createVol] vol[%v] after init dataPartition error, mark vol delete persist failed", vol.Name)
vol.Status = oldVolStatus
} else {
log.LogErrorf("action[createVol] vol[%v] mark vol delete after init dataPartition error", vol.Name)
}
goto errHandler
}
}
vol.dataPartitions.readableAndWritableCnt = readWriteDataPartitions
vol.updateViewCache(c)
log.LogInfof("action[createVol] vol[%v], readableAndWritableCnt[%v]", req.name, readWriteDataPartitions)
return
errHandler:
err = fmt.Errorf("action[createVol], clusterID[%v] name:%v, err:%v ", c.Name, req.name, err)
log.LogError(errors.Stack(err))
Warn(c.Name, err.Error())
return
}
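// doCreateVol builds the volValue from the create request, rejects duplicate names, allocates a volume ID,
// refreshes the OSS secure info, persists the new volume, and adds it to the cache.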
func (c *Cluster) doCreateVol(req *createVolReq) (vol *Vol, err error) {
c.createVolMutex.Lock()
defer c.createVolMutex.Unlock()
createTime := time.Now().Unix() // record unix seconds of volume create time
var dataPartitionSize uint64
if req.dpSize*util.GB == 0 {
dataPartitionSize = util.DefaultDataPartitionSize
} else {
dataPartitionSize = uint64(req.dpSize) * util.GB
}
vv := volValue{
Name: req.name,
Owner: req.owner,
ZoneName: req.zoneName,
DataPartitionSize: dataPartitionSize,
Capacity: uint64(req.capacity),
DpReplicaNum: req.dpReplicaNum,
ReplicaNum: defaultReplicaNum,
FollowerRead: req.followerRead,
Authenticate: req.authenticate,
CrossZone: req.crossZone,
DefaultPriority: req.normalZonesFirst,
DomainId: req.domainId,
CreateTime: createTime,
DeleteLockTime: req.deleteLockTime,
Description: req.description,
EnablePosixAcl: req.enablePosixAcl,
EnableQuota: req.enableQuota,
EnableTransaction: req.enableTransaction,
TxTimeout: req.txTimeout,
TxConflictRetryNum: req.txConflictRetryNum,
TxConflictRetryInterval: req.txConflictRetryInterval,
VolType: req.volType,
EbsBlkSize: req.coldArgs.objBlockSize,
CacheCapacity: req.coldArgs.cacheCap,
CacheAction: req.coldArgs.cacheAction,
CacheThreshold: req.coldArgs.cacheThreshold,
CacheTTL: req.coldArgs.cacheTtl,
CacheHighWater: req.coldArgs.cacheHighWater,
CacheLowWater: req.coldArgs.cacheLowWater,
CacheLRUInterval: req.coldArgs.cacheLRUInterval,
CacheRule: req.coldArgs.cacheRule,
VolQosEnable: req.qosLimitArgs.qosEnable,
IopsRLimit: req.qosLimitArgs.iopsRVal,
IopsWLimit: req.qosLimitArgs.iopsWVal,
FlowRlimit: req.qosLimitArgs.flowRVal,
FlowWlimit: req.qosLimitArgs.flowWVal,
DpReadOnlyWhenVolFull: req.DpReadOnlyWhenVolFull,
}
log.LogInfof("[doCreateVol] volView, %v", vv)
if _, err = c.getVol(req.name); err == nil {
err = proto.ErrDuplicateVol
goto errHandler
}
vv.ID, err = c.idAlloc.allocateCommonID()
if err != nil {
goto errHandler
}
vol = newVol(vv)
log.LogInfof("[doCreateVol] vol, %v", vol)
// refresh oss secure
vol.refreshOSSSecure()
if err = c.syncAddVol(vol); err != nil {
goto errHandler
}
c.putVol(vol)
return
errHandler:
err = fmt.Errorf("action[doCreateVol], clusterID[%v] name:%v, err:%v ", c.Name, req.name, err.Error())
log.LogError(errors.Stack(err))
Warn(c.Name, err.Error())
return
}
// Update the upper bound of the inode ids in a meta partition.
func (c *Cluster) updateInodeIDRange(volName string, start uint64) (err error) {
var (
maxPartitionID uint64
vol *Vol
partition *MetaPartition
)
if vol, err = c.getVol(volName); err != nil {
log.LogErrorf("action[updateInodeIDRange] vol [%v] not found", volName)
return proto.ErrVolNotExists
}
maxPartitionID = vol.maxPartitionID()
if partition, err = vol.metaPartition(maxPartitionID); err != nil {
log.LogErrorf("action[updateInodeIDRange] mp[%v] not found", maxPartitionID)
return proto.ErrMetaPartitionNotExists
}
adjustStart := start
if adjustStart < partition.Start {
adjustStart = partition.Start
}
if adjustStart < partition.MaxInodeID {
adjustStart = partition.MaxInodeID
}
metaPartitionInodeIdStep := gConfig.MetaPartitionInodeIdStep
adjustStart = adjustStart + metaPartitionInodeIdStep
log.LogWarnf("vol[%v],maxMp[%v],start[%v],adjustStart[%v]", volName, maxPartitionID, start, adjustStart)
if err = vol.splitMetaPartition(c, partition, adjustStart, metaPartitionInodeIdStep, false); err != nil {
log.LogErrorf("action[updateInodeIDRange] mp[%v] err[%v]", partition.PartitionID, err)
}
return
}
func (c *Cluster) dataNodeCount() (len int) {
c.dataNodes.Range(func(key, value interface{}) bool {
len++
return true
})
return
}
func (c *Cluster) metaNodeCount() (len int) {
c.metaNodes.Range(func(key, value interface{}) bool {
len++
return true
})
return
}
func (c *Cluster) allMasterNodes() (masterNodes []proto.NodeView) {
masterNodes = make([]proto.NodeView, 0)
for _, addr := range c.cfg.peerAddrs {
split := strings.Split(addr, colonSplit)
id, _ := strconv.ParseUint(split[0], 10, 64)
masterNode := proto.NodeView{ID: id, Addr: split[1] + ":" + split[2], IsActive: true}
masterNodes = append(masterNodes, masterNode)
}
return masterNodes
}
func (c *Cluster) lcNodeCount() (len int) {
c.lcNodes.Range(func(key, value interface{}) bool {
len++
return true
})
return
}
func (c *Cluster) allDataNodes() (dataNodes []proto.NodeView) {
dataNodes = make([]proto.NodeView, 0)
c.dataNodes.Range(func(addr, node interface{}) bool {
dataNode := node.(*DataNode)
dataNodes = append(dataNodes, proto.NodeView{
Addr: dataNode.Addr, DomainAddr: dataNode.DomainAddr,
IsActive: dataNode.isActive, ID: dataNode.ID, IsWritable: dataNode.isWriteAble(),
})
return true
})
return
}
func (c *Cluster) allMetaNodes() (metaNodes []proto.NodeView) {
metaNodes = make([]proto.NodeView, 0)
c.metaNodes.Range(func(addr, node interface{}) bool {
metaNode := node.(*MetaNode)
metaNodes = append(metaNodes, proto.NodeView{
ID: metaNode.ID, Addr: metaNode.Addr, DomainAddr: metaNode.DomainAddr,
IsActive: metaNode.IsActive, IsWritable: metaNode.isWritable(),
})
return true
})
return
}
// get metaNode with specified condition
func (c *Cluster) getSpecifiedMetaNodes(zones map[string]struct{}, nodeSetIds map[uint64]struct{}) (metaNodes []*MetaNode) {
log.LogInfof("cluster metaNode length:%v", c.allMetaNodes())
// if nodeSetIds is set, choose metaNodes in those nodesets and ignore zones
if len(nodeSetIds) != 0 {
log.LogInfof("select from nodeSet")
c.metaNodes.Range(func(addr, node interface{}) bool {
metaNode := node.(*MetaNode)
if _, ok := nodeSetIds[metaNode.NodeSetID]; ok {
metaNodes = append(metaNodes, metaNode)
}
return true
})
return
}
// if zones is set, choose metaNodes in those zones
if len(zones) != 0 {
log.LogInfof("select from zone")
c.metaNodes.Range(func(addr, node interface{}) bool {
metaNode := node.(*MetaNode)
if _, ok := zones[metaNode.ZoneName]; ok {
metaNodes = append(metaNodes, metaNode)
}
return true
})
return
}
log.LogInfof("select all cluster metaNode")
// get all metaNodes in cluster
c.metaNodes.Range(func(addr, node interface{}) bool {
metaNode := node.(*MetaNode)
metaNodes = append(metaNodes, metaNode)
return true
})
return
}
func (c *Cluster) balanceMetaPartitionLeader(zones map[string]struct{}, nodeSetIds map[uint64]struct{}) error {
sortedNodes := c.getSortLeaderMetaNodes(zones, nodeSetIds)
if sortedNodes == nil || len(sortedNodes.nodes) == 0 {
return errors.New("no metaNode be selected")
}
sortedNodes.balanceLeader()
return nil
}
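// getSortLeaderMetaNodes collects the leader meta partitions held by the selected metaNodes,
// computes the average leader count, and returns the nodes sorted for leader balancing.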
func (c *Cluster) getSortLeaderMetaNodes(zones map[string]struct{}, nodeSetIds map[uint64]struct{}) *sortLeaderMetaNode {
metaNodes := c.getSpecifiedMetaNodes(zones, nodeSetIds)
log.LogInfof("metaNode length:%d", len(metaNodes))
if len(metaNodes) == 0 {
return nil
}
leaderNodes := make([]*LeaderMetaNode, 0)
countM := make(map[string]int)
totalCount := 0
average := 0
for _, node := range metaNodes {
metaPartitions := make([]*MetaPartition, 0)
for _, mp := range node.metaPartitionInfos {
if mp.IsLeader {
metaPartition, err := c.getMetaPartitionByID(mp.PartitionID)
if err != nil {
continue
}
metaPartitions = append(metaPartitions, metaPartition)
}
}
// some metaNode's mps length could be 0
leaderNodes = append(leaderNodes, &LeaderMetaNode{
metaPartitions: metaPartitions,
addr: node.Addr,
})
countM[node.Addr] = len(metaPartitions)
totalCount += len(metaPartitions)
}
if len(leaderNodes) != 0 {
average = totalCount / len(leaderNodes)
}
s := &sortLeaderMetaNode{
nodes: leaderNodes,
leaderCountM: countM,
average: average,
}
sort.Sort(s)
return s
}
func (c *Cluster) allVolNames() (vols []string) {
vols = make([]string, 0)
c.volMutex.RLock()
defer c.volMutex.RUnlock()
for name := range c.vols {
vols = append(vols, name)
}
return
}
func (c *Cluster) copyVols() (vols map[string]*Vol) {
vols = make(map[string]*Vol)
c.volMutex.RLock()
defer c.volMutex.RUnlock()
for name, vol := range c.vols {
vols[name] = vol
}
return
}
// Return all the volumes except the ones that have been marked to be deleted.
func (c *Cluster) allVols() (vols map[string]*Vol) {
vols = make(map[string]*Vol)
c.volMutex.RLock()
defer c.volMutex.RUnlock()
for name, vol := range c.vols {
if vol.Status == proto.VolStatusNormal {
vols[name] = vol
}
}
return
}
func (c *Cluster) getDataPartitionCount() (count int) {
c.volMutex.RLock()
defer c.volMutex.RUnlock()
for _, vol := range c.vols {
count = count + len(vol.dataPartitions.partitions)
}
return
}
func (c *Cluster) getMetaPartitionCount() (count int) {
vols := c.copyVols()
for _, vol := range vols {
vol.mpsLock.RLock()
count = count + len(vol.MetaPartitions)
vol.mpsLock.RUnlock()
}
return count
}
func (c *Cluster) setClusterInfo(dirLimit uint32) (err error) {
oldLimit := c.cfg.DirChildrenNumLimit
atomic.StoreUint32(&c.cfg.DirChildrenNumLimit, dirLimit)
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setClusterInfo] err[%v]", err)
atomic.StoreUint32(&c.cfg.DirChildrenNumLimit, oldLimit)
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) getMonitorPushAddr() (addr string) {
addr = c.cfg.MonitorPushAddr
return
}
func (c *Cluster) setMetaNodeThreshold(threshold float32) (err error) {
if threshold > 1.0 || threshold < 0.0 {
err = fmt.Errorf("set threshold failed: threshold (%v) should between 0.0 and 1.0", threshold)
return
}
oldThreshold := c.cfg.MetaNodeThreshold
c.cfg.MetaNodeThreshold = threshold
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setMetaNodeThreshold] err[%v]", err)
c.cfg.MetaNodeThreshold = oldThreshold
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) setMetaNodeDeleteBatchCount(val uint64) (err error) {
oldVal := atomic.LoadUint64(&c.cfg.MetaNodeDeleteBatchCount)
atomic.StoreUint64(&c.cfg.MetaNodeDeleteBatchCount, val)
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setMetaNodeDeleteBatchCount] err[%v]", err)
atomic.StoreUint64(&c.cfg.MetaNodeDeleteBatchCount, oldVal)
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) setClusterLoadFactor(factor float32) (err error) {
oldVal := c.cfg.ClusterLoadFactor
c.cfg.ClusterLoadFactor = factor
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setClusterLoadFactorErr] err[%v]", err)
c.cfg.ClusterLoadFactor = oldVal
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) setDataNodeDeleteLimitRate(val uint64) (err error) {
oldVal := atomic.LoadUint64(&c.cfg.DataNodeDeleteLimitRate)
atomic.StoreUint64(&c.cfg.DataNodeDeleteLimitRate, val)
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setDataNodeDeleteLimitRate] err[%v]", err)
atomic.StoreUint64(&c.cfg.DataNodeDeleteLimitRate, oldVal)
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) setDataPartitionMaxRepairErrCnt(val uint64) (err error) {
oldVal := atomic.LoadUint64(&c.cfg.DpMaxRepairErrCnt)
atomic.StoreUint64(&c.cfg.DpMaxRepairErrCnt, val)
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setDataPartitionMaxRepairErrCnt] err[%v]", err)
atomic.StoreUint64(&c.cfg.DpMaxRepairErrCnt, oldVal)
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) setDataPartitionRepairTimeOut(val uint64) (err error) {
oldVal := atomic.LoadUint64(&c.cfg.DpRepairTimeOut)
atomic.StoreUint64(&c.cfg.DpRepairTimeOut, val)
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setDataPartitionRepairTimeOut] err[%v]", err)
atomic.StoreUint64(&c.cfg.DpRepairTimeOut, oldVal)
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) setDataNodeAutoRepairLimitRate(val uint64) (err error) {
oldVal := atomic.LoadUint64(&c.cfg.DataNodeAutoRepairLimitRate)
atomic.StoreUint64(&c.cfg.DataNodeAutoRepairLimitRate, val)
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setDataNodeAutoRepairLimitRate] err[%v]", err)
atomic.StoreUint64(&c.cfg.DataNodeAutoRepairLimitRate, oldVal)
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) setMetaNodeDeleteWorkerSleepMs(val uint64) (err error) {
oldVal := atomic.LoadUint64(&c.cfg.MetaNodeDeleteWorkerSleepMs)
atomic.StoreUint64(&c.cfg.MetaNodeDeleteWorkerSleepMs, val)
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setMetaNodeDeleteWorkerSleepMs] err[%v]", err)
atomic.StoreUint64(&c.cfg.MetaNodeDeleteWorkerSleepMs, oldVal)
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) getMaxDpCntLimit() (dpCntInLimit uint64) {
dpCntInLimit = atomic.LoadUint64(&c.cfg.MaxDpCntLimit)
return
}
func (c *Cluster) setMaxDpCntLimit(val uint64) (err error) {
if val == 0 {
val = defaultMaxDpCntLimit
}
oldVal := atomic.LoadUint64(&c.cfg.MaxDpCntLimit)
atomic.StoreUint64(&c.cfg.MaxDpCntLimit, val)
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[MaxDpCntLimit] err[%v]", err)
atomic.StoreUint64(&c.cfg.MaxDpCntLimit, oldVal)
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) setClusterCreateTime(createTime int64) (err error) {
oldVal := c.CreateTime
c.CreateTime = createTime
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setClusterCreateTime] err[%v]", err)
c.CreateTime = oldVal
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) setDisableAutoAllocate(disableAutoAllocate bool) (err error) {
oldFlag := c.DisableAutoAllocate
c.DisableAutoAllocate = disableAutoAllocate
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setDisableAutoAllocate] err[%v]", err)
c.DisableAutoAllocate = oldFlag
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) setForbidMpDecommission(isForbid bool) (err error) {
oldFlag := c.ForbidMpDecommission
c.ForbidMpDecommission = isForbid
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setForbidMpDecommission] err[%v]", err)
c.ForbidMpDecommission = oldFlag
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) setMaxConcurrentLcNodes(count uint64) (err error) {
oldCount := c.cfg.MaxConcurrentLcNodes
c.cfg.MaxConcurrentLcNodes = count
if err = c.syncPutCluster(); err != nil {
log.LogErrorf("action[setMaxConcurrentLcNodes] err[%v]", err)
c.cfg.MaxConcurrentLcNodes = oldCount
err = proto.ErrPersistenceByRaft
return
}
return
}
func (c *Cluster) clearVols() {
c.volMutex.Lock()
defer c.volMutex.Unlock()
c.vols = make(map[string]*Vol)
}
func (c *Cluster) clearTopology() {
c.t.clear()
}
func (c *Cluster) clearDataNodes() {
c.dataNodes.Range(func(key, value interface{}) bool {
dataNode := value.(*DataNode)
c.dataNodes.Delete(key)
dataNode.clean()
return true
})
}
func (c *Cluster) clearMetaNodes() {
c.metaNodes.Range(func(key, value interface{}) bool {
metaNode := value.(*MetaNode)
c.metaNodes.Delete(key)
metaNode.clean()
return true
})
}
func (c *Cluster) scheduleToCheckDecommissionDataNode() {
go func() {
for {
if c.partition.IsRaftLeader() && c.metaReady {
c.checkDecommissionDataNode()
}
time.Sleep(10 * time.Second)
}
}()
}
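// checkDecommissionDataNode drives dataNode decommission: nodes marked for decommission are started,
// and nodes that finished are deleted from the cluster once no data partitions remain on them
// (otherwise only the decommission status is reset after a grace period).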
func (c *Cluster) checkDecommissionDataNode() {
// decommission datanode mark
c.dataNodes.Range(func(addr, node interface{}) bool {
dataNode := node.(*DataNode)
dataNode.updateDecommissionStatus(c, false)
if dataNode.GetDecommissionStatus() == markDecommission {
c.TryDecommissionDataNode(dataNode)
} else if dataNode.GetDecommissionStatus() == DecommissionSuccess {
partitions := c.getAllDataPartitionByDataNode(dataNode.Addr)
// if only decommission part of data partitions, do not remove the datanode
if len(partitions) != 0 {
if time.Since(time.Unix(dataNode.DecommissionCompleteTime, 0)) > (20 * time.Minute) {
log.LogWarnf("action[checkDecommissionDataNode] dataNode %v decommission completed, "+
"but has dp left, so only reset decommission status", dataNode.Addr)
dataNode.resetDecommissionStatus()
}
return true
}
if err := c.syncDeleteDataNode(dataNode); err != nil {
msg := fmt.Sprintf("action[checkDecommissionDataNode],clusterID[%v] Node[%v] syncDeleteDataNode failed,err[%v]",
c.Name, dataNode.Addr, err)
log.LogWarnf("%s", msg)
} else {
log.LogWarnf("action[checkDecommissionDataNode] del dataNode %v", dataNode.Addr)
c.delDataNodeFromCache(dataNode)
}
}
return true
})
}
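// TryDecommissionDataNode starts decommission of a dataNode: it resumes paused disks if a previous run
// was interrupted, otherwise it groups the node's partitions by disk (honoring DecommissionLimit and the
// destination address) and delegates each disk to migrateDisk.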
func (c *Cluster) TryDecommissionDataNode(dataNode *DataNode) {
var (
toBeOffLinePartitions []*DataPartition
err error
)
log.LogDebugf("action[TryDecommissionDataNode] dataNode [%s] limit[%v]", dataNode.Addr, dataNode.DecommissionLimit)
dataNode.MigrateLock.Lock()
defer func() {
dataNode.MigrateLock.Unlock()
if err != nil {
dataNode.DecommissionRetry++
log.LogDebugf("action[TryDecommissionDataNode] dataNode [%s] retry %v", dataNode.Addr, dataNode.DecommissionRetry)
}
c.syncUpdateDataNode(dataNode)
}()
// recover from stop
if len(dataNode.DecommissionDiskList) != 0 {
for _, disk := range dataNode.DecommissionDiskList {
key := fmt.Sprintf("%s_%s", dataNode.Addr, disk)
// if not found, it may already have succeeded, so only handle disks still in progress
if value, ok := c.DecommissionDisks.Load(key); ok {
dd := value.(*DecommissionDisk)
if dd.GetDecommissionStatus() == DecommissionPause {
dd.SetDecommissionStatus(markDecommission)
log.LogInfof("action[TryDecommissionDataNode] dataNode [%s] restore %v from stop",
dataNode.Addr, dd.GenerateKey())
}
}
}
dataNode.SetDecommissionStatus(DecommissionPrepare)
dataNode.ToBeOffline = true
log.LogDebugf("action[TryDecommissionDataNode] dataNode [%s] recover from DecommissionDiskList", dataNode.Addr)
return
}
log.LogDebugf("action[TryDecommissionDataNode] dataNode [%s] prepare to decommission", dataNode.Addr)
var partitions []*DataPartition
disks := dataNode.getDisks(c)
for _, disk := range disks {
partitionsFromDisk := dataNode.badPartitions(disk, c)
partitions = append(partitions, partitionsFromDisk...)
}
// new dp may have been allocated on this node after a previous decommission was cancelled
// partitions := c.getAllDataPartitionByDataNode(dataNode.Addr)
if dataNode.DecommissionDstAddr != "" {
for _, dp := range partitions {
// two replicas can't exist on the same node
if dp.hasHost(dataNode.DecommissionDstAddr) {
log.LogWarnf("action[TryDecommissionDataNode] skip dp [%v] on both data node", dp.PartitionID)
continue
}
toBeOffLinePartitions = append(toBeOffLinePartitions, dp)
}
} else {
toBeOffLinePartitions = partitions
}
if len(toBeOffLinePartitions) <= 0 && len(partitions) != 0 {
err = fmt.Errorf("DecommissionDataNode no partition can migrate from [%s] to [%s] for replica address conflict",
dataNode.Addr, dataNode.DecommissionDstAddr)
log.LogWarnf("action[TryDecommissionDataNode] %v", err.Error())
return
}
// check dp being decommissioned last time
oldPartitions := c.getAllDecommissionDataPartitionByDataNode(dataNode.Addr)
if len(oldPartitions) != 0 {
toBeOffLinePartitions = mergeDataPartitionArr(toBeOffLinePartitions, oldPartitions)
}
if !(dataNode.DecommissionLimit == 0 || dataNode.DecommissionLimit > len(toBeOffLinePartitions)) {
toBeOffLinePartitions = toBeOffLinePartitions[:dataNode.DecommissionLimit]
}
if len(toBeOffLinePartitions) == 0 {
dataNode.markDecommissionSuccess(c)
return
}
// record the dp count on each disk
dpToDecommissionByDisk := make(map[string]int)
// find the corresponding disk
for _, dp := range toBeOffLinePartitions {
disk := dp.getReplicaDisk(dataNode.Addr)
if disk == "" {
log.LogWarnf("action[TryDecommissionDataNode] ignore dp [%v] on dataNode[%v]with empty disk",
dp.PartitionID, dataNode.Addr)
if dp.IsDecommissionSuccess() {
dp.ResetDecommissionStatus()
c.syncUpdateDataPartition(dp)
}
continue
}
dpToDecommissionByDisk[disk]++
}
decommissionDpTotal := 0
left := len(toBeOffLinePartitions)
decommissionDiskList := make([]string, 0)
for disk, dpCnt := range dpToDecommissionByDisk {
if left == 0 {
break
}
if left-dpCnt >= 0 {
err = c.migrateDisk(dataNode.Addr, disk, dataNode.DecommissionDstAddr, dataNode.DecommissionRaftForce, dpCnt, true, ManualDecommission)
if err != nil {
log.LogWarnf("action[TryDecommissionDataNode] %v failed", err)
continue
}
decommissionDpTotal += dpCnt
left = left - dpCnt
} else {
err = c.migrateDisk(dataNode.Addr, disk, dataNode.DecommissionDstAddr, dataNode.DecommissionRaftForce, left, true, ManualDecommission)
if err != nil {
log.LogWarnf("action[TryDecommissionDataNode] %v failed", err)
continue
}
decommissionDpTotal += left
left = 0
}
decommissionDiskList = append(decommissionDiskList, disk)
}
//put all dp to nodeset's decommission list
//for _, dp := range toBeOffLinePartitions {
// dp.MarkDecommissionStatus(dataNode.Addr, dataNode.DecommissionDstAddr, "",
// dataNode.DecommissionRaftForce, dataNode.DecommissionTerm, c)
// c.syncUpdateDataPartition(dp)
// ns.AddToDecommissionDataPartitionList(dp)
// toBeOffLinePartitionIds = append(toBeOffLinePartitionIds, dp.PartitionID)
//}
// disks wait for decommission
dataNode.SetDecommissionStatus(DecommissionPrepare)
// avoid allocating dp on this node
dataNode.ToBeOffline = true
dataNode.DecommissionDiskList = decommissionDiskList
dataNode.DecommissionDpTotal = decommissionDpTotal
log.LogInfof("action[TryDecommissionDataNode] try decommission disk[%v] from dataNode[%s] "+
"raftForce [%v] to dst [%v] DecommissionDpTotal[%v]",
decommissionDiskList, dataNode.Addr, dataNode.DecommissionRaftForce,
dataNode.DecommissionDstAddr, dataNode.DecommissionDpTotal)
}
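// migrateDisk marks a disk for decommission: it reuses or creates the DecommissionDisk entry for
// nodeAddr_diskPath (rejecting disks that are already being migrated), persists it, and adds it to
// the nodeset decommission list.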
func (c *Cluster) migrateDisk(nodeAddr, diskPath, dstPath string, raftForce bool, limit int, diskDisable bool, migrateType uint32) (err error) {
var disk *DecommissionDisk
key := fmt.Sprintf("%s_%s", nodeAddr, diskPath)
if value, ok := c.DecommissionDisks.Load(key); ok {
disk = value.(*DecommissionDisk)
status := disk.GetDecommissionStatus()
if status == markDecommission || status == DecommissionRunning {
err = fmt.Errorf("migrate src(%v) diskPath(%v)s still on working, please wait,check or cancel if abnormal",
nodeAddr, diskPath)
log.LogWarnf("action[addDecommissionDisk] %v", err)
return
}
} else {
disk = &DecommissionDisk{
SrcAddr: nodeAddr,
DiskPath: diskPath,
DiskDisable: diskDisable,
}
c.DecommissionDisks.Store(disk.GenerateKey(), disk)
}
disk.Type = migrateType
// the disk should decommission all of its dp
disk.markDecommission(dstPath, raftForce, limit)
if err = c.syncAddDecommissionDisk(disk); err != nil {
err = fmt.Errorf("action[addDecommissionDisk],clusterID[%v] dataNodeAddr:%v diskPath:%v err:%v ",
c.Name, nodeAddr, diskPath, err.Error())
Warn(c.Name, err.Error())
c.delDecommissionDiskFromCache(disk)
return
}
// add to the nodeset decommission list
c.addDecommissionDiskToNodeset(disk)
log.LogInfof("action[addDecommissionDisk],clusterID[%v] dataNodeAddr:%v,diskPath[%v] raftForce [%v] "+
"limit [%v], diskDisable [%v], migrateType [%v] term [%v]",
c.Name, nodeAddr, diskPath, raftForce, limit, diskDisable, migrateType, disk.DecommissionTerm)
return
}
func (c *Cluster) restoreStoppedAutoDecommissionDisk(nodeAddr, diskPath string) (err error) {
var disk *DecommissionDisk
key := fmt.Sprintf("%s_%s", nodeAddr, diskPath)
if value, ok := c.DecommissionDisks.Load(key); ok {
disk = value.(*DecommissionDisk)
} else {
return errors.NewErrorf("cannot find auto decommission disk %v", key)
}
if disk.GetDecommissionStatus() != DecommissionPause {
err = fmt.Errorf("decommission disk [%v]is not stopped: %v", key, disk.GetDecommissionStatus())
log.LogWarnf("action[restoreStoppedAutoDecommissionDisk] %v", err)
return
}
if disk.IsManualDecommissionDisk() {
err = fmt.Errorf("decommission disk [%v]is not manual decommission type: %v", key, disk.Type)
log.LogWarnf("action[restoreStoppedAutoDecommissionDisk] %v", err)
return
}
disk.SetDecommissionStatus(markDecommission)
c.syncAddDecommissionDisk(disk)
log.LogInfof("action[restoreStoppedAutoDecommissionDisk],clusterID[%v] dataNodeAddr:%v,diskPath[%v] ",
c.Name, nodeAddr, diskPath)
return
}
func (c *Cluster) scheduleToCheckDecommissionDisk() {
go func() {
for {
if c.partition.IsRaftLeader() && c.metaReady {
c.checkDecommissionDisk()
}
time.Sleep(10 * time.Second)
}
}()
}
func (c *Cluster) checkDecommissionDisk() {
// decommission disk mark
c.DecommissionDisks.Range(func(key, value interface{}) bool {
disk := value.(*DecommissionDisk)
status := disk.GetDecommissionStatus()
if status == DecommissionSuccess || status == DecommissionFail {
if time.Since(time.Unix(disk.DecommissionCompleteTime, 0)) > (20 * time.Minute) {
if err := c.syncDeleteDecommissionDisk(disk); err != nil {
msg := fmt.Sprintf("action[checkDecommissionDisk],clusterID[%v] node[%v] disk[%v],"+
"syncDeleteDecommissionDisk failed,err[%v]",
c.Name, disk.SrcAddr, disk.DiskPath, err)
log.LogWarnf("%s", msg)
} else {
c.delDecommissionDiskFromCache(disk)
log.LogDebugf("action[checkDecommissionDisk] delete DecommissionDisk[%s] status(%v)",
disk.GenerateKey(), status)
}
}
}
return true
})
}
func (c *Cluster) scheduleToBadDisk() {
go func() {
for {
if c.partition.IsRaftLeader() {
c.checkBadDisk()
}
time.Sleep(10 * time.Second)
}
}()
}
func (c *Cluster) checkBadDisk() {
c.dataNodes.Range(func(addr, node interface{}) bool {
//TODO add to auto decommission disk
//dataNode, ok := node.(*DataNode)
//if !ok {
// return true
//}
//for _, badDisk := range dataNode.BadDisks {
//
//}
return true
})
}
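// TryDecommissionDisk marks the data partitions on the disk for decommission, adds them to the nodeset's
// decommission list, and moves the disk to DecommissionRunning; if there are no partitions on the disk
// the decommission succeeds immediately.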
func (c *Cluster) TryDecommissionDisk(disk *DecommissionDisk) {
var (
node *DataNode
err error
badPartitionIds []uint64
badPartitions []*DataPartition
rstMsg string
zone *Zone
ns *nodeSet
)
defer func() {
if err != nil {
disk.DecommissionRetry++
}
c.syncUpdateDecommissionDisk(disk)
}()
if node, err = c.dataNode(disk.SrcAddr); err != nil {
log.LogWarnf("action[TryDecommissionDisk] cannot find dataNode[%s]", disk.SrcAddr)
disk.markDecommissionFailed()
return
}
badPartitions = node.badPartitions(disk.DiskPath, c)
// check dp being decommissioned last time
lastBadPartitions := c.getAllDecommissionDataPartitionByDisk(disk.SrcAddr, disk.DiskPath)
badPartitions = mergeDataPartitionArr(badPartitions, lastBadPartitions)
if len(badPartitions) == 0 {
log.LogInfof("action[TryDecommissionDisk] receive decommissionDisk node[%v] "+
"no any partitions on disk[%v],offline successfully",
node.Addr, disk.DiskPath)
disk.markDecommissionSuccess()
disk.DecommissionDpTotal = 0
if disk.DiskDisable {
c.addAndSyncDecommissionedDisk(node, disk.DiskPath)
}
return
}
// recover from pause
if disk.DecommissionDpTotal != InvalidDecommissionDpCnt {
badPartitions = lastBadPartitions
} else { // the first time for decommission
if disk.DecommissionDpCount == 0 || disk.DecommissionDpCount > len(badPartitions) {
disk.DecommissionDpTotal = len(badPartitions)
} else {
disk.DecommissionDpTotal = disk.DecommissionDpCount
badPartitions = badPartitions[:disk.DecommissionDpCount]
}
}
if zone, err = c.t.getZone(node.ZoneName); err != nil {
log.LogWarnf("action[TryDecommissionDisk] find datanode[%s] zone failed[%v]",
node.Addr, err.Error())
disk.markDecommissionFailed()
return
}
if ns, err = zone.getNodeSet(node.NodeSetID); err != nil {
log.LogWarnf("action[TryDecommissionDisk] find datanode[%s] nodeset[%v] failed[%v]",
node.Addr, node.NodeSetID, err.Error())
disk.markDecommissionFailed()
return
}
for _, dp := range badPartitions {
// dp with decommission success cannot be reset during master load metadata
if dp.IsDecommissionSuccess() && dp.DecommissionTerm == disk.DecommissionTerm {
log.LogInfof("action[TryDecommissionDisk] reset dp [%v] decommission status for disk %v:%v",
dp.PartitionID, disk.SrcAddr, disk.DiskPath)
dp.ResetDecommissionStatus()
c.syncUpdateDataPartition(dp)
disk.DecommissionDpTotal -= 1
continue
}
if !dp.MarkDecommissionStatus(node.Addr, disk.DstAddr, disk.DiskPath, disk.DecommissionRaftForce, disk.DecommissionTerm, c) {
continue
}
c.syncUpdateDataPartition(dp)
ns.AddToDecommissionDataPartitionList(dp, c)
badPartitionIds = append(badPartitionIds, dp.PartitionID)
}
disk.SetDecommissionStatus(DecommissionRunning)
if disk.DiskDisable {
c.addAndSyncDecommissionedDisk(node, disk.DiskPath)
}
rstMsg = fmt.Sprintf("receive decommissionDisk node[%v] disk[%v],badPartitionIds %v,raftForce %v"+
" DecommissionDpTotal %v term %v Type[%v] has offline to [%v]successfully",
node.Addr, disk.DiskPath, badPartitionIds, disk.DecommissionRaftForce,
disk.DecommissionDpTotal, disk.DecommissionTerm, disk.Type, disk.DstAddr)
log.LogInfof("action[TryDecommissionDisk] %s", rstMsg)
}
func (c *Cluster) getAllDecommissionDataPartitionByDataNode(addr string) (partitions []*DataPartition) {
partitions = make([]*DataPartition, 0)
safeVols := c.allVols()
for _, vol := range safeVols {
for _, dp := range vol.dataPartitions.partitions {
if dp.DecommissionSrcAddr == addr {
partitions = append(partitions, dp)
}
}
}
return
}
func (c *Cluster) getAllDecommissionDataPartitionByDiskAndTerm(addr, disk string, term uint64) (partitions []*DataPartition) {
partitions = make([]*DataPartition, 0)
safeVols := c.allVols()
for _, vol := range safeVols {
for _, dp := range vol.dataPartitions.partitions {
if dp.DecommissionSrcAddr == addr && dp.DecommissionSrcDiskPath == disk && dp.DecommissionTerm == term {
partitions = append(partitions, dp)
}
}
}
return
}
func (c *Cluster) getAllDecommissionDataPartitionByDisk(addr, disk string) (partitions []*DataPartition) {
partitions = make([]*DataPartition, 0)
safeVols := c.allVols()
for _, vol := range safeVols {
for _, dp := range vol.dataPartitions.partitions {
if dp.DecommissionSrcAddr == addr && dp.DecommissionSrcDiskPath == disk {
partitions = append(partitions, dp)
}
}
}
return
}
func (c *Cluster) listQuotaAll() (volsInfo []*proto.VolInfo) {
c.volMutex.RLock()
defer c.volMutex.RUnlock()
for _, vol := range c.vols {
if vol.quotaManager.HasQuota() {
stat := volStat(vol, false)
volInfo := proto.NewVolInfo(vol.Name, vol.Owner, vol.createTime, vol.status(), stat.TotalSize,
stat.UsedSize, stat.DpReadOnlyWhenVolFull)
volsInfo = append(volsInfo, volInfo)
}
}
return
}
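// mergeDataPartitionArr merges two data partition slices and de-duplicates them by partition ID;
// entries from newDps take precedence.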
func mergeDataPartitionArr(newDps, oldDps []*DataPartition) []*DataPartition {
ret := make([]*DataPartition, 0)
tempMap := make(map[uint64]bool)
for _, v := range newDps {
ret = append(ret, v)
tempMap[v.PartitionID] = true
}
for _, v := range oldDps {
if !tempMap[v.PartitionID] {
ret = append(ret, v)
tempMap[v.PartitionID] = true
}
}
return ret
}
func (c *Cluster) generateClusterUuid() (err error) {
cid := "CID-" + uuid.NewString()
c.clusterUuid = cid
if err := c.syncPutCluster(); err != nil {
c.clusterUuid = ""
return errors.NewErrorf(fmt.Sprintf("syncPutCluster failed %v", err.Error()))
}
return
}
func (c *Cluster) initAuthentication(cfg *config.Config) {
var (
authnodes []string
enableHTTPS bool
certFile string
)
authNodeHostConfig := cfg.GetString(AuthNodeHost)
authnodes = strings.Split(authNodeHostConfig, ",")
enableHTTPS = cfg.GetBool(AuthNodeEnableHTTPS)
if enableHTTPS {
certFile = cfg.GetString(AuthNodeCertFile)
}
c.ac = authSDK.NewAuthClient(authnodes, enableHTTPS, certFile)
}
func (c *Cluster) parseAndCheckClientIDKey(r *http.Request, Type proto.MsgType) (err error) {
var (
clientIDKey string
clientID string
clientKey []byte
)
if err = r.ParseForm(); err != nil {
return
}
if clientIDKey, err = extractClientIDKey(r); err != nil {
return
}
if clientID, clientKey, err = proto.ExtractIDAndAuthKey(clientIDKey); err != nil {
return
}
if err = proto.IsValidClientID(clientID); err != nil {
return
}
ticket, err := c.ac.API().GetTicket(clientID, string(clientKey), proto.MasterServiceID)
if err != nil {
err = fmt.Errorf("get ticket from auth node failed, clientIDKey[%v], err[%v]", clientIDKey, err.Error())
return
}
_, err = checkTicket(ticket.Ticket, c.MasterSecretKey, Type)
if err != nil {
err = fmt.Errorf("check ticket failed, clientIDKey[%v], err[%v]", clientIDKey, err.Error())
return
}
return
}
func (c *Cluster) addLcNode(nodeAddr string) (id uint64, err error) {
var ln *LcNode
if value, ok := c.lcNodes.Load(nodeAddr); ok {
ln = value.(*LcNode)
ln.ReportTime = time.Now()
ln.clean()
ln.TaskManager = newAdminTaskManager(ln.Addr, c.Name)
log.LogInfof("action[addLcNode] already add nodeAddr: %v, id: %v", nodeAddr, ln.ID)
} else {
ln = newLcNode(nodeAddr, c.Name)
// allocate LcNode id
if id, err = c.idAlloc.allocateCommonID(); err != nil {
goto errHandler
}
ln.ID = id
log.LogInfof("action[addLcNode] allocateCommonID: %v", id)
}
if err = c.syncAddLcNode(ln); err != nil {
goto errHandler
}
c.lcNodes.Store(nodeAddr, ln)
c.lcMgr.lcNodeStatus.Lock()
c.lcMgr.lcNodeStatus.WorkingCount[nodeAddr] = 0
c.lcMgr.lcNodeStatus.Unlock()
c.snapshotMgr.lcNodeStatus.Lock()
c.snapshotMgr.lcNodeStatus.WorkingCount[nodeAddr] = 0
c.snapshotMgr.lcNodeStatus.Unlock()
log.LogInfof("action[addLcNode], clusterID[%v], lcNodeAddr: %v, id: %v, add idleNodes", c.Name, nodeAddr, ln.ID)
return ln.ID, nil
errHandler:
err = fmt.Errorf("action[addLcNode],clusterID[%v] lcNodeAddr:%v err:%v ", c.Name, nodeAddr, err.Error())
log.LogError(errors.Stack(err))
Warn(c.Name, err.Error())
return
}
type LcNodeStatInfo struct {
Addr string
}
type LcNodeInfoResponse struct {
RegisterInfos []*LcNodeStatInfo
LcConfigurations map[string]*proto.LcConfiguration
LcRuleTaskStatus lcRuleTaskStatus
LcNodeStatus lcNodeStatus
SnapshotVerStatus lcSnapshotVerStatus
SnapshotNodeStatus lcNodeStatus
}
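// getAllLcNodeInfo snapshots the lifecycle and snapshot-deletion state by marshalling each status
// structure under its own lock and unmarshalling it into the response.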
func (c *Cluster) getAllLcNodeInfo() (rsp *LcNodeInfoResponse, err error) {
rsp = &LcNodeInfoResponse{}
c.lcNodes.Range(func(addr, value interface{}) bool {
rsp.RegisterInfos = append(rsp.RegisterInfos, &LcNodeStatInfo{
Addr: addr.(string),
})
return true
})
var b []byte
c.lcMgr.RLock()
if b, err = json.Marshal(c.lcMgr.lcConfigurations); err != nil {
c.lcMgr.RUnlock()
return
}
c.lcMgr.RUnlock()
if err = json.Unmarshal(b, &rsp.LcConfigurations); err != nil {
return
}
c.lcMgr.lcRuleTaskStatus.RLock()
if b, err = json.Marshal(c.lcMgr.lcRuleTaskStatus); err != nil {
c.lcMgr.lcRuleTaskStatus.RUnlock()
return
}
c.lcMgr.lcRuleTaskStatus.RUnlock()
if err = json.Unmarshal(b, &rsp.LcRuleTaskStatus); err != nil {
return
}
c.lcMgr.lcNodeStatus.RLock()
if b, err = json.Marshal(c.lcMgr.lcNodeStatus); err != nil {
c.lcMgr.lcNodeStatus.RUnlock()
return
}
c.lcMgr.lcNodeStatus.RUnlock()
if err = json.Unmarshal(b, &rsp.LcNodeStatus); err != nil {
return
}
c.snapshotMgr.lcSnapshotTaskStatus.RLock()
if b, err = json.Marshal(c.snapshotMgr.lcSnapshotTaskStatus); err != nil {
c.snapshotMgr.lcSnapshotTaskStatus.RUnlock()
return
}
c.snapshotMgr.lcSnapshotTaskStatus.RUnlock()
if err = json.Unmarshal(b, &rsp.SnapshotVerStatus); err != nil {
return
}
c.snapshotMgr.lcNodeStatus.RLock()
if b, err = json.Marshal(c.snapshotMgr.lcNodeStatus); err != nil {
c.snapshotMgr.lcNodeStatus.RUnlock()
return
}
c.snapshotMgr.lcNodeStatus.RUnlock()
if err = json.Unmarshal(b, &rsp.SnapshotNodeStatus); err != nil {
return
}
return
}
func (c *Cluster) clearLcNodes() {
c.lcNodes.Range(func(key, value interface{}) bool {
lcNode := value.(*LcNode)
c.lcNodes.Delete(key)
lcNode.clean()
return true
})
}
func (c *Cluster) delLcNode(nodeAddr string) (err error) {
c.lcMgr.lcNodeStatus.RemoveNode(nodeAddr)
c.snapshotMgr.lcNodeStatus.RemoveNode(nodeAddr)
lcNode, err := c.lcNode(nodeAddr)
if err != nil {
log.LogErrorf("action[delLcNode], clusterID:%v, lcNodeAddr:%v, load err:%v ", c.Name, nodeAddr, err)
return
}
if err = c.syncDeleteLcNode(lcNode); err != nil {
log.LogErrorf("action[delLcNode], clusterID:%v, lcNodeAddr:%v syncDeleteLcNode err:%v ", c.Name, nodeAddr, err)
return
}
val, loaded := c.lcNodes.LoadAndDelete(nodeAddr)
log.LogInfof("action[delLcNode], clusterID:%v, lcNodeAddr:%v, LoadAndDelete result val:%v, loaded:%v", c.Name, nodeAddr, val, loaded)
return
}
func (c *Cluster) scheduleToLcScan() {
go func() {
for {
now := time.Now()
next := now.Add(time.Hour * 24)
next = time.Date(next.Year(), next.Month(), next.Day(), 1, 0, 0, 0, next.Location())
t := time.NewTimer(next.Sub(now))
<-t.C
if c.partition != nil && c.partition.IsRaftLeader() {
c.startLcScan()
}
}
}()
}
func (c *Cluster) startLcScan() {
defer func() {
if r := recover(); r != nil {
log.LogWarnf("startLcScan occurred panic,err[%v]", r)
WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
"startLcScan occurred panic")
}
}()
c.lcMgr.startLcScan()
}
func (c *Cluster) scheduleToSnapshotDelVerScan() {
go c.snapshotMgr.process()
// make sure all in-progress ver deleting tasks are resumed before checking
waitTime := time.Second * defaultIntervalToCheck
waited := false
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
if !waited {
log.LogInfof("wait for %v seconds once after becoming leader to make sure all the ver deleting tasks are resumed",
waitTime)
time.Sleep(waitTime)
waited = true
}
c.getSnapshotDelVer()
}
time.Sleep(waitTime)
}
}()
}
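// getSnapshotDelVer scans all volumes for versions in VersionDeleting state and queues a SnapshotVerDelTask for each of them.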
func (c *Cluster) getSnapshotDelVer() {
if c.partition == nil || !c.partition.IsRaftLeader() {
log.LogWarn("getSnapshotDelVer: master is not leader")
return
}
c.snapshotMgr.lcSnapshotTaskStatus.ResetVerInfos()
vols := c.allVols()
for volName, vol := range vols {
volVerInfoList := vol.VersionMgr.getVersionList()
for _, volVerInfo := range volVerInfoList.VerList {
if volVerInfo.Status == proto.VersionDeleting {
task := &proto.SnapshotVerDelTask{
Id: fmt.Sprintf("%s:%d", volName, volVerInfo.Ver),
VolName: volName,
VolVersionInfo: volVerInfo,
}
c.snapshotMgr.lcSnapshotTaskStatus.AddVerInfo(task)
}
}
}
log.LogDebug("getSnapshotDelVer AddVerInfo finish")
c.snapshotMgr.lcSnapshotTaskStatus.DeleteOldResult()
log.LogDebug("getSnapshotDelVer DeleteOldResult finish")
}
func (c *Cluster) SetBucketLifecycle(req *proto.LcConfiguration) error {
lcConf := &proto.LcConfiguration{
VolName: req.VolName,
Rules: req.Rules,
}
if c.lcMgr.GetS3BucketLifecycle(req.VolName) != nil {
if err := c.syncUpdateLcConf(lcConf); err != nil {
err = fmt.Errorf("action[SetS3BucketLifecycle],clusterID[%v] vol:%v err:%v ", c.Name, lcConf.VolName, err.Error())
log.LogError(errors.Stack(err))
Warn(c.Name, err.Error())
}
} else {
if err := c.syncAddLcConf(lcConf); err != nil {
err = fmt.Errorf("action[SetS3BucketLifecycle],clusterID[%v] vol:%v err:%v ", c.Name, lcConf.VolName, err.Error())
log.LogError(errors.Stack(err))
Warn(c.Name, err.Error())
}
}
_ = c.lcMgr.SetS3BucketLifecycle(lcConf)
log.LogInfof("action[SetS3BucketLifecycle],clusterID[%v] vol:%v", c.Name, lcConf.VolName)
return nil
}
func (c *Cluster) GetBucketLifecycle(VolName string) (lcConf *proto.LcConfiguration) {
lcConf = c.lcMgr.GetS3BucketLifecycle(VolName)
log.LogInfof("action[GetS3BucketLifecycle],clusterID[%v] vol:%v", c.Name, VolName)
return
}
func (c *Cluster) DelBucketLifecycle(VolName string) {
lcConf := &proto.LcConfiguration{
VolName: VolName,
}
if err := c.syncDeleteLcConf(lcConf); err != nil {
err = fmt.Errorf("action[DelS3BucketLifecycle],clusterID[%v] vol:%v err:%v ", c.Name, VolName, err.Error())
log.LogError(errors.Stack(err))
Warn(c.Name, err.Error())
}
c.lcMgr.DelS3BucketLifecycle(VolName)
log.LogInfof("action[DelS3BucketLifecycle],clusterID[%v] vol:%v", c.Name, VolName)
return
}
func (c *Cluster) addDecommissionDiskToNodeset(dd *DecommissionDisk) (err error) {
var (
node *DataNode
zone *Zone
ns *nodeSet
)
if node, err = c.dataNode(dd.SrcAddr); err != nil {
log.LogWarnf("action[TryDecommissionDisk] cannot find dataNode[%s]", dd.SrcAddr)
return
}
if zone, err = c.t.getZone(node.ZoneName); err != nil {
log.LogWarnf("action[TryDecommissionDisk] find datanode[%s] zone failed[%v]",
node.Addr, err.Error())
return
}
if ns, err = zone.getNodeSet(node.NodeSetID); err != nil {
log.LogWarnf("action[TryDecommissionDisk] find datanode[%s] nodeset[%v] failed[%v]",
node.Addr, node.NodeSetID, err.Error())
return
}
ns.AddDecommissionDisk(dd)
return nil
}
func (c *Cluster) AutoDecommissionDiskIsEnabled() bool {
c.AutoDecommissionDiskMux.Lock()
defer c.AutoDecommissionDiskMux.Unlock()
return c.EnableAutoDecommissionDisk
}
func (c *Cluster) SetAutoDecommissionDisk(flag bool) {
c.AutoDecommissionDiskMux.Lock()
defer c.AutoDecommissionDiskMux.Unlock()
c.EnableAutoDecommissionDisk = flag
}
func (c *Cluster) GetDecommissionDataPartitionRecoverTimeOut() time.Duration {
if c.cfg.DpRepairTimeOut == 0 {
return time.Hour * 2
} else {
return time.Second * time.Duration(c.cfg.DpRepairTimeOut)
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"math"
"strconv"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
type nodeStatInfo = proto.NodeStatInfo
type volStatInfo = proto.VolStatInfo
func newVolStatInfo(name string, total, used, cacheTotal, cacheUsed, inodeCount uint64) *volStatInfo {
usedRatio := strconv.FormatFloat(float64(used)/float64(total), 'f', 3, 32)
cacheUsedRatio := "0.00"
if cacheTotal > 0 {
cacheUsedRatio = strconv.FormatFloat(float64(cacheUsed)/float64(cacheTotal), 'f', 3, 32)
}
return &volStatInfo{
Name: name,
TotalSize: total,
UsedSize: used,
UsedRatio: usedRatio,
CacheTotalSize: cacheTotal,
CacheUsedSize: cacheUsed,
CacheUsedRatio: cacheUsedRatio,
InodeCount: inodeCount,
}
}
func newZoneStatInfo() *proto.ZoneStat {
return &proto.ZoneStat{DataNodeStat: new(proto.ZoneNodesStat), MetaNodeStat: new(proto.ZoneNodesStat)}
}
// Check the total space, available space, and daily-used space in data nodes, meta nodes, and volumes
func (c *Cluster) updateStatInfo() {
defer func() {
if r := recover(); r != nil {
log.LogWarnf("updateStatInfo occurred panic,err[%v]", r)
WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
"updateStatInfo occurred panic")
}
}()
c.updateDataNodeStatInfo()
c.updateMetaNodeStatInfo()
c.updateVolStatInfo()
c.updateZoneStatInfo()
}
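// updateZoneStatInfo recomputes per-zone dataNode and metaNode statistics
// (total, used, and available space in GB, plus writable node counts and used ratio).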
func (c *Cluster) updateZoneStatInfo() {
for _, zone := range c.t.zones {
zs := newZoneStatInfo()
c.zoneStatInfos[zone.name] = zs
zone.dataNodes.Range(func(key, value interface{}) bool {
zs.DataNodeStat.TotalNodes++
node := value.(*DataNode)
if node.isActive && node.isWriteAble() {
zs.DataNodeStat.WritableNodes++
}
zs.DataNodeStat.Total += float64(node.Total) / float64(util.GB)
zs.DataNodeStat.Used += float64(node.Used) / float64(util.GB)
return true
})
zs.DataNodeStat.Total = fixedPoint(zs.DataNodeStat.Total, 2)
zs.DataNodeStat.Used = fixedPoint(zs.DataNodeStat.Used, 2)
zs.DataNodeStat.Avail = fixedPoint(zs.DataNodeStat.Total-zs.DataNodeStat.Used, 2)
if zs.DataNodeStat.Total == 0 {
zs.DataNodeStat.Total = 1
}
zs.DataNodeStat.UsedRatio = fixedPoint(float64(zs.DataNodeStat.Used)/float64(zs.DataNodeStat.Total), 2)
zone.metaNodes.Range(func(key, value interface{}) bool {
zs.MetaNodeStat.TotalNodes++
node := value.(*MetaNode)
if node.IsActive && node.isWritable() {
zs.MetaNodeStat.WritableNodes++
}
zs.MetaNodeStat.Total += float64(node.Total) / float64(util.GB)
zs.MetaNodeStat.Used += float64(node.Used) / float64(util.GB)
return true
})
zs.MetaNodeStat.Total = fixedPoint(zs.MetaNodeStat.Total, 2)
zs.MetaNodeStat.Used = fixedPoint(zs.MetaNodeStat.Used, 2)
zs.MetaNodeStat.Avail = fixedPoint(zs.MetaNodeStat.Total-zs.MetaNodeStat.Used, 2)
if zs.MetaNodeStat.Total == 0 {
zs.MetaNodeStat.Total = 1
}
zs.MetaNodeStat.UsedRatio = fixedPoint(float64(zs.MetaNodeStat.Used)/float64(zs.MetaNodeStat.Total), 2)
}
}
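// fixedPoint rounds x to the given number of decimal places, e.g. fixedPoint(3.14159, 2) returns 3.14.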
func fixedPoint(x float64, scale int) float64 {
decimal := math.Pow10(scale)
return float64(int(math.Round(x*decimal))) / decimal
}
func (c *Cluster) updateDataNodeStatInfo() {
var (
total uint64
used uint64
avail uint64
)
c.dataNodes.Range(func(addr, node interface{}) bool {
dataNode := node.(*DataNode)
total = total + dataNode.Total
used = used + dataNode.Used
if dataNode.isActive {
avail = avail + dataNode.AvailableSpace
}
return true
})
if total <= 0 {
return
}
usedRate := float64(used) / float64(total)
if usedRate > spaceAvailableRate {
Warn(c.Name, fmt.Sprintf("clusterId[%v] space utilization reached [%v],usedSpace[%v],totalSpace[%v] please add dataNode",
c.Name, usedRate, used, total))
}
c.dataNodeStatInfo.TotalGB = total / util.GB
c.dataNodeStatInfo.AvailGB = avail / util.GB
usedGB := used / util.GB
c.dataNodeStatInfo.IncreasedGB = int64(usedGB) - int64(c.dataNodeStatInfo.UsedGB)
c.dataNodeStatInfo.UsedGB = usedGB
c.dataNodeStatInfo.UsedRatio = strconv.FormatFloat(usedRate, 'f', 3, 32)
}
func (c *Cluster) updateMetaNodeStatInfo() {
var (
total uint64
used uint64
avail uint64
)
c.metaNodes.Range(func(addr, node interface{}) bool {
metaNode := node.(*MetaNode)
total = total + metaNode.Total
used = used + metaNode.Used
if metaNode.IsActive {
avail = avail + metaNode.MaxMemAvailWeight
}
return true
})
if total <= 0 {
return
}
useRate := float64(used) / float64(total)
if useRate > spaceAvailableRate {
Warn(c.Name, fmt.Sprintf("clusterId[%v] space utilization reached [%v],usedSpace[%v],totalSpace[%v] please add metaNode",
c.Name, useRate, used, total))
}
c.metaNodeStatInfo.TotalGB = total / util.GB
c.metaNodeStatInfo.AvailGB = avail / util.GB
newUsed := used / util.GB
c.metaNodeStatInfo.IncreasedGB = int64(newUsed) - int64(c.metaNodeStatInfo.UsedGB)
c.metaNodeStatInfo.UsedGB = newUsed
c.metaNodeStatInfo.UsedRatio = strconv.FormatFloat(useRate, 'f', 3, 32)
}
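// updateVolStatInfo refreshes per-volume capacity, usage and inode statistics;
// cache statistics are zeroed for hot volumes.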
func (c *Cluster) updateVolStatInfo() {
vols := c.copyVols()
for _, vol := range vols {
used, total := vol.totalUsedSpace(), vol.Capacity*util.GB
if total <= 0 {
continue
}
cacheUsed, cacheTotal := vol.cfsUsedSpace(), vol.CacheCapacity*util.GB
if proto.IsHot(vol.VolType) {
cacheUsed, cacheTotal = 0, 0
}
var inodeCount uint64
vol.mpsLock.RLock()
for _, mp := range vol.MetaPartitions {
inodeCount += mp.InodeCount
}
vol.mpsLock.RUnlock()
c.volStatInfo.Store(vol.Name, newVolStatInfo(vol.Name, total, used, cacheTotal, cacheUsed, inodeCount))
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"encoding/json"
"fmt"
"runtime"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
func (c *Cluster) addDataNodeTasks(tasks []*proto.AdminTask) {
for _, t := range tasks {
c.addDataNodeTask(t)
}
}
func (c *Cluster) addDataNodeTask(task *proto.AdminTask) {
if task == nil {
return
}
if node, err := c.dataNode(task.OperatorAddr); err != nil {
log.LogWarn(fmt.Sprintf("action[putTasks],nodeAddr:%v,taskID:%v,err:%v", task.OperatorAddr, task.ID, err))
} else {
node.TaskManager.AddTask(task)
}
}
func (c *Cluster) addMetaNodeTasks(tasks []*proto.AdminTask) {
for _, t := range tasks {
if t == nil {
continue
}
if node, err := c.metaNode(t.OperatorAddr); err != nil {
log.LogWarn(fmt.Sprintf("action[putTasks],nodeAddr:%v,taskID:%v,err:%v", t.OperatorAddr, t.ID, err.Error()))
} else {
node.Sender.AddTask(t)
}
}
}
func (c *Cluster) addLcNodeTasks(tasks []*proto.AdminTask) {
for _, t := range tasks {
if t == nil {
continue
}
if node, err := c.lcNode(t.OperatorAddr); err != nil {
log.LogWarn(fmt.Sprintf("action[putTasks],nodeAddr:%v,taskID:%v,err:%v", t.OperatorAddr, t.ID, err.Error()))
} else {
node.TaskManager.AddTask(t)
}
}
}
func (c *Cluster) waitForResponseToLoadDataPartition(partitions []*DataPartition) {
var wg sync.WaitGroup
for _, dp := range partitions {
wg.Add(1)
go func(dp *DataPartition) {
defer func() {
wg.Done()
if err := recover(); err != nil {
const size = runtimeStackBufSize
buf := make([]byte, size)
buf = buf[:runtime.Stack(buf, false)]
log.LogError(fmt.Sprintf("doLoadDataPartition panic %v: %s\n", err, buf))
}
}()
c.doLoadDataPartition(dp)
}(dp)
}
wg.Wait()
}
func (c *Cluster) loadDataPartition(dp *DataPartition) {
go func() {
c.doLoadDataPartition(dp)
}()
}
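// migrateMetaPartition moves the replica on srcAddr to a new meta node.
// If targetAddr is empty, a destination is chosen from the same node set first,
// then from another node set in the same zone, and finally from another zone.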
func (c *Cluster) migrateMetaPartition(srcAddr, targetAddr string, mp *MetaPartition) (err error) {
var (
newPeers []proto.Peer
metaNode *MetaNode
zone *Zone
ns *nodeSet
excludeNodeSets []uint64
oldHosts []string
zones []string
)
log.LogWarnf("action[migrateMetaPartition],volName[%v], migrate from src[%s] to target[%s],partitionID[%v] begin",
mp.volName, srcAddr, targetAddr, mp.PartitionID)
mp.RLock()
if !contains(mp.Hosts, srcAddr) {
mp.RUnlock()
log.LogErrorf("action[migrateMetaPartition],volName[%v], src[%s] not exist, partitionID[%v]",
mp.volName, srcAddr, mp.PartitionID)
return fmt.Errorf("migrateMetaPartition src [%s] is not exist in mp(%d)", srcAddr, mp.PartitionID)
}
oldHosts = mp.Hosts
mp.RUnlock()
if err = c.validateDecommissionMetaPartition(mp, srcAddr, false); err != nil {
goto errHandler
}
if metaNode, err = c.metaNode(srcAddr); err != nil {
goto errHandler
}
if zone, err = c.t.getZone(metaNode.ZoneName); err != nil {
goto errHandler
}
if ns, err = zone.getNodeSet(metaNode.NodeSetID); err != nil {
goto errHandler
}
if targetAddr != "" {
newPeers = []proto.Peer{{
Addr: targetAddr,
}}
} else if _, newPeers, err = ns.getAvailMetaNodeHosts(oldHosts, 1); err != nil {
if _, ok := c.vols[mp.volName]; !ok {
log.LogWarnf("[migrateMetaPartition] clusterID[%v] partitionID:%v on node:[%v]",
c.Name, mp.PartitionID, mp.Hosts)
return
}
if c.isFaultDomain(c.vols[mp.volName]) {
log.LogWarnf("[migrateMetaPartition] clusterID[%v] partitionID:%v on node:[%v]",
c.Name, mp.PartitionID, mp.Hosts)
return
}
// choose a meta node in other node set in the same zone
excludeNodeSets = append(excludeNodeSets, ns.ID)
if _, newPeers, err = zone.getAvailNodeHosts(TypeMetaPartition, excludeNodeSets, oldHosts, 1); err != nil {
zones = mp.getLiveZones(srcAddr)
var excludeZone []string
if len(zones) == 0 {
excludeZone = append(excludeZone, zone.name)
} else {
excludeZone = append(excludeZone, zones[0])
}
// choose a meta node in other zone
if _, newPeers, err = c.getHostFromNormalZone(TypeMetaPartition, excludeZone, excludeNodeSets, oldHosts, 1, 1, ""); err != nil {
goto errHandler
}
}
}
if err = c.deleteMetaReplica(mp, srcAddr, false, false); err != nil {
goto errHandler
}
if err = c.addMetaReplica(mp, newPeers[0].Addr); err != nil {
goto errHandler
}
mp.IsRecover = true
c.putBadMetaPartitions(srcAddr, mp.PartitionID)
mp.RLock()
c.syncUpdateMetaPartition(mp)
mp.RUnlock()
Warn(c.Name, fmt.Sprintf("action[migrateMetaPartition] clusterID[%v] vol[%v] meta partition[%v] "+
"migrate addr[%v] success,new addr[%v]", c.Name, mp.volName, mp.PartitionID, srcAddr, newPeers[0].Addr))
return
errHandler:
msg := fmt.Sprintf("action[migrateMetaPartition],volName: %v,partitionID: %v,err: %v", mp.volName, mp.PartitionID, errors.Stack(err))
log.LogError(msg)
Warn(c.Name, msg)
if err != nil {
err = fmt.Errorf("action[migrateMetaPartition] vol[%v],partition[%v],err[%v]", mp.volName, mp.PartitionID, err)
}
return
}
// Take the given meta partition offline.
// 1. Check whether the meta partition can be taken offline.
// There are two cases where the partition is not allowed to be offline:
// (1) the replica is not in the latest host list
// (2) there are too few replicas
// 2. Choose a new available meta node.
// 3. Synchronously decommission the meta partition.
// 4. Synchronously create a new meta partition.
// 5. Persist the new host list.
func (c *Cluster) decommissionMetaPartition(nodeAddr string, mp *MetaPartition) (err error) {
if c.ForbidMpDecommission {
err = fmt.Errorf("cluster mataPartition decommission switch is disabled")
return
}
return c.migrateMetaPartition(nodeAddr, "", mp)
}
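// validateDecommissionMetaPartition checks whether the replica on nodeAddr can be taken
// offline: the partition must be allowed to go offline, must not be missing a replica,
// and must not be recovering; forceDel skips the missing and recovery checks.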
func (c *Cluster) validateDecommissionMetaPartition(mp *MetaPartition, nodeAddr string, forceDel bool) (err error) {
mp.RLock()
defer mp.RUnlock()
var vol *Vol
if vol, err = c.getVol(mp.volName); err != nil {
return
}
if err = mp.canBeOffline(nodeAddr, int(vol.mpReplicaNum)); err != nil {
return
}
if forceDel {
log.LogWarnf("action[validateDecommissionMetaPartition] mp relica be force delete without check missing and recovery status")
return
}
if err = mp.hasMissingOneReplica(nodeAddr, int(vol.mpReplicaNum)); err != nil {
return
}
if mp.IsRecover && !mp.activeMaxInodeSimilar() {
err = fmt.Errorf("vol[%v],meta partition[%v] is recovering,[%v] can't be decommissioned", vol.Name, mp.PartitionID, nodeAddr)
return
}
return
}
func (c *Cluster) checkInactiveMetaNodes() (inactiveMetaNodes []string, err error) {
inactiveMetaNodes = make([]string, 0)
c.metaNodes.Range(func(addr, node interface{}) bool {
metaNode := node.(*MetaNode)
if !metaNode.IsActive {
inactiveMetaNodes = append(inactiveMetaNodes, metaNode.Addr)
}
return true
})
log.LogInfof("clusterID[%v] inactiveMetaNodes:%v", c.Name, inactiveMetaNodes)
return
}
// check corrupt partitions related to this meta node
func (c *Cluster) checkCorruptMetaNode(metaNode *MetaNode) (corruptPartitions []*MetaPartition, err error) {
var (
partition *MetaPartition
mn *MetaNode
corruptPids []uint64
corruptReplicaNum uint8
)
metaNode.RLock()
defer metaNode.RUnlock()
for _, pid := range metaNode.PersistenceMetaPartitions {
corruptReplicaNum = 0
if partition, err = c.getMetaPartitionByID(pid); err != nil {
return
}
for _, host := range partition.Hosts {
if mn, err = c.metaNode(host); err != nil {
return
}
if !mn.IsActive {
corruptReplicaNum = corruptReplicaNum + 1
}
}
if corruptReplicaNum > partition.ReplicaNum/2 {
corruptPartitions = append(corruptPartitions, partition)
corruptPids = append(corruptPids, pid)
}
}
log.LogInfof("action[checkCorruptMetaNode],clusterID[%v] metaNodeAddr:[%v], corrupt partitions%v",
c.Name, metaNode.Addr, corruptPids)
return
}
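// VolNameSet is a set of volume names, used here to skip partitions that belong to
// volumes marked for deletion.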
type VolNameSet map[string]struct{}
func (c *Cluster) checkReplicaMetaPartitions() (
lackReplicaMetaPartitions []*MetaPartition, noLeaderMetaPartitions []*MetaPartition,
unavailableReplicaMPs []*MetaPartition, excessReplicaMetaPartitions, inodeCountNotEqualMPs, maxInodeNotEqualMPs, dentryCountNotEqualMPs []*MetaPartition, err error) {
lackReplicaMetaPartitions = make([]*MetaPartition, 0)
noLeaderMetaPartitions = make([]*MetaPartition, 0)
excessReplicaMetaPartitions = make([]*MetaPartition, 0)
inodeCountNotEqualMPs = make([]*MetaPartition, 0)
maxInodeNotEqualMPs = make([]*MetaPartition, 0)
dentryCountNotEqualMPs = make([]*MetaPartition, 0)
markDeleteVolNames := make(VolNameSet)
vols := c.copyVols()
for _, vol := range vols {
if vol.Status == proto.VolStatusMarkDelete {
markDeleteVolNames[vol.Name] = struct{}{}
continue
}
vol.mpsLock.RLock()
for _, mp := range vol.MetaPartitions {
if uint8(len(mp.Hosts)) < mp.ReplicaNum || uint8(len(mp.Replicas)) < mp.ReplicaNum {
lackReplicaMetaPartitions = append(lackReplicaMetaPartitions, mp)
}
if !mp.isLeaderExist() && (time.Now().Unix()-mp.LeaderReportTime > c.cfg.MpNoLeaderReportIntervalSec) {
noLeaderMetaPartitions = append(noLeaderMetaPartitions, mp)
}
if uint8(len(mp.Hosts)) > mp.ReplicaNum || uint8(len(mp.Replicas)) > mp.ReplicaNum {
excessReplicaMetaPartitions = append(excessReplicaMetaPartitions, mp)
}
for _, replica := range mp.Replicas {
if replica.Status == proto.Unavailable {
unavailableReplicaMPs = append(unavailableReplicaMPs, mp)
break
}
}
}
vol.mpsLock.RUnlock()
}
c.inodeCountNotEqualMP.Range(func(key, value interface{}) bool {
mp := value.(*MetaPartition)
if _, ok := markDeleteVolNames[mp.volName]; !ok {
inodeCountNotEqualMPs = append(inodeCountNotEqualMPs, mp)
}
return true
})
c.maxInodeNotEqualMP.Range(func(key, value interface{}) bool {
mp := value.(*MetaPartition)
if _, ok := markDeleteVolNames[mp.volName]; !ok {
maxInodeNotEqualMPs = append(maxInodeNotEqualMPs, mp)
}
return true
})
c.dentryCountNotEqualMP.Range(func(key, value interface{}) bool {
mp := value.(*MetaPartition)
if _, ok := markDeleteVolNames[mp.volName]; !ok {
dentryCountNotEqualMPs = append(dentryCountNotEqualMPs, mp)
}
return true
})
log.LogInfof("clusterID[%v], lackReplicaMetaPartitions count:[%v], noLeaderMetaPartitions count[%v]"+
"unavailableReplicaMPs count:[%v], excessReplicaMp count:[%v]",
c.Name, len(lackReplicaMetaPartitions), len(noLeaderMetaPartitions),
len(unavailableReplicaMPs), len(excessReplicaMetaPartitions))
return
}
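// deleteMetaReplica removes the meta replica on addr: it optionally validates the
// decommission, removes the raft member, and then deletes the replica on the meta node.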
func (c *Cluster) deleteMetaReplica(partition *MetaPartition, addr string, validate bool, forceDel bool) (err error) {
defer func() {
if err != nil {
log.LogErrorf("action[deleteMetaReplica],vol[%v],data partition[%v],err[%v]", partition.volName, partition.PartitionID, err)
}
}()
if validate {
if err = c.validateDecommissionMetaPartition(partition, addr, forceDel); err != nil {
return
}
}
metaNode, err := c.metaNode(addr)
if err != nil {
return
}
removePeer := proto.Peer{ID: metaNode.ID, Addr: addr}
if err = c.removeMetaPartitionRaftMember(partition, removePeer); err != nil {
return
}
if err = c.deleteMetaPartition(partition, metaNode); err != nil {
return
}
return
}
func (c *Cluster) deleteMetaPartition(partition *MetaPartition, removeMetaNode *MetaNode) (err error) {
partition.Lock()
mr, err := partition.getMetaReplica(removeMetaNode.Addr)
if err != nil {
partition.Unlock()
log.LogErrorf("action[deleteMetaPartition] vol[%v],meta partition[%v], err[%v]", partition.volName, partition.PartitionID, err)
return nil
}
task := mr.createTaskToDeleteReplica(partition.PartitionID)
partition.removeReplicaByAddr(removeMetaNode.Addr)
partition.removeMissingReplica(removeMetaNode.Addr)
partition.Unlock()
_, err = removeMetaNode.Sender.syncSendAdminTask(task)
if err != nil {
log.LogErrorf("action[deleteMetaPartition] vol[%v],meta partition[%v],err[%v]", partition.volName, partition.PartitionID, err)
}
return nil
}
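// removeMetaPartitionRaftMember sends a remove-raft-member task to the leader replica,
// persists the shrunken host and peer lists, and triggers a leader change if the
// removed peer was the leader.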
func (c *Cluster) removeMetaPartitionRaftMember(partition *MetaPartition, removePeer proto.Peer) (err error) {
partition.offlineMutex.Lock()
defer partition.offlineMutex.Unlock()
defer func() {
if err1 := c.updateMetaPartitionOfflinePeerIDWithLock(partition, 0); err1 != nil {
err = errors.Trace(err, "updateMetaPartitionOfflinePeerIDWithLock failed, err[%v]", err1)
}
}()
if err = c.updateMetaPartitionOfflinePeerIDWithLock(partition, removePeer.ID); err != nil {
return
}
mr, err := partition.getMetaReplicaLeader()
if err != nil {
return
}
t, err := partition.createTaskToRemoveRaftMember(removePeer)
if err != nil {
return
}
var leaderMetaNode *MetaNode
leaderMetaNode = mr.metaNode
if leaderMetaNode == nil {
leaderMetaNode, err = c.metaNode(mr.Addr)
if err != nil {
return
}
}
if _, err = leaderMetaNode.Sender.syncSendAdminTask(t); err != nil {
return
}
newHosts := make([]string, 0, len(partition.Hosts)-1)
newPeers := make([]proto.Peer, 0, len(partition.Hosts)-1)
for _, host := range partition.Hosts {
if host == removePeer.Addr {
continue
}
newHosts = append(newHosts, host)
}
for _, peer := range partition.Peers {
if peer.Addr == removePeer.Addr && peer.ID == removePeer.ID {
continue
}
newPeers = append(newPeers, peer)
}
if err = partition.persistToRocksDB("removeMetaPartitionRaftMember", partition.volName, newHosts, newPeers, c); err != nil {
return
}
if mr.Addr != removePeer.Addr {
return
}
metaNode, err := c.metaNode(partition.Hosts[0])
if err != nil {
return
}
if err = partition.tryToChangeLeader(c, metaNode); err != nil {
return
}
return
}
func (c *Cluster) updateMetaPartitionOfflinePeerIDWithLock(mp *MetaPartition, peerID uint64) (err error) {
mp.Lock()
defer mp.Unlock()
mp.OfflinePeerID = peerID
if err = mp.persistToRocksDB("updateMetaPartitionOfflinePeerIDWithLock", mp.volName, mp.Hosts, mp.Peers, c); err != nil {
return
}
return
}
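// addMetaReplica adds a replica on addr: it adds the raft member, persists the expanded
// host and peer lists, creates the replica on the meta node, and records it on the partition.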
func (c *Cluster) addMetaReplica(partition *MetaPartition, addr string) (err error) {
defer func() {
if err != nil {
log.LogErrorf("action[addMetaReplica],vol[%v],data partition[%v],err[%v]", partition.volName, partition.PartitionID, err)
}
}()
partition.Lock()
defer partition.Unlock()
if contains(partition.Hosts, addr) {
err = fmt.Errorf("vol[%v],mp[%v] has contains host[%v]", partition.volName, partition.PartitionID, addr)
return
}
metaNode, err := c.metaNode(addr)
if err != nil {
return
}
addPeer := proto.Peer{ID: metaNode.ID, Addr: addr}
if err = c.addMetaPartitionRaftMember(partition, addPeer); err != nil {
return
}
newHosts := make([]string, 0, len(partition.Hosts)+1)
newPeers := make([]proto.Peer, 0, len(partition.Peers)+1)
newHosts = append(newHosts, partition.Hosts...)
newHosts = append(newHosts, addPeer.Addr)
newPeers = append(newPeers, partition.Peers...)
newPeers = append(newPeers, addPeer)
if err = partition.persistToRocksDB("addMetaReplica", partition.volName, newHosts, newPeers, c); err != nil {
return
}
if err = c.createMetaReplica(partition, addPeer); err != nil {
return
}
if err = partition.afterCreation(addPeer.Addr, c); err != nil {
return
}
return
}
func (c *Cluster) createMetaReplica(partition *MetaPartition, addPeer proto.Peer) (err error) {
task, err := partition.createTaskToCreateReplica(addPeer.Addr)
if err != nil {
return
}
metaNode, err := c.metaNode(addPeer.Addr)
if err != nil {
return
}
if _, err = metaNode.Sender.syncSendAdminTask(task); err != nil {
return
}
return
}
func (c *Cluster) buildAddMetaPartitionRaftMemberTaskAndSyncSend(mp *MetaPartition, addPeer proto.Peer, leaderAddr string) (resp *proto.Packet, err error) {
defer func() {
var resultCode uint8
if resp != nil {
resultCode = resp.ResultCode
}
if err != nil {
log.LogErrorf("action[addMetaRaftMemberAndSend],vol[%v],meta partition[%v],resultCode[%v],err[%v]",
mp.volName, mp.PartitionID, resultCode, err)
} else {
log.LogWarnf("action[addMetaRaftMemberAndSend],vol[%v],meta partition[%v],resultCode[%v]",
mp.volName, mp.PartitionID, resultCode)
}
}()
t, err := mp.createTaskToAddRaftMember(addPeer, leaderAddr)
if err != nil {
return
}
leaderMetaNode, err := c.metaNode(leaderAddr)
if err != nil {
return
}
if resp, err = leaderMetaNode.Sender.syncSendAdminTask(t); err != nil {
return
}
return
}
func (c *Cluster) addMetaPartitionRaftMember(partition *MetaPartition, addPeer proto.Peer) (err error) {
var (
candidateAddrs []string
leaderAddr string
)
candidateAddrs = make([]string, 0, len(partition.Hosts))
leaderMr, err := partition.getMetaReplicaLeader()
if err == nil {
leaderAddr = leaderMr.Addr
if contains(partition.Hosts, leaderAddr) {
candidateAddrs = append(candidateAddrs, leaderAddr)
} else {
leaderAddr = ""
}
}
for _, host := range partition.Hosts {
if host == leaderAddr {
continue
}
candidateAddrs = append(candidateAddrs, host)
}
// send the task to the leader address first; if a retry is needed, send it to the other addresses
for index, host := range candidateAddrs {
// wait for a new leader
if leaderAddr == "" && len(candidateAddrs) < int(partition.ReplicaNum) {
time.Sleep(retrySendSyncTaskInternal)
}
_, err = c.buildAddMetaPartitionRaftMemberTaskAndSyncSend(partition, addPeer, host)
if err == nil {
break
}
if index < len(candidateAddrs)-1 {
time.Sleep(retrySendSyncTaskInternal)
}
}
return
}
func (c *Cluster) loadMetaPartitionAndCheckResponse(mp *MetaPartition) {
go func() {
c.doLoadMetaPartition(mp)
}()
}
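// doLoadMetaPartition asks every replica host in parallel to load the partition,
// collects the load responses, and then checks the snapshots for consistency.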
func (c *Cluster) doLoadMetaPartition(mp *MetaPartition) {
var wg sync.WaitGroup
mp.Lock()
hosts := make([]string, len(mp.Hosts))
copy(hosts, mp.Hosts)
mp.LoadResponse = make([]*proto.MetaPartitionLoadResponse, 0)
mp.Unlock()
errChannel := make(chan error, len(hosts))
for _, host := range hosts {
wg.Add(1)
go func(host string) {
defer func() {
wg.Done()
}()
mr, err := mp.getMetaReplica(host)
if err != nil {
errChannel <- err
return
}
task := mr.createTaskToLoadMetaPartition(mp.PartitionID)
response, err := mr.metaNode.Sender.syncSendAdminTask(task)
if err != nil {
errChannel <- err
return
}
loadResponse := &proto.MetaPartitionLoadResponse{}
if err = json.Unmarshal(response.Data, loadResponse); err != nil {
errChannel <- err
return
}
loadResponse.Addr = host
mp.addOrReplaceLoadResponse(loadResponse)
}(host)
}
wg.Wait()
select {
case err := <-errChannel:
msg := fmt.Sprintf("action[doLoadMetaPartition] vol[%v],mpID[%v],err[%v]", mp.volName, mp.PartitionID, err.Error())
Warn(c.Name, msg)
return
default:
}
mp.checkSnapshot(c)
}
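// doLoadDataPartition sends load tasks to the replicas of a data partition, waits for
// their responses, and then validates CRC and replica size for normal partitions.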
func (c *Cluster) doLoadDataPartition(dp *DataPartition) {
log.LogInfo(fmt.Sprintf("action[doLoadDataPartition],partitionID:%v", dp.PartitionID))
if !dp.needsToCompareCRC() {
log.LogInfo(fmt.Sprintf("action[doLoadDataPartition],partitionID:%v isRecover[%v] don't need compare", dp.PartitionID, dp.isRecover))
return
}
dp.resetFilesWithMissingReplica()
loadTasks := dp.createLoadTasks()
c.addDataNodeTasks(loadTasks)
success := false
for i := 0; i < timeToWaitForResponse; i++ {
if dp.checkLoadResponse(c.cfg.DataPartitionTimeOutSec) {
success = true
break
}
time.Sleep(time.Second)
}
if !success {
return
}
dp.getFileCount()
if proto.IsNormalDp(dp.PartitionType) {
dp.validateCRC(c.Name)
dp.checkReplicaSize(c.Name, c.cfg.diffReplicaSpaceUsage)
}
dp.setToNormal()
}
func (c *Cluster) handleMetaNodeTaskResponse(nodeAddr string, task *proto.AdminTask) (err error) {
if task == nil {
return
}
log.LogDebugf("action[handleMetaNodeTaskResponse] receive Task response:%v from %v now:%v", task.IdString(), nodeAddr, time.Now().Unix())
var metaNode *MetaNode
if metaNode, err = c.metaNode(nodeAddr); err != nil {
goto errHandler
}
metaNode.Sender.DelTask(task)
if err = unmarshalTaskResponse(task); err != nil {
goto errHandler
}
switch task.OpCode {
case proto.OpMetaNodeHeartbeat:
response := task.Response.(*proto.MetaNodeHeartbeatResponse)
err = c.dealMetaNodeHeartbeatResp(task.OperatorAddr, response)
case proto.OpDeleteMetaPartition:
response := task.Response.(*proto.DeleteMetaPartitionResponse)
err = c.dealDeleteMetaPartitionResp(task.OperatorAddr, response)
case proto.OpUpdateMetaPartition:
response := task.Response.(*proto.UpdateMetaPartitionResponse)
err = c.dealUpdateMetaPartitionResp(task.OperatorAddr, response)
case proto.OpVersionOperation:
response := task.Response.(*proto.MultiVersionOpResponse)
err = c.dealOpMetaNodeMultiVerResp(task.OperatorAddr, response)
default:
err := fmt.Errorf("unknown operate code %v", task.OpCode)
log.LogError(err)
}
if err != nil {
log.LogError(fmt.Sprintf("process task[%v] failed", task.ToString()))
} else {
log.LogInfof("process task:%v status:%v success", task.IdString(), task.Status)
}
return
errHandler:
log.LogError(fmt.Sprintf("action[handleMetaNodeTaskResponse],nodeAddr %v,taskId %v,err %v",
nodeAddr, task.IdString(), err.Error()))
return
}
func (c *Cluster) dealUpdateMetaPartitionResp(nodeAddr string, resp *proto.UpdateMetaPartitionResponse) (err error) {
if resp.Status == proto.TaskFailed {
msg := fmt.Sprintf("action[dealUpdateMetaPartitionResp],clusterID[%v] nodeAddr %v update meta partition failed,err %v",
c.Name, nodeAddr, resp.Result)
log.LogError(msg)
Warn(c.Name, msg)
}
return
}
func (c *Cluster) dealOpMetaNodeMultiVerResp(nodeAddr string, resp *proto.MultiVersionOpResponse) (err error) {
if resp.Status == proto.TaskFailed {
msg := fmt.Sprintf("action[dealOpMetaNodeMultiVerResp],clusterID[%v] volume [%v] nodeAddr %v operate meta partition snapshot version,err %v",
c.Name, resp.VolumeID, nodeAddr, resp.Result)
log.LogError(msg)
Warn(c.Name, msg)
}
var vol *Vol
if vol, err = c.getVol(resp.VolumeID); err != nil {
return
}
vol.VersionMgr.handleTaskRsp(resp, TypeMetaPartition)
return
}
func (c *Cluster) dealOpDataNodeMultiVerResp(nodeAddr string, resp *proto.MultiVersionOpResponse) (err error) {
if resp.Status == proto.TaskFailed {
msg := fmt.Sprintf("action[dealOpMetaNodeMultiVerResp],clusterID[%v] volume [%v] nodeAddr %v operate meta partition snapshot version,err %v",
c.Name, resp.VolumeID, nodeAddr, resp.Result)
log.LogError(msg)
Warn(c.Name, msg)
}
var vol *Vol
if vol, err = c.getVol(resp.VolumeID); err != nil {
return
}
vol.VersionMgr.handleTaskRsp(resp, TypeDataPartition)
return
}
func (c *Cluster) dealDeleteMetaPartitionResp(nodeAddr string, resp *proto.DeleteMetaPartitionResponse) (err error) {
if resp.Status == proto.TaskFailed {
msg := fmt.Sprintf("action[dealDeleteMetaPartitionResp],clusterID[%v] nodeAddr %v "+
"delete meta partition failed,err %v", c.Name, nodeAddr, resp.Result)
log.LogError(msg)
Warn(c.Name, msg)
return
}
var mr *MetaReplica
mp, err := c.getMetaPartitionByID(resp.PartitionID)
if err != nil {
goto errHandler
}
mp.Lock()
defer mp.Unlock()
if mr, err = mp.getMetaReplica(nodeAddr); err != nil {
goto errHandler
}
mp.removeReplica(mr)
return
errHandler:
log.LogError(fmt.Sprintf("dealDeleteMetaPartitionResp %v", err))
return
}
func (c *Cluster) dealMetaNodeHeartbeatResp(nodeAddr string, resp *proto.MetaNodeHeartbeatResponse) (err error) {
var (
metaNode *MetaNode
logMsg string
)
log.LogInfof("action[dealMetaNodeHeartbeatResp],clusterID[%v] receive nodeAddr[%v] heartbeat", c.Name, nodeAddr)
if resp.Status == proto.TaskFailed {
msg := fmt.Sprintf("action[dealMetaNodeHeartbeatResp],clusterID[%v] nodeAddr %v heartbeat failed,err %v",
c.Name, nodeAddr, resp.Result)
log.LogError(msg)
Warn(c.Name, msg)
return
}
if metaNode, err = c.metaNode(nodeAddr); err != nil {
goto errHandler
}
if metaNode.ToBeOffline {
log.LogInfof("action[dealMetaNodeHeartbeatResp] dataNode is toBeOffline, addr[%s]", nodeAddr)
return
}
if resp.ZoneName == "" {
resp.ZoneName = DefaultZoneName
}
if metaNode.ZoneName != resp.ZoneName {
c.t.deleteMetaNode(metaNode)
oldZoneName := metaNode.ZoneName
metaNode.ZoneName = resp.ZoneName
c.adjustMetaNode(metaNode)
log.LogWarnf("metaNode zone changed from [%v] to [%v]", oldZoneName, resp.ZoneName)
}
// change cpu util and io used
metaNode.CpuUtil.Store(resp.CpuUtil)
metaNode.updateMetric(resp, c.cfg.MetaNodeThreshold)
metaNode.setNodeActive()
if err = c.t.putMetaNode(metaNode); err != nil {
log.LogErrorf("action[dealMetaNodeHeartbeatResp],metaNode[%v] error[%v]", metaNode.Addr, err)
}
c.updateMetaNode(metaNode, resp.MetaPartitionReports, metaNode.reachesThreshold())
// TODO: remove; there is no need to set metaNode.metaPartitionInfos to nil here
// metaNode.metaPartitionInfos = nil
logMsg = fmt.Sprintf("action[dealMetaNodeHeartbeatResp],metaNode:%v,zone[%v], ReportTime:%v success", metaNode.Addr, metaNode.ZoneName, time.Now().Unix())
log.LogInfof(logMsg)
return
errHandler:
logMsg = fmt.Sprintf("nodeAddr %v heartbeat error :%v", nodeAddr, errors.Stack(err))
log.LogError(logMsg)
return
}
func (c *Cluster) adjustMetaNode(metaNode *MetaNode) {
c.mnMutex.Lock()
defer c.mnMutex.Unlock()
oldNodeSetID := metaNode.NodeSetID
var err error
defer func() {
if err != nil {
err = fmt.Errorf("action[adjustMetaNode],clusterID[%v] addr:%v,zone[%v] err:%v ", c.Name, metaNode.Addr, metaNode.ZoneName, err.Error())
log.LogError(errors.Stack(err))
Warn(c.Name, err.Error())
}
}()
var zone *Zone
zone, err = c.t.getZone(metaNode.ZoneName)
if err != nil {
zone = newZone(metaNode.ZoneName)
c.t.putZone(zone)
}
c.nsMutex.Lock()
ns := zone.getAvailNodeSetForMetaNode()
if ns == nil {
if ns, err = zone.createNodeSet(c); err != nil {
c.nsMutex.Unlock()
return
}
}
c.nsMutex.Unlock()
metaNode.NodeSetID = ns.ID
if err = c.syncUpdateMetaNode(metaNode); err != nil {
metaNode.NodeSetID = oldNodeSetID
return
}
if err = c.syncUpdateNodeSet(ns); err != nil {
return
}
err = c.t.putMetaNode(metaNode)
return
}
func (c *Cluster) handleDataNodeTaskResponse(nodeAddr string, task *proto.AdminTask) {
if task == nil {
log.LogInfof("action[handleDataNodeTaskResponse] receive addr[%v] task response,but task is nil", nodeAddr)
return
}
if log.EnableDebug() {
log.LogDebugf("action[handleDataNodeTaskResponse] receive addr[%v] task response:%v", nodeAddr, task.ToString())
}
var (
err error
dataNode *DataNode
)
if dataNode, err = c.dataNode(nodeAddr); err != nil {
goto errHandler
}
dataNode.TaskManager.DelTask(task)
if err = unmarshalTaskResponse(task); err != nil {
goto errHandler
}
switch task.OpCode {
case proto.OpDeleteDataPartition:
response := task.Response.(*proto.DeleteDataPartitionResponse)
err = c.dealDeleteDataPartitionResponse(task.OperatorAddr, response)
case proto.OpLoadDataPartition:
response := task.Response.(*proto.LoadDataPartitionResponse)
err = c.handleResponseToLoadDataPartition(task.OperatorAddr, response)
case proto.OpDataNodeHeartbeat:
response := task.Response.(*proto.DataNodeHeartbeatResponse)
err = c.handleDataNodeHeartbeatResp(task.OperatorAddr, response)
case proto.OpVersionOperation:
response := task.Response.(*proto.MultiVersionOpResponse)
err = c.dealOpDataNodeMultiVerResp(task.OperatorAddr, response)
default:
err = fmt.Errorf("unknown operate code %v", task.OpCode)
goto errHandler
}
if err != nil {
goto errHandler
}
return
errHandler:
log.LogErrorf("process task[%v] failed,err:%v", task.ToString(), err)
return
}
func (c *Cluster) dealDeleteDataPartitionResponse(nodeAddr string, resp *proto.DeleteDataPartitionResponse) (err error) {
var dp *DataPartition
if resp.Status == proto.TaskSucceeds {
if dp, err = c.getDataPartitionByID(resp.PartitionId); err != nil {
return
}
dp.Lock()
defer dp.Unlock()
dp.removeReplicaByAddr(nodeAddr)
} else {
Warn(c.Name, fmt.Sprintf("clusterID[%v] delete data partition[%v] failed,err[%v]", c.Name, nodeAddr, resp.Result))
}
return
}
func (c *Cluster) handleResponseToLoadDataPartition(nodeAddr string, resp *proto.LoadDataPartitionResponse) (err error) {
if resp.Status == proto.TaskFailed || resp.PartitionSnapshot == nil {
return
}
var (
dataNode *DataNode
dp *DataPartition
vol *Vol
)
if dataNode, err = c.dataNode(nodeAddr); err != nil {
return
}
if resp.VolName != "" {
vol, err = c.getVol(resp.VolName)
if err != nil {
return
}
dp, err = vol.getDataPartitionByID(resp.PartitionId)
} else {
dp, err = c.getDataPartitionByID(resp.PartitionId)
}
if err != nil {
return
}
dp.loadFile(dataNode, resp)
return
}
func (c *Cluster) handleDataNodeHeartbeatResp(nodeAddr string, resp *proto.DataNodeHeartbeatResponse) (err error) {
var (
dataNode *DataNode
logMsg string
)
log.LogInfof("action[handleDataNodeHeartbeatResp] clusterID[%v] receive dataNode[%v] heartbeat, ", c.Name, nodeAddr)
if resp.Status != proto.TaskSucceeds {
Warn(c.Name, fmt.Sprintf("action[handleDataNodeHeartbeatResp] clusterID[%v] dataNode[%v] heartbeat task failed",
c.Name, nodeAddr))
return
}
if dataNode, err = c.dataNode(nodeAddr); err != nil {
goto errHandler
}
if dataNode.ToBeOffline {
log.LogInfof("action[handleDataNodeHeartbeatResp] dataNode is toBeOffline, addr[%s]", nodeAddr)
// return
}
if resp.ZoneName == "" {
resp.ZoneName = DefaultZoneName
}
if dataNode.ZoneName != resp.ZoneName {
c.t.deleteDataNode(dataNode)
oldZoneName := dataNode.ZoneName
dataNode.ZoneName = resp.ZoneName
c.adjustDataNode(dataNode)
log.LogWarnf("dataNode [%v] zone changed from [%v] to [%v]", dataNode.Addr, oldZoneName, resp.ZoneName)
}
// change cpu util and io used
dataNode.CpuUtil.Store(resp.CpuUtil)
dataNode.SetIoUtils(resp.IoUtils)
dataNode.updateNodeMetric(resp)
if err = c.t.putDataNode(dataNode); err != nil {
log.LogErrorf("action[handleDataNodeHeartbeatResp] dataNode[%v],zone[%v],node set[%v], err[%v]", dataNode.Addr, dataNode.ZoneName, dataNode.NodeSetID, err)
}
c.updateDataNode(dataNode, resp.PartitionReports)
logMsg = fmt.Sprintf("action[handleDataNodeHeartbeatResp],dataNode:%v,zone[%v], ReportTime:%v success", dataNode.Addr, dataNode.ZoneName, time.Now().Unix())
log.LogInfof(logMsg)
return
errHandler:
logMsg = fmt.Sprintf("nodeAddr %v heartbeat error :%v", nodeAddr, err.Error())
log.LogError(logMsg)
return
}
func (c *Cluster) adjustDataNode(dataNode *DataNode) {
c.dnMutex.Lock()
defer c.dnMutex.Unlock()
oldNodeSetID := dataNode.NodeSetID
var err error
defer func() {
if err != nil {
err = fmt.Errorf("action[adjustDataNode],clusterID[%v] dataNodeAddr:%v,zone[%v] err:%v ", c.Name, dataNode.Addr, dataNode.ZoneName, err.Error())
log.LogError(errors.Stack(err))
Warn(c.Name, err.Error())
}
}()
var zone *Zone
zone, err = c.t.getZone(dataNode.ZoneName)
if err != nil {
zone = newZone(dataNode.ZoneName)
c.t.putZone(zone)
}
c.nsMutex.Lock()
ns := zone.getAvailNodeSetForDataNode()
if ns == nil {
if ns, err = zone.createNodeSet(c); err != nil {
c.nsMutex.Unlock()
return
}
}
c.nsMutex.Unlock()
dataNode.NodeSetID = ns.ID
if err = c.syncUpdateDataNode(dataNode); err != nil {
dataNode.NodeSetID = oldNodeSetID
return
}
if err = c.syncUpdateNodeSet(ns); err != nil {
return
}
err = c.t.putDataNode(dataNode)
return
}
// updateDataNode iterates over the data partition reports from a data node and updates
// each reported data partition's metrics, skipping volumes marked for deletion.
func (c *Cluster) updateDataNode(dataNode *DataNode, dps []*proto.DataPartitionReport) {
for _, vr := range dps {
if vr == nil {
continue
}
if vr.VolName != "" {
vol, err := c.getVol(vr.VolName)
if err != nil {
continue
}
if vol.Status == proto.VolStatusMarkDelete {
continue
}
if dp, err := vol.getDataPartitionByID(vr.PartitionID); err == nil {
dp.updateMetric(vr, dataNode, c)
}
} else {
if dp, err := c.getDataPartitionByID(vr.PartitionID); err == nil {
dp.updateMetric(vr, dataNode, c)
}
}
}
}
func (c *Cluster) updateMetaNode(metaNode *MetaNode, metaPartitions []*proto.MetaPartitionReport, threshold bool) {
var (
vol *Vol
err error
)
for _, mr := range metaPartitions {
if mr == nil {
continue
}
var mp *MetaPartition
if mr.VolName != "" {
vol, err = c.getVol(mr.VolName)
if err != nil {
continue
}
if vol.Status == proto.VolStatusMarkDelete {
continue
}
mp, err = vol.metaPartition(mr.PartitionID)
if err != nil {
continue
}
} else {
mp, err = c.getMetaPartitionByID(mr.PartitionID)
if err != nil {
continue
}
}
// send latest end to replica metanode, including updating the end after MaxMP split when the old MaxMP is unavailable
if mr.End != mp.End {
mp.addUpdateMetaReplicaTask(c)
}
mp.updateMetaPartition(mr, metaNode)
vol.uidSpaceManager.volUidUpdate(mr)
vol.quotaManager.quotaUpdate(mr)
c.updateInodeIDUpperBound(mp, mr, threshold, metaNode)
}
}
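// updateInodeIDUpperBound splits the volume's largest meta partition once the meta node
// reaches its memory threshold; the new end is the reported max inode ID (or the start,
// if no inode has been allocated) plus MetaPartitionInodeIdStep.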
func (c *Cluster) updateInodeIDUpperBound(mp *MetaPartition, mr *proto.MetaPartitionReport, hasArriveThreshold bool, metaNode *MetaNode) (err error) {
if !hasArriveThreshold {
return
}
var vol *Vol
if vol, err = c.getVol(mp.volName); err != nil {
log.LogWarnf("action[updateInodeIDRange] vol[%v] not found", mp.volName)
return
}
maxPartitionID := vol.maxPartitionID()
if mr.PartitionID < maxPartitionID {
return
}
var end uint64
metaPartitionInodeIdStep := gConfig.MetaPartitionInodeIdStep
if mr.MaxInodeID <= 0 {
end = mr.Start + metaPartitionInodeIdStep
} else {
end = mr.MaxInodeID + metaPartitionInodeIdStep
}
log.LogWarnf("mpId[%v],start[%v],end[%v],addr[%v],used[%v]", mp.PartitionID, mp.Start, mp.End, metaNode.Addr, metaNode.Used)
if c.cfg.DisableAutoCreate {
log.LogWarnf("updateInodeIDUpperBound: disable auto create meta partition, mp %d", mp.PartitionID)
return
}
if err = vol.splitMetaPartition(c, mp, end, metaPartitionInodeIdStep, false); err != nil {
log.LogError(err)
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
syslog "log"
"strconv"
"strings"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
pt "github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/raftstore"
)
// config key
const (
colonSplit = ":"
commaSplit = ","
cfgPeers = "peers"
// if the data partition has not been reported within this interval (in terms of seconds), it will be considered as missing.
missingDataPartitionInterval = "missingDataPartitionInterval"
cfgDpNoLeaderReportIntervalSec = "dpNoLeaderReportIntervalSec"
cfgMpNoLeaderReportIntervalSec = "mpNoLeaderReportIntervalSec"
dataPartitionTimeOutSec = "dataPartitionTimeOutSec"
NumberOfDataPartitionsToLoad = "numberOfDataPartitionsToLoad"
secondsToFreeDataPartitionAfterLoad = "secondsToFreeDataPartitionAfterLoad"
nodeSetCapacity = "nodeSetCap"
cfgMetaNodeReservedMem = "metaNodeReservedMem"
heartbeatPortKey = "heartbeatPort"
replicaPortKey = "replicaPort"
faultDomain = "faultDomain"
cfgDomainBatchGrpCnt = "faultDomainGrpBatchCnt"
cfgDomainBuildAsPossible = "faultDomainBuildAsPossible"
cfgmetaPartitionInodeIdStep = "metaPartitionInodeIdStep"
cfgMaxQuotaNumPerVol = "maxQuotaNumPerVol"
disableAutoCreate = "disableAutoCreate"
cfgMonitorPushAddr = "monitorPushAddr"
intervalToScanS3Expiration = "intervalToScanS3Expiration"
cfgVolForceDeletion = "volForceDeletion"
cfgVolDeletionDentryThreshold = "volDeletionDentryThreshold"
)
// default value
const (
defaultTobeFreedDataPartitionCount = 1000
defaultSecondsToFreeDataPartitionAfterLoad = 5 * 60 // a data partition can only be freed 5 minutes after it has been loaded
defaultIntervalToFreeDataPartition = 10 // in terms of seconds
defaultIntervalToCheck = 60
defaultIntervalToCheckHeartbeat = 6
defaultIntervalToCheckDataPartition = 5
defaultIntervalToCheckQos = 1
defaultIntervalToCheckCrc = 20 * defaultIntervalToCheck // in terms of seconds
noHeartBeatTimes = 3 // number of times that no heartbeat reported
defaultNodeTimeOutSec = noHeartBeatTimes * defaultIntervalToCheckHeartbeat
defaultDataPartitionTimeOutSec = 5 * defaultIntervalToCheckHeartbeat
defaultMissingDataPartitionInterval = 24 * 3600
defaultDpNoLeaderReportIntervalSec = 10 * 60
defaultMpNoLeaderReportIntervalSec = 5
defaultIntervalToAlarmMissingDataPartition = 60 * 60
timeToWaitForResponse = 120 // time to wait for response by the master during loading partition
defaultPeriodToLoadAllDataPartitions = 60 * 60 * 4 // interval at which the master reloads all data partitions
defaultNumberOfDataPartitionsToLoad = 50 // how many data partitions to load every time
defaultMetaPartitionTimeOutSec = 10 * defaultIntervalToCheckHeartbeat
// DefaultMetaPartitionMissSec = 3600
defaultIntervalToAlarmMissingMetaPartition = 10 * 60 // interval of checking if a replica is missing
defaultMetaPartitionMemUsageThreshold float32 = 0.75 // memory usage threshold on a meta partition
defaultDomainUsageThreshold float64 = 0.90 // storage usage threshold on a data partition
defaultOverSoldFactor float32 = 0 // 0 means no oversold limit
defaultMaxMetaPartitionCountOnEachNode = 10000
defaultReplicaNum = 3
defaultDiffSpaceUsage = 1024 * 1024 * 1024
defaultDiffReplicaFileCount = 20
defaultNodeSetGrpStep = 1
defaultMasterMinQosAccept = 20000
defaultMaxDpCntLimit = 3000
defaultIntervalToScanS3Expiration = 12 * 3600
defaultMaxConcurrentLcNodes = 3
defaultIntervalToCheckDelVerTaskExpiration = 3
metaPartitionInodeUsageThreshold float64 = 0.75 // inode usage threshold on a meta partition
lowerLimitRWMetaPartition = 3 // lower limit of RW meta partitions, equal to defaultReplicaNum
)
// AddrDatabase is a map that stores the address of a given host (e.g., the leader)
var AddrDatabase = make(map[uint64]string)
type clusterConfig struct {
secondsToFreeDataPartitionAfterLoad int64
NodeTimeOutSec int64
MissingDataPartitionInterval int64
DpNoLeaderReportIntervalSec int64
MpNoLeaderReportIntervalSec int64
DataPartitionTimeOutSec int64
IntervalToAlarmMissingDataPartition int64
PeriodToLoadALLDataPartitions int64
metaNodeReservedMem uint64
IntervalToCheckDataPartition int // seconds
IntervalToCheckQos int // seconds
numberOfDataPartitionsToFree int
numberOfDataPartitionsToLoad int
nodeSetCapacity int
MetaNodeThreshold float32
ClusterLoadFactor float32
MetaNodeDeleteBatchCount uint64 // metanode delete batch count
DataNodeDeleteLimitRate uint64 // datanode delete limit rate
MetaNodeDeleteWorkerSleepMs uint64 // metaNode delete worker sleep time with millisecond. if 0 for no sleep
MaxDpCntLimit uint64 // datanode data partition limit
DataNodeAutoRepairLimitRate uint64 // datanode autorepair limit rate
DpMaxRepairErrCnt uint64
DpRepairTimeOut uint64
peers []raftstore.PeerAddress
peerAddrs []string
heartbeatPort int64
replicaPort int64
diffReplicaSpaceUsage uint64
diffReplicaFileCount uint32
faultDomain bool
DefaultNormalZoneCnt int
DomainBuildAsPossible bool
DataPartitionUsageThreshold float64
QosMasterAcceptLimit uint64
DirChildrenNumLimit uint32
MetaPartitionInodeIdStep uint64
MaxQuotaNumPerVol int
DisableAutoCreate bool
MonitorPushAddr string
IntervalToScanS3Expiration int64
MaxConcurrentLcNodes uint64
volForceDeletion bool // whether to ignore a volume's dentry count when deleting the volume
volDeletionDentryThreshold uint64 // when volForceDeletion is false, volume deletion is only allowed if the dentry count is below this threshold
}
func newClusterConfig() (cfg *clusterConfig) {
cfg = new(clusterConfig)
cfg.numberOfDataPartitionsToFree = defaultTobeFreedDataPartitionCount
cfg.secondsToFreeDataPartitionAfterLoad = defaultSecondsToFreeDataPartitionAfterLoad
cfg.NodeTimeOutSec = defaultNodeTimeOutSec
cfg.MissingDataPartitionInterval = defaultMissingDataPartitionInterval
cfg.DpNoLeaderReportIntervalSec = defaultDpNoLeaderReportIntervalSec
cfg.MpNoLeaderReportIntervalSec = defaultMpNoLeaderReportIntervalSec
cfg.DataPartitionTimeOutSec = defaultDataPartitionTimeOutSec
cfg.IntervalToCheckDataPartition = defaultIntervalToCheckDataPartition
cfg.IntervalToCheckQos = defaultIntervalToCheckQos
cfg.IntervalToAlarmMissingDataPartition = defaultIntervalToAlarmMissingDataPartition
cfg.numberOfDataPartitionsToLoad = defaultNumberOfDataPartitionsToLoad
cfg.PeriodToLoadALLDataPartitions = defaultPeriodToLoadAllDataPartitions
cfg.MetaNodeThreshold = defaultMetaPartitionMemUsageThreshold
cfg.ClusterLoadFactor = defaultOverSoldFactor
cfg.MaxDpCntLimit = defaultMaxDpCntLimit
cfg.metaNodeReservedMem = defaultMetaNodeReservedMem
cfg.diffReplicaSpaceUsage = defaultDiffSpaceUsage
cfg.diffReplicaFileCount = defaultDiffReplicaFileCount
cfg.QosMasterAcceptLimit = defaultMasterMinQosAccept
cfg.DirChildrenNumLimit = pt.DefaultDirChildrenNumLimit
cfg.MetaPartitionInodeIdStep = defaultMetaPartitionInodeIDStep
cfg.MaxQuotaNumPerVol = defaultMaxQuotaNumPerVol
cfg.IntervalToScanS3Expiration = defaultIntervalToScanS3Expiration
cfg.MaxConcurrentLcNodes = defaultMaxConcurrentLcNodes
return
}
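// parsePeerAddr parses a peer address of the form "id:ip:port",
// e.g. "1:192.168.0.1:17010" yields id=1, ip="192.168.0.1", port=17010.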
func parsePeerAddr(peerAddr string) (id uint64, ip string, port uint64, err error) {
peerStr := strings.Split(peerAddr, colonSplit)
id, err = strconv.ParseUint(peerStr[0], 10, 64)
if err != nil {
return
}
port, err = strconv.ParseUint(peerStr[2], 10, 64)
if err != nil {
return
}
ip = peerStr[1]
return
}
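// parsePeers parses a comma-separated list of "id:ip:port" peers into cfg.peers and AddrDatabase.
// Illustrative usage (hypothetical values, not from the original source):
//	cfg := newClusterConfig()
//	if err := cfg.parsePeers("1:10.0.0.1:17010,2:10.0.0.2:17010"); err != nil {
//		log.LogErrorf("parse peers failed: %v", err)
//	}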
func (cfg *clusterConfig) parsePeers(peerStr string) error {
peerArr := strings.Split(peerStr, commaSplit)
cfg.peerAddrs = peerArr
for _, peerAddr := range peerArr {
id, ip, port, err := parsePeerAddr(peerAddr)
if err != nil {
return err
}
cfg.peers = append(cfg.peers, raftstore.PeerAddress{Peer: proto.Peer{ID: id}, Address: ip, HeartbeatPort: int(cfg.heartbeatPort), ReplicaPort: int(cfg.replicaPort)})
address := fmt.Sprintf("%v:%v", ip, port)
syslog.Println(address)
AddrDatabase[id] = address
}
return nil
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/atomicutil"
"github.com/cubefs/cubefs/util/log"
)
// DataNode stores all the information about a data node
type DataNode struct {
Total uint64 `json:"TotalWeight"`
Used uint64 `json:"UsedWeight"`
AvailableSpace uint64
ID uint64
ZoneName string `json:"Zone"`
Addr string
DomainAddr string
ReportTime time.Time
StartTime int64
LastUpdateTime time.Time
isActive bool
sync.RWMutex `graphql:"-"`
UsageRatio float64 // used / total space
SelectedTimes uint64 // number of times that this data node has been selected as the location for a data partition
TaskManager *AdminTaskManager `graphql:"-"`
DataPartitionReports []*proto.DataPartitionReport
DataPartitionCount uint32
TotalPartitionSize uint64
NodeSetID uint64
PersistenceDataPartitions []uint64
BadDisks []string // Keep this old field for compatibility
BadDiskStats []proto.BadDiskStat // key: disk path
DecommissionedDisks sync.Map
ToBeOffline bool
RdOnly bool
MigrateLock sync.RWMutex
QosIopsRLimit uint64
QosIopsWLimit uint64
QosFlowRLimit uint64
QosFlowWLimit uint64
DecommissionStatus uint32
DecommissionDstAddr string
DecommissionRaftForce bool
DecommissionRetry uint8
DecommissionLimit int
DecommissionCompleteTime int64
DpCntLimit DpCountLimiter `json:"-"` // max count of data partition in a data node
CpuUtil atomicutil.Float64 `json:"-"`
ioUtils atomic.Value `json:"-"`
DecommissionDiskList []string
DecommissionDpTotal int
}
func newDataNode(addr, zoneName, clusterID string) (dataNode *DataNode) {
dataNode = new(DataNode)
dataNode.Total = 1
dataNode.Addr = addr
dataNode.ZoneName = zoneName
dataNode.LastUpdateTime = time.Now().Add(-time.Minute)
dataNode.TaskManager = newAdminTaskManager(dataNode.Addr, clusterID)
dataNode.DecommissionStatus = DecommissionInitial
dataNode.DpCntLimit = newDpCountLimiter(nil)
dataNode.CpuUtil.Store(0)
dataNode.SetIoUtils(make(map[string]float64))
return
}
func (dataNode *DataNode) GetIoUtils() map[string]float64 {
return dataNode.ioUtils.Load().(map[string]float64)
}
func (dataNode *DataNode) SetIoUtils(used map[string]float64) {
dataNode.ioUtils.Store(used)
}
func (dataNode *DataNode) checkLiveness() {
dataNode.Lock()
defer dataNode.Unlock()
log.LogInfof("action[checkLiveness] datanode[%v] report time[%v],since report time[%v], need gap [%v]",
dataNode.Addr, dataNode.ReportTime, time.Since(dataNode.ReportTime), time.Second*time.Duration(defaultNodeTimeOutSec))
if time.Since(dataNode.ReportTime) > time.Second*time.Duration(defaultNodeTimeOutSec) {
dataNode.isActive = false
}
return
}
func (dataNode *DataNode) badPartitions(diskPath string, c *Cluster) (partitions []*DataPartition) {
partitions = make([]*DataPartition, 0)
vols := c.copyVols()
if len(vols) == 0 {
return partitions
}
for _, vol := range vols {
dps := vol.dataPartitions.checkBadDiskDataPartitions(diskPath, dataNode.Addr)
partitions = append(partitions, dps...)
}
return
}
func (dataNode *DataNode) getDisks(c *Cluster) (diskPaths []string) {
diskPaths = make([]string, 0)
vols := c.copyVols()
if len(vols) == 0 {
return diskPaths
}
for _, vol := range vols {
disks := vol.dataPartitions.getReplicaDiskPaths(dataNode.Addr)
for _, disk := range disks {
if inStingList(disk, diskPaths) {
continue
}
diskPaths = append(diskPaths, disk)
}
}
return
}
func (dataNode *DataNode) updateNodeMetric(resp *proto.DataNodeHeartbeatResponse) {
dataNode.Lock()
defer dataNode.Unlock()
dataNode.DomainAddr = util.ParseIpAddrToDomainAddr(dataNode.Addr)
dataNode.Total = resp.Total
dataNode.Used = resp.Used
if dataNode.AvailableSpace > resp.Available ||
time.Since(dataNode.LastUpdateTime) > defaultNodeTimeOutSec*time.Second {
dataNode.AvailableSpace = resp.Available
dataNode.LastUpdateTime = time.Now()
}
dataNode.ZoneName = resp.ZoneName
dataNode.DataPartitionCount = resp.CreatedPartitionCnt
dataNode.DataPartitionReports = resp.PartitionReports
dataNode.TotalPartitionSize = resp.TotalPartitionSize
dataNode.BadDisks = resp.BadDisks
dataNode.BadDiskStats = resp.BadDiskStats
dataNode.StartTime = resp.StartTime
if dataNode.Total == 0 {
dataNode.UsageRatio = 0.0
} else {
dataNode.UsageRatio = (float64)(dataNode.Used) / (float64)(dataNode.Total)
}
dataNode.ReportTime = time.Now()
dataNode.isActive = true
log.LogDebugf("updateNodeMetric. datanode id %v addr %v total %v used %v avaliable %v", dataNode.ID, dataNode.Addr,
dataNode.Total, dataNode.Used, dataNode.AvailableSpace)
}
func (dataNode *DataNode) canAlloc() bool {
dataNode.RLock()
defer dataNode.RUnlock()
if !overSoldLimit() {
return true
}
maxCapacity := overSoldCap(dataNode.Total)
if maxCapacity < dataNode.TotalPartitionSize {
return false
}
return true
}
func (dataNode *DataNode) isWriteAble() (ok bool) {
dataNode.RLock()
defer dataNode.RUnlock()
if dataNode.isActive && dataNode.AvailableSpace > 10*util.GB && !dataNode.RdOnly {
ok = true
}
return
}
func (dataNode *DataNode) canAllocDp() bool {
if !dataNode.isWriteAble() {
return false
}
if dataNode.ToBeOffline {
log.LogWarnf("action[canAllocDp] dataNode [%v] is offline ", dataNode.Addr)
return false
}
if !dataNode.dpCntInLimit() {
return false
}
return true
}
func (dataNode *DataNode) GetDpCntLimit() uint32 {
return uint32(dataNode.DpCntLimit.GetCntLimit())
}
func (dataNode *DataNode) dpCntInLimit() bool {
return dataNode.DataPartitionCount <= dataNode.GetDpCntLimit()
}
func (dataNode *DataNode) isWriteAbleWithSize(size uint64) (ok bool) {
dataNode.RLock()
defer dataNode.RUnlock()
if dataNode.isActive && dataNode.AvailableSpace > size {
ok = true
}
return
}
func (dataNode *DataNode) GetID() uint64 {
dataNode.RLock()
defer dataNode.RUnlock()
return dataNode.ID
}
func (dataNode *DataNode) GetAddr() string {
dataNode.RLock()
defer dataNode.RUnlock()
return dataNode.Addr
}
// SelectNodeForWrite implements "SelectNodeForWrite" in the Node interface
func (dataNode *DataNode) SelectNodeForWrite() {
dataNode.Lock()
defer dataNode.Unlock()
dataNode.UsageRatio = float64(dataNode.Used) / float64(dataNode.Total)
dataNode.SelectedTimes++
}
func (dataNode *DataNode) clean() {
dataNode.TaskManager.exitCh <- struct{}{}
}
func (dataNode *DataNode) createHeartbeatTask(masterAddr string, enableDiskQos bool) (task *proto.AdminTask) {
request := &proto.HeartBeatRequest{
CurrTime: time.Now().Unix(),
MasterAddr: masterAddr,
}
request.EnableDiskQos = enableDiskQos
request.QosIopsReadLimit = dataNode.QosIopsRLimit
request.QosIopsWriteLimit = dataNode.QosIopsWLimit
request.QosFlowReadLimit = dataNode.QosFlowRLimit
request.QosFlowWriteLimit = dataNode.QosFlowWLimit
request.DecommissionDisks = dataNode.getDecommissionedDisks()
task = proto.NewAdminTask(proto.OpDataNodeHeartbeat, dataNode.Addr, request)
return
}
func (dataNode *DataNode) addDecommissionedDisk(diskPath string) (exist bool) {
_, exist = dataNode.DecommissionedDisks.LoadOrStore(diskPath, struct{}{})
log.LogInfof("action[addDecommissionedDisk] finish, exist[%v], decommissioned disk[%v], dataNode[%v]", exist, diskPath, dataNode.Addr)
return
}
func (dataNode *DataNode) deleteDecommissionedDisk(diskPath string) (exist bool) {
_, exist = dataNode.DecommissionedDisks.LoadAndDelete(diskPath)
log.LogInfof("action[deleteDecommissionedDisk] finish, exist[%v], decommissioned disk[%v], dataNode[%v]", exist, diskPath, dataNode.Addr)
return
}
func (dataNode *DataNode) getDecommissionedDisks() (decommissionedDisks []string) {
dataNode.DecommissionedDisks.Range(func(key, value interface{}) bool {
if diskPath, ok := key.(string); ok {
decommissionedDisks = append(decommissionedDisks, diskPath)
}
return true
})
return
}
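// updateDecommissionStatus aggregates the decommission progress of all disks on the data
// node and derives the node-level status: prepare while no disk has started, success once
// every disk has finished, fail when all remaining partitions have failed, and running
// otherwise. Progress is the average of the per-disk progress values.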
func (dataNode *DataNode) updateDecommissionStatus(c *Cluster, debug bool) (uint32, float64) {
var (
partitionIds []uint64
failedPartitionIds []uint64
runningPartitionIds []uint64
preparePartitionIds []uint64
stopPartitionIds []uint64
totalDisk = len(dataNode.DecommissionDiskList)
markDiskNum = 0
successDiskNum = 0
progress float64
)
if dataNode.GetDecommissionStatus() == DecommissionInitial {
return DecommissionInitial, float64(0)
}
if dataNode.GetDecommissionStatus() == markDecommission {
return markDecommission, float64(0)
}
if dataNode.GetDecommissionStatus() == DecommissionSuccess {
return DecommissionSuccess, float64(1)
}
if dataNode.GetDecommissionStatus() == DecommissionPause {
return DecommissionPause, float64(0)
}
defer func() {
c.syncUpdateDataNode(dataNode)
}()
// the node has not entered the running status yet
if dataNode.DecommissionRetry >= defaultDecommissionRetryLimit {
dataNode.markDecommissionFail()
return DecommissionFail, float64(0)
}
log.LogDebugf("action[GetLatestDecommissionDataPartition]dataNode %v diskList %v",
dataNode.Addr, dataNode.DecommissionDiskList)
if totalDisk == 0 {
dataNode.SetDecommissionStatus(DecommissionInitial)
return DecommissionInitial, float64(0)
}
for _, disk := range dataNode.DecommissionDiskList {
key := fmt.Sprintf("%s_%s", dataNode.Addr, disk)
// if not found, the disk may have already succeeded, so only track disks that are still in the cache
if value, ok := c.DecommissionDisks.Load(key); ok {
dd := value.(*DecommissionDisk)
status := dd.GetDecommissionStatus()
if status == DecommissionSuccess {
successDiskNum++
} else if status == markDecommission {
markDiskNum++
}
_, diskProgress := dd.updateDecommissionStatus(c, debug)
progress += diskProgress
} else {
successDiskNum++ // disk with DecommissionSuccess will be removed from cache
progress += float64(1)
}
}
// only the running/prepare/success states of the data node matter here
// no disk has acquired a decommission token yet
if markDiskNum == totalDisk {
dataNode.SetDecommissionStatus(DecommissionPrepare)
return DecommissionPrepare, float64(0)
} else {
if successDiskNum == totalDisk {
dataNode.SetDecommissionStatus(DecommissionSuccess)
return DecommissionSuccess, float64(1)
}
}
// otherwise derive the node's running status from its data partitions
partitions := dataNode.GetLatestDecommissionDataPartition(c)
// Get all dp on this dataNode
failedNum := 0
runningNum := 0
prepareNum := 0
stopNum := 0
for _, dp := range partitions {
if dp.IsDecommissionFailed() {
failedNum++
failedPartitionIds = append(failedPartitionIds, dp.PartitionID)
}
if dp.GetDecommissionStatus() == DecommissionRunning {
runningNum++
runningPartitionIds = append(runningPartitionIds, dp.PartitionID)
}
if dp.GetDecommissionStatus() == DecommissionPrepare {
prepareNum++
preparePartitionIds = append(preparePartitionIds, dp.PartitionID)
}
// the data node may have been paused earlier, and those partitions are still counted here
if dp.GetDecommissionStatus() == DecommissionPause {
stopNum++
stopPartitionIds = append(stopPartitionIds, dp.PartitionID)
}
partitionIds = append(partitionIds, dp.PartitionID)
}
progress = progress / float64(totalDisk)
if failedNum >= (len(partitions)-stopNum) && failedNum != 0 {
dataNode.markDecommissionFail()
return DecommissionFail, progress
}
dataNode.SetDecommissionStatus(DecommissionRunning)
if debug {
log.LogInfof("action[updateDecommissionStatus] dataNode[%v] progress[%v] totalNum[%v] "+
"partitionIds %v FailedNum[%v] failedPartitionIds %v, runningNum[%v] runningDp %v, prepareNum[%v] prepareDp %v "+
"stopNum[%v] stopPartitionIds %v ",
dataNode.Addr, progress, len(partitions), partitionIds, failedNum, failedPartitionIds, runningNum, runningPartitionIds,
prepareNum, preparePartitionIds, stopNum, stopPartitionIds)
}
return DecommissionRunning, progress
}
func (dataNode *DataNode) GetLatestDecommissionDataPartition(c *Cluster) (partitions []*DataPartition) {
log.LogDebugf("action[GetLatestDecommissionDataPartition]dataNode %v diskList %v", dataNode.Addr, dataNode.DecommissionDiskList)
for _, disk := range dataNode.DecommissionDiskList {
key := fmt.Sprintf("%s_%s", dataNode.Addr, disk)
// if not found, the disk may have already succeeded, so only track disks that are still in the cache
if value, ok := c.DecommissionDisks.Load(key); ok {
dd := value.(*DecommissionDisk)
dps := c.getAllDecommissionDataPartitionByDiskAndTerm(dd.SrcAddr, dd.DiskPath, dd.DecommissionTerm)
partitions = append(partitions, dps...)
dpIds := make([]uint64, 0)
for _, dp := range dps {
dpIds = append(dpIds, dp.PartitionID)
}
log.LogDebugf("action[GetLatestDecommissionDataPartition]dataNode %v disk %v dps[%v]",
dataNode.Addr, dd.DiskPath, dpIds)
}
}
return
}
func (dataNode *DataNode) GetDecommissionStatus() uint32 {
return atomic.LoadUint32(&dataNode.DecommissionStatus)
}
func (dataNode *DataNode) SetDecommissionStatus(status uint32) {
atomic.StoreUint32(&dataNode.DecommissionStatus, status)
}
func (dataNode *DataNode) GetDecommissionFailedDPByTerm(c *Cluster) (error, []uint64) {
var (
failedDps []uint64
err error
)
if dataNode.GetDecommissionStatus() != DecommissionFail {
err = fmt.Errorf("action[GetDecommissionDataNodeFailedDP]dataNode[%s] status must be failed,but[%d]",
dataNode.Addr, dataNode.GetDecommissionStatus())
return err, failedDps
}
partitions := dataNode.GetLatestDecommissionDataPartition(c)
log.LogDebugf("action[GetDecommissionDataNodeFailedDP] partitions len %v", len(partitions))
for _, dp := range partitions {
if dp.IsDecommissionFailed() {
failedDps = append(failedDps, dp.PartitionID)
log.LogWarnf("action[GetDecommissionDataNodeFailedDP] dp[%v] failed", dp.PartitionID)
}
}
log.LogWarnf("action[GetDecommissionDataNodeFailedDP] failed dp list [%v]", failedDps)
return nil, failedDps
}
func (dataNode *DataNode) GetDecommissionFailedDP(c *Cluster) (error, []uint64) {
var (
failedDps []uint64
err error
)
if dataNode.GetDecommissionStatus() != DecommissionFail {
err = fmt.Errorf("action[GetDecommissionDataNodeFailedDP]dataNode[%s] status must be failed,but[%d]",
dataNode.Addr, dataNode.GetDecommissionStatus())
return err, failedDps
}
partitions := c.getAllDecommissionDataPartitionByDataNode(dataNode.Addr)
log.LogDebugf("action[GetDecommissionDataNodeFailedDP] partitions len %v", len(partitions))
for _, dp := range partitions {
if dp.IsDecommissionFailed() {
failedDps = append(failedDps, dp.PartitionID)
log.LogWarnf("action[GetDecommissionDataNodeFailedDP] dp[%v] failed", dp.PartitionID)
}
}
log.LogWarnf("action[GetDecommissionDataNodeFailedDP] failed dp list [%v]", failedDps)
return nil, failedDps
}
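// markDecommission marks the whole data node for decommission and clears the retry count
// and disk list left over from any previous attempt.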
func (dataNode *DataNode) markDecommission(targetAddr string, raftForce bool, limit int) {
dataNode.SetDecommissionStatus(markDecommission)
dataNode.DecommissionRaftForce = raftForce
dataNode.DecommissionDstAddr = targetAddr
// reset the retry count in case a previous decommission attempt failed
dataNode.DecommissionRetry = 0
dataNode.DecommissionLimit = limit
dataNode.DecommissionDiskList = make([]string, 0)
}
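// canMarkDecommission reports whether a new decommission can be started on this node:
// only the Initial, Pause and Fail states may be marked again.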
func (dataNode *DataNode) canMarkDecommission() bool {
status := dataNode.GetDecommissionStatus()
return status == DecommissionInitial || status == DecommissionPause || status == DecommissionFail
}
func (dataNode *DataNode) markDecommissionSuccess(c *Cluster) {
dataNode.SetDecommissionStatus(DecommissionSuccess)
partitions := c.getAllDataPartitionByDataNode(dataNode.Addr)
// if only part of the data partitions were decommissioned, the node can still allocate dps in the future
if len(partitions) != 0 {
dataNode.ToBeOffline = false
}
dataNode.DecommissionCompleteTime = time.Now().Unix()
}
func (dataNode *DataNode) markDecommissionFail() {
dataNode.SetDecommissionStatus(DecommissionFail)
// dataNode.ToBeOffline = false
// dataNode.DecommissionCompleteTime = time.Now().Unix()
}
func (dataNode *DataNode) resetDecommissionStatus() {
dataNode.SetDecommissionStatus(DecommissionInitial)
dataNode.DecommissionRaftForce = false
dataNode.DecommissionDstAddr = ""
dataNode.DecommissionRetry = 0
dataNode.DecommissionLimit = 0
dataNode.DecommissionCompleteTime = 0
dataNode.DecommissionDiskList = make([]string, 0)
}
func (dataNode *DataNode) createVersionTask(volume string, version uint64, op uint8, addr string, verList []*proto.VolVersionInfo) (task *proto.AdminTask) {
request := &proto.MultiVersionOpRequest{
VolumeID: volume,
VerSeq: version,
Op: uint8(op),
Addr: addr,
VolVerList: verList,
}
log.LogInfof("action[createVersionTask] op %v datanode addr %v addr %v volume %v seq %v", op, dataNode.Addr, addr, volume, version)
task = proto.NewAdminTask(proto.OpVersionOperation, dataNode.Addr, request)
return
}
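// CanBePaused reports whether the node's decommission can be paused: only a marked,
// running, or already paused decommission may be paused.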
func (dataNode *DataNode) CanBePaused() bool {
status := dataNode.GetDecommissionStatus()
if status == DecommissionRunning || status == markDecommission || status == DecommissionPause {
return true
}
return false
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"math"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
// DataPartition represents the structure of storing the file contents.
type DataPartition struct {
PartitionID uint64
PartitionType int
PartitionTTL int64
LastLoadedTime int64
ReplicaNum uint8
Status int8
isRecover bool
Replicas []*DataReplica
LeaderReportTime int64
Hosts []string // host addresses
Peers []proto.Peer
offlineMutex sync.RWMutex
sync.RWMutex
total uint64
used uint64
MissingNodes map[string]int64 // key: address of the missing node, value: when the node is missing
VolName string
VolID uint64
modifyTime int64
createTime int64
lastWarnTime int64
OfflinePeerID uint64
FileInCoreMap map[string]*FileInCore
FilesWithMissingReplica map[string]int64 // key: file name, value: last time when a missing replica is found
RdOnly bool
addReplicaMutex sync.RWMutex
DecommissionRetry int
DecommissionStatus uint32
DecommissionSrcAddr string
DecommissionDstAddr string
DecommissionRaftForce bool
DecommissionSrcDiskPath string
DecommissionTerm uint64
DecommissionDstAddrSpecify bool // if true, do not roll back when adding a replica fails
DecommissionNeedRollback bool
DecommissionNeedRollbackTimes int
SpecialReplicaDecommissionStop chan bool // used to stop a special replica decommission
SpecialReplicaDecommissionStep uint32
IsDiscard bool
VerSeq uint64
RecoverStartTime time.Time
RecoverLastConsumeTime time.Duration
DecommissionWaitTimes int
}
type DataPartitionPreLoad struct {
PreloadCacheTTL uint64
preloadCacheCapacity int
preloadReplicaNum int
preloadZoneName string
}
func (d *DataPartitionPreLoad) toString() string {
return fmt.Sprintf("PreloadCacheTTL[%d]_preloadCacheCapacity[%d]_preloadReplicaNum[%d]_preloadZoneName[%s]",
d.PreloadCacheTTL, d.preloadCacheCapacity, d.preloadReplicaNum, d.preloadZoneName)
}
func newDataPartition(ID uint64, replicaNum uint8, volName string, volID uint64, partitionType int, partitionTTL int64) (partition *DataPartition) {
partition = new(DataPartition)
partition.ReplicaNum = replicaNum
partition.PartitionID = ID
partition.Hosts = make([]string, 0)
partition.Peers = make([]proto.Peer, 0)
partition.Replicas = make([]*DataReplica, 0)
partition.FileInCoreMap = make(map[string]*FileInCore, 0)
partition.FilesWithMissingReplica = make(map[string]int64)
partition.MissingNodes = make(map[string]int64)
partition.Status = proto.ReadOnly
partition.VolName = volName
partition.VolID = volID
partition.PartitionType = partitionType
partition.PartitionTTL = partitionTTL
now := time.Now().Unix()
partition.modifyTime = now
partition.createTime = now
partition.lastWarnTime = now
partition.SpecialReplicaDecommissionStop = make(chan bool, 1024)
partition.DecommissionStatus = DecommissionInitial
partition.SpecialReplicaDecommissionStep = SpecialDecommissionInitial
partition.DecommissionDstAddrSpecify = false
partition.LeaderReportTime = now
return
}
func (partition *DataPartition) setReadWrite() {
partition.Status = proto.ReadWrite
for _, replica := range partition.Replicas {
replica.Status = proto.ReadWrite
}
}
func (partition *DataPartition) isSpecialReplicaCnt() bool {
return partition.ReplicaNum == 1 || partition.ReplicaNum == 2
}
func (partition *DataPartition) isSingleReplica() bool {
return partition.ReplicaNum == 1
}
func (partition *DataPartition) isTwoReplica() bool {
return partition.ReplicaNum == 2
}
func (partition *DataPartition) resetFilesWithMissingReplica() {
partition.Lock()
defer partition.Unlock()
partition.FilesWithMissingReplica = make(map[string]int64)
}
func (partition *DataPartition) dataNodeStartTime() int64 {
partition.Lock()
defer partition.Unlock()
startTime := int64(0)
for _, replica := range partition.Replicas {
if startTime < replica.dataNode.StartTime {
startTime = replica.dataNode.StartTime
}
}
return startTime
}
func (partition *DataPartition) addReplica(replica *DataReplica) {
for _, r := range partition.Replicas {
if replica.Addr == r.Addr {
return
}
}
partition.Replicas = append(partition.Replicas, replica)
}
func (partition *DataPartition) tryToChangeLeaderByHost(host string) (err error) {
var dataNode *DataNode
for _, r := range partition.Replicas {
if host == r.Addr {
dataNode = r.dataNode
break
}
}
if dataNode == nil {
return fmt.Errorf("host not found[%v]", host)
}
task, err := partition.createTaskToTryToChangeLeader(host)
if err != nil {
return
}
if _, err = dataNode.TaskManager.syncSendAdminTask(task); err != nil {
return
}
return
}
func (partition *DataPartition) tryToChangeLeader(c *Cluster, dataNode *DataNode) (err error) {
task, err := partition.createTaskToTryToChangeLeader(dataNode.Addr)
if err != nil {
return
}
if _, err = dataNode.TaskManager.syncSendAdminTask(task); err != nil {
return
}
return
}
func (partition *DataPartition) prepareAddRaftMember(addPeer proto.Peer) (leaderAddr string, candidateAddrs []string, err error) {
partition.RLock()
defer partition.RUnlock()
if contains(partition.Hosts, addPeer.Addr) {
err = fmt.Errorf("vol[%v],data partition[%v] has contains host[%v]", partition.VolName, partition.PartitionID, addPeer.Addr)
return
}
candidateAddrs = make([]string, 0, len(partition.Hosts))
leaderAddr = partition.getLeaderAddr()
if leaderAddr != "" && contains(partition.Hosts, leaderAddr) {
candidateAddrs = append(candidateAddrs, leaderAddr)
} else {
leaderAddr = ""
}
for _, host := range partition.Hosts {
if host == leaderAddr {
continue
}
candidateAddrs = append(candidateAddrs, host)
}
return
}
func (partition *DataPartition) createTaskToTryToChangeLeader(addr string) (task *proto.AdminTask, err error) {
task = proto.NewAdminTask(proto.OpDataPartitionTryToLeader, addr, nil)
partition.resetTaskID(task)
return
}
func (partition *DataPartition) createTaskToAddRaftMember(addPeer proto.Peer, leaderAddr string) (task *proto.AdminTask, err error) {
task = proto.NewAdminTask(proto.OpAddDataPartitionRaftMember, leaderAddr, newAddDataPartitionRaftMemberRequest(partition.PartitionID, addPeer))
partition.resetTaskID(task)
return
}
func (partition *DataPartition) createTaskToRemoveRaftMember(c *Cluster, removePeer proto.Peer, force bool) (err error) {
doWork := func(leaderAddr string) error {
log.LogInfof("action[createTaskToRemoveRaftMember] vol[%v],data partition[%v] removePeer %v leaderAddr %v", partition.VolName, partition.PartitionID, removePeer, leaderAddr)
req := newRemoveDataPartitionRaftMemberRequest(partition.PartitionID, removePeer)
req.Force = force
task := proto.NewAdminTask(proto.OpRemoveDataPartitionRaftMember, leaderAddr, req)
partition.resetTaskID(task)
leaderDataNode, err := c.dataNode(leaderAddr)
if err != nil {
log.LogErrorf("action[createTaskToRemoveRaftMember] vol[%v],data partition[%v],err[%v]", partition.VolName, partition.PartitionID, err)
return err
}
if _, err = leaderDataNode.TaskManager.syncSendAdminTask(task); err != nil {
log.LogErrorf("action[createTaskToRemoveRaftMember] vol[%v],data partition[%v],err[%v]", partition.VolName, partition.PartitionID, err)
return err
}
return nil
}
leaderAddr := partition.getLeaderAddr()
if leaderAddr == "" {
if force {
for _, replica := range partition.Replicas {
if replica.Addr != removePeer.Addr {
leaderAddr = replica.Addr
}
doWork(leaderAddr)
}
} else {
err = proto.ErrNoLeader
return
}
} else {
return doWork(leaderAddr)
}
return
}
func (partition *DataPartition) createTaskToCreateDataPartition(addr string, dataPartitionSize uint64,
peers []proto.Peer, hosts []string, createType int, partitionType int, decommissionedDisks []string) (task *proto.AdminTask,
) {
leaderSize := 0
if createType == proto.DecommissionedCreateDataPartition {
leaderSize = int(partition.Replicas[0].Used)
}
task = proto.NewAdminTask(proto.OpCreateDataPartition, addr, newCreateDataPartitionRequest(
partition.VolName, partition.PartitionID, int(partition.ReplicaNum),
peers, int(dataPartitionSize), leaderSize, hosts, createType,
partitionType, decommissionedDisks, partition.VerSeq))
partition.resetTaskID(task)
return
}
func (partition *DataPartition) createTaskToDeleteDataPartition(addr string) (task *proto.AdminTask) {
task = proto.NewAdminTask(proto.OpDeleteDataPartition, addr, newDeleteDataPartitionRequest(partition.PartitionID))
partition.resetTaskID(task)
return
}
func (partition *DataPartition) resetTaskID(t *proto.AdminTask) {
t.ID = fmt.Sprintf("%v_DataPartitionID[%v]", t.ID, partition.PartitionID)
t.PartitionID = partition.PartitionID
}
// Check if there is a replica missing or not.
func (partition *DataPartition) hasMissingOneReplica(addr string, replicaNum int) (err error) {
hostNum := len(partition.Replicas)
inReplicas := false
for _, rep := range partition.Replicas {
if addr == rep.Addr {
inReplicas = true
}
}
if hostNum <= replicaNum-1 && inReplicas {
log.LogError(fmt.Sprintf("action[%v],partitionID:%v,err:%v",
"hasMissingOneReplica", partition.PartitionID, proto.ErrHasOneMissingReplica))
err = proto.ErrHasOneMissingReplica
}
return
}
func (partition *DataPartition) canBeOffLine(offlineAddr string) (err error) {
msg := fmt.Sprintf("action[canOffLine],partitionID:%v RocksDBHost:%v offLine:%v ",
partition.PartitionID, partition.Hosts, offlineAddr)
liveReplicas := partition.liveReplicas(defaultDataPartitionTimeOutSec)
otherLiveReplicas := make([]*DataReplica, 0)
for i := 0; i < len(liveReplicas); i++ {
replica := liveReplicas[i]
if replica.Addr != offlineAddr {
otherLiveReplicas = append(otherLiveReplicas, replica)
}
}
if partition.ReplicaNum >= 3 && len(otherLiveReplicas) < int(partition.ReplicaNum/2+1) {
var lives []string
for _, replica := range otherLiveReplicas {
lives = append(lives, replica.Addr)
}
msg = fmt.Sprintf(msg+" err:%v liveReplicas len:%v [%v] does not satisfy quorum %d ",
proto.ErrCannotBeOffLine, len(liveReplicas), lives, int(partition.ReplicaNum/2+1))
log.LogError(msg)
err = fmt.Errorf(msg)
return
}
if len(liveReplicas) == 0 {
msg = fmt.Sprintf(msg+" err:%v replicaNum:%v liveReplicas is 0 ", proto.ErrCannotBeOffLine, partition.ReplicaNum)
log.LogError(msg)
err = fmt.Errorf(msg)
return
}
return
}
// get all the valid replicas of the given data partition
func (partition *DataPartition) availableDataReplicas() (replicas []*DataReplica) {
replicas = make([]*DataReplica, 0)
for i := 0; i < len(partition.Replicas); i++ {
replica := partition.Replicas[i]
// the node reports heartbeat normally and the node is available
if replica.isLocationAvailable() && partition.hasHost(replica.Addr) {
replicas = append(replicas, replica)
}
}
return
}
// Remove the replica address from the memory.
func (partition *DataPartition) removeReplicaByAddr(addr string) {
delIndex := -1
var replica *DataReplica
for i := 0; i < len(partition.Replicas); i++ {
replica = partition.Replicas[i]
if replica.Addr == addr {
delIndex = i
break
}
}
msg := fmt.Sprintf("action[removeReplicaByAddr],data partition:%v on node:%v OffLine,the node is in replicas:%v", partition.PartitionID, addr, replica != nil)
log.LogDebug(msg)
if delIndex == -1 {
return
}
partition.FileInCoreMap = make(map[string]*FileInCore, 0)
partition.deleteReplicaByIndex(delIndex)
partition.modifyTime = time.Now().Unix()
return
}
func (partition *DataPartition) deleteReplicaByIndex(index int) {
var replicaAddrs []string
for _, replica := range partition.Replicas {
replicaAddrs = append(replicaAddrs, replica.Addr)
}
msg := fmt.Sprintf("deleteReplicaByIndex dp %v index:%v locations :%v ", partition.PartitionID, index, replicaAddrs)
log.LogInfo(msg)
replicasAfter := partition.Replicas[index+1:]
partition.Replicas = partition.Replicas[:index]
partition.Replicas = append(partition.Replicas, replicasAfter...)
}
func (partition *DataPartition) createLoadTasks() (tasks []*proto.AdminTask) {
partition.Lock()
defer partition.Unlock()
for _, addr := range partition.Hosts {
replica, err := partition.getReplica(addr)
if err != nil || !replica.isLive(defaultDataPartitionTimeOutSec) {
continue
}
replica.HasLoadResponse = false
tasks = append(tasks, partition.createLoadTask(addr))
}
partition.LastLoadedTime = time.Now().Unix()
return
}
func (partition *DataPartition) createLoadTask(addr string) (task *proto.AdminTask) {
task = proto.NewAdminTask(proto.OpLoadDataPartition, addr, newLoadDataPartitionMetricRequest(partition.PartitionID))
partition.resetTaskID(task)
return
}
func (partition *DataPartition) getReplica(addr string) (replica *DataReplica, err error) {
for index := 0; index < len(partition.Replicas); index++ {
replica = partition.Replicas[index]
if replica.Addr == addr {
return
}
}
log.LogErrorf("action[getReplica],partitionID:%v,locations:%v,err:%v",
partition.PartitionID, addr, dataReplicaNotFound(addr))
return nil, errors.Trace(dataReplicaNotFound(addr), "%v not found", addr)
}
func (partition *DataPartition) convertToDataPartitionResponse() (dpr *proto.DataPartitionResponse) {
dpr = new(proto.DataPartitionResponse)
partition.Lock()
defer partition.Unlock()
dpr.PartitionID = partition.PartitionID
dpr.PartitionType = partition.PartitionType
dpr.PartitionTTL = partition.PartitionTTL
dpr.Status = partition.Status
dpr.ReplicaNum = partition.ReplicaNum
dpr.Hosts = make([]string, len(partition.Hosts))
copy(dpr.Hosts, partition.Hosts)
dpr.LeaderAddr = partition.getLeaderAddr()
dpr.IsRecover = partition.isRecover
dpr.IsDiscard = partition.IsDiscard
return
}
func (partition *DataPartition) getLeaderAddr() (leaderAddr string) {
for _, replica := range partition.Replicas {
if replica.IsLeader {
return replica.Addr
}
}
return
}
func (partition *DataPartition) getLeaderAddrWithLock() (leaderAddr string) {
partition.RLock()
defer partition.RUnlock()
for _, replica := range partition.Replicas {
if replica.IsLeader {
return replica.Addr
}
}
return
}
func (partition *DataPartition) checkLoadResponse(timeOutSec int64) (isResponse bool) {
partition.RLock()
defer partition.RUnlock()
for _, addr := range partition.Hosts {
replica, err := partition.getReplica(addr)
if err != nil {
log.LogInfof("action[checkLoadResponse] partitionID:%v getReplica addr %v error %v", partition.PartitionID, addr, err)
return
}
timePassed := time.Now().Unix() - partition.LastLoadedTime
if !replica.HasLoadResponse && timePassed > timeToWaitForResponse {
msg := fmt.Sprintf("action[checkLoadResponse], partitionID:%v on node:%v no response, spent time %v s",
partition.PartitionID, addr, timePassed)
log.LogWarn(msg)
return
}
if !replica.isLive(timeOutSec) || !replica.HasLoadResponse {
log.LogInfof("action[checkLoadResponse] partitionID:%v getReplica addr %v replica.isLive(timeOutSec) %v", partition.PartitionID, addr, replica.isLive(timeOutSec))
return
}
}
isResponse = true
return
}
func (partition *DataPartition) getReplicaByIndex(index uint8) (replica *DataReplica) {
return partition.Replicas[int(index)]
}
func (partition *DataPartition) getFileCount() {
filesToBeDeleted := make([]string, 0)
partition.Lock()
defer partition.Unlock()
for _, replica := range partition.Replicas {
replica.FileCount = 0
}
for _, fc := range partition.FileInCoreMap {
if len(fc.MetadataArray) == 0 {
filesToBeDeleted = append(filesToBeDeleted, fc.Name)
}
for _, vfNode := range fc.MetadataArray {
replica := partition.getReplicaByIndex(vfNode.locIndex)
replica.FileCount++
}
}
for _, vfName := range filesToBeDeleted {
delete(partition.FileInCoreMap, vfName)
}
}
// Release the memory occupied by the data partition.
func (partition *DataPartition) releaseDataPartition() {
partition.Lock()
defer partition.Unlock()
liveReplicas := partition.getLiveReplicasFromHosts(defaultDataPartitionTimeOutSec)
for _, replica := range liveReplicas {
replica.HasLoadResponse = false
}
for name, fc := range partition.FileInCoreMap {
fc.MetadataArray = nil
delete(partition.FileInCoreMap, name)
}
partition.FileInCoreMap = make(map[string]*FileInCore, 0)
for name, fileMissReplicaTime := range partition.FilesWithMissingReplica {
if time.Now().Unix()-fileMissReplicaTime > 2*intervalToLoadDataPartition {
delete(partition.FilesWithMissingReplica, name)
}
}
}
func (partition *DataPartition) hasReplica(host string) (replica *DataReplica, ok bool) {
// using loop instead of map to save the memory
for _, replica = range partition.Replicas {
if replica.Addr == host {
ok = true
break
}
}
return
}
func (partition *DataPartition) checkReplicaNum(c *Cluster, vol *Vol) {
partition.RLock()
defer partition.RUnlock()
if int(partition.ReplicaNum) != len(partition.Hosts) {
msg := fmt.Sprintf("FIX DataPartition replicaNum,clusterID[%v] volName[%v] partitionID:%v orgReplicaNum:%v",
c.Name, vol.Name, partition.PartitionID, partition.ReplicaNum)
Warn(c.Name, msg)
if partition.isSpecialReplicaCnt() && partition.IsDecommissionFailed() { // in case of a restart with no message left, delete the last replica that was added
log.LogInfof("action[checkReplicaNum] volume %v partition %v need to lower replica", partition.VolName, partition.PartitionID)
vol.NeedToLowerReplica = true
return
}
}
if vol.dpReplicaNum != partition.ReplicaNum && !vol.NeedToLowerReplica {
log.LogDebugf("action[checkReplicaNum] volume %v partiton %v replicanum abnornal %v %v",
partition.VolName, partition.PartitionID, vol.dpReplicaNum, partition.ReplicaNum)
vol.NeedToLowerReplica = true
}
}
func (partition *DataPartition) hostsToString() (hosts string) {
return strings.Join(partition.Hosts, underlineSeparator)
}
func (partition *DataPartition) setToNormal() {
partition.Lock()
defer partition.Unlock()
partition.isRecover = false
}
func (partition *DataPartition) hasHost(addr string) (ok bool) {
for _, host := range partition.Hosts {
if host == addr {
ok = true
break
}
}
return
}
func (partition *DataPartition) liveReplicas(timeOutSec int64) (replicas []*DataReplica) {
replicas = make([]*DataReplica, 0)
for i := 0; i < len(partition.Replicas); i++ {
replica := partition.Replicas[i]
if replica.isLive(timeOutSec) && partition.hasHost(replica.Addr) {
replicas = append(replicas, replica)
}
}
return
}
// get all the live replicas from the persistent hosts
func (partition *DataPartition) getLiveReplicasFromHosts(timeOutSec int64) (replicas []*DataReplica) {
replicas = make([]*DataReplica, 0)
for _, host := range partition.Hosts {
replica, ok := partition.hasReplica(host)
if !ok {
continue
}
if replica.isLive(timeOutSec) {
replicas = append(replicas, replica)
}
}
return
}
// get all the live replicas, whether or not they are in the persistent hosts
func (partition *DataPartition) getLiveReplicas(timeOutSec int64) (replicas []*DataReplica) {
replicas = make([]*DataReplica, 0)
for _, replica := range partition.Replicas {
if replica.isLive(timeOutSec) {
replicas = append(replicas, replica)
}
}
return
}
func (partition *DataPartition) checkAndRemoveMissReplica(addr string) {
if _, ok := partition.MissingNodes[addr]; ok {
delete(partition.MissingNodes, addr)
}
}
func (partition *DataPartition) loadFile(dataNode *DataNode, resp *proto.LoadDataPartitionResponse) {
partition.Lock()
defer partition.Unlock()
index, err := partition.getReplicaIndex(dataNode.Addr)
if err != nil {
msg := fmt.Sprintf("loadFile partitionID:%v on node:%v don't report :%v ", partition.PartitionID, dataNode.Addr, err)
log.LogWarn(msg)
return
}
replica := partition.Replicas[index]
for _, dpf := range resp.PartitionSnapshot {
if dpf == nil {
continue
}
fc, ok := partition.FileInCoreMap[dpf.Name]
if !ok {
fc = newFileInCore(dpf.Name)
partition.FileInCoreMap[dpf.Name] = fc
}
log.LogInfof("updateFileInCore partition %v", partition.PartitionID)
fc.updateFileInCore(partition.PartitionID, dpf, replica, index)
}
replica.HasLoadResponse = true
replica.Used = resp.Used
}
func (partition *DataPartition) getReplicaIndex(addr string) (index int, err error) {
for index = 0; index < len(partition.Replicas); index++ {
replica := partition.Replicas[index]
if replica.Addr == addr {
return
}
}
log.LogErrorf("action[getReplicaIndex],partitionID:%v,location:%v,err:%v",
partition.PartitionID, addr, dataReplicaNotFound(addr))
return -1, errors.Trace(dataReplicaNotFound(addr), "%v not found ", addr)
}
func (partition *DataPartition) update(action, volName string, newPeers []proto.Peer, newHosts []string, c *Cluster) (err error) {
if len(newHosts) == 0 {
log.LogErrorf("update. action[%v] update partition[%v] vol[%v] old host[%v]", action, partition.PartitionID, volName, partition.Hosts)
return
}
orgHosts := make([]string, len(partition.Hosts))
copy(orgHosts, partition.Hosts)
oldPeers := make([]proto.Peer, len(partition.Peers))
copy(oldPeers, partition.Peers)
partition.Hosts = newHosts
partition.Peers = newPeers
if err = c.syncUpdateDataPartition(partition); err != nil {
partition.Hosts = orgHosts
partition.Peers = oldPeers
return errors.Trace(err, "action[%v] update partition[%v] vol[%v] failed", action, partition.PartitionID, volName)
}
msg := fmt.Sprintf("action[%v] success,vol[%v] partitionID:%v "+
"oldHosts:%v newHosts:%v,oldPees[%v],newPeers[%v]",
action, volName, partition.PartitionID, orgHosts, partition.Hosts, oldPeers, partition.Peers)
log.LogWarnf(msg)
return
}
func (partition *DataPartition) updateMetric(vr *proto.DataPartitionReport, dataNode *DataNode, c *Cluster) {
if !partition.hasHost(dataNode.Addr) {
return
}
partition.Lock()
defer partition.Unlock()
replica, err := partition.getReplica(dataNode.Addr)
if err != nil {
replica = newDataReplica(dataNode)
partition.addReplica(replica)
}
partition.total = vr.Total
replica.Status = int8(vr.PartitionStatus)
replica.Total = vr.Total
replica.Used = vr.Used
partition.setMaxUsed()
replica.FileCount = uint32(vr.ExtentCount)
replica.setAlive()
replica.IsLeader = vr.IsLeader
if replica.IsLeader {
partition.LeaderReportTime = time.Now().Unix()
}
replica.NeedsToCompare = vr.NeedCompare
replica.DecommissionRepairProgress = vr.DecommissionRepairProgress
if replica.DiskPath != vr.DiskPath && vr.DiskPath != "" {
oldDiskPath := replica.DiskPath
replica.DiskPath = vr.DiskPath
err = c.syncUpdateDataPartition(partition)
if err != nil {
replica.DiskPath = oldDiskPath
}
}
partition.checkAndRemoveMissReplica(dataNode.Addr)
if replica.Status == proto.ReadWrite && (partition.RdOnly || replica.dataNode.RdOnly) {
replica.Status = int8(proto.ReadOnly)
}
}
func (partition *DataPartition) setMaxUsed() {
var maxUsed uint64
for _, r := range partition.Replicas {
if r.Used > maxUsed {
maxUsed = r.Used
}
}
partition.used = maxUsed
}
func (partition *DataPartition) getMaxUsedSpace() uint64 {
return partition.used
}
func (partition *DataPartition) afterCreation(nodeAddr, diskPath string, c *Cluster) (err error) {
dataNode, err := c.dataNode(nodeAddr)
if err != nil {
return err
}
replica := newDataReplica(dataNode)
if partition.IsDecommissionRunning() {
replica.Status = proto.Recovering
} else {
replica.Status = proto.Unavailable
}
replica.DiskPath = diskPath
replica.ReportTime = time.Now().Unix()
replica.Total = util.DefaultDataPartitionSize
partition.addReplica(replica)
partition.checkAndRemoveMissReplica(replica.Addr)
log.LogInfof("action[afterCreation] dp %v add new replica %v ", partition.PartitionID, dataNode.Addr)
return
}
// Check if it makes sense to compare the CRC.
// Note that if loading the data into a data node is not finished, then there is no need to check the CRC.
func (partition *DataPartition) needsToCompareCRC() (needCompare bool) {
partition.Lock()
defer partition.Unlock()
if partition.isRecover {
return false
}
needCompare = true
for _, replica := range partition.Replicas {
if !replica.NeedsToCompare {
needCompare = false
break
}
}
return
}
func (partition *DataPartition) containsBadDisk(diskPath string, nodeAddr string) bool {
partition.RLock()
defer partition.RUnlock()
for _, replica := range partition.Replicas {
if nodeAddr == replica.Addr && diskPath == replica.DiskPath {
return true
}
}
return false
}
func (partition *DataPartition) getReplicaDisk(nodeAddr string) string {
partition.RLock()
defer partition.RUnlock()
for _, replica := range partition.Replicas {
if nodeAddr == replica.Addr {
return replica.DiskPath
}
}
return ""
}
func (partition *DataPartition) getMinus() (minus float64) {
partition.RLock()
defer partition.RUnlock()
used := partition.Replicas[0].Used
for _, replica := range partition.Replicas {
if math.Abs(float64(replica.Used)-float64(used)) > minus {
minus = math.Abs(float64(replica.Used) - float64(used))
}
}
return minus
}
func (partition *DataPartition) activeUsedSimilar() bool {
partition.RLock()
defer partition.RUnlock()
liveReplicas := partition.liveReplicas(defaultDataPartitionTimeOutSec)
used := liveReplicas[0].Used
minus := float64(0)
for _, replica := range liveReplicas {
if math.Abs(float64(replica.Used)-float64(used)) > minus {
minus = math.Abs(float64(replica.Used) - float64(used))
}
}
return minus < util.GB
}
func (partition *DataPartition) getToBeDecommissionHost(replicaNum int) (host string) {
partition.RLock()
defer partition.RUnlock()
// the new replica may have been added successfully even though decommission failed (e.g. rollback failed because deleting the new replica timed out)
if partition.isSpecialReplicaCnt() &&
partition.GetSpecialReplicaDecommissionStep() >= SpecialDecommissionWaitAddRes &&
partition.IsDecommissionFailed() {
log.LogInfof("action[getToBeDecommissionHost] get single replica partition %v need to decommission %v",
partition.PartitionID, partition.DecommissionDstAddr)
host = partition.DecommissionDstAddr
return
}
hostLen := len(partition.Hosts)
if hostLen <= 1 || hostLen <= replicaNum {
return
}
host = partition.Hosts[hostLen-1]
return
}
func (partition *DataPartition) removeOneReplicaByHost(c *Cluster, host string, isReplicaNormal bool) (err error) {
if err = c.removeDataReplica(partition, host, false, false); err != nil {
return
}
partition.RLock()
defer partition.RUnlock()
//if partition.isSpecialReplicaCnt() && isReplicaNormal {
// partition.SingleDecommissionStatus = 0
// partition.SingleDecommissionAddr = ""
// return
//}
oldReplicaNum := partition.ReplicaNum
partition.ReplicaNum = partition.ReplicaNum - 1
if err = c.syncUpdateDataPartition(partition); err != nil {
partition.ReplicaNum = oldReplicaNum
}
return
}
func (partition *DataPartition) getNodeSets() (nodeSets []uint64) {
partition.RLock()
defer partition.RUnlock()
nodeSetMap := map[uint64]struct{}{}
for _, replica := range partition.Replicas {
if replica.dataNode == nil {
continue
}
nodeSetMap[replica.dataNode.NodeSetID] = struct{}{}
}
for nodeSet := range nodeSetMap {
nodeSets = append(nodeSets, nodeSet)
}
return
}
func (partition *DataPartition) getZones() (zones []string) {
partition.RLock()
defer partition.RUnlock()
zoneMap := map[string]struct{}{}
for _, replica := range partition.Replicas {
if replica.dataNode == nil {
continue
}
zoneMap[replica.dataNode.ZoneName] = struct{}{}
}
for zone := range zoneMap {
zones = append(zones, zone)
}
return
}
func (partition *DataPartition) getLiveZones(offlineAddr string) (zones []string) {
partition.RLock()
defer partition.RUnlock()
for _, replica := range partition.Replicas {
if replica.dataNode == nil {
continue
}
if replica.dataNode.Addr == offlineAddr {
continue
}
zones = append(zones, replica.dataNode.ZoneName)
}
return
}
func (partition *DataPartition) buildDpInfo(c *Cluster) *proto.DataPartitionInfo {
partition.RLock()
defer partition.RUnlock()
replicas := make([]*proto.DataReplica, len(partition.Replicas))
for i, replica := range partition.Replicas {
dataReplica := replica.DataReplica
dataReplica.DomainAddr = replica.dataNode.DomainAddr
replicas[i] = &dataReplica
}
fileInCoreMap := make(map[string]*proto.FileInCore)
for k, v := range partition.FileInCoreMap {
fileInCoreMap[k] = v.clone()
}
zones := make([]string, len(partition.Hosts))
nodeSets := make([]uint64, len(partition.Hosts))
for idx, host := range partition.Hosts {
dataNode, err := c.dataNode(host)
if err == nil {
zones[idx] = dataNode.ZoneName
nodeSets[idx] = dataNode.NodeSetID
}
}
forbidden := true
vol, err := c.getVol(partition.VolName)
if err == nil {
forbidden = vol.Forbidden
} else {
log.LogErrorf("action[buildDpInfo]failed to get volume %v, err %v", partition.VolName, err)
}
return &proto.DataPartitionInfo{
PartitionID: partition.PartitionID,
PartitionTTL: partition.PartitionTTL,
PartitionType: partition.PartitionType,
LastLoadedTime: partition.LastLoadedTime,
ReplicaNum: partition.ReplicaNum,
Status: partition.Status,
Replicas: replicas,
Hosts: partition.Hosts,
Peers: partition.Peers,
Zones: zones,
NodeSets: nodeSets,
MissingNodes: partition.MissingNodes,
VolName: partition.VolName,
VolID: partition.VolID,
FileInCoreMap: fileInCoreMap,
OfflinePeerID: partition.OfflinePeerID,
IsRecover: partition.isRecover,
FilesWithMissingReplica: partition.FilesWithMissingReplica,
IsDiscard: partition.IsDiscard,
SingleDecommissionStatus: partition.GetSpecialReplicaDecommissionStep(),
Forbidden: forbidden,
}
}
const (
DecommissionInitial uint32 = iota
markDecommission
DecommissionPause // a marked or running decommission can be paused
DecommissionPrepare
DecommissionRunning
DecommissionSuccess
DecommissionFail
)
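// Decommission status lifecycle for a data partition, as implemented by the methods
// below (DecommissionSuccess is set elsewhere once replica repair completes, or
// immediately for non-normal partitions):
//
//	Initial/Pause/Fail --MarkDecommissionStatus--> markDecommission
//	markDecommission   --Decommission-----------> DecommissionPrepare --> DecommissionRunning
//	markDecommission/DecommissionRunning --PauseDecommission--> DecommissionPause
//	errors retry back to markDecommission, or DecommissionFail after
//	defaultDecommissionRetryLimit attempts (or when a rollback is needed)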
const (
SpecialDecommissionInitial uint32 = iota
SpecialDecommissionEnter
SpecialDecommissionWaitAddRes
SpecialDecommissionWaitAddResFin
SpecialDecommissionRemoveOld
)
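// The Special* steps above track the single/two-replica decommission path (see
// decommissionSingleDp), which waits for the new replica to be added
// (WaitAddRes/WaitAddResFin) before the old replica is removed (RemoveOld).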
const InvalidDecommissionDpCnt = -1
const (
defaultDecommissionParallelLimit = 10
defaultDecommissionRetryLimit = 5
defaultDecommissionRollbackLimit = 3
defaultDecommissionDiskParallelFactor = 0
)
func GetDecommissionStatusMessage(status uint32) string {
switch status {
case DecommissionInitial:
return "Initial"
case markDecommission:
return "Marked"
case DecommissionPause:
return "Paused"
case DecommissionRunning:
return "Running"
case DecommissionSuccess:
return "Success"
case DecommissionFail:
return "Failed"
default:
return "Unknown"
}
}
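// MarkDecommissionStatus tries to move the partition into the markDecommission state for
// the given decommission term. A paused partition first has its replica repair resumed;
// otherwise the decommission context (src/dst addr, disk, raftForce, term) is reset and
// recorded. It returns false if the partition cannot be marked.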
func (partition *DataPartition) MarkDecommissionStatus(srcAddr, dstAddr, srcDisk string, raftForce bool, term uint64, c *Cluster) bool {
if !partition.canMarkDecommission(term) {
log.LogWarnf("action[MarkDecommissionStatus] dp[%v] cannot make decommission:status[%v]",
partition.PartitionID, partition.GetDecommissionStatus())
return false
}
if partition.IsDecommissionPaused() {
if !partition.pauseReplicaRepair(partition.DecommissionDstAddr, false, c) {
log.LogWarnf("action[MarkDecommissionStatus] dp [%d] recover from stop failed", partition.PartitionID)
return false
}
partition.SetDecommissionStatus(markDecommission)
// update DecommissionTerm for the next query
partition.DecommissionTerm = term
return true
}
// initial state, or restarting a failed decommission
partition.ResetDecommissionStatus()
partition.SetDecommissionStatus(markDecommission)
partition.DecommissionSrcAddr = srcAddr
partition.DecommissionDstAddr = dstAddr
partition.DecommissionSrcDiskPath = srcDisk
partition.DecommissionRaftForce = raftForce
partition.DecommissionTerm = term
// reset special replicas decommission status
partition.isRecover = false
partition.SetSpecialReplicaDecommissionStep(SpecialDecommissionInitial)
if partition.DecommissionSrcDiskPath == "" {
partition.RLock()
replica, _ := partition.getReplica(srcAddr)
partition.RUnlock()
if replica != nil {
partition.DecommissionSrcDiskPath = replica.DiskPath
}
}
if dstAddr != "" {
partition.DecommissionDstAddrSpecify = true
}
log.LogDebugf("action[MarkDecommissionStatus] dp[%v] SrcAddr %v, dstAddr %v, diskPath %v, raftForce %v term %v",
partition.PartitionID, partition.DecommissionSrcAddr, partition.DecommissionDstAddr,
partition.DecommissionSrcDiskPath, partition.DecommissionRaftForce, partition.DecommissionTerm)
return true
}
func (partition *DataPartition) SetDecommissionStatus(status uint32) {
atomic.StoreUint32(&partition.DecommissionStatus, status)
}
func (partition *DataPartition) SetSpecialReplicaDecommissionStep(step uint32) {
atomic.StoreUint32(&partition.SpecialReplicaDecommissionStep, step)
}
func (partition *DataPartition) GetDecommissionStatus() uint32 {
return atomic.LoadUint32(&partition.DecommissionStatus)
}
func (partition *DataPartition) GetSpecialReplicaDecommissionStep() uint32 {
return atomic.LoadUint32(&partition.SpecialReplicaDecommissionStep)
}
func (partition *DataPartition) IsDecommissionSuccess() bool {
return partition.GetDecommissionStatus() == DecommissionSuccess
}
func (partition *DataPartition) IsDecommissionFailed() bool {
return partition.GetDecommissionStatus() == DecommissionFail
}
func (partition *DataPartition) IsDecommissionRunning() bool {
return partition.GetDecommissionStatus() == DecommissionRunning
}
func (partition *DataPartition) IsDecommissionPrepare() bool {
return partition.GetDecommissionStatus() == DecommissionPrepare
}
func (partition *DataPartition) IsDecommissionPaused() bool {
return partition.GetDecommissionStatus() == DecommissionPause
}
func (partition *DataPartition) IsDecommissionInitial() bool {
return partition.GetDecommissionStatus() == DecommissionInitial
}
func (partition *DataPartition) IsMarkDecommission() bool {
return partition.GetDecommissionStatus() == markDecommission
}
func (partition *DataPartition) IsDoingDecommission() bool {
decommStatus := partition.GetDecommissionStatus()
return (decommStatus > DecommissionInitial && decommStatus < DecommissionSuccess)
}
func (partition *DataPartition) TryToDecommission(c *Cluster) bool {
if !partition.IsMarkDecommission() {
log.LogWarnf("action[TryToDecommission] failed dp[%v] status expected markDecommission[%v]",
partition.PartitionID, atomic.LoadUint32(&partition.DecommissionStatus))
return false
}
log.LogDebugf("action[TryToDecommission] dp[%v]", partition.PartitionID)
return partition.Decommission(c)
}
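// Decommission performs a single decommission attempt: non-normal partitions are deleted
// directly; special replica-count partitions without raftForce go through the
// single-replica path (decommissionSingleDp); otherwise the source replica is removed, a
// new replica is added on the target, and the partition enters DecommissionRunning while
// it repairs. On error the attempt is retried up to defaultDecommissionRetryLimit times,
// or marked as failed immediately when a rollback is needed.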
func (partition *DataPartition) Decommission(c *Cluster) bool {
var (
msg string
err error
srcAddr = partition.DecommissionSrcAddr
targetAddr = partition.DecommissionDstAddr
)
defer func() {
c.syncUpdateDataPartition(partition)
}()
log.LogInfof("action[decommissionDataPartition] dp[%v] from node[%v] to node[%v], raftForce[%v] SingleDecommissionStatus[%v]",
partition.PartitionID, srcAddr, targetAddr, partition.DecommissionRaftForce, partition.GetSpecialReplicaDecommissionStep())
begin := time.Now()
partition.SetDecommissionStatus(DecommissionPrepare)
err = c.syncUpdateDataPartition(partition)
if err != nil {
log.LogWarnf("action[decommissionDataPartition] dp [%v] update to prepare failed", partition.PartitionID)
goto errHandler
}
// delete if not normal data partition
if !proto.IsNormalDp(partition.PartitionType) {
c.vols[partition.VolName].deleteDataPartition(c, partition)
partition.SetDecommissionStatus(DecommissionSuccess)
log.LogWarnf("action[decommissionDataPartition]delete dp directly[%v]", partition.PartitionID)
return true
}
if err = c.validateDecommissionDataPartition(partition, srcAddr); err != nil {
goto errHandler
}
err = c.updateDataNodeSize(targetAddr, partition)
if err != nil {
log.LogWarnf("action[decommissionDataPartition] target addr can't be writable, add %s %s", targetAddr, err.Error())
goto errHandler
}
defer func() {
if err != nil {
c.returnDataSize(targetAddr, partition)
}
}()
// single/two-replica partition without raftForce
if partition.isSpecialReplicaCnt() && !partition.DecommissionRaftForce {
if partition.GetSpecialReplicaDecommissionStep() == SpecialDecommissionInitial {
partition.SetSpecialReplicaDecommissionStep(SpecialDecommissionEnter)
}
if err = c.decommissionSingleDp(partition, targetAddr, srcAddr); err != nil {
goto errHandler
}
} else {
if err = c.removeDataReplica(partition, srcAddr, false, partition.DecommissionRaftForce); err != nil {
goto errHandler
}
if err = c.addDataReplica(partition, targetAddr); err != nil {
goto errHandler
}
newReplica, _ := partition.getReplica(targetAddr)
newReplica.Status = proto.Recovering // in case the heartbeat response has not arrived yet
partition.isRecover = true
partition.Status = proto.ReadOnly
partition.SetDecommissionStatus(DecommissionRunning)
partition.RecoverStartTime = time.Now()
c.putBadDataPartitionIDsByDiskPath(partition.DecommissionSrcDiskPath, partition.DecommissionSrcAddr, partition.PartitionID)
}
// only a 3-replica decommission is paused here, and its token needs to be released
if partition.IsDecommissionPaused() {
log.LogInfof("action[decommissionDataPartition]clusterID[%v] partitionID:%v decommission paused", c.Name, partition.PartitionID)
if !partition.pauseReplicaRepair(partition.DecommissionDstAddr, true, c) {
log.LogWarnf("action[decommissionDataPartition]clusterID[%v] partitionID:%v paused failed", c.Name, partition.PartitionID)
}
return true
} else {
log.LogInfof("action[decommissionDataPartition]clusterID[%v] partitionID:%v "+
"on node:%v offline success,newHost[%v],PersistenceHosts:[%v], SingleDecommissionStatus[%v]prepare consume[%v]seconds",
c.Name, partition.PartitionID, srcAddr, targetAddr, partition.Hosts, partition.GetSpecialReplicaDecommissionStep(), time.Since(begin).Seconds())
return true
}
errHandler:
// a special replica count partition received the stop signal; do not reset SingleDecommissionStatus so decommission can resume later
if partition.GetDecommissionStatus() == DecommissionPause {
log.LogWarnf("action[decommissionDataPartition] partitionID:%v is stopped", partition.PartitionID)
return true
}
partition.DecommissionRetry++
if partition.DecommissionRetry >= defaultDecommissionRetryLimit {
partition.SetDecommissionStatus(DecommissionFail)
} else {
partition.SetDecommissionStatus(markDecommission) // retry again
partition.DecommissionWaitTimes = 0
}
// if rollback is needed, set the status to fail (DecommissionDstAddr is reset during rollback)
if partition.DecommissionNeedRollback {
partition.SetDecommissionStatus(DecommissionFail)
}
msg = fmt.Sprintf("clusterID[%v] vol[%v] partitionID[%v] on Node:%v "+
"to newHost:%v Err:%v, PersistenceHosts:%v ,retry %v,status %v, isRecover %v SingleDecommissionStatus[%v]"+
" DecommissionNeedRollback[%v]",
c.Name, partition.VolName, partition.PartitionID, srcAddr, targetAddr, err.Error(),
partition.Hosts, partition.DecommissionRetry, partition.GetDecommissionStatus(),
partition.isRecover, partition.GetSpecialReplicaDecommissionStep(), partition.DecommissionNeedRollback)
Warn(c.Name, msg)
log.LogWarnf("action[decommissionDataPartition] %s", msg)
return false
}
func (partition *DataPartition) PauseDecommission(c *Cluster) bool {
status := partition.GetDecommissionStatus()
// support retry pause if pause failed last time
if status == DecommissionInitial || status == DecommissionSuccess ||
status == DecommissionFail {
log.LogWarnf("action[PauseDecommission] dp[%v] cannot be stopped status[%v]", partition.PartitionID, status)
return true
}
defer c.syncUpdateDataPartition(partition)
log.LogDebugf("action[PauseDecommission] dp[%v] status %v set to stop ",
partition.PartitionID, partition.GetDecommissionStatus())
if status == markDecommission {
partition.SetDecommissionStatus(DecommissionPause)
return true
}
if partition.isSpecialReplicaCnt() {
log.LogDebugf("action[PauseDecommission]special replica dp[%v] status[%v]",
partition.PartitionID, partition.GetSpecialReplicaDecommissionStep())
partition.SpecialReplicaDecommissionStop <- false
// if special replica is repairing, stop the process
if partition.GetSpecialReplicaDecommissionStep() == SpecialDecommissionWaitAddRes {
if !partition.pauseReplicaRepair(partition.DecommissionDstAddr, true, c) {
return false
}
}
} else {
if partition.IsDecommissionRunning() {
if !partition.pauseReplicaRepair(partition.DecommissionDstAddr, true, c) {
return false
}
log.LogDebugf("action[PauseDecommission] dp[%v] status [%v] send stop signal ",
partition.PartitionID, partition.GetDecommissionStatus())
}
}
partition.SetDecommissionStatus(DecommissionPause)
partition.isRecover = false
return true
}
func (partition *DataPartition) ResetDecommissionStatus() {
partition.DecommissionDstAddr = ""
partition.DecommissionSrcAddr = ""
partition.DecommissionRetry = 0
partition.DecommissionRaftForce = false
partition.DecommissionSrcDiskPath = ""
partition.isRecover = false
partition.DecommissionTerm = 0
partition.DecommissionDstAddrSpecify = false
partition.DecommissionNeedRollback = false
partition.DecommissionNeedRollbackTimes = 0
partition.SetDecommissionStatus(DecommissionInitial)
partition.SetSpecialReplicaDecommissionStep(SpecialDecommissionInitial)
partition.DecommissionWaitTimes = 0
}
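// rollback undoes a failed decommission attempt: it removes the replica that was added on
// the destination, restores the source replica's raft membership, releases the
// decommission token, and re-marks the partition so it can be decommissioned again.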
func (partition *DataPartition) rollback(c *Cluster) {
// delete the newly added replica; this may time out, in which case rollback is retried next time
err := c.removeDataReplica(partition, partition.DecommissionDstAddr, false, false)
if err != nil {
// keep decommission status to failed for rollback
log.LogWarnf("action[rollback]dp[%v] rollback to del replica[%v] failed:%v",
partition.PartitionID, partition.DecommissionDstAddr, err.Error())
return
}
err = partition.restoreReplicaMeta(c)
if err != nil {
return
}
// release token first
partition.ReleaseDecommissionToken(c)
// reset status if rollback success
partition.DecommissionDstAddr = ""
partition.DecommissionRetry = 0
partition.isRecover = false
partition.DecommissionNeedRollback = false
partition.DecommissionWaitTimes = 0
partition.SetDecommissionStatus(markDecommission)
partition.SetSpecialReplicaDecommissionStep(SpecialDecommissionInitial)
c.syncUpdateDataPartition(partition)
log.LogWarnf("action[rollback]dp[%v] rollback success", partition.PartitionID)
return
}
func (partition *DataPartition) addToDecommissionList(c *Cluster) {
if partition.DecommissionSrcAddr == "" {
return
}
var (
dataNode *DataNode
zone *Zone
ns *nodeSet
err error
)
if dataNode, err = c.dataNode(partition.DecommissionSrcAddr); err != nil {
log.LogWarnf("action[addToDecommissionList]find dp[%v] src decommission dataNode [%v] failed[%v]",
partition.PartitionID, partition.DecommissionSrcAddr, err.Error())
return
}
if dataNode.ZoneName == "" {
log.LogWarnf("action[addToDecommissionList]dataNode[%v] zone is nil", dataNode.Addr)
return
}
if zone, err = c.t.getZone(dataNode.ZoneName); err != nil {
log.LogWarnf("action[addToDecommissionList]dataNode[%v] zone is nil:%v", dataNode.Addr, err.Error())
return
}
if ns, err = zone.getNodeSet(dataNode.NodeSetID); err != nil {
log.LogWarnf("action[addToDecommissionList]dataNode[%v] nodeSet is nil:%v", dataNode.Addr, err.Error())
return
}
ns.AddToDecommissionDataPartitionList(partition, c)
log.LogDebugf("action[addToDecommissionList]dp[%v] decommission src[%v] Disk[%v] dst[%v] status[%v] specialStep[%v],"+
" add to decommission list[%v] ",
partition.PartitionID, partition.DecommissionSrcAddr, partition.DecommissionSrcDiskPath,
partition.DecommissionDstAddr, partition.GetDecommissionStatus(), partition.GetSpecialReplicaDecommissionStep(), ns.ID)
}
func (partition *DataPartition) checkConsumeToken() bool {
if partition.GetDecommissionStatus() == DecommissionRunning {
return true
}
return false
}
// the partition can be marked only from the initial, paused, or failed state, or from a different decommission term
func (partition *DataPartition) canMarkDecommission(term uint64) bool {
// the dp's decommission status may not have been reset since the last decommission
if partition.DecommissionTerm != term {
return true
}
status := partition.GetDecommissionStatus()
if status == DecommissionInitial ||
status == DecommissionPause ||
status == DecommissionFail {
return true
}
return false
}
func (partition *DataPartition) canAddToDecommissionList() bool {
status := partition.GetDecommissionStatus()
if status == DecommissionInitial ||
status == DecommissionPause ||
status == DecommissionSuccess ||
status == DecommissionFail {
return false
}
return true
}
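// tryRollback rolls the partition back if needRollback allows it, counting the attempt in
// DecommissionNeedRollbackTimes.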
func (partition *DataPartition) tryRollback(c *Cluster) bool {
if !partition.needRollback(c) {
return false
}
partition.DecommissionNeedRollbackTimes++
partition.rollback(c)
return true
}
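// pauseReplicaRepair asks the replica on replicaAddr to stop (stop=true) or resume
// (stop=false) data partition repair, retrying the node lookup and admin task a few times
// before giving up. The recover timers are adjusted so that time spent paused is not
// counted as repair time.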
func (partition *DataPartition) pauseReplicaRepair(replicaAddr string, stop bool, c *Cluster) bool {
index := partition.findReplica(replicaAddr)
if index == -1 {
log.LogWarnf("action[pauseReplicaRepair]dp[%v] can't find replica %v", partition.PartitionID, replicaAddr)
// maybe paused from rollback[mark]
return true
}
const RetryMax = 5
var (
dataNode *DataNode
err error
retry = 0
)
for retry <= RetryMax {
if dataNode, err = c.dataNode(replicaAddr); err != nil {
retry++
time.Sleep(time.Second)
log.LogWarnf("action[pauseReplicaRepair]dp[%v] can't find dataNode %v", partition.PartitionID, partition.DecommissionSrcAddr)
continue
}
task := partition.createTaskToStopDataPartitionRepair(replicaAddr, stop)
packet, err := dataNode.TaskManager.syncSendAdminTask(task)
if err != nil {
retry++
time.Sleep(time.Second)
log.LogWarnf("action[pauseReplicaRepair]dp[%v] send stop task failed %v", partition.PartitionID, err.Error())
continue
}
if !stop {
partition.RecoverStartTime = time.Now().Add(-partition.RecoverLastConsumeTime)
partition.RecoverLastConsumeTime = time.Duration(0)
log.LogDebugf("action[pauseReplicaRepair]dp[%v] replica %v RecoverStartTime sub %v seconds",
partition.PartitionID, replicaAddr, partition.RecoverLastConsumeTime.Seconds())
} else {
partition.RecoverLastConsumeTime = time.Since(partition.RecoverStartTime)
log.LogDebugf("action[pauseReplicaRepair]dp[%v] replica %v already recover %v seconds",
partition.PartitionID, replicaAddr, partition.RecoverLastConsumeTime.Seconds())
}
log.LogDebugf("action[pauseReplicaRepair]dp[%v] send stop to replica %v packet %v", partition.PartitionID, replicaAddr, packet)
return true
}
return false
}
func (partition *DataPartition) findReplica(replicaAddr string) int {
partition.Lock()
defer partition.Unlock()
var (
replica *DataReplica
index = -1
)
for i := 0; i < len(partition.Replicas); i++ {
replica = partition.Replicas[i]
if replica.Addr == replicaAddr {
index = i
break
}
}
return index
}
func (partition *DataPartition) createTaskToStopDataPartitionRepair(addr string, stop bool) (task *proto.AdminTask) {
task = proto.NewAdminTask(proto.OpStopDataPartitionRepair, addr, newStopDataPartitionRepairRequest(partition.PartitionID, stop))
partition.resetTaskID(task)
return
}
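// TryAcquireDecommissionToken tries to take a decommission token from the nodeset that
// will host the new replica. When no destination was specified it first picks a target
// host, preferring the source nodeset, then the source zone, then another zone. Retried
// partitions wait MaxRetryDecommissionWait rounds before trying again.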
func (partition *DataPartition) TryAcquireDecommissionToken(c *Cluster) bool {
var (
zone *Zone
ns *nodeSet
err error
targetHosts []string
excludeNodeSets []uint64
zones []string
)
const MaxRetryDecommissionWait = 60
defer c.syncUpdateDataPartition(partition)
if partition.DecommissionRetry > 0 {
partition.DecommissionWaitTimes++
if partition.DecommissionWaitTimes < MaxRetryDecommissionWait {
// log.LogDebugf("action[TryAcquireDecommissionToken] dp %v wait %v", partition.PartitionID, partition.DecommissionWaitTimes)
return false
} else {
partition.DecommissionWaitTimes = 0
}
}
// first attempt, and no destination address was specified
if !partition.DecommissionDstAddrSpecify && partition.DecommissionDstAddr == "" {
// try to find available data node in src nodeset
ns, zone, err = getTargetNodeset(partition.DecommissionSrcAddr, c)
if err != nil {
log.LogWarnf("action[TryAcquireDecommissionToken] dp %v find src nodeset failed:%v",
partition.PartitionID, err.Error())
goto errHandler
}
targetHosts, _, err = ns.getAvailDataNodeHosts(partition.Hosts, 1)
if err != nil {
log.LogWarnf("action[TryAcquireDecommissionToken] dp %v choose from src nodeset failed:%v",
partition.PartitionID, err.Error())
if _, ok := c.vols[partition.VolName]; !ok {
log.LogWarnf("action[TryAcquireDecommissionToken] dp %v cannot find vol:%v",
partition.PartitionID, err.Error())
goto errHandler
}
if c.isFaultDomain(c.vols[partition.VolName]) {
log.LogWarnf("action[TryAcquireDecommissionToken] dp %v is fault domain",
partition.PartitionID)
goto errHandler
}
excludeNodeSets = append(excludeNodeSets, ns.ID)
if targetHosts, _, err = zone.getAvailNodeHosts(TypeDataPartition, excludeNodeSets, partition.Hosts, 1); err != nil {
// select data nodes from another zone
zones = partition.getLiveZones(partition.DecommissionSrcAddr)
var excludeZone []string
if len(zones) == 0 {
excludeZone = append(excludeZone, zone.name)
} else {
excludeZone = append(excludeZone, zones[0])
}
if targetHosts, _, err = c.getHostFromNormalZone(TypeDataPartition, excludeZone, excludeNodeSets, partition.Hosts, 1, 1, ""); err != nil {
log.LogWarnf("action[TryAcquireDecommissionToken] dp %v getHostFromNormalZone failed:%v",
partition.PartitionID, err.Error())
goto errHandler
}
}
// get nodeset for target host
newAddr := targetHosts[0]
ns, zone, err = getTargetNodeset(newAddr, c)
if err != nil {
log.LogWarnf("action[TryAcquireDecommissionToken] dp %v find new nodeset failed:%v",
partition.PartitionID, err.Error())
goto errHandler
}
}
// only persist DecommissionDstAddr once the token has been acquired
if ns.AcquireDecommissionToken(partition.PartitionID) {
partition.DecommissionDstAddr = targetHosts[0]
log.LogDebugf("action[TryAcquireDecommissionToken] dp %v get token from %v nodeset %v success",
partition.PartitionID, partition.DecommissionDstAddr, ns.ID)
return true
} else {
log.LogDebugf("action[TryAcquireDecommissionToken] dp %v: nodeset %v token is empty",
partition.PartitionID, ns.ID)
return false
}
} else {
ns, zone, err = getTargetNodeset(partition.DecommissionDstAddr, c)
if err != nil {
log.LogWarnf("action[TryAcquireDecommissionToken]dp %v find src nodeset failed:%v",
partition.PartitionID, err.Error())
goto errHandler
}
if ns.AcquireDecommissionToken(partition.PartitionID) {
log.LogDebugf("action[TryAcquireDecommissionToken]dp %v get token from %v nodeset %v success",
partition.PartitionID, partition.DecommissionDstAddr, ns.ID)
return true
} else {
return false
}
}
errHandler:
partition.DecommissionRetry++
if partition.DecommissionRetry >= defaultDecommissionRetryLimit {
partition.SetDecommissionStatus(DecommissionFail)
} else {
partition.DecommissionWaitTimes = 0
}
log.LogWarnf("action[TryAcquireDecommissionToken] clusterID[%v] vol[%v] partitionID[%v]"+
" retry [%v] status [%v] DecommissionDstAddrSpecify [%v] DecommissionDstAddr [%v] failed",
c.Name, partition.VolName, partition.PartitionID, partition.DecommissionRetry, partition.GetDecommissionStatus(),
partition.DecommissionDstAddrSpecify, partition.DecommissionDstAddr)
return false
}
func (partition *DataPartition) ReleaseDecommissionToken(c *Cluster) {
if partition.DecommissionDstAddr == "" {
return
}
if ns, _, err := getTargetNodeset(partition.DecommissionDstAddr, c); err != nil {
log.LogWarnf("action[ReleaseDecommissionToken]should never happen dp %v:%v", partition.PartitionID, err.Error())
return
} else {
ns.ReleaseDecommissionToken(partition.PartitionID)
}
}
//func (partition *DataPartition) ShouldReleaseDecommissionTokenByStop(c *Cluster) {
// if partition.DecommissionDstAddr == "" && !partition.DecommissionDstAddrSpecify {
// return
// }
// index := partition.findReplica(partition.DecommissionDstAddr)
// if index == -1 {
// log.LogWarnf("action[ShouldReleaseDecommissionTokenByStop]dp[%v] has not added replica %v",
// partition.PartitionID, partition.DecommissionDstAddr)
// }
// partition.ReleaseDecommissionToken(c)
//}
func (partition *DataPartition) restoreReplicaMeta(c *Cluster) (err error) {
//dst has
//dstDataNode, err := c.dataNode(partition.DecommissionDstAddr)
//if err != nil {
// log.LogWarnf("action[restoreReplicaMeta]partition %v find dst %v data node failed:%v",
// partition.PartitionID, partition.DecommissionDstAddr, err.Error())
// return
//}
//removePeer := proto.Peer{ID: dstDataNode.ID, Addr: partition.DecommissionDstAddr}
//if err = c.removeHostMember(partition, removePeer); err != nil {
// log.LogWarnf("action[restoreReplicaMeta]partition %v metadata removeReplica %v failed:%v",
// partition.PartitionID, partition.DecommissionDstAddr, err.Error())
// return
//}
srcDataNode, err := c.dataNode(partition.DecommissionSrcAddr)
if err != nil {
log.LogWarnf("action[restoreReplicaMeta]partition %v find src %v data node failed:%v",
partition.PartitionID, partition.DecommissionSrcAddr, err.Error())
return
}
addPeer := proto.Peer{ID: srcDataNode.ID, Addr: partition.DecommissionSrcAddr}
if err = c.addDataPartitionRaftMember(partition, addPeer); err != nil {
log.LogWarnf("action[restoreReplicaMeta]partition %v metadata addReplica %v failed:%v",
partition.PartitionID, partition.DecommissionSrcAddr, err.Error())
return
}
log.LogDebugf("action[restoreReplicaMeta]partition %v meta data has restored:hosts [%v] peers[%v]",
partition.PartitionID, partition.Hosts, partition.Peers)
return
}
func getTargetNodeset(addr string, c *Cluster) (ns *nodeSet, zone *Zone, err error) {
var dataNode *DataNode
dataNode, err = c.dataNode(addr)
if err != nil {
log.LogWarnf("action[getTargetNodeset] find src %v data node failed:%v", addr, err.Error())
return nil, nil, err
}
zone, err = c.t.getZone(dataNode.ZoneName)
if err != nil {
log.LogWarnf("action[getTargetNodeset] find src %v zone failed:%v", addr, err.Error())
return nil, nil, err
}
ns, err = zone.getNodeSet(dataNode.NodeSetID)
if err != nil {
log.LogWarnf("action[getTargetNodeset] find src %v nodeset failed:%v", addr, err.Error())
return nil, nil, err
}
return ns, zone, nil
}
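// needRollback reports whether the failed attempt should be rolled back. Partitions with
// an explicitly specified destination are never rolled back, and after
// defaultDecommissionRollbackLimit rollbacks the source replica is restored instead.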
func (partition *DataPartition) needRollback(c *Cluster) bool {
log.LogDebugf("action[needRollback]dp[%v]DecommissionNeedRollbackTimes[%v]", partition.PartitionID, partition.DecommissionNeedRollbackTimes)
// the rollback flag is only set for failures while adding a replica, creating the dp, or repairing the dp; other errors do not need rollback
if !partition.DecommissionNeedRollback {
return false
}
// a partition with an explicitly specified destination address does not need rollback
if partition.DecommissionDstAddrSpecify {
log.LogWarnf("action[needRollback]dp[%v] do not rollback for DecommissionDstAddrSpecify", partition.PartitionID)
return false
}
if partition.DecommissionNeedRollbackTimes >= defaultDecommissionRollbackLimit {
log.LogDebugf("action[needRollback]try add restore replica, dp[%v]DecommissionNeedRollbackTimes[%v]",
partition.PartitionID, partition.DecommissionNeedRollbackTimes)
partition.DecommissionNeedRollback = false
err := c.addDataReplica(partition, partition.DecommissionSrcAddr)
if err != nil {
log.LogWarnf("action[needRollback]dp[%v] recover decommission src replica %v failed: %v",
partition.PartitionID, partition.DecommissionSrcAddr, err)
}
err = c.removeDataReplica(partition, partition.DecommissionDstAddr, false, false)
if err != nil {
log.LogWarnf("action[needRollback]dp[%v] remove decommission dst replica %v failed: %v",
partition.PartitionID, partition.DecommissionDstAddr, err)
}
return false
}
return true
}
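// restoreReplica rolls a failed decommission back by removing the destination replica
// and re-adding the decommission source replica.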
func (partition *DataPartition) restoreReplica(c *Cluster) {
var err error
err = c.removeDataReplica(partition, partition.DecommissionDstAddr, false, false)
if err != nil {
log.LogWarnf("action[restoreReplica]dp[%v] rollback to del replica[%v] failed:%v",
partition.PartitionID, partition.DecommissionDstAddr, err.Error())
} else {
log.LogDebugf("action[restoreReplica]dp[%v] rollback to del replica[%v] success",
partition.PartitionID, partition.DecommissionDstAddr)
}
err = c.addDataReplica(partition, partition.DecommissionSrcAddr)
if err != nil {
log.LogWarnf("action[restoreReplica]dp[%v] recover decommission src replica failed", partition.PartitionID)
} else {
log.LogDebugf("action[restoreReplica]dp[%v] rollback to add replica[%v] success",
partition.PartitionID, partition.DecommissionSrcAddr)
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"math"
"strconv"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
func (partition *DataPartition) checkStatus(clusterName string, needLog bool, dpTimeOutSec int64, c *Cluster,
shouldDpInhibitWriteByVolFull bool, forbiddenVol bool) {
partition.Lock()
defer partition.Unlock()
var liveReplicas []*DataReplica
if proto.IsNormalDp(partition.PartitionType) {
liveReplicas = partition.getLiveReplicasFromHosts(dpTimeOutSec)
if len(partition.Replicas) > len(partition.Hosts) {
partition.Status = proto.ReadOnly
msg := fmt.Sprintf("action[extractStatus],partitionID:%v has exceed repica, replicaNum:%v liveReplicas:%v Status:%v RocksDBHost:%v ",
partition.PartitionID, partition.ReplicaNum, len(liveReplicas), partition.Status, partition.Hosts)
Warn(clusterName, msg)
return
}
} else {
liveReplicas = partition.getLiveReplicas(dpTimeOutSec)
}
switch len(liveReplicas) {
case (int)(partition.ReplicaNum):
partition.Status = proto.ReadOnly
if partition.checkReplicaEqualStatus(liveReplicas, proto.ReadWrite) &&
partition.hasEnoughAvailableSpace() &&
!shouldDpInhibitWriteByVolFull {
writable := false
if proto.IsNormalDp(partition.PartitionType) {
if partition.getLeaderAddr() != "" {
writable = true
}
} else {
// cold volume has no leader
writable = true
}
// set status to ReadWrite only if the volume is not forbidden
if writable && !forbiddenVol {
partition.Status = proto.ReadWrite
}
}
default:
partition.Status = proto.ReadOnly
}
// keep the partition read-only while the special-replica decommission is still in progress
if partition.isSpecialReplicaCnt() && partition.GetSpecialReplicaDecommissionStep() > 0 {
log.LogInfof("action[checkStatus] partition %v with Special replica cnt %v on decommison status %v, live replicacnt %v",
partition.PartitionID, partition.ReplicaNum, partition.Status, len(liveReplicas))
partition.Status = proto.ReadOnly
}
if partition.checkReplicaEqualStatus(liveReplicas, proto.Unavailable) {
log.LogWarnf("action[checkStatus] partition %v bet set Unavailable", partition.PartitionID)
partition.Status = proto.Unavailable
}
if needLog && len(liveReplicas) != int(partition.ReplicaNum) {
msg := fmt.Sprintf("action[extractStatus],partitionID:%v replicaNum:%v liveReplicas:%v Status:%v RocksDBHost:%v ",
partition.PartitionID, partition.ReplicaNum, len(liveReplicas), partition.Status, partition.Hosts)
log.LogInfo(msg)
if time.Now().Unix()-partition.lastWarnTime > intervalToWarnDataPartition {
Warn(clusterName, msg)
partition.lastWarnTime = time.Now().Unix()
}
}
}
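// hasEnoughAvailableSpace reports whether the partition still has more than 10 GB of free space.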
func (partition *DataPartition) hasEnoughAvailableSpace() bool {
avail := partition.total - partition.used
return int64(avail) > 10*util.GB
}
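// checkReplicaNotHaveStatus returns true only when none of the live replicas is in the given status.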
func (partition *DataPartition) checkReplicaNotHaveStatus(liveReplicas []*DataReplica, status int8) (equal bool) {
for _, replica := range liveReplicas {
if replica.Status == status {
log.LogInfof("action[checkReplicaNotHaveStatus] partition %v replica %v status %v dst status %v",
partition.PartitionID, replica.Addr, replica.Status, status)
return
}
}
return true
}
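// checkReplicaEqualStatus returns true only when every live replica is in the given status.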
func (partition *DataPartition) checkReplicaEqualStatus(liveReplicas []*DataReplica, status int8) (equal bool) {
for _, replica := range liveReplicas {
if replica.Status != status {
log.LogDebugf("action[checkReplicaEqualStatus] partition %v replica %v status %v dst status %v",
partition.PartitionID, replica.Addr, replica.Status, status)
return
}
}
return true
}
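// checkReplicaStatus downgrades stale replicas, and replicas on read-only nodes or
// read-only partitions, from ReadWrite to ReadOnly.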
func (partition *DataPartition) checkReplicaStatus(timeOutSec int64) {
partition.Lock()
defer partition.Unlock()
for _, replica := range partition.Replicas {
if !replica.isLive(timeOutSec) {
log.LogInfof("action[checkReplicaStatus] partition %v replica %v be set status ReadOnly", partition.PartitionID, replica.Addr)
if replica.Status == proto.ReadWrite {
replica.Status = proto.ReadOnly
}
if partition.isSpecialReplicaCnt() {
return
}
continue
}
if (replica.dataNode.RdOnly || partition.RdOnly) && replica.Status == proto.ReadWrite {
replica.Status = proto.ReadOnly
}
}
}
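// checkLeader clears the leader flag of dead replicas and, for normal partitions,
// reports to WarnMetrics whether the partition currently has no leader.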
func (partition *DataPartition) checkLeader(clusterID string, timeOut int64) {
partition.Lock()
defer partition.Unlock()
for _, dr := range partition.Replicas {
if !dr.isLive(timeOut) {
dr.IsLeader = false
}
}
if !proto.IsNormalDp(partition.PartitionType) {
return
}
var report bool
if partition.getLeaderAddr() == "" {
report = true
}
if WarnMetrics != nil {
WarnMetrics.WarnDpNoLeader(clusterID, partition.PartitionID, report)
}
return
}
// Check if there is any missing replica for a data partition.
func (partition *DataPartition) checkMissingReplicas(clusterID, leaderAddr string, dataPartitionMissSec, dataPartitionWarnInterval int64) {
partition.Lock()
defer partition.Unlock()
id := strconv.FormatUint(partition.PartitionID, 10)
_, ok := WarnMetrics.dpMissingReplicaInfo[id]
oldMissingReplicaNum := 0
if ok {
oldMissingReplicaNum = len(WarnMetrics.dpMissingReplicaInfo[id].addrs)
}
for _, replica := range partition.Replicas {
if partition.hasHost(replica.Addr) && replica.isMissing(dataPartitionMissSec) && !partition.IsDiscard {
if partition.needToAlarmMissingDataPartition(replica.Addr, dataPartitionWarnInterval) {
dataNode := replica.getReplicaNode()
var lastReportTime time.Time
isActive := true
if dataNode != nil {
lastReportTime = dataNode.ReportTime
isActive = dataNode.isActive
}
msg := fmt.Sprintf("action[checkMissErr],clusterID[%v] paritionID:%v on node:%v "+
"miss time > %v lastRepostTime:%v dnodeLastReportTime:%v nodeisActive:%v So Migrate by manual",
clusterID, partition.PartitionID, replica.Addr, dataPartitionMissSec, replica.ReportTime, lastReportTime, isActive)
// msg = msg + fmt.Sprintf(" decommissionDataPartitionURL is http://%v/dataPartition/decommission?id=%v&addr=%v", leaderAddr, partition.PartitionID, replica.Addr)
Warn(clusterID, msg)
if WarnMetrics != nil {
WarnMetrics.WarnMissingDp(clusterID, replica.Addr, partition.PartitionID, true)
}
}
} else {
if WarnMetrics != nil {
WarnMetrics.WarnMissingDp(clusterID, replica.Addr, partition.PartitionID, false)
}
}
}
if WarnMetrics != nil {
WarnMetrics.CleanObsoleteDpMissing(clusterID, partition)
}
WarnMetrics.dpMissingReplicaMutex.Lock()
replicaInfo, ok := WarnMetrics.dpMissingReplicaInfo[id]
if ok {
MissingReplicaNum := len(replicaInfo.addrs)
oldDpReplicaAliveNum := ""
if MissingReplicaNum != oldMissingReplicaNum && oldMissingReplicaNum != 0 {
oldDpReplicaAliveNum = WarnMetrics.dpMissingReplicaInfo[id].replicaAlive
}
dpReplicaMissingNum := uint8(len(WarnMetrics.dpMissingReplicaInfo[id].addrs))
dpReplicaAliveNum := partition.ReplicaNum - dpReplicaMissingNum
replicaInfo.replicaNum = strconv.FormatUint(uint64(partition.ReplicaNum), 10)
replicaInfo.replicaAlive = strconv.FormatUint(uint64(dpReplicaAliveNum), 10)
WarnMetrics.dpMissingReplicaInfo[id] = replicaInfo
for missingReplicaAddr := range WarnMetrics.dpMissingReplicaInfo[id].addrs {
if oldDpReplicaAliveNum != "" {
WarnMetrics.missingDp.DeleteLabelValues(clusterID, id, missingReplicaAddr, oldDpReplicaAliveNum, replicaInfo.replicaNum)
}
WarnMetrics.missingDp.SetWithLabelValues(1, clusterID, id, missingReplicaAddr, replicaInfo.replicaAlive, replicaInfo.replicaNum)
}
}
WarnMetrics.dpMissingReplicaMutex.Unlock()
if !proto.IsNormalDp(partition.PartitionType) {
return
}
for _, addr := range partition.Hosts {
if partition.hasMissingDataPartition(addr) && partition.needToAlarmMissingDataPartition(addr, dataPartitionWarnInterval) {
msg := fmt.Sprintf("action[checkMissErr],clusterID[%v] partitionID:%v on node:%v "+
"miss time > :%v but server not exsit So Migrate", clusterID, partition.PartitionID, addr, dataPartitionMissSec)
msg = msg + fmt.Sprintf(" decommissionDataPartitionURL is http://%v/dataPartition/decommission?id=%v&addr=%v", leaderAddr, partition.PartitionID, addr)
Warn(clusterID, msg)
}
}
}
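// needToAlarmMissingDataPartition reports whether the missing-replica alarm for addr should
// fire: either it has never fired before, or the warn interval has elapsed since the last alarm.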
func (partition *DataPartition) needToAlarmMissingDataPartition(addr string, interval int64) (shouldAlarm bool) {
t, ok := partition.MissingNodes[addr]
if !ok {
partition.MissingNodes[addr] = time.Now().Unix()
shouldAlarm = true
} else {
if time.Now().Unix()-t > interval {
shouldAlarm = true
partition.MissingNodes[addr] = time.Now().Unix()
}
}
return
}
func (partition *DataPartition) hasMissingDataPartition(addr string) (isMissing bool) {
_, ok := partition.hasReplica(addr)
if !ok {
isMissing = true
}
return
}
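// checkDiskError marks the partition read-only when some (but not all) of its replicas are
// Unavailable, and warns with a disk-decommission URL for each affected disk.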
func (partition *DataPartition) checkDiskError(clusterID, leaderAddr string) {
diskErrorAddrs := make(map[string]string)
partition.Lock()
defer partition.Unlock()
for _, addr := range partition.Hosts {
replica, ok := partition.hasReplica(addr)
if !ok {
continue
}
if replica.Status == proto.Unavailable {
if partition.isSpecialReplicaCnt() && len(partition.Hosts) > 1 {
log.LogWarnf("action[%v],clusterID[%v],partitionID:%v On :%v status Unavailable",
checkDataPartitionDiskErr, clusterID, partition.PartitionID, addr)
continue
}
diskErrorAddrs[replica.Addr] = replica.DiskPath
}
}
if len(diskErrorAddrs) != (int)(partition.ReplicaNum) && len(diskErrorAddrs) > 0 {
partition.Status = proto.ReadOnly
}
for addr, diskPath := range diskErrorAddrs {
msg := fmt.Sprintf("action[%v],clusterID[%v],partitionID:%v On :%v Disk Error,So Remove it From RocksDBHost, decommissionDiskURL is http://%v/disk/decommission?addr=%v&disk=%v",
checkDataPartitionDiskErr, clusterID, partition.PartitionID, addr, leaderAddr, addr, diskPath)
Warn(clusterID, msg)
}
return
}
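// checkReplicationTask warns about and removes replicas that are no longer in the host list,
// and warns when a non-writable partition has a host without a replica.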
func (partition *DataPartition) checkReplicationTask(clusterID string, dataPartitionSize uint64) {
var msg string
if excessAddr, excessErr := partition.deleteIllegalReplica(); excessErr != nil {
msg = fmt.Sprintf("action[%v], partitionID:%v Excess Replication On :%v Err:%v rocksDBRecords:%v",
deleteIllegalReplicaErr, partition.PartitionID, excessAddr, excessErr.Error(), partition.Hosts)
Warn(clusterID, msg)
partition.Lock()
partition.removeReplicaByAddr(excessAddr)
partition.Unlock()
}
if partition.Status == proto.ReadWrite {
return
}
if lackAddr, lackErr := partition.missingReplicaAddress(dataPartitionSize); lackErr != nil {
msg = fmt.Sprintf("action[%v], partitionID:%v Lack Replication On :%v Err:%v Hosts:%v new task to create DataReplica",
addMissingReplicaErr, partition.PartitionID, lackAddr, lackErr.Error(), partition.Hosts)
Warn(clusterID, msg)
}
return
}
func (partition *DataPartition) deleteIllegalReplica() (excessAddr string, err error) {
partition.Lock()
defer partition.Unlock()
for i := 0; i < len(partition.Replicas); i++ {
replica := partition.Replicas[i]
if ok := partition.hasHost(replica.Addr); !ok {
excessAddr = replica.Addr
err = proto.ErrIllegalDataReplica
break
}
}
return
}
func (partition *DataPartition) missingReplicaAddress(dataPartitionSize uint64) (addr string, err error) {
partition.Lock()
defer partition.Unlock()
if time.Now().Unix()-partition.createTime < 120 {
return
}
// go through all the hosts to find the missing replica
for _, host := range partition.Hosts {
if _, ok := partition.hasReplica(host); !ok {
log.LogError(fmt.Sprintf("action[missingReplicaAddress],partitionID:%v lack replication:%v",
partition.PartitionID, host))
err = proto.ErrMissingReplica
addr = host
break
}
}
return
}
func (partition *DataPartition) checkReplicaSize(clusterID string, diffSpaceUsage uint64) {
partition.RLock()
defer partition.RUnlock()
if len(partition.Replicas) == 0 {
return
}
diff := 0.0
sentry := float64(partition.Replicas[0].Used)
for _, dr := range partition.Replicas {
temp := math.Abs(float64(dr.Used) - sentry)
if temp > diff {
diff = temp
}
}
if diff > float64(diffSpaceUsage) {
msg := fmt.Sprintf("action[checkReplicaSize] vol[%v],partition[%v] difference space usage [%v] larger than %v, ",
partition.VolName, partition.PartitionID, diff, diffSpaceUsage)
for _, dr := range partition.Replicas {
msg = msg + fmt.Sprintf("replica[%v],used[%v];", dr.Addr, dr.Used)
}
Warn(clusterID, msg)
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"encoding/json"
"fmt"
"runtime"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/compressor"
"github.com/cubefs/cubefs/util/log"
)
// DataPartitionMap stores all the data partitions of a volume
type DataPartitionMap struct {
sync.RWMutex
partitionMap map[uint64]*DataPartition
readableAndWritableCnt int // number of readable and writable partitionMap
lastLoadedIndex uint64 // last loaded partition index
lastReleasedIndex uint64 // last released partition index
partitions []*DataPartition
responseCache []byte
responseCompressCache []byte
lastAutoCreateTime time.Time
volName string
readMutex sync.RWMutex
}
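// newDataPartitionMap creates an empty DataPartitionMap for the given volume.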
func newDataPartitionMap(volName string) (dpMap *DataPartitionMap) {
dpMap = new(DataPartitionMap)
dpMap.partitionMap = make(map[uint64]*DataPartition, 0)
dpMap.partitions = make([]*DataPartition, 0)
dpMap.responseCache = make([]byte, 0)
dpMap.responseCompressCache = make([]byte, 0)
dpMap.volName = volName
dpMap.lastAutoCreateTime = time.Now()
return
}
// attention: this is not a deep clone; the returned slice shares the same *DataPartition elements
func (dpMap *DataPartitionMap) clonePartitions() []*DataPartition {
dpMap.RLock()
defer dpMap.RUnlock()
partitions := make([]*DataPartition, 0)
for _, dp := range dpMap.partitions {
partitions = append(partitions, dp)
}
return partitions
}
func (dpMap *DataPartitionMap) get(ID uint64) (*DataPartition, error) {
dpMap.RLock()
defer dpMap.RUnlock()
if v, ok := dpMap.partitionMap[ID]; ok {
return v, nil
}
return nil, proto.ErrDataPartitionNotExists
}
func (dpMap *DataPartitionMap) del(dp *DataPartition) {
dpMap.Lock()
defer dpMap.Unlock()
_, ok := dpMap.partitionMap[dp.PartitionID]
if !ok {
return
}
dataPartitions := make([]*DataPartition, 0)
for index, partition := range dpMap.partitions {
if partition.PartitionID == dp.PartitionID {
dataPartitions = append(dataPartitions, dpMap.partitions[:index]...)
dataPartitions = append(dataPartitions, dpMap.partitions[index+1:]...)
dpMap.partitions = dataPartitions
break
}
}
delete(dpMap.partitionMap, dp.PartitionID)
}
func (dpMap *DataPartitionMap) put(dp *DataPartition) {
dpMap.Lock()
defer dpMap.Unlock()
_, ok := dpMap.partitionMap[dp.PartitionID]
if !ok {
dpMap.partitions = append(dpMap.partitions, dp)
dpMap.partitionMap[dp.PartitionID] = dp
return
}
// replace the old partition with dp in the map and array
dpMap.partitionMap[dp.PartitionID] = dp
dataPartitions := make([]*DataPartition, 0)
for index, partition := range dpMap.partitions {
if partition.PartitionID == dp.PartitionID {
dataPartitions = append(dataPartitions, dpMap.partitions[:index]...)
dataPartitions = append(dataPartitions, dp)
dataPartitions = append(dataPartitions, dpMap.partitions[index+1:]...)
dpMap.partitions = dataPartitions
break
}
}
}
func (dpMap *DataPartitionMap) setReadWriteDataPartitions(readWrites int, clusterName string) {
dpMap.Lock()
defer dpMap.Unlock()
dpMap.readableAndWritableCnt = readWrites
}
func (dpMap *DataPartitionMap) getDataPartitionResponseCache() []byte {
dpMap.RLock()
defer dpMap.RUnlock()
return dpMap.responseCache
}
func (dpMap *DataPartitionMap) getDataPartitionCompressCache() []byte {
dpMap.RLock()
defer dpMap.RUnlock()
return dpMap.responseCompressCache
}
func (dpMap *DataPartitionMap) setDataPartitionResponseCache(responseCache []byte) {
dpMap.Lock()
defer dpMap.Unlock()
if responseCache != nil {
dpMap.responseCache = responseCache
}
}
func (dpMap *DataPartitionMap) setDataPartitionCompressCache(responseCompress []byte) {
dpMap.Lock()
defer dpMap.Unlock()
if responseCompress != nil {
dpMap.responseCompressCache = responseCompress
}
}
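// updateResponseCache returns the cached JSON reply for the data partition view,
// rebuilding it under readMutex when the cache is empty or needsUpdate is set.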
func (dpMap *DataPartitionMap) updateResponseCache(needsUpdate bool, minPartitionID uint64, volType int) (body []byte, err error) {
responseCache := dpMap.getDataPartitionResponseCache()
if responseCache == nil || needsUpdate || len(responseCache) == 0 {
dpMap.readMutex.Lock()
defer dpMap.readMutex.Unlock()
responseCache = dpMap.getDataPartitionResponseCache()
if !(responseCache == nil || needsUpdate || len(responseCache) == 0) {
body = responseCache
return
}
dpResps := dpMap.getDataPartitionsView(minPartitionID)
if len(dpResps) == 0 && proto.IsHot(volType) {
log.LogError(fmt.Sprintf("action[updateDpResponseCache],volName[%v] minPartitionID:%v,err:%v",
dpMap.volName, minPartitionID, proto.ErrNoAvailDataPartition))
return nil, proto.ErrNoAvailDataPartition
}
cv := proto.NewDataPartitionsView()
cv.DataPartitions = dpResps
reply := newSuccessHTTPReply(cv)
if body, err = json.Marshal(reply); err != nil {
log.LogError(fmt.Sprintf("action[updateDpResponseCache],minPartitionID:%v,err:%v",
minPartitionID, err.Error()))
return nil, proto.ErrMarshalData
}
dpMap.setDataPartitionResponseCache(body)
return
}
body = responseCache
return
}
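// updateCompressCache returns the gzip-compressed partition view, building it from the
// response cache when the compressed cache is empty.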
func (dpMap *DataPartitionMap) updateCompressCache(needsUpdate bool, minPartitionID uint64, volType int) (body []byte, err error) {
body = dpMap.getDataPartitionCompressCache()
if len(body) > 0 {
return
}
if body, err = dpMap.updateResponseCache(needsUpdate, minPartitionID, volType); err != nil {
log.LogErrorf("action[updateCompressCache]updateResponseCache failed,err:%+v", err)
return
}
if body, err = compressor.New(compressor.EncodingGzip).Compress(body); err != nil {
log.LogErrorf("action[updateCompressCache]GzipCompressor.Compress failed,err:%+v", err)
err = proto.ErrCompressFailed
return
}
dpMap.setDataPartitionCompressCache(body)
return
}
func (dpMap *DataPartitionMap) getDataPartitionsView(minPartitionID uint64) (dpResps []*proto.DataPartitionResponse) {
dpResps = make([]*proto.DataPartitionResponse, 0)
log.LogDebugf("volName[%v] DataPartitionMapLen[%v],DataPartitionsLen[%v],minPartitionID[%v]",
dpMap.volName, len(dpMap.partitionMap), len(dpMap.partitions), minPartitionID)
dpMap.RLock()
defer dpMap.RUnlock()
for _, dp := range dpMap.partitionMap {
if len(dp.Hosts) == 0 {
log.LogErrorf("getDataPartitionsView. dp %v host nil", dp.PartitionID)
continue
}
if dp.PartitionID <= minPartitionID {
continue
}
dpResp := dp.convertToDataPartitionResponse()
dpResps = append(dpResps, dpResp)
}
return
}
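// getDataPartitionsToBeReleased walks up to numberOfDataPartitionsToFree partitions,
// starting from lastReleasedIndex, and returns those loaded long enough ago to be released.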
func (dpMap *DataPartitionMap) getDataPartitionsToBeReleased(numberOfDataPartitionsToFree int, secondsToFreeDataPartitionAfterLoad int64) (partitions []*DataPartition, startIndex uint64) {
partitions = make([]*DataPartition, 0)
dpMap.RLock()
defer dpMap.RUnlock()
dpLen := len(dpMap.partitions)
if dpLen == 0 {
return
}
startIndex = dpMap.lastReleasedIndex
count := numberOfDataPartitionsToFree
if dpLen < numberOfDataPartitionsToFree {
count = dpLen
}
for i := 0; i < count; i++ {
if dpMap.lastReleasedIndex >= uint64(dpLen) {
dpMap.lastReleasedIndex = 0
}
dp := dpMap.partitions[dpMap.lastReleasedIndex]
dpMap.lastReleasedIndex++
if time.Now().Unix()-dp.LastLoadedTime >= secondsToFreeDataPartitionAfterLoad {
partitions = append(partitions, dp)
}
}
return
}
func (dpMap *DataPartitionMap) freeMemOccupiedByDataPartitions(partitions []*DataPartition) {
var wg sync.WaitGroup
for _, dp := range partitions {
wg.Add(1)
go func(dp *DataPartition) {
defer func() {
wg.Done()
if err := recover(); err != nil {
const size = runtimeStackBufSize
buf := make([]byte, size)
buf = buf[:runtime.Stack(buf, false)]
log.LogError(fmt.Sprintf("[%v] freeMemOccupiedByDataPartitions panic %v: %s\n", dpMap.volName, err, buf))
}
}()
dp.releaseDataPartition()
}(dp)
}
wg.Wait()
}
func (dpMap *DataPartitionMap) getDataPartitionsToBeChecked(loadFrequencyTime int64) (partitions []*DataPartition, startIndex uint64) {
partitions = make([]*DataPartition, 0)
dpMap.RLock()
defer dpMap.RUnlock()
dpLen := len(dpMap.partitions)
if dpLen == 0 {
return
}
startIndex = dpMap.lastLoadedIndex
// determine the number of data partitions to load
count := dpLen / intervalToLoadDataPartition
if count == 0 {
count = 1
}
for i := 0; i < count; i++ {
if dpMap.lastLoadedIndex >= (uint64)(len(dpMap.partitions)) {
dpMap.lastLoadedIndex = 0
}
dp := dpMap.partitions[dpMap.lastLoadedIndex]
dpMap.lastLoadedIndex++
if time.Now().Unix()-dp.LastLoadedTime >= loadFrequencyTime {
partitions = append(partitions, dp)
}
}
return
}
func (dpMap *DataPartitionMap) totalUsedSpace() (totalUsed uint64) {
dpMap.RLock()
defer dpMap.RUnlock()
for _, dp := range dpMap.partitions {
totalUsed = totalUsed + dp.getMaxUsedSpace()
}
return
}
func (dpMap *DataPartitionMap) setAllDataPartitionsToReadOnly() {
dpMap.Lock()
defer dpMap.Unlock()
changedCnt := 0
for _, dp := range dpMap.partitions {
if proto.ReadWrite == dp.Status {
dp.Status = proto.ReadOnly
changedCnt++
}
}
log.LogDebugf("action[setAllDataPartitionsToReadOnly] ReadWrite->ReadOnly dp cnt: %v", changedCnt)
}
func (dpMap *DataPartitionMap) checkBadDiskDataPartitions(diskPath, nodeAddr string) (partitions []*DataPartition) {
dpMap.RLock()
defer dpMap.RUnlock()
partitions = make([]*DataPartition, 0)
for _, dp := range dpMap.partitionMap {
if dp.containsBadDisk(diskPath, nodeAddr) {
partitions = append(partitions, dp)
}
}
return
}
func (dpMap *DataPartitionMap) getReplicaDiskPaths(nodeAddr string) (diskPaths []string) {
dpMap.RLock()
defer dpMap.RUnlock()
diskPaths = make([]string, 0)
for _, dp := range dpMap.partitionMap {
disk := dp.getReplicaDisk(nodeAddr)
if len(disk) != 0 && !inStingList(disk, diskPaths) {
diskPaths = append(diskPaths, disk)
}
}
return
}
func inStingList(target string, strArray []string) bool {
for _, element := range strArray {
if target == element {
return true
}
}
return false
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
// DataReplica represents the replica of a data partition
type DataReplica struct {
proto.DataReplica
dataNode *DataNode
loc uint8
}
func newDataReplica(dataNode *DataNode) (replica *DataReplica) {
replica = new(DataReplica)
replica.dataNode = dataNode
replica.Addr = dataNode.Addr
replica.ReportTime = time.Now().Unix()
return
}
func (replica *DataReplica) setAlive() {
replica.ReportTime = time.Now().Unix()
}
func (replica *DataReplica) isMissing(interval int64) (isMissing bool) {
if time.Now().Unix()-replica.ReportTime > interval {
isMissing = true
}
return
}
func (replica *DataReplica) isLive(timeOutSec int64) (isAvailable bool) {
log.LogDebugf("action[isLive] replica addr %v, datanode active %v replica status %v and is active %v",
replica.Addr, replica.dataNode.isActive, replica.Status, replica.isActive(timeOutSec))
if replica.dataNode.isActive && replica.Status != proto.Unavailable &&
replica.isActive(timeOutSec) {
isAvailable = true
}
return
}
func (replica *DataReplica) isActive(timeOutSec int64) bool {
return time.Now().Unix()-replica.ReportTime <= timeOutSec
}
func (replica *DataReplica) getReplicaNode() (node *DataNode) {
return replica.dataNode
}
// check if the replica's location is available
func (replica *DataReplica) isLocationAvailable() (isAvailable bool) {
dataNode := replica.getReplicaNode()
dataNode.Lock()
defer dataNode.Unlock()
if dataNode.isActive && replica.isActive(defaultDataPartitionTimeOutSec) {
isAvailable = true
}
return
}
func (replica *DataReplica) isRepairing() bool {
return replica.Status == proto.Recovering
}
func (replica *DataReplica) isUnavailable() bool {
return replica.Status == proto.Unavailable
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/util/log"
)
func (c *Cluster) scheduleToCheckDiskRecoveryProgress() {
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
if c.vols != nil {
c.checkDiskRecoveryProgress()
}
}
time.Sleep(time.Second * defaultIntervalToCheckDataPartition)
}
}()
}
func (c *Cluster) checkDiskRecoveryProgress() {
defer func() {
if r := recover(); r != nil {
log.LogWarnf("checkDiskRecoveryProgress occurred panic,err[%v]", r)
WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
"checkDiskRecoveryProgress occurred panic")
}
}()
c.badPartitionMutex.Lock()
defer c.badPartitionMutex.Unlock()
c.BadDataPartitionIds.Range(func(key, value interface{}) bool {
badDataPartitionIds := value.([]uint64)
newBadDpIds := make([]uint64, 0)
for _, partitionID := range badDataPartitionIds {
partition, err := c.getDataPartitionByID(partitionID)
if err != nil {
Warn(c.Name, fmt.Sprintf("checkDiskRecoveryProgress clusterID[%v],partitionID[%v] is not exist", c.Name, partitionID))
continue
}
// do not update status if paused
if partition.IsDecommissionPaused() {
continue
}
_, err = c.getVol(partition.VolName)
if err != nil {
Warn(c.Name, fmt.Sprintf("checkDiskRecoveryProgress clusterID[%v],partitionID[%v] vol(%s) is not exist",
c.Name, partitionID, partition.VolName))
continue
}
log.LogInfof("action[checkDiskRecoveryProgress] dp %v isSpec %v replicas %v conf replicas num %v",
partition.PartitionID, partition.isSpecialReplicaCnt(), len(partition.Replicas), int(partition.ReplicaNum))
if len(partition.Replicas) == 0 {
partition.SetDecommissionStatus(DecommissionSuccess)
log.LogWarnf("action[checkDiskRecoveryProgress] dp %v maybe deleted", partition.PartitionID)
continue
}
//if len(partition.Replicas) == 0 ||
// (!partition.isSpecialReplicaCnt() && len(partition.Replicas) < int(partition.ReplicaNum)) ||
// (partition.isSpecialReplicaCnt() && len(partition.Replicas) > int(partition.ReplicaNum)) {
// newBadDpIds = append(newBadDpIds, partitionID)
// log.LogInfof("action[checkDiskRecoveryProgress] dp %v newBadDpIds [%v] replics %v conf replics num %v",
// partition.PartitionID, newBadDpIds, len(partition.Replicas), int(partition.ReplicaNum))
// continue
//}
newReplica, _ := partition.getReplica(partition.DecommissionDstAddr)
if newReplica == nil {
log.LogWarnf("action[checkDiskRecoveryProgress] dp %v cannot find replica %v", partition.PartitionID,
partition.DecommissionDstAddr)
partition.DecommissionNeedRollback = true
partition.SetDecommissionStatus(DecommissionFail)
continue
}
if newReplica.isRepairing() {
if !partition.isSpecialReplicaCnt() &&
time.Since(partition.RecoverStartTime) > c.GetDecommissionDataPartitionRecoverTimeOut() {
partition.DecommissionNeedRollback = true
partition.SetDecommissionStatus(DecommissionFail)
Warn(c.Name, fmt.Sprintf("action[checkDiskRecoveryProgress]clusterID[%v],partitionID[%v] recovered timeout %s",
c.Name, partitionID, time.Since(partition.RecoverStartTime).String()))
} else {
newBadDpIds = append(newBadDpIds, partitionID)
}
} else {
if partition.isSpecialReplicaCnt() {
continue // change dp decommission status in decommission function
}
// do not add to BadDataPartitionIds
if newReplica.isUnavailable() {
partition.DecommissionNeedRollback = true
partition.SetDecommissionStatus(DecommissionFail)
Warn(c.Name, fmt.Sprintf("action[checkDiskRecoveryProgress]clusterID[%v],partitionID[%v] has recovered failed", c.Name, partitionID))
} else {
partition.SetDecommissionStatus(DecommissionSuccess) // can be readonly or readwrite
Warn(c.Name, fmt.Sprintf("action[checkDiskRecoveryProgress]clusterID[%v],partitionID[%v] has recovered success", c.Name, partitionID))
}
partition.RLock()
c.syncUpdateDataPartition(partition)
partition.RUnlock()
}
}
if len(newBadDpIds) == 0 {
Warn(c.Name, fmt.Sprintf("action[checkDiskRecoveryProgress]clusterID[%v],node:disk[%v] has recovered success", c.Name, key))
c.BadDataPartitionIds.Delete(key)
} else {
c.BadDataPartitionIds.Store(key, newBadDpIds)
log.LogInfof("action[checkDiskRecoveryProgress]BadDataPartitionIds key(%s) still have (%d) dp in recover", key, len(newBadDpIds))
}
return true
})
}
func (c *Cluster) addAndSyncDecommissionedDisk(dataNode *DataNode, diskPath string) (err error) {
if exist := dataNode.addDecommissionedDisk(diskPath); exist {
return
}
if err = c.syncUpdateDataNode(dataNode); err != nil {
dataNode.deleteDecommissionedDisk(diskPath)
return
}
log.LogInfof("action[addAndSyncDecommissionedDisk] finish, remaining decommissioned disks[%v], dataNode[%v]", dataNode.getDecommissionedDisks(), dataNode.Addr)
return
}
func (c *Cluster) deleteAndSyncDecommissionedDisk(dataNode *DataNode, diskPath string) (err error) {
if exist := dataNode.deleteDecommissionedDisk(diskPath); !exist {
return
}
if err = c.syncUpdateDataNode(dataNode); err != nil {
dataNode.addDecommissionedDisk(diskPath)
return
}
log.LogInfof("action[deleteAndSyncDecommissionedDisk] finish, remaining decommissioned disks[%v], dataNode[%v]", dataNode.getDecommissionedDisks(), dataNode.Addr)
return
}
func (c *Cluster) decommissionDisk(dataNode *DataNode, raftForce bool, badDiskPath string,
badPartitions []*DataPartition, diskDisable bool) (err error) {
msg := fmt.Sprintf("action[decommissionDisk], Node[%v] OffLine,disk[%v]", dataNode.Addr, badDiskPath)
log.LogWarn(msg)
for _, dp := range badPartitions {
go func(dp *DataPartition) {
if err = c.decommissionDataPartition(dataNode.Addr, dp, raftForce, diskOfflineErr); err != nil {
return
}
}(dp)
}
msg = fmt.Sprintf("action[decommissionDisk],clusterID[%v] node[%v] OffLine success",
c.Name, dataNode.Addr)
Warn(c.Name, msg)
return
}
const (
ManualDecommission uint32 = iota
AutoDecommission
)
type DecommissionDisk struct {
SrcAddr string
DstAddr string
DiskPath string
DecommissionStatus uint32
DecommissionRaftForce bool
DecommissionRetry uint8
DecommissionDpTotal int
DecommissionTerm uint64
DecommissionDpCount int
DiskDisable bool
Type uint32
DecommissionCompleteTime int64
}
func (dd *DecommissionDisk) GenerateKey() string {
return fmt.Sprintf("%s_%s", dd.SrcAddr, dd.DiskPath)
}
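// updateDecommissionStatus recomputes the disk's decommission status and progress from the
// latest decommission results of its data partitions.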
func (dd *DecommissionDisk) updateDecommissionStatus(c *Cluster, debug bool) (uint32, float64) {
var (
progress float64
totalNum = dd.DecommissionDpTotal
partitionIds []uint64
failedPartitionIds []uint64
runningPartitionIds []uint64
preparePartitionIds []uint64
stopPartitionIds []uint64
)
if dd.GetDecommissionStatus() == DecommissionInitial {
return DecommissionInitial, float64(0)
}
if dd.GetDecommissionStatus() == markDecommission {
return markDecommission, float64(0)
}
if totalNum == InvalidDecommissionDpCnt && dd.GetDecommissionStatus() == DecommissionFail {
return DecommissionFail, float64(0)
}
if dd.GetDecommissionStatus() == DecommissionSuccess {
return DecommissionSuccess, float64(1)
}
if dd.GetDecommissionStatus() == DecommissionPause {
return DecommissionPause, float64(0)
}
defer func() {
c.syncUpdateDecommissionDisk(dd)
}()
if dd.DecommissionRetry >= defaultDecommissionRetryLimit {
dd.markDecommissionFailed()
return DecommissionFail, float64(0)
}
// Get all dp on this disk
failedNum := 0
runningNum := 0
prepareNum := 0
stopNum := 0
// get the latest decommission result
partitions := c.getAllDecommissionDataPartitionByDiskAndTerm(dd.SrcAddr, dd.DiskPath, dd.DecommissionTerm)
if len(partitions) == 0 {
log.LogDebugf("action[updateDecommissionDiskStatus]no partitions left:%v", dd.GenerateKey())
dd.markDecommissionSuccess()
return DecommissionSuccess, float64(1)
}
for _, dp := range partitions {
if dp.IsDecommissionFailed() && !dp.needRollback(c) {
failedNum++
failedPartitionIds = append(failedPartitionIds, dp.PartitionID)
}
if dp.GetDecommissionStatus() == DecommissionRunning {
runningNum++
runningPartitionIds = append(runningPartitionIds, dp.PartitionID)
}
if dp.GetDecommissionStatus() == DecommissionPrepare {
prepareNum++
preparePartitionIds = append(preparePartitionIds, dp.PartitionID)
}
// the disk decommission may have been paused before, so paused dps are still counted among the partitions
if dp.GetDecommissionStatus() == DecommissionPause {
stopNum++
stopPartitionIds = append(stopPartitionIds, dp.PartitionID)
}
partitionIds = append(partitionIds, dp.PartitionID)
}
progress = float64(totalNum-len(partitions)) / float64(totalNum)
if debug {
log.LogInfof("action[updateDecommissionDiskStatus] disk[%v] progress[%v] totalNum[%v] "+
"partitionIds %v FailedNum[%v] failedPartitionIds %v, runningNum[%v] runningDp %v, prepareNum[%v] prepareDp %v "+
"stopNum[%v] stopPartitionIds %v ",
dd.GenerateKey(), progress, totalNum, partitionIds, failedNum, failedPartitionIds, runningNum, runningPartitionIds,
prepareNum, preparePartitionIds, stopNum, stopPartitionIds)
}
if failedNum >= (len(partitions)-stopNum) && failedNum != 0 {
dd.markDecommissionFailed()
return DecommissionFail, progress
}
dd.SetDecommissionStatus(DecommissionRunning)
return DecommissionRunning, progress
}
func (dd *DecommissionDisk) GetDecommissionStatus() uint32 {
return atomic.LoadUint32(&dd.DecommissionStatus)
}
func (dd *DecommissionDisk) SetDecommissionStatus(status uint32) {
atomic.StoreUint32(&dd.DecommissionStatus, status)
}
func (dd *DecommissionDisk) markDecommissionSuccess() {
dd.SetDecommissionStatus(DecommissionSuccess)
dd.DecommissionCompleteTime = time.Now().Unix()
}
func (dd *DecommissionDisk) markDecommissionFailed() {
dd.SetDecommissionStatus(DecommissionFail)
dd.DecommissionCompleteTime = time.Now().Unix()
}
func (dd *DecommissionDisk) GetLatestDecommissionDP(c *Cluster) (partitions []*DataPartition) {
partitions = c.getAllDecommissionDataPartitionByDiskAndTerm(dd.SrcAddr, dd.DiskPath, dd.DecommissionTerm)
return
}
func (dd *DecommissionDisk) GetDecommissionFailedDP(c *Cluster) (error, []uint64) {
var (
failedDps []uint64
err error
badPartitions []*DataPartition
)
if dd.GetDecommissionStatus() != DecommissionFail {
err = fmt.Errorf("action[GetDecommissionDiskFailedDP]dataNode[%s] disk[%s] status must be failed,but[%d]",
dd.SrcAddr, dd.DiskPath, dd.GetDecommissionStatus())
return err, failedDps
}
badPartitions = c.getAllDecommissionDataPartitionByDisk(dd.SrcAddr, dd.DiskPath)
for _, dp := range badPartitions {
if dp.IsDecommissionFailed() {
failedDps = append(failedDps, dp.PartitionID)
}
}
log.LogWarnf("action[GetDecommissionDiskFailedDP] failed dp list [%v]", failedDps)
return nil, failedDps
}
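// markDecommission moves the disk into the markDecommission state and increments its
// decommission term; the other attributes are reset unless resuming from a pause.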
func (dd *DecommissionDisk) markDecommission(dstPath string, raftForce bool, limit int) {
// when resuming from the paused state, do not reset these attributes
if dd.GetDecommissionStatus() != DecommissionPause {
dd.DecommissionDpTotal = InvalidDecommissionDpCnt
dd.DecommissionDpCount = limit
dd.DecommissionRaftForce = raftForce
dd.DstAddr = dstPath
dd.DecommissionRetry = 0
}
dd.DecommissionTerm = dd.DecommissionTerm + 1
dd.SetDecommissionStatus(markDecommission)
}
func (dd *DecommissionDisk) canAddToDecommissionList() bool {
status := dd.GetDecommissionStatus()
if status == DecommissionRunning ||
status == markDecommission {
return true
}
return false
}
func (dd *DecommissionDisk) AddToNodeSet() bool {
status := dd.GetDecommissionStatus()
if status == DecommissionRunning ||
status == markDecommission {
return true
}
return false
}
func (dd *DecommissionDisk) IsManualDecommissionDisk() bool {
return dd.Type == ManualDecommission
}
func (dd *DecommissionDisk) CanBePaused() bool {
status := dd.GetDecommissionStatus()
if status == DecommissionRunning || status == markDecommission ||
status == DecommissionPause {
return true
}
return false
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import "sync/atomic"
type DpCountLimiter struct {
cntLimit *uint64
}
func newDpCountLimiter(cntLimit *uint64) DpCountLimiter {
limiter := DpCountLimiter{
cntLimit: cntLimit,
}
return limiter
}
func (cntLimiter *DpCountLimiter) GetCntLimit() uint64 {
limit := uint64(0)
if cntLimiter.cntLimit != nil {
limit = atomic.LoadUint64(cntLimiter.cntLimit)
}
if limit == 0 {
limit = defaultMaxDpCntLimit
}
return limit
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"sort"
"strconv"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util/log"
)
// Recover a file if it has a bad CRC or has timed out before.
func (partition *DataPartition) validateCRC(clusterID string) {
partition.Lock()
defer partition.Unlock()
liveReplicas := partition.liveReplicas(defaultDataPartitionTimeOutSec)
if len(liveReplicas) == 0 {
return
}
if len(liveReplicas) < int(partition.ReplicaNum) {
liveAddrs := make([]string, 0)
for _, replica := range liveReplicas {
liveAddrs = append(liveAddrs, replica.Addr)
}
inactiveAddrs := make([]string, 0)
for _, host := range partition.Hosts {
if !contains(liveAddrs, host) {
inactiveAddrs = append(inactiveAddrs, host)
}
}
Warn(clusterID, fmt.Sprintf("vol[%v],dpId[%v],liveAddrs[%v],inactiveAddrs[%v]", partition.VolName, partition.PartitionID, liveAddrs, inactiveAddrs))
}
partition.doValidateCRC(liveReplicas, clusterID)
return
}
func (partition *DataPartition) doValidateCRC(liveReplicas []*DataReplica, clusterID string) {
if !proto.IsNormalDp(partition.PartitionType) {
return
}
for _, fc := range partition.FileInCoreMap {
extentID, err := strconv.ParseUint(fc.Name, 10, 64)
if err != nil {
continue
}
infoFunc := func() string {
return fmt.Sprintf("partition[%v] extentID %v, isTiny %v", partition.PartitionID, extentID, storage.IsTinyExtent(extentID))
}
if storage.IsTinyExtent(extentID) {
partition.checkTinyExtentFile(fc, liveReplicas, clusterID, infoFunc)
} else {
partition.checkExtentFile(fc, liveReplicas, clusterID, infoFunc)
}
}
}
func (partition *DataPartition) checkTinyExtentFile(fc *FileInCore, liveReplicas []*DataReplica, clusterID string, getInfoCallback func() string) {
if !fc.shouldCheckCrc() {
return
}
fms, needRepair := fc.needCrcRepair(liveReplicas, getInfoCallback)
if !needRepair {
return
}
if !hasSameSize(fms) {
msg := fmt.Sprintf("CheckFileError size not match,cluster[%v],dpID[%v],", clusterID, partition.PartitionID)
for _, fm := range fms {
msg = msg + fmt.Sprintf("fm[%v]:size[%v]\n", fm.locIndex, fm.Size)
}
log.LogWarn(msg)
return
}
msg := fmt.Sprintf("CheckFileError crc not match,cluster[%v],dpID[%v]", clusterID, partition.PartitionID)
for _, fm := range fms {
msg = msg + fmt.Sprintf("fm[%v]:%v\n", fm.locIndex, fm)
}
Warn(clusterID, msg)
return
}
func (partition *DataPartition) checkExtentFile(fc *FileInCore, liveReplicas []*DataReplica, clusterID string, getInfoCallback func() string) {
if !fc.shouldCheckCrc() {
return
}
fms, needRepair := fc.needCrcRepair(liveReplicas, getInfoCallback)
if !hasSameSize(fms) {
msg := fmt.Sprintf("CheckFileError size not match,cluster[%v],dpID[%v],", clusterID, partition.PartitionID)
for _, fm := range fms {
msg = msg + fmt.Sprintf("fm[%v]:size[%v]\n", fm.locIndex, fm.Size)
}
log.LogWarn(msg)
return
}
if len(fms) < len(liveReplicas) && (time.Now().Unix()-fc.LastModify) > intervalToCheckMissingReplica {
lastReportTime, ok := partition.FilesWithMissingReplica[fc.Name]
if len(partition.FilesWithMissingReplica) > 400 {
Warn(clusterID, fmt.Sprintf("partitionid[%v] has [%v] files missed replica", partition.PartitionID, len(partition.FilesWithMissingReplica)))
return
}
if !ok {
partition.FilesWithMissingReplica[fc.Name] = time.Now().Unix()
return
}
if time.Now().Unix()-lastReportTime < intervalToCheckMissingReplica {
return
}
liveAddrs := make([]string, 0)
for _, replica := range liveReplicas {
liveAddrs = append(liveAddrs, replica.Addr)
}
Warn(clusterID, fmt.Sprintf("partitionid[%v],file[%v],fms[%v],liveAddr[%v]", partition.PartitionID, fc.Name, fc.getFileMetaAddrs(), liveAddrs))
}
if !needRepair {
log.LogDebugf("checkExtentFile. partition %v all equal so no need compare in details", partition.PartitionID)
return
}
fileCrcArr := fc.calculateCrc(fms)
sort.Sort(fileCrcSorter(fileCrcArr))
maxCountFileCrcIndex := len(fileCrcArr) - 1
if fileCrcArr[maxCountFileCrcIndex].count == 1 {
msg := fmt.Sprintf("checkFileCrcTaskErr clusterID[%v] partitionID:%v File:%v ExtentOffset different between all node "+
" it can not repair it ", clusterID, partition.PartitionID, fc.Name)
msg += (fileCrcSorter)(fileCrcArr).log()
Warn(clusterID, msg)
return
}
for index, crc := range fileCrcArr {
if index != maxCountFileCrcIndex {
badNode := crc.meta
msg := fmt.Sprintf("checkFileCrcTaskErr clusterID[%v] partitionID:%v File:%v badCrc On :%v ",
clusterID, partition.PartitionID, fc.Name, badNode.getLocationAddr())
msg += (fileCrcSorter)(fileCrcArr).log()
Warn(clusterID, msg)
}
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"time"
"github.com/cubefs/cubefs/util/log"
)
// FileCrc defines the crc of a file
type FileCrc struct {
crc uint32
count int
meta *FileMetadata
}
func newFileCrc(volCrc uint32) (fc *FileCrc) {
fc = new(FileCrc)
fc.crc = volCrc
fc.count = 1
return
}
type fileCrcSorter []*FileCrc
func (fileCrcArr fileCrcSorter) Less(i, j int) bool {
return fileCrcArr[i].count < fileCrcArr[j].count
}
func (fileCrcArr fileCrcSorter) Swap(i, j int) {
fileCrcArr[i], fileCrcArr[j] = fileCrcArr[j], fileCrcArr[i]
}
func (fileCrcArr fileCrcSorter) Len() (length int) {
length = len(fileCrcArr)
return
}
func (fileCrcArr fileCrcSorter) log() (msg string) {
for _, fileCrc := range fileCrcArr {
addr := fileCrc.meta.getLocationAddr()
count := fileCrc.count
crc := fileCrc.crc
msg = fmt.Sprintf(msg+" addr:%v count:%v crc:%v ", addr, count, crc)
}
return
}
func (fc *FileInCore) shouldCheckCrc() bool {
return time.Now().Unix()-fc.LastModify > defaultIntervalToCheckCrc
}
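// needCrcRepair gathers the file metadata of the live replicas and reports that a repair is
// needed when a replica shares the apply ID of the first replica but has a different CRC;
// files with an empty or zero CRC are never repaired.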
func (fc *FileInCore) needCrcRepair(liveReplicas []*DataReplica, getInfoCallback func() string) (fms []*FileMetadata, needRepair bool) {
var baseCrc uint32
fms = make([]*FileMetadata, 0)
for i := 0; i < len(liveReplicas); i++ {
vol := liveReplicas[i]
if fm, ok := fc.getFileMetaByAddr(vol); ok {
fms = append(fms, fm)
}
}
if len(fms) == 0 {
return
}
baseCrc = fms[0].Crc
baseApplyId := fms[0].ApplyID
for _, fm := range fms {
if fm.getFileCrc() == EmptyCrcValue || fm.getFileCrc() == 0 {
needRepair = false
return
}
if fm.ApplyID == baseApplyId && fm.getFileCrc() != baseCrc {
log.LogErrorf("needCrcRepair. getInfoCallback %v, extent %v, applyID(%v:%v), crc %v",
getInfoCallback(), fc.Name, fm.ApplyID, baseApplyId, baseCrc)
needRepair = true
return
}
}
return
}
func hasSameSize(fms []*FileMetadata) (same bool) {
sentry := fms[0].Size
for _, fm := range fms {
if fm.Size != sentry {
return
}
}
return true
}
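// calculateCrc groups the given file metadata by CRC value and counts how many replicas share each CRC.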
func (fc *FileInCore) calculateCrc(badVfNodes []*FileMetadata) (fileCrcArr []*FileCrc) {
badLen := len(badVfNodes)
fileCrcArr = make([]*FileCrc, 0)
for i := 0; i < badLen; i++ {
crcKey := badVfNodes[i].getFileCrc()
isFound := false
var crc *FileCrc
for _, crc = range fileCrcArr {
if crc.crc == crcKey {
isFound = true
break
}
}
if !isFound {
crc = newFileCrc(crcKey)
crc.meta = badVfNodes[i]
fileCrcArr = append(fileCrcArr, crc)
} else {
crc.count++
}
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"github.com/cubefs/cubefs/proto"
)
// FileMetadata defines the file metadata on a dataNode
type FileMetadata struct {
proto.FileMetadata
locIndex uint8
ApplyID uint64
}
func (fm *FileMetadata) String() (msg string) {
msg = fmt.Sprintf("Crc[%v] LocAddr[%v] locIndex[%v] Size[%v]",
fm.Crc, fm.LocAddr, fm.locIndex, fm.Size)
return
}
func (fm *FileMetadata) getLocationAddr() (loc string) {
return fm.LocAddr
}
func (fm *FileMetadata) getFileCrc() (crc uint32) {
return fm.Crc
}
// FileInCore defines a file in a data partition
type FileInCore struct {
proto.FileInCore
MetadataArray []*FileMetadata
}
func newFileMetadata(volCrc uint32, volLoc string, volLocIndex int, size uint32, applyId uint64) (fm *FileMetadata) {
fm = new(FileMetadata)
fm.Crc = volCrc
fm.LocAddr = volLoc
fm.locIndex = uint8(volLocIndex)
fm.Size = size
fm.ApplyID = applyId
return
}
func newFileInCore(name string) (fc *FileInCore) {
fc = new(FileInCore)
fc.Name = name
fc.MetadataArray = make([]*FileMetadata, 0)
return
}
func (fc FileInCore) clone() *proto.FileInCore {
metadataArray := make([]*proto.FileMetadata, len(fc.MetadataArray))
for i, metadata := range fc.MetadataArray {
metadataArray[i] = &proto.FileMetadata{
Crc: metadata.Crc,
LocAddr: metadata.LocAddr,
Size: metadata.Size,
}
}
return &proto.FileInCore{
Name: fc.Name,
LastModify: fc.LastModify,
MetadataArray: metadataArray,
}
}
// updateFileInCore updates the cached metadata of this file for the given replica location.
func (fc *FileInCore) updateFileInCore(volID uint64, vf *proto.File, volLoc *DataReplica, volLocIndex int) {
if vf.Modified > fc.LastModify {
fc.LastModify = vf.Modified
}
isFind := false
for i := 0; i < len(fc.MetadataArray); i++ {
if fc.MetadataArray[i].getLocationAddr() == volLoc.Addr {
fc.MetadataArray[i].Crc = vf.Crc
fc.MetadataArray[i].Size = vf.Size
fc.MetadataArray[i].ApplyID = vf.ApplyID
isFind = true
break
}
}
if !isFind {
fm := newFileMetadata(vf.Crc, volLoc.Addr, volLocIndex, vf.Size, vf.ApplyID)
fc.MetadataArray = append(fc.MetadataArray, fm)
}
}
func (fc *FileInCore) getFileMetaByAddr(replica *DataReplica) (fm *FileMetadata, ok bool) {
for i := 0; i < len(fc.MetadataArray); i++ {
fm = fc.MetadataArray[i]
if fm.LocAddr == replica.Addr {
ok = true
return
}
}
return
}
func (fc *FileInCore) getFileMetaAddrs() (addrs []string) {
addrs = make([]string, 0)
if len(fc.MetadataArray) == 0 {
return
}
for _, fm := range fc.MetadataArray {
addrs = append(addrs, fm.LocAddr)
}
return
}
//go:build gofuzz
// +build gofuzz
// Copyright 2023 ADA Logics Ltd
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package master
import (
fuzz "github.com/AdaLogics/go-fuzz-headers"
)
type MetaNodeParam struct {
Addr string
ZoneName string
ClusterID string
}
func FuzzCreateVol(data []byte) int {
f := fuzz.NewConsumer(data)
vv := volValue{}
err := f.GenerateStruct(&vv)
if err != nil {
return 0
}
vol := newVol(vv)
if vol == nil {
return 0
}
return 1
}
func FuzzNewMetaNode(data []byte) int {
f := fuzz.NewConsumer(data)
param := MetaNodeParam{}
err := f.GenerateStruct(&param)
if err != nil {
return 0
}
node := newMetaNode(param.Addr, param.ZoneName, param.ClusterID)
if node == nil {
return 0
}
return 1
}
package master
import (
"bufio"
"context"
"encoding/json"
"fmt"
"io"
"os"
"path/filepath"
"sort"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
"github.com/samsarahq/thunder/graphql"
"github.com/samsarahq/thunder/graphql/schemabuilder"
)
type ClusterService struct {
cluster *Cluster
user *User
conf *clusterConfig
leaderInfo *LeaderInfo
}
func (s *ClusterService) Schema() *graphql.Schema {
schema := schemabuilder.NewSchema()
s.registerObject(schema)
s.registerQuery(schema)
s.registerMutation(schema)
return schema.MustBuild()
}
func (s *ClusterService) registerObject(schema *schemabuilder.Schema) {
object := schema.Object("ClusterView", proto.ClusterView{})
object.FieldFunc("serverCount", func(ctx context.Context, args struct{}) (int32, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return 0, err
}
return int32(s.cluster.dataNodeCount() + s.cluster.metaNodeCount()), nil
})
object.FieldFunc("dataPartitionCount", func(ctx context.Context, args struct{}) (int32, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return 0, err
}
return int32(s.cluster.getDataPartitionCount()), nil
})
object.FieldFunc("metaPartitionCount", func(ctx context.Context, args struct{}) (int32, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return 0, err
}
return int32(s.cluster.getMetaPartitionCount()), nil
})
object.FieldFunc("volumeCount", func(ctx context.Context, args struct{}) (int32, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return 0, err
}
return int32(len(s.cluster.vols)), nil
})
object.FieldFunc("masterCount", func(ctx context.Context, args struct{}) (int32, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return 0, err
}
return int32(len(s.conf.peerAddrs)), nil
})
object.FieldFunc("metaNodeCount", func(ctx context.Context, args struct{}) (int32, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return 0, err
}
return int32(s.cluster.metaNodeCount()), nil
})
object.FieldFunc("dataNodeCount", func(ctx context.Context, args struct{}) (int32, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return 0, err
}
return int32(s.cluster.dataNodeCount()), nil
})
nv := schema.Object("NodeView", proto.NodeView{})
nv.FieldFunc("toMetaNode", func(ctx context.Context, n *proto.NodeView) (*MetaNode, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
return s.cluster.metaNode(n.Addr)
})
nv.FieldFunc("toDataNode", func(ctx context.Context, n *proto.NodeView) (*DataNode, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
return s.cluster.dataNode(n.Addr)
})
nv.FieldFunc("reportDisks", func(ctx context.Context, n *proto.NodeView) ([]string, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
node, err := s.cluster.dataNode(n.Addr)
if err != nil {
return nil, err
}
diskmap := make(map[string]bool)
for _, p := range node.DataPartitionReports {
diskmap[p.DiskPath] = true
}
keys := make([]string, 0, len(diskmap))
for key := range diskmap {
keys = append(keys, key)
}
sort.Slice(keys, func(i, j int) bool {
return strings.Compare(keys[i], keys[j]) > 0
})
return keys, nil
})
vs := schema.Object("VolStatInfo", proto.VolStatInfo{})
vs.FieldFunc("toVolume", func(ctx context.Context, n *proto.VolStatInfo) (*Vol, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
return s.cluster.getVol(n.Name)
})
object = schema.Object("DataNode", DataNode{})
object.FieldFunc("isActive", func(ctx context.Context, n *DataNode) bool {
return n.isActive
})
object = schema.Object("metaNode", MetaNode{})
object.FieldFunc("metaPartitionInfos", func(ctx context.Context, n *MetaNode) []*proto.MetaPartitionReport {
return n.metaPartitionInfos
})
}
func (s *ClusterService) registerQuery(schema *schemabuilder.Schema) {
query := schema.Query()
query.FieldFunc("clusterView", s.clusterView)
query.FieldFunc("dataNodeList", s.dataNodeList)
query.FieldFunc("dataNodeListTest", s.dataNodeListTest)
query.FieldFunc("dataNodeGet", s.dataNodeGet)
query.FieldFunc("metaNodeList", s.metaNodeList)
query.FieldFunc("metaNodeGet", s.metaNodeGet)
query.FieldFunc("masterList", s.masterList)
query.FieldFunc("getTopology", s.getTopology)
query.FieldFunc("alarmList", s.alarmList)
}
func (s *ClusterService) registerMutation(schema *schemabuilder.Schema) {
mutation := schema.Mutation()
mutation.FieldFunc("clusterFreeze", s.clusterFreeze)
mutation.FieldFunc("addRaftNode", s.addRaftNode)
mutation.FieldFunc("removeRaftNode", s.removeRaftNode)
mutation.FieldFunc("addMetaNode", s.removeRaftNode)
mutation.FieldFunc("loadMetaPartition", s.loadMetaPartition)
mutation.FieldFunc("decommissionMetaPartition", s.decommissionMetaPartition)
mutation.FieldFunc("decommissionMetaNode", s.decommissionMetaNode)
mutation.FieldFunc("decommissionDisk", s.decommissionDisk)
mutation.FieldFunc("decommissionDataNode", s.decommissionDataNode)
}
// Decommission a disk. This will decommission all the data partitions on this disk.
func (m *ClusterService) decommissionDisk(ctx context.Context, args struct {
OffLineAddr string
DiskPath string
}) (*proto.GeneralResp, error,
) {
node, err := m.cluster.dataNode(args.OffLineAddr)
if err != nil {
return nil, err
}
badPartitions := node.badPartitions(args.DiskPath, m.cluster)
if len(badPartitions) == 0 {
err = fmt.Errorf("node[%v] disk[%v] does not have any data partition", node.Addr, args.DiskPath)
return nil, err
}
var badPartitionIds []uint64
for _, bdp := range badPartitions {
badPartitionIds = append(badPartitionIds, bdp.PartitionID)
}
rstMsg := fmt.Sprintf("receive decommissionDisk node[%v] disk[%v], badPartitionIds[%v] has offline successfully",
node.Addr, args.DiskPath, badPartitionIds)
if err = m.cluster.decommissionDisk(node, false, args.DiskPath, badPartitions, true); err != nil {
return nil, err
}
Warn(m.cluster.Name, rstMsg)
return proto.Success("success"), nil
}
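// Hedged usage sketch (not part of the original service): invoking the decommissionDisk
// handler above directly. The address and disk path are placeholders; in production the
// call arrives through the "decommissionDisk" mutation registered in registerMutation.
func exampleDecommissionDiskCall(svc *ClusterService) error {
args := struct {
OffLineAddr string
DiskPath string
}{OffLineAddr: "192.168.0.11:17310", DiskPath: "/cfs/disk1"}
_, err := svc.decommissionDisk(context.Background(), args)
return err
}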
// Decommission a data node. This will decommission all the data partition on that node.
func (m *ClusterService) decommissionDataNode(ctx context.Context, args struct {
OffLineAddr string
}) (*proto.GeneralResp, error,
) {
node, err := m.cluster.dataNode(args.OffLineAddr)
if err != nil {
return nil, err
}
if err := m.cluster.decommissionDataNode(node, false); err != nil {
return nil, err
}
rstMsg := fmt.Sprintf("decommission data node [%v] submited,please check laster!", args.OffLineAddr)
return proto.Success(rstMsg), nil
}
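// Hedged sketch: one shape the corresponding GraphQL mutation document could take when
// sent to the cluster endpoint. The argument and result field casing follows thunder's
// schemabuilder defaults and is illustrative only, as is the address.
const exampleDecommissionDataNodeMutation = `mutation {
decommissionDataNode(offLineAddr: "192.168.0.12:17310") { code message }
}`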
func (m *ClusterService) decommissionMetaNode(ctx context.Context, args struct {
OffLineAddr string
}) (*proto.GeneralResp, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
metaNode, err := m.cluster.metaNode(args.OffLineAddr)
if err != nil {
return nil, err
}
if err = m.cluster.decommissionMetaNode(metaNode); err != nil {
return nil, err
}
log.LogInfof("decommissionMetaNode metaNode [%v] has offline successfully", args.OffLineAddr)
return proto.Success("success"), nil
}
func (m *ClusterService) loadMetaPartition(ctx context.Context, args struct {
PartitionID uint64
}) (*proto.GeneralResp, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
mp, err := m.cluster.getMetaPartitionByID(args.PartitionID)
if err != nil {
return nil, err
}
m.cluster.loadMetaPartitionAndCheckResponse(mp)
log.LogInfof(proto.AdminLoadMetaPartition+" partitionID :%v Load successfully", args.PartitionID)
return proto.Success("success"), nil
}
func (m *ClusterService) decommissionMetaPartition(ctx context.Context, args struct {
PartitionID uint64
NodeAddr string
}) (*proto.GeneralResp, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
mp, err := m.cluster.getMetaPartitionByID(args.PartitionID)
if err != nil {
return nil, err
}
if err := m.cluster.decommissionMetaPartition(args.NodeAddr, mp); err != nil {
return nil, err
}
log.LogInfof(proto.AdminDecommissionMetaPartition+" partitionID :%v decommissionMetaPartition successfully", args.PartitionID)
return proto.Success("success"), nil
}
func (m *ClusterService) getMetaNode(ctx context.Context, args struct {
NodeAddr string
}) (*MetaNode, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
metaNode, err := m.cluster.metaNode(args.NodeAddr)
if err != nil {
return nil, err
}
return metaNode, nil
}
// View the topology of the cluster.
func (m *ClusterService) getTopology(ctx context.Context, args struct{}) (*proto.GeneralResp, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
tv := &TopologyView{
Zones: make([]*ZoneView, 0),
}
zones := m.cluster.t.getAllZones()
for _, zone := range zones {
cv := newZoneView(zone.name)
cv.Status = zone.getStatusToString()
cv.DataNodesetSelector = zone.GetDataNodesetSelector()
cv.MetaNodesetSelector = zone.GetMetaNodesetSelector()
tv.Zones = append(tv.Zones, cv)
nsc := zone.getAllNodeSet()
for _, ns := range nsc {
nsView := newNodeSetView(ns.dataNodeLen(), ns.metaNodeLen())
cv.NodeSet[ns.ID] = nsView
ns.dataNodes.Range(func(key, value interface{}) bool {
dataNode := value.(*DataNode)
nsView.DataNodes = append(nsView.DataNodes, proto.NodeView{ID: dataNode.ID, Addr: dataNode.Addr, IsActive: dataNode.isActive, IsWritable: dataNode.isWriteAble()})
return true
})
ns.metaNodes.Range(func(key, value interface{}) bool {
metaNode := value.(*MetaNode)
nsView.MetaNodes = append(nsView.MetaNodes, proto.NodeView{ID: metaNode.ID, Addr: metaNode.Addr, IsActive: metaNode.IsActive, IsWritable: metaNode.isWritable()})
return true
})
}
}
bs, e := json.Marshal(tv)
if e != nil {
return nil, e
}
return proto.Success(string(bs)), e
}
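// Hedged sketch (illustrative only): decoding the JSON payload produced by getTopology
// back into the TopologyView it was built from. The payload argument is whatever string
// the caller extracted from the GeneralResp returned above.
func exampleDecodeTopologyView(payload string) (*TopologyView, error) {
tv := &TopologyView{}
if err := json.Unmarshal([]byte(payload), tv); err != nil {
return nil, err
}
return tv, nil
}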
func (s *ClusterService) clusterView(ctx context.Context, args struct{}) (*proto.ClusterView, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
return s.makeClusterView(), nil
}
type MasterInfo struct {
Index string
Addr string
IsLeader bool
}
func (s *ClusterService) masterList(ctx context.Context, args struct{}) ([]*MasterInfo, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
list := make([]*MasterInfo, 0)
leader := strings.Split(s.leaderInfo.addr, ":")
for _, addr := range s.conf.peerAddrs {
split := strings.Split(addr, ":")
list = append(list, &MasterInfo{
Index: split[0],
Addr: split[1],
IsLeader: leader[0] == split[1],
})
}
return list, nil
}
func (s *ClusterService) dataNodeGet(ctx context.Context, args struct {
Addr string
}) (*DataNode, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
return s.cluster.dataNode(args.Addr)
}
func (s *ClusterService) dataNodeList(ctx context.Context, args struct{}) ([]*DataNode, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
var all []*DataNode
s.cluster.dataNodes.Range(func(_, value interface{}) bool {
all = append(all, value.(*DataNode))
return true
})
return all, nil
}
func (s *ClusterService) dataNodeListTest(ctx context.Context, args struct {
Num int64
}) ([]*DataNode, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
var all []*DataNode
for i := 0; i < int(args.Num); i++ {
all = append(all, &DataNode{
Total: uint64(i),
Used: 1,
AvailableSpace: 1,
ID: 1,
ZoneName: "123",
Addr: "123123121231",
ReportTime: time.Time{},
isActive: false,
RWMutex: sync.RWMutex{},
UsageRatio: 1,
SelectedTimes: 2,
})
}
return all, nil
}
func (s *ClusterService) metaNodeGet(ctx context.Context, args struct {
Addr string
}) (*MetaNode, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
mn, found := s.cluster.metaNodes.Load(args.Addr)
if found {
return mn.(*MetaNode), nil
}
return nil, fmt.Errorf("not found meta_node by add:[%s]", args.Addr)
}
func (s *ClusterService) metaNodeList(ctx context.Context, args struct{}) ([]*MetaNode, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
var all []*MetaNode
s.cluster.metaNodes.Range(func(_, value interface{}) bool {
all = append(all, value.(*MetaNode))
return true
})
return all, nil
}
func (m *ClusterService) addMetaNode(ctx context.Context, args struct {
NodeAddr string
ZoneName string
}) (uint64, error) {
id, err := m.cluster.addMetaNode(args.NodeAddr, args.ZoneName, 0)
if err != nil {
return 0, err
}
return id, nil
}
// Dynamically remove a master node. Similar to addRaftNode, this operation is performed online.
func (m *ClusterService) removeRaftNode(ctx context.Context, args struct {
Id uint64
Addr string
}) (*proto.GeneralResp, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
if err := m.cluster.removeRaftNode(args.Id, args.Addr); err != nil {
return nil, err
}
log.LogInfof("remove raft node id :%v,adr:%v successfully\n", args.Id, args.Addr)
return proto.Success("success"), nil
}
// Dynamically add a raft node (replica) for the master.
// By using this function, there is no need to stop all the master services. Adding a new raft node is performed online.
func (m *ClusterService) addRaftNode(ctx context.Context, args struct {
Id uint64
Addr string
}) (*proto.GeneralResp, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
if err := m.cluster.addRaftNode(args.Id, args.Addr); err != nil {
return nil, err
}
log.LogInfof("add raft node id :%v, addr:%v successfully \n", args.Id, args.Addr)
return proto.Success("success"), nil
}
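// Hedged usage sketch for the raft membership mutations above: both handlers require an
// ADMIN caller, so the context must carry a proto.UserInfo the same way registerHandler
// injects it. The id and address are placeholders; removeRaftNode takes the same shape.
func exampleAddRaftNodeCall(svc *ClusterService) error {
admin := &proto.UserInfo{UserID: "root", UserType: proto.UserTypeRoot}
ctx := context.WithValue(context.Background(), proto.UserInfoKey, admin)
args := struct {
Id uint64
Addr string
}{Id: 4, Addr: "192.168.0.14:17010"}
_, err := svc.addRaftNode(ctx, args)
return err
}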
// Turn on or off the automatic allocation of the data partitions.
// If DisableAutoAllocate == off (Status == false), then we WILL automatically allocate new data partitions for the volume when:
// 1. the used space is below the max capacity,
// 2. and the number of r&w data partition is less than 20.
//
// If DisableAutoAllocate == on (Status == true), then we WILL NOT automatically allocate new data partitions for the volume.
func (m *ClusterService) clusterFreeze(ctx context.Context, args struct {
Status bool
}) (*proto.GeneralResp, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
if err := m.cluster.setDisableAutoAllocate(args.Status); err != nil {
return nil, err
}
return proto.Success("success"), nil
}
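// Hedged usage sketch (not part of the service): toggling automatic data-partition
// allocation through the mutation handler above. Status=true disables allocation,
// Status=false re-enables it; the admin context mirrors what registerHandler injects.
func exampleClusterFreezeCall(svc *ClusterService) error {
admin := &proto.UserInfo{UserID: "root", UserType: proto.UserTypeRoot}
ctx := context.WithValue(context.Background(), proto.UserInfoKey, admin)
_, err := svc.clusterFreeze(ctx, struct{ Status bool }{Status: true})
return err
}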
type WarnMessage struct {
Time string `json:"time"`
Key string `json:"key"`
Hostname string `json:"hostname"`
Type string `json:"type"`
Value string `json:"value"`
Detail string `json:"detail"`
}
func (m *ClusterService) alarmList(ctx context.Context, args struct {
Size int32
}) ([]*WarnMessage, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
size := int64(args.Size * 1000)
list := make([]*WarnMessage, 0, 100)
path := filepath.Join(log.LogDir, "master"+log.CriticalLogFileName)
stat, err := os.Stat(path)
if err != nil {
list = append(list, &WarnMessage{
Time: time.Now().Format("2006-01-02 15:04:05"),
Key: "not found",
Hostname: m.leaderInfo.addr,
Type: "not found",
Value: "not found",
Detail: path + " read has err:" + err.Error(),
})
return list, nil
}
f, err := os.Open(path)
if err != nil {
return nil, fmt.Errorf("open file has err:[%s]", err.Error())
}
if stat.Size() > size {
if _, err := f.Seek(stat.Size()-size, 0); err != nil {
return nil, fmt.Errorf("seek file has err:[%s]", err.Error())
}
}
defer func() {
if err := f.Close(); err != nil {
log.LogErrorf("close alarm file has err:[%s]", err.Error())
}
}()
buf := bufio.NewReader(f)
all, err := io.ReadAll(buf)
if err != nil {
return nil, fmt.Errorf("read file:[%s] size:[%d] has err:[%s]", path, stat.Size(), err.Error())
}
for _, line := range strings.Split(string(all), "\n") {
if len(line) == 0 {
break
}
split := strings.Split(string(line), " ")
var msg *WarnMessage
if len(split) < 7 {
value := string(line)
msg = &WarnMessage{
Time: "unknow",
Key: "parse msg has err",
Hostname: "parse msg has err",
Type: "parse msg has err",
Value: value,
Detail: value,
}
} else {
value := strings.Join(split[6:], " ")
msg = &WarnMessage{
Time: split[0] + " " + split[1],
Key: split[4],
Hostname: split[5],
Type: split[2],
Value: value,
Detail: value,
}
}
list = append(list, msg)
}
// reverse slice
l := len(list)
for i := 0; i < l/2; i++ {
list[i], list[l-i-1] = list[l-i-1], list[i]
}
if len(list) > int(args.Size) {
list = list[:args.Size]
}
return list, nil
}
func (m *ClusterService) makeClusterView() *proto.ClusterView {
cv := &proto.ClusterView{
Name: m.cluster.Name,
LeaderAddr: m.cluster.leaderInfo.addr,
DisableAutoAlloc: m.cluster.DisableAutoAllocate,
ForbidMpDecommission: m.cluster.ForbidMpDecommission,
MetaNodeThreshold: m.cluster.cfg.MetaNodeThreshold,
Applied: m.cluster.fsm.applied,
MaxDataPartitionID: m.cluster.idAlloc.dataPartitionID,
MaxMetaNodeID: m.cluster.idAlloc.commonID,
MaxMetaPartitionID: m.cluster.idAlloc.metaPartitionID,
MetaNodes: make([]proto.NodeView, 0),
DataNodes: make([]proto.NodeView, 0),
VolStatInfo: make([]*proto.VolStatInfo, 0),
BadPartitionIDs: make([]proto.BadPartitionView, 0),
BadMetaPartitionIDs: make([]proto.BadPartitionView, 0),
}
vols := m.cluster.allVolNames()
cv.MetaNodes = m.cluster.allMetaNodes()
cv.DataNodes = m.cluster.allDataNodes()
cv.DataNodeStatInfo = m.cluster.dataNodeStatInfo
cv.MetaNodeStatInfo = m.cluster.metaNodeStatInfo
for _, name := range vols {
stat, ok := m.cluster.volStatInfo.Load(name)
if !ok {
cv.VolStatInfo = append(cv.VolStatInfo, newVolStatInfo(name, 0, 0, 0, 0, 0))
continue
}
cv.VolStatInfo = append(cv.VolStatInfo, stat.(*volStatInfo))
}
m.cluster.BadDataPartitionIds.Range(func(key, value interface{}) bool {
badDataPartitionIds := value.([]uint64)
path := key.(string)
bpv := badPartitionView{Path: path, PartitionIDs: badDataPartitionIds}
cv.BadPartitionIDs = append(cv.BadPartitionIDs, bpv)
return true
})
m.cluster.BadMetaPartitionIds.Range(func(key, value interface{}) bool {
badPartitionIds := value.([]uint64)
path := key.(string)
bpv := badPartitionView{Path: path, PartitionIDs: badPartitionIds}
cv.BadMetaPartitionIDs = append(cv.BadMetaPartitionIDs, bpv)
return true
})
return cv
}
package master
import (
"context"
"crypto/sha256"
"encoding/hex"
"fmt"
"sort"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
"github.com/samsarahq/thunder/graphql"
"github.com/samsarahq/thunder/graphql/schemabuilder"
)
type UserService struct {
user *User
cluster *Cluster
}
func (s *UserService) Schema() *graphql.Schema {
schema := schemabuilder.NewSchema()
s.registerObject(schema)
s.registerQuery(schema)
s.registerMutation(schema)
return schema.MustBuild()
}
type UserStatistical struct {
Data uint64
VolumeCount int32
DataPartitionCount int32
MetaPartitionCount int32
}
type AuthorizedVols struct {
Vol string
Authorized []string
}
func (s *UserService) registerObject(schema *schemabuilder.Schema) {
object := schema.Object("UserInfo", proto.UserInfo{})
object.FieldFunc("userStatistical", func(u *proto.UserInfo) (*UserStatistical, error) {
us := &UserStatistical{
VolumeCount: int32(len(u.Policy.OwnVols)),
}
for _, volName := range u.Policy.OwnVols {
v, e := s.cluster.getVol(volName)
if e != nil {
return nil, e
}
us.MetaPartitionCount += int32(len(v.MetaPartitions))
us.DataPartitionCount += int32(len(v.dataPartitions.partitions))
us.Data += v.totalUsedSpace()
}
return us, nil
})
object = schema.Object("UserPolicy", proto.UserPolicy{})
object.FieldFunc("authorizedVols", func(p *proto.UserPolicy) []AuthorizedVols {
var list []AuthorizedVols
for vol, a := range p.AuthorizedVols {
list = append(list, AuthorizedVols{
Vol: vol,
Authorized: a,
})
}
return list
})
}
func (s *UserService) registerQuery(schema *schemabuilder.Schema) {
query := schema.Query()
query.FieldFunc("getUserInfo", s.getUserInfo)
query.FieldFunc("getUserAKInfo", s.getUserAKInfo)
query.FieldFunc("validatePassword", s.validatePassword)
query.FieldFunc("listUserInfo", s.listUserInfo)
query.FieldFunc("topNUser", s.topNUser)
}
func (m *UserService) getUserAKInfo(ctx context.Context, args struct {
AccessKey string
}) (*proto.UserInfo, error) {
uid, perm, err := permissions(ctx, ADMIN|USER)
if err != nil {
return nil, err
}
userInfo, err := m.user.getKeyInfo(args.AccessKey)
if err != nil {
return nil, err
}
if perm != ADMIN {
if uid != userInfo.UserID {
return nil, fmt.Errorf("user info not found by you accesskey")
}
}
return userInfo, nil
}
func (s *UserService) registerMutation(schema *schemabuilder.Schema) {
mutation := schema.Mutation()
mutation.FieldFunc("createUser", s.createUser)
mutation.FieldFunc("updateUser", s.updateUser)
mutation.FieldFunc("deleteUser", s.deleteUser)
mutation.FieldFunc("updateUserPolicy", s.updateUserPolicy)
mutation.FieldFunc("removeUserPolicy", s.removeUserPolicy)
mutation.FieldFunc("transferUserVol", s.transferUserVol)
}
func (m *UserService) transferUserVol(ctx context.Context, args proto.UserTransferVolParam) (*proto.UserInfo, error) {
uid, perm, err := permissions(ctx, ADMIN)
if err != nil {
return nil, err
}
vol, err := m.cluster.getVol(args.Volume)
if err != nil {
return nil, err
}
if perm == USER && vol.Owner != uid {
return nil, fmt.Errorf("not have permission for vol:[%s]", args.Volume)
}
if !args.Force && vol.Owner != args.UserSrc {
return nil, fmt.Errorf("force param need validate user name for vol:[%s]", args.Volume)
}
userInfo, err := m.user.transferVol(&args)
if err != nil {
return nil, err
}
owner := vol.Owner
vol.Owner = userInfo.UserID
if err = m.cluster.syncUpdateVol(vol); err != nil {
vol.Owner = owner
return nil, err
}
return userInfo, nil
}
func (s *UserService) updateUserPolicy(ctx context.Context, args proto.UserPermUpdateParam) (*proto.UserInfo, error) {
uid, perm, err := permissions(ctx, ADMIN|USER)
if err != nil {
return nil, err
}
if perm == USER {
if args.Volume == "" {
return nil, fmt.Errorf("user:[%s] need set userID", uid)
}
if v, e := s.cluster.getVol(args.Volume); e != nil {
return nil, e
} else {
if v.Owner != uid {
return nil, fmt.Errorf("user:[%s] is not volume:[%s] onwer", uid, args.UserID)
}
}
}
if _, err := s.cluster.getVol(args.Volume); err != nil {
return nil, err
}
userInfo, err := s.user.updatePolicy(&args)
if err != nil {
return nil, err
}
return userInfo, nil
}
func (s *UserService) removeUserPolicy(ctx context.Context, args proto.UserPermRemoveParam) (*proto.UserInfo, error) {
if _, err := s.cluster.getVol(args.Volume); err != nil {
return nil, err
}
userInfo, err := s.user.removePolicy(&args)
if err != nil {
return nil, err
}
return userInfo, nil
}
func (s *UserService) createUser(ctx context.Context, args proto.UserCreateParam) (*proto.UserInfo, error) {
uid, _, err := permissions(ctx, ADMIN)
if err != nil {
return nil, err
}
if !ownerRegexp.MatchString(args.ID) {
return nil, fmt.Errorf("user id:[%s] is invalid", args.ID)
}
if args.Type == proto.UserTypeRoot {
return nil, fmt.Errorf("user type:[%s] can not to root", args.Type)
}
log.LogInfof("create user:[%s] by admin:[%s]", args.ID, uid)
return s.user.createKey(&args)
}
func (s *UserService) updateUser(ctx context.Context, args proto.UserUpdateParam) (*proto.UserInfo, error) {
uid, _, err := permissions(ctx, ADMIN)
if err != nil {
return nil, err
}
old, err := s.user.getUserInfo(args.UserID)
if err != nil {
return nil, err
}
if old.UserType != args.Type && args.Type == proto.UserTypeRoot {
return nil, fmt.Errorf("user type:[%s] can not to root", args.Type)
}
log.LogInfof("update user:[%s] by admin:[%s]", args.UserID, uid)
return s.user.updateKey(&args)
}
func (s *UserService) deleteUser(ctx context.Context, args struct {
UserID string
}) (*proto.GeneralResp, error) {
uid, _, err := permissions(ctx, ADMIN)
if err != nil {
return nil, err
}
// TODO : make sure can delete self? can delete other admin ??
log.LogInfof("delete user:[%s] by admin:[%s]", args.UserID, uid)
if err := s.user.deleteKey(args.UserID); err != nil {
return nil, err
}
return proto.Success("del user ok"), nil
}
func (s *UserService) getUserInfo(ctx context.Context, args struct {
UserID string
}) (*proto.UserInfo, error) {
uid, perm, err := permissions(ctx, ADMIN|USER)
if err != nil {
return nil, err
}
if perm == USER {
if uid != args.UserID {
return nil, fmt.Errorf("you:[%s] not have permission visit this userID:[%s]", uid, args.UserID)
}
}
return s.user.getUserInfo(args.UserID)
}
func (s *UserService) listUserInfo(ctx context.Context, args struct{}) ([]*proto.UserInfo, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
var list []*proto.UserInfo
s.user.userStore.Range(func(_, ui interface{}) bool {
list = append(list, ui.(*proto.UserInfo))
return true
})
return list, nil
}
type UserUseSpace struct {
Name string
Size uint64
Ratio float32
}
func (s *UserService) topNUser(ctx context.Context, args struct {
N int32
}) ([]*UserUseSpace, error) {
if _, _, err := permissions(ctx, ADMIN); err != nil {
return nil, err
}
list := make([]*UserUseSpace, 0)
var err error
s.user.userStore.Range(func(_, ui interface{}) bool {
u := ui.(*proto.UserInfo)
us := &UserUseSpace{
Name: u.UserID,
Size: 0,
Ratio: 0,
}
for _, volName := range u.Policy.OwnVols {
v, e := s.cluster.getVol(volName)
if e != nil {
err = e
return false
}
us.Size += v.totalUsedSpace()
}
list = append(list, us)
return true
})
if err != nil {
return nil, err
}
sort.Slice(list, func(i int, j int) bool {
return list[i].Size > list[j].Size
})
if len(list) > 10 {
list = list[:10]
}
var sum uint64
for _, u := range list {
sum += u.Size
}
for _, u := range list {
if sum == 0 {
u.Ratio = float32(1) / float32(len(list))
} else {
u.Ratio = float32(u.Size) / float32(sum)
}
}
return list, nil
}
func (s *UserService) validatePassword(ctx context.Context, args struct {
UserID string
Password string
}) (*proto.UserInfo, error) {
ui, err := s.user.getUserInfo(args.UserID)
if err != nil {
return nil, err
}
ak, err := s.user.getAKUser(ui.AccessKey)
if err != nil {
return nil, err
}
hashedPassword := sha256.Sum256([]byte(args.Password))
hashedPasswordStr := hex.EncodeToString(hashedPassword[:])
hashedPassword_ := sha256.Sum256([]byte(ak.Password))
hashedPasswordStr_ := hex.EncodeToString(hashedPassword_[:])
if hashedPasswordStr != hashedPasswordStr_ {
log.LogWarnf("user:[%s] login pass word has err", args.UserID)
return nil, fmt.Errorf("user or password has err")
}
return ui, nil
}
type permissionMode int
const (
ADMIN permissionMode = permissionMode(1)
USER permissionMode = permissionMode(2)
)
func permissions(ctx context.Context, mode permissionMode) (userID string, perm permissionMode, err error) {
userInfo := ctx.Value(proto.UserInfoKey).(*proto.UserInfo)
userID = userInfo.UserID
perm = USER
if userInfo.UserType == proto.UserTypeRoot || userInfo.UserType == proto.UserTypeAdmin {
perm = ADMIN
}
if ADMIN&mode == ADMIN {
if perm == ADMIN {
return
}
}
if USER&mode == USER {
if perm == USER {
return
}
}
err = fmt.Errorf("user:[%s] permissions has err:[%d] your:[%d]", userInfo.UserID, mode, perm)
return
}
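// Hedged sketch (illustrative only): how the ADMIN/USER bitmask above resolves for an
// admin caller. registerHandler stores the authenticated proto.UserInfo under
// proto.UserInfoKey; a placeholder root user stands in for it here.
func examplePermissionsCheck() {
admin := &proto.UserInfo{UserID: "root", UserType: proto.UserTypeRoot}
ctx := context.WithValue(context.Background(), proto.UserInfoKey, admin)
// ADMIN|USER accepts either role; a root or admin caller resolves to perm == ADMIN.
if uid, perm, err := permissions(ctx, ADMIN|USER); err == nil {
fmt.Printf("uid=%s perm=%d\n", uid, perm)
}
}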
package master
import (
"context"
"fmt"
"math"
"sort"
"strings"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
"github.com/samsarahq/thunder/graphql"
"github.com/samsarahq/thunder/graphql/schemabuilder"
)
type VolumeService struct {
user *User
cluster *Cluster
}
func (s *VolumeService) Schema() *graphql.Schema {
schema := schemabuilder.NewSchema()
s.registerObject(schema)
s.registerQuery(schema)
s.registerMutation(schema)
return schema.MustBuild()
}
func (s *VolumeService) registerObject(schema *schemabuilder.Schema) {
object := schema.Object("Vol", Vol{})
object.FieldFunc("dpReplicaNum", func(ctx context.Context, v *Vol) (int32, error) {
if _, _, err := permissions(ctx, USER|ADMIN); err != nil {
return 0, err
}
return int32(v.dpReplicaNum), nil
})
object.FieldFunc("occupied", func(ctx context.Context, v *Vol) (int64, error) {
if _, _, err := permissions(ctx, USER|ADMIN); err != nil {
return 0, err
}
var used int64
for _, p := range v.cloneDataPartitionMap() {
used += int64(p.used)
}
return used, nil
})
object.FieldFunc("toSimpleVolView", func(ctx context.Context, vol *Vol) (*proto.SimpleVolView, error) {
if _, _, err := permissions(ctx, USER|ADMIN); err != nil {
return nil, err
}
return &proto.SimpleVolView{
ID: vol.ID,
Name: vol.Name,
Owner: vol.Owner,
ZoneName: vol.zoneName,
DpReplicaNum: vol.dpReplicaNum,
MpReplicaNum: vol.mpReplicaNum,
Status: vol.Status,
Capacity: vol.Capacity,
FollowerRead: vol.FollowerRead,
NeedToLowerReplica: vol.NeedToLowerReplica,
Authenticate: vol.authenticate,
CrossZone: vol.crossZone,
RwDpCnt: vol.dataPartitions.readableAndWritableCnt,
MpCnt: len(vol.MetaPartitions),
DpCnt: len(vol.dataPartitions.partitionMap),
CreateTime: time.Unix(vol.createTime, 0).Format(proto.TimeFormat),
Description: vol.description,
}, nil
})
object.FieldFunc("createTime", func(ctx context.Context, v *Vol) (int64, error) {
if _, _, err := permissions(ctx, USER|ADMIN); err != nil {
return 0, err
}
return v.createTime, nil
})
}
func (s *VolumeService) registerQuery(schema *schemabuilder.Schema) {
query := schema.Query()
query.FieldFunc("getVolume", s.getVolume)
query.FieldFunc("listVolume", s.listVolume)
query.FieldFunc("volPermission", s.volPermission)
}
func (s *VolumeService) registerMutation(schema *schemabuilder.Schema) {
mutation := schema.Mutation()
mutation.FieldFunc("createVolume", s.createVolume)
// mutation.FieldFunc("deleteVolume", s.markDeleteVol)
mutation.FieldFunc("updateVolume", s.updateVolume)
}
type UserPermission struct {
UserID string
Access []string
Edit bool
}
func (s *VolumeService) volPermission(ctx context.Context, args struct {
VolName string
UserID *string
},
) ([]*UserPermission, error) {
uid, perm, err := permissions(ctx, ADMIN|USER)
if err != nil {
return nil, err
}
if perm == USER {
if args.UserID == nil {
return nil, fmt.Errorf("user:[%s] need set userID", uid)
}
if v, e := s.cluster.getVol(args.VolName); e != nil {
return nil, e
} else {
if v.Owner != uid {
return nil, fmt.Errorf("user:[%s] is not volume:[%d] onwer", uid, args.UserID)
}
}
}
vol, err := s.cluster.getVol(args.VolName)
if err != nil {
return nil, err
}
var volUser *proto.VolUser
if value, exist := s.user.volUser.Load(args.VolName); exist {
volUser = value.(*proto.VolUser)
} else {
return nil, fmt.Errorf("not found vol user in cluster")
}
userPList := make([]*UserPermission, 0, len(volUser.UserIDs))
userMap := make(map[string]bool)
for _, u := range volUser.UserIDs {
v, e := s.user.getUserInfo(u)
if e != nil {
log.LogWarnf("get user info by vol has err:[%s]", e.Error())
continue
}
if arr, exist := v.Policy.AuthorizedVols[args.VolName]; exist {
if userMap[u] {
continue
}
userMap[u] = true
userPList = append(userPList, &UserPermission{
UserID: u,
Access: arr,
Edit: uid == vol.Owner,
})
} else {
log.LogWarnf("get vol:[%s] author:[%s] by user policy has err ", args.VolName, u)
}
}
sort.Slice(userPList, func(i, j int) bool {
return userPList[i].Edit && !userPList[j].Edit
})
return userPList, nil
}
func (s *VolumeService) createVolume(ctx context.Context, args struct {
Name, Owner, ZoneName, Description string
Capacity, DataPartitionSize, MpCount, DpCount, DpReplicaNum uint64
FollowerRead, Authenticate, CrossZone, DefaultPriority bool
iopsRLimit, iopsWLimit, flowRlimit, flowWlimit uint64
},
) (*Vol, error) {
uid, per, err := permissions(ctx, ADMIN|USER)
if err != nil {
return nil, err
}
if !(args.DpReplicaNum == 2 || args.DpReplicaNum == 3) {
return nil, fmt.Errorf("replicaNum can only be 2 and 3,received replicaNum is[%v]", args.DpReplicaNum)
}
if per == USER && args.Owner != uid {
return nil, fmt.Errorf("[%s] not has permission to create volume for [%s]", uid, args.Owner)
}
if args.DpReplicaNum > math.MaxUint8 {
return nil, fmt.Errorf("invalid arg dpReplicaNum: %v", args.DpReplicaNum)
}
if args.DpCount > maxInitDataPartitionCnt {
return nil, fmt.Errorf("invalid arg dpCount[%v], exceeds maximum limit[%d]", args.DpCount, maxInitDataPartitionCnt)
}
req := &createVolReq{
name: args.Name,
owner: args.Owner,
dpSize: int(args.DataPartitionSize),
mpCount: int(args.MpCount),
dpCount: int(args.DpCount),
dpReplicaNum: uint8(args.DpReplicaNum),
capacity: int(args.Capacity),
followerRead: args.FollowerRead,
authenticate: args.Authenticate,
crossZone: args.CrossZone,
normalZonesFirst: args.DefaultPriority,
zoneName: args.ZoneName,
description: args.Description,
}
vol, err := s.cluster.createVol(req)
if err != nil {
return nil, err
}
userInfo, err := s.user.getUserInfo(args.Owner)
if err != nil {
if err != proto.ErrUserNotExists {
return nil, err
}
param := proto.UserCreateParam{
ID: args.Owner,
Password: DefaultUserPassword,
Type: proto.UserTypeNormal,
}
if userInfo, err = s.user.createKey(&param); err != nil {
return nil, err
}
}
if _, err = s.user.addOwnVol(userInfo.UserID, args.Name); err != nil {
return nil, err
}
return vol, nil
}
func (s *VolumeService) markDeleteVol(ctx context.Context, args struct {
Name, AuthKey string
},
) (*proto.GeneralResp, error) {
uid, perm, err := permissions(ctx, ADMIN|USER)
if err != nil {
return nil, err
}
if perm == USER {
if v, e := s.cluster.getVol(args.Name); e != nil {
return nil, e
} else {
if v.Owner != uid {
return nil, fmt.Errorf("user:[%s] is not volume:[%s] onwer", uid, args.Name)
}
}
}
if err = s.user.deleteVolPolicy(args.Name); err != nil {
return nil, err
}
if err = s.cluster.markDeleteVol(args.Name, args.AuthKey, false); err != nil {
return nil, err
}
log.LogWarnf("delete vol[%s] successfully,from[%s]", args.Name, uid)
return proto.Success("success"), nil
}
func (s *VolumeService) updateVolume(ctx context.Context, args struct {
Name, AuthKey string
ZoneName, Description *string
Capacity, ReplicaNum *uint64
FollowerRead, Authenticate *bool
},
) (*Vol, error) {
uid, perm, err := permissions(ctx, ADMIN|USER)
if err != nil {
return nil, err
}
if perm == USER {
if v, e := s.cluster.getVol(args.Name); e != nil {
return nil, e
} else {
if v.Owner != uid {
return nil, fmt.Errorf("user:[%s] is not volume:[%s] onwer", uid, args.Name)
}
}
}
if args.ReplicaNum != nil && !(*args.ReplicaNum == 2 || *args.ReplicaNum == 3) {
return nil, fmt.Errorf("replicaNum can only be 2 and 3,received replicaNum is[%v]", args.ReplicaNum)
}
vol, err := s.cluster.getVol(args.Name)
if err != nil {
return nil, err
}
newArgs := getVolVarargs(vol)
if args.FollowerRead != nil {
newArgs.followerRead = *args.FollowerRead
}
if args.Authenticate != nil {
newArgs.authenticate = *args.Authenticate
}
if args.ZoneName != nil {
newArgs.zoneName = *args.ZoneName
}
if args.Capacity != nil {
newArgs.capacity = *args.Capacity
}
if args.Description != nil {
newArgs.description = *args.Description
}
if err = s.cluster.updateVol(args.Name, args.AuthKey, newArgs); err != nil {
return nil, err
}
log.LogInfof("update vol[%v] successfully\n", args.Name)
vol, err = s.cluster.getVol(args.Name)
if err != nil {
return nil, err
}
return vol, nil
}
func (s *VolumeService) listVolume(ctx context.Context, args struct {
UserID *string
Keyword *string
},
) ([]*Vol, error) {
uid, perm, err := permissions(ctx, ADMIN|USER)
if err != nil {
return nil, err
}
if perm == USER {
args.UserID = &uid
}
var list []*Vol
for _, vol := range s.cluster.vols {
if args.UserID != nil && vol.Owner != *args.UserID {
continue
}
if args.Keyword != nil && *args.Keyword != "" && strings.Contains(vol.Name, *args.Keyword) {
continue
}
if vol.Status == proto.VolStatusMarkDelete {
continue
}
list = append(list, vol)
}
return list, nil
}
func (s *VolumeService) getVolume(ctx context.Context, args struct {
Name string
},
) (*Vol, error,
) {
uid, perm, err := permissions(ctx, ADMIN|USER)
if err != nil {
return nil, err
}
if perm == USER {
if v, e := s.cluster.getVol(args.Name); e != nil {
return nil, e
} else {
if v.Owner != uid {
return nil, fmt.Errorf("user:[%s] is not volume:[%s] onwer", uid, args.Name)
}
}
}
vol, err := s.cluster.getVol(args.Name)
if err != nil {
return nil, err
}
return vol, nil
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"context"
"encoding/json"
"fmt"
"html"
"net/http"
"net/http/httputil"
"strings"
"time"
"github.com/gorilla/mux"
"github.com/samsarahq/thunder/graphql"
"github.com/samsarahq/thunder/graphql/introspection"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
func (m *Server) startHTTPService(modulename string, cfg *config.Config) {
router := mux.NewRouter().SkipClean(true)
m.registerAPIRoutes(router)
m.registerAPIMiddleware(router)
if m.cluster.authenticate {
m.registerAuthenticationMiddleware(router)
}
exporter.InitWithRouter(modulename, cfg, router, m.port)
addr := fmt.Sprintf(":%s", m.port)
if m.bindIp {
addr = fmt.Sprintf("%s:%s", m.ip, m.port)
}
server := &http.Server{
Addr: addr,
Handler: router,
ReadTimeout: 5 * time.Minute,
WriteTimeout: 5 * time.Minute,
}
serveAPI := func() {
if err := server.ListenAndServe(); err != nil {
log.LogErrorf("serveAPI: serve http server failed: err(%v)", err)
return
}
}
go serveAPI()
m.apiServer = server
return
}
func (m *Server) isClientPartitionsReq(r *http.Request) bool {
return r.URL.Path == proto.ClientDataPartitions
}
func (m *Server) isFollowerRead(r *http.Request) (followerRead bool) {
followerRead = false
if r.URL.Path == proto.ClientDataPartitions && !m.partition.IsRaftLeader() {
if volName, err := parseAndExtractName(r); err == nil {
log.LogInfof("action[interceptor] followerRead vol[%v]", volName)
if followerRead = m.cluster.followerReadManager.IsVolViewReady(volName); followerRead {
log.LogInfof("action[interceptor] vol [%v] followerRead [%v], GetName[%v] IsRaftLeader[%v]",
volName, followerRead, r.URL.Path, m.partition.IsRaftLeader())
return
}
}
} else if r.URL.Path == proto.AdminChangeMasterLeader ||
r.URL.Path == proto.AdminOpFollowerPartitionsRead ||
r.URL.Path == proto.AdminPutDataPartitions ||
r.URL.Path == "/metrics" {
followerRead = true
}
return
}
func (m *Server) registerAPIMiddleware(route *mux.Router) {
var interceptor mux.MiddlewareFunc = func(next http.Handler) http.Handler {
return http.HandlerFunc(
func(w http.ResponseWriter, r *http.Request) {
log.LogDebugf("action[interceptor] request, method[%v] path[%v] query[%v]", r.Method, r.URL.Path, r.URL.Query())
if m.partition.IsRaftLeader() {
if err := m.cluster.apiLimiter.Wait(r.URL.Path); err != nil {
log.LogWarnf("action[interceptor] too many requests, path[%v]", r.URL.Path)
errMsg := fmt.Sprintf("too many requests for api: %s", html.EscapeString(r.URL.Path))
http.Error(w, errMsg, http.StatusTooManyRequests)
return
}
} else {
if m.cluster.apiLimiter.IsFollowerLimiter(r.URL.Path) {
if err := m.cluster.apiLimiter.Wait(r.URL.Path); err != nil {
log.LogWarnf("action[interceptor] too many requests, path[%v]", r.URL.Path)
errMsg := fmt.Sprintf("too many requests for api: %s", html.EscapeString(r.URL.Path))
http.Error(w, errMsg, http.StatusTooManyRequests)
return
}
}
}
log.LogInfof("action[interceptor] request, remote[%v] method[%v] path[%v] query[%v]",
r.RemoteAddr, r.Method, r.URL.Path, r.URL.Query())
if mux.CurrentRoute(r).GetName() == proto.AdminGetIP {
next.ServeHTTP(w, r)
return
}
isFollowerRead := m.isFollowerRead(r)
if m.partition.IsRaftLeader() || isFollowerRead {
if m.metaReady || isFollowerRead {
log.LogDebugf("action[interceptor] request, method[%v] path[%v] query[%v]", r.Method, r.URL.Path, r.URL.Query())
next.ServeHTTP(w, r)
return
}
log.LogWarnf("action[interceptor] leader meta has not ready")
http.Error(w, m.leaderInfo.addr, http.StatusBadRequest)
return
} else if m.leaderInfo.addr != "" {
if m.isClientPartitionsReq(r) {
log.LogErrorf("action[interceptor] request, method[%v] path[%v] query[%v] status [%v]", r.Method, r.URL.Path, r.URL.Query(), isFollowerRead)
http.Error(w, m.leaderInfo.addr, http.StatusBadRequest)
return
}
m.proxy(w, r)
} else {
log.LogErrorf("action[interceptor] no leader,request[%v]", r.URL)
http.Error(w, "no leader", http.StatusBadRequest)
return
}
})
}
route.Use(interceptor)
}
// AuthenticationUri2MsgTypeMap defines the mapping from authentication URI to message type
var AuthenticationUri2MsgTypeMap = map[string]proto.MsgType{
// Master API cluster management
proto.AdminClusterFreeze: proto.MsgMasterClusterFreezeReq,
proto.AddRaftNode: proto.MsgMasterAddRaftNodeReq,
proto.RemoveRaftNode: proto.MsgMasterRemoveRaftNodeReq,
proto.AdminSetNodeInfo: proto.MsgMasterSetNodeInfoReq,
proto.AdminSetNodeRdOnly: proto.MsgMasterSetNodeRdOnlyReq,
// Master API volume management
proto.AdminCreateVol: proto.MsgMasterCreateVolReq,
proto.AdminDeleteVol: proto.MsgMasterDeleteVolReq,
proto.AdminUpdateVol: proto.MsgMasterUpdateVolReq,
proto.AdminVolShrink: proto.MsgMasterVolShrinkReq,
proto.AdminVolExpand: proto.MsgMasterVolExpandReq,
// Master API meta partition management
proto.AdminLoadMetaPartition: proto.MsgMasterLoadMetaPartitionReq,
proto.AdminDecommissionMetaPartition: proto.MsgMasterDecommissionMetaPartitionReq,
proto.AdminChangeMetaPartitionLeader: proto.MsgMasterChangeMetaPartitionLeaderReq,
proto.AdminCreateMetaPartition: proto.MsgMasterCreateMetaPartitionReq,
proto.AdminAddMetaReplica: proto.MsgMasterAddMetaReplicaReq,
proto.AdminDeleteMetaReplica: proto.MsgMasterDeleteMetaReplicaReq,
proto.QosUpdate: proto.MsgMasterQosUpdateReq,
proto.QosUpdateZoneLimit: proto.MsgMasterQosUpdateZoneLimitReq,
proto.QosUpdateMasterLimit: proto.MsgMasterQosUpdateMasterLimitReq,
proto.QosUpdateClientParam: proto.MsgMasterQosUpdateClientParamReq,
// Master API data partition management
proto.AdminCreateDataPartition: proto.MsgMasterCreateDataPartitionReq,
proto.AdminDataPartitionChangeLeader: proto.MsgMasterDataPartitionChangeLeaderReq,
proto.AdminLoadDataPartition: proto.MsgMasterLoadDataPartitionReq,
proto.AdminDecommissionDataPartition: proto.MsgMasterDecommissionDataPartitionReq,
proto.AdminAddDataReplica: proto.MsgMasterAddDataReplicaReq,
proto.AdminDeleteDataReplica: proto.MsgMasterDeleteDataReplicaReq,
proto.AdminSetDpRdOnly: proto.MsgMasterSetDpRdOnlyReq,
// Master API meta node management
proto.AddMetaNode: proto.MsgMasterAddMetaNodeReq,
proto.DecommissionMetaNode: proto.MsgMasterDecommissionMetaNodeReq,
proto.MigrateMetaNode: proto.MsgMasterMigrateMetaNodeReq,
proto.AdminSetMetaNodeThreshold: proto.MsgMasterSetMetaNodeThresholdReq,
proto.AdminUpdateMetaNode: proto.MsgMasterUpdateMetaNodeReq,
// Master API data node management
proto.AddDataNode: proto.MsgMasterAddDataNodeReq,
proto.DecommissionDataNode: proto.MsgMasterDecommissionDataNodeReq,
proto.MigrateDataNode: proto.MsgMasterMigrateDataNodeReq,
proto.CancelDecommissionDataNode: proto.MsgMasterCancelDecommissionDataNodeReq,
proto.DecommissionDisk: proto.MsgMasterDecommissionDiskReq,
proto.AdminUpdateNodeSetCapcity: proto.MsgMasterUpdateNodeSetCapcityReq,
proto.AdminUpdateNodeSetId: proto.MsgMasterUpdateNodeSetIdReq,
proto.AdminUpdateDomainDataUseRatio: proto.MsgMasterUpdateDomainDataUseRatioReq,
proto.AdminUpdateZoneExcludeRatio: proto.MsgMasterUpdateZoneExcludeRatioReq,
proto.RecommissionDisk: proto.MsgMasterRecommissionDiskReq,
// Master API user management
proto.UserCreate: proto.MsgMasterUserCreateReq,
proto.UserDelete: proto.MsgMasterUserDeleteReq,
proto.UserUpdate: proto.MsgMasterUserUpdateReq,
proto.UserUpdatePolicy: proto.MsgMasterUserUpdatePolicyReq,
proto.UserRemovePolicy: proto.MsgMasterUserRemovePolicyReq,
proto.UserDeleteVolPolicy: proto.MsgMasterUserDeleteVolPolicyReq,
proto.UserTransferVol: proto.MsgMasterUserTransferVolReq,
// Master API zone management
proto.UpdateZone: proto.MsgMasterUpdateZoneReq,
}
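// Hedged sketch: the URI-to-message-type resolution performed by the authentication
// middleware below, isolated for clarity. Paths absent from the map skip client ID-key
// verification; the helper name is hypothetical.
func exampleResolveAuthMsgType(requestURI string) (proto.MsgType, bool) {
uriPath := strings.Split(requestURI, "?")[0]
msgType, ok := AuthenticationUri2MsgTypeMap[uriPath]
return msgType, ok
}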
func (m *Server) registerAuthenticationMiddleware(router *mux.Router) {
authenticationInterceptor := func(next http.Handler) http.Handler {
return http.HandlerFunc(
func(w http.ResponseWriter, r *http.Request) {
split := strings.Split(r.RequestURI, "?")
uriPath := split[0]
msgType, match := AuthenticationUri2MsgTypeMap[uriPath]
if match {
if err := m.cluster.parseAndCheckClientIDKey(r, msgType); err != nil {
log.LogInfof("action[AuthenticationInterceptor] parseAndCheckClientKey failed, RequestURI[%v], err[%v]",
r.RequestURI, err)
sendErrReply(w, r, &proto.HTTPReply{Code: proto.ErrCodeInvalidClientIDKey, Msg: err.Error()})
return
}
}
next.ServeHTTP(w, r)
})
}
router.Use(authenticationInterceptor)
}
func (m *Server) registerAPIRoutes(router *mux.Router) {
// graphql api for cluster
cs := &ClusterService{user: m.user, cluster: m.cluster, conf: m.config, leaderInfo: m.leaderInfo}
m.registerHandler(router, proto.AdminClusterAPI, cs.Schema())
us := &UserService{user: m.user, cluster: m.cluster}
m.registerHandler(router, proto.AdminUserAPI, us.Schema())
// vs := &VolumeService{user: m.user, cluster: m.cluster}
// m.registerHandler(router, proto.AdminVolumeAPI, vs.Schema())
// cluster management APIs
router.NewRoute().Name(proto.AdminGetMasterApiList).
Methods(http.MethodGet).
Path(proto.AdminGetMasterApiList).
HandlerFunc(m.getApiList)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminSetApiQpsLimit).
HandlerFunc(m.setApiQpsLimit)
router.NewRoute().Name(proto.AdminGetApiQpsLimit).
Methods(http.MethodGet).
Path(proto.AdminGetApiQpsLimit).
HandlerFunc(m.getApiQpsLimit)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminRemoveApiQpsLimit).
HandlerFunc(m.rmApiQpsLimit)
router.NewRoute().Name(proto.AdminGetIP).
Methods(http.MethodGet).
Path(proto.AdminGetIP).
HandlerFunc(m.getIPAddr)
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminGetCluster).
HandlerFunc(m.getCluster)
router.NewRoute().Name(proto.AdminACL).
Methods(http.MethodGet).
Path(proto.AdminACL).
HandlerFunc(m.aclOperate)
router.NewRoute().Name(proto.AdminUid).
Methods(http.MethodGet).
Path(proto.AdminUid).
HandlerFunc(m.UidOperate)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminSetClusterInfo).
HandlerFunc(m.setClusterInfo)
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminGetMonitorPushAddr).
HandlerFunc(m.getMonitorPushAddr)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminClusterFreeze).
HandlerFunc(m.setupAutoAllocation)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminVolForbidden).
HandlerFunc(m.forbidVolume)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminVolEnableAuditLog).
HandlerFunc(m.setEnableAuditLogForVolume)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminClusterForbidMpDecommission).
HandlerFunc(m.setupForbidMetaPartitionDecommission)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AddRaftNode).
HandlerFunc(m.addRaftNode)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.RemoveRaftNode).
HandlerFunc(m.removeRaftNode)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.RaftStatus).
HandlerFunc(m.getRaftStatus)
router.NewRoute().Methods(http.MethodGet).Path(proto.AdminClusterStat).HandlerFunc(m.clusterStat)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminSetCheckDataReplicasEnable).
HandlerFunc(m.setCheckDataReplicasEnable)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminSetConfig).
HandlerFunc(m.setConfigHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminGetConfig).
HandlerFunc(m.getConfigHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminUpdateDecommissionLimit).
HandlerFunc(m.updateDecommissionLimit)
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminQueryDecommissionLimit).
HandlerFunc(m.queryDecommissionLimit)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminQueryDecommissionToken).
HandlerFunc(m.queryDecommissionToken)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminSetFileStats).
HandlerFunc(m.setFileStats)
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminGetFileStats).
HandlerFunc(m.getFileStats)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminSetClusterUuidEnable).
HandlerFunc(m.setClusterUuidEnable)
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminGetClusterUuid).
HandlerFunc(m.getClusterUuid)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminGenerateClusterUuid).
HandlerFunc(m.generateClusterUuid)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminGetClusterValue).
HandlerFunc(m.GetClusterValue)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminUpdateDecommissionDiskFactor).
HandlerFunc(m.updateDecommissionDiskFactor)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminQueryDecommissionDiskLimit).
HandlerFunc(m.queryDecommissionDiskLimit)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminEnableAutoDecommissionDisk).
HandlerFunc(m.enableAutoDecommissionDisk)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminQueryAutoDecommissionDisk).
HandlerFunc(m.queryAutoDecommissionDisk)
// volume management APIs
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminCreateVol).
HandlerFunc(m.createVol)
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminGetVol).
HandlerFunc(m.getVolSimpleInfo)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminDeleteVol).
HandlerFunc(m.markDeleteVol)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminUpdateVol).
HandlerFunc(m.updateVol)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminVolShrink).
HandlerFunc(m.volShrink)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminVolExpand).
HandlerFunc(m.volExpand)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.ClientVol).
HandlerFunc(m.getVol)
router.NewRoute().Methods(http.MethodGet).
Path(proto.ClientVolStat).
HandlerFunc(m.getVolStatInfo)
router.NewRoute().Methods(http.MethodGet).
Path(proto.GetTopologyView).
HandlerFunc(m.getTopology)
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminListVols).
HandlerFunc(m.listVols)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminChangeMasterLeader).
HandlerFunc(m.changeMasterLeader)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminOpFollowerPartitionsRead).
HandlerFunc(m.OpFollowerPartitionsRead)
// multi version snapshot APIs
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminCreateVersion).
HandlerFunc(m.CreateVersion)
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminDelVersion).
HandlerFunc(m.DelVersion)
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminGetVersionInfo).
HandlerFunc(m.GetVersionInfo)
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminGetAllVersionInfo).
HandlerFunc(m.GetAllVersionInfo)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminGetVolVer).
HandlerFunc(m.getVolVer)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminSetVerStrategy).
HandlerFunc(m.SetVerStrategy)
// S3 lifecycle configuration APIS
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.SetBucketLifecycle).
HandlerFunc(m.SetBucketLifecycle)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.GetBucketLifecycle).
HandlerFunc(m.GetBucketLifecycle)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.DeleteBucketLifecycle).
HandlerFunc(m.DelBucketLifecycle)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AddLcNode).
HandlerFunc(m.addLcNode)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminLcNode).
HandlerFunc(m.lcnodeInfo)
// node task response APIs
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.GetDataNodeTaskResponse).
HandlerFunc(m.handleDataNodeTaskResponse)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.GetMetaNodeTaskResponse).
HandlerFunc(m.handleMetaNodeTaskResponse)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.GetLcNodeTaskResponse).
HandlerFunc(m.handleLcNodeTaskResponse)
// meta partition management APIs
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminLoadMetaPartition).
HandlerFunc(m.loadMetaPartition)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminDecommissionMetaPartition).
HandlerFunc(m.decommissionMetaPartition)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminChangeMetaPartitionLeader).
HandlerFunc(m.changeMetaPartitionLeader)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminBalanceMetaPartitionLeader).
HandlerFunc(m.balanceMetaPartitionLeader)
router.NewRoute().Methods(http.MethodGet).
Path(proto.ClientMetaPartitions).
HandlerFunc(m.getMetaPartitions)
router.NewRoute().Methods(http.MethodGet).
Path(proto.ClientMetaPartition).
HandlerFunc(m.getMetaPartition)
router.NewRoute().Methods(http.MethodGet).
Path(proto.QosUpload).
HandlerFunc(m.qosUpload)
router.NewRoute().Methods(http.MethodGet).
Path(proto.QosGetStatus).
HandlerFunc(m.getQosStatus)
router.NewRoute().Methods(http.MethodGet).
Path(proto.QosGetClientsLimitInfo).
HandlerFunc(m.getClientQosInfo)
router.NewRoute().Methods(http.MethodGet).
Path(proto.QosUpdate).
HandlerFunc(m.QosUpdate)
router.NewRoute().Methods(http.MethodGet).
Path(proto.QosUpdateZoneLimit).
HandlerFunc(m.QosUpdateZoneLimit)
router.NewRoute().Methods(http.MethodGet).
Path(proto.QosGetZoneLimitInfo).
HandlerFunc(m.QosGetZoneLimit)
router.NewRoute().Methods(http.MethodGet).
Path(proto.QosUpdateMasterLimit).
HandlerFunc(m.getQosUpdateMasterLimit)
// router.NewRoute().Methods(http.MethodGet).
// Path(proto.QosUpdateMagnify).
// HandlerFunc(m.QosUpdateMagnify)
router.NewRoute().Methods(http.MethodGet).
Path(proto.QosUpdateClientParam).
HandlerFunc(m.QosUpdateClientParam)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminCreateMetaPartition).
HandlerFunc(m.createMetaPartition)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminAddMetaReplica).
HandlerFunc(m.addMetaReplica)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminDeleteMetaReplica).
HandlerFunc(m.deleteMetaReplica)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminDiagnoseMetaPartition).
HandlerFunc(m.diagnoseMetaPartition)
// data partition management APIs
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminGetDataPartition).
HandlerFunc(m.getDataPartition)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminCreateDataPartition).
HandlerFunc(m.createDataPartition)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminCreatePreLoadDataPartition).
HandlerFunc(m.createPreLoadDataPartition)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminDataPartitionChangeLeader).
HandlerFunc(m.changeDataPartitionLeader)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminLoadDataPartition).
HandlerFunc(m.loadDataPartition)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminDecommissionDataPartition).
HandlerFunc(m.decommissionDataPartition)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminDiagnoseDataPartition).
HandlerFunc(m.diagnoseDataPartition)
router.NewRoute().Methods(http.MethodGet).
Path(proto.ClientDataPartitions).
HandlerFunc(m.getDataPartitions)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminResetDataPartitionDecommissionStatus).
HandlerFunc(m.resetDataPartitionDecommissionStatus)
router.NewRoute().Methods(http.MethodGet).
Path(proto.AdminQueryDataPartitionDecommissionStatus).
HandlerFunc(m.queryDataPartitionDecommissionStatus)
// meta node management APIs
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AddMetaNode).
HandlerFunc(m.addMetaNode)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.DecommissionMetaNode).
HandlerFunc(m.decommissionMetaNode)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.MigrateMetaNode).
HandlerFunc(m.migrateMetaNodeHandler)
router.NewRoute().Methods(http.MethodGet).
Path(proto.GetMetaNode).
HandlerFunc(m.getMetaNode)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminSetMetaNodeThreshold).
HandlerFunc(m.setMetaNodeThreshold)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminAddDataReplica).
HandlerFunc(m.addDataReplica)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminDeleteDataReplica).
HandlerFunc(m.deleteDataReplica)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminUpdateMetaNode).
HandlerFunc(m.updateMetaNode)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminUpdateDataNode).
HandlerFunc(m.updateDataNode)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminGetInvalidNodes).
HandlerFunc(m.checkInvalidIDNodes)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminPutDataPartitions).
HandlerFunc(m.putDataPartitions)
// data node management APIs
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AddDataNode).
HandlerFunc(m.addDataNode)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.DecommissionDataNode).
HandlerFunc(m.decommissionDataNode)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.QueryDataNodeDecoProgress).
HandlerFunc(m.queryDataNodeDecoProgress)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.MigrateDataNode).
HandlerFunc(m.migrateDataNodeHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.CancelDecommissionDataNode).
HandlerFunc(m.cancelDecommissionDataNode)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.QueryDataNodeDecoFailedDps).
HandlerFunc(m.queryDataNodeDecoFailedDps)
router.NewRoute().Methods(http.MethodGet).
Path(proto.GetDataNode).
HandlerFunc(m.getDataNode)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.DecommissionDisk).
HandlerFunc(m.decommissionDisk)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.RecommissionDisk).
HandlerFunc(m.recommissionDisk)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.RestoreStoppedAutoDecommissionDisk).
HandlerFunc(m.restoreStoppedAutoDecommissionDisk)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.QueryDiskDecoProgress).
HandlerFunc(m.queryDiskDecoProgress)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.MarkDecoDiskFixed).
HandlerFunc(m.markDecoDiskFixed)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.CancelDecommissionDisk).
HandlerFunc(m.cancelDecommissionDisk)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.QueryDecommissionDiskDecoFailedDps).
HandlerFunc(m.queryDecommissionDiskDecoFailedDps)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.QueryBadDisks).
HandlerFunc(m.queryBadDisks)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.QueryAllDecommissionDisk).
HandlerFunc(m.queryAllDecommissionDisk)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.QueryDisableDisk).
HandlerFunc(m.queryDisableDisk)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminSetNodeInfo).
HandlerFunc(m.setNodeInfoHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminGetNodeInfo).
HandlerFunc(m.getNodeInfoHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminGetIsDomainOn).
HandlerFunc(m.getIsDomainOn)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminGetAllNodeSetGrpInfo).
HandlerFunc(m.getAllNodeSetGrpInfoHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminGetNodeSetGrpInfo).
HandlerFunc(m.getNodeSetGrpInfoHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminUpdateNodeSetCapcity).
HandlerFunc(m.updateNodeSetCapacityHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminUpdateNodeSetId).
HandlerFunc(m.updateNodeSetIdHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminUpdateNodeSetNodeSelector).
HandlerFunc(m.updateNodeSetNodeSelector)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminUpdateDomainDataUseRatio).
HandlerFunc(m.updateDataUseRatioHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminUpdateZoneExcludeRatio).
HandlerFunc(m.updateZoneExcludeRatioHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminSetNodeRdOnly).
HandlerFunc(m.setNodeRdOnlyHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminSetDpRdOnly).
HandlerFunc(m.setDpRdOnlyHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminSetDpDiscard).
HandlerFunc(m.setDpDiscardHandler)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.AdminGetDiscardDp).
HandlerFunc(m.getDiscardDpHandler)
// user management APIs
router.NewRoute().Methods(http.MethodPost).
Path(proto.UserCreate).
HandlerFunc(m.createUser)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.UserDelete).
HandlerFunc(m.deleteUser)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.UserUpdate).
HandlerFunc(m.updateUser)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.UserUpdatePolicy).
HandlerFunc(m.updateUserPolicy)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.UserRemovePolicy).
HandlerFunc(m.removeUserPolicy)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.UserDeleteVolPolicy).
HandlerFunc(m.deleteUserVolPolicy)
router.NewRoute().Methods(http.MethodGet).
Path(proto.UserGetAKInfo).
HandlerFunc(m.getUserAKInfo)
router.NewRoute().Methods(http.MethodGet).
Path(proto.UserGetInfo).
HandlerFunc(m.getUserInfo)
router.NewRoute().Methods(http.MethodGet).
Path(proto.UserList).
HandlerFunc(m.getAllUsers)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.UserTransferVol).
HandlerFunc(m.transferUserVol)
router.NewRoute().Methods(http.MethodGet).
Path(proto.UsersOfVol).
HandlerFunc(m.getUsersOfVol)
// zone management APIs
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.UpdateZone).
HandlerFunc(m.updateZone)
router.NewRoute().Methods(http.MethodGet).
Path(proto.GetAllZones).
HandlerFunc(m.listZone)
router.NewRoute().Methods(http.MethodGet).
Path(proto.GetAllNodeSets).
HandlerFunc(m.listNodeSets)
router.NewRoute().Methods(http.MethodGet).
Path(proto.GetNodeSet).
HandlerFunc(m.getNodeSet)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.UpdateNodeSet).
HandlerFunc(m.updateNodeSet)
// Quota
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.QuotaCreate).
HandlerFunc(m.CreateQuota)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.QuotaUpdate).
HandlerFunc(m.UpdateQuota)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.QuotaDelete).
HandlerFunc(m.DeleteQuota)
router.NewRoute().Methods(http.MethodGet).
Path(proto.QuotaList).
HandlerFunc(m.ListQuota)
router.NewRoute().Methods(http.MethodGet).
Path(proto.QuotaGet).
HandlerFunc(m.GetQuota)
router.NewRoute().Methods(http.MethodGet).
Path(proto.QuotaListAll).
HandlerFunc(m.ListQuotaAll)
// S3 API QoS Manager
router.NewRoute().Methods(http.MethodPut, http.MethodPost).
Path(proto.S3QoSSet).
HandlerFunc(m.S3QosSet)
router.NewRoute().Methods(http.MethodGet, http.MethodPost).
Path(proto.S3QoSGet).
HandlerFunc(m.S3QosGet)
router.NewRoute().Methods(http.MethodDelete, http.MethodPost).
Path(proto.S3QoSDelete).
HandlerFunc(m.S3QosDelete)
}
func (m *Server) registerHandler(router *mux.Router, model string, schema *graphql.Schema) {
introspection.AddIntrospectionToSchema(schema)
gHandler := graphql.HTTPHandler(schema)
router.NewRoute().Name(model).Methods(http.MethodGet, http.MethodPost).Path(model).HandlerFunc(func(writer http.ResponseWriter, request *http.Request) {
userID := request.Header.Get(proto.UserKey)
if userID == "" {
ErrResponse(writer, fmt.Errorf("not found [%s] in header", proto.UserKey))
return
}
if ui, err := m.user.getUserInfo(userID); err != nil {
ErrResponse(writer, fmt.Errorf("user:[%s] not found ", userID))
return
} else {
request = request.WithContext(context.WithValue(request.Context(), proto.UserInfoKey, ui))
}
gHandler.ServeHTTP(writer, request)
})
}
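// ErrResponse writes err back to the client as a JSON body of the form
// {"errors":["<message>"]}, setting the Content-Type to application/json
// when the caller has not already set one.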
func ErrResponse(w http.ResponseWriter, err error) {
response := struct {
Errors []string `json:"errors"`
}{
Errors: []string{err.Error()},
}
responseJSON, err := json.Marshal(response)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
if w.Header().Get("Content-Type") == "" {
w.Header().Set("Content-Type", "application/json")
}
if _, e := w.Write(responseJSON); e != nil {
log.LogErrorf("send response has err:[%s]", e)
}
}
func (m *Server) newReverseProxy() *httputil.ReverseProxy {
return &httputil.ReverseProxy{Director: func(request *http.Request) {
request.URL.Scheme = "http"
request.URL.Host = m.leaderInfo.addr
}}
}
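// proxy forwards the request to the current leader address via the reverse
// proxy built by newReverseProxy, so that requests landing on this node can
// be served by the leader.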
func (m *Server) proxy(w http.ResponseWriter, r *http.Request) {
m.reverseProxy.ServeHTTP(w, r)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"math"
"strconv"
"sync"
"sync/atomic"
"github.com/cubefs/cubefs/raftstore"
"github.com/cubefs/cubefs/raftstore/raftstore_db"
"github.com/cubefs/cubefs/util/log"
)
// IDAllocator generates and allocates ids
type IDAllocator struct {
dataPartitionID uint64
metaPartitionID uint64
commonID uint64
clientID uint64
clientIDLimit uint64
quotaID uint32
store *raftstore_db.RocksDBStore
partition raftstore.Partition
dpIDLock sync.RWMutex
mpIDLock sync.RWMutex
mnIDLock sync.RWMutex
qaIDLock sync.RWMutex
}
const clientIDBatchCount = 1000
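// newIDAllocator constructs an allocator backed by the raft-replicated
// RocksDB store. A minimal boot sequence (illustrative sketch; the store and
// partition values are assumed to be provided by the caller):
//
//	alloc := newIDAllocator(store, partition)
//	alloc.restore()                              // reload persisted watermarks
//	dpID, err := alloc.allocateDataPartitionID() // reserve the next data partition ID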
func newIDAllocator(store *raftstore_db.RocksDBStore, partition raftstore.Partition) (alloc *IDAllocator) {
alloc = new(IDAllocator)
alloc.store = store
alloc.partition = partition
return
}
func (alloc *IDAllocator) restore() {
alloc.restoreMaxDataPartitionID()
alloc.restoreMaxMetaPartitionID()
alloc.restoreMaxCommonID()
alloc.restoreMaxQuotaID()
alloc.restoreClientID()
}
func (alloc *IDAllocator) restoreMaxDataPartitionID() {
value, err := alloc.store.Get(maxDataPartitionIDKey)
if err != nil {
panic(fmt.Sprintf("Failed to restore maxDataPartitionID,err:%v ", err.Error()))
}
bytes := value.([]byte)
if len(bytes) == 0 {
alloc.dataPartitionID = 0
return
}
maxDataPartitionID, err := strconv.ParseUint(string(bytes), 10, 64)
if err != nil {
panic(fmt.Sprintf("Failed to restore maxDataPartitionID,err:%v ", err.Error()))
}
alloc.dataPartitionID = maxDataPartitionID
log.LogInfof("action[restoreMaxDataPartitionID] maxDpID[%v]", alloc.dataPartitionID)
}
func (alloc *IDAllocator) restoreMaxMetaPartitionID() {
value, err := alloc.store.Get(maxMetaPartitionIDKey)
if err != nil {
panic(fmt.Sprintf("Failed to restore maxPartitionID,err:%v ", err.Error()))
}
bytes := value.([]byte)
if len(bytes) == 0 {
alloc.metaPartitionID = 0
return
}
maxPartitionID, err := strconv.ParseUint(string(bytes), 10, 64)
if err != nil {
panic(fmt.Sprintf("Failed to restore maxPartitionID,err:%v ", err.Error()))
}
alloc.metaPartitionID = maxPartitionID
log.LogInfof("action[restoreMaxMetaPartitionID] maxMpID[%v]", alloc.metaPartitionID)
}
// The data node, meta node, and node set share the same ID allocator.
func (alloc *IDAllocator) restoreMaxCommonID() {
value, err := alloc.store.Get(maxCommonIDKey)
if err != nil {
panic(fmt.Sprintf("Failed to restore maxCommonID,err:%v ", err.Error()))
}
bytes := value.([]byte)
if len(bytes) == 0 {
alloc.commonID = 0
return
}
maxMetaNodeID, err := strconv.ParseUint(string(bytes), 10, 64)
if err != nil {
panic(fmt.Sprintf("Failed to restore maxCommonID,err:%v ", err.Error()))
}
alloc.commonID = maxMetaNodeID
log.LogInfof("action[restoreMaxCommonID] maxCommonID[%v]", alloc.commonID)
}
func (alloc *IDAllocator) restoreMaxQuotaID() {
value, err := alloc.store.Get(maxQuotaIDKey)
if err != nil {
panic(fmt.Sprintf("Failed to restore maxQuotaID,err:%v ", err.Error()))
}
bytes := value.([]byte)
if len(bytes) == 0 {
alloc.quotaID = 0
return
}
maxQuotaID, err := strconv.ParseUint(string(bytes), 10, 64)
if err != nil {
panic(fmt.Sprintf("Failed to restore maxQuotaID,err:%v ", err.Error()))
}
if maxQuotaID > 0 && maxQuotaID <= math.MaxInt32 {
alloc.quotaID = uint32(maxQuotaID)
} else {
alloc.quotaID = math.MaxInt32
}
log.LogInfof("action[restoreMaxCommonID] maxQuotaID[%v]", alloc.quotaID)
}
func (alloc *IDAllocator) setDataPartitionID(id uint64) {
atomic.StoreUint64(&alloc.dataPartitionID, id)
}
func (alloc *IDAllocator) setMetaPartitionID(id uint64) {
atomic.StoreUint64(&alloc.metaPartitionID, id)
}
func (alloc *IDAllocator) setCommonID(id uint64) {
atomic.StoreUint64(&alloc.commonID, id)
}
func (alloc *IDAllocator) restoreClientID() {
alloc.mpIDLock.Lock()
defer alloc.mpIDLock.Unlock()
value, err := alloc.store.Get(maxClientIDKey)
if err != nil {
panic(fmt.Sprintf("Failed to restore maxClientID,err:%v ", err.Error()))
}
bytes := value.([]byte)
if len(bytes) != 0 {
alloc.clientID, err = strconv.ParseUint(string(bytes), 10, 64)
if err != nil {
panic(fmt.Sprintf("Failed to restore maxClientID,err:%v ", err.Error()))
}
}
alloc.clientIDLimit = alloc.clientID
alloc.clientID += clientIDBatchCount
}
func (alloc *IDAllocator) setQuotaID(id uint32) {
atomic.StoreUint32(&alloc.quotaID, id)
}
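// allocateDataPartitionID reserves the next data partition ID. The new
// watermark is first submitted through the raft partition
// (opSyncAllocDataPartitionID) and only then applied to the in-memory
// counter, so a handed-out allocation is persisted before it is used.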
func (alloc *IDAllocator) allocateDataPartitionID() (partitionID uint64, err error) {
alloc.dpIDLock.Lock()
defer alloc.dpIDLock.Unlock()
var cmd []byte
metadata := new(RaftCmd)
partitionID = atomic.LoadUint64(&alloc.dataPartitionID) + 1
metadata.Op = opSyncAllocDataPartitionID
metadata.K = maxDataPartitionIDKey
value := strconv.FormatUint(uint64(partitionID), 10)
metadata.V = []byte(value)
cmd, err = metadata.Marshal()
if err != nil {
goto errHandler
}
if _, err = alloc.partition.Submit(cmd); err != nil {
goto errHandler
}
alloc.setDataPartitionID(partitionID)
return
errHandler:
log.LogErrorf("action[allocateDataPartitionID] err:%v", err.Error())
return
}
func (alloc *IDAllocator) allocateMetaPartitionID() (partitionID uint64, err error) {
alloc.mpIDLock.Lock()
defer alloc.mpIDLock.Unlock()
var cmd []byte
metadata := new(RaftCmd)
metadata.Op = opSyncAllocMetaPartitionID
metadata.K = maxMetaPartitionIDKey
partitionID = atomic.LoadUint64(&alloc.metaPartitionID) + 1
value := strconv.FormatUint(uint64(partitionID), 10)
metadata.V = []byte(value)
cmd, err = metadata.Marshal()
if err != nil {
goto errHandler
}
if _, err = alloc.partition.Submit(cmd); err != nil {
goto errHandler
}
alloc.setMetaPartitionID(partitionID)
return
errHandler:
log.LogErrorf("action[allocateMetaPartitionID] err:%v", err.Error())
return
}
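// allocateClientID hands out client IDs in batches of clientIDBatchCount:
// a raft submit is only issued when the in-memory counter crosses
// clientIDLimit, so with the batch size of 1000 at most one persistence
// round-trip is needed per 1000 allocations.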
func (alloc *IDAllocator) allocateClientID() (clientID uint64, err error) {
alloc.mpIDLock.Lock()
defer alloc.mpIDLock.Unlock()
clientID = alloc.clientID + 1
if alloc.clientIDLimit < clientID {
var cmd []byte
metadata := new(RaftCmd)
metadata.Op = opSyncAllocClientID
metadata.K = maxClientIDKey
// persist the current high-water mark (clientID - 1) before extending the in-memory limit
value := strconv.FormatUint(uint64(alloc.clientID), 10)
metadata.V = []byte(value)
cmd, err = metadata.Marshal()
if err != nil {
goto errHandler
}
if _, err = alloc.partition.Submit(cmd); err != nil {
goto errHandler
}
alloc.clientIDLimit = alloc.clientID + clientIDBatchCount
}
alloc.clientID = clientID
return
errHandler:
log.LogErrorf("action[allocateClientID] err:%v", err.Error())
return
}
func (alloc *IDAllocator) allocateCommonID() (id uint64, err error) {
alloc.mnIDLock.Lock()
defer alloc.mnIDLock.Unlock()
var cmd []byte
metadata := new(RaftCmd)
metadata.Op = opSyncAllocCommonID
metadata.K = maxCommonIDKey
id = atomic.LoadUint64(&alloc.commonID) + 1
value := strconv.FormatUint(uint64(id), 10)
metadata.V = []byte(value)
cmd, err = metadata.Marshal()
if err != nil {
goto errHandler
}
if _, err = alloc.partition.Submit(cmd); err != nil {
goto errHandler
}
alloc.setCommonID(id)
return
errHandler:
log.LogErrorf("action[allocateCommonID] err:%v", err.Error())
return
}
func (alloc *IDAllocator) allocateQuotaID() (id uint32, err error) {
alloc.qaIDLock.Lock()
defer alloc.qaIDLock.Unlock()
var cmd []byte
metadata := new(RaftCmd)
metadata.Op = opSyncAllocQuotaID
metadata.K = maxQuotaIDKey
id = atomic.LoadUint32(&alloc.quotaID) + 1
value := strconv.FormatUint(uint64(id), 10)
metadata.V = []byte(value)
cmd, err = metadata.Marshal()
if err != nil {
goto errHandler
}
if _, err = alloc.partition.Submit(cmd); err != nil {
goto errHandler
}
alloc.setQuotaID(id)
return
errHandler:
log.LogErrorf("action[allocateQuotaID] err:%v", err.Error())
return
}
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"math"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
type lifecycleManager struct {
sync.RWMutex
cluster *Cluster
lcConfigurations map[string]*proto.LcConfiguration
lcNodeStatus *lcNodeStatus
lcRuleTaskStatus *lcRuleTaskStatus
idleLcNodeCh chan struct{}
exitCh chan struct{}
}
func newLifecycleManager() *lifecycleManager {
log.LogInfof("action[newLifecycleManager] construct")
lcMgr := &lifecycleManager{
lcConfigurations: make(map[string]*proto.LcConfiguration),
lcNodeStatus: newLcNodeStatus(),
lcRuleTaskStatus: newLcRuleTaskStatus(),
idleLcNodeCh: make(chan struct{}),
exitCh: make(chan struct{}),
}
return lcMgr
}
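// startLcScan snapshots the currently enabled lifecycle rules into
// ToBeScanned and launches process(), which hands one rule task to an idle
// lcnode every time a node reports spare capacity. If a scan is already in
// progress the call is ignored.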
func (lcMgr *lifecycleManager) startLcScan() {
// do not start a new scan if one is already in progress
if lcMgr.scanning() {
log.LogWarnf("startLcScan: scanning is not completed, lcRuleTaskStatus(%v)", lcMgr.lcRuleTaskStatus)
return
}
tasks := lcMgr.genEnabledRuleTasks()
if len(tasks) <= 0 {
log.LogDebugf("startLcScan: no enabled lifecycle rule task to schedule!")
return
}
log.LogDebugf("startLcScan: %v lifecycle rule tasks to schedule!", len(tasks))
// start scan init
lcMgr.lcRuleTaskStatus = newLcRuleTaskStatus()
for _, r := range tasks {
lcMgr.lcRuleTaskStatus.ToBeScanned[r.Id] = r
}
go lcMgr.process()
}
// generate tasks for every bucket
func (lcMgr *lifecycleManager) genEnabledRuleTasks() []*proto.RuleTask {
lcMgr.RLock()
defer lcMgr.RUnlock()
tasks := make([]*proto.RuleTask, 0)
for _, v := range lcMgr.lcConfigurations {
ts := v.GenEnabledRuleTasks()
if len(ts) > 0 {
tasks = append(tasks, ts...)
}
}
return tasks
}
func (lcMgr *lifecycleManager) scanning() bool {
log.LogInfof("decide scanning, lcNodeStatus: %v, lcRuleTaskStatus: %v", lcMgr.lcNodeStatus, lcMgr.lcRuleTaskStatus)
if len(lcMgr.lcRuleTaskStatus.ToBeScanned) > 0 {
return true
}
for _, v := range lcMgr.lcRuleTaskStatus.Results {
if !v.Done && time.Now().Before(v.UpdateTime.Add(time.Minute*10)) {
return true
}
}
for _, c := range lcMgr.lcNodeStatus.WorkingCount {
if c > 0 {
return true
}
}
log.LogInfof("decide scanning, scanning stop!")
return false
}
func (lcMgr *lifecycleManager) process() {
log.LogInfof("lifecycleManager process start, rule num(%v)", len(lcMgr.lcRuleTaskStatus.ToBeScanned))
now := time.Now()
lcMgr.lcRuleTaskStatus.StartTime = &now
for lcMgr.scanning() {
log.LogDebugf("wait idleLcNodeCh... ToBeScanned num(%v)", len(lcMgr.lcRuleTaskStatus.ToBeScanned))
select {
case <-lcMgr.exitCh:
log.LogInfo("exitCh notified, lifecycleManager process exit")
return
case <-lcMgr.idleLcNodeCh:
log.LogDebug("idleLcNodeCh notified")
// ToBeScanned -> Scanning
task := lcMgr.lcRuleTaskStatus.GetOneTask()
if task == nil {
log.LogDebugf("lcRuleTaskStatus.GetOneTask, no task")
continue
}
nodeAddr := lcMgr.lcNodeStatus.GetIdleNode()
if nodeAddr == "" {
log.LogWarn("no idle lcnode, redo task")
lcMgr.lcRuleTaskStatus.RedoTask(task)
continue
}
val, ok := lcMgr.cluster.lcNodes.Load(nodeAddr)
if !ok {
log.LogErrorf("lcNodes.Load, nodeAddr(%v) is not available, redo task", nodeAddr)
lcMgr.lcNodeStatus.RemoveNode(nodeAddr)
lcMgr.lcRuleTaskStatus.RedoTask(task)
continue
}
node := val.(*LcNode)
adminTask := node.createLcScanTask(lcMgr.cluster.masterAddr(), task)
lcMgr.cluster.addLcNodeTasks([]*proto.AdminTask{adminTask})
log.LogDebugf("add lifecycle scan task(%v) to lcnode(%v)", *task, nodeAddr)
}
}
end := time.Now()
lcMgr.lcRuleTaskStatus.EndTime = &end
log.LogInfof("lifecycleManager process finish, lcRuleTaskStatus results(%v)", lcMgr.lcRuleTaskStatus.Results)
}
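// notifyIdleLcNode signals the scan loop in process() that an lcnode has
// spare capacity. The non-blocking send (select with default) ensures that
// heartbeat handling never blocks when no scan loop is waiting.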
func (lcMgr *lifecycleManager) notifyIdleLcNode() {
select {
case lcMgr.idleLcNodeCh <- struct{}{}:
log.LogDebug("action[handleLcNodeHeartbeatResp], lifecycleManager scan routine notified!")
default:
log.LogDebug("action[handleLcNodeHeartbeatResp], lifecycleManager skipping notify!")
}
}
func (lcMgr *lifecycleManager) SetS3BucketLifecycle(lcConf *proto.LcConfiguration) error {
lcMgr.Lock()
defer lcMgr.Unlock()
lcMgr.lcConfigurations[lcConf.VolName] = lcConf
return nil
}
func (lcMgr *lifecycleManager) GetS3BucketLifecycle(VolName string) (lcConf *proto.LcConfiguration) {
lcMgr.RLock()
defer lcMgr.RUnlock()
var ok bool
lcConf, ok = lcMgr.lcConfigurations[VolName]
if !ok {
return nil
}
return lcConf
}
func (lcMgr *lifecycleManager) DelS3BucketLifecycle(VolName string) {
lcMgr.Lock()
defer lcMgr.Unlock()
delete(lcMgr.lcConfigurations, VolName)
}
//-----------------------------------------------
type OpLcNode interface {
GetIdleNode() (nodeAddr string)
RemoveNode(nodeAddr string)
UpdateNode(nodeAddr string, count int)
}
// update status by heartbeat
type lcNodeStatus struct {
sync.RWMutex
WorkingCount map[string]int // ip -> number of tasks currently being processed on that node
}
func newLcNodeStatus() *lcNodeStatus {
return &lcNodeStatus{
WorkingCount: make(map[string]int),
}
}
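// GetIdleNode returns the address of the lcnode with the fewest in-flight
// tasks and increments its working count. For example (addresses are
// illustrative), with WorkingCount {"10.0.0.1:17510": 2, "10.0.0.2:17510": 0}
// it returns "10.0.0.2:17510" and bumps its count to 1; an empty string is
// returned when no node has registered yet.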
func (ns *lcNodeStatus) GetIdleNode() (nodeAddr string) {
ns.Lock()
defer ns.Unlock()
if len(ns.WorkingCount) == 0 {
return
}
min := math.MaxInt
for n, c := range ns.WorkingCount {
if c < min {
nodeAddr = n
min = c
}
if c == 0 {
break
}
}
ns.WorkingCount[nodeAddr]++
return
}
func (ns *lcNodeStatus) RemoveNode(nodeAddr string) {
ns.Lock()
defer ns.Unlock()
delete(ns.WorkingCount, nodeAddr)
return
}
func (ns *lcNodeStatus) UpdateNode(nodeAddr string, count int) {
ns.Lock()
defer ns.Unlock()
ns.WorkingCount[nodeAddr] = count
return
}
// -----------------------------------------------
type lcRuleTaskStatus struct {
sync.RWMutex
ToBeScanned map[string]*proto.RuleTask
Results map[string]*proto.LcNodeRuleTaskResponse
StartTime *time.Time
EndTime *time.Time
}
func newLcRuleTaskStatus() *lcRuleTaskStatus {
return &lcRuleTaskStatus{
ToBeScanned: make(map[string]*proto.RuleTask),
Results: make(map[string]*proto.LcNodeRuleTaskResponse),
}
}
func (rs *lcRuleTaskStatus) GetOneTask() (task *proto.RuleTask) {
rs.Lock()
defer rs.Unlock()
if len(rs.ToBeScanned) == 0 {
return
}
for _, t := range rs.ToBeScanned {
task = t
break
}
delete(rs.ToBeScanned, task.Id)
return
}
func (rs *lcRuleTaskStatus) RedoTask(task *proto.RuleTask) {
rs.Lock()
defer rs.Unlock()
if task == nil {
return
}
rs.ToBeScanned[task.Id] = task
}
func (rs *lcRuleTaskStatus) AddResult(resp *proto.LcNodeRuleTaskResponse) {
rs.Lock()
defer rs.Unlock()
rs.Results[resp.ID] = resp
}
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
type LcNode struct {
ID uint64
Addr string
ReportTime time.Time
IsActive bool
TaskManager *AdminTaskManager
sync.RWMutex
}
func newLcNode(addr, clusterID string) (lcNode *LcNode) {
lcNode = new(LcNode)
lcNode.Addr = addr
lcNode.IsActive = true
lcNode.ReportTime = time.Now()
lcNode.TaskManager = newAdminTaskManager(lcNode.Addr, clusterID)
return
}
func (lcNode *LcNode) clean() {
lcNode.TaskManager.exitCh <- struct{}{}
}
func (lcNode *LcNode) checkLiveness() {
lcNode.Lock()
defer lcNode.Unlock()
log.LogInfof("action[checkLiveness] lcnode[%v, %v, %v] report time[%v], since report time[%v], need gap[%v]",
lcNode.ID, lcNode.Addr, lcNode.IsActive, lcNode.ReportTime, time.Since(lcNode.ReportTime), time.Second*time.Duration(defaultNodeTimeOutSec))
if time.Since(lcNode.ReportTime) > time.Second*time.Duration(defaultNodeTimeOutSec) {
lcNode.IsActive = false
}
return
}
func (lcNode *LcNode) createHeartbeatTask(masterAddr string) (task *proto.AdminTask) {
request := &proto.HeartBeatRequest{
CurrTime: time.Now().Unix(),
MasterAddr: masterAddr,
}
task = proto.NewAdminTask(proto.OpLcNodeHeartbeat, lcNode.Addr, request)
return
}
func (lcNode *LcNode) createLcScanTask(masterAddr string, ruleTask *proto.RuleTask) (task *proto.AdminTask) {
request := &proto.LcNodeRuleTaskRequest{
MasterAddr: masterAddr,
LcNodeAddr: lcNode.Addr,
Task: ruleTask,
}
task = proto.NewAdminTaskEx(proto.OpLcNodeScan, lcNode.Addr, request, ruleTask.Id)
return
}
func (lcNode *LcNode) createSnapshotVerDelTask(masterAddr string, sTask *proto.SnapshotVerDelTask) (task *proto.AdminTask) {
request := &proto.SnapshotVerDelTaskRequest{
MasterAddr: masterAddr,
LcNodeAddr: lcNode.Addr,
Task: sTask,
}
task = proto.NewAdminTaskEx(proto.OpLcNodeSnapshotVerDel, lcNode.Addr, request, request.Task.Id)
return
}
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
func (c *Cluster) handleLcNodeTaskResponse(nodeAddr string, task *proto.AdminTask) {
if task == nil {
log.LogInfof("lc action[handleLcNodeTaskResponse] receive addr[%v] task response, but task is nil", nodeAddr)
return
}
log.LogInfof("lc action[handleLcNodeTaskResponse] receive addr[%v] task: %v", nodeAddr, task.ToString())
var (
err error
lcNode *LcNode
)
if lcNode, err = c.lcNode(nodeAddr); err != nil {
goto errHandler
}
lcNode.TaskManager.DelTask(task)
if err = unmarshalTaskResponse(task); err != nil {
goto errHandler
}
switch task.OpCode {
case proto.OpLcNodeHeartbeat:
response := task.Response.(*proto.LcNodeHeartbeatResponse)
err = c.handleLcNodeHeartbeatResp(task.OperatorAddr, response)
case proto.OpLcNodeScan:
response := task.Response.(*proto.LcNodeRuleTaskResponse)
err = c.handleLcNodeLcScanResp(task.OperatorAddr, response)
case proto.OpLcNodeSnapshotVerDel:
response := task.Response.(*proto.SnapshotVerDelTaskResponse)
err = c.handleLcNodeSnapshotScanResp(task.OperatorAddr, response)
default:
err = fmt.Errorf("lc unknown operate code %v", task.OpCode)
goto errHandler
}
if err != nil {
goto errHandler
}
return
errHandler:
log.LogWarnf("lc handleLcNodeTaskResponse failed, task: %v, err: %v", task.ToString(), err)
return
}
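// handleLcNodeHeartbeatResp refreshes the node's liveness, records its
// working-task counts for both the lifecycle and snapshot managers, merges
// any in-flight task results (without overwriting results already marked
// done), and notifies the schedulers when the node still has spare capacity.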
func (c *Cluster) handleLcNodeHeartbeatResp(nodeAddr string, resp *proto.LcNodeHeartbeatResponse) (err error) {
var lcNode *LcNode
log.LogDebugf("action[handleLcNodeHeartbeatResp] clusterID[%v] receive lcNode[%v] heartbeat", c.Name, nodeAddr)
if resp.Status != proto.TaskSucceeds {
Warn(c.Name, fmt.Sprintf("action[handleLcNodeHeartbeatResp] clusterID[%v] lcNode[%v] heartbeat task failed, err[%v]",
c.Name, nodeAddr, resp.Result))
return
}
if lcNode, err = c.lcNode(nodeAddr); err != nil {
log.LogErrorf("action[handleLcNodeHeartbeatResp], lcNode[%v], heartbeat error: %v", nodeAddr, err.Error())
return
}
lcNode.Lock()
lcNode.IsActive = true
lcNode.ReportTime = time.Now()
lcNode.Unlock()
// update lcNodeStatus
log.LogInfof("action[handleLcNodeHeartbeatResp], lcNode[%v], LcScanningTasks[%v], SnapshotScanningTasks[%v]", nodeAddr, len(resp.LcScanningTasks), len(resp.SnapshotScanningTasks))
c.lcMgr.lcNodeStatus.UpdateNode(nodeAddr, len(resp.LcScanningTasks))
c.snapshotMgr.lcNodeStatus.UpdateNode(nodeAddr, len(resp.SnapshotScanningTasks))
// handle LcScanningTasks
for _, taskRsp := range resp.LcScanningTasks {
c.lcMgr.lcRuleTaskStatus.Lock()
// avoid overwriting Results when handleLcNodeLcScanResp was received before this heartbeat
if c.lcMgr.lcRuleTaskStatus.Results[taskRsp.ID] != nil && c.lcMgr.lcRuleTaskStatus.Results[taskRsp.ID].Done {
log.LogInfof("action[handleLcNodeHeartbeatResp], lcNode[%v] task[%v] already done", nodeAddr, taskRsp.ID)
} else {
t := time.Now()
taskRsp.UpdateTime = &t
c.lcMgr.lcRuleTaskStatus.Results[taskRsp.ID] = taskRsp
}
c.lcMgr.lcRuleTaskStatus.Unlock()
log.LogDebugf("action[handleLcNodeHeartbeatResp], lcNode[%v] taskRsp: %v", nodeAddr, taskRsp)
}
if len(resp.LcScanningTasks) < resp.LcTaskCountLimit {
log.LogInfof("action[handleLcNodeHeartbeatResp], notify idle lcNode[%v], now LcScanningTasks[%v]", nodeAddr, len(resp.LcScanningTasks))
c.lcMgr.notifyIdleLcNode()
}
// handle SnapshotScanningTasks
for _, taskRsp := range resp.SnapshotScanningTasks {
c.snapshotMgr.lcSnapshotTaskStatus.Lock()
// avoid overwriting TaskResults when the snapshot scan response was received before this heartbeat
if c.snapshotMgr.lcSnapshotTaskStatus.TaskResults[taskRsp.ID] != nil && c.snapshotMgr.lcSnapshotTaskStatus.TaskResults[taskRsp.ID].Done {
log.LogInfof("action[handleLcNodeHeartbeatResp], lcNode[%v] snapshot task[%v] already done", nodeAddr, taskRsp.ID)
} else {
t := time.Now()
taskRsp.UpdateTime = &t
c.snapshotMgr.lcSnapshotTaskStatus.TaskResults[taskRsp.ID] = taskRsp
}
c.snapshotMgr.lcSnapshotTaskStatus.Unlock()
log.LogDebugf("action[handleLcNodeHeartbeatResp], lcNode[%v] snapshot taskRsp: %v", nodeAddr, taskRsp)
}
if len(resp.SnapshotScanningTasks) < resp.LcTaskCountLimit {
n := resp.LcTaskCountLimit - len(resp.SnapshotScanningTasks)
log.LogInfof("action[handleLcNodeHeartbeatResp], notify idle lcNode[%v], now SnapshotScanningTasks[%v], notify times[%v]", nodeAddr, len(resp.SnapshotScanningTasks), n)
for i := 0; i < n; i++ {
c.snapshotMgr.notifyIdleLcNode()
}
}
log.LogInfof("action[handleLcNodeHeartbeatResp], lcNode[%v], heartbeat success", nodeAddr)
return
}
func (c *Cluster) handleLcNodeLcScanResp(nodeAddr string, resp *proto.LcNodeRuleTaskResponse) (err error) {
log.LogDebugf("action[handleLcNodeLcScanResp] lcNode[%v] task[%v] Enter", nodeAddr, resp.ID)
defer func() {
log.LogDebugf("action[handleLcNodeLcScanResp] lcNode[%v] task[%v] Exit", nodeAddr, resp.ID)
}()
switch resp.Status {
case proto.TaskFailed:
log.LogWarnf("action[handleLcNodeLcScanResp] scanning failed, resp(%v), no redo", resp)
return
case proto.TaskSucceeds:
c.lcMgr.lcRuleTaskStatus.AddResult(resp)
log.LogInfof("action[handleLcNodeLcScanResp] scanning completed, resp(%v)", resp)
return
default:
log.LogInfof("action[handleLcNodeLcScanResp] scanning received, resp(%v)", resp)
}
return
}
func (c *Cluster) handleLcNodeSnapshotScanResp(nodeAddr string, resp *proto.SnapshotVerDelTaskResponse) (err error) {
log.LogDebugf("action[handleLcNodeSnapshotScanResp] lcNode[%v] task[%v] Enter", nodeAddr, resp.ID)
defer func() {
log.LogDebugf("action[handleLcNodeSnapshotScanResp] lcNode[%v] task[%v] Exit", nodeAddr, resp.ID)
}()
switch resp.Status {
case proto.TaskFailed:
c.snapshotMgr.lcSnapshotTaskStatus.RedoTask(resp.SnapshotVerDelTask)
log.LogErrorf("action[handleLcNodeSnapshotScanResp] scanning failed, resp(%v), redo", resp)
return
case proto.TaskSucceeds:
// 1.mark done for VersionMgr
var vol *Vol
vol, err = c.getVol(resp.VolName)
if err != nil {
log.LogErrorf("action[handleLcNodeSnapshotScanResp] snapshot task(%v) scanning completed by %v, results(%v), volume(%v) is not found",
resp.ID, nodeAddr, resp, resp.VolName)
} else {
_ = vol.VersionMgr.DelVer(resp.VerSeq)
}
// 2. mark done for snapshotMgr
c.snapshotMgr.lcSnapshotTaskStatus.AddResult(resp)
log.LogInfof("action[handleLcNodeSnapshotScanResp] scanning completed, resp(%v)", resp)
return
default:
log.LogInfof("action[handleLcNodeSnapshotScanResp] scanning received, resp(%v)", resp)
}
return
}
package master
import (
"encoding/json"
"fmt"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
type UidSpaceManager struct {
volName string
mpSpaceMetrics map[uint64][]*proto.UidReportSpaceInfo
uidInfo map[uint32]*proto.UidSpaceInfo
c *Cluster
vol *Vol
sync.RWMutex
}
type UidSpaceFsm struct {
UidSpaceArr []*proto.UidSpaceInfo
}
func (vol *Vol) initUidSpaceManager(c *Cluster) {
vol.uidSpaceManager = &UidSpaceManager{
c: c,
vol: vol,
volName: vol.Name,
mpSpaceMetrics: make(map[uint64][]*proto.UidReportSpaceInfo),
uidInfo: make(map[uint32]*proto.UidSpaceInfo),
}
}
func (uMgr *UidSpaceManager) addUid(uid uint32, size uint64) bool {
uMgr.Lock()
uMgr.uidInfo[uid] = &proto.UidSpaceInfo{
LimitSize: size,
VolName: uMgr.volName,
Uid: uid,
Enabled: true,
}
uMgr.persist()
uMgr.Unlock()
uMgr.listAll()
return true
}
func (uMgr *UidSpaceManager) removeUid(uid uint32) bool {
uMgr.Lock()
defer uMgr.Unlock()
if _, ok := uMgr.uidInfo[uid]; !ok {
log.LogErrorf("UidSpaceManager.vol %v del %v failed", uMgr.volName, uid)
return true
}
uMgr.uidInfo[uid].Enabled = false
uMgr.uidInfo[uid].Limited = false
uMgr.persist()
log.LogDebugf("UidSpaceManager.vol %v del %v success", uMgr.volName, uid)
return true
}
func (uMgr *UidSpaceManager) checkUid(uid uint32) (ok bool, uidInfo *proto.UidSpaceInfo) {
uMgr.RLock()
defer uMgr.RUnlock()
uidInfo, ok = uMgr.uidInfo[uid]
return
}
func (uMgr *UidSpaceManager) listAll() (rsp []*proto.UidSpaceInfo) {
uMgr.RLock()
defer uMgr.RUnlock()
log.LogDebugf("UidSpaceManager. listAll vol %v, info %v", uMgr.volName, len(uMgr.uidInfo))
for _, t := range uMgr.uidInfo {
log.LogDebugf("UidSpaceManager. listAll vol %v, uid %v, info %v", t.VolName, t.Uid, t)
rsp = append(rsp, t)
}
return
}
func (uMgr *UidSpaceManager) persist() (err error) {
log.LogDebugf("vol %v UidSpaceManager persist", uMgr.volName)
var uidFsm UidSpaceFsm
for _, t := range uMgr.uidInfo {
uidFsm.UidSpaceArr = append(uidFsm.UidSpaceArr, t)
}
var val []byte
if val, err = json.Marshal(uidFsm); err != nil {
log.LogErrorf("UidSpaceManager vol %v uid persist error %v", uMgr.vol.Name, err)
return
}
if err = uMgr.c.syncUidSpaceList(uMgr.vol, val); err != nil {
log.LogErrorf("UidSpaceManager vol %v uid persist syncUidList error %v", uMgr.vol.Name, err)
return
}
return
}
func (uMgr *UidSpaceManager) load(c *Cluster, val []byte) (err error) {
log.LogDebugf("vol %v UidSpaceManager load", uMgr.volName)
uMgr.c = c
uidFsm := &UidSpaceFsm{}
if err = json.Unmarshal(val, uidFsm); err != nil {
log.LogErrorf("UidSpaceManager vol %v Unmarshal error %v", uMgr.volName, err)
return
}
for _, info := range uidFsm.UidSpaceArr {
uMgr.uidInfo[info.Uid] = info
log.LogDebugf("vol %v uid %v load usedSize %v limit %v enabled %v", uMgr.volName, info.Uid, info.UsedSize, info.LimitSize, info.Limited)
}
return
}
func (uMgr *UidSpaceManager) getSpaceOp() (rsp []*proto.UidSpaceInfo) {
uMgr.RLock()
defer uMgr.RUnlock()
for _, info := range uMgr.uidInfo {
rsp = append(rsp, info)
log.LogDebugf("getSpaceOp. vol %v uid %v enabled %v", info.VolName, info.Uid, info.Limited)
}
return
}
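// volUidUpdate recalculates per-uid space usage from the latest reports of
// the meta partition leaders and flips the Limited flag for any uid whose
// aggregated UsedSize exceeds its configured LimitSize.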
func (uMgr *UidSpaceManager) volUidUpdate(report *proto.MetaPartitionReport) {
if !report.IsLeader {
return
}
uMgr.Lock()
defer uMgr.Unlock()
id := report.PartitionID
uMgr.mpSpaceMetrics[id] = report.UidInfo
log.LogDebugf("vol %v volUidUpdate.mpID %v set uid %v. uid list size %v", uMgr.volName, id, report.UidInfo, len(uMgr.uidInfo))
for _, info := range uMgr.uidInfo {
info.UsedSize = 0
}
uidInfo := make(map[uint32]*proto.UidSpaceInfo)
for mpId, info := range uMgr.mpSpaceMetrics {
log.LogDebugf("vol %v volUidUpdate. reCalc mpId %v info %v", uMgr.volName, mpId, len(info))
for _, space := range info {
if _, ok := uMgr.uidInfo[space.Uid]; !ok {
log.LogDebugf("vol %v volUidUpdate.uid %v not found", uMgr.volName, space.Uid)
uMgr.uidInfo[space.Uid] = &proto.UidSpaceInfo{
VolName: uMgr.volName,
Uid: space.Uid,
CTime: time.Now().Unix(),
}
}
if _, ok := uidInfo[space.Uid]; !ok {
uidInfo[space.Uid] = &(*uMgr.uidInfo[space.Uid])
}
log.LogDebugf("volUidUpdate.vol %v uid %v from mpId %v useSize %v add %v", uMgr.vol, space.Uid, mpId, uidInfo[space.Uid].UsedSize, space.Size)
uidInfo[space.Uid].UsedSize += space.Size
if !uidInfo[space.Uid].Enabled {
uidInfo[space.Uid].Limited = false
continue
}
if uidInfo[space.Uid].UsedSize > uMgr.uidInfo[space.Uid].LimitSize {
uidInfo[space.Uid].Limited = true
log.LogWarnf("volUidUpdate.vol %v uid %v from mpId %v useSize %v add %v", uMgr.vol, space.Uid, mpId, uidInfo[space.Uid].UsedSize, space.Size)
} else {
uidInfo[space.Uid].Limited = false
log.LogWarnf("volUidUpdate.vol %v uid %v from mpId %v useSize %v add %v", uMgr.vol, space.Uid, mpId, uidInfo[space.Uid].UsedSize, space.Size)
}
}
}
log.LogDebugf("vol %v volUidUpdate.mpID %v set uid %v. uid list size %v", uMgr.volName, id, report.UidInfo, len(uMgr.uidInfo))
for _, info := range uidInfo {
if _, ok := uMgr.uidInfo[info.Uid]; !ok {
log.LogErrorf("volUidUpdate.uid %v not found", info.Uid)
continue
}
uMgr.uidInfo[info.Uid] = info
}
for _, info := range uMgr.uidInfo {
if info.UsedSize == 0 {
info.Limited = false
}
}
log.LogDebugf("volUidUpdate.mpID %v set uid %v. uid list size %v", id, report.UidInfo, len(uMgr.uidInfo))
}
type ServerFactorLimit struct {
Name string
Type uint32
Total uint64
Buffer uint64 // preallocated flow buffer, set equal to the flow total when the limit is updated
CliUsed uint64
CliNeed uint64
Allocated uint64
NeedAfterAlloc uint64
magnify uint32 // magnification factor applied when clients request more allocation
LimitRate float32
LastMagnify uint64
requestCh chan interface{}
done chan interface{}
qosManager *QosCtrlManager
}
type ClientReportOutput struct {
ID uint64
FactorMap map[uint32]*proto.ClientLimitInfo
Host string
Status uint8
}
type LimitOutput struct {
ID uint64
Enable bool
ReqPeriod uint32
HitTriggerCnt uint8
FactorMap map[uint32]*proto.ClientLimitInfo
}
type ClientInfoOutput struct {
Cli *ClientReportOutput
Assign *LimitOutput
Time time.Time
ID uint64
Host string
}
type ClientInfoMgr struct {
Cli *proto.ClientReportLimitInfo
Assign *proto.LimitRsp2Client
Time time.Time
ID uint64
Host string
}
type qosRequestArgs struct {
clientID uint64
factorType uint32
clientReq *proto.ClientLimitInfo
lastClientInfo *proto.ClientLimitInfo
assignInfo *proto.ClientLimitInfo
rsp2Client *proto.ClientLimitInfo
wg *sync.WaitGroup
}
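// QosCtrlManager coordinates volume QoS between clients and the master:
// clients periodically report used/needed quota per factor type, the manager
// aggregates the reports per factor (serverFactorLimitMap), derives a limit
// rate when demand exceeds the configured total, and hands each client a new
// assignment on its next report.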
type QosCtrlManager struct {
cliInfoMgrMap map[uint64]*ClientInfoMgr // clientID -> client report info and assigned limit info
serverFactorLimitMap map[uint32]*ServerFactorLimit // vol qos data for iops w/r and flow w/r
defaultClientCnt uint32
qosEnable bool
ClientReqPeriod uint32
ClientHitTriggerCnt uint32
vol *Vol
sync.RWMutex
}
func (qosManager *QosCtrlManager) volUpdateMagnify(magnifyArgs *qosArgs) {
defer qosManager.Unlock()
qosManager.Lock()
log.LogWarnf("action[volUpdateMagnify] vol %v try set magnify iopsRVal[%v],iopsWVal[%v],flowRVal[%v],flowWVal[%v]",
qosManager.vol.Name, magnifyArgs.iopsRVal, magnifyArgs.iopsWVal, magnifyArgs.flowRVal, magnifyArgs.flowWVal)
arrMagnify := [4]uint64{magnifyArgs.iopsRVal, magnifyArgs.iopsWVal, magnifyArgs.flowRVal, magnifyArgs.flowWVal}
for i := proto.IopsReadType; i <= proto.FlowWriteType; i++ {
magnify := qosManager.serverFactorLimitMap[i].magnify
if uint64(magnify) != arrMagnify[i-1] && arrMagnify[i-1] > 0 {
qosManager.serverFactorLimitMap[i].magnify = uint32(arrMagnify[i-1])
log.LogWarnf("action[volUpdateMagnify] vol %v after update type [%v] magnify [%v] to [%v]",
qosManager.vol.Name, proto.QosTypeString(i), magnify, arrMagnify[i-1])
}
}
}
func (qosManager *QosCtrlManager) volUpdateLimit(limitArgs *qosArgs) {
defer qosManager.Unlock()
qosManager.Lock()
log.LogWarnf("action[volUpdateLimit] vol %v try set limit iopsrlimit[%v],iopswlimit[%v],flowrlimit[%v],flowwlimit[%v]",
qosManager.vol.Name, limitArgs.iopsRVal, limitArgs.iopsWVal, limitArgs.flowRVal, limitArgs.flowWVal)
//if limitArgs.iopsWVal != 0 {
// qosManager.serverFactorLimitMap[proto.IopsWriteType].Total = limitArgs.iopsWVal
// qosManager.serverFactorLimitMap[proto.IopsWriteType].LastMagnify = 0
//}
//if limitArgs.iopsRVal != 0 {
// qosManager.serverFactorLimitMap[proto.IopsReadType].Total = limitArgs.iopsRVal
// qosManager.serverFactorLimitMap[proto.IopsWriteType].LastMagnify = 0
//}
if limitArgs.flowWVal != 0 {
qosManager.serverFactorLimitMap[proto.FlowWriteType].Total = limitArgs.flowWVal
qosManager.serverFactorLimitMap[proto.FlowWriteType].LastMagnify = 0
qosManager.serverFactorLimitMap[proto.FlowWriteType].Buffer = limitArgs.flowWVal
}
if limitArgs.flowRVal != 0 {
qosManager.serverFactorLimitMap[proto.FlowReadType].Total = limitArgs.flowRVal
qosManager.serverFactorLimitMap[proto.FlowReadType].LastMagnify = 0
qosManager.serverFactorLimitMap[proto.FlowReadType].Buffer = limitArgs.flowRVal
}
for i := proto.IopsReadType; i <= proto.FlowWriteType; i++ {
limitf := qosManager.serverFactorLimitMap[i]
log.LogWarnf("action[volUpdateLimit] vol [%v] after set type [%v] [%v,%v,%v,%v]",
qosManager.vol.Name, proto.QosTypeString(i), limitf.Allocated, limitf.NeedAfterAlloc, limitf.Total, limitf.Buffer)
}
}
func (qosManager *QosCtrlManager) getQosMagnify(factorType uint32) uint32 {
return qosManager.serverFactorLimitMap[factorType].magnify
}
func (qosManager *QosCtrlManager) getQosLimit(factorType uint32) uint64 {
return qosManager.serverFactorLimitMap[factorType].Total
}
func (qosManager *QosCtrlManager) initClientQosInfo(clientID uint64, host string) (limitRsp2Client *proto.LimitRsp2Client, err error) {
log.QosWriteDebugf("action[initClientQosInfo] vol %v clientID %v Host %v", qosManager.vol.Name, clientID, host)
clientInitInfo := proto.NewClientReportLimitInfo()
cliCnt := qosManager.defaultClientCnt
if cliCnt <= proto.QosDefaultClientCnt {
cliCnt = proto.QosDefaultClientCnt
}
if len(qosManager.cliInfoMgrMap) > int(cliCnt) {
cliCnt = uint32(len(qosManager.cliInfoMgrMap))
}
limitRsp2Client = proto.NewLimitRsp2Client()
limitRsp2Client.ID = clientID
limitRsp2Client.Enable = qosManager.qosEnable
factorType := proto.IopsReadType
defer qosManager.Unlock()
qosManager.Lock()
for factorType <= proto.FlowWriteType {
var initLimit uint64
serverLimit := qosManager.serverFactorLimitMap[factorType]
if qosManager.qosEnable {
initLimit = serverLimit.Total / uint64(cliCnt)
if serverLimit.Buffer > initLimit {
serverLimit.Buffer -= initLimit
serverLimit.Allocated += initLimit
} else {
initLimit = serverLimit.Buffer
serverLimit.Allocated += initLimit
serverLimit.Buffer = 0
}
if factorType == proto.FlowWriteType || factorType == proto.FlowReadType {
if initLimit > 1*util.GB/8 {
initLimit = 1 * util.GB / 8
}
} else {
if initLimit > 200 {
initLimit = 200
}
}
}
clientInitInfo.FactorMap[factorType] = &proto.ClientLimitInfo{
UsedLimit: initLimit,
UsedBuffer: 0,
Used: 0,
Need: 0,
}
limitRsp2Client.Magnify[factorType] = serverLimit.magnify
limitRsp2Client.FactorMap[factorType] = clientInitInfo.FactorMap[factorType]
log.QosWriteDebugf("action[initClientQosInfo] vol [%v] clientID [%v] factorType [%v] init client info and set limitRsp2Client [%v]"+
"server total[%v] used [%v] buffer [%v]",
qosManager.vol.Name, clientID, proto.QosTypeString(factorType),
initLimit, serverLimit.Total, serverLimit.Allocated, serverLimit.Buffer)
factorType++
}
qosManager.cliInfoMgrMap[clientID] = &ClientInfoMgr{
Cli: clientInitInfo,
Assign: limitRsp2Client,
Time: time.Now(),
ID: clientID,
Host: host,
}
log.QosWriteDebugf("action[initClientQosInfo] vol [%v] clientID [%v] Assign [%v]", qosManager.vol.Name, clientID, limitRsp2Client)
return
}
func (serverLimit *ServerFactorLimit) String() string {
return fmt.Sprintf("serverLimit {total:[%v],alloc:(allocated:[%v],need:[%v],buffer:[%v]),limit:(limitrate:[%v], magnify:[%v]),client sum {used:[%v], need:[%v]}}",
serverLimit.Total, serverLimit.Allocated, serverLimit.NeedAfterAlloc, serverLimit.Buffer,
serverLimit.LimitRate, serverLimit.LastMagnify,
serverLimit.CliUsed, serverLimit.CliNeed)
}
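// getDstLimit scales the client's demand (used + need) by a tier-dependent
// factor: small demands are doubled, larger ones grow by 1.5x, 1.2x or 1.1x,
// and very large ones only by a fixed increment. For example (numbers are
// illustrative), an iops factor with used=300 and need=100 falls into the
// "< 500" tier and yields a destination limit of 600.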
func (serverLimit *ServerFactorLimit) getDstLimit(factorType uint32, used, need uint64) (dstLimit uint64) {
if factorType == proto.FlowWriteType || factorType == proto.FlowReadType {
if need > used {
need = used
}
if (need + used) < 10*util.MB/8 {
dstLimit = uint64(float64(need+used) * 2)
} else if (need + used) < 50*util.MB/8 {
dstLimit = uint64(float64(need+used) * 1.5)
} else if (need + used) < 100*util.MB/8 {
dstLimit = uint64(float64(need+used) * 1.2)
} else if (need + used) < 1*util.GB/8 {
dstLimit = uint64(float64(need+used) * 1.1)
} else {
dstLimit = uint64(float64(need+used) + 1*util.GB/8)
}
} else {
if (need + used) < 100 {
dstLimit = uint64(float64(need+used) * 2)
} else if (need + used) < 500 {
dstLimit = uint64(float64(need+used) * 1.5)
} else if (need + used) < 1000 {
dstLimit = uint64(float64(need+used) * 1.2)
} else if (need + used) < 5000 {
dstLimit = uint64(float64(need+used) * 1.2)
} else {
dstLimit = uint64(float64(need+used) + 1000)
}
}
return
}
func (serverLimit *ServerFactorLimit) dispatch() {
for {
select {
case request := <-serverLimit.requestCh:
serverLimit.updateLimitFactor(request)
case <-serverLimit.done:
log.LogErrorf("done ServerFactorLimit type (%v)", serverLimit.Type)
return
}
}
}
// handle a client request and respond with extra quota when the buffer is sufficient, according to the allocation rules
func (serverLimit *ServerFactorLimit) updateLimitFactor(req interface{}) {
request := req.(*qosRequestArgs)
clientID := request.clientID
factorType := request.factorType
clientReq := request.clientReq
assignInfo := request.assignInfo
rsp2Client := request.rsp2Client
lastClientInfo := request.lastClientInfo
log.QosWriteDebugf("action[updateLimitFactor] vol [%v] clientID [%v] type [%v],client report [%v,%v,%v,%v] last client report [%v,%v,%v,%v] periodically cal Assign [%v,%v]",
serverLimit.qosManager.vol.Name, clientID, proto.QosTypeString(factorType),
clientReq.Used, clientReq.Need, clientReq.UsedLimit, clientReq.UsedBuffer,
lastClientInfo.Used, lastClientInfo.Need, lastClientInfo.UsedLimit, lastClientInfo.UsedBuffer,
assignInfo.UsedLimit, assignInfo.UsedBuffer)
rsp2Client.UsedLimit = assignInfo.UsedLimit
rsp2Client.UsedBuffer = assignInfo.UsedBuffer
// flow limit and buffer are not enough; the client needs more
if (clientReq.Need + clientReq.Used) > (assignInfo.UsedLimit + assignInfo.UsedBuffer) {
log.QosWriteDebugf("action[updateLimitFactor] vol [%v] clientID [%v] type [%v], need [%v] used [%v], used limit [%v]",
serverLimit.qosManager.vol.Name, clientID, proto.QosTypeString(factorType), clientReq.Need, clientReq.Used, clientReq.UsedLimit)
dstLimit := serverLimit.getDstLimit(factorType, clientReq.Used, clientReq.Need)
// Assign already allocated the buffer for client
if dstLimit > assignInfo.UsedLimit+assignInfo.UsedBuffer {
additionBuffer := dstLimit - assignInfo.UsedLimit - assignInfo.UsedBuffer
// if buffer is available, the overall balance must not be affected; use as much of the buffer as possible
if serverLimit.Buffer > 0 {
log.QosWriteDebugf("action[updateLimitFactor] vol [%v] clientID [%v] type [%v] client need more buffer [%v] serverlimit buffer [%v] used [%v]",
serverLimit.qosManager.vol.Name, clientID, proto.QosTypeString(factorType),
additionBuffer, serverLimit.Buffer, serverLimit.Allocated)
// calc dst buffer for client to expand
// Allocated is normally greater than zero here because dstLimit is non-zero and counts toward it; the zero case is still handled below
var dstUsedBuffer uint64
if serverLimit.Allocated != 0 {
dstUsedBuffer = uint64(float64(dstLimit) * (float64(serverLimit.Buffer) / float64(serverLimit.Allocated)) * 0.5)
if dstUsedBuffer > dstLimit {
dstUsedBuffer = dstLimit
}
} else {
dstUsedBuffer = dstLimit
}
if assignInfo.UsedBuffer < dstUsedBuffer {
additionBuffer = dstUsedBuffer - assignInfo.UsedBuffer
if additionBuffer > serverLimit.Buffer {
rsp2Client.UsedBuffer += serverLimit.Buffer
assignInfo.UsedBuffer = rsp2Client.UsedBuffer
serverLimit.Allocated += serverLimit.Buffer
serverLimit.Buffer = 0
} else {
rsp2Client.UsedBuffer = dstUsedBuffer
assignInfo.UsedBuffer = dstUsedBuffer
serverLimit.Buffer -= additionBuffer
serverLimit.Allocated += additionBuffer
}
}
}
}
}
log.QosWriteDebugf("action[updateLimitFactor] vol [%v] [clientID [%v] type [%v] rsp2Client.UsedLimit [%v], UsedBuffer [%v]",
serverLimit.qosManager.vol.Name, clientID, proto.QosTypeString(factorType), rsp2Client.UsedLimit, rsp2Client.UsedBuffer)
request.wg.Done()
}
func (qosManager *QosCtrlManager) init(cluster *Cluster, host string) (limit *proto.LimitRsp2Client, err error) {
log.QosWriteDebugf("action[qosManage.init] vol [%v] Host %v", qosManager.vol.Name, host)
var id uint64
if id, err = cluster.idAlloc.allocateClientID(); err == nil {
return qosManager.initClientQosInfo(id, host)
}
return
}
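// HandleClientQosReq processes a periodic client QoS report. If QoS is
// disabled it simply echoes the client's own usage back as its limit.
// Otherwise it fans out one request per factor type to the corresponding
// ServerFactorLimit dispatch goroutine and waits for all of them before
// returning the new assignment to the client.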
func (qosManager *QosCtrlManager) HandleClientQosReq(reqClientInfo *proto.ClientReportLimitInfo, clientID uint64) (limitRsp *proto.LimitRsp2Client, err error) {
log.QosWriteDebugf("action[HandleClientQosReq] vol [%v] reqClientInfo from [%v], enable [%v]",
qosManager.vol.Name, clientID, qosManager.qosEnable)
qosManager.RLock()
clientInfo, lastExist := qosManager.cliInfoMgrMap[clientID]
if !lastExist || reqClientInfo == nil {
qosManager.RUnlock()
host := ""
if reqClientInfo != nil {
host = reqClientInfo.Host
}
log.LogWarnf("action[HandleClientQosReq] vol [%v] id [%v] addr [%v] not exist", qosManager.vol.Name, clientID, host)
return qosManager.initClientQosInfo(clientID, host)
}
qosManager.RUnlock()
limitRsp = proto.NewLimitRsp2Client()
limitRsp.Enable = qosManager.qosEnable
limitRsp.ID = reqClientInfo.ID
limitRsp.ReqPeriod = qosManager.ClientReqPeriod
limitRsp.HitTriggerCnt = uint8(qosManager.ClientHitTriggerCnt)
if !qosManager.qosEnable {
clientInfo.Cli = reqClientInfo
limitRsp.FactorMap = reqClientInfo.FactorMap
clientInfo.Assign = limitRsp
clientInfo.Time = time.Now()
for i := proto.IopsReadType; i <= proto.FlowWriteType; i++ {
reqClientInfo.FactorMap[i].UsedLimit = reqClientInfo.FactorMap[i].Used
reqClientInfo.FactorMap[i].UsedBuffer = reqClientInfo.FactorMap[i].Need
log.QosWriteDebugf("action[HandleClientQosReq] vol [%v] [%v,%v,%v,%v]", qosManager.vol.Name,
reqClientInfo.FactorMap[i].Used,
reqClientInfo.FactorMap[i].Need,
reqClientInfo.FactorMap[i].UsedLimit,
reqClientInfo.FactorMap[i].UsedBuffer)
}
return
}
index := 0
wg := &sync.WaitGroup{}
wg.Add(len(reqClientInfo.FactorMap))
for factorType, clientFactor := range reqClientInfo.FactorMap {
limitRsp.FactorMap[factorType] = &proto.ClientLimitInfo{}
serverLimit := qosManager.serverFactorLimitMap[factorType]
limitRsp.Magnify[factorType] = serverLimit.magnify
request := &qosRequestArgs{
clientID: clientID,
factorType: factorType,
clientReq: clientFactor,
lastClientInfo: clientInfo.Cli.FactorMap[factorType],
assignInfo: clientInfo.Assign.FactorMap[factorType],
rsp2Client: limitRsp.FactorMap[factorType],
wg: wg,
}
serverLimit.requestCh <- request
index++
}
wg.Wait()
clientInfo.Cli = reqClientInfo
clientInfo.Assign = limitRsp
clientInfo.Time = time.Now()
return
}
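// updateServerLimitByClientsInfo aggregates the latest client reports for
// one factor type, recomputes the free server buffer, and derives LimitRate,
// the fraction of aggregate demand that cannot be satisfied by the configured
// total (adjusted by LastMagnify to close the gap between assigned and
// actually used quota).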
func (qosManager *QosCtrlManager) updateServerLimitByClientsInfo(factorType uint32) {
var (
cliSum proto.ClientLimitInfo
nextStageNeed, nextStageUse uint64
)
qosManager.RLock()
serverLimit := qosManager.serverFactorLimitMap[factorType]
log.QosWriteDebugf("action[updateServerLimitByClientsInfo] vol [%v] type [%v] last limitInfo(%v)",
qosManager.vol.Name, proto.QosTypeString(factorType), serverLimit)
// sum usage and demand across all client reports
for id, cliInfo := range qosManager.cliInfoMgrMap {
cliFactor := cliInfo.Cli.FactorMap[factorType]
cliSum.Used += cliFactor.Used
cliSum.Need += cliFactor.Need
cliSum.UsedLimit += cliFactor.UsedLimit
cliSum.UsedBuffer += cliFactor.UsedBuffer
log.QosWriteDebugf("action[updateServerLimitByClientsInfo] vol [%v] Host [%v] type [%v] used [%v] need [%v] limit [%v] buffer [%v]",
qosManager.vol.Name, host, proto.QosTypeString(factorType),
cliFactor.Used, cliFactor.Need, cliFactor.UsedLimit, cliFactor.UsedBuffer)
}
serverLimit.CliUsed = cliSum.Used
serverLimit.CliNeed = cliSum.Need
qosManager.RUnlock()
if !qosManager.qosEnable {
return
}
serverLimit.Buffer = 0
nextStageUse = cliSum.Used
nextStageNeed = cliSum.Need
if serverLimit.Total >= nextStageUse {
serverLimit.Buffer = serverLimit.Total - nextStageUse
log.QosWriteDebugf("action[updateServerLimitByClientsInfo] vol [%v] reset server buffer [%v] all clients nextStageUse [%v]",
qosManager.vol.Name, serverLimit.Buffer, nextStageUse)
if nextStageNeed > serverLimit.Buffer {
nextStageNeed -= serverLimit.Buffer
nextStageUse += serverLimit.Buffer
serverLimit.Buffer = 0
log.QosWriteDebugf("action[updateServerLimitByClientsInfo] vol [%v] reset server buffer [%v] all clients nextStageNeed [%v] too much",
qosManager.vol.Name, serverLimit.Buffer, nextStageNeed)
} else {
serverLimit.Buffer -= nextStageNeed
log.QosWriteDebugf("action[updateServerLimitByClientsInfo] vol [%v] reset server buffer [%v] all clients nextStageNeed [%v]",
qosManager.vol.Name, serverLimit.Buffer, nextStageNeed)
nextStageUse += nextStageNeed
nextStageNeed = 0
}
} else { // usage is larger than the limit
log.QosWriteDebugf("action[updateServerLimitByClientsInfo] vol[%v] type [%v] clients needs [%v] plus overuse [%v],get nextStageNeed [%v]",
qosManager.vol.Name, proto.QosTypeString(factorType), nextStageNeed, nextStageUse-serverLimit.Total,
nextStageNeed+nextStageUse-serverLimit.Total)
nextStageNeed += nextStageUse - serverLimit.Total
nextStageUse = serverLimit.Total
}
serverLimit.Allocated = nextStageUse
serverLimit.NeedAfterAlloc = nextStageNeed
// compute the limit rate; the remaining need should be zero if total usage can still increase
serverLimit.LimitRate = 0
if serverLimit.NeedAfterAlloc > 0 {
serverLimit.LimitRate = float32(float64(serverLimit.NeedAfterAlloc) / float64(serverLimit.Allocated+serverLimit.NeedAfterAlloc))
log.QosWriteDebugf("action[updateServerLimitByClientsInfo] vol [%v] type [%v] alloc not enough need limitRatio serverLimit:(%v)",
qosManager.vol.Name, proto.QosTypeString(factorType), serverLimit)
lastMagnify := serverLimit.LastMagnify
lastLimitRatio := serverLimit.LimitRate
// the limit and buffer assigned by the master were not used as expected; adjust for the gap
if serverLimit.CliUsed < serverLimit.Total {
if serverLimit.LimitRate > -10.0 && serverLimit.LastMagnify < serverLimit.Total*10 {
serverLimit.LastMagnify += uint64(float64(serverLimit.Total-serverLimit.CliUsed) * 0.1)
}
} else {
if serverLimit.LastMagnify > 0 {
var magnify uint64
if serverLimit.LastMagnify > (serverLimit.CliUsed - serverLimit.Total) {
magnify = serverLimit.CliUsed - serverLimit.Total
} else {
magnify = serverLimit.LastMagnify
}
serverLimit.LastMagnify -= uint64(float32(magnify) * 0.1)
}
}
serverLimit.LimitRate = serverLimit.LimitRate * float32(1-float64(serverLimit.LastMagnify)/float64(serverLimit.Allocated+serverLimit.NeedAfterAlloc))
log.QosWriteDebugf("action[updateServerLimitByClientsInfo] vol [%v] type [%v] limitRatio [%v] updated to limitRatio [%v] by magnify [%v] lastMagnify [%v]",
qosManager.vol.Name, proto.QosTypeString(factorType),
lastLimitRatio, serverLimit.LimitRate, serverLimit.LastMagnify, lastMagnify)
} else {
serverLimit.LastMagnify = 0
}
log.QosWriteDebugf("action[updateServerLimitByClientsInfo] vol [%v] type [%v] after adjust limitRatio serverLimit:(%v)",
qosManager.vol.Name, proto.QosTypeString(factorType), serverLimit)
return
}
func (qosManager *QosCtrlManager) assignClientsNewQos(factorType uint32) {
qosManager.RLock()
if !qosManager.qosEnable {
qosManager.RUnlock()
return
}
serverLimit := qosManager.serverFactorLimitMap[factorType]
var bufferAllocated uint64
// recalculate client Assign limit and buffer
for _, cliInfoMgr := range qosManager.cliInfoMgrMap {
cliInfo := cliInfoMgr.Cli.FactorMap[factorType]
assignInfo := cliInfoMgr.Assign.FactorMap[factorType]
if cliInfo.Used+cliInfo.Need == 0 {
assignInfo.UsedLimit = 0
assignInfo.UsedBuffer = 0
} else {
assignInfo.UsedLimit = uint64(float64(cliInfo.Used+cliInfo.Need) * float64(1-serverLimit.LimitRate))
if serverLimit.Allocated != 0 {
assignInfo.UsedBuffer = uint64(float64(serverLimit.Buffer) * (float64(assignInfo.UsedLimit) / float64(serverLimit.Allocated)) * 0.5)
}
// the remaining buffer may be quite large; do not hand it all out, and cap a client's buffer at its used limit
if assignInfo.UsedBuffer > assignInfo.UsedLimit {
assignInfo.UsedBuffer = assignInfo.UsedLimit
}
}
bufferAllocated += assignInfo.UsedBuffer
}
qosManager.RUnlock()
if serverLimit.Buffer > bufferAllocated {
serverLimit.Buffer -= bufferAllocated
} else {
serverLimit.Buffer = 0
log.LogWarnf("action[assignClientsNewQos] vol [%v] type [%v] clients buffer [%v] and server buffer used up trigger flow limit overall",
qosManager.vol.Name, proto.QosTypeString(factorType), bufferAllocated)
}
log.QosWriteDebugf("action[assignClientsNewQos] vol [%v] type [%v] serverLimit buffer:[%v] used:[%v] need:[%v] total:[%v]",
qosManager.vol.Name, proto.QosTypeString(factorType),
serverLimit.Buffer, serverLimit.Allocated, serverLimit.NeedAfterAlloc, serverLimit.Total)
}
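// checkQos runs periodically for a volume: it expires clients that have not
// reported for 20 seconds, recomputes the per-factor server limits from the
// latest reports, and, when QoS is enabled, pushes new assignments to the
// remaining clients.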
func (vol *Vol) checkQos() {
vol.qosManager.Lock()
// delete clients that have not reported for a long time from the map
tTime := time.Now()
for id, cli := range vol.qosManager.cliInfoMgrMap {
if cli.Time.Add(20 * time.Second).Before(tTime) {
log.LogWarnf("action[checkQos] vol [%v] Id [%v] addr [%v] be delete in case of long time no request",
vol.Name, id, cli.Host)
delete(vol.qosManager.cliInfoMgrMap, id)
}
}
vol.qosManager.Unlock()
// periodically recompute the server limits and the assigned limit info for all clients,
// based on the latest client reports and the qos control settings
for factorType := proto.IopsReadType; factorType <= proto.FlowWriteType; factorType++ {
// aggregate all clients to get the real used and need values; the used value should be less than the total
vol.qosManager.updateServerLimitByClientsInfo(factorType)
// update client assign info by result above
if !vol.qosManager.qosEnable {
continue
}
vol.qosManager.assignClientsNewQos(factorType)
serverLimit := vol.qosManager.serverFactorLimitMap[factorType]
log.QosWriteDebugf("action[UpdateAllQosInfo] vol name [%v] type [%v] after updateServerLimitByClientsInfo get limitRate:[%v] "+
"server total [%v] beAllocated [%v] NeedAfterAlloc [%v] buffer [%v]",
vol.Name, proto.QosTypeString(factorType), serverLimit.LimitRate,
serverLimit.Total, serverLimit.Allocated, serverLimit.NeedAfterAlloc, serverLimit.Buffer)
}
}
func (vol *Vol) getQosStatus(cluster *Cluster) interface{} {
type qosStatus struct {
ServerFactorLimitMap map[uint32]*ServerFactorLimit // vol qos data for iops w/r and flow w/r
QosEnable bool
ClientReqPeriod uint32
ClientHitTriggerCnt uint32
ClusterMaxUploadCnt uint32
ClientALiveCnt int
}
vol.qosManager.RLock()
defer vol.qosManager.RUnlock()
return &qosStatus{
ServerFactorLimitMap: map[uint32]*ServerFactorLimit{
proto.FlowReadType: vol.qosManager.serverFactorLimitMap[proto.FlowReadType],
proto.FlowWriteType: vol.qosManager.serverFactorLimitMap[proto.FlowWriteType],
},
QosEnable: vol.qosManager.qosEnable,
ClientReqPeriod: vol.qosManager.ClientReqPeriod,
ClientHitTriggerCnt: vol.qosManager.ClientHitTriggerCnt,
ClusterMaxUploadCnt: uint32(cluster.QosAcceptLimit.Limit()),
ClientALiveCnt: len(vol.qosManager.cliInfoMgrMap),
}
}
func (vol *Vol) getClientLimitInfo(id uint64, ip string) (interface{}, error) {
log.QosWriteDebugf("action[getClientLimitInfo] vol [%v] id [%v] ip [%v]", vol.Name, id, ip)
vol.qosManager.RLock()
defer vol.qosManager.RUnlock()
assignFuc := func(info *ClientInfoMgr) (rspInfo *ClientInfoOutput) {
rspInfo = &ClientInfoOutput{
Cli: &ClientReportOutput{
ID: info.Cli.ID,
Status: info.Cli.Status,
FactorMap: make(map[uint32]*proto.ClientLimitInfo, 0),
},
Assign: &LimitOutput{
ID: info.Assign.ID,
Enable: info.Assign.Enable,
ReqPeriod: info.Assign.ReqPeriod,
HitTriggerCnt: info.Assign.HitTriggerCnt,
FactorMap: make(map[uint32]*proto.ClientLimitInfo, 0),
},
Time: info.Time,
Host: info.Host,
ID: info.ID,
}
rspInfo.Cli.FactorMap[proto.FlowReadType] = info.Cli.FactorMap[proto.FlowReadType]
rspInfo.Cli.FactorMap[proto.FlowWriteType] = info.Cli.FactorMap[proto.FlowWriteType]
rspInfo.Assign.FactorMap[proto.FlowReadType] = info.Assign.FactorMap[proto.FlowReadType]
rspInfo.Assign.FactorMap[proto.FlowWriteType] = info.Assign.FactorMap[proto.FlowWriteType]
return
}
if id > 0 {
if info, ok := vol.qosManager.cliInfoMgrMap[id]; ok {
if len(ip) > 0 && info.Host != ip {
return nil, fmt.Errorf("ip info [%v] not equal with request [%v]", info.Host, ip)
}
return assignFuc(info), nil
}
} else {
var resp []*ClientInfoOutput
for _, info := range vol.qosManager.cliInfoMgrMap {
// the client's http source port changes from time to time, so ignore the port here
rspInfo := assignFuc(info)
if len(ip) != 0 {
if info.Host == ip {
resp = append(resp, rspInfo)
}
} else {
resp = append(resp, rspInfo)
}
}
if len(resp) > 0 {
return resp, nil
}
}
return nil, fmt.Errorf("not found")
}
func (vol *Vol) volQosEnable(c *Cluster, enable bool) error {
log.LogWarnf("action[qosEnable] vol %v, set qos enable [%v], qosmgr[%v]", vol.Name, enable, vol.qosManager)
vol.qosManager.qosEnable = enable
vol.qosManager.Lock()
defer vol.qosManager.Unlock()
if !enable {
for _, limit := range vol.qosManager.cliInfoMgrMap {
for factorType := proto.IopsReadType; factorType <= proto.FlowWriteType; factorType++ {
limit.Assign.FactorMap[factorType] = &proto.ClientLimitInfo{}
}
}
}
return c.syncUpdateVol(vol)
}
func (vol *Vol) updateClientParam(c *Cluster, period, triggerCnt uint32) error {
vol.qosManager.ClientHitTriggerCnt = triggerCnt
vol.qosManager.ClientReqPeriod = period
return c.syncUpdateVol(vol)
}
func (vol *Vol) volQosUpdateLimit(c *Cluster, limitArgs *qosArgs) error {
vol.qosManager.volUpdateLimit(limitArgs)
return c.syncUpdateVol(vol)
}
type AclManager struct {
aclIps map[string]*proto.AclIpInfo
c *Cluster
vol *Vol
sync.RWMutex
}
type AclFsm struct {
AclIpArr []*proto.AclIpInfo
}
func (acl *AclManager) init(c *Cluster, vol *Vol) {
acl.c = c
acl.vol = vol
acl.aclIps = make(map[string]*proto.AclIpInfo)
}
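// aclOperate dispatches an ACL operation (add, delete, check or list IPs) under
// the manager's lock and returns either the operation's result or an error.
// A rough usage sketch (the caller-side wiring is an assumption, not shown in
// this file; callers receive an interface{} and must type-assert):
//
//	// assuming acl is the volume's *AclManager
//	ret := acl.aclOperate(util.AclAddIP, "192.168.0.1")
//	if err, ok := ret.(error); ok && err != nil {
//		// handle the error
//	}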
func (acl *AclManager) aclOperate(op uint64, ip string) interface{} {
acl.Lock()
defer acl.Unlock()
switch op {
case util.AclAddIP:
return acl.addIp(ip)
case util.AclDelIP:
return acl.removeIp(ip)
case util.AclCheckIP:
return acl.checkIp(ip)
case util.AclListIP:
return acl.listAll()
default:
err := fmt.Errorf("aclOperate op %v not found", op)
return err
}
}
func (acl *AclManager) listAll() (val []*proto.AclIpInfo) {
log.LogDebugf("vol %v listAll", acl.vol.Name)
for ip, info := range acl.aclIps {
log.LogDebugf("vol %v listAll ip %v", ip, acl.vol.Name)
val = append(val, info)
}
return
}
func (acl *AclManager) checkIp(ip string) (val []*proto.AclIpInfo) {
log.LogDebugf("vol %v checkIp %v", ip, acl.vol.Name)
if info, ok := acl.aclIps[ip]; ok {
log.LogDebugf("vol %v checkIp ip %v", ip, acl.vol.Name)
val = append(val, info)
}
return
}
func (acl *AclManager) addIp(ip string) (err error) {
log.LogDebugf("vol %v acl addIp %v", acl.vol.Name, ip)
if _, ok := acl.aclIps[ip]; ok {
return
}
acl.aclIps[ip] = &proto.AclIpInfo{
Ip: ip,
CTime: time.Now().Unix(),
}
return acl.persist()
}
func (acl *AclManager) removeIp(ip string) (err error) {
log.LogDebugf("vol %v acl removeIp %v", acl.vol.Name, ip)
delete(acl.aclIps, ip)
return acl.persist()
}
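// persist serializes the current ACL IP set to JSON and syncs it through the cluster.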
func (acl *AclManager) persist() (err error) {
log.LogDebugf("vol %v acl persist", acl.vol.Name)
var aclFsm AclFsm
for _, t := range acl.aclIps {
aclFsm.AclIpArr = append(aclFsm.AclIpArr, t)
}
var val []byte
if val, err = json.Marshal(aclFsm); err != nil {
log.LogErrorf("vol %v acl persist error %v", acl.vol.Name, err)
return
}
if err = acl.c.syncAclList(acl.vol, val); err != nil {
log.LogErrorf("vol %v acl persist syncAclList error %v", acl.vol.Name, err)
return
}
return
}
func (acl *AclManager) load(c *Cluster, val []byte) (err error) {
log.LogDebugf("vol %v acl load meta", acl.vol.Name)
acl.c = c
aclFsm := &AclFsm{}
if err = json.Unmarshal(val, aclFsm); err != nil {
log.LogErrorf("vol %v acl load %v", acl.vol.Name, err)
return
}
for _, info := range aclFsm.AclIpArr {
acl.aclIps[info.Ip] = info
log.LogDebugf("vol %v acl load %v", acl.vol.Name, info.Ip)
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
syslog "log"
"strings"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
cfsProto "github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
// LeaderInfo represents the leader's information
type LeaderInfo struct {
addr string // host:port
}
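// handleLeaderChange reacts to a raft leadership change. When this node becomes
// the leader it reloads metadata and kicks off heartbeat checks; otherwise it
// clears the local metadata cache and points the master client at the new leader.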
func (m *Server) handleLeaderChange(leader uint64) {
if leader == 0 {
log.LogWarnf("action[handleLeaderChange] but no leader")
if WarnMetrics != nil {
WarnMetrics.reset()
}
return
}
oldLeaderAddr := m.leaderInfo.addr
m.leaderInfo.addr = AddrDatabase[leader]
log.LogWarnf("action[handleLeaderChange] [%v] ", m.leaderInfo.addr)
m.reverseProxy = m.newReverseProxy()
if m.id == leader {
Warn(m.clusterName, fmt.Sprintf("clusterID[%v] leader is changed to %v",
m.clusterName, m.leaderInfo.addr))
if oldLeaderAddr != m.leaderInfo.addr {
m.cluster.checkPersistClusterValue()
m.loadMetadata()
m.cluster.metaReady = true
m.metaReady = true
}
m.cluster.checkDataNodeHeartbeat()
m.cluster.checkMetaNodeHeartbeat()
m.cluster.checkLcNodeHeartbeat()
m.cluster.followerReadManager.reSet()
} else {
Warn(m.clusterName, fmt.Sprintf("clusterID[%v] leader is changed to %v",
m.clusterName, m.leaderInfo.addr))
m.clearMetadata()
m.metaReady = false
m.cluster.metaReady = false
m.cluster.masterClient.AddNode(m.leaderInfo.addr)
m.cluster.masterClient.SetLeader(m.leaderInfo.addr)
if WarnMetrics != nil {
WarnMetrics.reset()
}
}
}
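// handlePeerChange applies a raft membership change by adding the peer to or
// removing it from the raft store, then records a cluster warning.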
func (m *Server) handlePeerChange(confChange *proto.ConfChange) (err error) {
var msg string
addr := string(confChange.Context)
switch confChange.Type {
case proto.ConfAddNode:
var arr []string
if arr = strings.Split(addr, colonSplit); len(arr) < 2 {
msg = fmt.Sprintf("action[handlePeerChange] clusterID[%v] nodeAddr[%v] is invalid", m.clusterName, addr)
break
}
m.raftStore.AddNodeWithPort(confChange.Peer.ID, arr[0], int(m.config.heartbeatPort), int(m.config.replicaPort))
AddrDatabase[confChange.Peer.ID] = string(confChange.Context)
msg = fmt.Sprintf("clusterID[%v] peerID:%v,nodeAddr[%v] has been add", m.clusterName, confChange.Peer.ID, addr)
case proto.ConfRemoveNode:
m.raftStore.DeleteNode(confChange.Peer.ID)
msg = fmt.Sprintf("clusterID[%v] peerID:%v,nodeAddr[%v] has been removed", m.clusterName, confChange.Peer.ID, addr)
default:
// do nothing
}
Warn(m.clusterName, msg)
return
}
func (m *Server) handleApplySnapshot() {
m.fsm.restore()
m.restoreIDAlloc()
return
}
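// handleRaftUserCmd applies user-defined raft commands. On non-leader nodes it
// updates the api limiter info pushed by the leader; other opcodes are only logged.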
func (m *Server) handleRaftUserCmd(opt uint32, key string, cmdMap map[string][]byte) (err error) {
log.LogInfof("action[handleRaftUserCmd] opt %v, key %v, map len %v", opt, key, len(cmdMap))
switch opt {
case opSyncPutFollowerApiLimiterInfo, opSyncPutApiLimiterInfo:
if m.cluster != nil && !m.partition.IsRaftLeader() {
m.cluster.apiLimiter.updateLimiterInfoFromLeader(cmdMap[key])
}
default:
log.LogErrorf("action[handleRaftUserCmd] opt %v not supported,key %v, map len %v", opt, key, len(cmdMap))
}
return nil
}
func (m *Server) restoreIDAlloc() {
m.cluster.idAlloc.restore()
}
// Load stored metadata into the memory
func (m *Server) loadMetadata() {
log.LogInfo("action[loadMetadata] begin")
syslog.Println("action[loadMetadata] begin")
m.clearMetadata()
m.restoreIDAlloc()
m.cluster.fsm.restore()
var err error
if err = m.cluster.loadClusterValue(); err != nil {
panic(err)
}
var loadDomain bool
if m.cluster.FaultDomain { // try load exclude
if loadDomain, err = m.cluster.loadZoneDomain(); err != nil {
log.LogInfof("action[putZoneDomain] err[%v]", err)
panic(err)
}
if err = m.cluster.loadNodeSetGrps(); err != nil {
panic(err)
}
if loadDomain {
// if the load succeeded, the domain was already initialized before this startup;
// start the grp manager; loading node sets can trigger building ns grps
m.cluster.domainManager.start()
}
}
if err = m.cluster.loadNodeSets(); err != nil {
panic(err)
}
if m.cluster.FaultDomain {
log.LogInfof("action[FaultDomain] set")
if !loadDomain { // first restart after the domain item was added
if err = m.cluster.putZoneDomain(true); err != nil {
log.LogInfof("action[putZoneDomain] err[%v]", err)
panic(err)
}
m.cluster.domainManager.start()
}
}
if err = m.cluster.loadDataNodes(); err != nil {
panic(err)
}
if err = m.cluster.loadMetaNodes(); err != nil {
panic(err)
}
if err = m.cluster.loadZoneValue(); err != nil {
panic(err)
}
if err = m.cluster.loadVols(); err != nil {
panic(err)
}
if err = m.cluster.loadMetaPartitions(); err != nil {
panic(err)
}
if err = m.cluster.loadDataPartitions(); err != nil {
panic(err)
}
if err = m.cluster.loadDecommissionDiskList(); err != nil {
panic(err)
}
if err = m.cluster.startDecommissionListTraverse(); err != nil {
panic(err)
}
log.LogInfo("action[loadMetadata] end")
log.LogInfo("action[loadUserInfo] begin")
if err = m.user.loadUserStore(); err != nil {
panic(err)
}
if err = m.user.loadAKStore(); err != nil {
panic(err)
}
if err = m.user.loadVolUsers(); err != nil {
panic(err)
}
log.LogInfo("action[loadUserInfo] end")
log.LogInfo("action[refreshUser] begin")
if err = m.refreshUser(); err != nil {
panic(err)
}
log.LogInfo("action[refreshUser] end")
log.LogInfo("action[loadApiLimiterInfo] begin")
if err = m.cluster.loadApiLimiterInfo(); err != nil {
panic(err)
}
log.LogInfo("action[loadApiLimiterInfo] end")
log.LogInfo("action[loadQuota] begin")
if err = m.cluster.loadQuota(); err != nil {
panic(err)
}
log.LogInfo("action[loadQuota] end")
log.LogInfo("action[loadLcConfs] begin")
if err = m.cluster.loadLcConfs(); err != nil {
panic(err)
}
log.LogInfo("action[loadLcConfs] end")
log.LogInfo("action[loadLcNodes] begin")
if err = m.cluster.loadLcNodes(); err != nil {
panic(err)
}
log.LogInfo("action[loadLcNodes] end")
syslog.Println("action[loadMetadata] end")
log.LogInfo("action[loadS3QoSInfo] begin")
if err = m.cluster.loadS3ApiQosInfo(); err != nil {
panic(err)
}
log.LogInfo("action[loadS3QoSInfo] end")
}
func (m *Server) clearMetadata() {
m.cluster.clearTopology()
m.cluster.clearDataNodes()
m.cluster.clearMetaNodes()
m.cluster.clearLcNodes()
m.cluster.clearVols()
if m.user != nil {
// the leader change event may arrive before m.user is initialized
m.user.clearUserStore()
m.user.clearAKStore()
m.user.clearVolUsers()
}
m.cluster.t = newTopology()
// m.cluster.apiLimiter.Clear()
}
func (m *Server) refreshUser() (err error) {
/* todo create user automatically
var userInfo *cfsProto.UserInfo
for volName, vol := range m.cluster.allVols() {
if _, err = m.user.getUserInfo(vol.Owner); err == cfsProto.ErrUserNotExists {
if len(vol.OSSAccessKey) > 0 && len(vol.OSSSecretKey) > 0 {
var param = cfsProto.UserCreateParam{
ID: vol.Owner,
Password: DefaultUserPassword,
AccessKey: vol.OSSAccessKey,
SecretKey: vol.OSSSecretKey,
Type: cfsProto.UserTypeNormal,
}
userInfo, err = m.user.createKey(¶m)
if err != nil && err != cfsProto.ErrDuplicateUserID && err != cfsProto.ErrDuplicateAccessKey {
return err
}
} else {
var param = cfsProto.UserCreateParam{
ID: vol.Owner,
Password: DefaultUserPassword,
Type: cfsProto.UserTypeNormal,
}
userInfo, err = m.user.createKey(¶m)
if err != nil && err != cfsProto.ErrDuplicateUserID {
return err
}
}
if err == nil && userInfo != nil {
if _, err = m.user.addOwnVol(userInfo.UserID, volName); err != nil {
return err
}
}
}
}*/
if _, err = m.user.getUserInfo(RootUserID); err != nil {
param := cfsProto.UserCreateParam{
ID: RootUserID,
Password: DefaultRootPasswd,
Type: cfsProto.UserTypeRoot,
}
if _, err = m.user.createKey(¶m); err != nil {
return err
}
}
return nil
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"encoding/json"
"strconv"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
type MasterQuotaManager struct {
MpQuotaInfoMap map[uint64][]*proto.QuotaReportInfo
IdQuotaInfoMap map[uint32]*proto.QuotaInfo
vol *Vol
c *Cluster
sync.RWMutex
}
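// createQuota validates the requested paths against every existing quota
// (rejecting duplicate root inodes, duplicate paths and nested directories),
// allocates a new quota id, persists the quota through raft and registers it in
// the in-memory map. A minimal request sketch; only fields referenced in this
// function are shown, anything else in proto.SetMasterQuotaReuqest is omitted:
//
//	req := &proto.SetMasterQuotaReuqest{
//		VolName:   "vol1",
//		MaxFiles:  1000000,
//		MaxBytes:  1 << 40,
//		PathInfos: []proto.QuotaPathInfo{{FullPath: "/data", RootInode: 1}},
//	}
//	quotaId, err := mqMgr.createQuota(req)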
func (mqMgr *MasterQuotaManager) createQuota(req *proto.SetMasterQuotaReuqest) (quotaId uint32, err error) {
mqMgr.Lock()
defer mqMgr.Unlock()
if len(mqMgr.IdQuotaInfoMap) >= gConfig.MaxQuotaNumPerVol {
err = errors.NewErrorf("the number of quota has reached the upper limit %v", len(mqMgr.IdQuotaInfoMap))
return
}
for _, quotaInfo := range mqMgr.IdQuotaInfoMap {
for _, pathInfo := range req.PathInfos {
for _, quotaPathInfo := range quotaInfo.PathInfos {
if pathInfo.RootInode == quotaPathInfo.RootInode {
err = errors.NewErrorf("path [%v] is the same as quotaId [%v]",
pathInfo.FullPath, quotaInfo.QuotaId)
return
}
if pathInfo.FullPath == quotaPathInfo.FullPath {
err = errors.NewErrorf("path [%v] is the same as quotaId [%v]",
pathInfo.FullPath, quotaInfo.QuotaId)
return
}
if proto.IsAncestor(pathInfo.FullPath, quotaPathInfo.FullPath) {
err = errors.NewErrorf("Nested directories found: %s and %s", pathInfo.FullPath, quotaPathInfo.FullPath)
return
}
if proto.IsAncestor(quotaPathInfo.FullPath, pathInfo.FullPath) {
err = errors.NewErrorf("Nested directories found: %s and %s", pathInfo.FullPath, quotaPathInfo.FullPath)
return
}
}
}
}
if quotaId, err = mqMgr.c.idAlloc.allocateQuotaID(); err != nil {
return
}
quotaInfo := &proto.QuotaInfo{
VolName: req.VolName,
QuotaId: quotaId,
CTime: time.Now().Unix(),
PathInfos: make([]proto.QuotaPathInfo, 0, len(req.PathInfos)),
MaxFiles: req.MaxFiles,
MaxBytes: req.MaxBytes,
}
for _, pathInfo := range req.PathInfos {
quotaInfo.PathInfos = append(quotaInfo.PathInfos, pathInfo)
}
var value []byte
if value, err = json.Marshal(quotaInfo); err != nil {
log.LogErrorf("create quota [%v] marsha1 fail [%v].", quotaInfo, err)
return
}
metadata := new(RaftCmd)
metadata.Op = opSyncSetQuota
metadata.K = quotaPrefix + strconv.FormatUint(mqMgr.vol.ID, 10) + keySeparator + strconv.FormatUint(uint64(quotaId), 10)
metadata.V = value
if err = mqMgr.c.submit(metadata); err != nil {
log.LogErrorf("create quota [%v] submit fail [%v].", quotaInfo, err)
return
}
// for _, pathInfo := range req.PathInfos {
// var inodes = make([]uint64, 0)
// inodes = append(inodes, pathInfo.RootInode)
// request := &proto.BatchSetMetaserverQuotaReuqest{
// PartitionId: pathInfo.PartitionId,
// Inodes: inodes,
// QuotaId: quotaId,
// }
// if err = mqMgr.setQuotaToMetaNode(request); err != nil {
// log.LogErrorf("create quota [%v] to metanode fail [%v].", quotaInfo, err)
// return
// }
// }
mqMgr.IdQuotaInfoMap[quotaId] = quotaInfo
log.LogInfof("create quota [%v] success.", quotaInfo)
return
}
func (mqMgr *MasterQuotaManager) updateQuota(req *proto.UpdateMasterQuotaReuqest) (err error) {
mqMgr.Lock()
defer mqMgr.Unlock()
quotaInfo, isFind := mqMgr.IdQuotaInfoMap[req.QuotaId]
if !isFind {
log.LogErrorf("vol [%v] quota quotaId [%v] is not exist.", mqMgr.vol.Name, req.QuotaId)
err = errors.New("quota is not exist.")
return
}
quotaInfo.MaxFiles = req.MaxFiles
quotaInfo.MaxBytes = req.MaxBytes
var value []byte
if value, err = json.Marshal(quotaInfo); err != nil {
log.LogErrorf("update quota [%v] marsha1 fail [%v].", quotaInfo, err)
return
}
metadata := new(RaftCmd)
metadata.Op = opSyncSetQuota
metadata.K = quotaPrefix + strconv.FormatUint(mqMgr.vol.ID, 10) + keySeparator + strconv.FormatUint(uint64(quotaInfo.QuotaId), 10)
metadata.V = value
if err = mqMgr.c.submit(metadata); err != nil {
log.LogErrorf("update quota [%v] submit fail [%v].", quotaInfo, err)
return
}
log.LogInfof("update quota [%v] success.", *quotaInfo)
return
}
func (mqMgr *MasterQuotaManager) listQuota() (resp *proto.ListMasterQuotaResponse) {
mqMgr.RLock()
defer mqMgr.RUnlock()
resp = &proto.ListMasterQuotaResponse{}
resp.Quotas = make([]*proto.QuotaInfo, 0)
for _, info := range mqMgr.IdQuotaInfoMap {
resp.Quotas = append(resp.Quotas, info)
}
return
}
func (mqMgr *MasterQuotaManager) getQuota(quotaId uint32) (quotaInfo *proto.QuotaInfo, err error) {
mqMgr.RLock()
defer mqMgr.RUnlock()
quotaInfo, isFind := mqMgr.IdQuotaInfoMap[quotaId]
if !isFind {
err = errors.New("quota is not exist.")
return nil, err
}
return quotaInfo, nil
}
func (mqMgr *MasterQuotaManager) deleteQuota(quotaId uint32) (err error) {
mqMgr.Lock()
defer mqMgr.Unlock()
quotaInfo, isFind := mqMgr.IdQuotaInfoMap[quotaId]
if !isFind {
log.LogErrorf("vol [%v] quota quotaId [%v] is not exist.", mqMgr.vol.Name, quotaId)
err = errors.New("quota is not exist.")
return
}
var value []byte
if value, err = json.Marshal(quotaInfo); err != nil {
log.LogErrorf("delete quota [%v] marsha1 fail [%v].", quotaInfo, err)
return
}
metadata := new(RaftCmd)
metadata.Op = opSyncDeleteQuota
metadata.K = quotaPrefix + strconv.FormatUint(mqMgr.vol.ID, 10) + keySeparator + strconv.FormatUint(uint64(quotaInfo.QuotaId), 10)
metadata.V = value
if err = mqMgr.c.submit(metadata); err != nil {
log.LogErrorf("delete quota [%v] submit fail [%v].", quotaInfo, err)
return
}
delete(mqMgr.IdQuotaInfoMap, quotaInfo.QuotaId)
log.LogInfof("deleteQuota: idmap len [%v]", len(mqMgr.IdQuotaInfoMap))
return
}
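// quotaUpdate consumes a meta partition report from the partition leader,
// re-aggregates per-quota usage across all reporting partitions and refreshes
// the LimitedFiles/LimitedBytes flags of every quota.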
func (mqMgr *MasterQuotaManager) quotaUpdate(report *proto.MetaPartitionReport) {
var (
quotaInfo = &proto.QuotaInfo{}
id uint32
)
mqMgr.Lock()
defer mqMgr.Unlock()
mpId := report.PartitionID
if !report.IsLeader {
return
}
mqMgr.MpQuotaInfoMap[mpId] = report.QuotaReportInfos
for _, quotaInfo = range mqMgr.IdQuotaInfoMap {
quotaInfo.UsedInfo.UsedFiles = 0
quotaInfo.UsedInfo.UsedBytes = 0
}
deleteQuotaIds := make(map[uint32]bool, 0)
for mpId, reportInfos := range mqMgr.MpQuotaInfoMap {
for _, info := range reportInfos {
if _, isFind := mqMgr.IdQuotaInfoMap[info.QuotaId]; !isFind {
deleteQuotaIds[info.QuotaId] = true
continue
}
log.LogDebugf("[quotaUpdate] mpId [%v] quotaId [%v] reportinfo [%v]", mpId, info.QuotaId, info.UsedInfo)
quotaInfo = mqMgr.IdQuotaInfoMap[info.QuotaId]
quotaInfo.UsedInfo.Add(&info.UsedInfo)
}
}
if len(deleteQuotaIds) != 0 {
log.LogWarnf("[quotaUpdate] quotaIds [%v] is delete", deleteQuotaIds)
}
for id, quotaInfo = range mqMgr.IdQuotaInfoMap {
if quotaInfo.IsOverQuotaFiles() {
quotaInfo.LimitedInfo.LimitedFiles = true
} else {
quotaInfo.LimitedInfo.LimitedFiles = false
}
if quotaInfo.IsOverQuotaBytes() {
quotaInfo.LimitedInfo.LimitedBytes = true
} else {
quotaInfo.LimitedInfo.LimitedBytes = false
}
log.LogDebugf("[quotaUpdate] quotaId [%v] quotaInfo [%v]", id, quotaInfo)
}
return
}
func (mqMgr *MasterQuotaManager) getQuotaHbInfos() (infos []*proto.QuotaHeartBeatInfo) {
mqMgr.RLock()
defer mqMgr.RUnlock()
for quotaId, quotaInfo := range mqMgr.IdQuotaInfoMap {
info := &proto.QuotaHeartBeatInfo{}
info.VolName = mqMgr.vol.Name
info.QuotaId = quotaId
info.LimitedInfo.LimitedFiles = quotaInfo.LimitedInfo.LimitedFiles
info.LimitedInfo.LimitedBytes = quotaInfo.LimitedInfo.LimitedBytes
info.Enable = mqMgr.vol.enableQuota
infos = append(infos, info)
log.LogDebugf("getQuotaHbInfos info %v", info)
}
return
}
func (mqMgr *MasterQuotaManager) HasQuota() bool {
mqMgr.RLock()
defer mqMgr.RUnlock()
if len(mqMgr.IdQuotaInfoMap) == 0 {
return false
}
return true
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/atomicutil"
"github.com/cubefs/cubefs/util/log"
)
// MetaNode defines the structure of a meta node
type MetaNode struct {
ID uint64
Addr string
DomainAddr string
IsActive bool
Sender *AdminTaskManager `graphql:"-"`
ZoneName string `json:"Zone"`
MaxMemAvailWeight uint64 `json:"MaxMemAvailWeight"`
Total uint64 `json:"TotalWeight"`
Used uint64 `json:"UsedWeight"`
Ratio float64
SelectCount uint64
Threshold float32
ReportTime time.Time
metaPartitionInfos []*proto.MetaPartitionReport
MetaPartitionCount int
NodeSetID uint64
sync.RWMutex `graphql:"-"`
ToBeOffline bool
PersistenceMetaPartitions []uint64
RdOnly bool
MigrateLock sync.RWMutex
CpuUtil atomicutil.Float64 `json:"-"`
}
func newMetaNode(addr, zoneName, clusterID string) (node *MetaNode) {
node = &MetaNode{
Addr: addr,
ZoneName: zoneName,
Sender: newAdminTaskManager(addr, clusterID),
}
node.CpuUtil.Store(0)
return
}
func (metaNode *MetaNode) clean() {
metaNode.Sender.exitCh <- struct{}{}
}
func (metaNode *MetaNode) GetID() uint64 {
metaNode.RLock()
defer metaNode.RUnlock()
return metaNode.ID
}
func (metaNode *MetaNode) GetAddr() string {
metaNode.RLock()
defer metaNode.RUnlock()
return metaNode.Addr
}
// SelectNodeForWrite implements the Node interface
func (metaNode *MetaNode) SelectNodeForWrite() {
metaNode.Lock()
defer metaNode.Unlock()
metaNode.SelectCount++
}
func (metaNode *MetaNode) isWritable() (ok bool) {
metaNode.RLock()
defer metaNode.RUnlock()
if metaNode.IsActive && metaNode.MaxMemAvailWeight > gConfig.metaNodeReservedMem &&
!metaNode.reachesThreshold() && metaNode.MetaPartitionCount < defaultMaxMetaPartitionCountOnEachNode &&
!metaNode.RdOnly {
ok = true
}
return
}
func (metaNode *MetaNode) setNodeActive() {
metaNode.Lock()
defer metaNode.Unlock()
metaNode.ReportTime = time.Now()
metaNode.IsActive = true
}
func (metaNode *MetaNode) updateMetric(resp *proto.MetaNodeHeartbeatResponse, threshold float32) {
metaNode.Lock()
defer metaNode.Unlock()
metaNode.DomainAddr = util.ParseIpAddrToDomainAddr(metaNode.Addr)
metaNode.metaPartitionInfos = resp.MetaPartitionReports
metaNode.MetaPartitionCount = len(metaNode.metaPartitionInfos)
metaNode.Total = resp.Total
metaNode.Used = resp.MemUsed
if resp.Total == 0 {
metaNode.Ratio = 0
} else {
metaNode.Ratio = float64(resp.MemUsed) / float64(resp.Total)
}
left := int64(resp.Total - resp.MemUsed)
if left < 0 {
metaNode.MaxMemAvailWeight = 0
} else {
metaNode.MaxMemAvailWeight = uint64(left)
}
metaNode.ZoneName = resp.ZoneName
metaNode.Threshold = threshold
}
func (metaNode *MetaNode) reachesThreshold() bool {
if metaNode.Threshold <= 0 {
metaNode.Threshold = defaultMetaPartitionMemUsageThreshold
}
return float32(float64(metaNode.Used)/float64(metaNode.Total)) > metaNode.Threshold
}
func (metaNode *MetaNode) createHeartbeatTask(masterAddr string, fileStatsEnable bool) (task *proto.AdminTask) {
request := &proto.HeartBeatRequest{
CurrTime: time.Now().Unix(),
MasterAddr: masterAddr,
}
request.FileStatsEnable = fileStatsEnable
task = proto.NewAdminTask(proto.OpMetaNodeHeartbeat, metaNode.Addr, request)
return
}
func (metaNode *MetaNode) createVersionTask(volume string, version uint64, op uint8, addr string, verList []*proto.VolVersionInfo) (task *proto.AdminTask) {
request := &proto.MultiVersionOpRequest{
VolumeID: volume,
VerSeq: version,
Op: op,
Addr: addr,
VolVerList: verList,
}
task = proto.NewAdminTask(proto.OpVersionOperation, metaNode.Addr, request)
return
}
func (metaNode *MetaNode) checkHeartbeat() {
metaNode.Lock()
defer metaNode.Unlock()
if time.Since(metaNode.ReportTime) > time.Second*time.Duration(defaultNodeTimeOutSec) {
metaNode.IsActive = false
}
}
// LeaderMetaNode defines the leader meta partitions on a meta node
type LeaderMetaNode struct {
addr string
metaPartitions []*MetaPartition
}
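// sortLeaderMetaNode orders meta nodes by their leader meta partition count in
// descending order (see Less), so that balanceLeader moves leaders off the
// busiest nodes first.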
type sortLeaderMetaNode struct {
nodes []*LeaderMetaNode
leaderCountM map[string]int
average int
mu sync.RWMutex
}
func (s *sortLeaderMetaNode) Less(i, j int) bool {
return len(s.nodes[i].metaPartitions) > len(s.nodes[j].metaPartitions)
}
func (s *sortLeaderMetaNode) Swap(i, j int) {
s.nodes[i], s.nodes[j] = s.nodes[j], s.nodes[i]
}
func (s *sortLeaderMetaNode) Len() int {
return len(s.nodes)
}
func (s *sortLeaderMetaNode) getLeaderCount(addr string) int {
s.mu.RLock()
defer s.mu.RUnlock()
return s.leaderCountM[addr]
}
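// changeLeader moves meta partition leaders away from node l until its leader
// count drops to the average: for each partition it picks a replica whose meta
// node currently holds fewer than (old leader count - 1) leaders and asks that
// replica to take over leadership, updating the per-node counters on success.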
func (s *sortLeaderMetaNode) changeLeader(l *LeaderMetaNode) {
for _, mp := range l.metaPartitions {
if count := s.getLeaderCount(l.addr); count <= s.average {
log.LogInfof("now leader count is[%d], average is[%d]", count, s.average)
break
}
// if the mp has no leader, we cannot change it; skip it
oldLeader, err := mp.getMetaReplicaLeader()
if err != nil {
log.LogErrorf("mp[%v] no leader, can not change leader err[%v]", mp, err)
continue
}
// pick, as the new leader, the replica whose meta node has a leader count smaller than (old leader count - 1)
addr := oldLeader.Addr
s.mu.RLock()
for i := 0; i < len(mp.Replicas); i++ {
if s.leaderCountM[mp.Replicas[i].Addr] < s.leaderCountM[oldLeader.Addr]-1 {
addr = mp.Replicas[i].Addr
}
}
s.mu.RUnlock()
if addr == oldLeader.Addr {
log.LogDebugf("newAddr:%s,oldAddr:%s is same", addr, oldLeader.Addr)
continue
}
// a failure to change one mp's leader does not affect the others
if err = mp.tryToChangeLeaderByHost(addr); err != nil {
log.LogErrorf("mp[%v] change to addr[%v] err[%v]", mp, addr, err)
continue
}
s.mu.Lock()
s.leaderCountM[addr]++
s.leaderCountM[oldLeader.Addr]--
s.mu.Unlock()
log.LogDebugf("mp[%v] oldLeader[%v,nowCount:%d] change to newLeader[%v,nowCount:%d] success", mp.PartitionID, oldLeader.Addr, s.leaderCountM[oldLeader.Addr], addr, s.leaderCountM[addr])
}
}
func (s *sortLeaderMetaNode) balanceLeader() {
for _, node := range s.nodes {
log.LogDebugf("node[%v] leader count is:%d,average:%d", node.addr, len(node.metaPartitions), s.average)
s.changeLeader(node)
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"math"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
// MetaReplica defines the replica of a meta partition
type MetaReplica struct {
Addr string
start uint64 // lower bound of the inode id
end uint64 // upper bound of the inode id
dataSize uint64
nodeID uint64
MaxInodeID uint64
InodeCount uint64
DentryCount uint64
TxCnt uint64
TxRbInoCnt uint64
TxRbDenCnt uint64
FreeListLen uint64
ReportTime int64
Status int8 // unavailable, readOnly, readWrite
IsLeader bool
metaNode *MetaNode
}
// MetaPartition defines the structure of a meta partition
type MetaPartition struct {
PartitionID uint64
Start uint64
End uint64
MaxInodeID uint64
InodeCount uint64
DentryCount uint64
FreeListLen uint64
TxCnt uint64
TxRbInoCnt uint64
TxRbDenCnt uint64
Replicas []*MetaReplica
LeaderReportTime int64
ReplicaNum uint8
Status int8
IsRecover bool
volID uint64
volName string
Hosts []string
Peers []proto.Peer
OfflinePeerID uint64
MissNodes map[string]int64
LoadResponse []*proto.MetaPartitionLoadResponse
offlineMutex sync.RWMutex
uidInfo []*proto.UidReportSpaceInfo
EqualCheckPass bool
VerSeq uint64
heartBeatDone bool
sync.RWMutex
}
func newMetaReplica(start, end uint64, metaNode *MetaNode) (mr *MetaReplica) {
mr = &MetaReplica{start: start, end: end, nodeID: metaNode.ID, Addr: metaNode.Addr}
mr.metaNode = metaNode
mr.ReportTime = time.Now().Unix()
return
}
func newMetaPartition(partitionID, start, end uint64, replicaNum uint8, volName string, volID uint64, verSeq uint64) (mp *MetaPartition) {
mp = &MetaPartition{PartitionID: partitionID, Start: start, End: end, volName: volName, volID: volID}
mp.ReplicaNum = replicaNum
mp.Replicas = make([]*MetaReplica, 0)
mp.LeaderReportTime = time.Now().Unix()
mp.Status = proto.Unavailable
mp.MissNodes = make(map[string]int64, 0)
mp.Peers = make([]proto.Peer, 0)
mp.Hosts = make([]string, 0)
mp.VerSeq = verSeq
mp.LoadResponse = make([]*proto.MetaPartitionLoadResponse, 0)
mp.EqualCheckPass = true
return
}
func (mp *MetaPartition) setPeers(peers []proto.Peer) {
mp.Peers = peers
}
func (mp *MetaPartition) setHosts(hosts []string) {
mp.Hosts = hosts
}
func (mp *MetaPartition) hostsToString() (hosts string) {
return strings.Join(mp.Hosts, underlineSeparator)
}
func (mp *MetaPartition) addReplica(mr *MetaReplica) {
for _, m := range mp.Replicas {
if m.Addr == mr.Addr {
return
}
}
mp.Replicas = append(mp.Replicas, mr)
return
}
func (mp *MetaPartition) removeReplica(mr *MetaReplica) {
var newReplicas []*MetaReplica
for _, m := range mp.Replicas {
if m.Addr == mr.Addr {
continue
}
newReplicas = append(newReplicas, m)
}
mp.Replicas = newReplicas
return
}
func (mp *MetaPartition) removeReplicaByAddr(addr string) {
var newReplicas []*MetaReplica
for _, m := range mp.Replicas {
if m.Addr == addr {
continue
}
newReplicas = append(newReplicas, m)
}
mp.Replicas = newReplicas
return
}
func (mp *MetaPartition) updateInodeIDRangeForAllReplicas() {
for _, mr := range mp.Replicas {
mr.end = mp.End
}
}
// canSplit: the caller must hold the lock
func (mp *MetaPartition) canSplit(end uint64, metaPartitionInodeIdStep uint64, ignoreNoLeader bool) (err error) {
if end < mp.Start {
err = fmt.Errorf("end[%v] less than mp.start[%v]", end, mp.Start)
return
}
// overflow
if end > (defaultMaxMetaPartitionInodeID - metaPartitionInodeIdStep) {
msg := fmt.Sprintf("action[updateInodeIDRange] vol[%v] partitionID[%v] nextStart[%v] "+
"to prevent overflow ,not update end", mp.volName, mp.PartitionID, end)
log.LogWarn(msg)
err = fmt.Errorf(msg)
return
}
if end <= mp.MaxInodeID {
err = fmt.Errorf("next meta partition start must be larger than %v", mp.MaxInodeID)
return
}
if ignoreNoLeader {
return
}
if _, err = mp.getMetaReplicaLeader(); err != nil {
log.LogWarnf("action[updateInodeIDRange] vol[%v] id[%v] no leader", mp.volName, mp.PartitionID)
return
}
return
}
func (mp *MetaPartition) addUpdateMetaReplicaTask(c *Cluster) (err error) {
tasks := make([]*proto.AdminTask, 0)
t := mp.createTaskToUpdateMetaReplica(c.Name, mp.PartitionID, mp.End)
// if there is no leader, don't update the end
if t == nil {
err = proto.ErrNoLeader
return
}
tasks = append(tasks, t)
c.addMetaNodeTasks(tasks)
log.LogWarnf("action[addUpdateMetaReplicaTask] partitionID[%v] end[%v] success", mp.PartitionID, mp.End)
return
}
func (mp *MetaPartition) dataSize() uint64 {
maxSize := uint64(0)
for _, mr := range mp.Replicas {
if maxSize < mr.dataSize {
maxSize = mr.dataSize
}
}
return maxSize
}
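// checkEnd extends the inode range of the volume's last (max) meta partition to
// defaultMaxMetaPartitionInodeID, persists the change and notifies the leader
// replica; it is a no-op for non-max partitions or when the partition has no leader.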
func (mp *MetaPartition) checkEnd(c *Cluster, maxPartitionID uint64) {
if mp.PartitionID < maxPartitionID {
return
}
vol, err := c.getVol(mp.volName)
if err != nil {
log.LogWarnf("action[checkEnd] vol[%v] not exist", mp.volName)
return
}
vol.createMpMutex.Lock()
defer vol.createMpMutex.Unlock()
curMaxPartitionID := vol.maxPartitionID()
if mp.PartitionID != curMaxPartitionID {
log.LogWarnf("action[checkEnd] partition[%v] not max partition[%v]", mp.PartitionID, curMaxPartitionID)
return
}
mp.Lock()
defer mp.Unlock()
if _, err = mp.getMetaReplicaLeader(); err != nil {
log.LogWarnf("action[checkEnd] partition[%v] no leader", mp.PartitionID)
return
}
if mp.End != defaultMaxMetaPartitionInodeID {
oldEnd := mp.End
mp.End = defaultMaxMetaPartitionInodeID
if err := c.syncUpdateMetaPartition(mp); err != nil {
mp.End = oldEnd
log.LogErrorf("action[checkEnd] partitionID[%v] err[%v]", mp.PartitionID, err)
return
}
if err = mp.addUpdateMetaReplicaTask(c); err != nil {
mp.End = oldEnd
}
}
log.LogDebugf("action[checkEnd] partitionID[%v] end[%v]", mp.PartitionID, mp.End)
}
func (mp *MetaPartition) getMetaReplica(addr string) (mr *MetaReplica, err error) {
for _, mr = range mp.Replicas {
if mr.Addr == addr {
return
}
}
return nil, metaReplicaNotFound(addr)
}
func (mp *MetaPartition) removeMissingReplica(addr string) {
if _, ok := mp.MissNodes[addr]; ok {
delete(mp.MissNodes, addr)
}
}
func (mp *MetaPartition) isLeaderExist() bool {
mp.RLock()
defer mp.RUnlock()
for _, mr := range mp.Replicas {
if mr.IsLeader {
return true
}
}
return false
}
func (mp *MetaPartition) checkLeader(clusterID string) {
mp.Lock()
defer mp.Unlock()
for _, mr := range mp.Replicas {
if !mr.isActive() {
mr.IsLeader = false
}
}
var report bool
if _, err := mp.getMetaReplicaLeader(); err != nil {
report = true
}
if WarnMetrics != nil {
WarnMetrics.WarnMpNoLeader(clusterID, mp.PartitionID, report)
}
return
}
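// checkStatus recomputes the partition status from its live replicas, marks
// overloaded non-max partitions read-only and reports whether the max partition
// should be split.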
func (mp *MetaPartition) checkStatus(clusterID string, writeLog bool, replicaNum int, maxPartitionID uint64, metaPartitionInodeIdStep uint64, forbiddenVol bool) (doSplit bool) {
mp.Lock()
defer mp.Unlock()
mp.checkReplicas()
liveReplicas := mp.getLiveReplicas()
if len(liveReplicas) <= replicaNum/2 {
mp.Status = proto.Unavailable
} else {
mr, err := mp.getMetaReplicaLeader()
if err != nil {
mp.Status = proto.Unavailable
log.LogErrorf("[checkStatus] mp %v getMetaReplicaLeader err:%v", mp.PartitionID, err)
}
mp.Status = mr.Status
for _, replica := range liveReplicas {
if replica.Status == proto.ReadOnly {
mp.Status = proto.ReadOnly
}
if mr.metaNode == nil {
continue
}
if !mr.metaNode.reachesThreshold() && mp.InodeCount < metaPartitionInodeIdStep {
continue
}
if mp.PartitionID == maxPartitionID {
log.LogInfof("split[checkStatus] need split,id:%v,status:%v,replicaNum:%v,InodeCount:%v", mp.PartitionID, mp.Status, mp.ReplicaNum, mp.InodeCount)
doSplit = true
} else {
if mr.metaNode.reachesThreshold() || mp.End-mp.MaxInodeID > 2*metaPartitionInodeIdStep {
log.LogInfof("split[checkStatus],change state,id:%v,status:%v,replicaNum:%v,replicas:%v,persistenceHosts:%v, inodeCount:%v, MaxInodeID:%v, start:%v, end:%v",
mp.PartitionID, mp.Status, mp.ReplicaNum, len(liveReplicas), mp.Hosts, mp.InodeCount, mp.MaxInodeID, mp.Start, mp.End)
mp.Status = proto.ReadOnly
}
}
}
}
if mp.PartitionID >= maxPartitionID && mp.Status == proto.ReadOnly && !forbiddenVol {
mp.Status = proto.ReadWrite
}
if writeLog && len(liveReplicas) != int(mp.ReplicaNum) {
msg := fmt.Sprintf("action[checkMPStatus],id:%v,status:%v,replicaNum:%v,replicas:%v,persistenceHosts:%v",
mp.PartitionID, mp.Status, mp.ReplicaNum, len(liveReplicas), mp.Hosts)
log.LogInfo(msg)
Warn(clusterID, msg)
}
return
}
func (mp *MetaPartition) getMetaReplicaLeader() (mr *MetaReplica, err error) {
for _, mr = range mp.Replicas {
if mr.IsLeader {
return
}
}
err = proto.ErrNoLeader
return
}
func (mp *MetaPartition) checkReplicaNum(c *Cluster, volName string, replicaNum uint8) {
mp.RLock()
defer mp.RUnlock()
if mp.ReplicaNum != replicaNum {
msg := fmt.Sprintf("FIX MetaPartition replicaNum clusterID[%v] vol[%v] replica num[%v],current num[%v]",
c.Name, volName, replicaNum, mp.ReplicaNum)
Warn(c.Name, msg)
}
}
func (mp *MetaPartition) removeIllegalReplica() (excessAddr string, t *proto.AdminTask, err error) {
mp.RLock()
defer mp.RUnlock()
for _, mr := range mp.Replicas {
if !contains(mp.Hosts, mr.Addr) {
t = mr.createTaskToDeleteReplica(mp.PartitionID)
err = proto.ErrIllegalMetaReplica
break
}
}
return
}
func (mp *MetaPartition) missingReplicaAddrs() (lackAddrs []string) {
mp.RLock()
defer mp.RUnlock()
var liveReplicas []string
for _, mr := range mp.Replicas {
liveReplicas = append(liveReplicas, mr.Addr)
}
for _, host := range mp.Hosts {
if !contains(liveReplicas, host) {
lackAddrs = append(lackAddrs, host)
break
}
}
return
}
func (mp *MetaPartition) updateMetaPartition(mgr *proto.MetaPartitionReport, metaNode *MetaNode) {
if !contains(mp.Hosts, metaNode.Addr) {
return
}
mp.Lock()
defer mp.Unlock()
mr, err := mp.getMetaReplica(metaNode.Addr)
if err != nil {
mr = newMetaReplica(mp.Start, mp.End, metaNode)
mp.addReplica(mr)
}
mr.updateMetric(mgr)
if mr.IsLeader {
mp.LeaderReportTime = time.Now().Unix()
}
mp.setMaxInodeID()
mp.setInodeCount()
mp.setDentryCount()
mp.setFreeListLen()
mp.SetTxCnt()
mp.removeMissingReplica(metaNode.Addr)
mp.setUidInfo(mgr)
mp.setHeartBeatDone()
}
func (mp *MetaPartition) canBeOffline(nodeAddr string, replicaNum int) (err error) {
liveReplicas := mp.getLiveReplicas()
if len(liveReplicas) < int(mp.ReplicaNum/2+1) {
err = proto.ErrNoEnoughReplica
return
}
liveAddrs := mp.getLiveReplicasAddr(liveReplicas)
if len(liveReplicas) == (replicaNum/2+1) && contains(liveAddrs, nodeAddr) {
err = fmt.Errorf("live replicas num will be less than majority after offline nodeAddr: %v", nodeAddr)
return
}
return
}
// hasMissingOneReplica returns an error when addr is one of the replicas and the partition is already short of a replica.
func (mp *MetaPartition) hasMissingOneReplica(addr string, replicaNum int) (err error) {
inReplicas := false
for _, rep := range mp.Replicas {
if rep.Addr == addr {
inReplicas = true
break
}
}
hostNum := len(mp.Replicas)
if hostNum <= replicaNum-1 && inReplicas {
log.LogError(fmt.Sprintf("action[%v],partitionID:%v,err:%v",
"hasMissingOneReplica", mp.PartitionID, proto.ErrHasOneMissingReplica))
err = proto.ErrHasOneMissingReplica
}
return
}
func (mp *MetaPartition) getLiveReplicasAddr(liveReplicas []*MetaReplica) (addrs []string) {
addrs = make([]string, 0)
for _, mr := range liveReplicas {
addrs = append(addrs, mr.Addr)
}
return
}
func (mp *MetaPartition) getLiveReplicas() (liveReplicas []*MetaReplica) {
liveReplicas = make([]*MetaReplica, 0)
for _, mr := range mp.Replicas {
if mr.isActive() {
liveReplicas = append(liveReplicas, mr)
}
}
return
}
func (mp *MetaPartition) checkReplicas() {
for _, mr := range mp.Replicas {
if !mr.isActive() {
mr.Status = proto.Unavailable
}
}
return
}
func (mp *MetaPartition) persistToRocksDB(action, volName string, newHosts []string, newPeers []proto.Peer, c *Cluster) (err error) {
oldHosts := make([]string, len(mp.Hosts))
copy(oldHosts, mp.Hosts)
oldPeers := make([]proto.Peer, len(mp.Peers))
copy(oldPeers, mp.Peers)
mp.Hosts = newHosts
mp.Peers = newPeers
if err = c.syncUpdateMetaPartition(mp); err != nil {
mp.Hosts = oldHosts
mp.Peers = oldPeers
log.LogWarnf("action[%v_persist] failed,vol[%v] partitionID:%v old hosts:%v new hosts:%v oldPeers:%v newPeers:%v",
action, volName, mp.PartitionID, mp.Hosts, newHosts, mp.Peers, newPeers)
return
}
log.LogWarnf("action[%v_persist] success,vol[%v] partitionID:%v old hosts:%v new hosts:%v oldPeers:%v newPeers:%v ",
action, volName, mp.PartitionID, oldHosts, mp.Hosts, oldPeers, mp.Peers)
return
}
func (mp *MetaPartition) getActiveAddrs() (liveAddrs []string) {
liveAddrs = make([]string, 0)
for _, mr := range mp.Replicas {
if mr.isActive() {
liveAddrs = append(liveAddrs, mr.Addr)
}
}
return liveAddrs
}
func (mp *MetaPartition) isMissingReplica(addr string) bool {
return !contains(mp.getActiveAddrs(), addr)
}
func (mp *MetaPartition) shouldReportMissingReplica(addr string, interval int64) (isWarn bool) {
lastWarningTime, ok := mp.MissNodes[addr]
if !ok {
isWarn = true
mp.MissNodes[addr] = time.Now().Unix()
} else if (time.Now().Unix() - lastWarningTime) > interval {
isWarn = true
mp.MissNodes[addr] = time.Now().Unix()
}
return isWarn
// return false
}
func (mp *MetaPartition) reportMissingReplicas(clusterID, leaderAddr string, seconds, interval int64) {
mp.Lock()
defer mp.Unlock()
for _, replica := range mp.Replicas {
// reduce the alarm frequency
if contains(mp.Hosts, replica.Addr) && replica.isMissing() {
if mp.shouldReportMissingReplica(replica.Addr, interval) {
metaNode := replica.metaNode
var lastReportTime time.Time
isActive := true
if metaNode != nil {
lastReportTime = metaNode.ReportTime
isActive = metaNode.IsActive
}
msg := fmt.Sprintf("action[reportMissingReplicas], clusterID[%v] volName[%v] partition:%v on node:%v "+
"miss time > :%v vlocLastRepostTime:%v dnodeLastReportTime:%v nodeisActive:%v",
clusterID, mp.volName, mp.PartitionID, replica.Addr, seconds, replica.ReportTime, lastReportTime, isActive)
Warn(clusterID, msg)
// msg = fmt.Sprintf("decommissionMetaPartitionURL is http://%v/dataPartition/decommission?id=%v&addr=%v", leaderAddr, mp.PartitionID, replica.Addr)
// Warn(clusterID, msg)
if WarnMetrics != nil {
WarnMetrics.WarnMissingMp(clusterID, replica.Addr, mp.PartitionID, true)
}
}
} else {
if WarnMetrics != nil {
WarnMetrics.WarnMissingMp(clusterID, replica.Addr, mp.PartitionID, false)
}
}
}
if WarnMetrics != nil {
WarnMetrics.CleanObsoleteMpMissing(clusterID, mp)
}
for _, addr := range mp.Hosts {
if mp.isMissingReplica(addr) && mp.shouldReportMissingReplica(addr, interval) {
msg := fmt.Sprintf("action[reportMissingReplicas],clusterID[%v] volName[%v] partition:%v on node:%v "+
"miss time > %v ",
clusterID, mp.volName, mp.PartitionID, addr, defaultMetaPartitionTimeOutSec)
Warn(clusterID, msg)
msg = fmt.Sprintf("decommissionMetaPartitionURL is http://%v/dataPartition/decommission?id=%v&addr=%v", leaderAddr, mp.PartitionID, addr)
Warn(clusterID, msg)
}
}
}
func (mp *MetaPartition) replicaCreationTasks(clusterID, volName string) (tasks []*proto.AdminTask) {
var msg string
tasks = make([]*proto.AdminTask, 0)
if addr, _, err := mp.removeIllegalReplica(); err != nil {
msg = fmt.Sprintf("action[%v],clusterID[%v] metaPartition:%v excess replication"+
" on :%v err:%v persistenceHosts:%v",
deleteIllegalReplicaErr, clusterID, mp.PartitionID, addr, err.Error(), mp.Hosts)
log.LogWarn(msg)
}
if addrs := mp.missingReplicaAddrs(); addrs != nil {
msg = fmt.Sprintf("action[missingReplicaAddrs],clusterID[%v] metaPartition:%v lack replication"+
" on :%v Hosts:%v",
clusterID, mp.PartitionID, addrs, mp.Hosts)
Warn(clusterID, msg)
}
return
}
func (mp *MetaPartition) buildNewMetaPartitionTasks(specifyAddrs []string, peers []proto.Peer, volName string) (tasks []*proto.AdminTask) {
tasks = make([]*proto.AdminTask, 0)
hosts := make([]string, 0)
req := &proto.CreateMetaPartitionRequest{
Start: mp.Start,
End: mp.End,
PartitionID: mp.PartitionID,
Members: peers,
VolName: volName,
VerSeq: mp.VerSeq,
}
if specifyAddrs == nil {
hosts = mp.Hosts
} else {
hosts = specifyAddrs
}
for _, addr := range hosts {
t := proto.NewAdminTask(proto.OpCreateMetaPartition, addr, req)
resetMetaPartitionTaskID(t, mp.PartitionID)
tasks = append(tasks, t)
}
return
}
func (mp *MetaPartition) tryToChangeLeader(c *Cluster, metaNode *MetaNode) (err error) {
task, err := mp.createTaskToTryToChangeLeader(metaNode.Addr)
if err != nil {
return
}
if _, err = metaNode.Sender.syncSendAdminTask(task); err != nil {
return
}
return
}
func (mp *MetaPartition) tryToChangeLeaderByHost(host string) (err error) {
var metaNode *MetaNode
for _, r := range mp.Replicas {
if host == r.Addr {
metaNode = r.metaNode
break
}
}
if metaNode == nil {
return fmt.Errorf("host not found[%v]", host)
}
task, err := mp.createTaskToTryToChangeLeader(host)
if err != nil {
return
}
if _, err = metaNode.Sender.syncSendAdminTask(task); err != nil {
return
}
return
}
func (mp *MetaPartition) createTaskToTryToChangeLeader(addr string) (task *proto.AdminTask, err error) {
task = proto.NewAdminTask(proto.OpMetaPartitionTryToLeader, addr, nil)
resetMetaPartitionTaskID(task, mp.PartitionID)
return
}
func (mp *MetaPartition) createTaskToCreateReplica(host string) (t *proto.AdminTask, err error) {
req := &proto.CreateMetaPartitionRequest{
Start: mp.Start,
End: mp.End,
PartitionID: mp.PartitionID,
Members: mp.Peers,
VolName: mp.volName,
VerSeq: mp.VerSeq,
}
t = proto.NewAdminTask(proto.OpCreateMetaPartition, host, req)
resetMetaPartitionTaskID(t, mp.PartitionID)
return
}
func (mp *MetaPartition) createTaskToAddRaftMember(addPeer proto.Peer, leaderAddr string) (t *proto.AdminTask, err error) {
req := &proto.AddMetaPartitionRaftMemberRequest{PartitionId: mp.PartitionID, AddPeer: addPeer}
t = proto.NewAdminTask(proto.OpAddMetaPartitionRaftMember, leaderAddr, req)
resetMetaPartitionTaskID(t, mp.PartitionID)
return
}
func (mp *MetaPartition) createTaskToRemoveRaftMember(removePeer proto.Peer) (t *proto.AdminTask, err error) {
mr, err := mp.getMetaReplicaLeader()
if err != nil {
return nil, errors.NewError(err)
}
req := &proto.RemoveMetaPartitionRaftMemberRequest{PartitionId: mp.PartitionID, RemovePeer: removePeer}
t = proto.NewAdminTask(proto.OpRemoveMetaPartitionRaftMember, mr.Addr, req)
resetMetaPartitionTaskID(t, mp.PartitionID)
return
}
func (mp *MetaPartition) createTaskToDecommissionReplica(volName string, removePeer proto.Peer, addPeer proto.Peer) (t *proto.AdminTask, err error) {
mr, err := mp.getMetaReplicaLeader()
if err != nil {
return nil, errors.NewError(err)
}
req := &proto.MetaPartitionDecommissionRequest{PartitionID: mp.PartitionID, VolName: volName, RemovePeer: removePeer, AddPeer: addPeer}
t = proto.NewAdminTask(proto.OpDecommissionMetaPartition, mr.Addr, req)
resetMetaPartitionTaskID(t, mp.PartitionID)
return
}
func resetMetaPartitionTaskID(t *proto.AdminTask, partitionID uint64) {
t.ID = fmt.Sprintf("%v_pid[%v]", t.ID, partitionID)
t.PartitionID = partitionID
}
func (mp *MetaPartition) createTaskToUpdateMetaReplica(clusterID string, partitionID uint64, end uint64) (t *proto.AdminTask) {
mr, err := mp.getMetaReplicaLeader()
if err != nil {
msg := fmt.Sprintf("action[createTaskToUpdateMetaReplica] clusterID[%v] meta partition %v no leader",
clusterID, mp.PartitionID)
Warn(clusterID, msg)
return
}
req := &proto.UpdateMetaPartitionRequest{PartitionID: partitionID, End: end, VolName: mp.volName}
t = proto.NewAdminTask(proto.OpUpdateMetaPartition, mr.Addr, req)
resetMetaPartitionTaskID(t, mp.PartitionID)
return
}
func (mr *MetaReplica) createTaskToDeleteReplica(partitionID uint64) (t *proto.AdminTask) {
req := &proto.DeleteMetaPartitionRequest{PartitionID: partitionID}
t = proto.NewAdminTask(proto.OpDeleteMetaPartition, mr.Addr, req)
resetMetaPartitionTaskID(t, partitionID)
return
}
func (mr *MetaReplica) createTaskToLoadMetaPartition(partitionID uint64) (t *proto.AdminTask) {
req := &proto.MetaPartitionLoadRequest{PartitionID: partitionID}
t = proto.NewAdminTask(proto.OpLoadMetaPartition, mr.Addr, req)
resetMetaPartitionTaskID(t, partitionID)
return
}
func (mr *MetaReplica) isMissing() (miss bool) {
return time.Now().Unix()-mr.ReportTime > defaultMetaPartitionTimeOutSec
}
func (mr *MetaReplica) isActive() (active bool) {
return mr.metaNode.IsActive && mr.Status != proto.Unavailable &&
time.Now().Unix()-mr.ReportTime < defaultMetaPartitionTimeOutSec
}
func (mr *MetaReplica) setLastReportTime() {
mr.ReportTime = time.Now().Unix()
}
func (mr *MetaReplica) updateMetric(mgr *proto.MetaPartitionReport) {
mr.Status = (int8)(mgr.Status)
mr.IsLeader = mgr.IsLeader
mr.MaxInodeID = mgr.MaxInodeID
mr.InodeCount = mgr.InodeCnt
mr.DentryCount = mgr.DentryCnt
mr.TxCnt = mgr.TxCnt
mr.TxRbInoCnt = mgr.TxRbInoCnt
mr.TxRbDenCnt = mgr.TxRbDenCnt
mr.FreeListLen = mgr.FreeListLen
mr.dataSize = mgr.Size
mr.setLastReportTime()
if mr.metaNode.RdOnly && mr.Status == proto.ReadWrite {
mr.Status = proto.ReadOnly
}
}
func (mp *MetaPartition) afterCreation(nodeAddr string, c *Cluster) (err error) {
metaNode, err := c.metaNode(nodeAddr)
if err != nil {
return err
}
mr := newMetaReplica(mp.Start, mp.End, metaNode)
mr.Status = proto.ReadWrite
mr.ReportTime = time.Now().Unix()
mp.addReplica(mr)
mp.removeMissingReplica(mr.Addr)
return
}
func (mp *MetaPartition) addOrReplaceLoadResponse(response *proto.MetaPartitionLoadResponse) {
mp.Lock()
defer mp.Unlock()
loadResponse := make([]*proto.MetaPartitionLoadResponse, 0)
for _, lr := range mp.LoadResponse {
if lr.Addr == response.Addr {
continue
}
loadResponse = append(loadResponse, lr)
}
loadResponse = append(loadResponse, response)
mp.LoadResponse = loadResponse
}
func (mp *MetaPartition) getMinusOfMaxInodeID() (minus float64) {
mp.RLock()
defer mp.RUnlock()
var sentry float64
for index, replica := range mp.Replicas {
if index == 0 {
sentry = float64(replica.MaxInodeID)
continue
}
diff := math.Abs(float64(replica.MaxInodeID) - sentry)
if diff > minus {
minus = diff
}
}
return
}
func (mp *MetaPartition) activeMaxInodeSimilar() bool {
mp.RLock()
defer mp.RUnlock()
minus := float64(0)
var sentry float64
replicas := mp.getLiveReplicas()
for index, replica := range replicas {
if index == 0 {
sentry = float64(replica.MaxInodeID)
continue
}
diff := math.Abs(float64(replica.MaxInodeID) - sentry)
if diff > minus {
minus = diff
}
}
return minus < defaultMinusOfMaxInodeID
}
func (mp *MetaPartition) setUidInfo(mgr *proto.MetaPartitionReport) {
if !mgr.IsLeader {
return
}
mp.uidInfo = mgr.UidInfo
}
func (mp *MetaPartition) setMaxInodeID() {
var maxUsed uint64
for _, r := range mp.Replicas {
if r.MaxInodeID > maxUsed {
maxUsed = r.MaxInodeID
}
}
mp.MaxInodeID = maxUsed
}
// The caller should hold mp's lock when calling this.
func (mp *MetaPartition) setHeartBeatDone() {
if len(mp.Replicas) == int(mp.ReplicaNum) {
mp.heartBeatDone = true
}
}
func (mp *MetaPartition) setInodeCount() {
var inodeCount uint64
for _, r := range mp.Replicas {
if r.InodeCount > inodeCount {
inodeCount = r.InodeCount
}
}
mp.InodeCount = inodeCount
}
func (mp *MetaPartition) setDentryCount() {
var dentryCount uint64
for _, r := range mp.Replicas {
if r.DentryCount > dentryCount {
dentryCount = r.DentryCount
}
}
mp.DentryCount = dentryCount
}
func (mp *MetaPartition) setFreeListLen() {
var freeListLen uint64
for _, r := range mp.Replicas {
if r.FreeListLen > freeListLen {
freeListLen = r.FreeListLen
}
}
mp.FreeListLen = freeListLen
}
func (mp *MetaPartition) SetTxCnt() {
var txCnt, rbInoCnt, rbDenCnt uint64
for _, r := range mp.Replicas {
if r.TxCnt > txCnt {
txCnt = r.TxCnt
}
if r.TxRbInoCnt > rbInoCnt {
rbInoCnt = r.TxRbInoCnt
}
if r.TxRbDenCnt > rbDenCnt {
rbDenCnt = r.TxRbDenCnt
}
}
mp.TxCnt, mp.TxRbInoCnt, mp.TxRbDenCnt = txCnt, rbInoCnt, rbDenCnt
}
func (mp *MetaPartition) getAllNodeSets() (nodeSets []uint64) {
mp.RLock()
defer mp.RUnlock()
nodeSets = make([]uint64, 0)
for _, mr := range mp.Replicas {
if mr.metaNode == nil {
continue
}
if !containsID(nodeSets, mr.metaNode.NodeSetID) {
nodeSets = append(nodeSets, mr.metaNode.NodeSetID)
}
}
return
}
func (mp *MetaPartition) getLiveZones(offlineAddr string) (zones []string) {
mp.RLock()
defer mp.RUnlock()
for _, mr := range mp.Replicas {
if mr.metaNode == nil {
continue
}
if mr.Addr == offlineAddr {
continue
}
zones = append(zones, mr.metaNode.ZoneName)
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"math"
"time"
"github.com/cubefs/cubefs/util/log"
)
func (c *Cluster) scheduleToLoadMetaPartitions() {
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
if c.vols != nil {
c.checkLoadMetaPartitions()
}
}
time.Sleep(2 * time.Second * defaultIntervalToCheckDataPartition)
}
}()
}
func (c *Cluster) checkLoadMetaPartitions() {
defer func() {
if r := recover(); r != nil {
log.LogWarnf("checkDiskRecoveryProgress occurred panic,err[%v]", r)
WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
"checkDiskRecoveryProgress occurred panic")
}
}()
vols := c.allVols()
for _, vol := range vols {
mps := vol.cloneMetaPartitionMap()
for _, mp := range mps {
c.doLoadMetaPartition(mp)
}
}
}
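// checkSnapshot compares the replicas' load responses (apply id, inode and
// dentry counts) and records in EqualCheckPass whether the cross-replica
// consistency check passed.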
func (mp *MetaPartition) checkSnapshot(c *Cluster) {
if len(mp.LoadResponse) == 0 {
return
}
if !mp.doCompare() {
return
}
if !mp.isSameApplyID() {
return
}
ckInode := mp.checkInodeCount(c)
ckDentry := mp.checkDentryCount(c)
if ckInode && ckDentry {
mp.EqualCheckPass = true
} else {
mp.EqualCheckPass = false
}
}
func (mp *MetaPartition) doCompare() bool {
for _, lr := range mp.LoadResponse {
if !lr.DoCompare {
return false
}
}
return true
}
func (mp *MetaPartition) isSameApplyID() bool {
rst := true
applyID := mp.LoadResponse[0].ApplyID
for _, loadResponse := range mp.LoadResponse {
if applyID != loadResponse.ApplyID {
rst = false
}
}
return rst
}
func (mp *MetaPartition) checkInodeCount(c *Cluster) (isEqual bool) {
isEqual = true
maxInode := mp.LoadResponse[0].MaxInode
maxInodeCount := mp.LoadResponse[0].InodeCount
inodeEqual := true
maxInodeEqual := true
if mp.IsRecover {
return
}
for _, loadResponse := range mp.LoadResponse {
diff := math.Abs(float64(loadResponse.MaxInode) - float64(maxInode))
if diff > defaultRangeOfCountDifferencesAllowed {
isEqual = false
inodeEqual = false
break
}
diff = math.Abs(float64(loadResponse.InodeCount) - float64(maxInodeCount))
if diff > defaultRangeOfCountDifferencesAllowed {
isEqual = false
maxInodeEqual = false
break
}
}
if !isEqual {
msg := fmt.Sprintf("inode count is not equal,vol[%v],mpID[%v],", mp.volName, mp.PartitionID)
for _, lr := range mp.LoadResponse {
lrMsg := fmt.Sprintf(msg+lr.Addr, "applyId[%d],committedId[%d],maxInode[%d],InodeCnt[%d]", lr.ApplyID, lr.CommittedID, lr.MaxInode, lr.InodeCount)
Warn(c.Name, lrMsg)
}
if !maxInodeEqual {
c.inodeCountNotEqualMP.Store(mp.PartitionID, mp)
}
if !inodeEqual {
c.maxInodeNotEqualMP.Store(mp.PartitionID, mp)
}
} else {
if _, ok := c.inodeCountNotEqualMP.Load(mp.PartitionID); ok {
c.inodeCountNotEqualMP.Delete(mp.PartitionID)
}
if _, ok := c.maxInodeNotEqualMP.Load(mp.PartitionID); ok {
c.maxInodeNotEqualMP.Delete(mp.PartitionID)
}
}
return
}
func (mp *MetaPartition) checkDentryCount(c *Cluster) (isEqual bool) {
isEqual = true
if mp.IsRecover {
return
}
dentryCount := mp.LoadResponse[0].DentryCount
for _, loadResponse := range mp.LoadResponse {
diff := math.Abs(float64(loadResponse.DentryCount) - float64(dentryCount))
if diff > defaultRangeOfCountDifferencesAllowed {
isEqual = false
}
}
if !isEqual {
msg := fmt.Sprintf("dentry count is not equal,vol[%v],mpID[%v],", mp.volName, mp.PartitionID)
for _, lr := range mp.LoadResponse {
lrMsg := fmt.Sprintf(msg+lr.Addr, "applyId[%d],committedId[%d],dentryCount[%d]", lr.ApplyID, lr.CommittedID, lr.DentryCount)
Warn(c.Name, lrMsg)
}
c.dentryCountNotEqualMP.Store(mp.PartitionID, mp)
} else {
if _, ok := c.dentryCountNotEqualMP.Load(mp.PartitionID); ok {
c.dentryCountNotEqualMP.Delete(mp.PartitionID)
}
}
return
}
func (c *Cluster) scheduleToCheckMetaPartitionRecoveryProgress() {
go func() {
for {
if c.partition != nil && c.partition.IsRaftLeader() {
if c.vols != nil {
c.checkMetaPartitionRecoveryProgress()
}
}
time.Sleep(time.Second * defaultIntervalToCheckDataPartition)
}
}()
}
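// checkMetaPartitionRecoveryProgress walks the bad meta partition map, drops the
// partitions whose replicas have caught up (replica count restored and max inode
// ids converged) and keeps the ones that are still recovering.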
func (c *Cluster) checkMetaPartitionRecoveryProgress() {
defer func() {
if r := recover(); r != nil {
log.LogWarnf("checkMetaPartitionRecoveryProgress occurred panic,err[%v]", r)
WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
"checkMetaPartitionRecoveryProgress occurred panic")
}
}()
c.badPartitionMutex.Lock()
defer c.badPartitionMutex.Unlock()
c.BadMetaPartitionIds.Range(func(key, value interface{}) bool {
badMetaPartitionIds := value.([]uint64)
newBadMpIds := make([]uint64, 0)
for _, partitionID := range badMetaPartitionIds {
partition, err := c.getMetaPartitionByID(partitionID)
if err != nil {
Warn(c.Name, fmt.Sprintf("checkMetaPartitionRecoveryProgress clusterID[%v], partitionID[%v] is not exist", c.Name, partitionID))
continue
}
vol, err := c.getVol(partition.volName)
if err != nil {
Warn(c.Name, fmt.Sprintf("checkMetaPartitionRecoveryProgress clusterID[%v],vol[%v] partitionID[%v]is not exist",
c.Name, partition.volName, partitionID))
continue
}
if len(partition.Replicas) == 0 || len(partition.Replicas) < int(vol.mpReplicaNum) {
newBadMpIds = append(newBadMpIds, partitionID)
continue
}
if partition.getMinusOfMaxInodeID() < defaultMinusOfMaxInodeID {
partition.IsRecover = false
partition.RLock()
c.syncUpdateMetaPartition(partition)
partition.RUnlock()
Warn(c.Name, fmt.Sprintf("checkMetaPartitionRecoveryProgress clusterID[%v],vol[%v] partitionID[%v] has recovered success",
c.Name, partition.volName, partitionID))
} else {
newBadMpIds = append(newBadMpIds, partitionID)
}
}
if len(newBadMpIds) == 0 {
Warn(c.Name, fmt.Sprintf("checkMetaPartitionRecoveryProgress clusterID[%v],node[%v] has recovered success", c.Name, key))
c.BadMetaPartitionIds.Delete(key)
} else {
c.BadMetaPartitionIds.Store(key, newBadMpIds)
log.LogInfof("checkMetaPartitionRecoveryProgress BadMetaPartitionIds there is still (%d) mp in recover, addr (%s)", len(newBadMpIds), key)
}
return true
})
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"encoding/json"
"fmt"
"io"
"strconv"
"github.com/cubefs/cubefs/depends/tiglabs/raft"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
raftstore "github.com/cubefs/cubefs/raftstore/raftstore_db"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
)
const (
applied = "applied"
)
type raftLeaderChangeHandler func(leader uint64)
type raftPeerChangeHandler func(confChange *proto.ConfChange) (err error)
type raftUserCmdApplyHandler func(opt uint32, key string, cmdMap map[string][]byte) (err error)
type raftApplySnapshotHandler func()
// MetadataFsm represents the finite state machine of a metadata partition
type MetadataFsm struct {
store *raftstore.RocksDBStore
rs *raft.RaftServer
applied uint64
retainLogs uint64
leaderChangeHandler raftLeaderChangeHandler
peerChangeHandler raftPeerChangeHandler
snapshotHandler raftApplySnapshotHandler
UserAppCmdHandler raftUserCmdApplyHandler
onSnapshot bool
}
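// newMetadataFsm returns a MetadataFsm backed by the given RocksDB store and raft server.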
func newMetadataFsm(store *raftstore.RocksDBStore, retainsLog uint64, rs *raft.RaftServer) (fsm *MetadataFsm) {
fsm = new(MetadataFsm)
fsm.store = store
fsm.rs = rs
fsm.retainLogs = retainsLog
return
}
// Corresponding to the LeaderChange interface in Raft library.
func (mf *MetadataFsm) registerLeaderChangeHandler(handler raftLeaderChangeHandler) {
mf.leaderChangeHandler = handler
}
// Corresponding to the PeerChange interface in Raft library.
func (mf *MetadataFsm) registerPeerChangeHandler(handler raftPeerChangeHandler) {
mf.peerChangeHandler = handler
}
// Corresponding to the ApplySnapshot interface in Raft library.
func (mf *MetadataFsm) registerApplySnapshotHandler(handler raftApplySnapshotHandler) {
mf.snapshotHandler = handler
}
// Corresponding to the ApplyRaftCmd interface in Raft library.
func (mf *MetadataFsm) registerRaftUserCmdApplyHandler(handler raftUserCmdApplyHandler) {
mf.UserAppCmdHandler = handler
}
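// restore reloads the persisted applied index from the RocksDB store.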
func (mf *MetadataFsm) restore() {
mf.restoreApplied()
}
func (mf *MetadataFsm) restoreApplied() {
value, err := mf.store.Get(applied)
if err != nil {
panic(fmt.Sprintf("Failed to restore applied err:%v", err.Error()))
}
byteValues := value.([]byte)
if len(byteValues) == 0 {
mf.applied = 0
return
}
applied, err := strconv.ParseUint(string(byteValues), 10, 64)
if err != nil {
panic(fmt.Sprintf("Failed to restore applied,err:%v ", err.Error()))
}
mf.applied = applied
}
// Apply implements the interface of raft.StateMachine
func (mf *MetadataFsm) Apply(command []byte, index uint64) (resp interface{}, err error) {
log.LogDebugf("[Apply] apply index(%v)", index)
cmd := new(RaftCmd)
if err = cmd.Unmarshal(command); err != nil {
log.LogErrorf("action[fsmApply],unmarshal data:%v, err:%v", command, err.Error())
panic(err)
}
cmdMap := make(map[string][]byte)
deleteSet := make(map[string]util.Null)
if cmd.Op != opSyncBatchPut {
cmdMap[cmd.K] = cmd.V
cmdMap[applied] = []byte(strconv.FormatUint(uint64(index), 10))
} else {
nestedCmdMap := make(map[string]*RaftCmd)
if err = json.Unmarshal(cmd.V, &nestedCmdMap); err != nil {
log.LogErrorf("action[fsmApply],unmarshal nested cmd data:%v, err:%v", command, err.Error())
panic(err)
}
for cmdK, cmd := range nestedCmdMap {
switch cmd.Op {
case opSyncDeleteDataNode, opSyncDeleteMetaNode, opSyncDeleteVol, opSyncDeleteDataPartition, opSyncDeleteMetaPartition,
opSyncDeleteUserInfo, opSyncDeleteAKUser, opSyncDeleteVolUser, opSyncDeleteQuota, opSyncDeleteLcNode, opSyncDeleteLcConf, opSyncS3QosDelete:
deleteSet[cmdK] = util.Null{}
// NOTE: opSyncPutFollowerApiLimiterInfo and opSyncPutApiLimiterInfo may need special handling
default:
cmdMap[cmdK] = cmd.V
}
}
cmdMap[applied] = []byte(strconv.FormatUint(uint64(index), 10))
}
switch cmd.Op {
case opSyncDeleteDataNode, opSyncDeleteMetaNode, opSyncDeleteVol, opSyncDeleteDataPartition, opSyncDeleteMetaPartition,
opSyncDeleteUserInfo, opSyncDeleteAKUser, opSyncDeleteVolUser, opSyncDeleteQuota, opSyncDeleteLcNode, opSyncDeleteLcConf, opSyncS3QosDelete:
if err = mf.delKeyAndPutIndex(cmd.K, cmdMap); err != nil {
panic(err)
}
case opSyncPutFollowerApiLimiterInfo, opSyncPutApiLimiterInfo:
mf.UserAppCmdHandler(cmd.Op, cmd.K, cmdMap)
//if err = mf.delKeyAndPutIndex(cmd.K, cmdMap); err != nil {
// panic(err)
//}
if err = mf.store.BatchPut(cmdMap, true); err != nil {
panic(err)
}
default:
// sync put data
if err = mf.store.BatchDeleteAndPut(deleteSet, cmdMap, true); err != nil {
panic(err)
}
}
mf.applied = index
if mf.applied > 0 && (mf.applied%mf.retainLogs) == 0 {
log.LogWarnf("action[Apply],truncate raft log,retainLogs[%v],index[%v]", mf.retainLogs, mf.applied)
mf.rs.Truncate(GroupID, mf.applied)
}
return
}
// ApplyMemberChange implements the interface of raft.StateMachine
func (mf *MetadataFsm) ApplyMemberChange(confChange *proto.ConfChange, index uint64) (interface{}, error) {
var err error
if mf.peerChangeHandler != nil {
err = mf.peerChangeHandler(confChange)
}
return nil, err
}
// Snapshot implements the interface of raft.StateMachine
func (mf *MetadataFsm) Snapshot() (proto.Snapshot, error) {
snapshot := mf.store.RocksDBSnapshot()
iterator := mf.store.Iterator(snapshot)
iterator.SeekToFirst()
return &MetadataSnapshot{
applied: mf.applied,
snapshot: snapshot,
fsm: mf,
iterator: iterator,
}, nil
}
// ApplySnapshot implements the interface of raft.StateMachine
func (mf *MetadataFsm) ApplySnapshot(peers []proto.Peer, iterator proto.SnapIterator) (err error) {
log.LogWarnf("action[ApplySnapshot] reset rocksdb before applying snapshot")
mf.onSnapshot = true
defer func() {
mf.onSnapshot = false
}()
if log.EnableDebug() {
func() {
snap := mf.store.RocksDBSnapshot()
defer mf.store.ReleaseSnapshot(snap)
iter := mf.store.Iterator(snap)
defer iter.Close()
cnt := 0
for iter.SeekToFirst(); iter.Valid(); iter.Next() {
cnt++
}
log.LogDebugf("[ApplySnapshot] scan %v keys before clear", cnt)
}()
}
mf.store.Clear()
if log.EnableDebug() {
func() {
snap := mf.store.RocksDBSnapshot()
defer mf.store.ReleaseSnapshot(snap)
iter := mf.store.Iterator(snap)
defer iter.Close()
cnt := 0
for iter.SeekToFirst(); iter.Valid(); iter.Next() {
cnt++
}
log.LogDebugf("[ApplySnapshot] scan %v keys after clear", cnt)
}()
}
log.LogWarnf(fmt.Sprintf("action[ApplySnapshot] begin,applied[%v]", mf.applied))
var data []byte
var appliedIndex []byte
for err == nil {
bgTime := stat.BeginStat()
if data, err = iterator.Next(); err != nil {
break
}
stat.EndStat("ApplySnapshot-Next", err, bgTime, 1)
cmd := &RaftCmd{}
if err = json.Unmarshal(data, cmd); err != nil {
goto errHandler
}
bgTime = stat.BeginStat()
if cmd.K != applied {
if _, err = mf.store.Put(cmd.K, cmd.V, false); err != nil {
goto errHandler
}
} else {
appliedIndex = cmd.V
}
stat.EndStat("ApplySnapshot-Put", err, bgTime, 1)
}
if err != nil && err != io.EOF {
goto errHandler
}
if err = mf.store.Flush(); err != nil {
log.LogError(fmt.Sprintf("action[ApplySnapshot] Flush failed,err:%v", err.Error()))
goto errHandler
}
// NOTE: the applied index is written last
log.LogDebugf("[ApplySnapshot] find applied index(%v)", appliedIndex)
if appliedIndex != nil {
if _, err = mf.store.Put(applied, appliedIndex, true); err != nil {
goto errHandler
}
} else {
log.LogErrorf("[ApplySnapshot] not found applied index in snapshot")
}
mf.snapshotHandler()
log.LogWarnf(fmt.Sprintf("action[ApplySnapshot] success,applied[%v]", mf.applied))
return nil
errHandler:
log.LogError(fmt.Sprintf("action[ApplySnapshot] failed,err:%v", err.Error()))
return err
}
// HandleFatalEvent implements the interface of raft.StateMachine
func (mf *MetadataFsm) HandleFatalEvent(err *raft.FatalError) {
panic(err.Err)
}
// HandleLeaderChange implements the interface of raft.StateMachine
func (mf *MetadataFsm) HandleLeaderChange(leader uint64) {
if mf.leaderChangeHandler != nil {
go mf.leaderChangeHandler(leader)
}
}
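// delKeyAndPutIndex deletes the given key and persists the remaining entries in cmdMap (including the applied index).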
func (mf *MetadataFsm) delKeyAndPutIndex(key string, cmdMap map[string][]byte) (err error) {
return mf.store.DeleteKeyAndPutIndex(key, cmdMap, true)
}
// Stop stops the RaftServer
func (mf *MetadataFsm) Stop() {
if mf.rs != nil {
mf.rs.Stop()
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"encoding/json"
"fmt"
"strconv"
"strings"
"sync/atomic"
"time"
"golang.org/x/time/rate"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
bsProto "github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
/* We define several "values" here, such as clusterValue, metaPartitionValue, dataPartitionValue, volValue, dataNodeValue,
nodeSetValue, and metaNodeValue. They are the value objects that are marshaled into byte arrays and
transferred over the network. */
type clusterValue struct {
Name string
CreateTime int64
Threshold float32
LoadFactor float32
DisableAutoAllocate bool
ForbidMpDecommission bool
DataNodeDeleteLimitRate uint64
MetaNodeDeleteBatchCount uint64
MetaNodeDeleteWorkerSleepMs uint64
DataNodeAutoRepairLimitRate uint64
MaxDpCntLimit uint64
FaultDomain bool
DiskQosEnable bool
QosLimitUpload uint64
DirChildrenNumLimit uint32
DecommissionLimit uint64
CheckDataReplicasEnable bool
FileStatsEnable bool
ClusterUuid string
ClusterUuidEnable bool
MetaPartitionInodeIdStep uint64
MaxConcurrentLcNodes uint64
DpMaxRepairErrCnt uint64
DpRepairTimeOut uint64
EnableAutoDecommissionDisk bool
DecommissionDiskFactor float64
}
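// newClusterValue snapshots the cluster-level settings of c into a clusterValue for persistence.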
func newClusterValue(c *Cluster) (cv *clusterValue) {
cv = &clusterValue{
Name: c.Name,
CreateTime: c.CreateTime,
LoadFactor: c.cfg.ClusterLoadFactor,
Threshold: c.cfg.MetaNodeThreshold,
DataNodeDeleteLimitRate: c.cfg.DataNodeDeleteLimitRate,
MetaNodeDeleteBatchCount: c.cfg.MetaNodeDeleteBatchCount,
MetaNodeDeleteWorkerSleepMs: c.cfg.MetaNodeDeleteWorkerSleepMs,
DataNodeAutoRepairLimitRate: c.cfg.DataNodeAutoRepairLimitRate,
DisableAutoAllocate: c.DisableAutoAllocate,
ForbidMpDecommission: c.ForbidMpDecommission,
MaxDpCntLimit: c.cfg.MaxDpCntLimit,
FaultDomain: c.FaultDomain,
DiskQosEnable: c.diskQosEnable,
QosLimitUpload: uint64(c.QosAcceptLimit.Limit()),
DirChildrenNumLimit: c.cfg.DirChildrenNumLimit,
DecommissionLimit: c.DecommissionLimit,
CheckDataReplicasEnable: c.checkDataReplicasEnable,
FileStatsEnable: c.fileStatsEnable,
ClusterUuid: c.clusterUuid,
ClusterUuidEnable: c.clusterUuidEnable,
MetaPartitionInodeIdStep: c.cfg.MetaPartitionInodeIdStep,
MaxConcurrentLcNodes: c.cfg.MaxConcurrentLcNodes,
DpMaxRepairErrCnt: c.cfg.DpMaxRepairErrCnt,
DpRepairTimeOut: c.cfg.DpRepairTimeOut,
EnableAutoDecommissionDisk: c.EnableAutoDecommissionDisk,
DecommissionDiskFactor: c.DecommissionDiskFactor,
}
return cv
}
type metaPartitionValue struct {
PartitionID uint64
Start uint64
End uint64
VolID uint64
ReplicaNum uint8
Status int8
VolName string
Hosts string
OfflinePeerID uint64
Peers []bsProto.Peer
IsRecover bool
}
func newMetaPartitionValue(mp *MetaPartition) (mpv *metaPartitionValue) {
mpv = &metaPartitionValue{
PartitionID: mp.PartitionID,
Start: mp.Start,
End: mp.End,
VolID: mp.volID,
ReplicaNum: mp.ReplicaNum,
Status: mp.Status,
VolName: mp.volName,
Hosts: mp.hostsToString(),
Peers: mp.Peers,
OfflinePeerID: mp.OfflinePeerID,
IsRecover: mp.IsRecover,
}
return
}
type dataPartitionValue struct {
PartitionID uint64
ReplicaNum uint8
Hosts string
Peers []bsProto.Peer
Status int8
VolID uint64
VolName string
OfflinePeerID uint64
Replicas []*replicaValue
IsRecover bool
PartitionType int
PartitionTTL int64
RdOnly bool
IsDiscard bool
DecommissionRetry int
DecommissionStatus uint32
DecommissionSrcAddr string
DecommissionDstAddr string
DecommissionRaftForce bool
DecommissionSrcDiskPath string
DecommissionTerm uint64
SpecialReplicaDecommissionStep uint32
DecommissionDstAddrSpecify bool
DecommissionNeedRollback bool
RecoverStartTime int64
RecoverLastConsumeTime float64
Forbidden bool
DecommissionWaitTimes int
}
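// Restore rebuilds a DataPartition from its persisted value, fixing up peer IDs against the
// currently registered data nodes and re-attaching only the replicas whose hosts are still present.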
func (dpv *dataPartitionValue) Restore(c *Cluster) (dp *DataPartition) {
for i := 0; i < len(dpv.Peers); i++ {
dn, ok := c.dataNodes.Load(dpv.Peers[i].Addr)
if ok && dn.(*DataNode).ID != dpv.Peers[i].ID {
dpv.Peers[i].ID = dn.(*DataNode).ID
}
}
dp = newDataPartition(dpv.PartitionID, dpv.ReplicaNum, dpv.VolName, dpv.VolID, dpv.PartitionType, dpv.PartitionTTL)
dp.Hosts = strings.Split(dpv.Hosts, underlineSeparator)
dp.Peers = dpv.Peers
dp.OfflinePeerID = dpv.OfflinePeerID
dp.isRecover = dpv.IsRecover
dp.RdOnly = dpv.RdOnly
dp.IsDiscard = dpv.IsDiscard
dp.DecommissionRaftForce = dpv.DecommissionRaftForce
dp.DecommissionDstAddr = dpv.DecommissionDstAddr
dp.DecommissionSrcAddr = dpv.DecommissionSrcAddr
dp.DecommissionRetry = dpv.DecommissionRetry
dp.DecommissionStatus = dpv.DecommissionStatus
dp.DecommissionSrcDiskPath = dpv.DecommissionSrcDiskPath
dp.DecommissionTerm = dpv.DecommissionTerm
dp.SpecialReplicaDecommissionStep = dpv.SpecialReplicaDecommissionStep
dp.DecommissionDstAddrSpecify = dpv.DecommissionDstAddrSpecify
dp.DecommissionNeedRollback = dpv.DecommissionNeedRollback
dp.RecoverStartTime = time.Unix(dpv.RecoverStartTime, 0)
dp.RecoverLastConsumeTime = time.Duration(dpv.RecoverLastConsumeTime) * time.Second
dp.DecommissionWaitTimes = dpv.DecommissionWaitTimes
for _, rv := range dpv.Replicas {
if !contains(dp.Hosts, rv.Addr) {
continue
}
dp.afterCreation(rv.Addr, rv.DiskPath, c)
}
return dp
}
type replicaValue struct {
Addr string
DiskPath string
}
func newDataPartitionValue(dp *DataPartition) (dpv *dataPartitionValue) {
dpv = &dataPartitionValue{
PartitionID: dp.PartitionID,
ReplicaNum: dp.ReplicaNum,
Hosts: dp.hostsToString(),
Peers: dp.Peers,
Status: dp.Status,
VolID: dp.VolID,
VolName: dp.VolName,
OfflinePeerID: dp.OfflinePeerID,
Replicas: make([]*replicaValue, 0),
IsRecover: dp.isRecover,
PartitionType: dp.PartitionType,
PartitionTTL: dp.PartitionTTL,
RdOnly: dp.RdOnly,
IsDiscard: dp.IsDiscard,
DecommissionRetry: dp.DecommissionRetry,
DecommissionStatus: dp.DecommissionStatus,
DecommissionSrcAddr: dp.DecommissionSrcAddr,
DecommissionDstAddr: dp.DecommissionDstAddr,
DecommissionRaftForce: dp.DecommissionRaftForce,
DecommissionSrcDiskPath: dp.DecommissionSrcDiskPath,
DecommissionTerm: dp.DecommissionTerm,
SpecialReplicaDecommissionStep: dp.SpecialReplicaDecommissionStep,
DecommissionDstAddrSpecify: dp.DecommissionDstAddrSpecify,
DecommissionNeedRollback: dp.DecommissionNeedRollback,
RecoverStartTime: dp.RecoverStartTime.Unix(),
RecoverLastConsumeTime: dp.RecoverLastConsumeTime.Seconds(),
DecommissionWaitTimes: dp.DecommissionWaitTimes,
}
for _, replica := range dp.Replicas {
rv := &replicaValue{Addr: replica.Addr, DiskPath: replica.DiskPath}
dpv.Replicas = append(dpv.Replicas, rv)
}
return
}
type volValue struct {
ID uint64
Name string
ReplicaNum uint8
DpReplicaNum uint8
Status uint8
DataPartitionSize uint64
Capacity uint64
Owner string
FollowerRead bool
Authenticate bool
DpReadOnlyWhenVolFull bool
CrossZone bool
DomainOn bool
ZoneName string
OSSAccessKey string
OSSSecretKey string
CreateTime int64
DeleteLockTime int64
Description string
DpSelectorName string
DpSelectorParm string
DefaultPriority bool
DomainId uint64
VolType int
EbsBlkSize int
CacheCapacity uint64
CacheAction int
CacheThreshold int
CacheTTL int
CacheHighWater int
CacheLowWater int
CacheLRUInterval int
CacheRule string
EnablePosixAcl bool
EnableQuota bool
EnableTransaction bsProto.TxOpMask
TxTimeout int64
TxConflictRetryNum int64
TxConflictRetryInterval int64
TxOpLimit int
VolQosEnable bool
DiskQosEnable bool
IopsRLimit, IopsWLimit, FlowRlimit, FlowWlimit uint64
IopsRMagnify, IopsWMagnify, FlowRMagnify, FlowWMagnify uint32
ClientReqPeriod, ClientHitTriggerCnt uint32
Forbidden bool
EnableAuditLog bool
}
func (v *volValue) Bytes() (raw []byte, err error) {
raw, err = json.Marshal(v)
return
}
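// newVolValue snapshots the volume configuration of vol into a volValue for persistence.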
func newVolValue(vol *Vol) (vv *volValue) {
vv = &volValue{
ID: vol.ID,
Name: vol.Name,
ReplicaNum: vol.mpReplicaNum,
DpReplicaNum: vol.dpReplicaNum,
Status: vol.Status,
DataPartitionSize: vol.dataPartitionSize,
Capacity: vol.Capacity,
Owner: vol.Owner,
FollowerRead: vol.FollowerRead,
Authenticate: vol.authenticate,
CrossZone: vol.crossZone,
DomainOn: vol.domainOn,
ZoneName: vol.zoneName,
OSSAccessKey: vol.OSSAccessKey,
OSSSecretKey: vol.OSSSecretKey,
CreateTime: vol.createTime,
DeleteLockTime: vol.DeleteLockTime,
Description: vol.description,
DpSelectorName: vol.dpSelectorName,
DpSelectorParm: vol.dpSelectorParm,
DefaultPriority: vol.defaultPriority,
EnablePosixAcl: vol.enablePosixAcl,
EnableQuota: vol.enableQuota,
EnableTransaction: vol.enableTransaction,
TxTimeout: vol.txTimeout,
TxConflictRetryNum: vol.txConflictRetryNum,
TxConflictRetryInterval: vol.txConflictRetryInterval,
TxOpLimit: vol.txOpLimit,
VolType: vol.VolType,
EbsBlkSize: vol.EbsBlkSize,
CacheCapacity: vol.CacheCapacity,
CacheAction: vol.CacheAction,
CacheThreshold: vol.CacheThreshold,
CacheTTL: vol.CacheTTL,
CacheHighWater: vol.CacheHighWater,
CacheLowWater: vol.CacheLowWater,
CacheLRUInterval: vol.CacheLRUInterval,
CacheRule: vol.CacheRule,
VolQosEnable: vol.qosManager.qosEnable,
IopsRLimit: vol.qosManager.getQosLimit(bsProto.IopsReadType),
IopsWLimit: vol.qosManager.getQosLimit(bsProto.IopsWriteType),
FlowRlimit: vol.qosManager.getQosLimit(bsProto.FlowReadType),
FlowWlimit: vol.qosManager.getQosLimit(bsProto.FlowWriteType),
IopsRMagnify: vol.qosManager.getQosMagnify(bsProto.IopsReadType),
IopsWMagnify: vol.qosManager.getQosMagnify(bsProto.IopsWriteType),
FlowRMagnify: vol.qosManager.getQosMagnify(bsProto.FlowReadType),
FlowWMagnify: vol.qosManager.getQosMagnify(bsProto.FlowWriteType),
ClientReqPeriod: vol.qosManager.ClientReqPeriod,
ClientHitTriggerCnt: vol.qosManager.ClientHitTriggerCnt,
DpReadOnlyWhenVolFull: vol.DpReadOnlyWhenVolFull,
Forbidden: vol.Forbidden,
EnableAuditLog: vol.EnableAuditLog,
}
return
}
func newVolValueFromBytes(raw []byte) (*volValue, error) {
vv := &volValue{}
if err := json.Unmarshal(raw, vv); err != nil {
return nil, err
}
return vv, nil
}
type dataNodeValue struct {
ID uint64
NodeSetID uint64
Addr string
ZoneName string
RdOnly bool
DecommissionedDisks []string
DecommissionStatus uint32
DecommissionDstAddr string
DecommissionRaftForce bool
DecommissionLimit int
DecommissionRetry uint8
DecommissionCompleteTime int64
ToBeOffline bool
DecommissionDiskList []string
DecommissionDpTotal int
}
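// newDataNodeValue snapshots the persistent fields of a DataNode, including its decommission state.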
func newDataNodeValue(dataNode *DataNode) *dataNodeValue {
return &dataNodeValue{
ID: dataNode.ID,
NodeSetID: dataNode.NodeSetID,
Addr: dataNode.Addr,
ZoneName: dataNode.ZoneName,
RdOnly: dataNode.RdOnly,
DecommissionedDisks: dataNode.getDecommissionedDisks(),
DecommissionStatus: atomic.LoadUint32(&dataNode.DecommissionStatus),
DecommissionDstAddr: dataNode.DecommissionDstAddr,
DecommissionRaftForce: dataNode.DecommissionRaftForce,
DecommissionLimit: dataNode.DecommissionLimit,
DecommissionRetry: dataNode.DecommissionRetry,
DecommissionCompleteTime: dataNode.DecommissionCompleteTime,
ToBeOffline: dataNode.ToBeOffline,
DecommissionDiskList: dataNode.DecommissionDiskList,
DecommissionDpTotal: dataNode.DecommissionDpTotal,
}
}
type metaNodeValue struct {
ID uint64
NodeSetID uint64
Addr string
ZoneName string
RdOnly bool
}
func newMetaNodeValue(metaNode *MetaNode) *metaNodeValue {
return &metaNodeValue{
ID: metaNode.ID,
NodeSetID: metaNode.NodeSetID,
Addr: metaNode.Addr,
ZoneName: metaNode.ZoneName,
RdOnly: metaNode.RdOnly,
}
}
type nodeSetValue struct {
ID uint64
Capacity int
ZoneName string
DataNodeSelector string
MetaNodeSelector string
}
type domainNodeSetGrpValue struct {
DomainId uint64
ID uint64
NodeSetsIds []uint64
Status uint8
}
type zoneDomainValue struct {
ExcludeZoneMap map[string]int
NeedFaultDomain bool
DataRatio float64
domainNodeSetGrpVec []*DomainNodeSetGrpManager
DomainZoneName2IdMap map[string]uint64 // zoneName:domainId
ExcludeZoneUseRatio float64
}
func newZoneDomainValue() (ev *zoneDomainValue) {
ev = &zoneDomainValue{
ExcludeZoneMap: make(map[string]int),
}
return
}
func newNodeSetValue(nset *nodeSet) (nsv *nodeSetValue) {
nsv = &nodeSetValue{
ID: nset.ID,
Capacity: nset.Capacity,
ZoneName: nset.zoneName,
DataNodeSelector: nset.GetDataNodeSelector(),
MetaNodeSelector: nset.GetMetaNodeSelector(),
}
return
}
func newNodeSetGrpValue(nset *nodeSetGroup) (nsv *domainNodeSetGrpValue) {
nsv = &domainNodeSetGrpValue{
DomainId: nset.domainId,
ID: nset.ID,
NodeSetsIds: nset.nodeSetsIds,
Status: nset.status,
}
return
}
// RaftCmd defines the Raft commands.
type RaftCmd struct {
Op uint32 `json:"op"`
K string `json:"k"`
V []byte `json:"v"`
}
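// A raft command is normally built by one of the sync* helpers below and proposed through
// Cluster.submit. A minimal sketch (values are illustrative only, mirroring syncPutCluster):
//
//	cmd := new(RaftCmd)
//	cmd.Op = opSyncPutCluster
//	cmd.K = clusterPrefix + c.Name
//	cmd.V, _ = json.Marshal(newClusterValue(c))
//	err := c.submit(cmd)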
// Marshal converts the RaftCmd to a byte array.
func (m *RaftCmd) Marshal() ([]byte, error) {
return json.Marshal(m)
}
// Unmarshal converts the byte array to a RaftCmd.
func (m *RaftCmd) Unmarshal(data []byte) (err error) {
return json.Unmarshal(data, m)
}
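// setOpType infers the op code of a RaftCmd from the acronym embedded in its key.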
func (m *RaftCmd) setOpType() {
keyArr := strings.Split(m.K, keySeparator)
if len(keyArr) < 2 {
log.LogWarnf("action[setOpType] invalid length[%v]", keyArr)
return
}
switch keyArr[1] {
case metaNodeAcronym:
m.Op = opSyncAddMetaNode
case dataNodeAcronym:
m.Op = opSyncAddDataNode
case dataPartitionAcronym:
m.Op = opSyncAddDataPartition
case metaPartitionAcronym:
m.Op = opSyncAddMetaPartition
case volAcronym:
m.Op = opSyncAddVol
case clusterAcronym:
m.Op = opSyncPutCluster
case nodeSetAcronym:
m.Op = opSyncAddNodeSet
case maxDataPartitionIDKey:
m.Op = opSyncAllocDataPartitionID
case maxMetaPartitionIDKey:
m.Op = opSyncAllocMetaPartitionID
case maxCommonIDKey:
m.Op = opSyncAllocCommonID
case userAcronym:
m.Op = opSyncAddUserInfo
case akAcronym:
m.Op = opSyncAddAKUser
case volUserAcronym:
m.Op = opSyncAddVolUser
case lcNodeAcronym:
m.Op = opSyncAddLcNode
case lcConfigurationAcronym:
m.Op = opSyncAddLcConf
default:
log.LogWarnf("action[setOpType] unknown opCode[%v]", keyArr[1])
}
}
// key=#c#name
func (c *Cluster) syncPutCluster() (err error) {
metadata := new(RaftCmd)
metadata.Op = opSyncPutCluster
metadata.K = clusterPrefix + c.Name
cv := newClusterValue(c)
log.LogInfof("action[syncPutCluster] cluster value:[%+v]", cv)
metadata.V, err = json.Marshal(cv)
if err != nil {
return
}
return c.submit(metadata)
}
func (c *Cluster) syncPutApiLimiterInfo(followerLimiter bool) (err error) {
metadata := new(RaftCmd)
if followerLimiter {
metadata.Op = opSyncPutFollowerApiLimiterInfo
} else {
metadata.Op = opSyncPutApiLimiterInfo
}
metadata.K = apiLimiterPrefix + c.Name
c.apiLimiter.m.RLock()
metadata.V, err = json.Marshal(c.apiLimiter.limiterInfos)
c.apiLimiter.m.RUnlock()
if err != nil {
return
}
return c.submit(metadata)
}
func (c *Cluster) loadApiLimiterInfo() (err error) {
result, err := c.fsm.store.SeekForPrefix([]byte(apiLimiterPrefix))
if err != nil {
err = fmt.Errorf("action[loadApiLimiterInfo],err:%v", err.Error())
return err
}
for _, value := range result {
// cv := &clusterValue{}
limiterInfos := make(map[string]*ApiLimitInfo)
if err = json.Unmarshal(value, &limiterInfos); err != nil {
log.LogErrorf("action[loadApiLimiterInfo], unmarshal err:%v", err.Error())
return err
}
for _, v := range limiterInfos {
v.InitLimiter()
}
c.apiLimiter.m.Lock()
c.apiLimiter.limiterInfos = limiterInfos
c.apiLimiter.m.Unlock()
// c.apiLimiter.Replace(limiterInfos)
log.LogInfof("action[loadApiLimiterInfo], limiter info[%v]", value)
}
return
}
// key=#s#id
func (c *Cluster) syncAddNodeSet(nset *nodeSet) (err error) {
return c.putNodeSetInfo(opSyncAddNodeSet, nset)
}
func (c *Cluster) syncUpdateNodeSet(nset *nodeSet) (err error) {
return c.putNodeSetInfo(opSyncUpdateNodeSet, nset)
}
func (c *Cluster) putNodeSetInfo(opType uint32, nset *nodeSet) (err error) {
log.LogInfof("action[putNodeSetInfo], type:[%v], gridId:[%v], name:[%v]", opType, nset.ID, nset.zoneName)
metadata := new(RaftCmd)
metadata.Op = opType
metadata.K = nodeSetPrefix + strconv.FormatUint(nset.ID, 10)
nsv := newNodeSetValue(nset)
metadata.V, err = json.Marshal(nsv)
if err != nil {
return
}
return c.submit(metadata)
}
func (c *Cluster) putNodeSetGrpInfo(opType uint32, nsg *nodeSetGroup) (err error) {
metadata := new(RaftCmd)
metadata.Op = opType
metadata.K = nodeSetGrpPrefix + strconv.FormatUint(nsg.ID, 10)
log.LogInfof("action[putNodeSetGrpInfo] nsg id[%v] status[%v] ids[%v]", nsg.ID, nsg.status, nsg.nodeSetsIds)
nsv := newNodeSetGrpValue(nsg)
log.LogInfof("action[putNodeSetGrpInfo] nsv id[%v] status[%v] ids[%v]", nsv.ID, nsv.Status, nsv.NodeSetsIds)
metadata.V, err = json.Marshal(nsv)
if err != nil {
return
}
return c.submit(metadata)
}
// key=#dp#volID#partitionID,value=json.Marshal(dataPartitionValue)
func (c *Cluster) syncAddDataPartition(dp *DataPartition) (err error) {
return c.putDataPartitionInfo(opSyncAddDataPartition, dp)
}
func (c *Cluster) syncUpdateDataPartition(dp *DataPartition) (err error) {
return c.putDataPartitionInfo(opSyncUpdateDataPartition, dp)
}
func (c *Cluster) syncDeleteDataPartition(dp *DataPartition) (err error) {
return c.putDataPartitionInfo(opSyncDeleteDataPartition, dp)
}
func (c *Cluster) buildDataPartitionRaftCmd(opType uint32, dp *DataPartition) (metadata *RaftCmd, err error) {
metadata = new(RaftCmd)
metadata.Op = opType
metadata.K = dataPartitionPrefix + strconv.FormatUint(dp.VolID, 10) + keySeparator + strconv.FormatUint(dp.PartitionID, 10)
dpv := newDataPartitionValue(dp)
metadata.V, err = json.Marshal(dpv)
if err != nil {
return
}
return
}
func (c *Cluster) putDataPartitionInfo(opType uint32, dp *DataPartition) (err error) {
metadata, err := c.buildDataPartitionRaftCmd(opType, dp)
if err != nil {
return
}
return c.submit(metadata)
}
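// submit marshals the raft command and proposes it to the cluster's raft partition.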
func (c *Cluster) submit(metadata *RaftCmd) (err error) {
cmd, err := metadata.Marshal()
if err != nil {
return errors.New(err.Error())
}
if _, err = c.partition.Submit(cmd); err != nil {
msg := fmt.Sprintf("action[metadata_submit] err:%v", err.Error())
return errors.New(msg)
}
return
}
// key=#vol#volID,value=json.Marshal(vv)
func (c *Cluster) syncAddVol(vol *Vol) (err error) {
return c.syncPutVolInfo(opSyncAddVol, vol)
}
func (c *Cluster) syncUpdateVol(vol *Vol) (err error) {
return c.syncPutVolInfo(opSyncUpdateVol, vol)
}
func (c *Cluster) syncDeleteVol(vol *Vol) (err error) {
return c.syncPutVolInfo(opSyncDeleteVol, vol)
}
func (c *Cluster) sycnPutZoneInfo(zone *Zone) error {
var err error
metadata := new(RaftCmd)
metadata.Op = opSyncUpdateZone
metadata.K = zonePrefix + zone.name
vv := zone.getFsmValue()
if vv.Name == "" {
vv.Name = DefaultZoneName
}
log.LogInfof("action[sycnPutZoneInfo] zone name %v", vv.Name)
if metadata.V, err = json.Marshal(vv); err != nil {
return errors.New(err.Error())
}
return c.submit(metadata)
}
func (c *Cluster) buildVolInfoRaftCmd(opType uint32, vol *Vol) (metadata *RaftCmd, err error) {
metadata = new(RaftCmd)
metadata.Op = opType
metadata.K = volPrefix + strconv.FormatUint(vol.ID, 10)
vv := newVolValue(vol)
if metadata.V, err = json.Marshal(vv); err != nil {
return nil, errors.New(err.Error())
}
return
}
func (c *Cluster) syncPutVolInfo(opType uint32, vol *Vol) (err error) {
metadata, err := c.buildVolInfoRaftCmd(opType, vol)
if err != nil {
return
}
return c.submit(metadata)
}
func (c *Cluster) syncAclList(vol *Vol, val []byte) (err error) {
log.LogDebugf("syncAclList vol %v vallen %v", vol.Name, len(val))
metadata := new(RaftCmd)
metadata.Op = opSyncAcl
metadata.K = AclPrefix + strconv.FormatUint(vol.ID, 10)
metadata.V = val
return c.submit(metadata)
}
func (c *Cluster) syncMultiVersion(vol *Vol, val []byte) (err error) {
metadata := new(RaftCmd)
metadata.Op = opSyncMulitVersion
metadata.K = MultiVerPrefix + strconv.FormatUint(vol.ID, 10)
metadata.V = val
if c == nil {
log.LogErrorf("syncMultiVersion c is nil")
return fmt.Errorf("vol %v but cluster is nil", vol.Name)
}
return c.submit(metadata)
}
func (c *Cluster) loadAclList(vol *Vol) (err error) {
key := AclPrefix + strconv.FormatUint(vol.ID, 10)
result, err := c.fsm.store.SeekForPrefix([]byte(key))
if err != nil {
log.LogErrorf("action[loadAclList] err %v", err)
return
}
log.LogDebugf("loadAclList vol %v rocksdb value count %v", vol.Name, len(result))
vol.aclMgr.init(c, vol)
for _, value := range result {
return vol.aclMgr.load(c, value)
}
return
}
func (c *Cluster) syncUidSpaceList(vol *Vol, val []byte) (err error) {
log.LogDebugf("syncUidSpaceList vol %v vallen %v", vol.Name, len(val))
metadata := new(RaftCmd)
metadata.Op = opSyncUid
metadata.K = UidPrefix + strconv.FormatUint(vol.ID, 10)
metadata.V = val
return c.submit(metadata)
}
func (c *Cluster) loadUidSpaceList(vol *Vol) (err error) {
key := UidPrefix + strconv.FormatUint(vol.ID, 10)
result, err := c.fsm.store.SeekForPrefix([]byte(key))
if err != nil {
log.LogErrorf("action[loadUidSpaceList] err %v", err)
return
}
log.LogDebugf("loadUidSpaceList vol %v rocksdb value count %v", vol.Name, len(result))
vol.initUidSpaceManager(c)
for _, value := range result {
return vol.uidSpaceManager.load(c, value)
}
return
}
func (c *Cluster) loadMultiVersion(vol *Vol) (err error) {
key := MultiVerPrefix + strconv.FormatUint(vol.ID, 10)
result, err := c.fsm.store.SeekForPrefix([]byte(key))
if err != nil {
log.LogErrorf("action[loadMultiVersion] err %v", err)
return
}
if len(result) == 0 {
log.LogWarnf("action[loadMultiVersion] MultiVersion zero and do init")
return vol.VersionMgr.init(c)
}
vol.VersionMgr.c = c
log.LogWarnf("action[loadMultiVersion] vol %v loadMultiVersion set cluster %v vol.VersionMgr %v", vol.Name, c, vol.VersionMgr)
for _, value := range result {
if err = vol.VersionMgr.loadMultiVersion(c, value); err != nil {
log.LogErrorf("action[loadMultiVersion] vol %v err %v", vol.Name, err)
return
}
log.LogWarnf("action[loadMultiVersion] vol %v MultiVersion zero and do init, verlist %v", vol.Name, vol.VersionMgr)
}
return
}
// key=#mp#volID#metaPartitionID,value=json.Marshal(metaPartitionValue)
func (c *Cluster) syncAddMetaPartition(mp *MetaPartition) (err error) {
return c.putMetaPartitionInfo(opSyncAddMetaPartition, mp)
}
func (c *Cluster) syncUpdateMetaPartition(mp *MetaPartition) (err error) {
return c.putMetaPartitionInfo(opSyncUpdateMetaPartition, mp)
}
func (c *Cluster) syncDeleteMetaPartition(mp *MetaPartition) (err error) {
return c.putMetaPartitionInfo(opSyncDeleteMetaPartition, mp)
}
func (c *Cluster) putMetaPartitionInfo(opType uint32, mp *MetaPartition) (err error) {
metadata, err := c.buildMetaPartitionRaftCmd(opType, mp)
if err != nil {
return
}
return c.submit(metadata)
}
func (c *Cluster) buildMetaPartitionRaftCmd(opType uint32, mp *MetaPartition) (metadata *RaftCmd, err error) {
metadata = new(RaftCmd)
metadata.Op = opType
partitionID := strconv.FormatUint(mp.PartitionID, 10)
metadata.K = metaPartitionPrefix + strconv.FormatUint(mp.volID, 10) + keySeparator + partitionID
mpv := newMetaPartitionValue(mp)
if metadata.V, err = json.Marshal(mpv); err != nil {
return metadata, errors.New(err.Error())
}
return
}
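// syncBatchCommitCmd packs multiple raft commands into a single opSyncBatchPut command and submits them in one raft proposal.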
func (c *Cluster) syncBatchCommitCmd(cmdMap map[string]*RaftCmd) (err error) {
value, err := json.Marshal(cmdMap)
if err != nil {
return
}
cmd := &RaftCmd{
Op: opSyncBatchPut,
K: "batch_put",
V: value,
}
return c.submit(cmd)
}
// key=#mn#id#addr,value=json.Marshal(mnv)
func (c *Cluster) syncAddMetaNode(metaNode *MetaNode) (err error) {
return c.syncPutMetaNode(opSyncAddMetaNode, metaNode)
}
func (c *Cluster) syncDeleteMetaNode(metaNode *MetaNode) (err error) {
return c.syncPutMetaNode(opSyncDeleteMetaNode, metaNode)
}
func (c *Cluster) syncUpdateMetaNode(metaNode *MetaNode) (err error) {
return c.syncPutMetaNode(opSyncUpdateMetaNode, metaNode)
}
func (c *Cluster) buildPutMetaNodeCmd(opType uint32, metaNode *MetaNode) (metadata *RaftCmd, err error) {
metadata = new(RaftCmd)
metadata.Op = opType
metadata.K = metaNodePrefix + strconv.FormatUint(metaNode.ID, 10) + keySeparator + metaNode.Addr
mnv := newMetaNodeValue(metaNode)
metadata.V, err = json.Marshal(mnv)
return
}
func (c *Cluster) buildAddMetaNodeCmd(metaNode *MetaNode) (metadata *RaftCmd, err error) {
metadata, err = c.buildPutMetaNodeCmd(opSyncAddMetaNode, metaNode)
return
}
func (c *Cluster) buildDeleteMetaNodeCmd(metaNode *MetaNode) (metadata *RaftCmd, err error) {
metadata, err = c.buildPutMetaNodeCmd(opSyncDeleteMetaNode, metaNode)
return
}
func (c *Cluster) buildUpdateMetaNodeCmd(metaNode *MetaNode) (metadata *RaftCmd, err error) {
metadata, err = c.buildPutMetaNodeCmd(opSyncUpdateMetaNode, metaNode)
return
}
func (c *Cluster) syncPutMetaNode(opType uint32, metaNode *MetaNode) (err error) {
metadata, err := c.buildPutMetaNodeCmd(opType, metaNode)
if err != nil {
return errors.New(err.Error())
}
return c.submit(metadata)
}
// key=#dn#id#Addr,value = json.Marshal(dnv)
func (c *Cluster) syncAddDataNode(dataNode *DataNode) (err error) {
return c.syncPutDataNode(opSyncAddDataNode, dataNode)
}
func (c *Cluster) syncDeleteDataNode(dataNode *DataNode) (err error) {
return c.syncPutDataNode(opSyncDeleteDataNode, dataNode)
}
func (c *Cluster) syncUpdateDataNode(dataNode *DataNode) (err error) {
return c.syncPutDataNode(opSyncUpdateDataNode, dataNode)
}
func (c *Cluster) buildAddDataNodeCmd(dataNode *DataNode) (metadata *RaftCmd, err error) {
metadata, err = c.buildPutDataNodeCmd(opSyncAddDataNode, dataNode)
return
}
func (c *Cluster) buildDeleteDataNodeCmd(dataNode *DataNode) (metadata *RaftCmd, err error) {
metadata, err = c.buildPutDataNodeCmd(opSyncDeleteDataNode, dataNode)
return
}
func (c *Cluster) buildUpdateDataNodeCmd(dataNode *DataNode) (metadata *RaftCmd, err error) {
metadata, err = c.buildPutDataNodeCmd(opSyncUpdateDataNode, dataNode)
return
}
func (c *Cluster) buildPutDataNodeCmd(opType uint32, dataNode *DataNode) (metadata *RaftCmd, err error) {
metadata = new(RaftCmd)
metadata.Op = opType
metadata.K = dataNodePrefix + strconv.FormatUint(dataNode.ID, 10) + keySeparator + dataNode.Addr
dnv := newDataNodeValue(dataNode)
metadata.V, err = json.Marshal(dnv)
if err != nil {
return
}
return
}
func (c *Cluster) syncPutDataNode(opType uint32, dataNode *DataNode) (err error) {
metadata, err := c.buildPutDataNodeCmd(opType, dataNode)
if err != nil {
return
}
return c.submit(metadata)
}
func (c *Cluster) addRaftNode(nodeID uint64, addr string) (err error) {
log.LogInfof("action[addRaftNode] nodeID: %v, addr: %v:", nodeID, addr)
peer := proto.Peer{ID: nodeID}
_, err = c.partition.ChangeMember(proto.ConfAddNode, peer, []byte(addr))
if err != nil {
return errors.New("action[addRaftNode] error: " + err.Error())
}
return nil
}
func (c *Cluster) removeRaftNode(nodeID uint64, addr string) (err error) {
log.LogInfof("action[removeRaftNode] nodeID: %v, addr: %v:", nodeID, addr)
peer := proto.Peer{ID: nodeID}
_, err = c.partition.ChangeMember(proto.ConfRemoveNode, peer, []byte(addr))
if err != nil {
return errors.New("action[removeRaftNode] error: " + err.Error())
}
return nil
}
func (c *Cluster) updateDirChildrenNumLimit(val uint32) {
if val < bsProto.MinDirChildrenNumLimit {
val = bsProto.DefaultDirChildrenNumLimit
}
atomic.StoreUint32(&c.cfg.DirChildrenNumLimit, val)
}
func (c *Cluster) updateMetaNodeDeleteBatchCount(val uint64) {
atomic.StoreUint64(&c.cfg.MetaNodeDeleteBatchCount, val)
}
func (c *Cluster) updateMetaNodeDeleteWorkerSleepMs(val uint64) {
atomic.StoreUint64(&c.cfg.MetaNodeDeleteWorkerSleepMs, val)
}
func (c *Cluster) updateDataPartitionMaxRepairErrCnt(val uint64) {
atomic.StoreUint64(&c.cfg.DpMaxRepairErrCnt, val)
}
func (c *Cluster) updateDataPartitionRepairTimeOut(val uint64) {
atomic.StoreUint64(&c.cfg.DpRepairTimeOut, val)
}
func (c *Cluster) updateDataNodeAutoRepairLimit(val uint64) {
atomic.StoreUint64(&c.cfg.DataNodeAutoRepairLimitRate, val)
}
func (c *Cluster) updateDataNodeDeleteLimitRate(val uint64) {
atomic.StoreUint64(&c.cfg.DataNodeDeleteLimitRate, val)
}
func (c *Cluster) updateMaxDpCntLimit(val uint64) {
atomic.StoreUint64(&c.cfg.MaxDpCntLimit, val)
}
func (c *Cluster) updateInodeIdStep(val uint64) {
atomic.StoreUint64(&c.cfg.MetaPartitionInodeIdStep, val)
}
func (c *Cluster) loadZoneValue() (err error) {
var ok bool
result, err := c.fsm.store.SeekForPrefix([]byte(zonePrefix))
if err != nil {
err = fmt.Errorf("action[loadZoneValue],err:%v", err.Error())
return err
}
for _, value := range result {
cv := &zoneValue{}
if err = json.Unmarshal(value, cv); err != nil {
log.LogErrorf("action[loadZoneValue], unmarshal err:%v", err.Error())
continue
}
var zoneInfo interface{}
if zoneInfo, ok = c.t.zoneMap.Load(cv.Name); !ok {
log.LogErrorf("action[loadZoneValue], zonename [%v] not found", cv.Name)
continue
}
zone := zoneInfo.(*Zone)
zone.QosFlowRLimit = cv.QosFlowRLimit
zone.QosIopsWLimit = cv.QosIopsWLimit
zone.QosFlowWLimit = cv.QosFlowWLimit
zone.QosIopsRLimit = cv.QosIopsRLimit
if zone.GetDataNodesetSelector() != cv.DataNodesetSelector {
zone.dataNodesetSelector = NewNodesetSelector(cv.DataNodesetSelector, DataNodeType)
}
if zone.GetMetaNodesetSelector() != cv.MetaNodesetSelector {
zone.metaNodesetSelector = NewNodesetSelector(cv.MetaNodesetSelector, MetaNodeType)
}
log.LogInfof("action[loadZoneValue] load zonename[%v] with limit [%v,%v,%v,%v]",
zone.name, cv.QosFlowRLimit, cv.QosIopsWLimit, cv.QosFlowWLimit, cv.QosIopsRLimit)
zone.loadDataNodeQosLimit()
}
return
}
func (c *Cluster) updateMaxConcurrentLcNodes(val uint64) {
atomic.StoreUint64(&c.cfg.MaxConcurrentLcNodes, val)
}
// persist cluster value if not persisted; set create time for cluster being created.
func (c *Cluster) checkPersistClusterValue() {
result, err := c.fsm.store.SeekForPrefix([]byte(clusterPrefix))
if err != nil {
err = fmt.Errorf("action[checkPersistClusterValue] seek cluster value err: %v", err.Error())
panic(err)
}
if len(result) != 0 {
log.LogInfo("action[checkPersistClusterValue] already has cluster value record, need to do nothing")
return
}
/* When the cluster value has not been persisted, there are two possibilities:
- the cluster was created by an old-version master that did not persist the cluster value; no need to set the create time;
- the cluster is being created; the create time needs to be set.
Check whether node set info has been persisted to determine which scenario applies. */
result, err = c.fsm.store.SeekForPrefix([]byte(nodeSetPrefix))
if err != nil {
err = fmt.Errorf("action[checkPersistClusterValue] seek node set err: %v", err.Error())
panic(err)
}
oldVal := c.CreateTime
var scenarioMsg string
if len(result) != 0 {
scenarioMsg = "cluster already created"
} else {
scenarioMsg = "cluster being created"
c.CreateTime = time.Now().Unix()
}
log.LogInfo("action[checkPersistClusterValue] to add cluster value record for " + scenarioMsg)
if err = c.syncPutCluster(); err != nil {
c.CreateTime = oldVal
log.LogErrorf("action[checkPersistClusterValue] put err[%v]", err.Error())
panic(err)
}
log.LogInfo("action[checkPersistClusterValue] add cluster value record")
return
}
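// loadClusterValue restores the persisted cluster-level settings from RocksDB into c and its config.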
func (c *Cluster) loadClusterValue() (err error) {
result, err := c.fsm.store.SeekForPrefix([]byte(clusterPrefix))
if err != nil {
err = fmt.Errorf("action[loadClusterValue],err:%v", err.Error())
return err
}
for _, value := range result {
cv := &clusterValue{}
if err = json.Unmarshal(value, cv); err != nil {
log.LogErrorf("action[loadClusterValue], unmarshal err:%v", err.Error())
return err
}
if cv.Name != c.Name {
log.LogErrorf("action[loadClusterValue] loaded cluster value: %+v", cv)
continue
}
log.LogDebugf("action[loadClusterValue] loaded cluster value: %+v", cv)
c.CreateTime = cv.CreateTime
if cv.MaxConcurrentLcNodes == 0 {
cv.MaxConcurrentLcNodes = defaultMaxConcurrentLcNodes
}
c.cfg.MetaNodeThreshold = cv.Threshold
// c.cfg.DirChildrenNumLimit = cv.DirChildrenNumLimit
c.cfg.ClusterLoadFactor = cv.LoadFactor
c.DisableAutoAllocate = cv.DisableAutoAllocate
c.ForbidMpDecommission = cv.ForbidMpDecommission
c.diskQosEnable = cv.DiskQosEnable
c.cfg.QosMasterAcceptLimit = cv.QosLimitUpload
c.DecommissionLimit = cv.DecommissionLimit // don't update the nodeset limits here; node sets are not loaded yet
c.fileStatsEnable = cv.FileStatsEnable
c.clusterUuid = cv.ClusterUuid
c.clusterUuidEnable = cv.ClusterUuidEnable
c.DecommissionLimit = cv.DecommissionLimit
c.EnableAutoDecommissionDisk = cv.EnableAutoDecommissionDisk
c.DecommissionDiskFactor = cv.DecommissionDiskFactor
if c.cfg.QosMasterAcceptLimit < QosMasterAcceptCnt {
c.cfg.QosMasterAcceptLimit = QosMasterAcceptCnt
}
c.QosAcceptLimit.SetLimit(rate.Limit(c.cfg.QosMasterAcceptLimit))
log.LogInfof("action[loadClusterValue] qos limit %v", c.cfg.QosMasterAcceptLimit)
c.updateDirChildrenNumLimit(cv.DirChildrenNumLimit)
c.updateMetaNodeDeleteBatchCount(cv.MetaNodeDeleteBatchCount)
c.updateMetaNodeDeleteWorkerSleepMs(cv.MetaNodeDeleteWorkerSleepMs)
c.updateDataNodeDeleteLimitRate(cv.DataNodeDeleteLimitRate)
c.updateDataNodeAutoRepairLimit(cv.DataNodeAutoRepairLimitRate)
c.updateDataPartitionMaxRepairErrCnt(cv.DpMaxRepairErrCnt)
c.updateDataPartitionRepairTimeOut(cv.DpRepairTimeOut)
c.updateMaxDpCntLimit(cv.MaxDpCntLimit)
if cv.MetaPartitionInodeIdStep == 0 {
cv.MetaPartitionInodeIdStep = defaultMetaPartitionInodeIDStep
}
c.updateInodeIdStep(cv.MetaPartitionInodeIdStep)
c.updateMaxConcurrentLcNodes(cv.MaxConcurrentLcNodes)
log.LogInfof("action[loadClusterValue], metaNodeThreshold[%v]", cv.Threshold)
c.checkDataReplicasEnable = cv.CheckDataReplicasEnable
}
return
}
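// loadNodeSets restores the persisted node sets, re-creating their zones if necessary.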
func (c *Cluster) loadNodeSets() (err error) {
result, err := c.fsm.store.SeekForPrefix([]byte(nodeSetPrefix))
if err != nil {
err = fmt.Errorf("action[loadNodeSets],err:%v", err.Error())
return err
}
for _, value := range result {
nsv := &nodeSetValue{}
if err = json.Unmarshal(value, nsv); err != nil {
log.LogErrorf("action[loadNodeSets], unmarshal err:%v", err.Error())
return err
}
if nsv.ZoneName == "" {
nsv.ZoneName = DefaultZoneName
}
cap := nsv.Capacity
if cap < 3 {
cap = c.cfg.nodeSetCapacity
}
ns := newNodeSet(c, nsv.ID, cap, nsv.ZoneName)
ns.UpdateMaxParallel(int32(c.DecommissionLimit))
ns.UpdateDecommissionDiskFactor(c.DecommissionDiskFactor)
if nsv.DataNodeSelector != "" && ns.GetDataNodeSelector() != nsv.DataNodeSelector {
ns.SetDataNodeSelector(nsv.DataNodeSelector)
}
if nsv.MetaNodeSelector != "" && ns.GetMetaNodeSelector() != nsv.MetaNodeSelector {
ns.SetMetaNodeSelector(nsv.MetaNodeSelector)
}
zone, err := c.t.getZone(nsv.ZoneName)
if err != nil {
log.LogErrorf("action[loadNodeSets], getZone err:%v", err)
zone = newZone(nsv.ZoneName)
c.t.putZoneIfAbsent(zone)
}
zone.putNodeSet(ns)
log.LogInfof("action[addNodeSetGrp] nodeSet[%v]", ns.ID)
if err = c.addNodeSetGrp(ns, true); err != nil {
log.LogErrorf("action[createNodeSet] nodeSet[%v] err[%v]", ns.ID, err)
return err
}
log.LogInfof("action[loadNodeSets], nsId[%v],zone[%v]", ns.ID, zone.name)
}
return nil
}
// putZoneDomain persists the excluded zones; it is only used once, when the master is upgraded and restarted.
func (c *Cluster) putZoneDomain(init bool) (err error) {
log.LogInfof("action[putZoneDomain]")
metadata := new(RaftCmd)
metadata.Op = opSyncExclueDomain
metadata.K = DomainPrefix
c.domainManager.RLock()
defer c.domainManager.RUnlock()
if init {
for i := 0; i < len(c.t.zones); i++ {
c.domainManager.excludeZoneListDomain[c.t.zones[i].name] = 0
c.t.domainExcludeZones = append(c.t.domainExcludeZones, c.t.zones[i].name)
}
if len(c.t.zones) == 0 {
c.needFaultDomain = true
}
}
domainValue := newZoneDomainValue()
domainValue.ExcludeZoneMap = c.domainManager.excludeZoneListDomain
domainValue.NeedFaultDomain = c.needFaultDomain
domainValue.domainNodeSetGrpVec = c.domainManager.domainNodeSetGrpVec
domainValue.DomainZoneName2IdMap = c.domainManager.ZoneName2DomainIdMap
if c.domainManager.dataRatioLimit > 0 {
log.LogInfof("action[putZoneDomain] ratio %v", c.domainManager.dataRatioLimit)
domainValue.DataRatio = c.domainManager.dataRatioLimit
} else {
domainValue.DataRatio = defaultDomainUsageThreshold
}
if c.domainManager.excludeZoneUseRatio > 0 && c.domainManager.excludeZoneUseRatio <= 1 {
domainValue.ExcludeZoneUseRatio = c.domainManager.excludeZoneUseRatio
} else {
domainValue.ExcludeZoneUseRatio = defaultDomainUsageThreshold
}
metadata.V, err = json.Marshal(domainValue)
if err != nil {
return
}
return c.submit(metadata)
}
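// loadZoneDomain restores the fault-domain settings; it returns ok=false when no domain record has been persisted yet.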
func (c *Cluster) loadZoneDomain() (ok bool, err error) {
log.LogInfof("action[loadZoneDomain]")
result, err := c.fsm.store.SeekForPrefix([]byte(DomainPrefix))
if err != nil {
err = fmt.Errorf("action[loadZoneDomain],err:%v", err.Error())
log.LogInfof("action[loadZoneDomain] err[%v]", err)
return false, err
}
if len(result) == 0 {
err = fmt.Errorf("action[loadZoneDomain],err:not found")
log.LogInfof("action[loadZoneDomain] err[%v]", err)
return false, nil
}
for _, value := range result {
nsv := &zoneDomainValue{}
if err = json.Unmarshal(value, nsv); err != nil {
log.LogErrorf("action[loadNodeSets], unmarshal err:%v", err.Error())
return true, err
}
log.LogInfof("action[loadZoneDomain] get value!exclue map[%v],need domain[%v] ratio [%v]", nsv.ExcludeZoneMap, nsv.NeedFaultDomain, nsv.DataRatio)
c.domainManager.excludeZoneListDomain = nsv.ExcludeZoneMap
for zoneName := range nsv.ExcludeZoneMap {
c.t.domainExcludeZones = append(c.t.domainExcludeZones, zoneName)
}
c.needFaultDomain = nsv.NeedFaultDomain
c.domainManager.dataRatioLimit = nsv.DataRatio
c.domainManager.ZoneName2DomainIdMap = nsv.DomainZoneName2IdMap
c.domainManager.excludeZoneUseRatio = nsv.ExcludeZoneUseRatio
for zoneName, domainId := range c.domainManager.ZoneName2DomainIdMap {
log.LogInfof("action[loadZoneDomain] zoneName %v domainid %v", zoneName, domainId)
if domainIndex, ok := c.domainManager.domainId2IndexMap[domainId]; !ok {
log.LogInfof("action[loadZoneDomain] zoneName %v domainid %v build new domainnodesetgrp manager", zoneName, domainId)
domainGrp := newDomainNodeSetGrpManager()
domainGrp.domainId = domainId
c.domainManager.domainNodeSetGrpVec = append(c.domainManager.domainNodeSetGrpVec, domainGrp)
domainIndex = len(c.domainManager.domainNodeSetGrpVec) - 1
c.domainManager.domainId2IndexMap[domainId] = domainIndex
}
}
break
}
log.LogInfof("action[loadZoneDomain] success!")
return true, nil
}
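// loadNodeSetGrps restores the persisted domain node-set groups and rebuilds the domain manager's indexes.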
func (c *Cluster) loadNodeSetGrps() (err error) {
log.LogInfof("action[loadNodeSetGrps]")
result, err := c.fsm.store.SeekForPrefix([]byte(nodeSetGrpPrefix))
if err != nil {
err = fmt.Errorf("action[loadNodeSets],err:%v", err.Error())
log.LogInfof("action[loadNodeSetGrps] seek failed, nsgId[%v]", err)
return err
}
if len(result) > 0 {
log.LogInfof("action[loadNodeSetGrps] get result len[%v]", len(result))
c.domainManager.start()
}
log.LogInfof("action[loadNodeSetGrps] get result len[%v] before decode", len(result))
for _, value := range result {
domainInfoLoad := &domainNodeSetGrpValue{}
if err = json.Unmarshal(value, domainInfoLoad); err != nil {
log.LogFatalf("action[loadNodeSets], unmarshal err:%v", err.Error())
return err
}
log.LogInfof("action[loadNodeSetGrps] get result domainid [%v] domainInfoLoad id[%v],status[%v],ids[%v]",
domainInfoLoad.DomainId, domainInfoLoad.ID, domainInfoLoad.Status, domainInfoLoad.NodeSetsIds)
nsg := newNodeSetGrp(c)
nsg.nodeSetsIds = domainInfoLoad.NodeSetsIds
nsg.ID = domainInfoLoad.ID
nsg.status = domainInfoLoad.Status
nsg.domainId = domainInfoLoad.DomainId
domainId := domainInfoLoad.DomainId
var domainIndex int
var ok bool
var domainGrp *DomainNodeSetGrpManager
if domainIndex, ok = c.domainManager.domainId2IndexMap[domainId]; !ok {
domainGrp = newDomainNodeSetGrpManager()
domainGrp.domainId = domainId
c.domainManager.domainNodeSetGrpVec = append(c.domainManager.domainNodeSetGrpVec, domainGrp)
domainIndex = len(c.domainManager.domainNodeSetGrpVec) - 1
c.domainManager.domainId2IndexMap[domainId] = domainIndex
}
domainGrp = c.domainManager.domainNodeSetGrpVec[domainIndex]
domainGrp.nodeSetGrpMap = append(domainGrp.nodeSetGrpMap, nsg)
var j int
for j = 0; j < len(domainInfoLoad.NodeSetsIds); j++ {
domainGrp.nsId2NsGrpMap[domainInfoLoad.NodeSetsIds[j]] = len(domainGrp.nodeSetGrpMap) - 1
log.LogInfof("action[loadNodeSetGrps] get result index[%v] nodesetid[%v] nodesetgrp index [%v]",
domainInfoLoad.ID, domainInfoLoad.NodeSetsIds[j], domainInfoLoad.Status)
}
log.LogInfof("action[loadNodeSetGrps], nsgId[%v],status[%v]", nsg.ID, nsg.status)
}
return
}
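// loadDataNodes restores the persisted data nodes, keeping the entry with the smallest ID when the same address appears more than once.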
func (c *Cluster) loadDataNodes() (err error) {
result, err := c.fsm.store.SeekForPrefix([]byte(dataNodePrefix))
if err != nil {
err = fmt.Errorf("action[loadDataNodes],err:%v", err.Error())
return err
}
for _, value := range result {
dnv := &dataNodeValue{}
if err = json.Unmarshal(value, dnv); err != nil {
err = fmt.Errorf("action[loadDataNodes],value:%v,unmarshal err:%v", string(value), err)
return
}
if dnv.ZoneName == "" {
dnv.ZoneName = DefaultZoneName
}
dataNode := newDataNode(dnv.Addr, dnv.ZoneName, c.Name)
dataNode.DpCntLimit = newDpCountLimiter(&c.cfg.MaxDpCntLimit)
dataNode.ID = dnv.ID
dataNode.NodeSetID = dnv.NodeSetID
dataNode.RdOnly = dnv.RdOnly
for _, disk := range dnv.DecommissionedDisks {
dataNode.addDecommissionedDisk(disk)
}
dataNode.DecommissionStatus = dnv.DecommissionStatus
dataNode.DecommissionDstAddr = dnv.DecommissionDstAddr
dataNode.DecommissionRaftForce = dnv.DecommissionRaftForce
dataNode.DecommissionLimit = dnv.DecommissionLimit
dataNode.DecommissionRetry = dnv.DecommissionRetry
dataNode.DecommissionCompleteTime = dnv.DecommissionCompleteTime
dataNode.ToBeOffline = dnv.ToBeOffline
dataNode.DecommissionDiskList = dnv.DecommissionDiskList
dataNode.DecommissionDpTotal = dnv.DecommissionDpTotal
olddn, ok := c.dataNodes.Load(dataNode.Addr)
if ok {
if olddn.(*DataNode).ID <= dataNode.ID {
log.LogDebugf("action[loadDataNodes]: skip addr %v old %v current %v", dataNode.Addr, olddn.(*DataNode).ID, dataNode.ID)
continue
}
}
c.dataNodes.Store(dataNode.Addr, dataNode)
log.LogInfof("action[loadDataNodes],dataNode[%v],dataNodeID[%v],zone[%v],ns[%v]", dataNode.Addr, dataNode.ID, dnv.ZoneName, dnv.NodeSetID)
}
return
}
func (c *Cluster) loadMetaNodes() (err error) {
result, err := c.fsm.store.SeekForPrefix([]byte(metaNodePrefix))
if err != nil {
err = fmt.Errorf("action[loadMetaNodes],err:%v", err.Error())
return err
}
for _, value := range result {
mnv := &metaNodeValue{}
if err = json.Unmarshal(value, mnv); err != nil {
err = fmt.Errorf("action[loadMetaNodes],unmarshal err:%v", err.Error())
return err
}
if mnv.ZoneName == "" {
mnv.ZoneName = DefaultZoneName
}
metaNode := newMetaNode(mnv.Addr, mnv.ZoneName, c.Name)
metaNode.ID = mnv.ID
metaNode.NodeSetID = mnv.NodeSetID
metaNode.RdOnly = mnv.RdOnly
oldmn, ok := c.metaNodes.Load(metaNode.Addr)
if ok {
if oldmn.(*MetaNode).ID <= metaNode.ID {
continue
}
}
c.metaNodes.Store(metaNode.Addr, metaNode)
log.LogInfof("action[loadMetaNodes],metaNode[%v], metaNodeID[%v],zone[%v],ns[%v]", metaNode.Addr, metaNode.ID, mnv.ZoneName, mnv.NodeSetID)
}
return
}
func (c *Cluster) loadVolsViews() (err error, volViews []*volValue) {
result, err := c.fsm.store.SeekForPrefix([]byte(volPrefix))
if err != nil {
err = fmt.Errorf("action[loadVols],err:%v", err.Error())
return
}
for _, value := range result {
var vv *volValue
if vv, err = newVolValueFromBytes(value); err != nil {
err = fmt.Errorf("action[loadVols],value:%v,unmarshal err:%v", string(value), err)
return
}
volViews = append(volViews, vv)
log.LogInfof("action[loadVols],vol[%v]", vv.Name)
}
return
}
func (c *Cluster) loadVols() (err error) {
result, err := c.fsm.store.SeekForPrefix([]byte(volPrefix))
if err != nil {
err = fmt.Errorf("action[loadVols],err:%v", err.Error())
return err
}
for _, value := range result {
var vv *volValue
if vv, err = newVolValueFromBytes(value); err != nil {
err = fmt.Errorf("action[loadVols],value:%v,unmarshal err:%v", string(value), err)
return err
}
vol := newVolFromVolValue(vv)
vol.Status = vv.Status
if err = c.loadAclList(vol); err != nil {
log.LogInfof("action[loadVols],vol[%v] load acl manager error %v", vol.Name, err)
continue
}
if err = c.loadUidSpaceList(vol); err != nil {
log.LogInfof("action[loadVols],vol[%v] load uid manager error %v", vol.Name, err)
continue
}
if err = c.loadMultiVersion(vol); err != nil {
log.LogInfof("action[loadVols],vol[%v] load ver manager error %v c %v", vol.Name, err, c)
continue
}
c.putVol(vol)
log.LogInfof("action[loadVols],vol[%v]", vol.Name)
}
return
}
func (c *Cluster) loadMetaPartitions() (err error) {
result, err := c.fsm.store.SeekForPrefix([]byte(metaPartitionPrefix))
if err != nil {
err = fmt.Errorf("action[loadMetaPartitions],err:%v", err.Error())
return err
}
for _, value := range result {
mpv := &metaPartitionValue{}
if err = json.Unmarshal(value, mpv); err != nil {
err = fmt.Errorf("action[loadMetaPartitions],value:%v,unmarshal err:%v", string(value), err)
return err
}
vol, err1 := c.getVol(mpv.VolName)
if err1 != nil {
log.LogErrorf("action[loadMetaPartitions] err:%v", err1.Error())
continue
}
if vol.ID != mpv.VolID {
Warn(c.Name, fmt.Sprintf("action[loadMetaPartitions] has duplicate vol[%v],vol.gridId[%v],mpv.VolID[%v]", mpv.VolName, vol.ID, mpv.VolID))
continue
}
for i := 0; i < len(mpv.Peers); i++ {
mn, ok := c.metaNodes.Load(mpv.Peers[i].Addr)
if ok && mn.(*MetaNode).ID != mpv.Peers[i].ID {
mpv.Peers[i].ID = mn.(*MetaNode).ID
}
}
mp := newMetaPartition(mpv.PartitionID, mpv.Start, mpv.End, vol.mpReplicaNum, vol.Name, mpv.VolID, 0)
mp.setHosts(strings.Split(mpv.Hosts, underlineSeparator))
mp.setPeers(mpv.Peers)
mp.OfflinePeerID = mpv.OfflinePeerID
mp.IsRecover = mpv.IsRecover
vol.addMetaPartition(mp)
c.addBadMetaParitionIdMap(mp)
log.LogInfof("action[loadMetaPartitions],vol[%v],mp[%v]", vol.Name, mp.PartitionID)
}
return
}
func (c *Cluster) addBadMetaParitionIdMap(mp *MetaPartition) {
if !mp.IsRecover {
return
}
c.putBadMetaPartitions(mp.Hosts[0], mp.PartitionID)
}
func (c *Cluster) loadDataPartitions() (err error) {
result, err := c.fsm.store.SeekForPrefix([]byte(dataPartitionPrefix))
if err != nil {
err = fmt.Errorf("action[loadDataPartitions],err:%v", err.Error())
return err
}
for _, value := range result {
dpv := &dataPartitionValue{}
if err = json.Unmarshal(value, dpv); err != nil {
err = fmt.Errorf("action[loadDataPartitions],value:%v,unmarshal err:%v", string(value), err)
return err
}
vol, err1 := c.getVol(dpv.VolName)
if err1 != nil {
log.LogErrorf("action[loadDataPartitions] err:%v %v", dpv.VolName, err1.Error())
continue
}
if vol.ID != dpv.VolID {
Warn(c.Name, fmt.Sprintf("action[loadDataPartitions] has duplicate vol[%v],vol.gridId[%v],mpv.VolID[%v]", dpv.VolName, vol.ID, dpv.VolID))
continue
}
dp := dpv.Restore(c)
vol.dataPartitions.put(dp)
c.addBadDataPartitionIdMap(dp)
// add to nodeset decommission list
go dp.addToDecommissionList(c)
log.LogInfof("action[loadDataPartitions],vol[%v],dp[%v] ", vol.Name, dp.PartitionID)
}
return
}
func (c *Cluster) loadQuota() (err error) {
c.volMutex.RLock()
defer c.volMutex.RUnlock()
for name, vol := range c.vols {
if err = vol.loadQuotaManager(c); err != nil {
log.LogErrorf("loadQuota loadQuotaManager vol [%v] fail err [%v]", name, err.Error())
return err
}
}
return
}
// load s3api qos info to memory cache
func (c *Cluster) loadS3ApiQosInfo() (err error) {
keyPrefix := S3QoSPrefix
result, err := c.fsm.store.SeekForPrefix([]byte(keyPrefix))
if err != nil {
err = fmt.Errorf("loadS3ApiQosInfo get failed, err [%v]", err)
return err
}
for key, value := range result {
s3qosQuota, err := strconv.ParseUint(string(value), 10, 64)
if err != nil {
return err
}
log.LogDebugf("loadS3ApiQosInfo key[%v] value[%v]", key, s3qosQuota)
c.S3ApiQosQuota.Store(key, s3qosQuota)
}
return
}
func (c *Cluster) addBadDataPartitionIdMap(dp *DataPartition) {
if !dp.IsDecommissionRunning() {
return
}
c.putBadDataPartitionIDsByDiskPath(dp.DecommissionSrcDiskPath, dp.DecommissionSrcAddr, dp.PartitionID)
}
func (c *Cluster) syncAddDecommissionDisk(disk *DecommissionDisk) (err error) {
return c.syncPutDecommissionDiskInfo(opSyncAddDecommissionDisk, disk)
}
func (c *Cluster) syncDeleteDecommissionDisk(disk *DecommissionDisk) (err error) {
return c.syncPutDecommissionDiskInfo(opSyncDeleteDecommissionDisk, disk)
}
func (c *Cluster) syncUpdateDecommissionDisk(disk *DecommissionDisk) (err error) {
return c.syncPutDecommissionDiskInfo(opSyncUpdateDecommissionDisk, disk)
}
func (c *Cluster) syncPutDecommissionDiskInfo(opType uint32, disk *DecommissionDisk) (err error) {
metadata := new(RaftCmd)
metadata.Op = opType
metadata.K = DecommissionDiskPrefix + disk.SrcAddr + keySeparator + disk.DiskPath
ddv := newDecommissionDiskValue(disk)
metadata.V, err = json.Marshal(ddv)
if err != nil {
return errors.New(err.Error())
}
return c.submit(metadata)
}
type decommissionDiskValue struct {
SrcAddr string
DstAddr string
DiskPath string
DecommissionStatus uint32
DecommissionRaftForce bool
DecommissionRetry uint8
DecommissionDpTotal int
DecommissionTerm uint64
Type uint32
DecommissionCompleteTime int64
DecommissionLimit int
}
func newDecommissionDiskValue(disk *DecommissionDisk) *decommissionDiskValue {
return &decommissionDiskValue{
SrcAddr: disk.SrcAddr,
DstAddr: disk.DstAddr,
DiskPath: disk.DiskPath,
DecommissionRetry: disk.DecommissionRetry,
DecommissionStatus: atomic.LoadUint32(&disk.DecommissionStatus),
DecommissionRaftForce: disk.DecommissionRaftForce,
DecommissionDpTotal: disk.DecommissionDpTotal,
DecommissionTerm: disk.DecommissionTerm,
Type: disk.Type,
DecommissionCompleteTime: disk.DecommissionCompleteTime,
DecommissionLimit: disk.DecommissionDpCount,
}
}
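// Restore rebuilds a DecommissionDisk from its persisted value.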
func (ddv *decommissionDiskValue) Restore() *DecommissionDisk {
return &DecommissionDisk{
SrcAddr: ddv.SrcAddr,
DstAddr: ddv.DstAddr,
DiskPath: ddv.DiskPath,
DecommissionRetry: ddv.DecommissionRetry,
DecommissionStatus: ddv.DecommissionStatus,
DecommissionRaftForce: ddv.DecommissionRaftForce,
DecommissionDpTotal: ddv.DecommissionDpTotal,
DecommissionTerm: ddv.DecommissionTerm,
Type: ddv.Type,
DecommissionCompleteTime: ddv.DecommissionCompleteTime,
DecommissionDpCount: ddv.DecommissionLimit,
}
}
func (c *Cluster) loadDecommissionDiskList() (err error) {
result, err := c.fsm.store.SeekForPrefix([]byte(DecommissionDiskPrefix))
if err != nil {
err = fmt.Errorf("action[loadDataPartitions],err:%v", err.Error())
return err
}
for _, value := range result {
ddv := &decommissionDiskValue{}
if err = json.Unmarshal(value, ddv); err != nil {
err = fmt.Errorf("action[loadDecommissionDiskList],value:%v,unmarshal err:%v", string(value), err)
return err
}
dd := ddv.Restore()
c.DecommissionDisks.Store(dd.GenerateKey(), dd)
log.LogInfof("action[loadDecommissionDiskList],decommissionDisk[%v] type %v dst[%v] status[%v] raftForce[%v]"+
"dpTotal[%v] term[%v]",
dd.GenerateKey(), dd.Type, dd.DstAddr, dd.GetDecommissionStatus(), dd.DecommissionRaftForce,
dd.DecommissionDpTotal, dd.DecommissionTerm)
c.addDecommissionDiskToNodeset(dd)
}
return
}
func (c *Cluster) startDecommissionListTraverse() (err error) {
zones := c.t.getAllZones()
log.LogDebugf("startDecommissionListTraverse zones len %v", len(zones))
for _, zone := range zones {
log.LogDebugf("startDecommissionListTraverse zone %v ", zone.name)
err = zone.startDecommissionListTraverse(c)
if err != nil {
return
}
}
return
}
func (c *Cluster) syncAddLcNode(ln *LcNode) (err error) {
return c.syncPutLcNodeInfo(opSyncAddLcNode, ln)
}
func (c *Cluster) syncDeleteLcNode(ln *LcNode) (err error) {
return c.syncPutLcNodeInfo(opSyncDeleteLcNode, ln)
}
func (c *Cluster) syncUpdateLcNode(ln *LcNode) (err error) {
return c.syncPutLcNodeInfo(opSyncUpdateLcNode, ln)
}
func (c *Cluster) syncPutLcNodeInfo(opType uint32, ln *LcNode) (err error) {
metadata := new(RaftCmd)
metadata.Op = opType
metadata.K = lcNodePrefix + ln.Addr
lnv := newLcNodeValue(ln)
metadata.V, err = json.Marshal(lnv)
if err != nil {
return errors.New(err.Error())
}
return c.submit(metadata)
}
type lcNodeValue struct {
ID uint64
Addr string
}
func newLcNodeValue(lcNode *LcNode) *lcNodeValue {
return &lcNodeValue{
ID: lcNode.ID,
Addr: lcNode.Addr,
}
}
func (c *Cluster) loadLcNodes() (err error) {
result, err := c.fsm.store.SeekForPrefix([]byte(lcNodePrefix))
if err != nil {
err = fmt.Errorf("action[loadLcNodes],err:%v", err.Error())
return err
}
log.LogInfof("action[loadLcNodes], result count %v", len(result))
for _, value := range result {
lnv := &lcNodeValue{}
if err = json.Unmarshal(value, lnv); err != nil {
err = fmt.Errorf("action[loadLcNodes],value:%v,unmarshal err:%v", string(value), err)
return
}
log.LogInfof("action[loadLcNodes], load lcNode[%v], lcNodeID[%v]", lnv.Addr, lnv.ID)
lcNode := newLcNode(lnv.Addr, c.Name)
lcNode.ID = lnv.ID
c.lcNodes.Store(lcNode.Addr, lcNode)
log.LogInfof("action[loadLcNodes], store lcNode[%v], lcNodeID[%v]", lcNode.Addr, lcNode.ID)
}
return
}
func (c *Cluster) syncAddLcConf(lcConf *bsProto.LcConfiguration) (err error) {
return c.syncPutLcConfInfo(opSyncAddLcConf, lcConf)
}
func (c *Cluster) syncDeleteLcConf(lcConf *bsProto.LcConfiguration) (err error) {
return c.syncPutLcConfInfo(opSyncDeleteLcConf, lcConf)
}
func (c *Cluster) syncUpdateLcConf(lcConf *bsProto.LcConfiguration) (err error) {
return c.syncPutLcConfInfo(opSyncUpdateLcConf, lcConf)
}
func (c *Cluster) syncPutLcConfInfo(opType uint32, lcConf *bsProto.LcConfiguration) (err error) {
metadata := new(RaftCmd)
metadata.Op = opType
metadata.K = lcConfPrefix + lcConf.VolName
metadata.V, err = json.Marshal(lcConf)
if err != nil {
return errors.New(err.Error())
}
return c.submit(metadata)
}
func (c *Cluster) loadLcConfs() (err error) {
result, err := c.fsm.store.SeekForPrefix([]byte(lcConfPrefix))
if err != nil {
err = fmt.Errorf("action[loadLcConfs],err:%v", err.Error())
return err
}
for _, value := range result {
lcConf := &bsProto.LcConfiguration{}
if err = json.Unmarshal(value, lcConf); err != nil {
err = fmt.Errorf("action[loadLcConfs],value:%v,unmarshal err:%v", string(value), err)
return
}
_ = c.lcMgr.SetS3BucketLifecycle(lcConf)
log.LogInfof("action[loadLcConfs],vol[%v]", lcConf.VolName)
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"io"
"github.com/tecbot/gorocksdb"
)
// MetadataSnapshot represents the snapshot of a meta partition
type MetadataSnapshot struct {
fsm *MetadataFsm
applied uint64
snapshot *gorocksdb.Snapshot
iterator *gorocksdb.Iterator
}
// ApplyIndex implements the Snapshot interface
func (ms *MetadataSnapshot) ApplyIndex() uint64 {
return ms.applied
}
// Close implements the Snapshot interface
func (ms *MetadataSnapshot) Close() {
ms.fsm.store.ReleaseSnapshot(ms.snapshot)
}
// Next implements the Snapshot interface
func (ms *MetadataSnapshot) Next() (data []byte, err error) {
md := new(RaftCmd)
if ms.iterator.Valid() {
key := ms.iterator.Key()
md.K = string(key.Data())
md.setOpType()
value := ms.iterator.Value()
if value != nil {
md.V = value.Data()
}
if data, err = md.Marshal(); err != nil {
err = fmt.Errorf("action[Next],marshal kv:%v,err:%v", md, err.Error())
return nil, err
}
ms.iterator.Next()
return data, nil
}
return nil, io.EOF
}
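// drainMetadataSnapshot is an illustrative helper only (a minimal sketch, not
// part of the master's own replication flow): it shows how a consumer of this
// Snapshot implementation is expected to pull marshaled entries with Next()
// until io.EOF and then release the underlying rocksdb snapshot with Close().
// Decoding and applying each entry is left to the caller-supplied apply func.
func drainMetadataSnapshot(ms *MetadataSnapshot, apply func(data []byte) error) error {
defer ms.Close()
for {
data, err := ms.Next()
if err == io.EOF {
// iterator exhausted: the snapshot has been fully consumed
return nil
}
if err != nil {
return err
}
if err = apply(data); err != nil {
return err
}
}
}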
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package mocktest
import (
"bytes"
"encoding/json"
"fmt"
"net"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/master"
"github.com/cubefs/cubefs/util"
)
const (
defaultUsedSize = 20 * util.GB
)
type MockDataServer struct {
nodeID uint64
TcpAddr string
Zone string
ClusterID string
Total uint64
Used uint64
Available uint64
CreatedPartitionWeights uint64 // dataPartitionCnt*dataPartitionSize
RemainWeightsForCreatePartition uint64 // all - usedDataPartitionsWeights
CreatedPartitionCnt uint64
MaxWeightsForCreatePartition uint64
partitions []*MockDataPartition
zoneName string
mc *master.MasterClient
sync.RWMutex
}
func NewMockDataServer(addr string, zoneName string) *MockDataServer {
mds := &MockDataServer{
TcpAddr: addr,
zoneName: zoneName,
partitions: make([]*MockDataPartition, 0),
mc: master.NewMasterClient([]string{hostAddr}, false),
}
return mds
}
func (mds *MockDataServer) Start() {
mds.register()
go mds.start()
}
func (mds *MockDataServer) register() {
var err error
var nodeID uint64
var retry int
for retry < 3 {
nodeID, err = mds.mc.NodeAPI().AddDataNode(mds.TcpAddr, mds.zoneName)
if err == nil {
break
}
time.Sleep(500 * time.Millisecond)
retry++
}
if err != nil {
panic(err)
}
mds.nodeID = nodeID
}
func (mds *MockDataServer) start() {
listener, err := net.Listen("tcp", mds.TcpAddr)
if err != nil {
panic(err)
}
for {
conn, err := listener.Accept()
if err != nil {
panic(err)
}
go mds.serveConn(conn)
}
}
func (mds *MockDataServer) serveConn(rc net.Conn) {
conn, ok := rc.(*net.TCPConn)
if !ok {
rc.Close()
return
}
conn.SetKeepAlive(true)
conn.SetNoDelay(true)
proto.InitBufferPool(int64(32768))
req := proto.NewPacket()
err := req.ReadFromConnWithVer(conn, proto.NoReadDeadlineTime)
if err != nil {
return
}
adminTask := &proto.AdminTask{}
decode := json.NewDecoder(bytes.NewBuffer(req.Data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
responseAckErrToMaster(conn, req, err)
return
}
switch req.Opcode {
case proto.OpCreateDataPartition:
err = mds.handleCreateDataPartition(conn, req, adminTask)
Printf("data node [%v] create data partition,id[%v],err:%v\n", mds.TcpAddr, adminTask.ID, err)
case proto.OpDeleteDataPartition:
err = mds.handleDeleteDataPartition(conn, req)
Printf("data node [%v] delete data partition,id[%v],err:%v\n", mds.TcpAddr, adminTask.ID, err)
case proto.OpDataNodeHeartbeat:
err = mds.handleHeartbeats(conn, req, adminTask)
Printf("data node [%v] report heartbeat to master,err:%v\n", mds.TcpAddr, err)
case proto.OpLoadDataPartition:
err = mds.handleLoadDataPartition(conn, req, adminTask)
Printf("data node [%v] load data partition,id[%v],err:%v\n", mds.TcpAddr, adminTask.ID, err)
case proto.OpDecommissionDataPartition:
err = mds.handleDecommissionDataPartition(conn, req, adminTask)
Printf("data node [%v] decommission data partition,id[%v],err:%v\n", mds.TcpAddr, adminTask.ID, err)
case proto.OpAddDataPartitionRaftMember:
err = mds.handleAddDataPartitionRaftMember(conn, req, adminTask)
Printf("data node [%v] add data partition raft member,id[%v],err:%v\n", mds.TcpAddr, adminTask.ID, err)
case proto.OpRemoveDataPartitionRaftMember:
err = mds.handleRemoveDataPartitionRaftMember(conn, req, adminTask)
Printf("data node [%v] remove data partition raft member,id[%v],err:%v\n", mds.TcpAddr, adminTask.ID, err)
case proto.OpDataPartitionTryToLeader:
err = mds.handleTryToLeader(conn, req, adminTask)
Printf("data node [%v] try to leader,id[%v],err:%v\n", mds.TcpAddr, adminTask.ID, err)
default:
fmt.Printf("unknown code [%v]\n", req.Opcode)
}
}
func (mds *MockDataServer) handleAddDataPartitionRaftMember(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
responseAckOKToMaster(conn, p, nil)
return
}
func (mds *MockDataServer) handleRemoveDataPartitionRaftMember(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
responseAckOKToMaster(conn, p, nil)
return
}
func (mds *MockDataServer) handleTryToLeader(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
responseAckOKToMaster(conn, p, nil)
return
}
func (mds *MockDataServer) CheckVolPartition(name string, cond func(*MockDataPartition) bool) bool {
mds.RLock()
defer mds.RUnlock()
for _, dp := range mds.partitions {
if dp.VolName == name && !cond(dp) {
return false
}
}
return true
}
func (mds *MockDataServer) handleDecommissionDataPartition(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
defer func() {
if err != nil {
responseAckErrToMaster(conn, p, err)
} else {
p.PacketOkWithBody([]byte("/cfs"))
p.WriteToConn(conn)
}
}()
// Marshal request body.
requestJson, err := json.Marshal(adminTask.Request)
if err != nil {
return
}
// Unmarshal request to entity
req := &proto.DataPartitionDecommissionRequest{}
if err = json.Unmarshal(requestJson, req); err != nil {
return
}
partitions := make([]*MockDataPartition, 0)
mds.RLock()
defer mds.RUnlock()
for index, dp := range mds.partitions {
if dp.PartitionID == req.PartitionId {
partitions = append(partitions, mds.partitions[:index]...)
partitions = append(partitions, mds.partitions[index+1:]...)
}
}
if len(partitions) != 0 {
mds.partitions = partitions
}
return
}
func (mds *MockDataServer) handleCreateDataPartition(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
defer func() {
if err != nil {
responseAckErrToMaster(conn, p, err)
} else {
responseAckOKToMaster(conn, p, nil)
}
}()
// Marshal request body.
requestJson, err := json.Marshal(adminTask.Request)
if err != nil {
return
}
// Unmarshal request to entity
req := &proto.CreateDataPartitionRequest{}
if err = json.Unmarshal(requestJson, req); err != nil {
return
}
// Create new partition.
partition := &MockDataPartition{
PartitionID: req.PartitionId,
VolName: req.VolumeId,
total: req.PartitionSize,
used: defaultUsedSize,
}
partition.SetForbidden(false)
mds.Lock()
defer mds.Unlock()
mds.partitions = append(mds.partitions, partition)
return
}
func (mds *MockDataServer) checkVolumeForbidden(volNames []string, dp *MockDataPartition) {
for _, volName := range volNames {
if volName == dp.VolName {
dp.SetForbidden(true)
return
}
}
dp.SetForbidden(false)
}
// Handle OpHeartbeat packet.
func (mds *MockDataServer) handleHeartbeats(conn net.Conn, pkg *proto.Packet, task *proto.AdminTask) (err error) {
responseAckOKToMaster(conn, pkg, nil)
response := &proto.DataNodeHeartbeatResponse{}
req := &proto.HeartBeatRequest{}
reqData, err := json.Marshal(task.Request)
if err != nil {
response.Status = proto.TaskFailed
response.Result = err.Error()
goto end
}
if err = json.Unmarshal(reqData, req); err != nil {
response.Status = proto.TaskFailed
response.Result = err.Error()
goto end
}
response.Status = proto.TaskSucceeds
response.Used = 5 * util.GB
response.Total = 1024 * util.GB
response.Available = 1024 * util.GB
response.CreatedPartitionCnt = 3
response.TotalPartitionSize = 120 * util.GB
response.MaxCapacity = 800 * util.GB
response.RemainingCapacity = 800 * util.GB
response.ZoneName = mds.zoneName
response.PartitionReports = make([]*proto.DataPartitionReport, 0)
mds.RLock()
for _, partition := range mds.partitions {
mds.checkVolumeForbidden(req.ForbiddenVols, partition)
vr := &proto.DataPartitionReport{
PartitionID: partition.PartitionID,
PartitionStatus: proto.ReadWrite,
Total: 120 * util.GB,
Used: defaultUsedSize,
DiskPath: "/cfs",
ExtentCount: 10,
NeedCompare: true,
IsLeader: true, // todo
VolName: partition.VolName,
}
response.PartitionReports = append(response.PartitionReports, vr)
}
mds.RUnlock()
task.Response = response
end:
if err = mds.mc.NodeAPI().ResponseDataNodeTask(task); err != nil {
return
}
return
}
func (mds *MockDataServer) handleDeleteDataPartition(conn net.Conn, pkg *proto.Packet) (err error) {
err = responseAckOKToMaster(conn, pkg, nil)
return
}
func (mds *MockDataServer) handleLoadDataPartition(conn net.Conn, pkg *proto.Packet, task *proto.AdminTask) (err error) {
if err = responseAckOKToMaster(conn, pkg, nil); err != nil {
return
}
// Marshal request body.
requestJson, err := json.Marshal(task.Request)
if err != nil {
return
}
// Unmarshal request to entity
req := &proto.LoadDataPartitionRequest{}
if err = json.Unmarshal(requestJson, req); err != nil {
return
}
partitionID := uint64(req.PartitionId)
response := &proto.LoadDataPartitionResponse{}
response.PartitionId = partitionID
response.Used = defaultUsedSize
response.PartitionSnapshot = buildSnapshot()
response.Status = proto.TaskSucceeds
var partition *MockDataPartition
mds.RLock()
for _, dp := range mds.partitions {
if dp.PartitionID == partitionID {
partition = dp
break
}
}
mds.RUnlock()
if partition == nil {
return
}
// response.VolName = partition.VolName
task.Response = response
if err = mds.mc.NodeAPI().ResponseDataNodeTask(task); err != nil {
return
}
return
}
func buildSnapshot() (files []*proto.File) {
files = make([]*proto.File, 0)
f1 := &proto.File{
Name: "1",
Crc: 4045512210,
Size: 2 * util.MB,
Modified: 1562507765,
}
files = append(files, f1)
f2 := &proto.File{
Name: "2",
Crc: 4045512210,
Size: 2 * util.MB,
Modified: 1562507765,
}
files = append(files, f2)
f3 := &proto.File{
Name: "50000010",
Crc: 4045512210,
Size: 2 * util.MB,
Modified: 1562507765,
}
files = append(files, f3)
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package mocktest
import (
"bytes"
"encoding/json"
"fmt"
"net"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/master"
"github.com/cubefs/cubefs/util"
)
type MockMetaServer struct {
NodeID uint64
TcpAddr string
ZoneName string
mc *master.MasterClient
partitions map[uint64]*MockMetaPartition // Key: metaRangeId, Val: metaPartition
sync.RWMutex
}
func NewMockMetaServer(addr string, zoneName string) *MockMetaServer {
mms := &MockMetaServer{
TcpAddr: addr, partitions: make(map[uint64]*MockMetaPartition, 0),
ZoneName: zoneName,
mc: master.NewMasterClient([]string{hostAddr}, false),
}
return mms
}
func (mms *MockMetaServer) Start() {
mms.register()
go mms.start()
}
func (mms *MockMetaServer) register() {
var err error
var nodeID uint64
var retry int
for retry < 3 {
nodeID, err = mms.mc.NodeAPI().AddMetaNode(mms.TcpAddr, mms.ZoneName)
if err == nil {
break
}
time.Sleep(500 * time.Millisecond)
retry++
}
if err != nil {
panic(err)
}
mms.NodeID = nodeID
}
func (mms *MockMetaServer) start() {
s := strings.Split(mms.TcpAddr, ColonSeparator)
listener, err := net.Listen("tcp", ":"+s[1])
if err != nil {
panic(err)
}
for {
conn, err := listener.Accept()
if err != nil {
fmt.Printf("accept conn error: [%v]\n", err)
continue
}
go mms.serveConn(conn)
}
}
func (mms *MockMetaServer) serveConn(rc net.Conn) {
Printf("remote[%v],local[%v]\n", rc.RemoteAddr(), rc.LocalAddr())
conn, ok := rc.(*net.TCPConn)
if !ok {
rc.Close()
return
}
conn.SetKeepAlive(true)
conn.SetNoDelay(true)
req := proto.NewPacket()
err := req.ReadFromConnWithVer(conn, proto.NoReadDeadlineTime)
if err != nil {
fmt.Printf("remote [%v] err is [%v]\n", conn.RemoteAddr(), err)
return
}
Printf("remote [%v] req [%v]\n", conn.RemoteAddr(), req.GetOpMsg())
adminTask := &proto.AdminTask{}
decode := json.NewDecoder(bytes.NewBuffer(req.Data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
responseAckErrToMaster(conn, req, err)
return
}
switch req.Opcode {
case proto.OpCreateMetaPartition:
err = mms.handleCreateMetaPartition(conn, req, adminTask)
Printf("meta node [%v] create meta partition,err:%v\n", mms.TcpAddr, err)
case proto.OpMetaNodeHeartbeat:
err = mms.handleHeartbeats(conn, req, adminTask)
Printf("meta node [%v] heartbeat,err:%v\n", mms.TcpAddr, err)
case proto.OpDeleteMetaPartition:
err = mms.handleDeleteMetaPartition(conn, req, adminTask)
Printf("meta node [%v] delete meta partition,err:%v\n", mms.TcpAddr, err)
case proto.OpUpdateMetaPartition:
err = mms.handleUpdateMetaPartition(conn, req, adminTask)
Printf("meta node [%v] update meta partition,err:%v\n", mms.TcpAddr, err)
case proto.OpLoadMetaPartition:
err = mms.handleLoadMetaPartition(conn, req, adminTask)
Printf("meta node [%v] load meta partition,err:%v\n", mms.TcpAddr, err)
case proto.OpDecommissionMetaPartition:
err = mms.handleDecommissionMetaPartition(conn, req, adminTask)
Printf("meta node [%v] offline meta partition,err:%v\n", mms.TcpAddr, err)
case proto.OpAddMetaPartitionRaftMember:
err = mms.handleAddMetaPartitionRaftMember(conn, req, adminTask)
Printf("meta node [%v] add data partition raft member,id[%v],err:%v\n", mms.TcpAddr, adminTask.ID, err)
case proto.OpRemoveMetaPartitionRaftMember:
err = mms.handleRemoveMetaPartitionRaftMember(conn, req, adminTask)
Printf("meta node [%v] remove data partition raft member,id[%v],err:%v\n", mms.TcpAddr, adminTask.ID, err)
case proto.OpMetaPartitionTryToLeader:
err = mms.handleTryToLeader(conn, req, adminTask)
Printf("meta node [%v] try to leader,id[%v],err:%v\n", mms.TcpAddr, adminTask.ID, err)
default:
fmt.Printf("unknown code [%v]\n", req.Opcode)
}
}
func (mms *MockMetaServer) handleAddMetaPartitionRaftMember(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
responseAckOKToMaster(conn, p, nil)
return
}
func (mms *MockMetaServer) handleRemoveMetaPartitionRaftMember(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
responseAckOKToMaster(conn, p, nil)
return
}
func (mms *MockMetaServer) handleTryToLeader(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
responseAckOKToMaster(conn, p, nil)
mms.Lock()
mp := mms.partitions[adminTask.PartitionID]
for i := range mp.Replicas {
if mp.Replicas[i].IsLeader {
mp.Replicas[i].IsLeader = false
}
if mp.Replicas[i].Addr == adminTask.OperatorAddr {
mp.Replicas[i].IsLeader = true
}
}
mms.Unlock()
return
}
func (mms *MockMetaServer) CheckVolPartition(name string, cond func(*MockMetaPartition) bool) bool {
mms.RLock()
defer mms.RUnlock()
for _, mp := range mms.partitions {
if mp.VolName == name && !cond(mp) {
return false
}
}
return true
}
func (mms *MockMetaServer) handleCreateMetaPartition(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
defer func() {
if err != nil {
responseAckErrToMaster(conn, p, err)
} else {
responseAckOKToMaster(conn, p, nil)
}
}()
// Marshal request body.
requestJson, err := json.Marshal(adminTask.Request)
if err != nil {
return
}
// Unmarshal request to entity
req := &proto.CreateMetaPartitionRequest{}
if err = json.Unmarshal(requestJson, req); err != nil {
return
}
// Create new metaPartition.
replicas := make([]*MockMetaReplica, 0)
for i, member := range req.Members {
re := &MockMetaReplica{Addr: member.Addr, IsLeader: false}
// only one replica is marked as leader: choose the first member as the leader of the mp
if i == 0 {
re.IsLeader = true
}
replicas = append(replicas, re)
}
partition := &MockMetaPartition{
PartitionID: req.PartitionID,
VolName: req.VolName,
Start: req.Start,
End: req.End,
Cursor: req.Start,
Members: req.Members,
Replicas: replicas,
}
partition.SetEnableAuditLog(true)
partition.SetForbidden(false)
mms.Lock()
mms.partitions[req.PartitionID] = partition
mms.Unlock()
return
}
func (mms *MockMetaServer) checkForbiddenVolume(volNames []string, mp *MockMetaPartition) {
for _, volName := range volNames {
if mp.VolName == volName {
mp.SetForbidden(true)
return
}
}
mp.SetForbidden(false)
}
func (mms *MockMetaServer) checkAuditLogVolume(volNames []string, mp *MockMetaPartition) {
for _, volName := range volNames {
if mp.VolName == volName {
mp.SetEnableAuditLog(false)
return
}
}
mp.SetEnableAuditLog(true)
}
// Handle OpHeartbeat packet.
func (mms *MockMetaServer) handleHeartbeats(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
// For ack to master
responseAckOKToMaster(conn, p, nil)
var (
req = &proto.HeartBeatRequest{}
resp = &proto.MetaNodeHeartbeatResponse{}
reqData []byte
)
reqData, err = json.Marshal(adminTask.Request)
if err != nil {
resp.Status = proto.TaskFailed
resp.Result = err.Error()
goto end
}
if err = json.Unmarshal(reqData, req); err != nil {
resp.Status = proto.TaskFailed
resp.Result = err.Error()
goto end
}
resp.Total = 10 * util.GB
resp.MemUsed = 1 * util.GB
// report usage for every partition
mms.RLock()
for id, partition := range mms.partitions {
mms.checkForbiddenVolume(req.ForbiddenVols, partition)
mms.checkAuditLogVolume(req.DisableAuditVols, partition)
mpr := &proto.MetaPartitionReport{
PartitionID: id,
Start: partition.Start,
End: partition.End,
Status: proto.ReadWrite,
MaxInodeID: partition.Start,
VolName: partition.VolName,
IsLeader: partition.isLeaderMetaNode(mms.TcpAddr),
}
mpr.Status = proto.ReadWrite
resp.MetaPartitionReports = append(resp.MetaPartitionReports, mpr)
}
mms.RUnlock()
resp.ZoneName = mms.ZoneName
resp.Status = proto.TaskSucceeds
end:
return mms.postResponseToMaster(adminTask, resp)
}
func (mms *MockMetaServer) postResponseToMaster(adminTask *proto.AdminTask, resp interface{}) (err error) {
adminTask.Request = nil
adminTask.Response = resp
if err = mms.mc.NodeAPI().ResponseMetaNodeTask(adminTask); err != nil {
return
}
return
}
func (mms *MockMetaServer) handleDeleteMetaPartition(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
responseAckOKToMaster(conn, p, nil)
req := &proto.DeleteMetaPartitionRequest{}
reqData, err := json.Marshal(adminTask.Request)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, nil)
responseAckErrToMaster(conn, p, err)
return
}
if err = json.Unmarshal(reqData, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, nil)
responseAckErrToMaster(conn, p, err)
return
}
resp := &proto.DeleteMetaPartitionResponse{
PartitionID: req.PartitionID,
Status: proto.TaskSucceeds,
}
return mms.postResponseToMaster(adminTask, resp)
}
func (mms *MockMetaServer) handleUpdateMetaPartition(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
responseAckOKToMaster(conn, p, nil)
req := &proto.UpdateMetaPartitionRequest{}
reqData, err := json.Marshal(adminTask.Request)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, nil)
responseAckErrToMaster(conn, p, err)
return
}
if err = json.Unmarshal(reqData, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, nil)
responseAckErrToMaster(conn, p, err)
return
}
resp := &proto.UpdateMetaPartitionResponse{
VolName: req.VolName,
PartitionID: req.PartitionID,
End: req.End,
}
mms.Lock()
partition := mms.partitions[req.PartitionID]
partition.End = req.End
mms.Unlock()
return mms.postResponseToMaster(adminTask, resp)
}
func (mms *MockMetaServer) handleLoadMetaPartition(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
var data []byte
defer func() {
if err != nil {
responseAckErrToMaster(conn, p, err)
} else {
responseAckOKToMaster(conn, p, data)
}
}()
req := &proto.MetaPartitionLoadRequest{}
reqData, err := json.Marshal(adminTask.Request)
if err != nil {
return
}
if err = json.Unmarshal(reqData, req); err != nil {
return
}
resp := &proto.MetaPartitionLoadResponse{
PartitionID: req.PartitionID,
DoCompare: true,
ApplyID: 100,
MaxInode: 123456,
DentryCount: 123456,
}
data, err = json.Marshal(resp)
if err != nil {
return
}
return
}
func (mms *MockMetaServer) handleDecommissionMetaPartition(conn net.Conn, p *proto.Packet, adminTask *proto.AdminTask) (err error) {
responseAckOKToMaster(conn, p, nil)
req := &proto.MetaPartitionDecommissionRequest{}
reqData, err := json.Marshal(adminTask.Request)
if err != nil {
return
}
if err = json.Unmarshal(reqData, req); err != nil {
return
}
resp := &proto.MetaPartitionDecommissionResponse{
PartitionID: req.PartitionID,
VolName: req.VolName,
Status: proto.TaskSucceeds,
}
return mms.postResponseToMaster(adminTask, resp)
}
package mocktest
import (
"sync/atomic"
"github.com/cubefs/cubefs/proto"
)
type MockDataPartition struct {
PartitionID uint64
PersistenceHosts []string
total int
used uint64
VolName string
Forbidden int32
}
func (md *MockDataPartition) IsForbidden() bool {
return atomic.LoadInt32(&md.Forbidden) != 0
}
func (md *MockDataPartition) SetForbidden(status bool) {
val := 0
if status {
val = 1
}
atomic.StoreInt32(&md.Forbidden, int32(val))
}
type MockMetaPartition struct {
PartitionID uint64
Start uint64
End uint64
Status int8
Cursor uint64
VolName string
Members []proto.Peer
Replicas []*MockMetaReplica
Forbidden int32
EnableAuditLog int32
}
// MockMetaReplica defines the replica of a meta partition
type MockMetaReplica struct {
Addr string
start uint64 // lower bound of the inode id
end uint64 // upper bound of the inode id
dataSize uint64
nodeID uint64
MaxInodeID uint64
InodeCount uint64
DentryCount uint64
ReportTime int64
Status int8 // unavailable, readOnly, readWrite
IsLeader bool
}
func (mm *MockMetaPartition) isLeaderMetaNode(addr string) bool {
for _, mr := range mm.Replicas {
if mr.Addr == addr {
return mr.IsLeader
}
}
return false
}
func (mm *MockMetaPartition) IsEnableAuditLog() bool {
return atomic.LoadInt32(&mm.EnableAuditLog) != 0
}
func (mm *MockMetaPartition) SetEnableAuditLog(status bool) {
val := 0
if status {
val = 1
}
atomic.StoreInt32(&mm.EnableAuditLog, int32(val))
}
func (mm *MockMetaPartition) IsForbidden() bool {
return atomic.LoadInt32(&mm.Forbidden) != 0
}
func (mm *MockMetaPartition) SetForbidden(status bool) {
val := 0
if status {
val = 1
}
atomic.StoreInt32(&mm.Forbidden, int32(val))
}
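// Usage sketch (illustrative only): the int32-backed flags above give the mock
// partitions lock-free boolean state that heartbeat handling can flip
// concurrently, e.g.
//
//	mp.SetForbidden(true)
//	if mp.IsForbidden() {
//		// the next heartbeat would treat this partition as forbidden
//	}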
package mocktest
import (
"bytes"
"fmt"
"io"
"net"
"net/http"
"os"
"testing"
"time"
"github.com/cubefs/cubefs/proto"
)
const (
ColonSeparator = ":"
hostAddr = "127.0.0.1:8080"
)
var (
LogOn = os.Getenv("DOCKER_TESTING_LOG_OFF") == ""
Print = fmt.Print
Printf = fmt.Printf
Println = fmt.Println
)
func init() {
if !LogOn {
SetOutput(io.Discard)
}
}
// SetOutput resets the fmt output writers.
func SetOutput(w io.Writer) {
Print = func(a ...interface{}) (int, error) { return fmt.Fprint(w, a...) }
Printf = func(format string, a ...interface{}) (int, error) { return fmt.Fprintf(w, format, a...) }
Println = func(a ...interface{}) (int, error) { return fmt.Fprintln(w, a...) }
}
func Log(tb testing.TB, a ...interface{}) {
if LogOn {
tb.Log(a...)
}
}
func responseAckOKToMaster(conn net.Conn, p *proto.Packet, data []byte) error {
if len(data) != 0 {
p.PacketOkWithBody(data)
} else {
p.PacketOkReply()
}
return p.WriteToConn(conn)
}
func responseAckErrToMaster(conn net.Conn, p *proto.Packet, err error) error {
status := proto.OpErr
buf := []byte(err.Error())
p.PacketErrorWithBody(status, buf)
p.ResultCode = proto.TaskFailed
return p.WriteToConn(conn)
}
func PostToMaster(method, url string, reqData []byte) (resp *http.Response, err error) {
client := &http.Client{}
reader := bytes.NewReader(reqData)
client.Timeout = time.Second * 3
var req *http.Request
if req, err = http.NewRequest(method, url, reader); err != nil {
return
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Connection", "close")
resp, err = client.Do(req)
return
}
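// Illustrative usage of PostToMaster (a minimal sketch; the endpoint path is
// only an example and is not asserted by this helper):
//
//	resp, err := PostToMaster(http.MethodGet, "http://"+hostAddr+"/admin/getCluster", nil)
//	if err == nil {
//		defer resp.Body.Close()
//		body, _ := io.ReadAll(resp.Body)
//		Println(string(body))
//	}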
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"strconv"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
// metrics
const (
StatPeriod = time.Minute * time.Duration(1)
MetricDataNodesUsedGB = "dataNodes_used_GB"
MetricDataNodesTotalGB = "dataNodes_total_GB"
MetricDataNodesIncreasedGB = "dataNodes_increased_GB"
MetricMetaNodesUsedGB = "metaNodes_used_GB"
MetricMetaNodesTotalGB = "metaNodes_total_GB"
MetricMetaNodesIncreasedGB = "metaNodes_increased_GB"
MetricDataNodesCount = "dataNodes_count"
MetricMetaNodesCount = "metaNodes_count"
MetricNodeStat = "node_stat"
MetricVolCount = "vol_count"
MetricVolTotalGB = "vol_total_GB"
MetricVolUsedGB = "vol_used_GB"
MetricVolUsageGB = "vol_usage_ratio"
MetricVolMetaCount = "vol_meta_count"
MetricBadMpCount = "bad_mp_count"
MetricBadDpCount = "bad_dp_count"
MetricDiskError = "disk_error"
MetricDataNodesInactive = "dataNodes_inactive"
MetricInactiveDataNodeInfo = "inactive_dataNodes_info"
MetricMetaNodesInactive = "metaNodes_inactive"
MetricDataNodesNotWritable = "dataNodes_not_writable"
MetricMetaNodesNotWritable = "metaNodes_not_writable"
MetricInactiveMataNodeInfo = "inactive_mataNodes_info"
MetricMetaInconsistent = "mp_inconsistent"
MetricMasterNoLeader = "master_no_leader"
MetricMasterNoCache = "master_no_cache"
MetricMasterSnapshot = "master_snapshot"
MetricMissingDp = "missing_dp"
MetricDpNoLeader = "dp_no_leader"
MetricMissingMp = "missing_mp"
MetricMpNoLeader = "mp_no_leader"
MetricDataPartitionCount = "dataPartition_count"
MetricReplicaMissingDPCount = "replica_missing_dp_count"
MetricDpMissingLeaderCount = "dp_missing_Leader_count"
MetricMpMissingLeaderCount = "mp_missing_Leader_count"
MetricDataNodesetInactiveCount = "data_nodeset_inactive_count"
MetricMetaNodesetInactiveCount = "meta_nodeset_inactive_count"
MetricNodesetMetaTotalGB = "nodeset_meta_total_GB"
MetricNodesetMetaUsedGB = "nodeset_meta_used_GB"
MetricNodesetMetaUsageRadio = "nodeset_meta_usage_ratio"
MetricNodesetDataTotalGB = "nodeset_data_total_GB"
MetricNodesetDataUsedGB = "nodeset_data_used_GB"
MetricNodesetDataUsageRadio = "nodeset_data_usage_ratio"
MetricNodesetMpReplicaCount = "nodeset_mp_replica_count"
MetricNodesetDpReplicaCount = "nodeset_dp_replica_count"
MetricLcNodesConcurrentCount = "lcNodes_concurrent"
Metrics3LcTotalScanned = "s3Lc_Total_Scanned"
Metrics3LcTotalFileScanned = "s3Lc_Total_File_Scanned"
Metrics3LcTotalDirScanned = "s3Lc_Total_Dir_Scanned"
Metrics3LcTotalExpired = "s3Lc_Total_Expired"
Metrics3LcAbortedMultipartUpload = "s3Lc_Aborted_Multipart_Upload"
MetricLcNodesCount = "lc_nodes_count"
MetricLcTotalScanned = "lc_total_scanned"
MetricLcTotalFileScanned = "lc_total_file_scanned"
MetricLcTotalDirScanned = "lc_total_dirs_scanned"
MetricLcTotalExpired = "lc_total_expired"
)
var WarnMetrics *warningMetrics
type monitorMetrics struct {
cluster *Cluster
dataNodesCount *exporter.Gauge
metaNodesCount *exporter.Gauge
volCount *exporter.Gauge
dataNodesTotal *exporter.Gauge
dataNodesUsed *exporter.Gauge
dataNodeIncreased *exporter.Gauge
metaNodesTotal *exporter.Gauge
metaNodesUsed *exporter.Gauge
metaNodesIncreased *exporter.Gauge
volTotalSpace *exporter.GaugeVec
volUsedSpace *exporter.GaugeVec
volUsage *exporter.GaugeVec
volMetaCount *exporter.GaugeVec
badMpCount *exporter.Gauge
badDpCount *exporter.Gauge
diskError *exporter.GaugeVec
dataNodesNotWritable *exporter.Gauge
metaNodesNotWritable *exporter.Gauge
dataNodesInactive *exporter.Gauge
InactiveDataNodeInfo *exporter.GaugeVec
metaNodesInactive *exporter.Gauge
InactiveMataNodeInfo *exporter.GaugeVec
dataPartitionCount *exporter.Gauge
ReplicaMissingDPCount *exporter.Gauge
DpMissingLeaderCount *exporter.Gauge
MpMissingLeaderCount *exporter.Gauge
dataNodesetInactiveCount *exporter.GaugeVec
metaNodesetInactiveCount *exporter.GaugeVec
metaEqualCheckFail *exporter.GaugeVec
masterNoLeader *exporter.Gauge
masterNoCache *exporter.GaugeVec
masterSnapshot *exporter.Gauge
nodesetMetaTotal *exporter.GaugeVec
nodesetMetaUsed *exporter.GaugeVec
nodesetMetaUsageRatio *exporter.GaugeVec
nodesetDataTotal *exporter.GaugeVec
nodesetDataUsed *exporter.GaugeVec
nodesetDataUsageRatio *exporter.GaugeVec
nodesetMpReplicaCount *exporter.GaugeVec
nodesetDpReplicaCount *exporter.GaugeVec
nodeStat *exporter.GaugeVec
volNames map[string]struct{}
badDisks map[string]string
nodesetInactiveDataNodesCount map[uint64]int64
nodesetInactiveMetaNodesCount map[uint64]int64
inconsistentMps map[string]string
nodesetIds map[uint64]string
lcNodesCount *exporter.Gauge
lcVolNames map[string]struct{}
lcTotalScanned *exporter.GaugeVec
lcTotalFileScanned *exporter.GaugeVec
lcTotalDirScanned *exporter.GaugeVec
lcTotalExpired *exporter.GaugeVec
}
func newMonitorMetrics(c *Cluster) *monitorMetrics {
return &monitorMetrics{
cluster: c,
volNames: make(map[string]struct{}),
badDisks: make(map[string]string),
nodesetInactiveDataNodesCount: make(map[uint64]int64),
nodesetInactiveMetaNodesCount: make(map[uint64]int64),
inconsistentMps: make(map[string]string),
lcVolNames: make(map[string]struct{}),
}
}
type voidType struct{}
var voidVal voidType
type addrSet struct {
addrs map[string]voidType // empty value of map does not occupy memory
replicaNum string
replicaAlive string
}
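// addrSet is used as a lightweight string set: membership is carried by key
// presence alone, so the empty struct value costs no memory. A minimal usage
// sketch (illustrative):
//
//	set := addrSet{addrs: make(map[string]voidType)}
//	set.addrs["192.168.0.1:17310"] = voidVal // add an address
//	_, ok := set.addrs["192.168.0.1:17310"]  // membership test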
type warningMetrics struct {
cluster *Cluster
missingDp *exporter.GaugeVec
dpNoLeader *exporter.GaugeVec
missingMp *exporter.GaugeVec
mpNoLeader *exporter.GaugeVec
dpMutex sync.Mutex
mpMutex sync.Mutex
dpNoLeaderInfo map[uint64]int64
mpNoLeaderInfo map[uint64]int64
dpMissingReplicaMutex sync.Mutex
mpMissingReplicaMutex sync.Mutex
dpMissingReplicaInfo map[string]addrSet
mpMissingReplicaInfo map[string]addrSet
}
func newWarningMetrics(c *Cluster) *warningMetrics {
return &warningMetrics{
cluster: c,
missingDp: exporter.NewGaugeVec(MetricMissingDp, "", []string{"clusterName", "partitionID", "addr", "ReplicaAlive", "ReplicaNum"}),
dpNoLeader: exporter.NewGaugeVec(MetricDpNoLeader, "", []string{"clusterName", "partitionID"}),
missingMp: exporter.NewGaugeVec(MetricMissingMp, "", []string{"clusterName", "partitionID", "addr"}),
mpNoLeader: exporter.NewGaugeVec(MetricMpNoLeader, "", []string{"clusterName", "partitionID"}),
dpNoLeaderInfo: make(map[uint64]int64),
mpNoLeaderInfo: make(map[uint64]int64),
dpMissingReplicaInfo: make(map[string]addrSet),
mpMissingReplicaInfo: make(map[string]addrSet),
}
}
func (m *warningMetrics) reset() {
log.LogInfo("action[warningMetrics] reset all")
m.dpMutex.Lock()
for dp := range m.dpNoLeaderInfo {
m.dpNoLeader.DeleteLabelValues(m.cluster.Name, strconv.FormatUint(dp, 10))
delete(m.dpNoLeaderInfo, dp)
}
m.dpMutex.Unlock()
m.mpMutex.Lock()
for mp := range m.mpNoLeaderInfo {
m.mpNoLeader.DeleteLabelValues(m.cluster.Name, strconv.FormatUint(mp, 10))
delete(m.mpNoLeaderInfo, mp)
}
m.mpMutex.Unlock()
m.dpMissingReplicaMutex.Lock()
for id, dpAddrSet := range m.dpMissingReplicaInfo {
for addr := range dpAddrSet.addrs {
m.missingDp.DeleteLabelValues(m.cluster.Name, id, addr, dpAddrSet.replicaAlive, dpAddrSet.replicaNum)
}
delete(m.dpMissingReplicaInfo, id)
}
m.dpMissingReplicaMutex.Unlock()
m.mpMissingReplicaMutex.Lock()
for id, mpAddrSet := range m.mpMissingReplicaInfo {
for addr := range mpAddrSet.addrs {
m.missingMp.DeleteLabelValues(m.cluster.Name, id, addr)
}
delete(m.mpMissingReplicaInfo, id)
}
m.mpMissingReplicaMutex.Unlock()
}
// The caller is responsible for holding dpMissingReplicaMutex.
func (m *warningMetrics) deleteMissingDp(missingDpAddrSet addrSet, clusterName, dpId, addr string) {
if len(missingDpAddrSet.addrs) == 0 {
return
}
if _, ok := missingDpAddrSet.addrs[addr]; !ok {
return
}
replicaAlive := m.dpMissingReplicaInfo[dpId].replicaAlive
replicaNum := m.dpMissingReplicaInfo[dpId].replicaNum
delete(missingDpAddrSet.addrs, addr)
if len(missingDpAddrSet.addrs) == 0 {
delete(m.dpMissingReplicaInfo, dpId)
}
m.missingDp.DeleteLabelValues(clusterName, dpId, addr, replicaAlive, replicaNum)
log.LogDebugf("action[deleteMissingDp] delete: dpId(%v), addr(%v)", dpId, addr)
}
// leader only
func (m *warningMetrics) WarnMissingDp(clusterName, addr string, partitionID uint64, report bool) {
m.dpMissingReplicaMutex.Lock()
defer m.dpMissingReplicaMutex.Unlock()
if clusterName != m.cluster.Name {
return
}
id := strconv.FormatUint(partitionID, 10)
if !report {
m.deleteMissingDp(m.dpMissingReplicaInfo[id], clusterName, id, addr)
return
}
// m.missingDp.SetWithLabelValues(1, clusterName, id, addr)
if _, ok := m.dpMissingReplicaInfo[id]; !ok {
m.dpMissingReplicaInfo[id] = addrSet{addrs: make(map[string]voidType)}
// m.dpMissingReplicaInfo[id].addrs = make(addrSet)
}
m.dpMissingReplicaInfo[id].addrs[addr] = voidVal
}
// leader only
func (m *warningMetrics) CleanObsoleteDpMissing(clusterName string, dp *DataPartition) {
m.dpMissingReplicaMutex.Lock()
defer m.dpMissingReplicaMutex.Unlock()
if clusterName != m.cluster.Name {
return
}
id := strconv.FormatUint(dp.PartitionID, 10)
missingRepAddrs, ok := m.dpMissingReplicaInfo[id]
if !ok {
return
}
for addr := range missingRepAddrs.addrs {
_, hasReplica := dp.hasReplica(addr)
hasHost := dp.hasHost(addr)
if !hasReplica && !hasHost {
log.LogDebugf("action[warningMetrics] delete obsolete dp missing record: dpId(%v), addr(%v)", id, addr)
m.deleteMissingDp(missingRepAddrs, clusterName, id, addr)
}
}
}
// leader only
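// Note: the first no-leader report for a partition only records a timestamp;
// the dpNoLeader gauge is set only if the partition is still reported
// leaderless after DpNoLeaderReportIntervalSec, which debounces transient
// leader elections (WarnMpNoLeader below follows the same pattern).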
func (m *warningMetrics) WarnDpNoLeader(clusterName string, partitionID uint64, report bool) {
if clusterName != m.cluster.Name {
return
}
m.dpMutex.Lock()
defer m.dpMutex.Unlock()
t, ok := m.dpNoLeaderInfo[partitionID]
if !report {
if ok {
delete(m.dpNoLeaderInfo, partitionID)
m.dpNoLeader.DeleteLabelValues(clusterName, strconv.FormatUint(partitionID, 10))
}
return
}
now := time.Now().Unix()
if !ok {
m.dpNoLeaderInfo[partitionID] = now
return
}
if now-t > m.cluster.cfg.DpNoLeaderReportIntervalSec {
m.dpNoLeader.SetWithLabelValues(1, clusterName, strconv.FormatUint(partitionID, 10))
m.dpNoLeaderInfo[partitionID] = now
}
}
// The caller is responsible for holding mpMissingReplicaMutex.
func (m *warningMetrics) deleteMissingMp(missingMpAddrSet addrSet, clusterName, mpId, addr string) {
if len(missingMpAddrSet.addrs) == 0 {
return
}
if _, ok := missingMpAddrSet.addrs[addr]; !ok {
return
}
delete(missingMpAddrSet.addrs, addr)
if len(missingMpAddrSet.addrs) == 0 {
delete(m.mpMissingReplicaInfo, mpId)
}
m.missingMp.DeleteLabelValues(clusterName, mpId, addr)
log.LogDebugf("action[deleteMissingMp] delete: mpId(%v), addr(%v)", mpId, addr)
}
// leader only
func (m *warningMetrics) WarnMissingMp(clusterName, addr string, partitionID uint64, report bool) {
m.mpMissingReplicaMutex.Lock()
defer m.mpMissingReplicaMutex.Unlock()
if clusterName != m.cluster.Name {
return
}
id := strconv.FormatUint(partitionID, 10)
if !report {
m.deleteMissingMp(m.mpMissingReplicaInfo[id], clusterName, id, addr)
return
}
m.missingMp.SetWithLabelValues(1, clusterName, id, addr)
if _, ok := m.mpMissingReplicaInfo[id]; !ok {
m.mpMissingReplicaInfo[id] = addrSet{addrs: make(map[string]voidType)}
// m.mpMissingReplicaInfo[id] = make(addrSet)
}
m.mpMissingReplicaInfo[id].addrs[addr] = voidVal
}
// leader only
func (m *warningMetrics) CleanObsoleteMpMissing(clusterName string, mp *MetaPartition) {
m.mpMissingReplicaMutex.Lock()
defer m.mpMissingReplicaMutex.Unlock()
if clusterName != m.cluster.Name {
return
}
id := strconv.FormatUint(mp.PartitionID, 10)
missingRepAddrs, ok := m.mpMissingReplicaInfo[id]
if !ok {
return
}
for addr := range missingRepAddrs.addrs {
if _, err := mp.getMetaReplica(addr); err != nil {
log.LogDebugf("action[warningMetrics] delete obsolete Mp missing record: dpId(%v), addr(%v)", id, addr)
m.deleteMissingMp(missingRepAddrs, clusterName, id, addr)
}
}
}
// leader only
func (m *warningMetrics) WarnMpNoLeader(clusterName string, partitionID uint64, report bool) {
if clusterName != m.cluster.Name {
return
}
m.mpMutex.Lock()
defer m.mpMutex.Unlock()
t, ok := m.mpNoLeaderInfo[partitionID]
if !report {
if ok {
delete(m.mpNoLeaderInfo, partitionID)
m.mpNoLeader.DeleteLabelValues(clusterName, strconv.FormatUint(partitionID, 10))
}
return
}
now := time.Now().Unix()
if !ok {
m.mpNoLeaderInfo[partitionID] = now
return
}
if now-t > m.cluster.cfg.MpNoLeaderReportIntervalSec {
m.mpNoLeader.SetWithLabelValues(1, clusterName, strconv.FormatUint(partitionID, 10))
m.mpNoLeaderInfo[partitionID] = now
}
}
func (mm *monitorMetrics) start() {
mm.dataNodesTotal = exporter.NewGauge(MetricDataNodesTotalGB)
mm.dataNodesUsed = exporter.NewGauge(MetricDataNodesUsedGB)
mm.dataNodeIncreased = exporter.NewGauge(MetricDataNodesIncreasedGB)
mm.metaNodesTotal = exporter.NewGauge(MetricMetaNodesTotalGB)
mm.metaNodesUsed = exporter.NewGauge(MetricMetaNodesUsedGB)
mm.metaNodesIncreased = exporter.NewGauge(MetricMetaNodesIncreasedGB)
mm.dataNodesCount = exporter.NewGauge(MetricDataNodesCount)
mm.metaNodesCount = exporter.NewGauge(MetricMetaNodesCount)
mm.lcNodesCount = exporter.NewGauge(MetricLcNodesCount)
mm.volCount = exporter.NewGauge(MetricVolCount)
mm.volTotalSpace = exporter.NewGaugeVec(MetricVolTotalGB, "", []string{"volName"})
mm.volUsedSpace = exporter.NewGaugeVec(MetricVolUsedGB, "", []string{"volName"})
mm.volUsage = exporter.NewGaugeVec(MetricVolUsageGB, "", []string{"volName"})
mm.volMetaCount = exporter.NewGaugeVec(MetricVolMetaCount, "", []string{"volName", "type"})
mm.badMpCount = exporter.NewGauge(MetricBadMpCount)
mm.badDpCount = exporter.NewGauge(MetricBadDpCount)
mm.diskError = exporter.NewGaugeVec(MetricDiskError, "", []string{"addr", "path"})
mm.nodeStat = exporter.NewGaugeVec(MetricNodeStat, "", []string{"type", "addr", "stat"})
mm.dataNodesInactive = exporter.NewGauge(MetricDataNodesInactive)
mm.InactiveDataNodeInfo = exporter.NewGaugeVec(MetricInactiveDataNodeInfo, "", []string{"clusterName", "addr"})
mm.metaNodesInactive = exporter.NewGauge(MetricMetaNodesInactive)
mm.dataNodesNotWritable = exporter.NewGauge(MetricDataNodesNotWritable)
mm.metaNodesNotWritable = exporter.NewGauge(MetricMetaNodesNotWritable)
mm.InactiveMataNodeInfo = exporter.NewGaugeVec(MetricInactiveMataNodeInfo, "", []string{"clusterName", "addr"})
mm.dataPartitionCount = exporter.NewGauge(MetricDataPartitionCount)
mm.ReplicaMissingDPCount = exporter.NewGauge(MetricReplicaMissingDPCount)
mm.DpMissingLeaderCount = exporter.NewGauge(MetricDpMissingLeaderCount)
mm.MpMissingLeaderCount = exporter.NewGauge(MetricMpMissingLeaderCount)
mm.dataNodesetInactiveCount = exporter.NewGaugeVec(MetricDataNodesetInactiveCount, "", []string{"nodeset"})
mm.metaNodesetInactiveCount = exporter.NewGaugeVec(MetricMetaNodesetInactiveCount, "", []string{"nodeset"})
mm.metaEqualCheckFail = exporter.NewGaugeVec(MetricMetaInconsistent, "", []string{"volume", "mpId"})
mm.masterSnapshot = exporter.NewGauge(MetricMasterSnapshot)
mm.masterNoLeader = exporter.NewGauge(MetricMasterNoLeader)
mm.masterNoCache = exporter.NewGaugeVec(MetricMasterNoCache, "", []string{"volName"})
mm.nodesetMetaTotal = exporter.NewGaugeVec(MetricNodesetMetaTotalGB, "", []string{"nodeset"})
mm.nodesetMetaUsed = exporter.NewGaugeVec(MetricNodesetMetaUsedGB, "", []string{"nodeset"})
mm.nodesetMetaUsageRatio = exporter.NewGaugeVec(MetricNodesetMetaUsageRadio, "", []string{"nodeset"})
mm.nodesetDataTotal = exporter.NewGaugeVec(MetricNodesetDataTotalGB, "", []string{"nodeset"})
mm.nodesetDataUsed = exporter.NewGaugeVec(MetricNodesetDataUsedGB, "", []string{"nodeset"})
mm.nodesetDataUsageRatio = exporter.NewGaugeVec(MetricNodesetDataUsageRadio, "", []string{"nodeset"})
mm.nodesetMpReplicaCount = exporter.NewGaugeVec(MetricNodesetMpReplicaCount, "", []string{"nodeset"})
mm.nodesetDpReplicaCount = exporter.NewGaugeVec(MetricNodesetDpReplicaCount, "", []string{"nodeset"})
mm.lcNodesCount = exporter.NewGauge(MetricLcNodesCount)
mm.lcTotalScanned = exporter.NewGaugeVec(MetricLcTotalScanned, "", []string{"volName", "type"})
mm.lcTotalFileScanned = exporter.NewGaugeVec(MetricLcTotalFileScanned, "", []string{"volName", "type"})
mm.lcTotalDirScanned = exporter.NewGaugeVec(MetricLcTotalDirScanned, "", []string{"volName", "type"})
mm.lcTotalExpired = exporter.NewGaugeVec(MetricLcTotalExpired, "", []string{"volName", "type"})
go mm.statMetrics()
}
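// Note: the deferred recover below stops the ticker and lets statMetrics
// return after a panic, so a panic inside doStat halts metric collection for
// this process instead of crashing the master.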
func (mm *monitorMetrics) statMetrics() {
ticker := time.NewTicker(StatPeriod)
defer func() {
if err := recover(); err != nil {
ticker.Stop()
log.LogErrorf("statMetrics panic,msg:%v", err)
}
}()
for {
select {
case <-ticker.C:
partition := mm.cluster.partition
if partition != nil && partition.IsRaftLeader() {
mm.resetFollowerMetrics()
mm.doStat()
} else {
mm.resetAllLeaderMetrics()
mm.doFollowerStat()
}
}
}
}
func (mm *monitorMetrics) doFollowerStat() {
if mm.cluster.leaderInfo.addr == "" {
mm.masterNoLeader.Set(1)
} else {
mm.masterNoLeader.Set(0)
}
if mm.cluster.fsm.onSnapshot {
mm.masterSnapshot.Set(1)
} else {
mm.masterSnapshot.Set(0)
}
mm.setVolNoCacheMetrics()
}
func (mm *monitorMetrics) doStat() {
dataNodeCount := mm.cluster.dataNodeCount()
mm.dataNodesCount.Set(float64(dataNodeCount))
metaNodeCount := mm.cluster.metaNodeCount()
mm.metaNodesCount.Set(float64(metaNodeCount))
lcNodeCount := mm.cluster.lcNodeCount()
mm.lcNodesCount.Set(float64(lcNodeCount))
volCount := len(mm.cluster.vols)
mm.volCount.Set(float64(volCount))
mm.dataNodesTotal.Set(float64(mm.cluster.dataNodeStatInfo.TotalGB))
mm.dataNodesUsed.Set(float64(mm.cluster.dataNodeStatInfo.UsedGB))
mm.dataNodeIncreased.Set(float64(mm.cluster.dataNodeStatInfo.IncreasedGB))
mm.metaNodesTotal.Set(float64(mm.cluster.metaNodeStatInfo.TotalGB))
mm.metaNodesUsed.Set(float64(mm.cluster.metaNodeStatInfo.UsedGB))
mm.metaNodesIncreased.Set(float64(mm.cluster.metaNodeStatInfo.IncreasedGB))
mm.setVolMetrics()
mm.setBadPartitionMetrics()
mm.setDiskErrorMetric()
mm.setNotWritableDataNodesCount()
mm.setNotWritableMetaNodesCount()
mm.setMpInconsistentErrorMetric()
mm.setMpAndDpMetrics()
mm.setNodesetMetrics()
mm.setLcMetrics()
mm.updateDataNodesStat()
mm.updateMetaNodesStat()
}
func (mm *monitorMetrics) setMpAndDpMetrics() {
dpCount := 0
dpMissingReplicaDpCount := 0
dpMissingLeaderCount := 0
mpMissingLeaderCount := 0
vols := mm.cluster.copyVols()
for _, vol := range vols {
if vol.Status == proto.VolStatusMarkDelete {
continue
}
dps := vol.dataPartitions
dpCount += len(dps.partitions)
for _, dp := range dps.partitions {
if dp.ReplicaNum > uint8(len(dp.liveReplicas(defaultDataPartitionTimeOutSec))) {
dpMissingReplicaDpCount++
}
if proto.IsNormalDp(dp.PartitionType) && dp.getLeaderAddr() == "" {
dpMissingLeaderCount++
}
}
vol.mpsLock.RLock()
for _, mp := range vol.MetaPartitions {
if !mp.isLeaderExist() {
mpMissingLeaderCount++
}
}
vol.mpsLock.RUnlock()
}
mm.dataPartitionCount.Set(float64(dpCount))
mm.ReplicaMissingDPCount.Set(float64(dpMissingReplicaDpCount))
mm.DpMissingLeaderCount.Set(float64(dpMissingLeaderCount))
mm.MpMissingLeaderCount.Set(float64(mpMissingLeaderCount))
return
}
func (mm *monitorMetrics) setVolNoCacheMetrics() {
deleteVolNames := make(map[string]struct{})
obsoleteVolNames := make(map[string]struct{})
mm.cluster.followerReadManager.rwMutex.RLock()
for volName, stat := range mm.cluster.followerReadManager.status {
if mm.cluster.followerReadManager.isVolRecordObsolete(volName) {
deleteVolNames[volName] = struct{}{}
obsoleteVolNames[volName] = struct{}{}
log.LogDebugf("setVolNoCacheMetrics: add volName %v to deleteVolNames because the vol record became obsolete", volName)
continue
}
if stat {
deleteVolNames[volName] = struct{}{}
log.LogDebugf("setVolNoCacheMetrics: add volName %v to deleteVolNames because its status became ok", volName)
continue
}
log.LogWarnf("setVolNoCacheMetrics volName %v", volName)
mm.masterNoCache.SetWithLabelValues(1, volName)
}
mm.cluster.followerReadManager.rwMutex.RUnlock()
for volName := range deleteVolNames {
mm.masterNoCache.DeleteLabelValues(volName)
}
mm.cluster.followerReadManager.DelObsoleteVolRecord(obsoleteVolNames)
}
func (mm *monitorMetrics) setVolMetrics() {
deleteVolNames := make(map[string]struct{})
for k, v := range mm.volNames {
deleteVolNames[k] = v
delete(mm.volNames, k)
}
mm.cluster.volStatInfo.Range(func(key, value interface{}) bool {
volStatInfo, ok := value.(*volStatInfo)
if !ok {
return true
}
volName, ok := key.(string)
if !ok {
return true
}
mm.volNames[volName] = struct{}{}
if _, ok := deleteVolNames[volName]; ok {
delete(deleteVolNames, volName)
}
mm.volTotalSpace.SetWithLabelValues(float64(volStatInfo.TotalSize)/float64(util.GB), volName)
mm.volUsedSpace.SetWithLabelValues(float64(volStatInfo.UsedSize)/float64(util.GB), volName)
usedRatio, e := strconv.ParseFloat(volStatInfo.UsedRatio, 64)
if e == nil {
mm.volUsage.SetWithLabelValues(usedRatio, volName)
}
if usedRatio > volWarnUsedRatio {
WarnBySpecialKey("vol size used too high", fmt.Sprintf("vol: %v(total: %v, used: %v) has used(%v) to be full", volName, volStatInfo.TotalSize, volStatInfo.UsedRatio, volStatInfo.UsedSize))
}
return true
})
for volName, vol := range mm.cluster.allVols() {
inodeCount := uint64(0)
dentryCount := uint64(0)
mpCount := uint64(0)
freeListLen := uint64(0)
for _, mpv := range vol.getMetaPartitionsView() {
inodeCount += mpv.InodeCount
dentryCount += mpv.DentryCount
mpCount += 1
freeListLen += mpv.FreeListLen
}
mm.volMetaCount.SetWithLabelValues(float64(inodeCount), volName, "inode")
mm.volMetaCount.SetWithLabelValues(float64(dentryCount), volName, "dentry")
mm.volMetaCount.SetWithLabelValues(float64(mpCount), volName, "mp")
mm.volMetaCount.SetWithLabelValues(float64(vol.getDataPartitionsCount()), volName, "dp")
mm.volMetaCount.SetWithLabelValues(float64(freeListLen), volName, "freeList")
}
for volName := range deleteVolNames {
mm.deleteVolMetric(volName)
}
}
func (mm *monitorMetrics) setBadPartitionMetrics() {
badMpCount := uint64(0)
mm.cluster.BadMetaPartitionIds.Range(func(key, value interface{}) bool {
badMpCount += uint64(len(value.([]uint64)))
return true
})
mm.badMpCount.SetWithLabels(float64(badMpCount), map[string]string{"type": "bad_mp"})
badDpCount := uint64(0)
mm.cluster.BadDataPartitionIds.Range(func(key, value interface{}) bool {
badDpCount += uint64(len(value.([]uint64)))
return true
})
mm.badDpCount.SetWithLabels(float64(badDpCount), map[string]string{"type": "bad_dp"})
}
func (mm *monitorMetrics) deleteVolMetric(volName string) {
mm.volTotalSpace.DeleteLabelValues(volName)
mm.volUsedSpace.DeleteLabelValues(volName)
mm.volUsage.DeleteLabelValues(volName)
mm.volMetaCount.DeleteLabelValues(volName, "inode")
mm.volMetaCount.DeleteLabelValues(volName, "dentry")
mm.volMetaCount.DeleteLabelValues(volName, "mp")
mm.volMetaCount.DeleteLabelValues(volName, "dp")
mm.volMetaCount.DeleteLabelValues(volName, "freeList")
}
func (mm *monitorMetrics) setMpInconsistentErrorMetric() {
deleteMps := make(map[string]string)
for k, v := range mm.inconsistentMps {
deleteMps[k] = v
delete(mm.inconsistentMps, k)
}
mm.cluster.volMutex.RLock()
defer mm.cluster.volMutex.RUnlock()
for _, vol := range mm.cluster.vols {
if vol.Status == proto.VolStatusMarkDelete {
continue
}
vol.mpsLock.RLock()
for _, mp := range vol.MetaPartitions {
if mp.IsRecover || mp.EqualCheckPass {
continue
}
idStr := strconv.FormatUint(mp.PartitionID, 10)
mm.metaEqualCheckFail.SetWithLabelValues(1, vol.Name, idStr)
mm.inconsistentMps[idStr] = vol.Name
log.LogWarnf("setMpInconsistentErrorMetric.mp %v SetWithLabelValues id %v vol %v", mp.PartitionID, idStr, vol.Name)
delete(deleteMps, idStr)
}
vol.mpsLock.RUnlock()
}
for k, v := range deleteMps {
mm.metaEqualCheckFail.DeleteLabelValues(v, k)
}
}
func (mm *monitorMetrics) setDiskErrorMetric() {
// key: addr_diskpath, val: addr
deleteBadDisks := make(map[string]string)
for k, v := range mm.badDisks {
deleteBadDisks[k] = v
delete(mm.badDisks, k)
}
mm.cluster.dataNodes.Range(func(addr, node interface{}) bool {
dataNode, ok := node.(*DataNode)
if !ok {
return true
}
for _, badDisk := range dataNode.BadDisks {
for _, partition := range dataNode.DataPartitionReports {
if partition.DiskPath == badDisk {
key := fmt.Sprintf("%s_%s", dataNode.Addr, badDisk)
mm.diskError.SetWithLabelValues(1, dataNode.Addr, key)
mm.badDisks[key] = dataNode.Addr
delete(deleteBadDisks, key)
break
}
}
}
return true
})
for k, v := range deleteBadDisks {
mm.diskError.DeleteLabelValues(v, k)
}
}
func (mm *monitorMetrics) updateMetaNodesStat() {
var inactiveMetaNodesCount int64
deleteNodesetCount := make(map[uint64]int64)
for k, v := range mm.nodesetInactiveMetaNodesCount {
deleteNodesetCount[k] = v
delete(mm.nodesetInactiveMetaNodesCount, k)
}
mm.cluster.metaNodes.Range(func(addr, node interface{}) bool {
metaNode, ok := node.(*MetaNode)
if !ok {
return true
}
if !metaNode.IsActive {
inactiveMetaNodesCount++
mm.InactiveMataNodeInfo.SetWithLabelValues(1, mm.cluster.Name, metaNode.Addr)
mm.nodesetInactiveMetaNodesCount[metaNode.NodeSetID] = mm.nodesetInactiveMetaNodesCount[metaNode.NodeSetID] + 1
delete(deleteNodesetCount, metaNode.NodeSetID)
} else {
mm.InactiveMataNodeInfo.DeleteLabelValues(mm.cluster.Name, metaNode.Addr)
}
mm.nodeStat.SetWithLabelValues(metaNode.Ratio, MetricRoleMetaNode, metaNode.Addr, "usageRatio")
mm.nodeStat.SetWithLabelValues(float64(metaNode.Total), MetricRoleMetaNode, metaNode.Addr, "memTotal")
mm.nodeStat.SetWithLabelValues(float64(metaNode.Used), MetricRoleMetaNode, metaNode.Addr, "memUsed")
mm.nodeStat.SetWithLabelValues(float64(metaNode.MetaPartitionCount), MetricRoleMetaNode, metaNode.Addr, "mpCount")
mm.nodeStat.SetWithLabelValues(float64(metaNode.Threshold), MetricRoleMetaNode, metaNode.Addr, "threshold")
mm.nodeStat.SetBoolWithLabelValues(metaNode.isWritable(), MetricRoleMetaNode, metaNode.Addr, "writable")
mm.nodeStat.SetBoolWithLabelValues(metaNode.IsActive, MetricRoleMetaNode, metaNode.Addr, "active")
return true
})
mm.metaNodesInactive.Set(float64(inactiveMetaNodesCount))
for id, count := range mm.nodesetInactiveMetaNodesCount {
mm.metaNodesetInactiveCount.SetWithLabelValues(float64(count), strconv.FormatUint(id, 10))
}
for k := range deleteNodesetCount {
mm.metaNodesetInactiveCount.DeleteLabelValues(strconv.FormatUint(k, 10))
}
}
func (mm *monitorMetrics) clearInactiveMetaNodesCountMetric() {
for k := range mm.nodesetInactiveMetaNodesCount {
mm.metaNodesetInactiveCount.DeleteLabelValues(strconv.FormatUint(k, 10))
}
}
func (mm *monitorMetrics) updateDataNodesStat() {
var inactiveDataNodesCount uint64
deleteNodesetCount := make(map[uint64]int64)
for k, v := range mm.nodesetInactiveDataNodesCount {
log.LogErrorf("setInactiveDataNodesCountMetric, init deleteNodesetCount")
deleteNodesetCount[k] = v
delete(mm.nodesetInactiveDataNodesCount, k)
}
mm.cluster.dataNodes.Range(func(addr, node interface{}) bool {
dataNode, ok := node.(*DataNode)
if !ok {
return true
}
if !dataNode.isActive {
inactiveDataNodesCount++
mm.InactiveDataNodeInfo.SetWithLabelValues(1, mm.cluster.Name, dataNode.Addr)
mm.nodesetInactiveDataNodesCount[dataNode.NodeSetID] = mm.nodesetInactiveDataNodesCount[dataNode.NodeSetID] + 1
delete(deleteNodesetCount, dataNode.NodeSetID)
} else {
mm.InactiveDataNodeInfo.DeleteLabelValues(mm.cluster.Name, dataNode.Addr)
}
mm.nodeStat.SetWithLabelValues(float64(dataNode.DataPartitionCount), MetricRoleDataNode, dataNode.Addr, "dpCount")
mm.nodeStat.SetWithLabelValues(float64(dataNode.Total), MetricRoleDataNode, dataNode.Addr, "diskTotal")
mm.nodeStat.SetWithLabelValues(float64(dataNode.Used), MetricRoleDataNode, dataNode.Addr, "diskUsed")
mm.nodeStat.SetWithLabelValues(float64(dataNode.AvailableSpace), MetricRoleDataNode, dataNode.Addr, "diskAvail")
mm.nodeStat.SetWithLabelValues(dataNode.UsageRatio, MetricRoleDataNode, dataNode.Addr, "usageRatio")
mm.nodeStat.SetWithLabelValues(float64(len(dataNode.BadDisks)), MetricRoleDataNode, dataNode.Addr, "badDiskCount")
mm.nodeStat.SetBoolWithLabelValues(dataNode.isActive, MetricRoleDataNode, dataNode.Addr, "active")
mm.nodeStat.SetBoolWithLabelValues(dataNode.isWriteAble(), MetricRoleDataNode, dataNode.Addr, "writable")
return true
})
mm.dataNodesInactive.Set(float64(inactiveDataNodesCount))
for id, count := range mm.nodesetInactiveDataNodesCount {
mm.dataNodesetInactiveCount.SetWithLabelValues(float64(count), strconv.FormatUint(id, 10))
}
for k := range deleteNodesetCount {
mm.dataNodesetInactiveCount.DeleteLabelValues(strconv.FormatUint(k, 10))
}
}
func (mm *monitorMetrics) clearInactiveDataNodesCountMetric() {
for k := range mm.nodesetInactiveDataNodesCount {
mm.dataNodesetInactiveCount.DeleteLabelValues(strconv.FormatUint(k, 10))
}
}
func (mm *monitorMetrics) setNotWritableMetaNodesCount() {
var notWritableMetaNodesCount int64
mm.cluster.metaNodes.Range(func(addr, node interface{}) bool {
metaNode, ok := node.(*MetaNode)
if !ok {
return true
}
if !metaNode.isWritable() {
notWritableMetaNodesCount++
}
return true
})
mm.metaNodesNotWritable.Set(float64(notWritableMetaNodesCount))
}
func (mm *monitorMetrics) setNotWritableDataNodesCount() {
var notWritableDataNodesCount int64
mm.cluster.dataNodes.Range(func(addr, node interface{}) bool {
dataNode, ok := node.(*DataNode)
if !ok {
return true
}
if !dataNode.isWriteAble() {
notWritableDataNodesCount++
}
return true
})
mm.dataNodesNotWritable.Set(float64(notWritableDataNodesCount))
}
func (mm *monitorMetrics) clearInconsistentMps() {
for k := range mm.inconsistentMps {
mm.dataNodesetInactiveCount.DeleteLabelValues(k)
}
}
func (mm *monitorMetrics) deleteS3LcVolMetric(volName string) {
mm.lcTotalScanned.DeleteLabelValues(volName, "total")
mm.lcTotalFileScanned.DeleteLabelValues(volName, "file")
mm.lcTotalDirScanned.DeleteLabelValues(volName, "dir")
mm.lcTotalExpired.DeleteLabelValues(volName, "expired")
}
func (mm *monitorMetrics) setLcMetrics() {
lcTaskStatus := mm.cluster.lcMgr.lcRuleTaskStatus
volumeScanStatistics := make(map[string]proto.LcNodeRuleTaskStatistics, 0)
lcTaskStatus.RLock()
for _, r := range lcTaskStatus.Results {
key := r.Volume + "[" + r.RuleId + "]"
if _, ok := volumeScanStatistics[key]; ok && r.Done {
volumeScanStatistics[key] = proto.LcNodeRuleTaskStatistics{}
} else {
volumeScanStatistics[key] = r.LcNodeRuleTaskStatistics
}
}
lcTaskStatus.RUnlock()
for key, stat := range volumeScanStatistics {
mm.lcVolNames[key] = struct{}{}
mm.lcTotalScanned.SetWithLabelValues(float64(stat.TotalInodeScannedNum), key, "total")
mm.lcTotalFileScanned.SetWithLabelValues(float64(stat.FileScannedNum), key, "file")
mm.lcTotalDirScanned.SetWithLabelValues(float64(stat.DirScannedNum), key, "dir")
mm.lcTotalExpired.SetWithLabelValues(float64(stat.ExpiredNum), key, "expired")
}
}
func (mm *monitorMetrics) clearLcMetrics() {
for vol := range mm.lcVolNames {
mm.deleteS3LcVolMetric(vol)
delete(mm.lcVolNames, vol)
}
}
func (mm *monitorMetrics) clearVolMetrics() {
mm.cluster.volStatInfo.Range(func(key, value interface{}) bool {
if volName, ok := key.(string); ok {
mm.deleteVolMetric(volName)
}
return true
})
}
func (mm *monitorMetrics) clearDiskErrMetrics() {
for k, v := range mm.badDisks {
mm.diskError.DeleteLabelValues(v, k)
}
}
func (mm *monitorMetrics) setNodesetMetrics() {
deleteNodesetIds := make(map[uint64]string)
for k, v := range mm.nodesetIds {
deleteNodesetIds[k] = v
}
mm.nodesetIds = make(map[uint64]string)
zones := mm.cluster.t.getAllZones()
for _, zone := range zones {
nodeSets := zone.getAllNodeSet()
for _, nodeset := range nodeSets {
var metaTotal, metaUsed, dataTotal, dataUsed uint64
var mpReplicasCount, dpReplicasCount int
nodeset.metaNodes.Range(func(key, value interface{}) bool {
metaNode := value.(*MetaNode)
metaTotal += metaNode.Total
metaUsed += metaNode.Used
mpReplicasCount += metaNode.MetaPartitionCount
return true
})
nodeset.dataNodes.Range(func(ney, value interface{}) bool {
dataNode := value.(*DataNode)
dataTotal += dataNode.Total
dataUsed += dataNode.Used
dpReplicasCount += int(dataNode.DataPartitionCount)
return true
})
nodesetId := strconv.FormatUint(nodeset.ID, 10)
mm.nodesetIds[nodeset.ID] = nodesetId
delete(deleteNodesetIds, nodeset.ID)
mm.nodesetMetaTotal.SetWithLabelValues(float64(metaTotal)/util.GB, nodesetId)
mm.nodesetMetaUsed.SetWithLabelValues(float64(metaUsed)/util.GB, nodesetId)
mm.nodesetDataTotal.SetWithLabelValues(float64(dataTotal)/util.GB, nodesetId)
mm.nodesetDataUsed.SetWithLabelValues(float64(dataUsed)/util.GB, nodesetId)
if metaTotal == 0 {
mm.nodesetMetaUsageRatio.SetWithLabelValues(0, nodesetId)
} else {
mm.nodesetMetaUsageRatio.SetWithLabelValues(float64(metaUsed)/float64(metaTotal), nodesetId)
}
if dataTotal == 0 {
mm.nodesetDataUsageRatio.SetWithLabelValues(0, nodesetId)
} else {
mm.nodesetDataUsageRatio.SetWithLabelValues(float64(dataUsed)/float64(dataTotal), nodesetId)
}
mm.nodesetMpReplicaCount.SetWithLabelValues(float64(mpReplicasCount), nodesetId)
mm.nodesetDpReplicaCount.SetWithLabelValues(float64(dpReplicasCount), nodesetId)
}
}
for _, v := range deleteNodesetIds {
mm.deleteNodesetMetric(v)
}
}
func (mm *monitorMetrics) deleteNodesetMetric(nodesetId string) {
mm.nodesetMetaTotal.DeleteLabelValues(nodesetId)
mm.nodesetMetaUsed.DeleteLabelValues(nodesetId)
mm.nodesetMetaUsageRatio.DeleteLabelValues(nodesetId)
mm.nodesetDataTotal.DeleteLabelValues(nodesetId)
mm.nodesetDataUsed.DeleteLabelValues(nodesetId)
mm.nodesetDataUsageRatio.DeleteLabelValues(nodesetId)
mm.nodesetMpReplicaCount.DeleteLabelValues(nodesetId)
mm.nodesetDpReplicaCount.DeleteLabelValues(nodesetId)
}
func (mm *monitorMetrics) clearNodesetMetrics() {
zones := mm.cluster.t.getAllZones()
for _, zone := range zones {
nodeSets := zone.getAllNodeSet()
for _, nodeset := range nodeSets {
mm.deleteNodesetMetric(strconv.FormatUint(nodeset.ID, 10))
}
}
}
func (mm *monitorMetrics) resetFollowerMetrics() {
mm.masterNoCache.GaugeVec.Reset()
mm.masterNoLeader.Set(0)
mm.masterSnapshot.Set(0)
}
func (mm *monitorMetrics) resetAllLeaderMetrics() {
mm.clearVolMetrics()
mm.clearDiskErrMetrics()
mm.clearInactiveMetaNodesCountMetric()
mm.clearInactiveDataNodesCountMetric()
mm.clearInconsistentMps()
mm.clearNodesetMetrics()
mm.clearLcMetrics()
mm.dataNodesCount.Set(0)
mm.metaNodesCount.Set(0)
mm.lcNodesCount.Set(0)
mm.volCount.Set(0)
mm.dataNodesTotal.Set(0)
mm.dataNodesUsed.Set(0)
mm.dataNodeIncreased.Set(0)
mm.metaNodesTotal.Set(0)
mm.metaNodesUsed.Set(0)
mm.metaNodesIncreased.Set(0)
// mm.diskError.Set(0)
mm.dataNodesInactive.Set(0)
mm.metaNodesInactive.Set(0)
mm.dataNodesNotWritable.Set(0)
mm.metaNodesNotWritable.Set(0)
mm.dataPartitionCount.Set(0)
mm.ReplicaMissingDPCount.Set(0)
mm.MpMissingLeaderCount.Set(0)
mm.DpMissingLeaderCount.Set(0)
}
package master
import (
"encoding/json"
"fmt"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
type Ver2PhaseCommit struct {
op uint8
prepareInfo *proto.VolVersionInfo
commitCnt uint32
nodeCnt uint32
dataNodeArray *sync.Map
metaNodeArray *sync.Map
}
func (commit *Ver2PhaseCommit) String() string {
return fmt.Sprintf("prepareCommit:(op[%v] commitCnt[%v],nodeCnt[%v] info[%v])",
commit.op, commit.commitCnt, commit.nodeCnt, commit.prepareInfo)
}
func (commit *Ver2PhaseCommit) reset(volName string) {
commit.op = 0
commit.commitCnt = 0
commit.nodeCnt = 0
// datanodes and metanodes do not allow membership changes while a snapshot is in progress
commit.dataNodeArray = new(sync.Map)
commit.metaNodeArray = new(sync.Map)
log.LogDebugf("action[Ver2PhaseCommit.reset] vol name %v", volName)
}
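// The Ver2PhaseCommit fields drive a simple two-phase commit across data and
// meta nodes: createTask fans a prepare task out to every involved node and
// increments nodeCnt per target, handleTaskRsp increments commitCnt per
// reply, and the commit step only runs once commitCnt == nodeCnt. A minimal
// sketch of that bookkeeping (a standalone illustration with made-up
// addresses, not the master's real task plumbing):
//
//   var commit Ver2PhaseCommit
//   commit.reset("volExample")
//   for _, addr := range []string{"n1:17310", "n2:17310", "n3:17310"} {
//       commit.dataNodeArray.Store(addr, TypeNoReply)
//       commit.nodeCnt++
//   }
//   // called once per node response; returns true when it is time to commit
//   onReply := func(addr string) bool {
//       commit.dataNodeArray.Store(addr, TypeReply)
//       return atomic.AddUint32(&commit.commitCnt, 1) == commit.nodeCnt
//   }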
type VolVersionPersist struct {
MultiVersionList []*proto.VolVersionInfo
Strategy proto.VolumeVerStrategy
VerSeq uint64
}
type VolVersionManager struct {
// all snapshots, excluding deleted ones; deleted ones should be recorded in the error log
multiVersionList []*proto.VolVersionInfo
vol *Vol
prepareCommit *Ver2PhaseCommit
status uint32
wait chan error
cancel chan bool
verSeq uint64
enabled bool
strategy proto.VolumeVerStrategy
checkStrategy int32
checkStatus int32
c *Cluster
enableMiddleOp bool
sync.RWMutex
}
func newVersionMgr(vol *Vol) (mgr *VolVersionManager) {
mgr = &VolVersionManager{
vol: vol,
wait: make(chan error, 1),
cancel: make(chan bool, 1),
prepareCommit: &Ver2PhaseCommit{
dataNodeArray: new(sync.Map),
metaNodeArray: new(sync.Map),
},
}
return
}
func (verMgr *VolVersionManager) String() string {
return fmt.Sprintf("mgr:{vol[%v],status[%v] verSeq [%v], prepareinfo [%v], verlist [%v]}",
verMgr.vol.Name, verMgr.status, verMgr.verSeq, verMgr.prepareCommit, verMgr.multiVersionList)
}
func (verMgr *VolVersionManager) Persist() (err error) {
persistInfo := &VolVersionPersist{
MultiVersionList: verMgr.multiVersionList,
Strategy: verMgr.strategy,
VerSeq: verMgr.verSeq,
}
var val []byte
if val, err = json.Marshal(persistInfo); err != nil {
return
}
if verMgr.c == nil {
log.LogErrorf("vol %v cluster nil", verMgr.vol.Name)
return fmt.Errorf("persist vol %v cluster nil", verMgr.vol.Name)
}
if err = verMgr.c.syncMultiVersion(verMgr.vol, val); err != nil {
return
}
return
}
func (verMgr *VolVersionManager) loadMultiVersion(c *Cluster, val []byte) (err error) {
persistInfo := &VolVersionPersist{}
if err = json.Unmarshal(val, persistInfo); err != nil {
return
}
verMgr.multiVersionList = persistInfo.MultiVersionList
verMgr.verSeq = persistInfo.VerSeq
verMgr.strategy = persistInfo.Strategy
return nil
}
func (verMgr *VolVersionManager) CommitVer() (ver *proto.VolVersionInfo) {
log.LogDebugf("action[CommitVer] op %v vol %v %v", verMgr.prepareCommit.op, verMgr.vol.Name, verMgr)
if verMgr.prepareCommit.op == proto.CreateVersionPrepare {
ver = verMgr.prepareCommit.prepareInfo
commitVer := &proto.VolVersionInfo{
Ver: ver.Ver,
Status: proto.VersionNormal,
}
verMgr.multiVersionList = append(verMgr.multiVersionList, commitVer)
verMgr.verSeq = ver.Ver
log.LogInfof("action[CommitVer] vol %v verseq %v exit", verMgr.vol.Name, verMgr.verSeq)
if err := verMgr.Persist(); err != nil {
log.LogErrorf("action[createVer2PhaseTask] vol %v err %v", verMgr.vol.Name, err)
return
}
log.LogDebugf("action[CommitVer] vol %v ask mgr do commit in next step version %v", verMgr.vol.Name, ver)
verMgr.wait <- nil
} else if verMgr.prepareCommit.op == proto.DeleteVersion {
idx, found := verMgr.getLayInfo(verMgr.prepareCommit.prepareInfo.Ver)
if !found {
log.LogErrorf("action[CommitVer] vol %v not found seq %v in list but commit", verMgr.vol.Name, verMgr.prepareCommit.prepareInfo.Ver)
return
}
verMgr.multiVersionList[idx].Status = proto.VersionDeleting
verMgr.multiVersionList[idx].DelTime = time.Now().Unix()
verMgr.wait <- nil
} else {
log.LogErrorf("action[CommitVer] vol %v with seq %v wrong step", verMgr.vol.Name, verMgr.prepareCommit.prepareInfo.Ver)
}
return
}
func (verMgr *VolVersionManager) GenerateVer(verSeq uint64, op uint8) (err error) {
log.LogInfof("action[GenerateVer] vol %v enter verseq %v", verMgr.vol.Name, verSeq)
verMgr.Lock()
defer verMgr.Unlock()
tm := time.Now()
verMgr.enabled = true
if len(verMgr.multiVersionList) > MaxSnapshotCount {
err = fmt.Errorf("too much version exceed %v in list", MaxSnapshotCount)
log.LogWarnf("action[GenerateVer] vol %v err %v", verMgr.vol.Name, err)
return
}
verMgr.prepareCommit.reset(verMgr.vol.Name)
verMgr.prepareCommit.prepareInfo = &proto.VolVersionInfo{
Ver: verSeq,
Status: proto.VersionNormal,
}
verMgr.prepareCommit.op = op
size := len(verMgr.multiVersionList)
if size > 0 && !tm.After(time.Unix(int64(verMgr.multiVersionList[size-1].Ver)/1e6, 0)) {
verMgr.prepareCommit.prepareInfo.Ver = uint64(verMgr.multiVersionList[size-1].Ver) + 1
log.LogDebugf("action[GenerateVer] vol %v use ver %v", verMgr.vol.Name, verMgr.prepareCommit.prepareInfo.Ver)
}
log.LogDebugf("action[GenerateVer] vol %v exit", verMgr.vol.Name)
return
}
func (verMgr *VolVersionManager) DelVer(verSeq uint64) (err error) {
verMgr.Lock()
defer verMgr.Unlock()
for i, ver := range verMgr.multiVersionList {
if ver.Ver == verSeq {
if ver.Status != proto.VersionDeleting && ver.Status != proto.VersionDeleteAbnormal {
err = fmt.Errorf("with seq %v but it's status is %v", verSeq, ver.Status)
log.LogErrorf("action[VolVersionManager.DelVer] vol %v err %v", verMgr.vol.Name, err)
return
}
verMgr.multiVersionList = append(verMgr.multiVersionList[:i], verMgr.multiVersionList[i+1:]...)
break
}
}
if err = verMgr.Persist(); err != nil {
log.LogErrorf("[DelVer] vol %v call persist error %v", verMgr.vol.Name, err)
}
return
}
func (verMgr *VolVersionManager) SetVerStrategy(strategy proto.VolumeVerStrategy, isForce bool) (err error) {
verMgr.Lock()
defer verMgr.Unlock()
log.LogWarnf("vol %v SetVerStrategy.keepCnt %v need in [1-%v], peroidic %v need in [1-%v], enable %v", verMgr.vol.Name,
strategy.KeepVerCnt, MaxSnapshotCount, strategy.GetPeriodic(), 24*7, strategy.Enable)
if strategy.Enable == true {
if strategy.KeepVerCnt > MaxSnapshotCount || strategy.GetPeriodic() > 24*7 || strategy.KeepVerCnt < 0 || strategy.GetPeriodic() < 0 {
return fmt.Errorf("SetVerStrategy.vol %v keepCnt %v need in [1-%v], peroidic %v need in [1-%v] not qualified",
verMgr.vol.Name, strategy.KeepVerCnt, MaxSnapshotCount, strategy.GetPeriodic(), 24*7)
}
if strategy.KeepVerCnt != 0 {
verMgr.strategy.KeepVerCnt = strategy.KeepVerCnt
}
if strategy.GetPeriodic() != 0 {
verMgr.strategy.Periodic = strategy.Periodic
}
if isForce {
verMgr.strategy.ForceUpdate = strategy.ForceUpdate
}
}
verMgr.strategy.Enable = strategy.Enable
verMgr.strategy.UTime = time.Now()
if err = verMgr.Persist(); err != nil {
log.LogErrorf("action[SetVerStrategy] vol %v err %v", verMgr.vol.Name, err)
return
}
return
}
func (verMgr *VolVersionManager) checkCreateStrategy(c *Cluster) {
verMgr.RLock()
log.LogDebugf("checkSnapshotStrategy enter")
if len(verMgr.multiVersionList)-1 > verMgr.strategy.KeepVerCnt {
verMgr.RUnlock()
return
}
verMgr.RUnlock()
curTime := time.Now()
if verMgr.strategy.TimeUp(curTime) {
log.LogDebugf("checkSnapshotStrategy.vol %v try create snapshot", verMgr.vol.Name)
if _, err := verMgr.createVer2PhaseTask(c, uint64(time.Now().UnixMicro()), proto.CreateVersion, verMgr.strategy.ForceUpdate); err != nil {
verMgr.RLock()
verEle := verMgr.multiVersionList[len(verMgr.multiVersionList)-1]
verMgr.RUnlock()
if int64(verEle.Ver)/1e6+int64(verMgr.strategy.GetPeriodicSecond()) < curTime.Unix() {
msg := fmt.Sprintf("[checkSnapshotStrategy] last version %v status %v for %v hours than 2times periodic", verEle.Ver, verEle.Status, 2*verMgr.strategy.Periodic)
Warn(c.Name, msg)
}
return
}
verMgr.strategy.UTime = time.Now()
if err := verMgr.Persist(); err != nil {
log.LogErrorf("vol %v call persist error %v", verMgr.vol.Name, err)
}
}
}
func (verMgr *VolVersionManager) checkDeleteStrategy(c *Cluster) {
verMgr.RLock()
log.LogDebugf("checkSnapshotStrategy.vol %v try delete snapshot nLen %v, keep cnt %v", verMgr.vol.Name, len(verMgr.multiVersionList)-1, verMgr.strategy.KeepVerCnt)
nLen := len(verMgr.multiVersionList)
log.LogDebugf("checkSnapshotStrategy.vol %v try delete snapshot nLen %v, keep cnt %v", verMgr.vol.Name, len(verMgr.multiVersionList)-1, verMgr.strategy.KeepVerCnt)
if nLen-1 > verMgr.strategy.KeepVerCnt {
log.LogDebugf("checkSnapshotStrategy.vol %v try delete snapshot nLen %v, keep cnt %v", verMgr.vol.Name, nLen-1, verMgr.strategy.KeepVerCnt)
if verMgr.multiVersionList[0].Status != proto.VersionNormal {
log.LogDebugf("checkSnapshotStrategy.vol %v oldest ver %v status %v",
verMgr.vol.Name, verMgr.multiVersionList[0].Ver, verMgr.multiVersionList[0].Status)
if verMgr.multiVersionList[0].DelTime+int64(verMgr.strategy.GetPeriodicSecond()) < time.Now().Unix() {
msg := fmt.Sprintf("[checkSnapshotStrategy] version %v in deleting status for %v hours than configure periodic [%v] hours",
verMgr.multiVersionList[0].Ver, verMgr.multiVersionList[0].Status, verMgr.strategy.GetPeriodic())
Warn(c.Name, msg)
}
verMgr.RUnlock()
return
}
verMgr.RUnlock()
if _, err := verMgr.createVer2PhaseTask(c, verMgr.multiVersionList[0].Ver, proto.DeleteVersion, verMgr.strategy.ForceUpdate); err != nil {
return
}
return
}
verMgr.RUnlock()
}
func (verMgr *VolVersionManager) UpdateVerStatus(verSeq uint64, status uint8) (err error) {
verMgr.Lock()
defer verMgr.Unlock()
for _, ver := range verMgr.multiVersionList {
if ver.Ver == verSeq {
ver.Status = status
}
if ver.Ver > verSeq {
return fmt.Errorf("not found")
}
}
return
}
const (
TypeNoReply = 0
TypeReply = 1
MaxSnapshotCount = 30
)
func (verMgr *VolVersionManager) handleTaskRsp(resp *proto.MultiVersionOpResponse, partitionType uint32) {
verMgr.RLock()
defer verMgr.RUnlock()
log.LogInfof("action[handleTaskRsp] vol %v node %v partitionType %v,op %v, inner op %v", verMgr.vol.Name,
resp.Addr, partitionType, resp.Op, verMgr.prepareCommit.op)
if resp.Op != verMgr.prepareCommit.op {
log.LogWarnf("action[handleTaskRsp] vol %v op %v, inner op %v", verMgr.vol.Name, resp.Op, verMgr.prepareCommit.op)
return
}
if resp.Op != proto.DeleteVersion && resp.VerSeq != verMgr.prepareCommit.prepareInfo.Ver {
log.LogErrorf("action[handleTaskRsp] vol %v op %v, inner verseq %v commit verseq %v", verMgr.vol.Name,
resp.Op, resp.VerSeq, verMgr.prepareCommit.prepareInfo.Ver)
return
}
var needCommit bool
dFunc := func(pType uint32, array *sync.Map) {
if val, ok := array.Load(resp.Addr); ok {
if rType, rok := val.(int); rok && rType == TypeNoReply {
log.LogInfof("action[handleTaskRsp] vol %v node %v partitionType %v,op %v, inner op %v", verMgr.vol.Name,
resp.Addr, partitionType, resp.Op, verMgr.prepareCommit.op)
array.Store(resp.Addr, TypeReply)
if resp.Status != proto.TaskSucceeds || resp.Result != "" {
log.LogErrorf("action[handleTaskRsp] vol %v type %v node %v rsp sucess. op %v, verseq %v,commit cnt %v, rsp status %v mgr status %v result %v",
verMgr.vol.Name, pType, resp.Addr, resp.Op, resp.VerSeq, atomic.LoadUint32(&verMgr.prepareCommit.commitCnt), resp.Status, verMgr.status, resp.Result)
if verMgr.prepareCommit.prepareInfo.Status == proto.VersionWorking {
verMgr.prepareCommit.prepareInfo.Status = proto.VersionWorkingAbnormal
verMgr.wait <- fmt.Errorf("pType %v node %v error %v", pType, resp.Addr, resp.Status)
log.LogErrorf("action[handleTaskRsp] vol %v type %v commit cnt %v, rsp status %v mgr status %v result %v", verMgr.vol.Name,
pType, atomic.LoadUint32(&verMgr.prepareCommit.commitCnt), resp.Status, verMgr.status, resp.Result)
return
}
return
}
if verMgr.prepareCommit.nodeCnt == atomic.AddUint32(&verMgr.prepareCommit.commitCnt, 1) {
needCommit = true
}
log.LogDebugf("action[handleTaskRsp] vol %v type %v node %v rsp sucess. op %v, verseq %v,commit cnt %v", verMgr.vol.Name,
pType, resp.Addr, resp.Op, resp.VerSeq, atomic.LoadUint32(&verMgr.prepareCommit.commitCnt))
} else {
log.LogWarnf("action[handleTaskRsp] vol %v type %v node %v op %v, inner verseq %v commit verseq %v status %v", verMgr.vol.Name,
pType, resp.Addr, resp.Op, resp.VerSeq, verMgr.prepareCommit.prepareInfo.Ver, val.(int))
}
} else {
log.LogErrorf("action[handleTaskRsp] vol %v type %v node %v not found. op %v, inner verseq %v commit verseq %v", verMgr.vol.Name,
pType, resp.Addr, resp.Op, resp.VerSeq, verMgr.prepareCommit.prepareInfo.Ver)
}
}
if partitionType == TypeDataPartition {
dFunc(partitionType, verMgr.prepareCommit.dataNodeArray)
} else {
dFunc(partitionType, verMgr.prepareCommit.metaNodeArray)
}
log.LogInfof("action[handleTaskRsp] vol %v commit cnt %v, node cnt %v, operation %v", verMgr.vol.Name,
atomic.LoadUint32(&verMgr.prepareCommit.commitCnt),
atomic.LoadUint32(&verMgr.prepareCommit.nodeCnt), verMgr.prepareCommit.op)
if atomic.LoadUint32(&verMgr.prepareCommit.commitCnt) == verMgr.prepareCommit.nodeCnt && needCommit {
if verMgr.prepareCommit.op == proto.DeleteVersion {
verMgr.CommitVer()
// verMgr.prepareCommit.reset()
// verMgr.prepareCommit.prepareInfo.Status = proto.VersionWorkingFinished
log.LogWarnf("action[handleTaskRsp] vol %v do Del version finished, verMgr %v", verMgr.vol.Name, verMgr)
} else if verMgr.prepareCommit.op == proto.CreateVersionPrepare {
log.LogInfof("action[handleTaskRsp] vol %v ver update prepare sucess. op %v, verseq %v,commit cnt %v", verMgr.vol.Name,
resp.Op, resp.VerSeq, atomic.LoadUint32(&verMgr.prepareCommit.commitCnt))
verMgr.CommitVer()
} else if verMgr.prepareCommit.op == proto.CreateVersionCommit {
log.LogWarnf("action[handleTaskRsp] vol %v ver already update all node now! op %v, verseq %v,commit cnt %v", verMgr.vol.Name,
resp.Op, resp.VerSeq, atomic.LoadUint32(&verMgr.prepareCommit.commitCnt))
verMgr.prepareCommit.prepareInfo.Status = proto.VersionWorkingFinished
verMgr.wait <- nil
}
}
}
func (verMgr *VolVersionManager) createTaskToDataNode(cluster *Cluster, verSeq uint64, op uint8, force bool) (err error) {
var dpHost sync.Map
log.LogWarnf("action[createTaskToDataNode] vol %v verMgr.status %v verSeq %v op %v force %v, prepareCommit.nodeCnt %v",
verMgr.vol.Name, verMgr.status, verSeq, op, force, verMgr.prepareCommit.nodeCnt)
for _, dp := range verMgr.vol.dataPartitions.clonePartitions() {
for _, host := range dp.Hosts {
dpHost.Store(host, nil)
}
dp.VerSeq = verSeq
}
tasks := make([]*proto.AdminTask, 0)
cluster.dataNodes.Range(func(addr, dataNode interface{}) bool {
if _, ok := dpHost.Load(addr); !ok {
return true
}
node := dataNode.(*DataNode)
node.checkLiveness()
if !node.isActive {
if !force {
err = fmt.Errorf("node %v not alive", node.Addr)
verMgr.prepareCommit.prepareInfo.Status = proto.VersionWorkingAbnormal
return false
}
atomic.AddUint32(&verMgr.prepareCommit.commitCnt, 1)
log.LogInfof("action[createTaskToDataNode] volume %v addr %v op %v verseq %v force commit in advance", verMgr.vol.Name, addr.(string), op, verSeq)
}
verMgr.prepareCommit.dataNodeArray.Store(node.Addr, TypeNoReply)
verMgr.prepareCommit.nodeCnt++
log.LogInfof("action[createTaskToDataNode] volume %v addr %v op %v verseq %v nodeCnt %v",
verMgr.vol.Name, addr.(string), op, verSeq, verMgr.prepareCommit.nodeCnt)
task := node.createVersionTask(verMgr.vol.Name, verSeq, op, addr.(string), verMgr.multiVersionList)
tasks = append(tasks, task)
return true
})
if verMgr.prepareCommit.prepareInfo.Status != proto.VersionWorking {
log.LogWarnf("action[verManager.createTask] vol %v status %v not working", verMgr.vol.Name, verMgr.status)
return
}
log.LogInfof("action[verManager.createTask] verSeq %v, datanode task cnt %v", verSeq, len(tasks))
cluster.addDataNodeTasks(tasks)
return
}
func (verMgr *VolVersionManager) createTaskToMetaNode(cluster *Cluster, verSeq uint64, op uint8, force bool) (err error) {
var (
mpHost sync.Map
ok bool
)
log.LogInfof("action[verManager.createTaskToMetaNode] vol %v verSeq %v, mp cnt %v, prepareCommit.nodeCnt %v",
verMgr.vol.Name, verSeq, len(verMgr.vol.MetaPartitions), verMgr.prepareCommit.nodeCnt)
verMgr.vol.mpsLock.RLock()
for _, mp := range verMgr.vol.MetaPartitions {
for _, host := range mp.Hosts {
mpHost.Store(host, nil)
}
mp.VerSeq = verSeq
}
verMgr.vol.mpsLock.RUnlock()
tasks := make([]*proto.AdminTask, 0)
cluster.metaNodes.Range(func(addr, metaNode interface{}) bool {
if _, ok = mpHost.Load(addr); !ok {
return true
}
node := metaNode.(*MetaNode)
if !node.IsActive {
if !force {
err = fmt.Errorf("node %v not alive", node.Addr)
verMgr.prepareCommit.prepareInfo.Status = proto.VersionWorkingAbnormal
return false
}
atomic.AddUint32(&verMgr.prepareCommit.commitCnt, 1)
}
verMgr.prepareCommit.nodeCnt++
log.LogInfof("action[createTaskToMetaNode] volume %v addr %v op %v verseq %v nodeCnt %v",
verMgr.vol.Name, addr.(string), op, verSeq, verMgr.prepareCommit.nodeCnt)
verMgr.prepareCommit.metaNodeArray.Store(node.Addr, TypeNoReply)
task := node.createVersionTask(verMgr.vol.Name, verSeq, op, addr.(string), verMgr.multiVersionList)
tasks = append(tasks, task)
return true
})
if verMgr.prepareCommit.prepareInfo.Status != proto.VersionWorking {
return
}
log.LogInfof("action[verManager.createTaskToMetaNode] vol %v verSeq %v, metaNodes task cnt %v", verMgr.vol.Name, verSeq, len(tasks))
cluster.addMetaNodeTasks(tasks)
return
}
func (verMgr *VolVersionManager) finishWork() {
log.LogDebugf("action[finishWork] vol %v VolVersionManager finishWork!", verMgr.vol.Name)
atomic.StoreUint32(&verMgr.status, proto.VersionWorkingFinished)
}
func (verMgr *VolVersionManager) startWork() (err error) {
var status uint32
log.LogDebugf("action[VolVersionManager.startWork] vol %v status %v", verMgr.status, verMgr.vol.Name)
if status = atomic.LoadUint32(&verMgr.status); status == proto.VersionWorking {
err = fmt.Errorf("have task still working,try it later")
log.LogWarnf("action[VolVersionManager.startWork] vol %v %v", verMgr.vol.Name, err)
return
}
if !atomic.CompareAndSwapUint32(&verMgr.status, status, proto.VersionWorking) {
err = fmt.Errorf("have task still working,try it later")
log.LogWarnf("action[VolVersionManager.startWork] vol %v %v", verMgr.vol.Name, err)
return
}
return
}
func (verMgr *VolVersionManager) getLayInfo(verSeq uint64) (int, bool) {
for idx, info := range verMgr.multiVersionList {
if info.Ver == verSeq {
return idx, true
}
}
return 0, false
}
func (verMgr *VolVersionManager) createTask(cluster *Cluster, verSeq uint64, op uint8, force bool) (ver *proto.VolVersionInfo, err error) {
log.LogInfof("action[VolVersionManager.createTask] vol %v verSeq %v op %v force %v ,prepareCommit.nodeCnt %v",
verMgr.vol.Name, verSeq, op, force, verMgr.prepareCommit.nodeCnt)
verMgr.RLock()
defer verMgr.RUnlock()
if err = verMgr.createTaskToDataNode(cluster, verSeq, op, force); err != nil {
log.LogInfof("action[VolVersionManager.createTask] vol %v err %v", verMgr.vol.Name, err)
return
}
if err = verMgr.createTaskToMetaNode(cluster, verSeq, op, force); err != nil {
log.LogInfof("action[VolVersionManager.createTask] vol %v err %v", verMgr.vol.Name, err)
return
}
log.LogInfof("action[VolVersionManager.createTask] exit")
return
}
func (verMgr *VolVersionManager) initVer2PhaseTask(verSeq uint64, op uint8) (verRsp *proto.VolVersionInfo, err error, opRes uint8) {
verMgr.prepareCommit.reset(verMgr.vol.Name)
log.LogWarnf("action[VolVersionManager.initVer2PhaseTask] vol %v verMgr.status %v op %v verSeq %v", verMgr.vol.Name, verMgr.status, op, verSeq)
if op == proto.CreateVersion {
if err = verMgr.GenerateVer(verSeq, op); err != nil {
log.LogInfof("action[VolVersionManager.initVer2PhaseTask] exit")
return
}
op = proto.CreateVersionPrepare
log.LogInfof("action[VolVersionManager.initVer2PhaseTask] CreateVersionPrepare")
} else if op == proto.DeleteVersion {
var (
idx int
found bool
)
if verMgr.enableMiddleOp {
if ver, status := verMgr.getOldestVer(); ver != verSeq || status != proto.VersionNormal {
err = fmt.Errorf("oldest is %v, status %v", ver, status)
return
}
}
if idx, found = verMgr.getLayInfo(verSeq); !found {
verMgr.prepareCommit.prepareInfo.Status = proto.VersionWorkingAbnormal
log.LogErrorf("action[VolVersionManager.initVer2PhaseTask] vol %v op %v verSeq %v not found", verMgr.vol.Name, op, verSeq)
return nil, fmt.Errorf("not found"), op
}
if idx == len(verMgr.multiVersionList)-1 {
verMgr.prepareCommit.prepareInfo.Status = proto.VersionWorkingAbnormal
log.LogErrorf("action[VolVersionManager.initVer2PhaseTask] vol %v op %v verSeq %v is uncommitted", verMgr.vol.Name, op, verSeq)
return nil, fmt.Errorf("uncommited version"), op
}
if verMgr.multiVersionList[idx].Status == proto.VersionDeleting {
log.LogErrorf("action[VolVersionManager.initVer2PhaseTask] vol %v op %v verSeq %v is uncommitted", verMgr.vol.Name, op, verSeq)
return nil, fmt.Errorf("version on deleting"), op
}
if verMgr.multiVersionList[idx].Status == proto.VersionDeleted {
log.LogErrorf("action[VolVersionManager.initVer2PhaseTask] vol %v op %v verSeq %v is uncommitted", verMgr.vol.Name, op, verSeq)
return nil, fmt.Errorf("version alreay be deleted"), op
}
verMgr.prepareCommit.op = op
verMgr.prepareCommit.prepareInfo = &proto.VolVersionInfo{
Ver: verSeq,
Status: proto.VersionWorking,
}
}
opRes = op
return
}
func (verMgr *VolVersionManager) createVer2PhaseTask(cluster *Cluster, verSeq uint64, op uint8, force bool) (verRsp *proto.VolVersionInfo, err error) {
if err = verMgr.startWork(); err != nil {
return
}
if !proto.IsHot(verMgr.vol.VolType) {
err = fmt.Errorf("vol need be hot one")
log.LogErrorf("vol %v createVer2PhaseTask. %v", verMgr.vol.Name, err)
return
}
defer func() {
if err != nil {
log.LogWarnf("action[createVer2PhaseTask] vol %v close lock due to err %v", verMgr.vol.Name, err)
verMgr.finishWork()
}
}()
if verRsp, err, op = verMgr.initVer2PhaseTask(verSeq, op); err != nil {
return
}
if op == proto.CreateVersion {
log.LogWarnf("action[createVer2PhaseTask] vol %v update seq %v to %v", verMgr.vol.Name, verSeq, verMgr.prepareCommit.prepareInfo.Ver)
verSeq = verMgr.prepareCommit.prepareInfo.Ver
}
if _, err = verMgr.createTask(cluster, verSeq, op, force); err != nil {
log.LogInfof("action[createVer2PhaseTask] vol %v CreateVersionPrepare err %v", verMgr.vol.Name, err)
return
}
verMgr.prepareCommit.op = op
wg := &sync.WaitGroup{}
wg.Add(1)
go func() {
wgFin := false
wgDone := func() {
if !wgFin {
wg.Done()
wgFin = true
}
}
log.LogInfof("action[createVer2PhaseTask] verseq %v op %v enter wait schedule", verSeq, verMgr.prepareCommit.op)
defer func() {
log.LogDebugf("action[createVer2PhaseTask] status %v", verMgr.status)
log.LogInfof("action[createVer2PhaseTask] verseq %v op %v exit wait schedule", verSeq, verMgr.prepareCommit.op)
if err != nil {
log.LogInfof("action[createVer2PhaseTask] verseq %v op %v exit schedule with err %v", verSeq, verMgr.prepareCommit.op, err)
}
wgDone()
}()
ticker := time.NewTicker(time.Second)
cnt := 0
for {
select {
case err = <-verMgr.wait:
log.LogInfof("action[createVer2PhaseTask] %v go routine verseq %v op %v get err %v", verMgr.vol.Name, verSeq, verMgr.prepareCommit.op, err)
if verMgr.prepareCommit.op == proto.DeleteVersion {
if err == nil {
verMgr.prepareCommit.reset(verMgr.vol.Name)
if err = verMgr.Persist(); err != nil {
log.LogErrorf("action[createVer2PhaseTask] vol %v err %v", verMgr.vol.Name, err)
return
}
verMgr.finishWork()
wgDone()
} else {
verMgr.prepareCommit.reset(verMgr.vol.Name)
verMgr.prepareCommit.prepareInfo.Status = proto.VersionWorkingAbnormal
log.LogInfof("action[createVer2PhaseTask] vol %v prepare error %v", verMgr.vol.Name, err)
}
return
} else if verMgr.prepareCommit.op == proto.CreateVersionPrepare {
if err == nil {
verMgr.verSeq = verSeq
verMgr.prepareCommit.reset(verMgr.vol.Name)
verMgr.prepareCommit.op = proto.CreateVersionCommit
if err = verMgr.Persist(); err != nil {
log.LogErrorf("action[createVer2PhaseTask] vol %v err %v", verMgr.vol.Name, err)
return
}
log.LogInfof("action[createVer2PhaseTask] vol %v prepare fin.start commit", verMgr.vol.Name)
if _, err = verMgr.createTask(cluster, verSeq, verMgr.prepareCommit.op, force); err != nil {
log.LogInfof("action[createVer2PhaseTask] vol %v prepare error %v", verMgr.vol.Name, err)
return
}
if vLen := len(verMgr.multiVersionList); vLen > 1 {
verRsp = verMgr.multiVersionList[vLen-2]
}
wgDone()
} else {
verMgr.prepareCommit.prepareInfo.Status = proto.VersionWorkingAbnormal
log.LogInfof("action[createVer2PhaseTask] vol %v prepare error %v", verMgr.vol.Name, err)
return
}
} else if verMgr.prepareCommit.op == proto.CreateVersionCommit {
log.LogInfof("action[createVer2PhaseTask] vol %v create ver task commit, create 2phase finished", verMgr.vol.Name)
verMgr.prepareCommit.reset(verMgr.vol.Name)
verMgr.finishWork()
return
} else {
log.LogErrorf("action[createVer2PhaseTask] vol %v op %v", verMgr.vol.Name, verMgr.prepareCommit.op)
return
}
case <-verMgr.cancel:
verMgr.prepareCommit.reset(verMgr.vol.Name)
log.LogInfof("action[createVer2PhaseTask.cancel] vol %v verseq %v op %v be canceled", verMgr.vol.Name, verSeq, verMgr.prepareCommit.op)
return
case <-ticker.C:
log.LogInfof("action[createVer2PhaseTask.tick] vol %v verseq %v op %v wait", verMgr.vol.Name, verSeq, verMgr.prepareCommit.op)
cnt++
if cnt > 5 {
verMgr.prepareCommit.prepareInfo.Status = proto.VersionWorkingTimeOut
err = fmt.Errorf("verseq %v op %v be set timeout", verSeq, verMgr.prepareCommit.op)
log.LogInfof("action[createVer2PhaseTask] vol %v close lock due to err %v", verMgr.vol.Name, err)
if verMgr.prepareCommit.op == proto.CreateVersionCommit {
err = nil
}
verMgr.prepareCommit.reset(verMgr.vol.Name)
verMgr.finishWork()
return
}
}
}
}()
wg.Wait()
log.LogDebugf("action[createVer2PhaseTask] vol %v prepare phase finished", verMgr.vol.Name)
return
}
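// createVer2PhaseTask is the entry point for snapshot create/delete: a
// CreateVersion request is rewritten to CreateVersionPrepare, broadcast to
// all involved nodes, and once every node acknowledges it is re-broadcast as
// CreateVersionCommit before the manager persists and releases the working
// state. A hedged usage sketch, assuming the owning Vol exposes the manager
// as VersionMgr and the caller already holds a *Cluster:
//
//   seq := uint64(time.Now().UnixMicro())
//   verInfo, err := vol.VersionMgr.createVer2PhaseTask(c, seq, proto.CreateVersion, false)
//   if err != nil {
//       log.LogErrorf("create snapshot for vol %v failed: %v", vol.Name, err)
//   } else if verInfo != nil {
//       log.LogInfof("vol %v snapshot task finished, returned version %v", vol.Name, verInfo.Ver)
//   }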
func (verMgr *VolVersionManager) init(cluster *Cluster) error {
verMgr.c = cluster
log.LogWarnf("action[VolVersionManager.init] vol %v", verMgr.vol.Name)
verMgr.multiVersionList = append(verMgr.multiVersionList, &proto.VolVersionInfo{
Ver: 0,
Status: 1,
})
if cluster.partition.IsRaftLeader() {
return verMgr.Persist()
}
return nil
}
func (verMgr *VolVersionManager) getVersionInfo(verGet uint64) (verInfo *proto.VolVersionInfo, err error) {
verMgr.RLock()
defer verMgr.RUnlock()
if !proto.IsHot(verMgr.vol.VolType) {
err = fmt.Errorf("vol need be hot one")
log.LogErrorf("createVer2PhaseTask. %v", err)
return
}
log.LogDebugf("action[getVersionInfo] verGet %v", verGet)
for _, ver := range verMgr.multiVersionList {
if ver.Ver == verGet {
log.LogDebugf("action[getVersionInfo] ver %v", ver)
return ver, nil
}
log.LogDebugf("action[getVersionInfo] ver %v", ver)
if ver.Ver > verGet {
log.LogDebugf("action[getVersionInfo] ver %v", ver)
break
}
}
msg := fmt.Sprintf("ver [%v] not found", verGet)
log.LogInfof("action[getVersionInfo] %v", msg)
return nil, fmt.Errorf("%v", msg)
}
func (verMgr *VolVersionManager) getOldestVer() (ver uint64, status uint8) {
verMgr.RLock()
defer verMgr.RUnlock()
size := len(verMgr.multiVersionList)
if size <= 1 {
return 0, proto.VersionDeleteAbnormal
}
log.LogInfof("action[getLatestVer] ver len %v verMgr %v", size, verMgr)
return verMgr.multiVersionList[0].Ver, verMgr.multiVersionList[0].Status
}
func (verMgr *VolVersionManager) getVolDelStatus() (status uint8) {
verMgr.RLock()
defer verMgr.RUnlock()
size := len(verMgr.multiVersionList)
if size == 0 {
return 0
}
log.LogInfof("action[getLatestVer] ver len %v verMgr %v", size, verMgr)
return verMgr.multiVersionList[size-1].Status
}
func (verMgr *VolVersionManager) getLatestVer() (ver uint64) {
verMgr.RLock()
defer verMgr.RUnlock()
size := len(verMgr.multiVersionList)
if size == 0 {
return 0
}
return verMgr.multiVersionList[size-1].Ver
}
func (verMgr *VolVersionManager) getVersionList() *proto.VolVersionInfoList {
verMgr.RLock()
defer verMgr.RUnlock()
return &proto.VolVersionInfoList{
VerList: verMgr.multiVersionList,
Strategy: verMgr.strategy,
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"math"
"math/rand"
"sort"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
const RoundRobinNodeSelectorName = "RoundRobin"
const CarryWeightNodeSelectorName = "CarryWeight"
const AvailableSpaceFirstNodeSelectorName = "AvailableSpaceFirst"
const StrawNodeSelectorName = "Straw"
const DefaultNodeSelectorName = CarryWeightNodeSelectorName
func (ns *nodeSet) getNodes(nodeType NodeType) *sync.Map {
switch nodeType {
case DataNodeType:
return ns.dataNodes
case MetaNodeType:
return ns.metaNodes
default:
panic("unknown node type")
}
}
type NodeSelector interface {
GetName() string
Select(ns *nodeSet, excludeHosts []string, replicaNum int) (newHosts []string, peers []proto.Peer, err error)
}
type weightedNode struct {
Carry float64
Weight float64
Ptr Node
ID uint64
}
// Node defines an interface that needs to be implemented by weightedNode
type Node interface {
SelectNodeForWrite()
GetID() uint64
GetAddr() string
}
// SortedWeightedNodes defines an array sorted by carry
type SortedWeightedNodes []*weightedNode
func (nodes SortedWeightedNodes) Len() int {
return len(nodes)
}
func (nodes SortedWeightedNodes) Less(i, j int) bool {
return nodes[i].Carry > nodes[j].Carry
}
func (nodes SortedWeightedNodes) Swap(i, j int) {
nodes[i], nodes[j] = nodes[j], nodes[i]
}
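// Because Less compares Carry in descending order, sorting puts the node
// with the largest carry first. A minimal sketch:
//
//   nodes := SortedWeightedNodes{{Carry: 0.4}, {Carry: 1.2}, {Carry: 0.9}}
//   sort.Sort(nodes) // nodes[0].Carry == 1.2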
func canAllocPartition(node interface{}, nodeType NodeType) bool {
switch nodeType {
case DataNodeType:
dataNode := node.(*DataNode)
return dataNode.canAlloc() && dataNode.canAllocDp()
case MetaNodeType:
metaNode := node.(*MetaNode)
return metaNode.isWritable()
default:
panic("unknown node type")
}
}
func asNodeWrap(node interface{}, nodeType NodeType) Node {
switch nodeType {
case DataNodeType:
dataNode := node.(*DataNode)
return dataNode
case MetaNodeType:
metaNode := node.(*MetaNode)
return metaNode
default:
panic("unknown node type")
}
}
type CarryWeightNodeSelector struct {
nodeType NodeType
carry map[uint64]float64
}
func (s *CarryWeightNodeSelector) GetName() string {
return CarryWeightNodeSelectorName
}
func (s *CarryWeightNodeSelector) prepareCarryForDataNodes(nodes *sync.Map, total uint64) {
nodes.Range(func(key, value interface{}) bool {
dataNode := value.(*DataNode)
if _, ok := s.carry[dataNode.ID]; !ok {
// use available space to calculate initial weight
s.carry[dataNode.ID] = float64(dataNode.AvailableSpace) / float64(total)
}
return true
})
}
func (s *CarryWeightNodeSelector) prepareCarryForMetaNodes(nodes *sync.Map, total uint64) {
nodes.Range(func(key, value interface{}) bool {
metaNode := value.(*MetaNode)
if _, ok := s.carry[metaNode.ID]; !ok {
// use available space to calculate initial weight
s.carry[metaNode.ID] = float64(metaNode.Total-metaNode.Used) / float64(total)
}
return true
})
}
func (s *CarryWeightNodeSelector) prepareCarry(nodes *sync.Map, total uint64) {
switch s.nodeType {
case DataNodeType:
s.prepareCarryForDataNodes(nodes, total)
case MetaNodeType:
s.prepareCarryForMetaNodes(nodes, total)
default:
}
}
func (s *CarryWeightNodeSelector) getTotalMaxForDataNodes(nodes *sync.Map) (total uint64) {
nodes.Range(func(key, value interface{}) bool {
dataNode := value.(*DataNode)
if dataNode.Total > total {
total = dataNode.Total
}
return true
})
return
}
func (s *CarryWeightNodeSelector) getTotalMaxForMetaNodes(nodes *sync.Map) (total uint64) {
nodes.Range(func(key, value interface{}) bool {
metaNode := value.(*MetaNode)
if metaNode.Total > total {
total = metaNode.Total
}
return true
})
return
}
func (s *CarryWeightNodeSelector) getTotalMax(nodes *sync.Map) (total uint64) {
switch s.nodeType {
case DataNodeType:
total = s.getTotalMaxForDataNodes(nodes)
case MetaNodeType:
total = s.getTotalMaxForMetaNodes(nodes)
default:
}
return
}
func (s *CarryWeightNodeSelector) getCarryDataNodes(maxTotal uint64, excludeHosts []string, dataNodes *sync.Map) (nodeTabs SortedWeightedNodes, availCount int) {
nodeTabs = make(SortedWeightedNodes, 0)
dataNodes.Range(func(key, value interface{}) bool {
dataNode := value.(*DataNode)
if contains(excludeHosts, dataNode.Addr) {
// log.LogDebugf("[getAvailCarryDataNodeTab] dataNode [%v] is excludeHosts", dataNode.Addr)
return true
}
if !dataNode.canAllocDp() {
log.LogDebugf("[getAvailCarryDataNodeTab] dataNode [%v] is not writeable, offline %v, dpCnt %d",
dataNode.Addr, dataNode.ToBeOffline, dataNode.DataPartitionCount)
return true
}
if !dataNode.canAlloc() {
log.LogWarnf("[getAvailCarryDataNodeTab] dataNode [%v] is overSold", dataNode.Addr)
return true
}
if s.carry[dataNode.ID] >= 1.0 {
availCount++
}
nt := new(weightedNode)
nt.Carry = s.carry[dataNode.ID]
nt.Weight = float64(dataNode.AvailableSpace) / float64(maxTotal)
nt.Ptr = dataNode
nodeTabs = append(nodeTabs, nt)
return true
})
return
}
func (s *CarryWeightNodeSelector) getCarryMetaNodes(maxTotal uint64, excludeHosts []string, metaNodes *sync.Map) (nodes SortedWeightedNodes, availCount int) {
nodes = make(SortedWeightedNodes, 0)
metaNodes.Range(func(key, value interface{}) bool {
metaNode := value.(*MetaNode)
if contains(excludeHosts, metaNode.Addr) {
return true
}
if !metaNode.isWritable() {
return true
}
if s.carry[metaNode.ID] >= 1.0 {
availCount++
}
nt := new(weightedNode)
nt.Carry = s.carry[metaNode.ID]
nt.Weight = (float64)(metaNode.Total-metaNode.Used) / (float64)(maxTotal)
nt.Ptr = metaNode
nodes = append(nodes, nt)
return true
})
return
}
func (s *CarryWeightNodeSelector) getCarryNodes(nset *nodeSet, maxTotal uint64, excludeHosts []string) (SortedWeightedNodes, int) {
switch s.nodeType {
case DataNodeType:
return s.getCarryDataNodes(maxTotal, excludeHosts, nset.dataNodes)
case MetaNodeType:
return s.getCarryMetaNodes(maxTotal, excludeHosts, nset.metaNodes)
default:
panic("unknown node type")
}
}
func (s *CarryWeightNodeSelector) setNodeCarry(nodes SortedWeightedNodes, availCarryCount, replicaNum int) {
for availCarryCount < replicaNum {
availCarryCount = 0
for _, nt := range nodes {
carry := nt.Carry + nt.Weight
// cap the carry value so that
// subsequent selections cannot overload a single node
if carry > 10.0 {
carry = 10.0
}
nt.Carry = carry
s.carry[nt.Ptr.GetID()] = carry
if carry > 1.0 {
availCarryCount++
}
}
}
}
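// Worked example for the carry loop above (illustrative numbers only): for
// replicaNum=2 and two nodes with weights 0.6 and 0.3 starting at zero
// carry, successive passes give carries (0.6, 0.3), (1.2, 0.6), (1.8, 0.9)
// and finally (2.4, 1.2), where both carries exceed 1.0 and the loop stops.
// Select then sorts by carry, picks the heaviest nodes first, and
// selectNodeForWrite subtracts 1.0 from each pick so lighter nodes catch up
// on later calls, keeping placement roughly proportional to free space.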
func (s *CarryWeightNodeSelector) selectNodeForWrite(node Node) {
node.SelectNodeForWrite()
// decrease node weight
s.carry[node.GetID()] -= 1.0
}
func (s *CarryWeightNodeSelector) Select(ns *nodeSet, excludeHosts []string, replicaNum int) (newHosts []string, peers []proto.Peer, err error) {
nodes := ns.getNodes(s.nodeType)
total := s.getTotalMax(nodes)
// prepare carry for every node
s.prepareCarry(nodes, total)
orderHosts := make([]string, 0)
newHosts = make([]string, 0)
peers = make([]proto.Peer, 0)
// if replica == 0, return
if replicaNum == 0 {
return
}
// if we cannot get enough writable nodes, return error
weightedNodes, count := s.getCarryNodes(ns, total, excludeHosts)
if len(weightedNodes) < replicaNum {
err = fmt.Errorf("action[%vNodeSelector::Select] no enough writable hosts,replicaNum:%v MatchNodeCount:%v ",
s.GetName(), replicaNum, len(weightedNodes))
return
}
// create enough carry nodes
// we say a node is "carry node", whent its carry >= 1.0
s.setNodeCarry(weightedNodes, count, replicaNum)
// sort nodes by weight
sort.Sort(weightedNodes)
// pick first N nodes
for i := 0; i < replicaNum; i++ {
node := weightedNodes[i].Ptr
s.selectNodeForWrite(node)
orderHosts = append(orderHosts, node.GetAddr())
peer := proto.Peer{ID: node.GetID(), Addr: node.GetAddr()}
peers = append(peers, peer)
}
log.LogInfof("action[%vNodeSelector::Select] peers[%v]", s.GetName(), peers)
// reshuffle for primary-backup replication
if newHosts, err = reshuffleHosts(orderHosts); err != nil {
err = fmt.Errorf("action[%vNodeSelector::Select] err:%v orderHosts is nil", s.GetName(), err.Error())
return
}
return
}
func NewCarryWeightNodeSelector(nodeType NodeType) *CarryWeightNodeSelector {
return &CarryWeightNodeSelector{
carry: make(map[uint64]float64),
nodeType: nodeType,
}
}
type AvailableSpaceFirstNodeSelector struct {
nodeType NodeType
}
func (s *AvailableSpaceFirstNodeSelector) getNodeAvailableSpace(node interface{}) uint64 {
switch s.nodeType {
case DataNodeType:
dataNode := node.(*DataNode)
return dataNode.AvailableSpace
case MetaNodeType:
metaNode := node.(*MetaNode)
return metaNode.Total - metaNode.Used
default:
panic("unkown node type")
}
}
func (s *AvailableSpaceFirstNodeSelector) GetName() string {
return AvailableSpaceFirstNodeSelectorName
}
func (s *AvailableSpaceFirstNodeSelector) Select(ns *nodeSet, excludeHosts []string, replicaNum int) (newHosts []string, peers []proto.Peer, err error) {
newHosts = make([]string, 0)
peers = make([]proto.Peer, 0)
// if replica == 0, return
if replicaNum == 0 {
return
}
orderHosts := make([]string, 0)
nodes := ns.getNodes(s.nodeType)
sortedNodes := make([]Node, 0)
nodes.Range(func(key, value interface{}) bool {
sortedNodes = append(sortedNodes, asNodeWrap(value, s.nodeType))
return true
})
// if we cannot get enough nodes, return error
if len(sortedNodes) < replicaNum {
err = fmt.Errorf("action[%vNodeSelector::Select] no enough hosts,replicaNum:%v MatchNodeCount:%v ",
s.GetName(), replicaNum, len(sortedNodes))
return
}
// sort nodes by available space
sort.Slice(sortedNodes, func(i, j int) bool {
return s.getNodeAvailableSpace(sortedNodes[i]) > s.getNodeAvailableSpace(sortedNodes[j])
})
nodeIndex := 0
// pick first N nodes
for i := 0; i < replicaNum && nodeIndex < len(sortedNodes); i++ {
selectedIndex := len(sortedNodes)
// loop until we get a writable node
for nodeIndex < len(sortedNodes) {
node := sortedNodes[nodeIndex]
nodeIndex += 1
if canAllocPartition(node, s.nodeType) {
if excludeHosts == nil || !contains(excludeHosts, node.GetAddr()) {
selectedIndex = nodeIndex - 1
break
}
}
}
// if we get a writable node, append it to host list
if selectedIndex != len(sortedNodes) {
node := sortedNodes[selectedIndex]
node.SelectNodeForWrite()
orderHosts = append(orderHosts, node.GetAddr())
peer := proto.Peer{ID: node.GetID(), Addr: node.GetAddr()}
peers = append(peers, peer)
}
}
// if we cannot get enough writable nodes, return error
if len(orderHosts) < replicaNum {
err = fmt.Errorf("action[%vNodeSelector::Select] no enough writable hosts,replicaNum:%v MatchNodeCount:%v ",
s.GetName(), replicaNum, len(orderHosts))
return
}
log.LogInfof("action[%vNodeSelector::Select] peers[%v]", s.GetName(), peers)
// reshuffle for primary-backup replication
if newHosts, err = reshuffleHosts(orderHosts); err != nil {
err = fmt.Errorf("action[%vNodeSelector::Select] err:%v orderHosts is nil", s.GetName(), err.Error())
return
}
return
}
func NewAvailableSpaceFirstNodeSelector(nodeType NodeType) *AvailableSpaceFirstNodeSelector {
return &AvailableSpaceFirstNodeSelector{
nodeType: nodeType,
}
}
type RoundRobinNodeSelector struct {
index int
nodeType NodeType
}
func (s *RoundRobinNodeSelector) GetName() string {
return RoundRobinNodeSelectorName
}
func (s *RoundRobinNodeSelector) Select(ns *nodeSet, excludeHosts []string, replicaNum int) (newHosts []string, peers []proto.Peer, err error) {
newHosts = make([]string, 0)
peers = make([]proto.Peer, 0)
// if replica == 0, return
if replicaNum == 0 {
return
}
orderHosts := make([]string, 0)
nodes := ns.getNodes(s.nodeType)
sortedNodes := make([]Node, 0)
nodes.Range(func(key, value interface{}) bool {
sortedNodes = append(sortedNodes, asNodeWrap(value, s.nodeType))
return true
})
// if we cannot get enough nodes, return error
if len(sortedNodes) < replicaNum {
err = fmt.Errorf("action[%vNodeSelector::Select] no enough writable hosts,replicaNum:%v MatchNodeCount:%v ",
s.GetName(), replicaNum, len(sortedNodes))
return
}
// sort nodes by id, so we can get a node list that is as stable as possible
sort.Slice(sortedNodes, func(i, j int) bool {
return sortedNodes[i].GetID() < sortedNodes[j].GetID()
})
nodeIndex := 0
// pick first N nodes
for i := 0; i < replicaNum && nodeIndex < len(sortedNodes); i++ {
selectedIndex := len(sortedNodes)
// loop until we get a writable node
for nodeIndex < len(sortedNodes) {
node := sortedNodes[(nodeIndex+s.index)%len(sortedNodes)]
nodeIndex += 1
if canAllocPartition(node, s.nodeType) {
if excludeHosts == nil || !contains(excludeHosts, node.GetAddr()) {
selectedIndex = nodeIndex - 1
break
}
}
}
// if we get a writable node, append it to host list
if selectedIndex != len(sortedNodes) {
node := sortedNodes[(selectedIndex+s.index)%len(sortedNodes)]
orderHosts = append(orderHosts, node.GetAddr())
node.SelectNodeForWrite()
peer := proto.Peer{ID: node.GetID(), Addr: node.GetAddr()}
peers = append(peers, peer)
}
}
// if we cannot get enough writable nodes, return error
if len(orderHosts) < replicaNum {
err = fmt.Errorf("action[%vNodeSelector::Select] no enough writable hosts,replicaNum:%v MatchNodeCount:%v ",
s.GetName(), replicaNum, len(orderHosts))
return
}
// move the index of selector
s.index += nodeIndex
log.LogInfof("action[%vNodeSelector::Select] peers[%v]", s.GetName(), peers)
// reshuffle for primary-backup replication
if newHosts, err = reshuffleHosts(orderHosts); err != nil {
err = fmt.Errorf("action[%vNodeSelector::Select] err:%v orderHosts is nil", s.GetName(), err.Error())
return
}
return
}
func NewRoundRobinNodeSelector(nodeType NodeType) *RoundRobinNodeSelector {
return &RoundRobinNodeSelector{
nodeType: nodeType,
}
}
const (
StrawNodeSelectorRandMax = 65536
)
// NOTE: this node selector is inspired by the Straw2 algorithm, which is widely used in Ceph
type StrawNodeSelector struct {
rand *rand.Rand
nodeType NodeType
}
func (s *StrawNodeSelector) GetName() string {
return StrawNodeSelectorName
}
func (s *StrawNodeSelector) getWeight(node Node) float64 {
switch s.nodeType {
case DataNodeType:
dataNode := node.(*DataNode)
return float64(dataNode.AvailableSpace) / util.GB
case MetaNodeType:
metaNode := node.(*MetaNode)
return float64(metaNode.Total-metaNode.Used) / util.GB
default:
panic("unkown node type")
}
}
func (s *StrawNodeSelector) selectOneNode(nodes []Node) (index int, maxNode Node) {
maxStraw := float64(0)
index = -1
for i, node := range nodes {
straw := float64(s.rand.Intn(StrawNodeSelectorRandMax))
straw = math.Log(straw/float64(StrawNodeSelectorRandMax)) / s.getWeight(node)
if index == -1 || straw > maxStraw {
maxStraw = straw
maxNode = node
index = i
}
}
return
}
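// The draw above follows the Straw2 idea: each node receives
// straw = ln(u) / weight with u sampled uniformly from [0, 1), so every
// straw is negative (or -Inf when u happens to be 0) and a larger weight
// pulls the value toward zero, making that node more likely to hold the
// maximum. Illustrative numbers: for u = 0.5, a node with weight 2 gets
// ln(0.5)/2 ≈ -0.35 while weight 1 gets ≈ -0.69, so the heavier node wins
// this particular draw; over many draws nodes are chosen roughly in
// proportion to their available-space weight.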
func (s *StrawNodeSelector) Select(ns *nodeSet, excludeHosts []string, replicaNum int) (newHosts []string, peers []proto.Peer, err error) {
nodes := make([]Node, 0)
ns.getNodes(s.nodeType).Range(func(key, value interface{}) bool {
node := asNodeWrap(value, s.nodeType)
if !contains(excludeHosts, node.GetAddr()) {
nodes = append(nodes, node)
}
return true
})
orderHosts := make([]string, 0)
for len(orderHosts) < replicaNum {
if len(nodes)+len(orderHosts) < replicaNum {
break
}
index, node := s.selectOneNode(nodes)
if index != 0 {
nodes[0], nodes[index] = node, nodes[0]
}
nodes = nodes[1:]
if !canAllocPartition(node, s.nodeType) {
continue
}
orderHosts = append(orderHosts, node.GetAddr())
node.SelectNodeForWrite()
peer := proto.Peer{ID: node.GetID(), Addr: node.GetAddr()}
peers = append(peers, peer)
}
// if we cannot get enough writable nodes, return error
if len(orderHosts) < replicaNum {
err = fmt.Errorf("action[%vNodeSelector::Select] no enough writable hosts,replicaNum:%v MatchNodeCount:%v ",
s.GetName(), replicaNum, len(orderHosts))
return
}
log.LogInfof("action[%vNodeSelector::Select] peers[%v]", s.GetName(), peers)
// reshuffle for primary-backup replication
if newHosts, err = reshuffleHosts(orderHosts); err != nil {
err = fmt.Errorf("action[%vNodeSelector::Select] err:%v orderHosts is nil", s.GetName(), err.Error())
return
}
return
}
func NewStrawNodeSelector(nodeType NodeType) *StrawNodeSelector {
return &StrawNodeSelector{
rand: rand.New(rand.NewSource(time.Now().UnixMicro())),
nodeType: nodeType,
}
}
func NewNodeSelector(name string, nodeType NodeType) NodeSelector {
switch name {
case RoundRobinNodeSelectorName:
return NewRoundRobinNodeSelector(nodeType)
case CarryWeightNodeSelectorName:
return NewCarryWeightNodeSelector(nodeType)
case AvailableSpaceFirstNodeSelectorName:
return NewAvailableSpaceFirstNodeSelector(nodeType)
case StrawNodeSelectorName:
return NewStrawNodeSelector(nodeType)
default:
return NewCarryWeightNodeSelector(nodeType)
}
}
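// A hedged usage sketch of the factory above; the nodeSet, exclude list and
// replica count are assumed to come from the caller, mirroring how
// getAvailDataNodeHosts below drives the configured selector:
//
//   selector := NewNodeSelector(StrawNodeSelectorName, DataNodeType)
//   hosts, peers, err := selector.Select(ns, nil, 3)
//   if err != nil {
//       log.LogErrorf("select 3 data hosts failed: %v", err)
//   } else {
//       log.LogInfof("picked hosts %v peers %v", hosts, peers)
//   }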
func (ns *nodeSet) getAvailMetaNodeHosts(excludeHosts []string, replicaNum int) (newHosts []string, peers []proto.Peer, err error) {
ns.nodeSelectLock.Lock()
defer ns.nodeSelectLock.Unlock()
// we need a read lock to block modification of the node selector
ns.metaNodeSelectorLock.RLock()
defer ns.metaNodeSelectorLock.RUnlock()
return ns.metaNodeSelector.Select(ns, excludeHosts, replicaNum)
}
func (ns *nodeSet) getAvailDataNodeHosts(excludeHosts []string, replicaNum int) (hosts []string, peers []proto.Peer, err error) {
ns.nodeSelectLock.Lock()
defer ns.nodeSelectLock.Unlock()
// we need a read lock to block modification of the node selector
ns.dataNodeSelectorLock.RLock()
defer ns.dataNodeSelectorLock.RUnlock()
return ns.dataNodeSelector.Select(ns, excludeHosts, replicaNum)
}
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"math"
"math/rand"
"sort"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
)
const RoundRobinNodesetSelectorName = "RoundRobin"
const CarryWeightNodesetSelectorName = "CarryWeight"
const AvailableSpaceFirstNodesetSelectorName = "AvailableSpaceFirst"
const StrawNodesetSelectorName = "Straw"
const DefaultNodesetSelectorName = RoundRobinNodesetSelectorName
func (ns *nodeSet) getDataNodeTotalSpace() (totalSpace uint64) {
ns.dataNodes.Range(func(key, value interface{}) bool {
dataNode := value.(*DataNode)
totalSpace += dataNode.Total
return true
})
return
}
func (ns *nodeSet) getMetaNodeTotalSpace() (totalSpace uint64) {
ns.metaNodes.Range(func(key, value interface{}) bool {
metaNode := value.(*MetaNode)
totalSpace += metaNode.Total
return true
})
return
}
func (ns *nodeSet) getDataNodeTotalAvailableSpace() (space uint64) {
ns.dataNodes.Range(func(key, value interface{}) bool {
dataNode := value.(*DataNode)
if !dataNode.ToBeOffline {
space += dataNode.AvailableSpace
}
return true
})
return
}
func (ns *nodeSet) getMetaNodeTotalAvailableSpace() (space uint64) {
ns.metaNodes.Range(func(key, value interface{}) bool {
metaNode := value.(*MetaNode)
if !metaNode.ToBeOffline {
space += metaNode.Total - metaNode.Used
}
return true
})
return
}
func (ns *nodeSet) canWriteFor(nodeType NodeType, replica int) bool {
switch nodeType {
case DataNodeType:
return ns.canWriteForDataNode(replica)
case MetaNodeType:
return ns.canWriteForMetaNode(replica)
default:
panic("unknow node type")
}
}
func (ns *nodeSet) getTotalSpaceOf(nodeType NodeType) uint64 {
switch nodeType {
case DataNodeType:
return ns.getDataNodeTotalSpace()
case MetaNodeType:
return ns.getMetaNodeTotalSpace()
default:
panic("unknow node type")
}
}
func (ns *nodeSet) getTotalAvailableSpaceOf(nodeType NodeType) uint64 {
switch nodeType {
case DataNodeType:
return ns.getDataNodeTotalAvailableSpace()
case MetaNodeType:
return ns.getMetaNodeTotalAvailableSpace()
default:
panic("unknow node type")
}
}
type NodesetSelector interface {
GetName() string
Select(nsc nodeSetCollection, excludeNodeSets []uint64, replicaNum uint8) (ns *nodeSet, err error)
}
type RoundRobinNodesetSelector struct {
index int
nodeType NodeType
}
func (s *RoundRobinNodesetSelector) Select(nsc nodeSetCollection, excludeNodeSets []uint64, replicaNum uint8) (ns *nodeSet, err error) {
// sort nodesets by id, so we can get a nodeset list that is as stable as possible
sort.Slice(nsc, func(i, j int) bool {
return nsc[i].ID < nsc[j].ID
})
for i := 0; i < len(nsc); i++ {
if s.index >= len(nsc) {
s.index = 0
}
ns = nsc[s.index]
s.index++
if containsID(excludeNodeSets, ns.ID) {
continue
}
if ns.canWriteFor(s.nodeType, int(replicaNum)) {
return
}
}
switch s.nodeType {
case DataNodeType:
err = errors.NewError(proto.ErrNoNodeSetToCreateDataPartition)
case MetaNodeType:
err = errors.NewError(proto.ErrNoNodeSetToCreateMetaPartition)
default:
panic("unknow node type")
}
return
}
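// The selector above keeps a cursor across calls: nodesets are visited in
// ascending-ID order starting from where the previous call stopped, and the
// cursor wraps once it runs past the end, so successive partition creations
// spread over all writable nodesets. Illustrative trace with nodeset IDs
// {1, 2, 3}, all writable and none excluded: call 1 picks 1, call 2 picks 2,
// call 3 picks 3, call 4 wraps back to 1.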
func (s *RoundRobinNodesetSelector) GetName() string {
return RoundRobinNodesetSelectorName
}
func NewRoundRobinNodesetSelector(nodeType NodeType) *RoundRobinNodesetSelector {
return &RoundRobinNodesetSelector{
nodeType: nodeType,
}
}
type CarryWeightNodesetSelector struct {
carrys map[uint64]float64
nodeType NodeType
}
func (s *CarryWeightNodesetSelector) GetName() string {
return CarryWeightNodesetSelectorName
}
func (s *CarryWeightNodesetSelector) getMaxTotal(nsc nodeSetCollection) uint64 {
total := uint64(0)
for i := 0; i < nsc.Len(); i++ {
tmp := nsc[i].getTotalSpaceOf(s.nodeType)
if tmp > total {
total = tmp
}
}
return total
}
func (s *CarryWeightNodesetSelector) prepareCarry(nsc nodeSetCollection, total uint64) {
for _, nodeset := range nsc {
id := nodeset.ID
if _, ok := s.carrys[id]; !ok {
// use total available space to calculate initial weight
s.carrys[id] = float64(nodeset.getTotalAvailableSpaceOf(s.nodeType)) / float64(total)
}
}
}
func (s *CarryWeightNodesetSelector) getAvailNodesets(nsc nodeSetCollection, excludeNodeSets []uint64, replicaNum uint8) (newNsc nodeSetCollection) {
newNsc = make(nodeSetCollection, 0, nsc.Len())
for i := 0; i < nsc.Len(); i++ {
ns := nsc[i]
if ns.canWriteFor(s.nodeType, int(replicaNum)) && !containsID(excludeNodeSets, ns.ID) {
newNsc = append(newNsc, ns)
}
}
return
}
func (s *CarryWeightNodesetSelector) getCarryCount(nsc nodeSetCollection) (count int) {
for i := 0; i < nsc.Len(); i++ {
ns := nsc[i]
if s.carrys[ns.ID] >= 1.0 {
count += 1
}
}
return
}
func (s *CarryWeightNodesetSelector) setNodesetCarry(nsc nodeSetCollection, total uint64) int {
count := s.getCarryCount(nsc)
for count < 1 {
count = 0
for i := 0; i < nsc.Len(); i++ {
nset := nsc[i]
weight := float64(nset.getTotalAvailableSpaceOf(s.nodeType)) / float64(total)
s.carrys[nset.ID] += weight
if s.carrys[nset.ID] >= 1.0 {
count += 1
}
// limit the max value of weight
if s.carrys[nset.ID] > 10.0 {
s.carrys[nset.ID] = 10.0
}
}
}
return count
}
func (s *CarryWeightNodesetSelector) Select(nsc nodeSetCollection, excludeNodeSets []uint64, replicaNum uint8) (ns *nodeSet, err error) {
total := s.getMaxTotal(nsc)
// prepare the weight of every nodeset
s.prepareCarry(nsc, total)
nsc = s.getAvailNodesets(nsc, excludeNodeSets, replicaNum)
availCount := 0
if len(nsc) < 1 {
goto err
}
availCount = s.setNodesetCarry(nsc, total)
// sort nodesets by weight
sort.Slice(nsc, func(i, j int) bool {
return s.carrys[nsc[i].ID] > s.carrys[nsc[j].ID]
})
// pick the first nodeset that has N writable nodes
for i := 0; i < availCount; i++ {
ns = nsc[i]
if ns.canWriteFor(s.nodeType, int(replicaNum)) && !containsID(excludeNodeSets, ns.ID) {
break
}
}
if ns != nil {
if !ns.canWriteFor(s.nodeType, int(replicaNum)) || containsID(excludeNodeSets, ns.ID) {
goto err
}
s.carrys[ns.ID] -= 1.0
}
return
err:
switch s.nodeType {
case DataNodeType:
err = errors.NewError(proto.ErrNoNodeSetToCreateDataPartition)
case MetaNodeType:
err = errors.NewError(proto.ErrNoNodeSetToCreateMetaPartition)
default:
panic("unknow node type")
}
return
}
func NewCarryWeightNodesetSelector(nodeType NodeType) *CarryWeightNodesetSelector {
return &CarryWeightNodesetSelector{
carrys: make(map[uint64]float64),
nodeType: nodeType,
}
}
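// Illustrative note (not part of the original code; the numbers are hypothetical):
// the carry works like a stride scheduler. With a max total space of 100 units,
// a nodeset A with 80 units available and a nodeset B with 40 units available
// gain 0.8 and 0.4 carry per round of setNodesetCarry; A crosses 1.0 first and,
// over many calls to Select, is picked roughly twice as often as B, paying 1.0
// carry each time it is chosen. The carry is capped at 10.0 so an idle nodeset
// cannot accumulate an unbounded advantage.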
type AvailableSpaceFirstNodesetSelector struct {
nodeType NodeType
}
func (s *AvailableSpaceFirstNodesetSelector) GetName() string {
return AvailableSpaceFirstNodesetSelectorName
}
func (s *AvailableSpaceFirstNodesetSelector) Select(nsc nodeSetCollection, excludeNodeSets []uint64, replicaNum uint8) (ns *nodeSet, err error) {
// sort nodesets by available space
sort.Slice(nsc, func(i, j int) bool {
return nsc[i].getTotalAvailableSpaceOf(s.nodeType) > nsc[j].getTotalAvailableSpaceOf(s.nodeType)
})
// pick the first nodeset that has N writable nodes
for i := 0; i < nsc.Len(); i++ {
ns = nsc[i]
if ns.canWriteFor(s.nodeType, int(replicaNum)) && !containsID(excludeNodeSets, ns.ID) {
return
}
}
switch s.nodeType {
case DataNodeType:
err = errors.NewError(proto.ErrNoNodeSetToCreateDataPartition)
case MetaNodeType:
err = errors.NewError(proto.ErrNoNodeSetToCreateMetaPartition)
default:
panic("unknow node type")
}
return
}
func NewAvailableSpaceFirstNodesetSelector(nodeType NodeType) *AvailableSpaceFirstNodesetSelector {
return &AvailableSpaceFirstNodesetSelector{
nodeType: nodeType,
}
}
const (
StrawNodesetSelectorRandMax = 65536
)
// NOTE: this nodeset selector is inspired by the Straw2 algorithm, which is widely used in Ceph
type StrawNodesetSelector struct {
nodeType NodeType
rand *rand.Rand
}
func (s *StrawNodesetSelector) GetName() string {
return StrawNodesetSelectorName
}
func (s *StrawNodesetSelector) getWeight(ns *nodeSet) float64 {
return float64(ns.getTotalAvailableSpaceOf(s.nodeType) / util.GB)
}
func (s *StrawNodesetSelector) Select(nsc nodeSetCollection, excludeNodeSets []uint64, replicaNum uint8) (ns *nodeSet, err error) {
tmp := make(nodeSetCollection, 0)
for _, nodeset := range nsc {
if nodeset.canWriteFor(s.nodeType, int(replicaNum)) && !containsID(excludeNodeSets, nodeset.ID) {
tmp = append(tmp, nodeset)
}
}
nsc = tmp
if len(nsc) < 1 {
switch s.nodeType {
case DataNodeType:
err = errors.NewError(proto.ErrNoNodeSetToCreateDataPartition)
case MetaNodeType:
err = errors.NewError(proto.ErrNoNodeSetToCreateMetaPartition)
default:
panic("unknow node type")
}
return
}
maxStraw := float64(0)
for _, nodeset := range nsc {
straw := float64(s.rand.Intn(StrawNodesetSelectorRandMax))
straw = math.Log(straw/float64(StrawNodesetSelectorRandMax)) / s.getWeight(nodeset)
if ns == nil || straw > maxStraw {
ns = nodeset
maxStraw = straw
}
}
return
}
func NewStrawNodesetSelector(nodeType NodeType) *StrawNodesetSelector {
return &StrawNodesetSelector{
nodeType: nodeType,
rand: rand.New(rand.NewSource(time.Now().Unix())),
}
}
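// Illustrative sketch of the selection math above (not part of the original
// code): each candidate nodeset i draws r uniformly from [0, 65536) and computes
//
//	straw_i = ln(r/65536) / w_i
//
// where w_i is its available space in GB. Since ln of a value in (0, 1) is
// negative, a larger w_i shrinks the magnitude of straw_i, so bigger nodesets
// tend to produce the largest straw and win proportionally more often; the
// loop in Select keeps the maximum straw. A draw of r == 0 yields -Inf and
// effectively loses against any other candidate.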
func NewNodesetSelector(name string, nodeType NodeType) NodesetSelector {
switch name {
case CarryWeightNodesetSelectorName:
return NewCarryWeightNodesetSelector(nodeType)
case RoundRobinNodesetSelectorName:
return NewRoundRobinNodesetSelector(nodeType)
case AvailableSpaceFirstNodesetSelectorName:
return NewAvailableSpaceFirstNodesetSelector(nodeType)
case StrawNodesetSelectorName:
return NewStrawNodesetSelector(nodeType)
default:
return NewRoundRobinNodesetSelector(nodeType)
}
}
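// The helper below is an illustrative usage sketch only; the name
// exampleSelectNodeset is not part of the original code. It shows how a caller
// might obtain a selector from the factory above and pick a nodeset for a
// 3-replica data partition (unknown selector names fall back to RoundRobin).
func exampleSelectNodeset(nsc nodeSetCollection) (*nodeSet, error) {
selector := NewNodesetSelector(StrawNodesetSelectorName, DataNodeType)
// no nodesets are excluded here; real callers pass the IDs to skip
return selector.Select(nsc, nil, 3)
}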
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"crypto/md5"
"encoding/hex"
"encoding/json"
"fmt"
"math/rand"
"strings"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
func newCreateDataPartitionRequest(volName string, ID uint64, replicaNum int, members []proto.Peer,
dataPartitionSize, leaderSize int, hosts []string, createType int, partitionType int,
decommissionedDisks []string, verSeq uint64) (req *proto.CreateDataPartitionRequest) {
req = &proto.CreateDataPartitionRequest{
PartitionTyp: partitionType,
PartitionId: ID,
PartitionSize: dataPartitionSize,
ReplicaNum: replicaNum,
VolumeId: volName,
Members: members,
Hosts: hosts,
CreateType: createType,
LeaderSize: leaderSize,
DecommissionedDisks: decommissionedDisks,
VerSeq: verSeq,
}
return
}
func newDeleteDataPartitionRequest(ID uint64) (req *proto.DeleteDataPartitionRequest) {
req = &proto.DeleteDataPartitionRequest{
PartitionId: ID,
}
return
}
func newAddDataPartitionRaftMemberRequest(ID uint64, addPeer proto.Peer) (req *proto.AddDataPartitionRaftMemberRequest) {
req = &proto.AddDataPartitionRaftMemberRequest{
PartitionId: ID,
AddPeer: addPeer,
}
return
}
func newRemoveDataPartitionRaftMemberRequest(ID uint64, removePeer proto.Peer) (req *proto.RemoveDataPartitionRaftMemberRequest) {
req = &proto.RemoveDataPartitionRaftMemberRequest{
PartitionId: ID,
RemovePeer: removePeer,
}
return
}
func newLoadDataPartitionMetricRequest(ID uint64) (req *proto.LoadDataPartitionRequest) {
req = &proto.LoadDataPartitionRequest{
PartitionId: ID,
}
return
}
func newStopDataPartitionRepairRequest(ID uint64, stop bool) (req *proto.StopDataPartitionRepairRequest) {
req = &proto.StopDataPartitionRepairRequest{
PartitionId: ID,
Stop: stop,
}
return
}
func unmarshalTaskResponse(task *proto.AdminTask) (err error) {
bytes, err := json.Marshal(task.Response)
if err != nil {
return
}
var response interface{}
switch task.OpCode {
case proto.OpDataNodeHeartbeat:
response = &proto.DataNodeHeartbeatResponse{}
case proto.OpDeleteDataPartition:
response = &proto.DeleteDataPartitionResponse{}
case proto.OpLoadDataPartition:
response = &proto.LoadDataPartitionResponse{}
case proto.OpDeleteFile:
response = &proto.DeleteFileResponse{}
case proto.OpMetaNodeHeartbeat:
response = &proto.MetaNodeHeartbeatResponse{}
case proto.OpDeleteMetaPartition:
response = &proto.DeleteMetaPartitionResponse{}
case proto.OpUpdateMetaPartition:
response = &proto.UpdateMetaPartitionResponse{}
case proto.OpDecommissionMetaPartition:
response = &proto.MetaPartitionDecommissionResponse{}
case proto.OpVersionOperation:
response = &proto.MultiVersionOpResponse{}
case proto.OpLcNodeHeartbeat:
response = &proto.LcNodeHeartbeatResponse{}
case proto.OpLcNodeScan:
response = &proto.LcNodeRuleTaskResponse{}
case proto.OpLcNodeSnapshotVerDel:
response = &proto.SnapshotVerDelTaskResponse{}
default:
log.LogError(fmt.Sprintf("unknown operate code(%v)", task.OpCode))
}
if response == nil {
return fmt.Errorf("unmarshalTaskResponse failed")
}
if err = json.Unmarshal(bytes, response); err != nil {
return
}
task.Response = response
return
}
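// Illustrative note (not part of the original code): task.Response typically
// arrives as a generically decoded value, so the helper above round-trips it
// through JSON to recover the concrete type matching the OpCode, e.g.
//
//	task := &proto.AdminTask{OpCode: proto.OpDataNodeHeartbeat, Response: raw}
//	if err := unmarshalTaskResponse(task); err == nil {
//		resp := task.Response.(*proto.DataNodeHeartbeatResponse)
//		_ = resp
//	}
//
// where raw is whatever was decoded from the wire; the type assertion must
// match the OpCode handled in the switch above.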
func contains(arr []string, element string) (ok bool) {
if len(arr) == 0 {
return
}
for _, e := range arr {
if e == element {
ok = true
break
}
}
return
}
func containsID(arr []uint64, element uint64) bool {
if len(arr) == 0 {
return false
}
for _, e := range arr {
if e == element {
return true
}
}
return false
}
func reshuffleHosts(oldHosts []string) (newHosts []string, err error) {
if len(oldHosts) == 0 {
log.LogError(fmt.Sprintf("action[reshuffleHosts],err:%v", proto.ErrReshuffleArray))
err = proto.ErrReshuffleArray
return
}
lenOldHosts := len(oldHosts)
newHosts = make([]string, lenOldHosts)
if lenOldHosts == 1 {
copy(newHosts, oldHosts)
return
}
// seed once before the shuffle instead of reseeding on every iteration
rand.Seed(time.Now().UnixNano())
for i := lenOldHosts; i > 1; i-- {
oCurrPos := rand.Intn(i)
oldHosts[i-1], oldHosts[oCurrPos] = oldHosts[oCurrPos], oldHosts[i-1]
}
copy(newHosts, oldHosts)
return
}
// Warn logs a warning for the given cluster and reports it to the exporter
func Warn(clusterID, msg string) {
key := fmt.Sprintf("%s_%s", clusterID, ModuleName)
WarnBySpecialKey(key, msg)
}
// WarnBySpecialKey logs the warning message and reports it to the exporter
func WarnBySpecialKey(key, msg string) {
log.LogWarn(msg)
exporter.Warning(msg)
}
func keyNotFound(name string) (err error) {
return errors.NewErrorf("parameter %v not found", name)
}
func unmatchedKey(name string) (err error) {
return errors.NewErrorf("parameter %v not match", name)
}
func txInvalidMask() (err error) {
return errors.New("transaction mask key value pair should be: enableTxMaskKey=[create|mkdir|remove|rename|mknod|symlink|link]\n enableTxMaskKey=off \n enableTxMaskKey=all")
}
func notFoundMsg(name string) (err error) {
return errors.NewErrorf("%v not found", name)
}
func metaPartitionNotFound(id uint64) (err error) {
return notFoundMsg(fmt.Sprintf("meta partition[%v]", id))
}
func metaReplicaNotFound(addr string) (err error) {
return notFoundMsg(fmt.Sprintf("meta replica[%v]", addr))
}
func dataPartitionNotFound(id uint64) (err error) {
return notFoundMsg(fmt.Sprintf("data partition[%v]", id))
}
func dataReplicaNotFound(addr string) (err error) {
return notFoundMsg(fmt.Sprintf("data replica[%v]", addr))
}
func zoneNotFound(name string) (err error) {
return notFoundMsg(fmt.Sprintf("zone[%v]", name))
}
func nodeSetNotFound(id uint64) (err error) {
return notFoundMsg(fmt.Sprintf("node set[%v]", id))
}
func dataNodeNotFound(addr string) (err error) {
return notFoundMsg(fmt.Sprintf("data node[%v]", addr))
}
func metaNodeNotFound(addr string) (err error) {
return notFoundMsg(fmt.Sprintf("meta node[%v]", addr))
}
func lcNodeNotFound(addr string) (err error) {
return notFoundMsg(fmt.Sprintf("lc node[%v]", addr))
}
func volNotFound(name string) (err error) {
return notFoundMsg(fmt.Sprintf("vol[%v]", name))
}
func matchKey(serverKey, clientKey string) bool {
h := md5.New()
_, err := h.Write([]byte(serverKey))
if err != nil {
log.LogWarnf("action[matchKey] write server key[%v] failed,err[%v]", serverKey, err)
return false
}
cipherStr := h.Sum(nil)
return strings.EqualFold(clientKey, hex.EncodeToString(cipherStr))
}
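// Illustrative note (not part of the original code): a client passes the check
// above by sending the hex-encoded MD5 of the shared server key, e.g.
//
//	sum := md5.Sum([]byte(serverKey))
//	clientKey := hex.EncodeToString(sum[:])
//
// The comparison is case-insensitive, so either hex case is accepted.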
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"context"
"fmt"
syslog "log"
"net/http"
"net/http/httputil"
"regexp"
"strconv"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/raftstore"
"github.com/cubefs/cubefs/raftstore/raftstore_db"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/cryptoutil"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
)
// configuration keys
const (
ClusterName = "clusterName"
ID = "id"
IP = "ip"
Port = "port"
LogLevel = "logLevel"
LogDir = "logDir"
WalDir = "walDir"
StoreDir = "storeDir"
EbsAddrKey = "ebsAddr"
BStoreAddrKey = "bStoreAddr"
EbsServicePathKey = "ebsServicePath"
BStoreServicePathKey = "bStoreServicePath"
GroupID = 1
ModuleName = "master"
CfgRetainLogs = "retainLogs"
DefaultRetainLogs = 20000
cfgTickInterval = "tickInterval"
cfgRaftRecvBufSize = "raftRecvBufSize"
cfgElectionTick = "electionTick"
SecretKey = "masterServiceKey"
Stat = "stat"
Authenticate = "authenticate"
AuthNodeHost = "authNodeHost"
AuthNodeEnableHTTPS = "authNodeEnableHTTPS"
AuthNodeCertFile = "authNodeCertFile"
)
var (
// regexps for data validation
volNameRegexp = regexp.MustCompile("^[a-zA-Z0-9][a-zA-Z0-9_.-]{1,61}[a-zA-Z0-9]$")
ownerRegexp = regexp.MustCompile("^[A-Za-z][A-Za-z0-9_]{0,20}$")
useConnPool = true // for test
gConfig *clusterConfig
)
var overSoldFactor = defaultOverSoldFactor
func overSoldLimit() bool {
if overSoldFactor <= 0 {
return false
}
return true
}
func overSoldCap(cap uint64) uint64 {
if overSoldFactor <= 0 {
return cap
}
return uint64(float32(cap) * overSoldFactor)
}
func setOverSoldFactor(factor float32) {
if factor != overSoldFactor {
overSoldFactor = factor
}
}
var volNameErr = errors.New("name can only start and end with numbers or letters, and its length can't be less than 3")
// Server represents the server in a cluster
type Server struct {
id uint64
clusterName string
ip string
bindIp bool
port string
logDir string
walDir string
storeDir string
bStoreAddr string
servicePath string
retainLogs uint64
tickInterval int
raftRecvBufSize int
electionTick int
leaderInfo *LeaderInfo
config *clusterConfig
cluster *Cluster
user *User
rocksDBStore *raftstore_db.RocksDBStore
raftStore raftstore.RaftStore
fsm *MetadataFsm
partition raftstore.Partition
wg sync.WaitGroup
reverseProxy *httputil.ReverseProxy
metaReady bool
apiServer *http.Server
}
// NewServer creates a new server
func NewServer() *Server {
return &Server{}
}
// Start starts a server
func (m *Server) Start(cfg *config.Config) (err error) {
m.config = newClusterConfig()
gConfig = m.config
m.leaderInfo = &LeaderInfo{}
m.reverseProxy = m.newReverseProxy()
if err = m.checkConfig(cfg); err != nil {
log.LogError(errors.Stack(err))
return
}
if m.rocksDBStore, err = raftstore_db.NewRocksDBStoreAndRecovery(m.storeDir, LRUCacheSize, WriteBufferSize); err != nil {
return
}
if err = m.createRaftServer(cfg); err != nil {
log.LogError(errors.Stack(err))
return
}
m.initCluster()
m.initUser()
m.cluster.partition = m.partition
m.cluster.idAlloc.partition = m.partition
MasterSecretKey := cfg.GetString(SecretKey)
if m.cluster.MasterSecretKey, err = cryptoutil.Base64Decode(MasterSecretKey); err != nil {
return fmt.Errorf("action[Start] failed %v, err: master service Key invalid = %s", proto.ErrInvalidCfg, MasterSecretKey)
}
m.cluster.authenticate = cfg.GetBool(Authenticate)
if m.cluster.authenticate {
m.cluster.initAuthentication(cfg)
}
m.cluster.scheduleTask()
m.startHTTPService(ModuleName, cfg)
exporter.RegistConsul(m.clusterName, ModuleName, cfg)
WarnMetrics = newWarningMetrics(m.cluster)
metricsService := newMonitorMetrics(m.cluster)
metricsService.start()
if _, err = stat.NewStatistic(m.logDir, Stat, int64(stat.DefaultStatLogSize),
stat.DefaultTimeOutUs, true); err != nil {
return
}
m.wg.Add(1)
return nil
}
// Shutdown closes the server
func (m *Server) Shutdown() {
var err error
if m.apiServer != nil {
if err = m.apiServer.Shutdown(context.Background()); err != nil {
log.LogErrorf("action[Shutdown] failed, err: %v", err)
}
}
stat.CloseStat()
// stop raftServer first
if m.fsm != nil {
m.fsm.Stop()
}
// then stop rocksDBStore
time.Sleep(time.Second)
if m.rocksDBStore != nil {
m.rocksDBStore.Close()
}
m.wg.Done()
}
// Sync waits for the execution termination of the server
func (m *Server) Sync() {
m.wg.Wait()
}
func (m *Server) checkConfig(cfg *config.Config) (err error) {
m.clusterName = cfg.GetString(ClusterName)
m.ip = cfg.GetString(IP)
m.bindIp = cfg.GetBool(proto.BindIpKey)
m.port = cfg.GetString(proto.ListenPort)
m.logDir = cfg.GetString(LogDir)
m.walDir = cfg.GetString(WalDir)
m.storeDir = cfg.GetString(StoreDir)
m.bStoreAddr = cfg.GetString(BStoreAddrKey)
if m.bStoreAddr == "" {
m.bStoreAddr = cfg.GetString(EbsAddrKey)
}
m.servicePath = cfg.GetString(BStoreServicePathKey)
if m.servicePath == "" {
m.servicePath = cfg.GetString(EbsServicePathKey)
}
peerAddrs := cfg.GetString(cfgPeers)
if m.port == "" || m.walDir == "" || m.storeDir == "" || m.clusterName == "" || peerAddrs == "" {
return fmt.Errorf("%v,err:%v,%v,%v,%v,%v,%v", proto.ErrInvalidCfg, "one of (listen,walDir,storeDir,clusterName) is null",
m.port, m.walDir, m.storeDir, m.clusterName, peerAddrs)
}
if m.id, err = strconv.ParseUint(cfg.GetString(ID), 10, 64); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
m.config.DisableAutoCreate = cfg.GetBoolWithDefault(disableAutoCreate, false)
syslog.Printf("get disableAutoCreate cfg %v", m.config.DisableAutoCreate)
m.config.faultDomain = cfg.GetBoolWithDefault(faultDomain, false)
m.config.heartbeatPort = cfg.GetInt64(heartbeatPortKey)
m.config.replicaPort = cfg.GetInt64(replicaPortKey)
if m.config.heartbeatPort <= 1024 {
m.config.heartbeatPort = raftstore.DefaultHeartbeatPort
}
if m.config.replicaPort <= 1024 {
m.config.replicaPort = raftstore.DefaultReplicaPort
}
syslog.Printf("heartbeatPort[%v],replicaPort[%v]\n", m.config.heartbeatPort, m.config.replicaPort)
if err = m.config.parsePeers(peerAddrs); err != nil {
return
}
nodeSetCapacity := cfg.GetString(nodeSetCapacity)
if nodeSetCapacity != "" {
if m.config.nodeSetCapacity, err = strconv.Atoi(nodeSetCapacity); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
}
if m.config.nodeSetCapacity < 3 {
m.config.nodeSetCapacity = defaultNodeSetCapacity
}
m.config.DefaultNormalZoneCnt = defaultNodeSetGrpBatchCnt
m.config.DomainBuildAsPossible = cfg.GetBoolWithDefault(cfgDomainBuildAsPossible, false)
domainBatchGrpCnt := cfg.GetString(cfgDomainBatchGrpCnt)
if domainBatchGrpCnt != "" {
if m.config.DefaultNormalZoneCnt, err = strconv.Atoi(domainBatchGrpCnt); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
}
metaNodeReservedMemory := cfg.GetString(cfgMetaNodeReservedMem)
if metaNodeReservedMemory != "" {
if m.config.metaNodeReservedMem, err = strconv.ParseUint(metaNodeReservedMemory, 10, 64); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
}
if m.config.metaNodeReservedMem < 32*1024*1024 {
m.config.metaNodeReservedMem = defaultMetaNodeReservedMem
}
retainLogs := cfg.GetString(CfgRetainLogs)
if retainLogs != "" {
if m.retainLogs, err = strconv.ParseUint(retainLogs, 10, 64); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
}
if m.retainLogs <= 0 {
m.retainLogs = DefaultRetainLogs
}
syslog.Println("retainLogs=", m.retainLogs)
missingDataPartitionInterval := cfg.GetString(missingDataPartitionInterval)
if missingDataPartitionInterval != "" {
if m.config.MissingDataPartitionInterval, err = strconv.ParseInt(missingDataPartitionInterval, 10, 0); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
}
dpNoLeaderReportInterval := cfg.GetString(cfgDpNoLeaderReportIntervalSec)
if dpNoLeaderReportInterval != "" {
if m.config.DpNoLeaderReportIntervalSec, err = strconv.ParseInt(dpNoLeaderReportInterval, 10, 0); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
}
mpNoLeaderReportInterval := cfg.GetString(cfgMpNoLeaderReportIntervalSec)
if mpNoLeaderReportInterval != "" {
if m.config.MpNoLeaderReportIntervalSec, err = strconv.ParseInt(mpNoLeaderReportInterval, 10, 0); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
}
dataPartitionTimeOutSec := cfg.GetString(dataPartitionTimeOutSec)
if dataPartitionTimeOutSec != "" {
if m.config.DataPartitionTimeOutSec, err = strconv.ParseInt(dataPartitionTimeOutSec, 10, 0); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
}
numberOfDataPartitionsToLoad := cfg.GetString(NumberOfDataPartitionsToLoad)
if numberOfDataPartitionsToLoad != "" {
if m.config.numberOfDataPartitionsToLoad, err = strconv.Atoi(numberOfDataPartitionsToLoad); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
}
if m.config.numberOfDataPartitionsToLoad <= 40 {
m.config.numberOfDataPartitionsToLoad = 40
}
if secondsToFreeDP := cfg.GetString(secondsToFreeDataPartitionAfterLoad); secondsToFreeDP != "" {
if m.config.secondsToFreeDataPartitionAfterLoad, err = strconv.ParseInt(secondsToFreeDP, 10, 64); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
}
intervalToScanS3ExpirationVal := cfg.GetString(intervalToScanS3Expiration)
if intervalToScanS3ExpirationVal != "" {
if m.config.IntervalToScanS3Expiration, err = strconv.ParseInt(intervalToScanS3ExpirationVal, 10, 0); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
}
m.tickInterval = int(cfg.GetFloat(cfgTickInterval))
m.raftRecvBufSize = int(cfg.GetInt(cfgRaftRecvBufSize))
m.electionTick = int(cfg.GetFloat(cfgElectionTick))
if m.tickInterval <= 300 {
m.tickInterval = 500
}
if m.electionTick <= 3 {
m.electionTick = 5
}
maxQuotaNumPerVol := cfg.GetString(cfgMaxQuotaNumPerVol)
if maxQuotaNumPerVol != "" {
if m.config.MaxQuotaNumPerVol, err = strconv.Atoi(maxQuotaNumPerVol); err != nil {
return fmt.Errorf("%v,err:%v", proto.ErrInvalidCfg, err.Error())
}
}
m.config.MonitorPushAddr = cfg.GetString(cfgMonitorPushAddr)
m.config.volForceDeletion = cfg.GetBoolWithDefault(cfgVolForceDeletion, true)
threshold := cfg.GetInt64WithDefault(cfgVolDeletionDentryThreshold, 0)
if threshold < 0 {
return fmt.Errorf("volDeletionDentryThreshold can't be less than 0 ! ")
}
m.config.volDeletionDentryThreshold = uint64(threshold)
return
}
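// Illustrative configuration sketch (not part of the original code; the literal
// spellings of the listen/peers/bindIp keys are assumptions based on the
// constants referenced above, and all values are placeholders):
//
//	{
//	    "clusterName": "cfs_dev",
//	    "id": "1",
//	    "ip": "192.168.0.11",
//	    "listen": "17010",
//	    "logDir": "/var/logs/cfs/master",
//	    "walDir": "/var/logs/cfs/master/wal",
//	    "storeDir": "/var/logs/cfs/master/store",
//	    "peers": "1:192.168.0.11:17010,2:192.168.0.12:17010,3:192.168.0.13:17010",
//	    "retainLogs": "20000"
//	}
//
// checkConfig rejects the config when listen, walDir, storeDir, clusterName or
// peers is empty, and falls back to built-in defaults for the optional keys.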
func (m *Server) createRaftServer(cfg *config.Config) (err error) {
raftCfg := &raftstore.Config{
NodeID: m.id,
RaftPath: m.walDir,
IPAddr: cfg.GetString(IP),
NumOfLogsToRetain: m.retainLogs,
HeartbeatPort: int(m.config.heartbeatPort),
ReplicaPort: int(m.config.replicaPort),
TickInterval: m.tickInterval,
ElectionTick: m.electionTick,
RecvBufSize: m.raftRecvBufSize,
}
if m.raftStore, err = raftstore.NewRaftStore(raftCfg, cfg); err != nil {
return errors.Trace(err, "NewRaftStore failed! id[%v] walPath[%v]", m.id, m.walDir)
}
syslog.Printf("peers[%v],tickInterval[%v],electionTick[%v]\n", m.config.peers, m.tickInterval, m.electionTick)
m.initFsm()
partitionCfg := &raftstore.PartitionConfig{
ID: GroupID,
Peers: m.config.peers,
Applied: m.fsm.applied,
SM: m.fsm,
}
if m.partition, err = m.raftStore.CreatePartition(partitionCfg); err != nil {
return errors.Trace(err, "CreatePartition failed")
}
return
}
func (m *Server) initFsm() {
m.fsm = newMetadataFsm(m.rocksDBStore, m.retainLogs, m.raftStore.RaftServer())
m.fsm.registerLeaderChangeHandler(m.handleLeaderChange)
m.fsm.registerPeerChangeHandler(m.handlePeerChange)
// register the handlers for the interfaces defined in the Raft library
m.fsm.registerApplySnapshotHandler(m.handleApplySnapshot)
m.fsm.registerRaftUserCmdApplyHandler(m.handleRaftUserCmd)
m.fsm.restore()
}
func (m *Server) initCluster() {
log.LogInfo("action[initCluster] begin")
m.cluster = newCluster(m.clusterName, m.leaderInfo, m.fsm, m.partition, m.config)
m.cluster.retainLogs = m.retainLogs
log.LogInfo("action[initCluster] end")
// in case any limiter is set on a follower
log.LogInfo("action[loadApiLimiterInfo] begin")
m.cluster.loadApiLimiterInfo()
log.LogInfo("action[loadApiLimiterInfo] end")
}
func (m *Server) initUser() {
log.LogInfo("action[initUser] begin")
m.user = newUser(m.fsm, m.partition)
log.LogInfo("action[initUser] end")
}
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
type snapshotDelManager struct {
cluster *Cluster
lcSnapshotTaskStatus *lcSnapshotVerStatus
lcNodeStatus *lcNodeStatus
idleNodeCh chan struct{}
exitCh chan struct{}
}
func newSnapshotManager() *snapshotDelManager {
log.LogInfof("action[newSnapshotManager] construct")
snapshotMgr := &snapshotDelManager{
lcSnapshotTaskStatus: newLcSnapshotVerStatus(),
lcNodeStatus: newLcNodeStatus(),
idleNodeCh: make(chan struct{}, 1000), // supports notifying multiple snapshot tasks
exitCh: make(chan struct{}),
}
return snapshotMgr
}
func (m *snapshotDelManager) process() {
for {
select {
case <-m.exitCh:
log.LogInfo("exitCh notified, snapshotDelManager process exit")
return
case <-m.idleNodeCh:
log.LogDebug("idleLcNodeCh notified")
task := m.lcSnapshotTaskStatus.GetOneTask()
if task == nil {
log.LogDebugf("lcSnapshotTaskStatus.GetOneTask, no task")
continue
}
nodeAddr := m.lcNodeStatus.GetIdleNode()
if nodeAddr == "" {
log.LogWarn("no idle lcnode, redo task")
m.lcSnapshotTaskStatus.RedoTask(task)
continue
}
val, ok := m.cluster.lcNodes.Load(nodeAddr)
if !ok {
log.LogErrorf("lcNodes.Load, nodeAddr(%v) is not available, redo task", nodeAddr)
m.lcNodeStatus.RemoveNode(nodeAddr)
m.lcSnapshotTaskStatus.RedoTask(task)
continue
}
node := val.(*LcNode)
adminTask := node.createSnapshotVerDelTask(m.cluster.masterAddr(), task)
m.cluster.addLcNodeTasks([]*proto.AdminTask{adminTask})
log.LogDebugf("add snapshot version del task(%v) to lcnode(%v)", *task, nodeAddr)
}
}
}
func (m *snapshotDelManager) notifyIdleLcNode() {
m.lcSnapshotTaskStatus.RLock()
defer m.lcSnapshotTaskStatus.RUnlock()
if len(m.lcSnapshotTaskStatus.VerInfos) > 0 {
select {
case m.idleNodeCh <- struct{}{}:
log.LogDebug("action[handleLcNodeHeartbeatResp], snapshotDelManager scan routine notified!")
default:
log.LogDebug("action[handleLcNodeHeartbeatResp], snapshotDelManager skipping notify!")
}
}
}
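// Illustrative flow note (not part of the original code): an lcNode heartbeat
// response calls notifyIdleLcNode, which pushes into idleNodeCh (non-blocking)
// only while VerInfos still holds pending tasks; process then pops one task via
// GetOneTask, picks an idle lcnode from lcNodeStatus and ships a snapshot
// version delete admin task to it, re-queueing the task when no node is idle.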
//----------------------------------------------
type lcSnapshotVerStatus struct {
sync.RWMutex
VerInfos map[string]*proto.SnapshotVerDelTask
TaskResults map[string]*proto.SnapshotVerDelTaskResponse
}
func newLcSnapshotVerStatus() *lcSnapshotVerStatus {
return &lcSnapshotVerStatus{
VerInfos: make(map[string]*proto.SnapshotVerDelTask),
TaskResults: make(map[string]*proto.SnapshotVerDelTaskResponse),
}
}
func (vs *lcSnapshotVerStatus) GetOneTask() (task *proto.SnapshotVerDelTask) {
vs.Lock()
defer vs.Unlock()
if len(vs.VerInfos) == 0 {
return
}
for _, i := range vs.VerInfos {
task = i
break
}
if task == nil {
return
}
delete(vs.VerInfos, task.Id)
t := time.Now()
vs.TaskResults[task.Id] = &proto.SnapshotVerDelTaskResponse{
ID: task.Id,
UpdateTime: &t,
}
log.LogDebugf("GetOneTask(%v) and add TaskResults", task)
return
}
func (vs *lcSnapshotVerStatus) RedoTask(task *proto.SnapshotVerDelTask) {
vs.Lock()
defer vs.Unlock()
if task == nil {
return
}
vs.VerInfos[task.Id] = task
}
func (vs *lcSnapshotVerStatus) AddVerInfo(task *proto.SnapshotVerDelTask) {
vs.Lock()
defer vs.Unlock()
if len(vs.VerInfos) > 10000 {
return
}
if _, ok := vs.TaskResults[task.Id]; ok {
log.LogDebugf("VerInfo: %v is in TaskResults, already in processing", task)
return
}
vs.VerInfos[task.Id] = task
log.LogDebugf("AddVerInfo task: %v, now num: %v", task, len(vs.VerInfos))
}
func (vs *lcSnapshotVerStatus) ResetVerInfos() {
vs.Lock()
defer vs.Unlock()
log.LogDebugf("ResetVerInfos remove num %v", len(vs.VerInfos))
vs.VerInfos = make(map[string]*proto.SnapshotVerDelTask)
}
func (vs *lcSnapshotVerStatus) AddResult(resp *proto.SnapshotVerDelTaskResponse) {
vs.Lock()
defer vs.Unlock()
vs.TaskResults[resp.ID] = resp
}
func (vs *lcSnapshotVerStatus) DeleteOldResult() {
vs.Lock()
defer vs.Unlock()
for k, v := range vs.TaskResults {
// delete results that are already done
if v.Done && time.Now().After(v.EndTime.Add(time.Minute*10)) {
delete(vs.TaskResults, k)
log.LogDebugf("delete result already done: %v", v)
}
// delete results that are not done but have stopped updating
if !v.Done && time.Now().After(v.UpdateTime.Add(time.Minute*10)) {
delete(vs.TaskResults, k)
log.LogWarnf("delete result that is not done but has stopped updating: %v", v)
}
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"container/list"
"fmt"
"sort"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
type topology struct {
dataNodes *sync.Map
metaNodes *sync.Map
zoneMap *sync.Map
zoneIndexForDataNode int
zoneIndexForMetaNode int
zones []*Zone
domainExcludeZones []string // not domain zone, empty if domain disable.
zoneLock sync.RWMutex
}
func newTopology() (t *topology) {
t = new(topology)
t.zoneMap = new(sync.Map)
t.dataNodes = new(sync.Map)
t.metaNodes = new(sync.Map)
t.zones = make([]*Zone, 0)
return
}
func (t *topology) zoneLen() int {
t.zoneLock.RLock()
defer t.zoneLock.RUnlock()
return len(t.zones)
}
func (t *topology) clear() {
t.dataNodes.Range(func(key, value interface{}) bool {
t.dataNodes.Delete(key)
return true
})
t.metaNodes.Range(func(key, value interface{}) bool {
t.metaNodes.Delete(key)
return true
})
}
func (t *topology) putZone(zone *Zone) (err error) {
t.zoneLock.Lock()
defer t.zoneLock.Unlock()
if _, ok := t.zoneMap.Load(zone.name); ok {
return fmt.Errorf("zone[%v] has exist", zone.name)
}
t.zoneMap.Store(zone.name, zone)
t.zones = append(t.zones, zone)
return
}
func (t *topology) putZoneIfAbsent(zone *Zone) (beStoredZone *Zone) {
t.zoneLock.Lock()
defer t.zoneLock.Unlock()
oldZone, ok := t.zoneMap.Load(zone.name)
if ok {
return oldZone.(*Zone)
}
t.zoneMap.Store(zone.name, zone)
t.zones = append(t.zones, zone)
beStoredZone = zone
return
}
func (t *topology) getZoneNameList() (zoneList []string) {
zoneList = make([]string, 0)
t.zoneMap.Range(func(zoneName, value interface{}) bool {
zoneList = append(zoneList, zoneName.(string))
return true
})
return zoneList
}
func (t *topology) getZone(name string) (zone *Zone, err error) {
t.zoneMap.Range(func(zoneName, value interface{}) bool {
if zoneName != name {
return true
}
zone = value.(*Zone)
return true
})
if zone == nil {
return nil, fmt.Errorf("zone[%v] is not found", name)
}
return
}
func (t *topology) putDataNode(dataNode *DataNode) (err error) {
if _, ok := t.dataNodes.Load(dataNode.Addr); ok {
return
}
zone, err := t.getZone(dataNode.ZoneName)
if err != nil {
return
}
zone.putDataNode(dataNode)
t.putDataNodeToCache(dataNode)
return
}
func (t *topology) putDataNodeToCache(dataNode *DataNode) {
t.dataNodes.Store(dataNode.Addr, dataNode)
}
func (t *topology) deleteDataNode(dataNode *DataNode) {
zone, err := t.getZone(dataNode.ZoneName)
if err != nil {
return
}
zone.deleteDataNode(dataNode)
t.dataNodes.Delete(dataNode.Addr)
}
func (t *topology) getZoneByDataNode(dataNode *DataNode) (zone *Zone, err error) {
_, ok := t.dataNodes.Load(dataNode.Addr)
if !ok {
return nil, errors.Trace(dataNodeNotFound(dataNode.Addr), "%v not found", dataNode.Addr)
}
return t.getZone(dataNode.ZoneName)
}
func (t *topology) putMetaNode(metaNode *MetaNode) (err error) {
if _, ok := t.metaNodes.Load(metaNode.Addr); ok {
return
}
zone, err := t.getZone(metaNode.ZoneName)
if err != nil {
return
}
zone.putMetaNode(metaNode)
t.putMetaNodeToCache(metaNode)
return
}
func (t *topology) deleteMetaNode(metaNode *MetaNode) {
t.metaNodes.Delete(metaNode.Addr)
zone, err := t.getZone(metaNode.ZoneName)
if err != nil {
return
}
zone.deleteMetaNode(metaNode)
}
func (t *topology) putMetaNodeToCache(metaNode *MetaNode) {
t.metaNodes.Store(metaNode.Addr, metaNode)
}
type nodeSetCollection []*nodeSet
func (nsc nodeSetCollection) Len() int {
return len(nsc)
}
func (nsc nodeSetCollection) Less(i, j int) bool {
return nsc[i].metaNodeLen() < nsc[j].metaNodeLen()
}
func (nsc nodeSetCollection) Swap(i, j int) {
nsc[i], nsc[j] = nsc[j], nsc[i]
}
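// Illustrative note (not part of the original code): nodeSetCollection
// implements sort.Interface, so callers can order nodesets by ascending meta
// node count with a plain sort.Sort(nsc).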
type nodeSetGroup struct {
ID uint64
domainId uint64
nsgInnerIndex int // used when the number of replicas to allocate differs from the group's standard nodeset count
nodeSets []*nodeSet
nodeSetsIds []uint64
status uint8
sync.RWMutex
}
func newNodeSetGrp(c *Cluster) *nodeSetGroup {
var id uint64
var err error
if id, err = c.idAlloc.allocateCommonID(); err != nil {
return nil
}
log.LogInfof("action[newNodeSetGrp] construct,id[%v]", id)
nsg := &nodeSetGroup{
ID: id,
status: normal,
}
return nsg
}
type DomainNodeSetGrpManager struct {
domainId uint64
nsgIndex int // round-robin index for allocating hosts from available nodesetGrps with a balanced policy
nodeSetGrpMap []*nodeSetGroup
zoneAvailableNodeSet map[string]*list.List
nsId2NsGrpMap map[uint64]int // map nodeset id to nodeset group index in nodeSetGrpMap
lastBuildIndex int // build index for the 2-plus-1 policy; multiple zones need balanced building
status uint8 // all nodesetGrps may be unavailable, or no nodesetGrp exists under the given policy
nsIdMap map[uint64]int // stores all nodesets that have already been put into the manager
}
type DomainManager struct {
c *Cluster
init bool // manager can't be used in some startup stage before load
domainNodeSetGrpVec []*DomainNodeSetGrpManager
domainId2IndexMap map[uint64]int
ZoneName2DomainIdMap map[string]uint64
excludeZoneListDomain map[string]int // zones from an upgraded old datastore keep using the old (non-domain) policy
dataRatioLimit float64
excludeZoneUseRatio float64
sync.RWMutex
}
func newDomainNodeSetGrpManager() *DomainNodeSetGrpManager {
log.LogInfof("action[newDomainManager] construct")
ns := &DomainNodeSetGrpManager{
nsgIndex: 0,
zoneAvailableNodeSet: make(map[string]*list.List),
nsId2NsGrpMap: make(map[uint64]int),
nsIdMap: make(map[uint64]int),
}
return ns
}
func newDomainManager(cls *Cluster) *DomainManager {
log.LogInfof("action[newDomainManager] construct")
ns := &DomainManager{
c: cls,
domainId2IndexMap: make(map[uint64]int),
ZoneName2DomainIdMap: make(map[string]uint64),
excludeZoneListDomain: make(map[string]int),
dataRatioLimit: defaultDomainUsageThreshold,
excludeZoneUseRatio: defaultDomainUsageThreshold,
}
return ns
}
func (nsgm *DomainManager) start() {
log.LogInfof("action[DomainManager:start] start")
nsgm.init = true
}
func (nsgm *DomainManager) createDomain(zoneName string) (err error) {
if !nsgm.init {
return fmt.Errorf("createDomain failed: domain manager is not initialized")
}
log.LogInfof("zone name [%v] createDomain", zoneName)
zoneList := strings.Split(zoneName, ",")
grpRegion := newDomainNodeSetGrpManager()
if grpRegion.domainId, err = nsgm.c.idAlloc.allocateCommonID(); err != nil {
return fmt.Errorf("createDomain err [%v]", err)
}
nsgm.Lock()
for i := 0; i < len(zoneList); i++ {
if domainId, ok := nsgm.ZoneName2DomainIdMap[zoneList[i]]; ok {
nsgm.Unlock()
return fmt.Errorf("zone name [%v] exist in domain [%v]", zoneList[i], domainId)
}
}
nsgm.domainNodeSetGrpVec = append(nsgm.domainNodeSetGrpVec, grpRegion)
for i := 0; i < len(zoneList); i++ {
nsgm.ZoneName2DomainIdMap[zoneList[i]] = grpRegion.domainId
nsgm.domainId2IndexMap[grpRegion.domainId] = len(nsgm.domainNodeSetGrpVec) - 1
log.LogInfof("action[createDomain] domainid [%v] zonename [%v] index [%v]", grpRegion.domainId, zoneList[i], len(nsgm.domainNodeSetGrpVec)-1)
}
nsgm.Unlock()
if err = nsgm.c.putZoneDomain(false); err != nil {
return fmt.Errorf("putZoneDomain err [%v]", err)
}
return
}
func (nsgm *DomainManager) checkExcludeZoneState() {
if len(nsgm.excludeZoneListDomain) == 0 {
log.LogInfof("action[checkExcludeZoneState] no excludeZoneList for Domain,size zero")
return
}
excludeNeedDomain := true
log.LogInfof("action[checkExcludeZoneState] excludeZoneList size[%v]", len(nsgm.excludeZoneListDomain))
for zoneNm := range nsgm.excludeZoneListDomain {
if value, ok := nsgm.c.t.zoneMap.Load(zoneNm); ok {
zone := value.(*Zone)
if nsgm.excludeZoneUseRatio == 0 || nsgm.excludeZoneUseRatio > 1 {
nsgm.excludeZoneUseRatio = defaultDomainUsageThreshold
}
if zone.isUsedRatio(nsgm.excludeZoneUseRatio) {
if zone.status == normalZone {
log.LogInfof("action[checkExcludeZoneState] zone[%v] be set unavailableZone", zone.name)
}
zone.status = unavailableZone
} else {
excludeNeedDomain = false
if zone.status == unavailableZone {
log.LogInfof("action[checkExcludeZoneState] zone[%v] be set normalZone", zone.name)
}
zone.status = normalZone
}
}
}
if excludeNeedDomain {
log.LogInfof("action[checkExcludeZoneState] exclude zone cann't be used since now!excludeNeedDomain[%v]",
excludeNeedDomain)
nsgm.c.needFaultDomain = true
} else {
if nsgm.c.needFaultDomain {
log.LogInfof("action[checkExcludeZoneState] needFaultDomain be set false")
}
nsgm.c.needFaultDomain = false
}
}
func (nsgm *DomainManager) checkAllGrpState() {
for i := 0; i < len(nsgm.domainNodeSetGrpVec); i++ {
nsgm.checkGrpState(nsgm.domainNodeSetGrpVec[i])
}
}
func (nsgm *DomainManager) checkGrpState(domainGrpManager *DomainNodeSetGrpManager) {
nsgm.RLock()
defer nsgm.RUnlock()
if len(domainGrpManager.nodeSetGrpMap) == 0 {
log.LogInfof("action[checkGrpState] leave,size zero")
return
}
log.LogInfof("action[checkGrpState] nodeSetGrpMap size [%v]", len(domainGrpManager.nodeSetGrpMap))
metaUnAvailableCnt := 0
dataUnAvailableCnt := 0
for i := 0; i < len(domainGrpManager.nodeSetGrpMap); i++ {
log.LogInfof("action[checkGrpState] nodesetgrp index[%v], id[%v], status[%v]",
i, domainGrpManager.nodeSetGrpMap[i].ID, domainGrpManager.nodeSetGrpMap[i].status)
grpStatus := normal
grpMetaUnAvailableCnt := 0
for j := 0; j < len(domainGrpManager.nodeSetGrpMap[i].nodeSets); j++ {
var (
metaWorked bool
dataWorked bool
used uint64
total uint64
)
domainGrpManager.nodeSetGrpMap[i].nodeSets[j].dataNodes.Range(func(key, value interface{}) bool {
node := value.(*DataNode)
if node.isWriteAble() {
used = used + node.Used
} else {
used = used + node.Total
}
total = total + node.Total
log.LogInfof("action[checkGrpState] nodeid[%v] zonename[%v] used [%v] total [%v] UsageRatio [%v] got available metanode",
node.ID, node.ZoneName, node.Used, node.Total, node.UsageRatio)
return true
})
if float64(used)/float64(total) < nsgm.dataRatioLimit {
dataWorked = true
}
domainGrpManager.nodeSetGrpMap[i].nodeSets[j].metaNodes.Range(func(key, value interface{}) bool {
node := value.(*MetaNode)
if node.isWritable() {
metaWorked = true
log.LogInfof("action[checkGrpState] nodeset[%v] zonename[%v] used [%v] total [%v] threshold [%v] got available metanode",
node.ID, node.ZoneName, node.Used, node.Total, node.Threshold)
return false
}
log.LogInfof("action[checkGrpState] nodeset[%v] zonename[%v] used [%v] total [%v] threshold [%v] got available metanode",
node.ID, node.ZoneName, node.Used, node.Total, node.Threshold)
return true
})
if !metaWorked || !dataWorked {
log.LogInfof("action[checkGrpState] nodesetgrp index[%v], id[%v], status[%v] be set metaWorked[%v] dataWorked[%v]",
i, domainGrpManager.nodeSetGrpMap[i].ID, domainGrpManager.nodeSetGrpMap[i].status, metaWorked, dataWorked)
if !metaWorked {
grpMetaUnAvailableCnt++
if grpMetaUnAvailableCnt == 2 { // meta is still usable while only one nodeset lacks a writable meta node
if grpStatus == dataNodesUnAvailable {
log.LogInfof("action[checkGrpState] nodesetgrp index[%v], id[%v], grp status change from dataNodesUnAvailable to unavailable",
i, domainGrpManager.nodeSetGrpMap[i].ID)
grpStatus = unavailableZone
break
}
log.LogInfof("action[checkGrpState] nodesetgrp index[%v], id[%v], grp status be set metaNodesUnAvailable",
i, domainGrpManager.nodeSetGrpMap[i].ID)
grpStatus = metaNodesUnAvailable
metaUnAvailableCnt++
}
}
if !dataWorked && grpStatus != dataNodesUnAvailable {
if grpStatus == metaNodesUnAvailable {
log.LogInfof("action[checkGrpState] nodesetgrp index[%v], id[%v], grp status change from metaNodesUnAvailable to unavailable",
i, domainGrpManager.nodeSetGrpMap[i].ID)
grpStatus = unavailableZone
break
}
log.LogInfof("action[checkGrpState] nodesetgrp index[%v], id[%v], grp status be set dataNodesUnAvailable",
i, domainGrpManager.nodeSetGrpMap[i].ID)
grpStatus = dataNodesUnAvailable
dataUnAvailableCnt++
}
}
}
domainGrpManager.nodeSetGrpMap[i].status = grpStatus
log.LogInfof("action[checkGrpState] nodesetgrp index[%v], id[%v], status[%v] be set normal",
i, domainGrpManager.nodeSetGrpMap[i].ID, domainGrpManager.nodeSetGrpMap[i].status)
}
domainGrpManager.status = normal
if dataUnAvailableCnt == len(domainGrpManager.nodeSetGrpMap) {
domainGrpManager.status = dataNodesUnAvailable
}
if metaUnAvailableCnt == len(domainGrpManager.nodeSetGrpMap) {
if domainGrpManager.status == dataNodesUnAvailable {
domainGrpManager.status = unavailableZone
} else {
domainGrpManager.status = metaNodesUnAvailable
}
}
log.LogInfof("action[checkGrpState] nodesetgrp size [%v] dataUnAvailableCnt [%v] metaUnAvailableCnt [%v] nsgm.status now[%v]",
len(domainGrpManager.nodeSetGrpMap), dataUnAvailableCnt, metaUnAvailableCnt, domainGrpManager.status)
}
type buildNodeSetGrpMethod func(nsgm *DomainManager, domainGrpManager *DomainNodeSetGrpManager) (err error)
func (nsgm *DomainManager) buildNodeSetGrp(domainGrpManager *DomainNodeSetGrpManager) (err error) {
log.LogInfof("action[buildNodeSetGrp] available zone [%v]", len(domainGrpManager.zoneAvailableNodeSet))
if len(domainGrpManager.zoneAvailableNodeSet) == 0 {
err = fmt.Errorf("action[buildNodeSetGrp] failed zone available zero")
log.LogErrorf("[%v]", err)
return
}
method := map[int]buildNodeSetGrpMethod{
3: buildNodeSetGrp3Zone,
2: buildNodeSetGrp2Plus1,
1: buildNodeSetGrpOneZone,
}
step := defaultNodeSetGrpStep
zoneCnt := nsgm.c.cfg.DefaultNormalZoneCnt
log.LogInfof("action[buildNodeSetGrp] zoncnt [%v]", zoneCnt)
if zoneCnt >= 3 {
zoneCnt = 3
}
if zoneCnt > len(domainGrpManager.zoneAvailableNodeSet) {
if nsgm.c.cfg.DomainBuildAsPossible || domainGrpManager.domainId > 0 {
log.LogInfof("action[buildNodeSetGrp] zoncnt [%v]", zoneCnt)
zoneCnt = len(domainGrpManager.zoneAvailableNodeSet)
} else {
err = fmt.Errorf("action[buildNodeSetGrp] failed zone available [%v] need [%v]", zoneCnt, len(domainGrpManager.zoneAvailableNodeSet))
log.LogErrorf("[%v]", err)
return
}
}
for {
log.LogInfof("action[buildNodeSetGrp] zoneCnt [%v] step [%v]", zoneCnt, step)
err = method[zoneCnt](nsgm, domainGrpManager)
if err != nil {
log.LogInfof("action[buildNodeSetGrp] err [%v]", err)
break
}
step--
if step == 0 {
break
}
}
if domainGrpManager.status != normal || len(domainGrpManager.nodeSetGrpMap) == 0 {
return fmt.Errorf("cann't build new group [%v]", err)
}
return nil
}
func (nsgm *DomainManager) getHostFromNodeSetGrpSpecific(domainGrpManager *DomainNodeSetGrpManager, replicaNum uint8, createType uint32) (
hosts []string,
peers []proto.Peer,
err error,
) {
log.LogErrorf("action[getHostFromNodeSetGrpSpecific] replicaNum[%v],type[%v], nsg cnt[%v], nsg status[%v]",
replicaNum, createType, len(domainGrpManager.nodeSetGrpMap), domainGrpManager.status)
if len(domainGrpManager.nodeSetGrpMap) == 0 {
log.LogErrorf("action[getHostFromNodeSetGrpSpecific] [%v] nodeSetGrpMap zero", domainGrpManager.domainId)
return nil, nil, fmt.Errorf("nodeSetGrpMap zero")
}
nsgm.RLock()
defer nsgm.RUnlock()
var cnt int
nsgIndex := domainGrpManager.nsgIndex
domainGrpManager.nsgIndex = (domainGrpManager.nsgIndex + 1) % len(domainGrpManager.nodeSetGrpMap)
for {
if cnt >= len(domainGrpManager.nodeSetGrpMap) {
log.LogInfof("action[getHostFromNodeSetGrpSpecific] failed all nsGrp unavailable,cnt[%v]", cnt)
err = fmt.Errorf("action[getHostFromNodeSetGrpSpecific],err:no nsGrp status normal,cnt[%v]", cnt)
break
}
cnt++
nsgIndex = (nsgIndex + 1) % len(domainGrpManager.nodeSetGrpMap)
nsg := domainGrpManager.nodeSetGrpMap[nsgIndex]
needReplicaNumArray := [3]int{1, 2, 3}
for _, needReplicaNum := range needReplicaNumArray {
var (
host []string
peer []proto.Peer
)
// every replica will look around every nodeset and break if get one
for i := 0; i < defaultFaultDomainZoneCnt; i++ {
ns := nsg.nodeSets[nsg.nsgInnerIndex]
nsg.nsgInnerIndex = (nsg.nsgInnerIndex + 1) % defaultFaultDomainZoneCnt
log.LogInfof("action[getHostFromNodeSetGrpSpecific] nodesetid[%v],zonename[%v], datanode len[%v],metanode len[%v],capacity[%v]",
ns.ID, ns.zoneName, ns.dataNodeLen(), ns.metaNodeLen(), ns.Capacity)
needNum := needReplicaNum
if needReplicaNum > int(replicaNum)-len(hosts) {
needNum = int(replicaNum) - len(hosts)
}
if createType == TypeDataPartition {
if host, peer, err = ns.getAvailDataNodeHosts(nil, needNum); err != nil {
log.LogErrorf("action[getHostFromNodeSetGrpSpecific] ns[%v] zone[%v] TypeDataPartition err[%v]", ns.ID, ns.zoneName, err)
// nsg.status = dataNodesUnAvailable
continue
}
} else {
if host, peer, err = ns.getAvailMetaNodeHosts(nil, needNum); err != nil {
log.LogErrorf("action[getHostFromNodeSetGrpSpecific] ns[%v] zone[%v] TypeMetaPartition err[%v]", ns.ID, ns.zoneName, err)
// nsg.status = metaNodesUnAvailable
continue
}
}
hosts = append(hosts, host...)
peers = append(peers, peer...)
if int(replicaNum) == len(hosts) {
log.LogInfof("action[getHostFromNodeSetGrpSpecific] ngGrp[%v] unable support type[%v] replicaNum[%v]", nsg.ID, createType, replicaNum)
return
}
}
hosts = nil
peers = nil
}
}
return nil, nil, fmt.Errorf("action[getHostFromNodeSetGrpSpecific] can't alloc host")
}
func (nsgm *DomainManager) getHostFromNodeSetGrp(domainId uint64, replicaNum uint8, createType uint32) (
hosts []string,
peers []proto.Peer,
err error) {
var ok bool
var index int
if index, ok = nsgm.domainId2IndexMap[domainId]; !ok {
err = fmt.Errorf("action[getHostFromNodeSetGrp] not found domainid[%v]", domainId)
return
}
domainGrpManager := nsgm.domainNodeSetGrpVec[index]
log.LogInfof("action[getHostFromNodeSetGrp] domainId [%v] index [%v] replicaNum[%v],type[%v], nsg cnt[%v], nsg status[%v]",
domainId, index, replicaNum, createType, len(domainGrpManager.nodeSetGrpMap), domainGrpManager.status)
// this scenario is abnormal may be caused by zone unavailable in high probability
if domainGrpManager.status != normal {
return nsgm.getHostFromNodeSetGrpSpecific(domainGrpManager, replicaNum, createType)
}
// the grp map is normally built with three zones and no grp exists when there are fewer than three zones;
// here a nodesetGrp is still built with fewer zones, because offering service matters more than high availability
if len(domainGrpManager.zoneAvailableNodeSet) != 0 {
if nsgm.buildNodeSetGrp(domainGrpManager); len(domainGrpManager.nodeSetGrpMap) == 0 {
err = fmt.Errorf("no usable group")
log.LogErrorf("action[getHostFromNodeSetGrp] no usable group build failed,err[%v]", err)
return
}
} else if len(domainGrpManager.nodeSetGrpMap) == 0 {
err = fmt.Errorf("no usable group")
log.LogInfof("action[getHostFromNodeSetGrp] err[%v]", err)
return
}
nsgm.RLock()
defer nsgm.RUnlock()
var cnt int
nsgIndex := domainGrpManager.nsgIndex
domainGrpManager.nsgIndex = (domainGrpManager.nsgIndex + 1) % len(domainGrpManager.nodeSetGrpMap)
for {
if cnt >= len(domainGrpManager.nodeSetGrpMap) {
err = fmt.Errorf("action[getHostFromNodeSetGrp] need replica cnt [%v] but get host cnt [%v] from nodesetgrps count[%v]",
replicaNum, len(hosts), cnt)
log.LogErrorf(err.Error())
return nil, nil, err
}
cnt++
nsgIndex = (nsgIndex + 1) % len(domainGrpManager.nodeSetGrpMap)
nsg := domainGrpManager.nodeSetGrpMap[nsgIndex]
var (
host []string
peer []proto.Peer
)
// it's better to get enough replicas from one nsg (copy set); the remainder is
// taken from other nsgs if that is not possible
for i := 0; i < defaultMaxReplicaCnt*len(nsg.nodeSets); i++ {
ns := nsg.nodeSets[nsg.nsgInnerIndex]
log.LogInfof("action[getHostFromNodeSetGrp] nodesetid[%v],zonename[%v], datanode len[%v],metanode len[%v],capacity[%v]",
ns.ID, ns.zoneName, ns.dataNodeLen(), ns.metaNodeLen(), ns.Capacity)
nsg.nsgInnerIndex = (nsg.nsgInnerIndex + 1) % defaultFaultDomainZoneCnt
if nsg.status == unavailableZone {
log.LogWarnf("action[getHostFromNodeSetGrp] ns[%v] zone[%v] unavailableZone", ns.ID, ns.zoneName)
continue
}
if createType == TypeDataPartition {
if nsg.status == dataNodesUnAvailable {
log.LogWarnf("action[getHostFromNodeSetGrp] ns[%v] zone[%v] dataNodesUnAvailable", ns.ID, ns.zoneName)
continue
}
if host, peer, err = ns.getAvailDataNodeHosts(hosts, 1); err != nil {
log.LogWarnf("action[getHostFromNodeSetGrp] ns[%v] zone[%v] TypeDataPartition err[%v]", ns.ID, ns.zoneName, err)
// nsg.status = dataNodesUnAvailable
continue
}
} else {
if nsg.status == metaNodesUnAvailable {
log.LogWarnf("action[getHostFromNodeSetGrp] ns[%v] zone[%v] metaNodesUnAvailable", ns.ID, ns.zoneName)
continue
}
if host, peer, err = ns.getAvailMetaNodeHosts(hosts, 1); err != nil {
log.LogWarnf("action[getHostFromNodeSetGrp] ns[%v] zone[%v] TypeMetaPartition err[%v]", ns.ID, ns.zoneName, err)
// nsg.status = metaNodesUnAvailable
continue
}
}
hosts = append(hosts, host[0])
peers = append(peers, peer[0])
log.LogInfof("action[getHostFromNodeSetGrp] get host[%v] peer[%v], nsg id[%v] nsgInnerIndex[%v]", host[0], peer[0], nsg.ID, nsg.nsgInnerIndex)
if len(hosts) == int(replicaNum) {
return hosts, peers, nil
}
}
}
}
// nodeset may not
type nsList struct {
lst *list.List
ele *list.Element
zoneName string
}
func (nsgm *DomainManager) buildNodeSetGrpPrepare(domainGrpManager *DomainNodeSetGrpManager) (buildIndex int, zoneAvaVec []nsList) {
sortedKeys := make([]string, 0)
for k := range domainGrpManager.zoneAvailableNodeSet {
sortedKeys = append(sortedKeys, k)
}
sort.Strings(sortedKeys)
for _, zoneName := range sortedKeys {
var zoneInfo nsList
zoneInfo.lst = domainGrpManager.zoneAvailableNodeSet[zoneName]
zoneInfo.zoneName = zoneName
zoneAvaVec = append(zoneAvaVec, zoneInfo)
}
buildIndex = domainGrpManager.lastBuildIndex % len(zoneAvaVec)
domainGrpManager.lastBuildIndex = (domainGrpManager.lastBuildIndex + 1) % len(zoneAvaVec)
return
}
func (nsgm *DomainManager) buildNodeSetGrpDoWork(zoneName string, nodeList *list.List, needCnt int) (resList []nsList, err error) {
log.LogInfof("action[buildNodeSetGrpDoWork] step in")
var tmpList []nsList
ele := nodeList.Front()
for {
if ele == nil {
log.LogInfof("action[buildNodeSetGrpDoWork] zone [%v] can't create nodeset group nodeList not qualified", zoneName)
err = fmt.Errorf("action[buildNodeSetGrpDoWork] zone [%v] can't create nodeset group nodeList not qualified", zoneName)
return
}
nst := ele.Value.(*nodeSet)
log.LogInfof("action[buildNodeSetGrpDoWork] nodeset [%v] zonename [%v] ,metacnt[%v],datacnt[%v]",
nst.ID, nst.zoneName, nst.metaNodeLen(), nst.dataNodeLen())
if nst.dataNodeLen() > 0 && nst.metaNodeLen() > 0 {
var nsl nsList
nsl.lst = nodeList
nsl.ele = ele
nsl.zoneName = zoneName
tmpList = append(tmpList, nsl)
log.LogInfof("action[buildNodeSetGrpDoWork] nodeset [%v] zonename [%v] qualified be put in,metacnt[%v],datacnt[%v]",
nst.ID, nst.zoneName, nst.metaNodeLen(), nst.dataNodeLen())
needCnt = needCnt - 1
if needCnt == 0 {
break
}
}
ele = ele.Next()
}
if needCnt == 0 {
resList = append(resList, tmpList...)
} else {
err = fmt.Errorf("not quliaifed")
}
return
}
func (nsgm *DomainManager) buildNodeSetGrpCommit(resList []nsList, domainGrpManager *DomainNodeSetGrpManager) {
nodeSetGrp := newNodeSetGrp(nsgm.c)
nodeSetGrp.domainId = domainGrpManager.domainId
for i := 0; i < len(resList); i++ {
nst := resList[i].ele.Value.(*nodeSet)
nodeSetGrp.nodeSets = append(nodeSetGrp.nodeSets, nst)
nodeSetGrp.nodeSetsIds = append(nodeSetGrp.nodeSetsIds, nst.ID)
log.LogInfof("action[buildNodeSetGrpCommit] build nodesetGrp id[%v] with append nst id [%v] zoneName [%v]", nodeSetGrp.ID, nst.ID, nst.zoneName)
resList[i].lst.Remove(resList[i].ele)
domainGrpManager.nsId2NsGrpMap[nst.ID] = len(domainGrpManager.nodeSetGrpMap)
if resList[i].lst.Len() == 0 {
delete(domainGrpManager.zoneAvailableNodeSet, resList[i].zoneName)
log.LogInfof("action[buildNodeSetGrpCommit] after grp build no nodeset available for zone[%v],nodesetid:[%v], zonelist size[%v]",
nst.zoneName, nst.ID, len(domainGrpManager.zoneAvailableNodeSet))
}
}
log.LogInfof("action[buildNodeSetGrpCommit] success build nodesetgrp zonelist size[%v], nodesetids[%v]",
len(domainGrpManager.zoneAvailableNodeSet), nodeSetGrp.nodeSetsIds)
domainGrpManager.nodeSetGrpMap = append(domainGrpManager.nodeSetGrpMap, nodeSetGrp)
nsgm.c.putNodeSetGrpInfo(opSyncNodeSetGrp, nodeSetGrp)
domainGrpManager.status = normal
}
// policy for building a nodeset group when the zone count is three or more
func buildNodeSetGrp3Zone(nsgm *DomainManager, domainGrpManager *DomainNodeSetGrpManager) (err error) {
nsgm.Lock()
defer nsgm.Unlock()
log.LogInfof("action[buildNodeSetGrp3Zone step in")
if len(domainGrpManager.zoneAvailableNodeSet) < defaultFaultDomainZoneCnt {
log.LogInfof("action[DomainManager::buildNodeSetGrp3Zone] size error,can't create group zone cnt[%v]",
len(domainGrpManager.zoneAvailableNodeSet))
return fmt.Errorf("defaultFaultDomainZoneCnt not satisfied")
}
var resList []nsList
buildIndex, zoneAvaVec := nsgm.buildNodeSetGrpPrepare(domainGrpManager)
cnt := 0
for {
if cnt > 0 {
buildIndex = (buildIndex + 1) % len(zoneAvaVec)
}
if cnt == len(zoneAvaVec) || len(resList) == defaultReplicaNum {
log.LogInfof("step out inner loop in buildNodeSetGrp3Zone cnt [%v], inner index [%v]", cnt, buildIndex)
break
}
cnt++
nodeList := zoneAvaVec[buildIndex].lst
zoneName := zoneAvaVec[buildIndex].zoneName
var tmpList []nsList
if tmpList, err = nsgm.buildNodeSetGrpDoWork(zoneName, nodeList, 1); err != nil {
continue
}
resList = append(resList, tmpList...)
}
if len(resList) < defaultReplicaNum {
log.LogInfof("action[DomainManager::buildNodeSetGrp3Zone] can't create nodeset group nodeset qualified count [%v]", len(resList))
return fmt.Errorf("defaultFaultDomainZoneCnt not satisfied")
}
nsgm.buildNodeSetGrpCommit(resList, domainGrpManager)
return nil
}
func buildNodeSetGrpOneZone(nsgm *DomainManager, domainGrpManager *DomainNodeSetGrpManager) (err error) {
nsgm.Lock()
defer nsgm.Unlock()
log.LogInfof("action[buildNodeSetGrpOneZone] step in")
if len(domainGrpManager.zoneAvailableNodeSet) != 1 {
log.LogErrorf("action[buildNodeSetGrpOneZone] available zone cnt[%v]", len(domainGrpManager.zoneAvailableNodeSet))
err = fmt.Errorf("available zone cnt[%v]", len(domainGrpManager.zoneAvailableNodeSet))
return
}
buildIndex, zoneAvaVec := nsgm.buildNodeSetGrpPrepare(domainGrpManager)
if zoneAvaVec[buildIndex].lst.Len() < defaultReplicaNum {
log.LogErrorf("action[buildNodeSetGrpOneZone] not enough nodeset in available list")
return fmt.Errorf("not enough nodeset in available list")
}
var resList []nsList
if resList, err = nsgm.buildNodeSetGrpDoWork(zoneAvaVec[buildIndex].zoneName,
zoneAvaVec[buildIndex].lst, defaultReplicaNum); err != nil {
return err
}
nsgm.buildNodeSetGrpCommit(resList, domainGrpManager)
return nil
}
// build a 2-plus-1 nodesetGrp from two or more zones
func buildNodeSetGrp2Plus1(nsgm *DomainManager, domainGrpManager *DomainNodeSetGrpManager) (err error) {
nsgm.Lock()
defer nsgm.Unlock()
log.LogInfof("step in buildNodeSetGrp2Plus1")
cnt := 0
var resList []nsList
_, zoneAvaVec := nsgm.buildNodeSetGrpPrepare(domainGrpManager)
var np1, np2 int
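// np1 is the zone expected to contribute one nodeset and np2 the zone expected to contribute two;
// the selection below prefers zones with more available nodesets.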
if zoneAvaVec[0].lst.Len() < zoneAvaVec[1].lst.Len() {
np1 = 0
np2 = 1
} else {
np1 = 1
np2 = 0
}
for i := 2; i < len(zoneAvaVec); i++ {
if zoneAvaVec[i].lst.Len() > zoneAvaVec[np1].lst.Len() {
if zoneAvaVec[i].lst.Len() > zoneAvaVec[np2].lst.Len() {
np2 = i
} else {
np1 = i
}
}
}
if zoneAvaVec[np1].lst.Len() < 1 || zoneAvaVec[np2].lst.Len() < 2 {
log.LogInfof("step out buildNodeSetGrp2Plus1 np1 [%v] np2 [%v] cnt [%v], inner index [%v]",
np1, np2, cnt, domainGrpManager.lastBuildIndex)
return fmt.Errorf("action[buildNodeSetGrp2Plus1] failed")
}
var tmpList []nsList
if tmpList, err = nsgm.buildNodeSetGrpDoWork(zoneAvaVec[np1].zoneName, zoneAvaVec[np1].lst, 1); err != nil {
return
}
resList = append(resList, tmpList...)
if tmpList, err = nsgm.buildNodeSetGrpDoWork(zoneAvaVec[np2].zoneName, zoneAvaVec[np2].lst, 2); err != nil {
return
}
resList = append(resList, tmpList...)
nsgm.buildNodeSetGrpCommit(resList, domainGrpManager)
return
}
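// Illustrative sketch only: the three builders above cover the common fault-domain layouts
// (three or more zones, a single zone, and the 2+1 case). The hypothetical dispatcher below
// is not part of the production flow; it merely shows how a caller might pick a builder by
// the available-zone count, while the real selection inside DomainManager may weigh more state.
func pickNodeSetGrpBuilder(zoneCnt int) func(*DomainManager, *DomainNodeSetGrpManager) error {
switch {
case zoneCnt >= defaultFaultDomainZoneCnt: // three or more zones: one nodeset from each of three zones
return buildNodeSetGrp3Zone
case zoneCnt == 2: // two zones: the 2+1 layout
return buildNodeSetGrp2Plus1
default: // single zone: all nodesets come from the same zone
return buildNodeSetGrpOneZone
}
}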
func (nsgm *DomainManager) putNodeSet(ns *nodeSet, load bool) (err error) {
nsgm.Lock()
defer nsgm.Unlock()
var (
ok bool
index int
nsGrp *DomainNodeSetGrpManager
domainId uint64
)
if _, ok = nsgm.excludeZoneListDomain[ns.zoneName]; ok {
log.LogInfof("action[DomainManager::putNodeSet] zone[%v],nodesetid:[%v], domain vec size[%v]",
ns.zoneName, ns.ID, len(nsgm.domainNodeSetGrpVec))
return
}
if domainId, ok = nsgm.ZoneName2DomainIdMap[ns.zoneName]; !ok {
domainId = 0 // no domainId was set before; therefore, put it into the default domain
nsgm.ZoneName2DomainIdMap[ns.zoneName] = 0
}
if index, ok = nsgm.domainId2IndexMap[domainId]; !ok {
if domainId > 0 && !load { // domainId 0 can be created through nodeset creation; others must be created by createDomain
err = fmt.Errorf("inconsistent domainid exist in name map but node exist in index map")
log.LogErrorf("action[putNodeSet] %v", err)
return
}
grpRegion := newDomainNodeSetGrpManager()
nsgm.domainNodeSetGrpVec = append(nsgm.domainNodeSetGrpVec, grpRegion)
nsgm.ZoneName2DomainIdMap[ns.zoneName] = 0 // domainId must be zero here
grpRegion.domainId = domainId
index = len(nsgm.domainNodeSetGrpVec) - 1
nsgm.domainId2IndexMap[domainId] = index
log.LogInfof("action[putNodeSet] build domainId[%v] zoneName [%v] index [%v]", domainId, ns.zoneName, index)
}
nsGrp = nsgm.domainNodeSetGrpVec[index]
if _, ok = nsGrp.nsIdMap[ns.ID]; ok {
log.LogInfof("action[DomainManager::putNodeSet] zone[%v],nodesetid:[%v] already be put before load[%v]",
ns.zoneName, ns.ID, load)
return
}
nsGrp.nsIdMap[ns.ID] = 0
log.LogInfof("action[DomainManager::putNodeSet] zone[%v],nodesetid:[%v], domain vec size[%v], load[%v]",
ns.zoneName, ns.ID, len(nsgm.domainNodeSetGrpVec), load)
// the nodeset has already been put into a group; this should only happen when load == true
// here the hosts in ns should still be nil and wait for node registration
if grpidx, ok := nsGrp.nsId2NsGrpMap[ns.ID]; ok {
nsGrp.nodeSetGrpMap[grpidx].nodeSets = append(nsGrp.nodeSetGrpMap[grpidx].nodeSets, ns)
log.LogInfof("action[DomainManager::putNodeSet] zone[%v],nodesetid:[%v] already be put before grp index[%v], grp id[%v] load[%v]",
ns.zoneName, ns.ID, grpidx, nsGrp.nodeSetGrpMap[grpidx].ID, load)
return
}
if _, ok := nsGrp.zoneAvailableNodeSet[ns.zoneName]; !ok {
nsGrp.zoneAvailableNodeSet[ns.zoneName] = list.New()
log.LogInfof("action[DomainManager::putNodeSet] init list for zone[%v],zonelist size[%v]", ns.zoneName, len(nsGrp.zoneAvailableNodeSet))
}
log.LogInfof("action[DomainManager::putNodeSet] domainid [%v] ns id[%v] be put in zone[%v]", nsGrp.domainId, ns.ID, ns.zoneName)
nsGrp.zoneAvailableNodeSet[ns.zoneName].PushBack(ns)
return
}
type nodeSet struct {
ID uint64
Capacity int
zoneName string
metaNodes *sync.Map
dataNodes *sync.Map
decommissionDataPartitionList *DecommissionDataPartitionList
decommissionParallelLimit int32
decommissionDiskParallelFactor float64
nodeSelectLock sync.Mutex
dataNodeSelectorLock sync.RWMutex
dataNodeSelector NodeSelector
metaNodeSelectorLock sync.RWMutex
metaNodeSelector NodeSelector
sync.RWMutex
manualDecommissionDiskList *DecommissionDiskList
autoDecommissionDiskList *DecommissionDiskList
doneDecommissionDiskListTraverse chan struct{}
startDecommissionDiskListTraverse chan struct{}
DecommissionDisks sync.Map
diskParallelFactorLk sync.Mutex
}
type nodeSetDecommissionParallelStatus struct {
ID uint64
CurTokenNum int32
MaxTokenNum int32
RunningDp []uint64
}
func newNodeSet(c *Cluster, id uint64, cap int, zoneName string) *nodeSet {
log.LogInfof("action[newNodeSet] id[%v]", id)
ns := &nodeSet{
ID: id,
Capacity: cap,
zoneName: zoneName,
metaNodes: new(sync.Map),
dataNodes: new(sync.Map),
decommissionDataPartitionList: NewDecommissionDataPartitionList(c),
manualDecommissionDiskList: NewDecommissionDiskList(),
autoDecommissionDiskList: NewDecommissionDiskList(),
doneDecommissionDiskListTraverse: make(chan struct{}, 1),
startDecommissionDiskListTraverse: make(chan struct{}, 1),
dataNodeSelector: NewNodeSelector(DefaultNodeSelectorName, DataNodeType),
metaNodeSelector: NewNodeSelector(DefaultNodeSelectorName, MetaNodeType),
}
go ns.traverseDecommissionDisk(c)
return ns
}
func (ns *nodeSet) GetDataNodeSelector() string {
ns.dataNodeSelectorLock.RLock()
defer ns.dataNodeSelectorLock.RUnlock()
return ns.dataNodeSelector.GetName()
}
func (ns *nodeSet) SetDataNodeSelector(name string) {
ns.dataNodeSelectorLock.Lock()
defer ns.dataNodeSelectorLock.Unlock()
ns.dataNodeSelector = NewNodeSelector(name, DataNodeType)
}
func (ns *nodeSet) GetMetaNodeSelector() string {
ns.metaNodeSelectorLock.RLock()
defer ns.metaNodeSelectorLock.RUnlock()
return ns.metaNodeSelector.GetName()
}
func (ns *nodeSet) SetMetaNodeSelector(name string) {
ns.metaNodeSelectorLock.Lock()
defer ns.metaNodeSelectorLock.Unlock()
ns.metaNodeSelector = NewNodeSelector(name, MetaNodeType)
}
func (ns *nodeSet) metaNodeLen() (count int) {
ns.RLock()
defer ns.RUnlock()
ns.metaNodes.Range(func(key, value interface{}) bool {
count++
return true
})
return
}
func (ns *nodeSet) startDecommissionSchedule() {
ns.decommissionDataPartitionList.startTraverse()
ns.startDecommissionDiskListTraverse <- struct{}{}
}
func (ns *nodeSet) dataNodeLen() (count int) {
ns.RLock()
defer ns.RUnlock()
ns.dataNodes.Range(func(key, value interface{}) bool {
count++
return true
})
return
}
func (ns *nodeSet) putMetaNode(metaNode *MetaNode) {
ns.metaNodes.Store(metaNode.Addr, metaNode)
}
func (ns *nodeSet) deleteMetaNode(metaNode *MetaNode) {
ns.metaNodes.Delete(metaNode.Addr)
}
func (ns *nodeSet) canWriteForDataNode(replicaNum int) bool {
var count int
ns.dataNodes.Range(func(key, value interface{}) bool {
node := value.(*DataNode)
if node.isWriteAble() && node.dpCntInLimit() {
count++
}
if count >= replicaNum {
return false
}
return true
})
log.LogInfof("canWriteForDataNode zone[%v], ns[%v],count[%v], replicaNum[%v]",
ns.zoneName, ns.ID, count, replicaNum)
return count >= replicaNum
}
func (ns *nodeSet) canWriteForMetaNode(replicaNum int) bool {
var count int
ns.metaNodes.Range(func(key, value interface{}) bool {
node := value.(*MetaNode)
if node.isWritable() {
count++
}
if count >= replicaNum {
return false
}
return true
})
log.LogInfof("canWriteForMetaNode zone[%v], ns[%v],count[%v] replicaNum[%v]",
ns.zoneName, ns.ID, count, replicaNum)
return count >= replicaNum
}
func (ns *nodeSet) putDataNode(dataNode *DataNode) {
ns.dataNodes.Store(dataNode.Addr, dataNode)
}
func (ns *nodeSet) deleteDataNode(dataNode *DataNode) {
ns.dataNodes.Delete(dataNode.Addr)
}
func (ns *nodeSet) AddToDecommissionDataPartitionList(dp *DataPartition, c *Cluster) {
ns.decommissionDataPartitionList.Put(ns.ID, dp, c)
}
func (ns *nodeSet) UpdateMaxParallel(maxParallel int32) {
ns.decommissionDataPartitionList.updateMaxParallel(maxParallel)
log.LogDebugf("action[UpdateMaxParallel]nodeSet[%v] decommission limit update to [%v]", ns.ID, maxParallel)
atomic.StoreInt32(&ns.decommissionParallelLimit, maxParallel)
}
func (ns *nodeSet) UpdateDecommissionDiskFactor(factor float64) {
log.LogDebugf("action[UpdateDecommissionFactor]nodeSet[%v] decommission disk factor update to [%v]", ns.ID, factor)
ns.diskParallelFactorLk.Lock()
defer ns.diskParallelFactorLk.Unlock()
ns.decommissionDiskParallelFactor = factor
}
func (ns *nodeSet) QueryDecommissionDiskLimit() int {
ns.diskParallelFactorLk.Lock()
defer ns.diskParallelFactorLk.Unlock()
log.LogDebugf("action[QueryDecommissionDiskLimit]nodeSet[%v] decommission disk limit to [%v]",
ns.ID, int(ns.decommissionDiskParallelFactor*float64(ns.dataNodeLen())))
return int(ns.decommissionDiskParallelFactor * float64(ns.dataNodeLen()))
}
func (ns *nodeSet) getDecommissionParallelStatus() (int32, int32, []uint64) {
return ns.decommissionDataPartitionList.getDecommissionParallelStatus()
}
func (ns *nodeSet) AcquireDecommissionToken(id uint64) bool {
return ns.decommissionDataPartitionList.acquireDecommissionToken(id)
}
func (ns *nodeSet) ReleaseDecommissionToken(id uint64) {
ns.decommissionDataPartitionList.releaseDecommissionToken(id)
}
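// Illustrative usage sketch (hypothetical helper, not called by the master): a data partition
// decommission worker is expected to take a token from its nodeset before starting and to
// return it when done, so that the per-nodeset parallel limit is honored.
func decommissionWithToken(ns *nodeSet, dpID uint64, work func() error) error {
if !ns.AcquireDecommissionToken(dpID) {
return fmt.Errorf("nodeset[%v] has no decommission token available for dp[%v]", ns.ID, dpID)
}
defer ns.ReleaseDecommissionToken(dpID)
return work()
}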
func (ns *nodeSet) AddDecommissionDisk(dd *DecommissionDisk) {
ns.DecommissionDisks.Store(dd.GenerateKey(), dd)
if dd.IsManualDecommissionDisk() {
ns.addManualDecommissionDisk(dd)
} else {
ns.addAutoDecommissionDisk(dd)
}
log.LogInfof("action[AddDecommissionDisk] add disk %v type %v to ns %v", dd.GenerateKey(), dd.Type, ns.ID)
}
func (ns *nodeSet) RemoveDecommissionDisk(dd *DecommissionDisk) {
ns.DecommissionDisks.Delete(dd.GenerateKey())
if dd.IsManualDecommissionDisk() {
ns.removeManualDecommissionDisk(dd)
} else {
ns.removeAutoDecommissionDisk(dd)
}
log.LogInfof("action[RemoveDecommissionDisk] remove disk %v type %v from ns %v", dd.GenerateKey(), dd.Type, ns.ID)
}
func (ns *nodeSet) addManualDecommissionDisk(dd *DecommissionDisk) {
ns.manualDecommissionDiskList.Put(ns.ID, dd)
}
func (ns *nodeSet) addAutoDecommissionDisk(dd *DecommissionDisk) {
ns.autoDecommissionDiskList.Put(ns.ID, dd)
}
func (ns *nodeSet) removeManualDecommissionDisk(dd *DecommissionDisk) {
ns.manualDecommissionDiskList.Remove(ns.ID, dd)
}
func (ns *nodeSet) removeAutoDecommissionDisk(dd *DecommissionDisk) {
ns.autoDecommissionDiskList.Remove(ns.ID, dd)
}
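// traverseDecommissionDisk runs on a fixed ticker (DecommissionInterval) on the raft leader.
// Each tick it refreshes the status of every tracked DecommissionDisk, drops finished, failed
// or paused disks from the tracking map, computes the per-nodeset disk limit as
// decommissionDiskParallelFactor * dataNodeLen (a factor of 0 means no limit), and then pops
// newly marked disks, manual list first and auto list only when auto decommission is enabled,
// up to the remaining capacity and hands them to Cluster.TryDecommissionDisk.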
func (ns *nodeSet) traverseDecommissionDisk(c *Cluster) {
t := time.NewTicker(DecommissionInterval)
// wait for all decommission disks to be loaded when metadata is reloaded
log.LogInfof("action[traverseDecommissionDisk]wait %v", ns.ID)
<-ns.startDecommissionDiskListTraverse
log.LogInfof("action[traverseDecommissionDisk] traverseDecommissionDisk start %v", ns.ID)
defer t.Stop()
for {
select {
case <-ns.doneDecommissionDiskListTraverse:
log.LogWarnf("traverse stopped")
return
case <-t.C:
if c.partition != nil && !c.partition.IsRaftLeader() {
log.LogWarnf("Leader changed, stop traverse!")
continue
}
runningCnt := 0
ns.DecommissionDisks.Range(func(key, value interface{}) bool {
disk := value.(*DecommissionDisk)
disk.updateDecommissionStatus(c, false)
status := disk.GetDecommissionStatus()
if status == DecommissionRunning {
runningCnt++
} else if status == DecommissionSuccess || status == DecommissionFail || status == DecommissionPause {
// remove from decommission disk list
log.LogWarnf("traverseDecommissionDisk remove disk %v status %v",
disk.GenerateKey(), disk.GetDecommissionStatus())
ns.RemoveDecommissionDisk(disk)
}
return true
})
ns.diskParallelFactorLk.Lock()
maxDiskDecommissionCnt := int(ns.decommissionDiskParallelFactor * float64(ns.dataNodeLen()))
ns.diskParallelFactorLk.Unlock()
if maxDiskDecommissionCnt == 0 && ns.dataNodeLen() != 0 {
manualCnt, manualDisks := ns.manualDecommissionDiskList.PopMarkDecommissionDisk(0)
log.LogDebugf("traverseDecommissionDisk traverse manualCnt %v",
manualCnt)
if manualCnt > 0 {
for _, disk := range manualDisks {
c.TryDecommissionDisk(disk)
}
}
if c.AutoDecommissionDiskIsEnabled() {
autoCnt, autoDisks := ns.autoDecommissionDiskList.PopMarkDecommissionDisk(0)
log.LogDebugf("traverseDecommissionDisk traverse autoCnt %v",
autoCnt)
if autoCnt > 0 {
for _, disk := range autoDisks {
c.TryDecommissionDisk(disk)
}
}
}
} else {
newDiskDecommissionCnt := maxDiskDecommissionCnt - runningCnt
log.LogDebugf("traverseDecommissionDisk traverse DiskDecommissionCnt %v",
newDiskDecommissionCnt)
if newDiskDecommissionCnt > 0 {
manualCnt, manualDisks := ns.manualDecommissionDiskList.PopMarkDecommissionDisk(newDiskDecommissionCnt)
log.LogDebugf("traverseDecommissionDisk traverse manualCnt %v",
manualCnt)
if manualCnt > 0 {
for _, disk := range manualDisks {
c.TryDecommissionDisk(disk)
}
}
if newDiskDecommissionCnt-manualCnt > 0 && c.AutoDecommissionDiskIsEnabled() {
autoCnt, autoDisks := ns.autoDecommissionDiskList.PopMarkDecommissionDisk(newDiskDecommissionCnt - manualCnt)
log.LogDebugf("traverseDecommissionDisk traverse autoCnt %v",
autoCnt)
if autoCnt > 0 {
for _, disk := range autoDisks {
c.TryDecommissionDisk(disk)
}
}
}
}
}
}
}
}
func (t *topology) isSingleZone() bool {
t.zoneLock.RLock()
defer t.zoneLock.RUnlock()
var zoneLen int
t.zoneMap.Range(func(zoneName, value interface{}) bool {
zoneLen++
return true
})
return zoneLen == 1
}
func (t *topology) getDomainExcludeZones() (zones []*Zone) {
t.zoneLock.RLock()
defer t.zoneLock.RUnlock()
zones = make([]*Zone, 0)
for i := 0; i < len(t.domainExcludeZones); i++ {
if value, ok := t.zoneMap.Load(t.domainExcludeZones[i]); ok {
zones = append(zones, value.(*Zone))
log.LogInfof("action[getDomainExcludeZones] append zone name:[%v]_[%v]", t.domainExcludeZones[i], value.(*Zone).name)
}
}
return
}
func (t *topology) getAllZones() (zones []*Zone) {
t.zoneLock.RLock()
defer t.zoneLock.RUnlock()
zones = make([]*Zone, 0)
t.zoneMap.Range(func(zoneName, value interface{}) bool {
zone := value.(*Zone)
zones = append(zones, zone)
return true
})
return
}
func (t *topology) getZoneByIndex(index int) (zone *Zone) {
t.zoneLock.RLock()
defer t.zoneLock.RUnlock()
return t.zones[index]
}
func (t *topology) getNodeSetByNodeSetId(nodeSetId uint64) (nodeSet *nodeSet, err error) {
zones := t.getAllZones()
for _, zone := range zones {
nodeSet, err = zone.getNodeSet(nodeSetId)
if err == nil {
return nodeSet, nil
}
}
return nil, errors.NewErrorf("set %v not found", nodeSetId)
}
func calculateDemandWriteNodes(zoneNum int, replicaNum int) (demandWriteNodes int) {
if zoneNum == 1 {
demandWriteNodes = replicaNum
} else {
if replicaNum == 1 {
demandWriteNodes = 1
} else {
demandWriteNodes = 2
}
}
return
}
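// Example (illustrative): with replicaNum=3 a single-zone request needs 3 writable nodes in
// that zone, while a cross-zone request needs only 2 writable nodes per candidate zone because
// the replicas are spread across zones; replicaNum=1 always needs just 1 writable node.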
func (t *topology) allocZonesForMetaNode(zoneNum, replicaNum int, excludeZone []string) (zones []*Zone, err error) {
if len(t.domainExcludeZones) > 0 {
zones = t.getDomainExcludeZones()
log.LogInfof("action[allocZonesForMetaNode] getDomainExcludeZones zones [%v]", t.domainExcludeZones)
} else {
// if the domain feature is enabled, execution will not reach here
zones = t.getAllZones()
}
if t.isSingleZone() {
return zones, nil
}
if excludeZone == nil {
excludeZone = make([]string, 0)
}
candidateZones := make([]*Zone, 0)
demandWriteNodes := calculateDemandWriteNodes(zoneNum, replicaNum)
for i := 0; i < len(zones); i++ {
if t.zoneIndexForMetaNode >= len(zones) {
t.zoneIndexForMetaNode = 0
}
zone := zones[t.zoneIndexForMetaNode]
t.zoneIndexForMetaNode++
if zone.status == unavailableZone {
continue
}
if contains(excludeZone, zone.name) {
continue
}
if zone.canWriteForMetaNode(uint8(demandWriteNodes)) {
candidateZones = append(candidateZones, zone)
}
if len(candidateZones) >= zoneNum {
break
}
}
// when allocating across zones, candidateZones must contain at least 2 zones; otherwise there must be at least one candidate zone
if (zoneNum >= 2 && len(candidateZones) < 2) || len(candidateZones) < 1 {
log.LogError(fmt.Sprintf("action[allocZonesForMetaNode],reqZoneNum[%v],candidateZones[%v],demandWriteNodes[%v],err:%v",
zoneNum, len(candidateZones), demandWriteNodes, proto.ErrNoZoneToCreateMetaPartition))
return nil, proto.ErrNoZoneToCreateMetaPartition
}
zones = candidateZones
err = nil
return
}
func (t *topology) allocZonesForDataNode(zoneNum, replicaNum int, excludeZone []string) (zones []*Zone, err error) {
// domain is enabled and there are old zones to be used
if len(t.domainExcludeZones) > 0 {
zones = t.getDomainExcludeZones()
} else {
// if the domain feature is enabled, execution will not reach here
zones = t.getAllZones()
}
log.LogInfof("len(zones) = %v \n", len(zones))
if t.isSingleZone() {
return zones, nil
}
if excludeZone == nil {
excludeZone = make([]string, 0)
}
demandWriteNodes := calculateDemandWriteNodes(zoneNum, replicaNum)
candidateZones := make([]*Zone, 0)
for i := 0; i < len(zones); i++ {
if t.zoneIndexForDataNode >= len(zones) {
t.zoneIndexForDataNode = 0
}
zone := zones[t.zoneIndexForDataNode]
t.zoneIndexForDataNode++
if zone.status == unavailableZone {
continue
}
if contains(excludeZone, zone.name) {
continue
}
if zone.canWriteForDataNode(uint8(demandWriteNodes)) {
candidateZones = append(candidateZones, zone)
}
if len(candidateZones) >= zoneNum {
break
}
}
// when allocating across zones, candidateZones must contain at least 2 zones; otherwise there must be at least one candidate zone
if (zoneNum >= 2 && len(candidateZones) < 2) || len(candidateZones) < 1 {
log.LogError(fmt.Sprintf("action[allocZonesForDataNode],reqZoneNum[%v],candidateZones[%v],demandWriteNodes[%v],err:%v",
zoneNum, len(candidateZones), demandWriteNodes, proto.ErrNoZoneToCreateDataPartition))
return nil, errors.NewError(proto.ErrNoZoneToCreateDataPartition)
}
zones = candidateZones
err = nil
return
}
func (ns *nodeSet) dataNodeCount() int {
var count int
ns.dataNodes.Range(func(key, value interface{}) bool {
count++
return true
})
return count
}
// Zone stores all the zone related information
type Zone struct {
name string
dataNodesetSelectorLock sync.RWMutex
dataNodesetSelector NodesetSelector
metaNodesetSelectorLock sync.RWMutex
metaNodesetSelector NodesetSelector
status int
dataNodes *sync.Map
metaNodes *sync.Map
nodeSetMap map[uint64]*nodeSet
nsLock sync.RWMutex
QosIopsRLimit uint64
QosIopsWLimit uint64
QosFlowRLimit uint64
QosFlowWLimit uint64
sync.RWMutex
}
type zoneValue struct {
Name string
QosIopsRLimit uint64
QosIopsWLimit uint64
QosFlowRLimit uint64
QosFlowWLimit uint64
DataNodesetSelector string
MetaNodesetSelector string
}
func newZone(name string) (zone *Zone) {
zone = &Zone{name: name}
zone.status = normalZone
zone.dataNodes = new(sync.Map)
zone.metaNodes = new(sync.Map)
zone.nodeSetMap = make(map[uint64]*nodeSet)
zone.dataNodesetSelector = NewNodesetSelector(DefaultNodesetSelectorName, DataNodeType)
zone.metaNodesetSelector = NewNodesetSelector(DefaultNodesetSelectorName, MetaNodeType)
return
}
func printZonesName(zones []*Zone) string {
str := "["
for i, zone := range zones {
if i > 0 {
str += ","
}
str += zone.name
}
return str + "]"
}
func (zone *Zone) GetDataNodesetSelector() string {
zone.dataNodesetSelectorLock.RLock()
defer zone.dataNodesetSelectorLock.RUnlock()
return zone.dataNodesetSelector.GetName()
}
func (zone *Zone) SetDataNodesetSelector(name string) {
zone.dataNodesetSelectorLock.Lock()
defer zone.dataNodesetSelectorLock.Unlock()
zone.dataNodesetSelector = NewNodesetSelector(name, DataNodeType)
}
func (zone *Zone) GetMetaNodesetSelector() string {
zone.metaNodesetSelectorLock.RLock()
defer zone.metaNodesetSelectorLock.RUnlock()
return zone.metaNodesetSelector.GetName()
}
func (zone *Zone) SetMetaNodeSelector(name string) {
zone.metaNodesetSelectorLock.Lock()
defer zone.metaNodesetSelectorLock.Unlock()
zone.metaNodesetSelector = NewNodesetSelector(name, MetaNodeType)
}
func (zone *Zone) getFsmValue() *zoneValue {
return &zoneValue{
Name: zone.name,
QosIopsRLimit: zone.QosIopsRLimit,
QosIopsWLimit: zone.QosIopsWLimit,
QosFlowRLimit: zone.QosFlowRLimit,
QosFlowWLimit: zone.QosFlowWLimit,
DataNodesetSelector: zone.GetDataNodesetSelector(),
MetaNodesetSelector: zone.GetMetaNodesetSelector(),
}
}
func (zone *Zone) setStatus(status int) {
zone.status = status
}
func (zone *Zone) getStatus() int {
return zone.status
}
func (zone *Zone) getStatusToString() string {
if zone.status == normalZone {
return "available"
} else {
return "unavailable"
}
}
func (zone *Zone) isSingleNodeSet() bool {
zone.RLock()
defer zone.RUnlock()
return len(zone.nodeSetMap) == 1
}
func (zone *Zone) getNodeSet(setID uint64) (ns *nodeSet, err error) {
zone.nsLock.RLock()
defer zone.nsLock.RUnlock()
ns, ok := zone.nodeSetMap[setID]
if !ok {
return nil, errors.NewErrorf("set %v not found", setID)
}
return
}
func (zone *Zone) putNodeSet(ns *nodeSet) (err error) {
zone.nsLock.Lock()
defer zone.nsLock.Unlock()
if _, ok := zone.nodeSetMap[ns.ID]; ok {
return fmt.Errorf("nodeSet [%v] has exist", ns.ID)
}
zone.nodeSetMap[ns.ID] = ns
return
}
func (zone *Zone) createNodeSet(c *Cluster) (ns *nodeSet, err error) {
cnt := 1
allNodeSet := zone.getAllNodeSet()
log.LogInfof("action[createNodeSet] zone[%v] FaultDomain:[%v] init[%v] DefaultNormalZoneCnt[%v] nodeset cnt[%v]",
zone.name, c.FaultDomain, c.domainManager.init, c.cfg.DefaultNormalZoneCnt, len(allNodeSet))
if c.FaultDomain && c.domainManager.init && c.cfg.DefaultNormalZoneCnt < defaultReplicaNum {
if _, ok := c.domainManager.excludeZoneListDomain[zone.name]; !ok {
dstNsCnt := 0
if c.cfg.DefaultNormalZoneCnt == 1 { // a single-zone domain needs 3 nodesets at the beginning
dstNsCnt = 3
} else {
dstNsCnt = 2 // a two-zone domain needs 2 nodesets in each zone
}
if len(allNodeSet) < dstNsCnt {
log.LogInfof("action[createNodeSet] zone[%v] nodeset len:[%v] less then 3,create to 3 one time",
zone.name, len(allNodeSet))
cnt = dstNsCnt - len(allNodeSet)
}
} else {
log.LogInfof("action[createNodeSet] zone[%v] get in excludeZoneListDomain", zone.name)
}
}
for {
if cnt == 0 {
break
}
cnt--
id, err := c.idAlloc.allocateCommonID()
if err != nil {
return nil, err
}
ns = newNodeSet(c, id, c.cfg.nodeSetCapacity, zone.name)
ns.UpdateMaxParallel(int32(c.DecommissionLimit))
ns.UpdateDecommissionDiskFactor(c.DecommissionDiskFactor)
ns.startDecommissionSchedule()
log.LogInfof("action[createNodeSet] syncAddNodeSet[%v] zonename[%v]", ns.ID, zone.name)
if err = c.syncAddNodeSet(ns); err != nil {
return nil, err
}
if err = zone.putNodeSet(ns); err != nil {
return nil, err
}
log.LogInfof("action[createNodeSet] nodeSet[%v]", ns.ID)
}
return
}
func (zone *Zone) getAllNodeSet() (nsc nodeSetCollection) {
zone.nsLock.RLock()
defer zone.nsLock.RUnlock()
nsc = make(nodeSetCollection, 0)
for _, ns := range zone.nodeSetMap {
nsc = append(nsc, ns)
}
return
}
func (zone *Zone) getAvailNodeSetForMetaNode() (nset *nodeSet) {
allNodeSet := zone.getAllNodeSet()
sort.Sort(sort.Reverse(allNodeSet))
for _, ns := range allNodeSet {
if ns.metaNodeLen() < ns.Capacity {
if nset == nil {
nset = ns
} else {
if nset.Capacity-nset.metaNodeLen() < ns.Capacity-ns.metaNodeLen() {
nset = ns
}
}
continue
}
}
return
}
func (zone *Zone) getAvailNodeSetForDataNode() (nset *nodeSet) {
allNodeSet := zone.getAllNodeSet()
for _, ns := range allNodeSet {
if ns.dataNodeLen() < ns.Capacity {
if nset == nil {
nset = ns
} else {
if nset.Capacity-nset.dataNodeLen() < ns.Capacity-ns.dataNodeLen() {
nset = ns
}
}
continue
}
}
return
}
func (zone *Zone) putDataNode(dataNode *DataNode) (err error) {
var ns *nodeSet
if ns, err = zone.getNodeSet(dataNode.NodeSetID); err != nil {
log.LogErrorf("action[putDataNode] nodeSet[%v] not found", dataNode.NodeSetID)
return
}
ns.putDataNode(dataNode)
zone.dataNodes.Store(dataNode.Addr, dataNode)
return
}
func (zone *Zone) getDataNode(addr string) (dataNode *DataNode, err error) {
value, ok := zone.dataNodes.Load(addr)
if !ok {
return nil, errors.Trace(dataNodeNotFound(addr), "%v not found", addr)
}
dataNode = value.(*DataNode)
return
}
func (zone *Zone) deleteDataNode(dataNode *DataNode) {
ns, err := zone.getNodeSet(dataNode.NodeSetID)
if err != nil {
log.LogErrorf("action[zoneDeleteDataNode] nodeSet[%v] not found", dataNode.NodeSetID)
return
}
ns.deleteDataNode(dataNode)
zone.dataNodes.Delete(dataNode.Addr)
}
func (zone *Zone) putMetaNode(metaNode *MetaNode) (err error) {
var ns *nodeSet
if ns, err = zone.getNodeSet(metaNode.NodeSetID); err != nil {
log.LogErrorf("action[zonePutMetaNode] nodeSet[%v] not found", metaNode.NodeSetID)
return
}
ns.putMetaNode(metaNode)
zone.metaNodes.Store(metaNode.Addr, metaNode)
return
}
func (zone *Zone) deleteMetaNode(metaNode *MetaNode) (err error) {
ns, err := zone.getNodeSet(metaNode.NodeSetID)
if err != nil {
log.LogErrorf("action[zoneDeleteMetaNode] nodeSet[%v] not found", metaNode.NodeSetID)
return
}
ns.deleteMetaNode(metaNode)
zone.metaNodes.Delete(metaNode.Addr)
return
}
func (zone *Zone) allocNodeSetForDataNode(excludeNodeSets []uint64, replicaNum uint8) (ns *nodeSet, err error) {
nset := zone.getAllNodeSet()
if len(nset) == 0 {
return nil, errors.NewError(proto.ErrNoNodeSetToCreateDataPartition)
}
zone.nsLock.Lock()
defer zone.nsLock.Unlock()
// we need a read lock to block modification of the nodeset selector
zone.dataNodesetSelectorLock.RLock()
defer zone.dataNodesetSelectorLock.RUnlock()
ns, err = zone.dataNodesetSelector.Select(nset, excludeNodeSets, replicaNum)
if err != nil {
log.LogErrorf("action[allocNodeSetForDataNode],nset len[%v],excludeNodeSets[%v],rNum[%v] err:%v",
nset.Len(), excludeNodeSets, replicaNum, proto.ErrNoNodeSetToCreateDataPartition)
return nil, errors.NewError(proto.ErrNoNodeSetToCreateDataPartition)
}
return ns, nil
}
func (zone *Zone) allocNodeSetForMetaNode(excludeNodeSets []uint64, replicaNum uint8) (ns *nodeSet, err error) {
nset := zone.getAllNodeSet()
if len(nset) == 0 {
return nil, proto.ErrNoNodeSetToCreateMetaPartition
}
zone.nsLock.Lock()
defer zone.nsLock.Unlock()
// we need a read lock to block modification of the nodeset selector
zone.metaNodesetSelectorLock.RLock()
defer zone.metaNodesetSelectorLock.RUnlock()
ns, err = zone.metaNodesetSelector.Select(nset, excludeNodeSets, replicaNum)
if err != nil {
log.LogError(fmt.Sprintf("action[allocNodeSetForMetaNode],zone[%v],excludeNodeSets[%v],rNum[%v],err:%v",
zone.name, excludeNodeSets, replicaNum, proto.ErrNoNodeSetToCreateMetaPartition))
return nil, proto.ErrNoNodeSetToCreateMetaPartition
}
return ns, nil
}
func (zone *Zone) canWriteForDataNode(replicaNum uint8) (can bool) {
zone.RLock()
defer zone.RUnlock()
var leastAlive uint8
zone.dataNodes.Range(func(addr, value interface{}) bool {
dataNode := value.(*DataNode)
if !dataNode.dpCntInLimit() {
return true
}
if dataNode.isActive && dataNode.isWriteAbleWithSize(30*util.GB) {
leastAlive++
}
if leastAlive >= replicaNum {
can = true
return false
}
return true
})
log.LogInfof("canWriteForDataNode leastAlive[%v],replicaNum[%v],count[%v]\n", leastAlive, replicaNum, zone.dataNodeCount())
return
}
func (zone *Zone) isUsedRatio(ratio float64) (can bool) {
zone.RLock()
defer zone.RUnlock()
var (
dataNodeUsed uint64
dataNodeTotal uint64
metaNodeUsed uint64
metaNodeTotal uint64
)
zone.dataNodes.Range(func(addr, value interface{}) bool {
dataNode := value.(*DataNode)
if dataNode.isActive {
dataNodeUsed += dataNode.Used
} else {
dataNodeUsed += dataNode.Total
}
dataNodeTotal += dataNode.Total
return true
})
if float64(dataNodeUsed)/float64(dataNodeTotal) > ratio {
log.LogInfof("action[isUsedRatio] zone[%v] dataNodeUsed [%v] total [%v], ratio[%v]", zone.name, dataNodeUsed, dataNodeTotal, ratio)
return true
}
zone.metaNodes.Range(func(addr, value interface{}) bool {
metaNode := value.(*MetaNode)
if metaNode.IsActive && metaNode.isWritable() {
metaNodeUsed += metaNode.Used
} else {
metaNodeUsed += metaNode.Total
}
metaNodeTotal += metaNode.Total
return true
})
if float64(metaNodeUsed)/float64(metaNodeTotal) > ratio {
log.LogInfof("action[isUsedRatio] zone[%v] metaNodeUsed [%v] total [%v], ratio[%v]", zone.name, metaNodeUsed, metaNodeTotal, ratio)
return true
}
return false
}
func (zone *Zone) getDataUsed() (dataNodeUsed uint64, dataNodeTotal uint64) {
zone.RLock()
defer zone.RUnlock()
zone.dataNodes.Range(func(addr, value interface{}) bool {
dataNode := value.(*DataNode)
if dataNode.isActive {
dataNodeUsed += dataNode.Used
} else {
dataNodeUsed += dataNode.Total
}
dataNodeTotal += dataNode.Total
return true
})
return dataNodeUsed, dataNodeTotal
}
func (zone *Zone) getMetaUsed() (metaNodeUsed uint64, metaNodeTotal uint64) {
zone.RLock()
defer zone.RUnlock()
zone.metaNodes.Range(func(addr, value interface{}) bool {
metaNode := value.(*MetaNode)
if metaNode.IsActive && metaNode.isWritable() {
metaNodeUsed += metaNode.Used
} else {
metaNodeUsed += metaNode.Total
}
metaNodeTotal += metaNode.Total
return true
})
return metaNodeUsed, metaNodeTotal
}
func (zone *Zone) getSpaceLeft(dataType uint32) (spaceLeft uint64) {
if dataType == TypeDataPartition {
dataNodeUsed, dataNodeTotal := zone.getDataUsed()
return dataNodeTotal - dataNodeUsed
} else {
metaNodeUsed, metaNodeTotal := zone.getMetaUsed()
return metaNodeTotal - metaNodeUsed
}
}
func (zone *Zone) canWriteForMetaNode(replicaNum uint8) (can bool) {
zone.RLock()
defer zone.RUnlock()
var leastAlive uint8
zone.metaNodes.Range(func(addr, value interface{}) bool {
metaNode := value.(*MetaNode)
if metaNode.IsActive && metaNode.isWritable() {
leastAlive++
}
if leastAlive >= replicaNum {
can = true
return false
}
return true
})
return
}
func (zone *Zone) getDataNodeMaxTotal() (maxTotal uint64) {
zone.dataNodes.Range(func(key, value interface{}) bool {
dataNode := value.(*DataNode)
if dataNode.Total > maxTotal {
maxTotal = dataNode.Total
}
return true
})
return
}
func (zone *Zone) getAvailNodeHosts(nodeType uint32, excludeNodeSets []uint64, excludeHosts []string, replicaNum int) (newHosts []string, peers []proto.Peer, err error) {
if replicaNum == 0 {
return
}
log.LogDebugf("[x] get node host, zone(%s), nodeType(%d)", zone.name, nodeType)
if nodeType == TypeDataPartition {
ns, err := zone.allocNodeSetForDataNode(excludeNodeSets, uint8(replicaNum))
if err != nil {
return nil, nil, errors.Trace(err, "zone[%v] alloc node set,replicaNum[%v]", zone.name, replicaNum)
}
return ns.getAvailDataNodeHosts(excludeHosts, replicaNum)
}
ns, err := zone.allocNodeSetForMetaNode(excludeNodeSets, uint8(replicaNum))
if err != nil {
return nil, nil, errors.NewErrorf("zone[%v],err[%v]", zone.name, err)
}
return ns.getAvailMetaNodeHosts(excludeHosts, replicaNum)
}
func (zone *Zone) updateNodesetSelector(cluster *Cluster, dataNodesetSelector string, metaNodesetSelector string) error {
needSync := false
if dataNodesetSelector != "" && dataNodesetSelector != zone.GetDataNodesetSelector() {
needSync = true
zone.SetDataNodesetSelector(dataNodesetSelector)
}
if metaNodesetSelector != "" && metaNodesetSelector != zone.GetMetaNodesetSelector() {
needSync = true
zone.SetMetaNodeSelector(metaNodesetSelector)
}
if !needSync {
return nil
}
return cluster.sycnPutZoneInfo(zone)
}
func (zone *Zone) updateDataNodeQosLimit(cluster *Cluster, qosParam *qosArgs) error {
var err error
if qosParam.flowRVal > 0 {
zone.QosFlowRLimit = qosParam.flowRVal
}
if qosParam.flowWVal > 0 {
zone.QosFlowWLimit = qosParam.flowWVal
}
if qosParam.iopsRVal > 0 {
zone.QosIopsRLimit = qosParam.iopsRVal
}
if qosParam.iopsWVal > 0 {
zone.QosIopsWLimit = qosParam.iopsWVal
}
if err = cluster.sycnPutZoneInfo(zone); err != nil {
return err
}
zone.dataNodes.Range(func(key, value interface{}) bool {
dataNode := value.(*DataNode)
if qosParam.flowRVal > 0 {
dataNode.QosFlowRLimit = qosParam.flowRVal
}
if qosParam.flowWVal > 0 {
dataNode.QosFlowWLimit = qosParam.flowWVal
}
if qosParam.iopsRVal > 0 {
dataNode.QosIopsRLimit = qosParam.iopsRVal
}
if qosParam.iopsWVal > 0 {
dataNode.QosIopsWLimit = qosParam.iopsWVal
}
return true
})
return nil
}
func (zone *Zone) loadDataNodeQosLimit() {
zone.dataNodes.Range(func(key, value interface{}) bool {
dataNode := value.(*DataNode)
if zone.QosFlowRLimit > 0 {
dataNode.QosFlowRLimit = zone.QosFlowRLimit
}
if zone.QosFlowWLimit > 0 {
dataNode.QosFlowWLimit = zone.QosFlowWLimit
}
if zone.QosIopsRLimit > 0 {
dataNode.QosIopsRLimit = zone.QosIopsRLimit
}
if zone.QosIopsWLimit > 0 {
dataNode.QosIopsWLimit = zone.QosIopsWLimit
}
return true
})
}
func (zone *Zone) dataNodeCount() (len int) {
zone.dataNodes.Range(func(key, value interface{}) bool {
len++
return true
})
return
}
func (zone *Zone) updateDecommissionLimit(limit int32, c *Cluster) (err error) {
nodeSets := zone.getAllNodeSet()
if len(nodeSets) == 0 {
log.LogWarnf("Nodeset from %v is empty", zone.name)
return proto.ErrNoNodeSetToUpdateDecommissionLimit
}
for _, ns := range nodeSets {
ns.UpdateMaxParallel(limit)
if err = c.syncUpdateNodeSet(ns); err != nil {
log.LogWarnf("UpdateMaxParallel nodeset [%v] failed,err:%v", ns.ID, err.Error())
continue
}
}
log.LogInfof("All nodeset from %v set decommission limit to %v", zone.name, limit)
return
}
func (zone *Zone) updateDecommissionDiskFactor(factor float64, c *Cluster) (err error) {
nodeSets := zone.getAllNodeSet()
if len(nodeSets) == 0 {
log.LogWarnf("Nodeset from %v is empty", zone.name)
return proto.ErrNoNodeSetToUpdateDecommissionDiskFactor
}
for _, ns := range nodeSets {
ns.UpdateDecommissionDiskFactor(factor)
if err = c.syncUpdateNodeSet(ns); err != nil {
log.LogWarnf("updateDecommissionDiskFactor nodeset [%v] failed,err:%v", ns.ID, err.Error())
continue
}
}
log.LogInfof("All nodeset from %v set decommission disk factor to %v", zone.name, factor)
return
}
func (zone *Zone) queryDecommissionDiskLimit() (err error, diskLimit []proto.DecommissionDiskLimitDetail) {
nodeSets := zone.getAllNodeSet()
diskLimit = make([]proto.DecommissionDiskLimitDetail, 0)
if len(nodeSets) == 0 {
log.LogWarnf("Nodeset from %v is empty", zone.name)
return proto.ErrNoNodeSetToQueryDecommissionDiskLimit, nil
}
for _, ns := range nodeSets {
limit := ns.QueryDecommissionDiskLimit()
diskLimit = append(diskLimit, proto.DecommissionDiskLimitDetail{NodeSetId: ns.ID, Limit: limit})
}
log.LogInfof("All nodeset from %v set decommission disk limit %v", zone.name, diskLimit)
return
}
func (zone *Zone) queryDecommissionParallelStatus() (err error, stats []nodeSetDecommissionParallelStatus) {
nodeSets := zone.getAllNodeSet()
if len(nodeSets) == 0 {
log.LogWarnf("Nodeset from %v is empty", zone.name)
return proto.ErrNoNodeSetToQueryDecommissionLimitStatus, stats
}
for _, ns := range nodeSets {
curToken, maxToken, dps := ns.getDecommissionParallelStatus()
stat := nodeSetDecommissionParallelStatus{
ID: ns.ID,
CurTokenNum: curToken,
MaxTokenNum: maxToken,
RunningDp: dps,
}
stats = append(stats, stat)
}
log.LogInfof("All nodeset from %v decommission limit status %v", zone.name, stats)
return
}
func (zone *Zone) startDecommissionListTraverse(c *Cluster) (err error) {
nodeSets := zone.getAllNodeSet()
log.LogDebugf("startDecommissionListTraverse nodeSets len %v ", len(nodeSets))
if len(nodeSets) == 0 {
log.LogWarnf("action[startDecommissionListTraverse] Nodeset form %v is nil", zone.name)
return nil
}
for _, ns := range nodeSets {
log.LogInfof("action[startDecommissionListTraverse] ns[%v] from zone %v", ns.ID, zone.name)
ns.startDecommissionSchedule()
}
log.LogInfof("action[startDecommissionListTraverse] All nodeset from %v start decommission schedule", zone.name)
return
}
type DecommissionDataPartitionList struct {
mu sync.Mutex
cacheMap map[uint64]*list.Element
decommissionList *list.List
done chan struct{}
parallelLimit int32
curParallel int32
start chan struct{}
runningMap map[uint64]struct{}
}
type DecommissionDataPartitionListValue struct {
DecommissionDataPartitionCacheValue
ParallelLimit int32
CurParallel int32
}
type DecommissionDataPartitionCacheValue struct {
CacheMap []dataPartitionValue
Status uint32
}
const DecommissionInterval = 5 * time.Second
func NewDecommissionDataPartitionList(c *Cluster) *DecommissionDataPartitionList {
l := new(DecommissionDataPartitionList)
l.mu = sync.Mutex{}
l.cacheMap = make(map[uint64]*list.Element)
l.done = make(chan struct{}, 1)
l.start = make(chan struct{}, 1)
l.decommissionList = list.New()
l.runningMap = make(map[uint64]struct{})
atomic.StoreInt32(&l.curParallel, 0)
atomic.StoreInt32(&l.parallelLimit, defaultDecommissionParallelLimit)
go l.traverse(c)
return l
}
// reserved
func (l *DecommissionDataPartitionList) Stop() {
l.done <- struct{}{}
}
func (l *DecommissionDataPartitionList) Length() int {
l.mu.Lock()
defer l.mu.Unlock()
return l.decommissionList.Len()
}
func (l *DecommissionDataPartitionList) Put(id uint64, value *DataPartition, c *Cluster) {
if value == nil {
log.LogWarnf("action[DecommissionDataPartitionListPut] ns[%v] cannot put nil value", id)
return
}
// only dps in running, mark, or prepare status can be added
if !value.canAddToDecommissionList() {
log.LogWarnf("action[DecommissionDataPartitionListPut] ns[%v] put wrong dp[%v] status[%v]",
id, value.PartitionID, value.GetDecommissionStatus())
return
}
// reset prepare status back to mark status so the decommission can be retried
if value.GetDecommissionStatus() == DecommissionPrepare {
value.SetDecommissionStatus(markDecommission)
}
l.mu.Lock()
if _, ok := l.cacheMap[value.PartitionID]; ok {
l.mu.Unlock()
return
}
elm := l.decommissionList.PushBack(value)
l.cacheMap[value.PartitionID] = elm
l.mu.Unlock()
// restore from rocksdb
if value.checkConsumeToken() {
value.TryAcquireDecommissionToken(c)
}
log.LogInfof("action[DecommissionDataPartitionListPut] ns[%v] add dp[%v] status[%v] isRecover[%v]",
id, value.PartitionID, value.GetDecommissionStatus(), value.isRecover)
}
func (l *DecommissionDataPartitionList) Remove(value *DataPartition) {
if value == nil {
log.LogWarnf("Cannot remove nil value")
return
}
l.mu.Lock()
defer l.mu.Unlock()
if elm, ok := l.cacheMap[value.PartitionID]; ok {
delete(l.cacheMap, value.PartitionID)
l.decommissionList.Remove(elm)
log.LogDebugf("Remove dp[%v]", value.PartitionID)
}
}
func (l *DecommissionDataPartitionList) getDecommissionParallelStatus() (int32, int32, []uint64) {
l.mu.Lock()
defer l.mu.Unlock()
dps := make([]uint64, 0)
for id := range l.runningMap {
dps = append(dps, id)
}
return atomic.LoadInt32(&l.curParallel), atomic.LoadInt32(&l.parallelLimit), dps
}
func (l *DecommissionDataPartitionList) updateMaxParallel(maxParallel int32) {
atomic.StoreInt32(&l.parallelLimit, maxParallel)
}
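// A parallelLimit of 0 is treated as "no limit": acquireDecommissionToken always succeeds in
// that case, while curParallel and runningMap are still maintained so that the parallel status
// query keeps reporting the running data partitions.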
func (l *DecommissionDataPartitionList) acquireDecommissionToken(id uint64) bool {
if atomic.LoadInt32(&l.parallelLimit) == 0 {
l.mu.Lock()
l.runningMap[id] = struct{}{}
atomic.StoreInt32(&l.curParallel, int32(len(l.runningMap)))
l.mu.Unlock()
return true
}
if atomic.LoadInt32(&l.curParallel) >= atomic.LoadInt32(&l.parallelLimit) {
return false
}
l.mu.Lock()
l.runningMap[id] = struct{}{}
atomic.StoreInt32(&l.curParallel, int32(len(l.runningMap)))
l.mu.Unlock()
return true
}
func (l *DecommissionDataPartitionList) releaseDecommissionToken(id uint64) {
l.mu.Lock()
defer l.mu.Unlock()
if _, ok := l.runningMap[id]; !ok {
return
}
delete(l.runningMap, id)
atomic.StoreInt32(&l.curParallel, int32(len(l.runningMap)))
}
func (l *DecommissionDataPartitionList) GetAllDecommissionDataPartitions() (collection []*DataPartition) {
l.mu.Lock()
defer l.mu.Unlock()
collection = make([]*DataPartition, 0, l.decommissionList.Len())
for elm := l.decommissionList.Front(); elm != nil; elm = elm.Next() {
collection = append(collection, elm.Value.(*DataPartition))
}
return collection
}
func (l *DecommissionDataPartitionList) startTraverse() {
l.start <- struct{}{}
}
func (l *DecommissionDataPartitionList) traverse(c *Cluster) {
t := time.NewTicker(DecommissionInterval)
// wait for all dps to be loaded when metadata is reloaded
<-l.start
defer t.Stop()
for {
select {
case <-l.done:
log.LogWarnf("traverse stopped!")
return
case <-t.C:
if c.partition != nil && !c.partition.IsRaftLeader() {
log.LogWarnf("Leader changed, stop traverse!")
continue
}
allDecommissionDP := l.GetAllDecommissionDataPartitions()
for _, dp := range allDecommissionDP {
if dp.IsDecommissionSuccess() {
log.LogDebugf("action[DecommissionListTraverse]Remove dp[%v] for success",
dp.PartitionID)
l.Remove(dp)
dp.ReleaseDecommissionToken(c)
dp.ResetDecommissionStatus()
c.syncUpdateDataPartition(dp)
} else if dp.IsDecommissionFailed() {
if !dp.tryRollback(c) {
dp.restoreReplica(c)
log.LogDebugf("action[DecommissionListTraverse]Remove dp[%v] for fail",
dp.PartitionID)
l.Remove(dp)
}
// both rollback failure and success need to release the token
dp.ReleaseDecommissionToken(c)
} else if dp.IsDecommissionPaused() {
log.LogDebugf("action[DecommissionListTraverse]Remove dp[%v] for paused ",
dp.PartitionID)
dp.ReleaseDecommissionToken(c)
l.Remove(dp)
} else if dp.IsDecommissionInitial() { // fix is done, no token to release
l.Remove(dp)
dp.ResetDecommissionStatus()
c.syncUpdateDataPartition(dp)
} else if dp.IsMarkDecommission() && dp.TryAcquireDecommissionToken(c) {
// TODO: decommission here
go func(dp *DataPartition) {
if !dp.TryToDecommission(c) {
// retry should release token
if dp.IsMarkDecommission() {
dp.ReleaseDecommissionToken(c)
}
}
}(dp) // a special replica count takes some time to go from prepare to running
}
}
}
}
}
type DecommissionDiskList struct {
mu sync.Mutex
cacheMap map[string]*list.Element
decommissionList *list.List
}
func NewDecommissionDiskList() *DecommissionDiskList {
l := new(DecommissionDiskList)
l.mu = sync.Mutex{}
l.cacheMap = make(map[string]*list.Element)
l.decommissionList = list.New()
return l
}
func (l *DecommissionDiskList) Put(nsId uint64, value *DecommissionDisk) {
if value == nil {
log.LogWarnf("action[DecommissionDataPartitionListPut] ns[%v] cannot put nil value", nsId)
return
}
// only disks in running or mark status can be added
if !value.canAddToDecommissionList() {
log.LogWarnf("action[DecommissionDataPartitionListPut] ns[%v] put wrong disk[%v] status[%v]",
nsId, value.GenerateKey(), value.GetDecommissionStatus())
return
}
l.mu.Lock()
defer l.mu.Unlock()
if _, ok := l.cacheMap[value.GenerateKey()]; ok {
return
}
elm := l.decommissionList.PushBack(value)
l.cacheMap[value.GenerateKey()] = elm
log.LogDebugf("action[DecommissionDataPartitionListPut] ns[%v] add disk[%v] status[%v] type[%v]",
nsId, value.GenerateKey(), value.GetDecommissionStatus(), value.Type)
}
func (l *DecommissionDiskList) Remove(nsId uint64, value *DecommissionDisk) {
if value == nil {
log.LogWarnf("action[DecommissionDataPartitionListRemove] ns[%v]Cannot remove nil value", nsId)
return
}
l.mu.Lock()
defer l.mu.Unlock()
if elm, ok := l.cacheMap[value.GenerateKey()]; ok {
delete(l.cacheMap, value.GenerateKey())
l.decommissionList.Remove(elm)
log.LogDebugf("action[DecommissionDataPartitionListRemove] ns[%v] remove disk[%v]", nsId, value.GenerateKey())
}
}
func (l *DecommissionDiskList) Length() int {
l.mu.Lock()
defer l.mu.Unlock()
return l.decommissionList.Len()
}
// pop only decommission disks in markDecommission status, starting from the front; a limit of 0 means no limit
func (l *DecommissionDiskList) PopMarkDecommissionDisk(limit int) (count int, collection []*DecommissionDisk) {
l.mu.Lock()
defer l.mu.Unlock()
collection = make([]*DecommissionDisk, 0)
for elm := l.decommissionList.Front(); elm != nil; elm = elm.Next() {
if count == limit && limit != 0 {
break
}
disk := elm.Value.(*DecommissionDisk)
if disk.GetDecommissionStatus() != markDecommission {
continue
}
collection = append(collection, disk)
count++
log.LogDebugf("action[PopMarkDecommissionDisk] pop disk[%v]", disk)
}
return count, collection
}
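// Illustrative usage sketch (hypothetical helper): drain up to n marked disks from a list and
// hand them to the cluster, mirroring what traverseDecommissionDisk does on every tick for the
// manual and auto lists.
func popAndDecommission(c *Cluster, l *DecommissionDiskList, n int) {
cnt, disks := l.PopMarkDecommissionDisk(n)
log.LogDebugf("action[popAndDecommission] popped %v marked disks", cnt)
for _, disk := range disks {
c.TryDecommissionDisk(disk)
}
}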
package master
import (
"crypto/sha1"
"encoding/hex"
"io"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/raftstore"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
const (
accessKeyLength = 16
secretKeyLength = 32
RootUserID = "root"
DefaultRootPasswd = "CubeFSRoot"
DefaultUserPassword = "CubeFSUser"
)
type User struct {
fsm *MetadataFsm
partition raftstore.Partition
userStore sync.Map // K: userID, V: UserInfo
AKStore sync.Map // K: ak, V: userID
volUser sync.Map // K: vol, V: userIDs
userStoreMutex sync.RWMutex
AKStoreMutex sync.RWMutex
volUserMutex sync.RWMutex
}
func newUser(fsm *MetadataFsm, partition raftstore.Partition) (u *User) {
u = new(User)
u.fsm = fsm
u.partition = partition
return
}
func (u *User) createKey(param *proto.UserCreateParam) (userInfo *proto.UserInfo, err error) {
var (
AKUser *proto.AKUser
userPolicy *proto.UserPolicy
exist bool
)
if param.ID == "" {
err = proto.ErrInvalidUserID
return
}
if !param.Type.Valid() {
err = proto.ErrInvalidUserType
return
}
userID := param.ID
password := param.Password
if password == "" {
password = DefaultUserPassword
}
accessKey := param.AccessKey
if accessKey == "" {
accessKey = util.RandomString(accessKeyLength, util.Numeric|util.LowerLetter|util.UpperLetter)
} else {
if !proto.IsValidAK(accessKey) {
err = proto.ErrInvalidAccessKey
return
}
}
secretKey := param.SecretKey
if secretKey == "" {
secretKey = util.RandomString(secretKeyLength, util.Numeric|util.LowerLetter|util.UpperLetter)
} else {
if !proto.IsValidSK(secretKey) {
err = proto.ErrInvalidSecretKey
return
}
}
userType := param.Type
description := param.Description
u.userStoreMutex.Lock()
defer u.userStoreMutex.Unlock()
u.AKStoreMutex.Lock()
defer u.AKStoreMutex.Unlock()
// check duplicate
if _, exist = u.userStore.Load(userID); exist {
err = proto.ErrDuplicateUserID
return
}
_, exist = u.AKStore.Load(accessKey)
for exist {
accessKey = util.RandomString(accessKeyLength, util.Numeric|util.LowerLetter|util.UpperLetter)
_, exist = u.AKStore.Load(accessKey)
}
userPolicy = proto.NewUserPolicy()
userInfo = &proto.UserInfo{
UserID: userID, AccessKey: accessKey, SecretKey: secretKey, Policy: userPolicy,
UserType: userType, CreateTime: time.Unix(time.Now().Unix(), 0).Format(proto.TimeFormat), Description: description,
}
AKUser = &proto.AKUser{AccessKey: accessKey, UserID: userID, Password: encodingPassword(password)}
if err = u.syncAddUserInfo(userInfo); err != nil {
return
}
if err = u.syncAddAKUser(AKUser); err != nil {
return
}
u.userStore.Store(userID, userInfo)
u.AKStore.Store(accessKey, AKUser)
return
}
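// Illustrative caller sketch (hypothetical): creating a user only requires an ID and a valid
// type; an empty password falls back to DefaultUserPassword and missing keys are generated.
// proto.UserTypeNormal is assumed here to be the ordinary non-root user type.
func createExampleUser(u *User) (*proto.UserInfo, error) {
return u.createKey(&proto.UserCreateParam{
ID: "demo-user", // hypothetical user ID
Type: proto.UserTypeNormal,
})
}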
func (u *User) deleteKey(userID string) (err error) {
var (
akUser *proto.AKUser
userInfo *proto.UserInfo
)
u.userStoreMutex.Lock()
defer u.userStoreMutex.Unlock()
u.AKStoreMutex.Lock()
defer u.AKStoreMutex.Unlock()
if value, exist := u.userStore.Load(userID); !exist {
err = proto.ErrUserNotExists
return
} else {
userInfo = value.(*proto.UserInfo)
}
userInfo.Mu.Lock()
defer userInfo.Mu.Unlock()
if len(userInfo.Policy.OwnVols) > 0 {
err = proto.ErrOwnVolExists
return
}
if userInfo.UserType == proto.UserTypeRoot {
err = proto.ErrNoPermission
return
}
if akUser, err = u.getAKUser(userInfo.AccessKey); err != nil {
return
}
if err = u.syncDeleteUserInfo(userInfo); err != nil {
return
}
if err = u.syncDeleteAKUser(akUser); err != nil {
return
}
u.userStore.Delete(userID)
u.AKStore.Delete(akUser.AccessKey)
// delete userID from related policy in volUserStore
u.removeUserFromAllVol(userID)
log.LogInfof("action[deleteUser], userID: %v, accesskey[%v]", userID, userInfo.AccessKey)
return
}
func (u *User) updateKey(param *proto.UserUpdateParam) (userInfo *proto.UserInfo, err error) {
if param.UserID == "" {
err = proto.ErrInvalidUserID
return
}
u.userStoreMutex.Lock()
defer u.userStoreMutex.Unlock()
u.AKStoreMutex.Lock()
defer u.AKStoreMutex.Unlock()
if value, exist := u.userStore.Load(param.UserID); !exist {
err = proto.ErrUserNotExists
return
} else {
userInfo = value.(*proto.UserInfo)
}
userInfo.Mu.Lock()
defer userInfo.Mu.Unlock()
if userInfo.UserType == proto.UserTypeRoot {
err = proto.ErrNoPermission
return
}
formerAK := userInfo.AccessKey
var akMark, skMark, typeMark, describeMark int
if param.AccessKey != "" {
if !proto.IsValidAK(param.AccessKey) {
err = proto.ErrInvalidAccessKey
return
}
if _, exist := u.AKStore.Load(param.AccessKey); exist {
err = proto.ErrDuplicateAccessKey
return
}
akMark = 1
}
if param.SecretKey != "" {
if !proto.IsValidSK(param.SecretKey) {
err = proto.ErrInvalidSecretKey
return
}
skMark = 1
}
// Type == 0 means the type is not modified
if param.Type != 0 {
if param.Type.Valid() {
typeMark = 1
} else {
err = proto.ErrInvalidUserType
return
}
}
if param.Description != "" {
describeMark = 1
}
var akUserBef *proto.AKUser
var akUserAft *proto.AKUser
if value, exist := u.AKStore.Load(formerAK); exist {
akUserBef = value.(*proto.AKUser)
} else {
err = proto.ErrAccessKeyNotExists
return
}
if akMark == 1 {
userInfo.AccessKey = param.AccessKey
}
if skMark == 1 {
userInfo.SecretKey = param.SecretKey
}
if typeMark == 1 {
userInfo.UserType = param.Type
}
if describeMark == 1 {
userInfo.Description = param.Description
}
if len(strings.TrimSpace(param.Password)) != 0 {
akUserBef.Password = encodingPassword(param.Password)
}
akUserAft = &proto.AKUser{AccessKey: userInfo.AccessKey, UserID: param.UserID, Password: akUserBef.Password}
if err = u.syncUpdateUserInfo(userInfo); err != nil {
return
}
if err = u.syncDeleteAKUser(akUserBef); err != nil {
return
}
if err = u.syncAddAKUser(akUserAft); err != nil {
return
}
u.AKStore.Delete(formerAK)
u.AKStore.Store(akUserAft.AccessKey, akUserAft)
log.LogInfof("action[updateUser], userID: %v, accesskey[%v], secretkey[%v]", userInfo.UserID, userInfo.AccessKey, userInfo.SecretKey)
return
}
func (u *User) getKeyInfo(ak string) (userInfo *proto.UserInfo, err error) {
var akUser *proto.AKUser
if akUser, err = u.getAKUser(ak); err != nil {
return
}
if userInfo, err = u.getUserInfo(akUser.UserID); err != nil {
return
}
log.LogInfof("action[getKeyInfo], accesskey[%v]", ak)
return
}
func (u *User) getUserInfo(userID string) (userInfo *proto.UserInfo, err error) {
if value, exist := u.userStore.Load(userID); exist {
userInfo = value.(*proto.UserInfo)
} else {
err = proto.ErrUserNotExists
return
}
log.LogInfof("action[getUserInfo], userID: %v", userID)
return
}
func (u *User) updatePolicy(params *proto.UserPermUpdateParam) (userInfo *proto.UserInfo, err error) {
if userInfo, err = u.getUserInfo(params.UserID); err != nil {
return
}
userInfo.Mu.Lock()
defer userInfo.Mu.Unlock()
if userInfo.Policy.IsOwn(params.Volume) {
err = proto.ErrIsOwner
return
}
userInfo.Policy.AddAuthorizedVol(params.Volume, params.Policy)
if err = u.syncUpdateUserInfo(userInfo); err != nil {
err = proto.ErrPersistenceByRaft
return
}
if err = u.addUserToVol(params.UserID, params.Volume); err != nil {
return
}
log.LogInfof("action[updatePolicy], userID: %v, volume: %v", params.UserID, params.Volume)
return
}
func (u *User) removePolicy(params *proto.UserPermRemoveParam) (userInfo *proto.UserInfo, err error) {
if userInfo, err = u.getUserInfo(params.UserID); err != nil {
return
}
userInfo.Mu.Lock()
defer userInfo.Mu.Unlock()
if userInfo.Policy.IsOwn(params.Volume) {
err = proto.ErrIsOwner
return
}
userInfo.Policy.RemoveAuthorizedVol(params.Volume)
if err = u.syncUpdateUserInfo(userInfo); err != nil {
err = proto.ErrPersistenceByRaft
return
}
if err = u.removeUserFromVol(params.UserID, params.Volume); err != nil {
return
}
log.LogInfof("action[removePolicy], userID: %v, volume: %v", params.UserID, params.Volume)
return
}
func (u *User) addOwnVol(userID, volName string) (userInfo *proto.UserInfo, err error) {
if userInfo, err = u.getUserInfo(userID); err != nil {
return
}
userInfo.Mu.Lock()
defer userInfo.Mu.Unlock()
userInfo.Policy.AddOwnVol(volName)
userInfo.Policy.RemoveAuthorizedVol(volName)
if err = u.syncUpdateUserInfo(userInfo); err != nil {
err = proto.ErrPersistenceByRaft
return
}
if err = u.addUserToVol(userID, volName); err != nil {
return
}
log.LogInfof("action[addOwnVol], userID: %v, volume: %v", userID, volName)
return
}
func (u *User) removeOwnVol(userID, volName string) (userInfo *proto.UserInfo, err error) {
if userInfo, err = u.getUserInfo(userID); err != nil {
return
}
userInfo.Mu.Lock()
defer userInfo.Mu.Unlock()
userInfo.Policy.RemoveOwnVol(volName)
if err = u.syncUpdateUserInfo(userInfo); err != nil {
err = proto.ErrPersistenceByRaft
return
}
if err = u.removeUserFromVol(userID, volName); err != nil {
return
}
log.LogInfof("action[removeOwnVol], userID: %v, volume: %v", userID, volName)
return
}
func (u *User) deleteVolPolicy(volName string) (err error) {
var (
volUser *proto.VolUser
userInfo *proto.UserInfo
)
// delete policy
deletedUsers := make([]string, 0)
var userIDs []string
if userIDs, err = u.getUsersOfVol(volName); err != nil {
return
}
for _, userID := range userIDs {
if userInfo, err = u.getUserInfo(userID); err != nil {
if err == proto.ErrUserNotExists {
deletedUsers = append(deletedUsers, userID)
log.LogWarnf("action[deleteVolPolicy], userID: %v does not exist", userID)
continue
}
return
}
userInfo.Mu.Lock()
userInfo.Policy.RemoveOwnVol(volName)
userInfo.Policy.RemoveAuthorizedVol(volName)
if err = u.syncUpdateUserInfo(userInfo); err != nil {
err = proto.ErrPersistenceByRaft
userInfo.Mu.Unlock()
return
}
userInfo.Mu.Unlock()
}
// delete volName index
if value, exist := u.volUser.Load(volName); exist {
volUser = value.(*proto.VolUser)
} else {
return nil
}
volUser.Mu.Lock()
if err = u.syncDeleteVolUser(volUser); err != nil {
volUser.Mu.Unlock()
return
}
u.volUser.Delete(volUser.Vol)
volUser.Mu.Unlock()
for _, deletedUser := range deletedUsers {
u.removeUserFromAllVol(deletedUser)
}
log.LogInfof("action[deleteVolPolicy], volName: %v", volName)
return
}
func (u *User) transferVol(params *proto.UserTransferVolParam) (targetUserInfo *proto.UserInfo, err error) {
var userInfo *proto.UserInfo
userInfo, err = u.getUserInfo(params.UserSrc)
if (err != nil && err != proto.ErrUserNotExists) || (!params.Force && err == proto.ErrUserNotExists) {
return
}
if err == nil {
isOwned := userInfo.Policy.IsOwn(params.Volume)
if !isOwned && !params.Force && params.UserSrc != params.UserDst {
err = proto.ErrHaveNoPolicy
return
}
if isOwned {
if _, err = u.removeOwnVol(params.UserSrc, params.Volume); err != nil {
return
}
}
}
if targetUserInfo, err = u.addOwnVol(params.UserDst, params.Volume); err != nil {
return
}
log.LogInfof("action[transferVol], volName: %v, userSrc: %v, userDst: %v", params.Volume, params.UserSrc, params.UserDst)
return
}
func (u *User) getAllUserInfo(keywords string) (users []*proto.UserInfo) {
users = make([]*proto.UserInfo, 0)
u.userStore.Range(func(key, value interface{}) bool {
userInfo := value.(*proto.UserInfo)
if strings.Contains(userInfo.UserID, keywords) {
users = append(users, userInfo)
}
return true
})
log.LogInfof("action[getAllUserInfo], keywords: %v, total numbers: %v", keywords, len(users))
return
}
func (u *User) getUsersOfVol(volName string) (userIDs []string, err error) {
var volUser *proto.VolUser
userIDs = make([]string, 0)
if value, exist := u.volUser.Load(volName); exist {
volUser = value.(*proto.VolUser)
} else {
err = proto.ErrHaveNoPolicy
return
}
volUser.Mu.RLock()
defer volUser.Mu.RUnlock()
for _, userID := range volUser.UserIDs {
userIDs = append(userIDs, userID)
}
log.LogInfof("action[getUsersOfVol], vol: %v, user numbers: %v", volName, len(userIDs))
return
}
func (u *User) getAKUser(ak string) (akUser *proto.AKUser, err error) {
if value, exist := u.AKStore.Load(ak); exist {
akUser = value.(*proto.AKUser)
} else {
err = proto.ErrAccessKeyNotExists
}
return
}
func (u *User) addUserToVol(userID, volName string) (err error) {
u.volUserMutex.Lock()
defer u.volUserMutex.Unlock()
var volUser *proto.VolUser
if value, ok := u.volUser.Load(volName); ok {
volUser = value.(*proto.VolUser)
volUser.Mu.Lock()
defer volUser.Mu.Unlock()
if contains(volUser.UserIDs, userID) {
return
}
volUser.UserIDs = append(volUser.UserIDs, userID)
} else {
volUser = &proto.VolUser{Vol: volName, UserIDs: []string{userID}}
u.volUser.Store(volName, volUser)
}
if err = u.syncAddVolUser(volUser); err != nil {
err = proto.ErrPersistenceByRaft
return
}
return
}
func (u *User) removeUserFromVol(userID, volName string) (err error) {
var volUser *proto.VolUser
if value, ok := u.volUser.Load(volName); ok {
volUser = value.(*proto.VolUser)
volUser.Mu.Lock()
defer volUser.Mu.Unlock()
volUser.UserIDs, _ = removeString(volUser.UserIDs, userID)
} else {
err = proto.ErrHaveNoPolicy
return
}
if err = u.syncUpdateVolUser(volUser); err != nil {
err = proto.ErrPersistenceByRaft
return
}
return
}
func (u *User) removeUserFromAllVol(userID string) {
u.volUser.Range(func(key, value interface{}) bool {
volUser := value.(*proto.VolUser)
volUser.Mu.Lock()
var exist bool
volUser.UserIDs, exist = removeString(volUser.UserIDs, userID)
if exist {
if err := u.syncUpdateVolUser(volUser); err != nil {
err = proto.ErrPersistenceByRaft
log.LogErrorf("action[deleteUser], userID: %v, volUser: %v, err: %v", userID, volUser, err)
}
}
volUser.Mu.Unlock()
return true
})
}
func removeString(array []string, element string) ([]string, bool) {
for k, v := range array {
if v == element {
return append(array[:k], array[k+1:]...), true
}
}
return array, false
}
func encodingPassword(s string) string {
t := sha1.New()
io.WriteString(t, s)
return hex.EncodeToString(t.Sum(nil))
}
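// Example (illustrative): encodingPassword returns the lowercase hex SHA-1 digest of the
// plaintext, which is what gets stored in AKUser.Password at creation time, so any later
// password check compares digests rather than plaintext passwords.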
func (u *User) clearUserStore() {
u.userStore.Range(func(key, value interface{}) bool {
u.userStore.Delete(key)
return true
})
}
func (u *User) clearAKStore() {
u.AKStore.Range(func(key, value interface{}) bool {
u.AKStore.Delete(key)
return true
})
}
func (u *User) clearVolUsers() {
u.volUser.Range(func(key, value interface{}) bool {
u.volUser.Delete(key)
return true
})
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"encoding/json"
"fmt"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
func (u *User) submit(metadata *RaftCmd) (err error) {
cmd, err := metadata.Marshal()
if err != nil {
return errors.New(err.Error())
}
if _, err = u.partition.Submit(cmd); err != nil {
msg := fmt.Sprintf("action[user_submit] err:%v", err.Error())
return errors.New(msg)
}
return
}
// key = #user#userID, value = userInfo
func (u *User) syncAddUserInfo(userInfo *proto.UserInfo) (err error) {
return u.syncPutUserInfo(opSyncAddUserInfo, userInfo)
}
func (u *User) syncDeleteUserInfo(userInfo *proto.UserInfo) (err error) {
return u.syncPutUserInfo(opSyncDeleteUserInfo, userInfo)
}
func (u *User) syncUpdateUserInfo(userInfo *proto.UserInfo) (err error) {
return u.syncPutUserInfo(opSyncUpdateUserInfo, userInfo)
}
func (u *User) syncPutUserInfo(opType uint32, userInfo *proto.UserInfo) (err error) {
raftCmd := new(RaftCmd)
raftCmd.Op = opType
raftCmd.K = userPrefix + userInfo.UserID
raftCmd.V, err = json.Marshal(userInfo)
if err != nil {
return errors.New(err.Error())
}
return u.submit(raftCmd)
}
// key = #ak#accessKey, value = akUser
func (u *User) syncAddAKUser(akUser *proto.AKUser) (err error) {
return u.syncPutAKUser(opSyncAddAKUser, akUser)
}
func (u *User) syncDeleteAKUser(akUser *proto.AKUser) (err error) {
return u.syncPutAKUser(opSyncDeleteAKUser, akUser)
}
func (u *User) syncPutAKUser(opType uint32, akUser *proto.AKUser) (err error) {
userInfo := new(RaftCmd)
userInfo.Op = opType
userInfo.K = akPrefix + akUser.AccessKey
userInfo.V, err = json.Marshal(akUser)
if err != nil {
return errors.New(err.Error())
}
return u.submit(userInfo)
}
// key = #voluser#volName, value = volUser (the volume name and its userIDs)
func (u *User) syncAddVolUser(volUser *proto.VolUser) (err error) {
return u.syncPutVolUser(opSyncAddVolUser, volUser)
}
func (u *User) syncDeleteVolUser(volUser *proto.VolUser) (err error) {
return u.syncPutVolUser(opSyncDeleteVolUser, volUser)
}
func (u *User) syncUpdateVolUser(volUser *proto.VolUser) (err error) {
return u.syncPutVolUser(opSyncUpdateVolUser, volUser)
}
func (u *User) syncPutVolUser(opType uint32, volUser *proto.VolUser) (err error) {
userInfo := new(RaftCmd)
userInfo.Op = opType
userInfo.K = volUserPrefix + volUser.Vol
userInfo.V, err = json.Marshal(volUser)
if err != nil {
return errors.New(err.Error())
}
return u.submit(userInfo)
}
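// exampleUserRaftKeys is an illustrative sketch (not part of the original code)
// of the raft KV layout produced by the sync* helpers above. The user, access
// key and volume names are hypothetical; the key formats follow the comments
// above, with the concrete prefix constants defined elsewhere in this package.
func exampleUserRaftKeys(u *User) error {
userInfo := &proto.UserInfo{UserID: "alice"} // persisted under "#user#alice"
if err := u.syncAddUserInfo(userInfo); err != nil {
return err
}
akUser := &proto.AKUser{AccessKey: "AKIAEXAMPLE", UserID: "alice"} // "#ak#AKIAEXAMPLE"
if err := u.syncAddAKUser(akUser); err != nil {
return err
}
volUser := &proto.VolUser{Vol: "vol-demo", UserIDs: []string{"alice"}} // "#voluser#vol-demo"
return u.syncAddVolUser(volUser) // every write is replicated through submit() before taking effect
}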
func (u *User) loadUserStore() (err error) {
result, err := u.fsm.store.SeekForPrefix([]byte(userPrefix))
if err != nil {
err = fmt.Errorf("action[loadUserKeyInfo], err: %v", err.Error())
return err
}
for _, value := range result {
userInfo := &proto.UserInfo{}
if err = json.Unmarshal(value, userInfo); err != nil {
err = fmt.Errorf("action[loadUserKeyInfo], unmarshal err: %v", err.Error())
return err
}
u.userStore.Store(userInfo.UserID, userInfo)
log.LogInfof("action[loadUserKeyInfo], userID[%v]", userInfo.UserID)
}
return
}
func (u *User) loadAKStore() (err error) {
result, err := u.fsm.store.SeekForPrefix([]byte(akPrefix))
if err != nil {
err = fmt.Errorf("action[loadAKStore], err: %v", err.Error())
return err
}
for _, value := range result {
akUser := &proto.AKUser{}
if err = json.Unmarshal(value, akUser); err != nil {
err = fmt.Errorf("action[loadAKStore], unmarshal err: %v", err.Error())
return err
}
u.AKStore.Store(akUser.AccessKey, akUser)
log.LogInfof("action[loadAKStore], ak[%v], userID[%v]", akUser.AccessKey, akUser.UserID)
}
return
}
func (u *User) loadVolUsers() (err error) {
result, err := u.fsm.store.SeekForPrefix([]byte(volUserPrefix))
if err != nil {
err = fmt.Errorf("action[loadVolUsers], err: %v", err.Error())
return err
}
for _, value := range result {
volUser := &proto.VolUser{}
if err = json.Unmarshal(value, volUser); err != nil {
err = fmt.Errorf("action[loadVolUsers], unmarshal err: %v", err.Error())
return err
}
u.volUser.Store(volUser.Vol, volUser)
log.LogInfof("action[loadVolUsers], vol[%v]", volUser.Vol)
}
return
}
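// exampleLoadUserMetadata is an illustrative sketch (not part of the original
// code); the real call sites live elsewhere, but a full reload of the user
// module is assumed to rebuild all three in-memory maps from the store in this
// order before user requests are served.
func exampleLoadUserMetadata(u *User) error {
if err := u.loadUserStore(); err != nil {
return err
}
if err := u.loadAKStore(); err != nil {
return err
}
return u.loadVolUsers()
}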
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"encoding/json"
"fmt"
"math"
"runtime/debug"
"strconv"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
type VolVarargs struct {
zoneName string
description string
capacity uint64 // GB
deleteLockTime int64 // hours
followerRead bool
authenticate bool
dpSelectorName string
dpSelectorParm string
coldArgs *coldVolArgs
domainId uint64
dpReplicaNum uint8
enablePosixAcl bool
dpReadOnlyWhenVolFull bool
enableQuota bool
enableTransaction proto.TxOpMask
txTimeout int64
txConflictRetryNum int64
txConflictRetryInterval int64
txOpLimit int
}
// Vol represents a volume, i.e. a set of meta partitions and data partitions
type Vol struct {
ID uint64
Name string
Owner string
OSSAccessKey string
OSSSecretKey string
dpReplicaNum uint8
mpReplicaNum uint8
Status uint8
threshold float32
dataPartitionSize uint64 // byte
Capacity uint64 // GB
VolType int
EbsBlkSize int
CacheCapacity uint64
CacheAction int
CacheThreshold int
CacheTTL int
CacheHighWater int
CacheLowWater int
CacheLRUInterval int
CacheRule string
PreloadCacheOn bool
NeedToLowerReplica bool
FollowerRead bool
authenticate bool
crossZone bool
domainOn bool
defaultPriority bool // old default zone first
enablePosixAcl bool
enableTransaction proto.TxOpMask
txTimeout int64
txConflictRetryNum int64
txConflictRetryInterval int64
txOpLimit int
zoneName string
MetaPartitions map[uint64]*MetaPartition `graphql:"-"`
dataPartitions *DataPartitionMap
mpsCache []byte
viewCache []byte
createDpMutex sync.RWMutex
createMpMutex sync.RWMutex
createTime int64
DeleteLockTime int64
description string
dpSelectorName string
dpSelectorParm string
domainId uint64
qosManager *QosCtrlManager
DpReadOnlyWhenVolFull bool
aclMgr AclManager
uidSpaceManager *UidSpaceManager
volLock sync.RWMutex
quotaManager *MasterQuotaManager
enableQuota bool
VersionMgr *VolVersionManager
Forbidden bool
mpsLock *mpsLockManager
EnableAuditLog bool
preloadCapacity uint64
}
func newVol(vv volValue) (vol *Vol) {
vol = &Vol{ID: vv.ID, Name: vv.Name, MetaPartitions: make(map[uint64]*MetaPartition, 0)}
if vol.threshold <= 0 {
vol.threshold = defaultMetaPartitionMemUsageThreshold
}
vol.dataPartitions = newDataPartitionMap(vv.Name)
vol.VersionMgr = newVersionMgr(vol)
vol.dpReplicaNum = vv.DpReplicaNum
vol.mpReplicaNum = vv.ReplicaNum
vol.Owner = vv.Owner
vol.dataPartitionSize = vv.DataPartitionSize
vol.Capacity = vv.Capacity
vol.FollowerRead = vv.FollowerRead
vol.authenticate = vv.Authenticate
vol.crossZone = vv.CrossZone
vol.zoneName = vv.ZoneName
vol.viewCache = make([]byte, 0)
vol.mpsCache = make([]byte, 0)
vol.createTime = vv.CreateTime
vol.DeleteLockTime = vv.DeleteLockTime
vol.description = vv.Description
vol.defaultPriority = vv.DefaultPriority
vol.domainId = vv.DomainId
vol.enablePosixAcl = vv.EnablePosixAcl
vol.enableQuota = vv.EnableQuota
vol.enableTransaction = vv.EnableTransaction
vol.txTimeout = vv.TxTimeout
vol.txConflictRetryNum = vv.TxConflictRetryNum
vol.txConflictRetryInterval = vv.TxConflictRetryInterval
vol.txOpLimit = vv.TxOpLimit
vol.VolType = vv.VolType
vol.EbsBlkSize = vv.EbsBlkSize
vol.CacheCapacity = vv.CacheCapacity
vol.CacheAction = vv.CacheAction
vol.CacheThreshold = vv.CacheThreshold
vol.CacheTTL = vv.CacheTTL
vol.CacheHighWater = vv.CacheHighWater
vol.CacheLowWater = vv.CacheLowWater
vol.CacheLRUInterval = vv.CacheLRUInterval
vol.CacheRule = vv.CacheRule
vol.Status = vv.Status
limitQosVal := &qosArgs{
qosEnable: vv.VolQosEnable,
diskQosEnable: vv.DiskQosEnable,
iopsRVal: vv.IopsRLimit,
iopsWVal: vv.IopsWLimit,
flowRVal: vv.FlowRlimit,
flowWVal: vv.FlowWlimit,
}
vol.initQosManager(limitQosVal)
magnifyQosVal := &qosArgs{
iopsRVal: uint64(vv.IopsRMagnify),
iopsWVal: uint64(vv.IopsWMagnify),
flowRVal: uint64(vv.FlowWMagnify),
flowWVal: uint64(vv.FlowWMagnify),
}
vol.qosManager.volUpdateMagnify(magnifyQosVal)
vol.DpReadOnlyWhenVolFull = vv.DpReadOnlyWhenVolFull
vol.mpsLock = newMpsLockManager(vol)
vol.EnableAuditLog = true
vol.preloadCapacity = math.MaxUint64 // sentinel value: preload capacity is calculated lazily on first access
return
}
func newVolFromVolValue(vv *volValue) (vol *Vol) {
vol = newVol(*vv)
// overwrite the OSS credentials
vol.OSSAccessKey, vol.OSSSecretKey = vv.OSSAccessKey, vv.OSSSecretKey
vol.Status = vv.Status
vol.dpSelectorName = vv.DpSelectorName
vol.dpSelectorParm = vv.DpSelectorParm
if vol.txTimeout == 0 {
vol.txTimeout = proto.DefaultTransactionTimeout
}
if vol.txConflictRetryNum == 0 {
vol.txConflictRetryNum = proto.DefaultTxConflictRetryNum
}
if vol.txConflictRetryInterval == 0 {
vol.txConflictRetryInterval = proto.DefaultTxConflictRetryInterval
}
vol.Forbidden = vv.Forbidden
vol.EnableAuditLog = vv.EnableAuditLog
return vol
}
type mpsLockManager struct {
mpsLock sync.RWMutex
lastEffectStack string
lockTime time.Time
innerLock sync.RWMutex
onLock bool
hang bool
vol *Vol
enable int32 // lock tracking is enabled only when debug logging is configured
}
var (
lockCheckInterval = time.Second
lockExpireInterval = time.Minute
)
func newMpsLockManager(vol *Vol) *mpsLockManager {
lc := &mpsLockManager{vol: vol}
go lc.CheckExceptionLock(lockCheckInterval, lockExpireInterval)
if log.EnableDebug() {
atomic.StoreInt32(&lc.enable, 0)
}
return lc
}
func (mpsLock *mpsLockManager) Lock() {
mpsLock.mpsLock.Lock()
if log.EnableDebug() && atomic.LoadInt32(&mpsLock.enable) == 1 {
mpsLock.innerLock.Lock()
mpsLock.onLock = true
mpsLock.lockTime = time.Now()
mpsLock.lastEffectStack = fmt.Sprintf("Lock stack %v", string(debug.Stack()))
}
}
func (mpsLock *mpsLockManager) UnLock() {
mpsLock.mpsLock.Unlock()
if log.EnableDebug() && atomic.LoadInt32(&mpsLock.enable) == 1 {
mpsLock.onLock = false
mpsLock.lockTime = time.Unix(0, 0)
mpsLock.lastEffectStack = fmt.Sprintf("UnLock stack %v", string(debug.Stack()))
mpsLock.innerLock.Unlock()
}
}
func (mpsLock *mpsLockManager) RLock() {
mpsLock.mpsLock.RLock()
if log.EnableDebug() && atomic.LoadInt32(&mpsLock.enable) == 1 {
mpsLock.innerLock.RLock()
mpsLock.hang = false
mpsLock.onLock = true
mpsLock.lockTime = time.Now()
mpsLock.lastEffectStack = fmt.Sprintf("RLock stack %v", string(debug.Stack()))
}
}
func (mpsLock *mpsLockManager) RUnlock() {
mpsLock.mpsLock.RUnlock()
if log.EnableDebug() && atomic.LoadInt32(&mpsLock.enable) == 1 {
mpsLock.onLock = false
mpsLock.hang = false
mpsLock.lockTime = time.Unix(0, 0)
mpsLock.lastEffectStack = fmt.Sprintf("RUnlock stack %v", string(debug.Stack()))
mpsLock.innerLock.RUnlock()
}
}
func (mpsLock *mpsLockManager) CheckExceptionLock(interval time.Duration, expireTime time.Duration) {
ticker := time.NewTicker(interval)
for {
select {
case <-ticker.C:
if mpsLock.vol.status() == proto.VolStatusMarkDelete || atomic.LoadInt32(&mpsLock.enable) == 0 {
break
}
if !log.EnableDebug() {
continue
}
if !mpsLock.onLock {
continue
}
tm := time.Now()
if tm.After(mpsLock.lockTime.Add(expireTime)) {
log.LogWarnf("vol %v mpsLock hang more than %v since time %v stack(%v)",
mpsLock.vol.Name, expireTime, mpsLock.lockTime, mpsLock.lastEffectStack)
mpsLock.hang = true
}
}
}
}
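// exampleLockedMetaPartitionUpdate is an illustrative sketch (not part of the
// original code) of the locking pattern the manager above is meant to watch:
// callers take the lock through mpsLockManager so that, when debug logging is
// enabled, CheckExceptionLock can report a holder exceeding lockExpireInterval.
func exampleLockedMetaPartitionUpdate(vol *Vol, mp *MetaPartition) {
vol.mpsLock.Lock()
defer vol.mpsLock.UnLock()
vol.MetaPartitions[mp.PartitionID] = mp
}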
func (vol *Vol) CheckStrategy(c *Cluster) {
// make sure all in-progress version-deleting tasks are resumed before checking
if !atomic.CompareAndSwapInt32(&vol.VersionMgr.checkStrategy, 0, 1) {
return
}
go func() {
waitTime := 5 * time.Second * defaultIntervalToCheck
waited := false
for {
time.Sleep(waitTime)
if vol.Status == proto.VolStatusMarkDelete {
break
}
if c != nil && c.IsLeader() {
if !waited {
log.LogInfof("wait for %v seconds once after becoming leader to make sure all the ver deleting tasks are resumed",
waitTime)
time.Sleep(waitTime)
waited = true
}
if !proto.IsHot(vol.VolType) {
return
}
vol.VersionMgr.RLock()
if vol.VersionMgr.strategy.GetPeriodicSecond() == 0 || !vol.VersionMgr.strategy.Enable { // strategy has not been set
vol.VersionMgr.RUnlock()
continue
}
vol.VersionMgr.RUnlock()
vol.VersionMgr.checkCreateStrategy(c)
vol.VersionMgr.checkDeleteStrategy(c)
}
}
}()
}
func (vol *Vol) CalculatePreloadCapacity() uint64 {
total := uint64(0)
dps := vol.dataPartitions.partitions
for _, dp := range dps {
if proto.IsPreLoadDp(dp.PartitionType) {
total += dp.total / util.GB
}
}
if overSoldFactor <= 0 {
return total
}
return uint64(float32(total) / overSoldFactor)
}
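// Worked example for CalculatePreloadCapacity above (hypothetical numbers, not
// taken from any real configuration): three preload data partitions of 120 GB
// each give total = 360 GB; with overSoldFactor = 1.5 the reported preload
// capacity is uint64(360/1.5) = 240 GB, while an unset factor (<= 0) returns
// the raw 360 GB. getPreloadCapacity below simply caches this result.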
func (vol *Vol) getPreloadCapacity() uint64 {
if vol.preloadCapacity != math.MaxUint64 {
return vol.preloadCapacity
}
vol.preloadCapacity = vol.CalculatePreloadCapacity()
log.LogDebugf("[getPreloadCapacity] vol(%v) calculated preload capacity: %v", vol.Name, vol.preloadCapacity)
return vol.preloadCapacity
}
func (vol *Vol) initQosManager(limitArgs *qosArgs) {
vol.qosManager = &QosCtrlManager{
cliInfoMgrMap: make(map[uint64]*ClientInfoMgr, 0),
serverFactorLimitMap: make(map[uint32]*ServerFactorLimit, 0),
qosEnable: limitArgs.qosEnable,
vol: vol,
ClientHitTriggerCnt: defaultClientTriggerHitCnt,
ClientReqPeriod: defaultClientReqPeriodSeconds,
}
if limitArgs.iopsRVal == 0 {
limitArgs.iopsRVal = defaultIopsRLimit
}
if limitArgs.iopsWVal == 0 {
limitArgs.iopsWVal = defaultIopsWLimit
}
if limitArgs.flowRVal == 0 {
limitArgs.flowRVal = defaultFlowRLimit
}
if limitArgs.flowWVal == 0 {
limitArgs.flowWVal = defaultFlowWLimit
}
arrLimit := [defaultLimitTypeCnt]uint64{limitArgs.iopsRVal, limitArgs.iopsWVal, limitArgs.flowRVal, limitArgs.flowWVal}
arrType := [defaultLimitTypeCnt]uint32{proto.IopsReadType, proto.IopsWriteType, proto.FlowReadType, proto.FlowWriteType}
for i := 0; i < defaultLimitTypeCnt; i++ {
vol.qosManager.serverFactorLimitMap[arrType[i]] = &ServerFactorLimit{
Name: proto.QosTypeString(arrType[i]),
Type: arrType[i],
Total: arrLimit[i],
Buffer: arrLimit[i],
requestCh: make(chan interface{}, 10240),
qosManager: vol.qosManager,
}
go vol.qosManager.serverFactorLimitMap[arrType[i]].dispatch()
}
}
func (vol *Vol) refreshOSSSecure() (key, secret string) {
vol.OSSAccessKey = util.RandomString(16, util.Numeric|util.LowerLetter|util.UpperLetter)
vol.OSSSecretKey = util.RandomString(32, util.Numeric|util.LowerLetter|util.UpperLetter)
return vol.OSSAccessKey, vol.OSSSecretKey
}
func (vol *Vol) addMetaPartition(mp *MetaPartition) {
vol.mpsLock.Lock()
defer vol.mpsLock.UnLock()
// add the partition, replacing any existing entry with the same ID
vol.MetaPartitions[mp.PartitionID] = mp
}
func (vol *Vol) metaPartition(partitionID uint64) (mp *MetaPartition, err error) {
vol.mpsLock.RLock()
defer vol.mpsLock.RUnlock()
mp, ok := vol.MetaPartitions[partitionID]
if !ok {
err = proto.ErrMetaPartitionNotExists
}
return
}
func (vol *Vol) maxPartitionID() (maxPartitionID uint64) {
vol.mpsLock.RLock()
defer vol.mpsLock.RUnlock()
for id := range vol.MetaPartitions {
if id > maxPartitionID {
maxPartitionID = id
}
}
return
}
func (vol *Vol) getRWMetaPartitionNum() (num uint64, isHeartBeatDone bool) {
if time.Now().Unix()-vol.createTime <= defaultMetaPartitionTimeOutSec {
log.LogInfof("The vol[%v] is being created.", vol.Name)
return num, false
}
vol.mpsLock.RLock()
defer vol.mpsLock.RUnlock()
for _, mp := range vol.MetaPartitions {
if !mp.heartBeatDone {
log.LogInfof("The mp[%v] of vol[%v] is not done", mp.PartitionID, vol.Name)
return num, false
}
if mp.Status == proto.ReadWrite {
num++
} else {
log.LogWarnf("The mp[%v] of vol[%v] is not RW", mp.PartitionID, vol.Name)
}
}
return num, true
}
func (vol *Vol) getDataPartitionsView() (body []byte, err error) {
return vol.dataPartitions.updateResponseCache(false, 0, vol.VolType)
}
func (vol *Vol) getDataPartitionViewCompress() (body []byte, err error) {
return vol.dataPartitions.updateCompressCache(false, 0, vol.VolType)
}
func (vol *Vol) getDataPartitionByID(partitionID uint64) (dp *DataPartition, err error) {
return vol.dataPartitions.get(partitionID)
}
func (vol *Vol) addMetaPartitions(c *Cluster, count int) (err error) {
// add multiple extra meta partitions in a single call
var (
start uint64
end uint64
)
vol.createMpMutex.Lock()
defer vol.createMpMutex.Unlock()
// update End of the maxMetaPartition range
maxPartitionId := vol.maxPartitionID()
rearMetaPartition := vol.MetaPartitions[maxPartitionId]
oldEnd := rearMetaPartition.End
end = rearMetaPartition.MaxInodeID + gConfig.MetaPartitionInodeIdStep
if err = rearMetaPartition.canSplit(end, gConfig.MetaPartitionInodeIdStep, false); err != nil {
return err
}
rearMetaPartition.End = end
if err = c.syncUpdateMetaPartition(rearMetaPartition); err != nil {
rearMetaPartition.End = oldEnd
log.LogErrorf("action[addMetaPartitions] split partition partitionID[%v] err[%v]", rearMetaPartition.PartitionID, err)
return
}
// create new meta partitions
for i := 0; i < count; i++ {
start = end + 1
end = start + gConfig.MetaPartitionInodeIdStep
if end > (defaultMaxMetaPartitionInodeID - gConfig.MetaPartitionInodeIdStep) {
end = defaultMaxMetaPartitionInodeID
log.LogWarnf("action[addMetaPartitions] vol[%v] add too many meta partition ,partition range overflow ! ", vol.Name)
}
if i == count-1 {
end = defaultMaxMetaPartitionInodeID
}
if err = vol.createMetaPartition(c, start, end); err != nil {
log.LogErrorf("action[addMetaPartitions] vol[%v] add meta partition err[%v]", vol.Name, err)
break
}
if end == defaultMaxMetaPartitionInodeID {
break
}
}
return
}
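// Worked example for addMetaPartitions above, using a hypothetical inode-id
// step S: the last partition's End is first advanced to its MaxInodeID+S and
// persisted via raft (rolled back to oldEnd on failure); each new partition
// then starts at the previous end+1 and spans roughly S inode ids, and the
// last one created is always capped at defaultMaxMetaPartitionInodeID.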
func (vol *Vol) initMetaPartitions(c *Cluster, count int) (err error) {
// initialize `count` meta partitions when the volume is created
var (
start uint64
end uint64
)
if count < defaultInitMetaPartitionCount {
count = defaultInitMetaPartitionCount
}
if count > defaultMaxInitMetaPartitionCount {
count = defaultMaxInitMetaPartitionCount
}
vol.createMpMutex.Lock()
for index := 0; index < count; index++ {
if index != 0 {
start = end + 1
}
end = gConfig.MetaPartitionInodeIdStep * uint64(index+1)
if index == count-1 {
end = defaultMaxMetaPartitionInodeID
}
if err = vol.createMetaPartition(c, start, end); err != nil {
log.LogErrorf("action[initMetaPartitions] vol[%v] init meta partition err[%v]", vol.Name, err)
break
}
}
vol.createMpMutex.Unlock()
vol.mpsLock.RLock()
defer vol.mpsLock.RUnlock()
if len(vol.MetaPartitions) != count {
err = fmt.Errorf("action[initMetaPartitions] vol[%v] init meta partition failed,mpCount[%v],expectCount[%v],err[%v]",
vol.Name, len(vol.MetaPartitions), count, err)
}
return
}
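// Worked example for initMetaPartitions above, with a hypothetical step S and
// count = 3: the initial inode ranges are [0, S], [S+1, 2*S] and
// [2*S+1, defaultMaxMetaPartitionInodeID]; the last partition always absorbs
// the remaining inode-id space.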
func (vol *Vol) initDataPartitions(c *Cluster, dpCount int) (err error) {
if dpCount == 0 {
dpCount = defaultInitDataPartitionCnt
}
// initialize `dpCount` data partitions when the volume is created
err = c.batchCreateDataPartition(vol, dpCount, true)
return
}
func (vol *Vol) checkDataPartitions(c *Cluster) (cnt int) {
if vol.getDataPartitionsCount() == 0 && vol.Status != proto.VolStatusMarkDelete && proto.IsHot(vol.VolType) {
c.batchCreateDataPartition(vol, 1, false)
}
shouldDpInhibitWriteByVolFull := vol.shouldInhibitWriteBySpaceFull()
totalPreloadCapacity := uint64(0)
partitions := vol.dataPartitions.clonePartitions()
for _, dp := range partitions {
if proto.IsPreLoadDp(dp.PartitionType) {
now := time.Now().Unix()
if now > dp.PartitionTTL {
log.LogWarnf("[checkDataPartitions] dp(%d) is deleted because of ttl expired, now(%d), ttl(%d)", dp.PartitionID, now, dp.PartitionTTL)
vol.deleteDataPartition(c, dp)
continue
}
startTime := dp.dataNodeStartTime()
if now-dp.createTime > 600 && dp.used == 0 && now-startTime > 600 { // empty for 10 minutes after creation and data node start
log.LogWarnf("[checkDataPartitions] dp(%d) is deleted because of clear, now(%d), create(%d), start(%d)",
dp.PartitionID, now, dp.createTime, startTime)
vol.deleteDataPartition(c, dp)
continue
}
totalPreloadCapacity += dp.total / util.GB
}
dp.checkReplicaStatus(c.cfg.DataPartitionTimeOutSec)
dp.checkStatus(c.Name, true, c.cfg.DataPartitionTimeOutSec, c, shouldDpInhibitWriteByVolFull, vol.Forbidden)
dp.checkLeader(c.Name, c.cfg.DataPartitionTimeOutSec)
dp.checkMissingReplicas(c.Name, c.leaderInfo.addr, c.cfg.MissingDataPartitionInterval, c.cfg.IntervalToAlarmMissingDataPartition)
dp.checkReplicaNum(c, vol)
if time.Now().Unix()-vol.createTime < defaultIntervalToCheckHeartbeat*3 && !vol.Forbidden {
dp.setReadWrite()
}
if dp.Status == proto.ReadWrite {
cnt++
}
dp.checkDiskError(c.Name, c.leaderInfo.addr)
dp.checkReplicationTask(c.Name, vol.dataPartitionSize)
}
if overSoldFactor > 0 {
totalPreloadCapacity = uint64(float32(totalPreloadCapacity) / overSoldFactor)
}
vol.preloadCapacity = totalPreloadCapacity
if vol.preloadCapacity != 0 {
log.LogDebugf("[checkDataPartitions] vol(%v) totalPreloadCapacity(%v GB), overSoldFactor(%v)",
vol.Name, totalPreloadCapacity, overSoldFactor)
}
return
}
func (vol *Vol) loadDataPartition(c *Cluster) {
partitions, startIndex := vol.dataPartitions.getDataPartitionsToBeChecked(c.cfg.PeriodToLoadALLDataPartitions)
if len(partitions) == 0 {
return
}
c.waitForResponseToLoadDataPartition(partitions)
msg := fmt.Sprintf("action[loadDataPartition] vol[%v],checkStartIndex:%v checkCount:%v",
vol.Name, startIndex, len(partitions))
log.LogInfo(msg)
}
func (vol *Vol) releaseDataPartitions(releaseCount int, afterLoadSeconds int64) {
partitions, startIndex := vol.dataPartitions.getDataPartitionsToBeReleased(releaseCount, afterLoadSeconds)
if len(partitions) == 0 {
return
}
vol.dataPartitions.freeMemOccupiedByDataPartitions(partitions)
msg := fmt.Sprintf("action[freeMemOccupiedByDataPartitions] vol[%v] release data partition start:%v releaseCount:%v",
vol.Name, startIndex, len(partitions))
log.LogInfo(msg)
}
func (vol *Vol) tryUpdateDpReplicaNum(c *Cluster, partition *DataPartition) (err error) {
partition.RLock()
defer partition.RUnlock()
if partition.isRecover || vol.dpReplicaNum != 2 || partition.ReplicaNum != 3 || len(partition.Hosts) != 2 {
return
}
if partition.isSpecialReplicaCnt() {
return
}
oldReplicaNum := partition.ReplicaNum
partition.ReplicaNum = partition.ReplicaNum - 1
if err = c.syncUpdateDataPartition(partition); err != nil {
partition.ReplicaNum = oldReplicaNum
}
return
}
func (vol *Vol) isOkUpdateRepCnt() (ok bool, rsp []uint64) {
if proto.IsCold(vol.VolType) {
return
}
ok = true
dps := vol.cloneDataPartitionMap()
for _, dp := range dps {
if vol.dpReplicaNum != dp.ReplicaNum {
rsp = append(rsp, dp.PartitionID)
ok = false
// report at most 20 mismatched partition IDs
if len(rsp) > 20 {
return
}
}
}
return ok, rsp
}
func (vol *Vol) checkReplicaNum(c *Cluster) {
if !vol.NeedToLowerReplica {
return
}
var err error
if proto.IsCold(vol.VolType) {
return
}
dps := vol.cloneDataPartitionMap()
cnt := 0
for _, dp := range dps {
host := dp.getToBeDecommissionHost(int(vol.dpReplicaNum))
if host == "" {
continue
}
if err = dp.removeOneReplicaByHost(c, host, vol.dpReplicaNum == dp.ReplicaNum); err != nil {
if dp.isSpecialReplicaCnt() && len(dp.Hosts) > 1 {
log.LogWarnf("action[checkReplicaNum] removeOneReplicaByHost host [%v],vol[%v],err[%v]", host, vol.Name, err)
continue
}
log.LogErrorf("action[checkReplicaNum] removeOneReplicaByHost host [%v],vol[%v],err[%v]", host, vol.Name, err)
continue
}
cnt++
if cnt > 100 {
return
}
}
vol.NeedToLowerReplica = false
}
func (vol *Vol) checkMetaPartitions(c *Cluster) {
var tasks []*proto.AdminTask
metaPartitionInodeIdStep := gConfig.MetaPartitionInodeIdStep
maxPartitionID := vol.maxPartitionID()
mps := vol.cloneMetaPartitionMap()
var (
doSplit bool
err error
)
for _, mp := range mps {
doSplit = mp.checkStatus(c.Name, true, int(vol.mpReplicaNum), maxPartitionID, metaPartitionInodeIdStep, vol.Forbidden)
if doSplit && !c.cfg.DisableAutoCreate {
nextStart := mp.MaxInodeID + metaPartitionInodeIdStep
log.LogInfof("cluster[%v],vol[%v],meta partition[%v] splits start[%v] maxinodeid:[%v] default step:[%v],nextStart[%v]",
c.Name, vol.Name, mp.PartitionID, mp.Start, mp.MaxInodeID, metaPartitionInodeIdStep, nextStart)
if err = vol.splitMetaPartition(c, mp, nextStart, metaPartitionInodeIdStep, false); err != nil {
Warn(c.Name, fmt.Sprintf("cluster[%v],vol[%v],meta partition[%v] splits failed,err[%v]", c.Name, vol.Name, mp.PartitionID, err))
}
}
mp.checkLeader(c.Name)
mp.checkReplicaNum(c, vol.Name, vol.mpReplicaNum)
mp.checkEnd(c, maxPartitionID)
mp.reportMissingReplicas(c.Name, c.leaderInfo.addr, defaultMetaPartitionTimeOutSec, defaultIntervalToAlarmMissingMetaPartition)
tasks = append(tasks, mp.replicaCreationTasks(c.Name, vol.Name)...)
}
c.addMetaNodeTasks(tasks)
vol.checkSplitMetaPartition(c, metaPartitionInodeIdStep)
}
func (vol *Vol) checkSplitMetaPartition(c *Cluster, metaPartitionInodeStep uint64) {
maxPartitionID := vol.maxPartitionID()
maxMP, err := vol.metaPartition(maxPartitionID)
if err != nil {
return
}
// Any of the following conditions will trigger max mp split
// 1. The memory of the metanode which max mp belongs to reaches the threshold
// 2. The number of inodes managed by max mp reaches the threshold(0.75)
// 3. The number of RW mp is less than 3
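// Example with a hypothetical step S: a max mp whose MaxInodeID-Start equals
// 0.8*S exceeds the 0.75 usage threshold, so it is split at MaxInodeID + S/4;
// if instead the RW mp count has dropped below lowerLimitRWMetaPartition, the
// larger split point MaxInodeID + S is used.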
maxMPInodeUsedRatio := float64(maxMP.MaxInodeID-maxMP.Start) / float64(metaPartitionInodeStep)
RWMPNum, isHeartBeatDone := vol.getRWMetaPartitionNum()
if !isHeartBeatDone {
log.LogInfof("Not all volume[%s] mp heartbeat is done, skip mp split", vol.Name)
return
}
if maxMP.memUsedReachThreshold(c.Name, vol.Name) || RWMPNum < lowerLimitRWMetaPartition ||
maxMPInodeUsedRatio > metaPartitionInodeUsageThreshold {
end := maxMP.MaxInodeID + metaPartitionInodeStep/4
if RWMPNum < lowerLimitRWMetaPartition {
end = maxMP.MaxInodeID + metaPartitionInodeStep
}
if err := vol.splitMetaPartition(c, maxMP, end, metaPartitionInodeStep, true); err != nil {
msg := fmt.Sprintf("action[checkSplitMetaPartition],split meta maxMP[%v] failed,err[%v]\n",
maxMP.PartitionID, err)
Warn(c.Name, msg)
}
log.LogInfof("volume[%v] split MaxMP[%v], MaxInodeID[%d] Start[%d] RWMPNum[%d] maxMPInodeUsedRatio[%.2f]",
vol.Name, maxPartitionID, maxMP.MaxInodeID, maxMP.Start, RWMPNum, maxMPInodeUsedRatio)
}
return
}
func (mp *MetaPartition) memUsedReachThreshold(clusterName, volName string) bool {
liveReplicas := mp.getLiveReplicas()
foundReadonlyReplica := false
var readonlyReplica *MetaReplica
for _, replica := range liveReplicas {
if replica.Status == proto.ReadOnly {
foundReadonlyReplica = true
readonlyReplica = replica
break
}
}
if !foundReadonlyReplica || readonlyReplica == nil {
return false
}
if readonlyReplica.metaNode.isWritable() {
msg := fmt.Sprintf("action[checkSplitMetaPartition] vol[%v],max meta parition[%v] status is readonly\n",
volName, mp.PartitionID)
Warn(clusterName, msg)
return false
}
return true
}
func (vol *Vol) cloneMetaPartitionMap() (mps map[uint64]*MetaPartition) {
mps = make(map[uint64]*MetaPartition, 0)
vol.mpsLock.RLock()
defer vol.mpsLock.RUnlock()
for _, mp := range vol.MetaPartitions {
mps[mp.PartitionID] = mp
}
return
}
func (vol *Vol) setMpRdOnly() {
vol.mpsLock.RLock()
defer vol.mpsLock.RUnlock()
for _, mp := range vol.MetaPartitions {
if mp.Status != proto.Unavailable {
mp.Status = proto.ReadOnly
}
}
}
func (vol *Vol) cloneDataPartitionMap() (dps map[uint64]*DataPartition) {
vol.dataPartitions.RLock()
defer vol.dataPartitions.RUnlock()
dps = make(map[uint64]*DataPartition, 0)
for _, dp := range vol.dataPartitions.partitionMap {
dps[dp.PartitionID] = dp
}
return
}
func (vol *Vol) setDpRdOnly() {
vol.dataPartitions.RLock()
defer vol.dataPartitions.RUnlock()
for _, dp := range vol.dataPartitions.partitionMap {
if dp.Status != proto.Unavailable {
dp.Status = proto.ReadOnly
}
}
}
func (vol *Vol) setStatus(status uint8) {
vol.volLock.Lock()
defer vol.volLock.Unlock()
vol.Status = status
}
func (vol *Vol) status() uint8 {
vol.volLock.RLock()
defer vol.volLock.RUnlock()
return vol.Status
}
func (vol *Vol) capacity() uint64 {
vol.volLock.RLock()
defer vol.volLock.RUnlock()
return vol.Capacity
}
func (vol *Vol) autoDeleteDp(c *Cluster) {
if vol.dataPartitions == nil {
return
}
maxSize := overSoldCap(vol.CacheCapacity * util.GB)
maxCnt := maxSize / vol.dataPartitionSize
if maxSize%vol.dataPartitionSize != 0 {
maxCnt++
}
partitions := vol.dataPartitions.clonePartitions()
for _, dp := range partitions {
if !proto.IsCacheDp(dp.PartitionType) {
continue
}
if maxCnt > 0 {
maxCnt--
continue
}
log.LogInfof("[autoDeleteDp] start delete dp, id[%d]", dp.PartitionID)
vol.deleteDataPartition(c, dp)
}
}
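// Worked example for autoDeleteDp above (hypothetical numbers): if the
// over-sold cache capacity comes out at 150 GB and dataPartitionSize is 120 GB,
// maxCnt is rounded up to 2, so the first two cache-type partitions are kept
// and every further one is deleted.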
func (vol *Vol) checkAutoDataPartitionCreation(c *Cluster) {
defer func() {
if r := recover(); r != nil {
log.LogWarnf("checkAutoDataPartitionCreation occurred panic,err[%v]", r)
WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
"checkAutoDataPartitionCreation occurred panic")
}
}()
if ok, _ := vol.needCreateDataPartition(); !ok {
return
}
vol.setStatus(proto.VolStatusNormal)
log.LogInfof("action[autoCreateDataPartitions] vol[%v] before autoCreateDataPartitions", vol.Name)
if !c.DisableAutoAllocate && !vol.Forbidden {
vol.autoCreateDataPartitions(c)
}
}
func (vol *Vol) shouldInhibitWriteBySpaceFull() bool {
if !vol.DpReadOnlyWhenVolFull {
return false
}
if vol.capacity() == 0 {
return false
}
if !proto.IsHot(vol.VolType) {
return false
}
usedSpace := vol.totalUsedSpace() / util.GB
if usedSpace >= vol.capacity() {
return true
}
return false
}
func (vol *Vol) needCreateDataPartition() (ok bool, err error) {
ok = false
if vol.status() == proto.VolStatusMarkDelete {
err = proto.ErrVolNotExists
return
}
if vol.capacity() == 0 {
err = proto.ErrVolNoAvailableSpace
return
}
if proto.IsHot(vol.VolType) {
if vol.shouldInhibitWriteBySpaceFull() {
vol.setAllDataPartitionsToReadOnly()
err = proto.ErrVolNoAvailableSpace
return
}
ok = true
return
}
// cold
if vol.CacheAction == proto.NoCache && vol.CacheRule == "" {
err = proto.ErrVolNoCacheAndRule
return
}
ok = true
return
}
func (vol *Vol) autoCreateDataPartitions(c *Cluster) {
if time.Since(vol.dataPartitions.lastAutoCreateTime) < time.Minute {
return
}
if c.cfg.DisableAutoCreate {
// when auto-create is disabled, do not create new data partitions once the allocated size exceeds the volume capacity
allocSize := uint64(len(vol.dataPartitions.partitions)) * vol.dataPartitionSize
totalSize := vol.capacity() * util.GB
if allocSize > totalSize {
return
}
if vol.dataPartitions.readableAndWritableCnt < minNumOfRWDataPartitions {
c.batchCreateDataPartition(vol, minNumOfRWDataPartitions, false)
log.LogWarnf("autoCreateDataPartitions: readWrite less than 10, alloc new 10 partitions, vol %s", vol.Name)
}
return
}
if proto.IsCold(vol.VolType) {
vol.dataPartitions.lastAutoCreateTime = time.Now()
maxSize := overSoldCap(vol.CacheCapacity * util.GB)
allocSize := uint64(0)
for _, dp := range vol.cloneDataPartitionMap() {
if !proto.IsCacheDp(dp.PartitionType) {
continue
}
allocSize += dp.total
}
if maxSize <= allocSize {
log.LogInfof("action[autoCreateDataPartitions] (%s) no need to create again, alloc [%d], max [%d]", vol.Name, allocSize, maxSize)
return
}
count := (maxSize-allocSize-1)/vol.dataPartitionSize + 1 // ceil((maxSize-allocSize)/dataPartitionSize)
log.LogInfof("action[autoCreateDataPartitions] vol[%v] count[%v]", vol.Name, count)
c.batchCreateDataPartition(vol, int(count), false)
return
}
if (vol.Capacity > 200000 && vol.dataPartitions.readableAndWritableCnt < 200) || vol.dataPartitions.readableAndWritableCnt < minNumOfRWDataPartitions {
vol.dataPartitions.lastAutoCreateTime = time.Now()
count := vol.calculateExpansionNum()
log.LogInfof("action[autoCreateDataPartitions] vol[%v] count[%v]", vol.Name, count)
c.batchCreateDataPartition(vol, count, false)
}
}
// Calculate the expansion number (the number of data partitions to be allocated to the given volume)
func (vol *Vol) calculateExpansionNum() (count int) {
c := float64(vol.Capacity) * volExpansionRatio * float64(util.GB) / float64(util.DefaultDataPartitionSize)
switch {
case c < minNumOfRWDataPartitions:
count = minNumOfRWDataPartitions
case c > maxNumberOfDataPartitionsForExpansion:
count = maxNumberOfDataPartitionsForExpansion
default:
count = int(c)
}
return
}
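// Worked example for calculateExpansionNum above (symbolic, since
// volExpansionRatio and util.DefaultDataPartitionSize are defined elsewhere):
// c = Capacity(GB) * volExpansionRatio * GB / DefaultDataPartitionSize, and the
// result is clamped to [minNumOfRWDataPartitions, maxNumberOfDataPartitionsForExpansion].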
func (vol *Vol) setAllDataPartitionsToReadOnly() {
vol.dataPartitions.setAllDataPartitionsToReadOnly()
}
func (vol *Vol) totalUsedSpace() uint64 {
return vol.totalUsedSpaceByMeta(false)
}
func (vol *Vol) totalUsedSpaceByMeta(byMeta bool) uint64 {
if proto.IsCold(vol.VolType) || byMeta {
return vol.ebsUsedSpace()
}
return vol.cfsUsedSpace()
}
func (vol *Vol) cfsUsedSpace() uint64 {
return vol.dataPartitions.totalUsedSpace()
}
func (vol *Vol) sendViewCacheToFollower(c *Cluster) {
var err error
log.LogInfof("action[asyncSendPartitionsToFollower]")
metadata := new(RaftCmd)
metadata.Op = opSyncDataPartitionsView
metadata.K = vol.Name
metadata.V = vol.dataPartitions.getDataPartitionResponseCache()
if err = c.submit(metadata); err != nil {
log.LogErrorf("action[asyncSendPartitionsToFollower] error [%v]", err)
}
log.LogInfof("action[asyncSendPartitionsToFollower] finished")
}
func (vol *Vol) ebsUsedSpace() uint64 {
size := uint64(0)
vol.mpsLock.RLock()
defer vol.mpsLock.RUnlock()
for _, pt := range vol.MetaPartitions {
size += pt.dataSize()
}
return size
}
func (vol *Vol) updateViewCache(c *Cluster) {
view := proto.NewVolView(vol.Name, vol.Status, vol.FollowerRead, vol.createTime, vol.CacheTTL, vol.VolType, vol.DeleteLockTime)
view.SetOwner(vol.Owner)
view.SetOSSSecure(vol.OSSAccessKey, vol.OSSSecretKey)
mpViews := vol.getMetaPartitionsView()
view.MetaPartitions = mpViews
mpViewsReply := newSuccessHTTPReply(mpViews)
mpsBody, err := json.Marshal(mpViewsReply)
if err != nil {
log.LogErrorf("action[updateViewCache] failed,vol[%v],err[%v]", vol.Name, err)
return
}
vol.setMpsCache(mpsBody)
// dpResps := vol.dataPartitions.getDataPartitionsView(0)
// view.DataPartitions = dpResps
view.DomainOn = vol.domainOn
viewReply := newSuccessHTTPReply(view)
body, err := json.Marshal(viewReply)
if err != nil {
log.LogErrorf("action[updateViewCache] failed,vol[%v],err[%v]", vol.Name, err)
return
}
vol.setViewCache(body)
}
func (vol *Vol) getMetaPartitionsView() (mpViews []*proto.MetaPartitionView) {
mps := make(map[uint64]*MetaPartition)
vol.mpsLock.RLock()
for key, mp := range vol.MetaPartitions {
mps[key] = mp
}
vol.mpsLock.RUnlock()
mpViews = make([]*proto.MetaPartitionView, 0)
for _, mp := range mps {
mpViews = append(mpViews, getMetaPartitionView(mp))
}
return
}
func (vol *Vol) setMpsCache(body []byte) {
vol.volLock.Lock()
defer vol.volLock.Unlock()
vol.mpsCache = body
}
func (vol *Vol) getMpsCache() []byte {
vol.volLock.RLock()
defer vol.volLock.RUnlock()
return vol.mpsCache
}
func (vol *Vol) setViewCache(body []byte) {
vol.volLock.Lock()
defer vol.volLock.Unlock()
vol.viewCache = body
}
func (vol *Vol) getViewCache() []byte {
vol.volLock.RLock()
defer vol.volLock.RUnlock()
return vol.viewCache
}
func (vol *Vol) deleteDataPartition(c *Cluster, dp *DataPartition) {
var addrs []string
for _, replica := range dp.Replicas {
addrs = append(addrs, replica.Addr)
}
for _, addr := range addrs {
if err := vol.deleteDataPartitionFromDataNode(c, dp.createTaskToDeleteDataPartition(addr)); err != nil {
log.LogErrorf("[deleteDataPartitionFromDataNode] delete data replica from datanode fail, id %d, err %s", dp.PartitionID, err.Error())
}
}
vol.dataPartitions.del(dp)
err := c.syncDeleteDataPartition(dp)
if err != nil {
log.LogErrorf("[deleteDataPartition] delete data partition from store fail, [%d], err: %s", dp.PartitionID, err.Error())
return
}
log.LogInfof("[deleteDataPartition] delete data partition success, [%d]", dp.PartitionID)
}
// Periodically check the volume's status.
// If a volume is marked as deleted, generate the corresponding delete tasks (for its meta partitions and data partitions).
// Once all the meta partitions and data partitions of the volume have been deleted, delete the volume itself.
func (vol *Vol) checkStatus(c *Cluster) {
if !atomic.CompareAndSwapInt32(&vol.VersionMgr.checkStatus, 0, 1) {
return
}
defer func() {
atomic.StoreInt32(&vol.VersionMgr.checkStatus, 0)
if r := recover(); r != nil {
log.LogWarnf("checkStatus occurred panic,err[%v]", r)
WarnBySpecialKey(fmt.Sprintf("%v_%v_scheduling_job_panic", c.Name, ModuleName),
"checkStatus occurred panic")
}
}()
vol.updateViewCache(c)
vol.volLock.Lock()
defer vol.volLock.Unlock()
if vol.Status != proto.VolStatusMarkDelete {
return
}
log.LogInfof("action[volCheckStatus] vol[%v],status[%v]", vol.Name, vol.Status)
metaTasks := vol.getTasksToDeleteMetaPartitions()
dataTasks := vol.getTasksToDeleteDataPartitions()
if len(metaTasks) == 0 && len(dataTasks) == 0 {
vol.deleteVolFromStore(c)
}
go func() {
for _, metaTask := range metaTasks {
vol.deleteMetaPartitionFromMetaNode(c, metaTask)
}
for _, dataTask := range dataTasks {
vol.deleteDataPartitionFromDataNode(c, dataTask)
}
}()
return
}
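// Flow summary for checkStatus above: once a volume is marked deleted, each
// pass issues delete tasks for every remaining meta/data replica via the two
// helpers below; only when both task lists come back empty is the volume
// itself removed from the store through deleteVolFromStore.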
func (vol *Vol) deleteMetaPartitionFromMetaNode(c *Cluster, task *proto.AdminTask) {
mp, err := vol.metaPartition(task.PartitionID)
if err != nil {
return
}
metaNode, err := c.metaNode(task.OperatorAddr)
if err != nil {
return
}
mp.RLock()
_, err = mp.getMetaReplica(task.OperatorAddr)
mp.RUnlock()
if err != nil {
log.LogWarnf("deleteMetaPartitionFromMetaNode (%s) maybe alread been deleted", task.ToString())
return
}
_, err = metaNode.Sender.syncSendAdminTask(task)
if err != nil {
log.LogErrorf("action[deleteMetaPartition] vol[%v],meta partition[%v],err[%v]", mp.volName, mp.PartitionID, err)
return
}
mp.Lock()
mp.removeReplicaByAddr(metaNode.Addr)
mp.removeMissingReplica(metaNode.Addr)
mp.Unlock()
return
}
func (vol *Vol) deleteDataPartitionFromDataNode(c *Cluster, task *proto.AdminTask) (err error) {
dp, err := vol.getDataPartitionByID(task.PartitionID)
if err != nil {
return
}
dataNode, err := c.dataNode(task.OperatorAddr)
if err != nil {
return
}
dp.RLock()
_, ok := dp.hasReplica(task.OperatorAddr)
dp.RUnlock()
if !ok {
log.LogWarnf("deleteDataPartitionFromDataNode task(%s) maybe already executed", task.ToString())
return
}
_, err = dataNode.TaskManager.syncSendAdminTask(task)
if err != nil {
log.LogErrorf("action[deleteDataReplica] vol[%v],data partition[%v],err[%v]", dp.VolName, dp.PartitionID, err)
return
}
dp.Lock()
dp.removeReplicaByAddr(dataNode.Addr)
dp.checkAndRemoveMissReplica(dataNode.Addr)
if err = dp.update("deleteDataReplica", dp.VolName, dp.Peers, dp.Hosts, c); err != nil {
dp.Unlock()
return
}
dp.Unlock()
return
}
func (vol *Vol) deleteVolFromStore(c *Cluster) (err error) {
log.LogWarnf("deleteVolFromStore vol %v", vol.Name)
if err = c.syncDeleteVol(vol); err != nil {
return
}
// delete the metadata of the meta and data partitions first
vol.deleteDataPartitionsFromStore(c)
vol.deleteMetaPartitionsFromStore(c)
// then delete the volume
c.deleteVol(vol.Name)
c.volStatInfo.Delete(vol.Name)
c.DelBucketLifecycle(vol.Name)
return
}
func (vol *Vol) deleteMetaPartitionsFromStore(c *Cluster) {
vol.mpsLock.RLock()
defer vol.mpsLock.RUnlock()
for _, mp := range vol.MetaPartitions {
c.syncDeleteMetaPartition(mp)
}
return
}
func (vol *Vol) deleteDataPartitionsFromStore(c *Cluster) {
vol.dataPartitions.RLock()
defer vol.dataPartitions.RUnlock()
for _, dp := range vol.dataPartitions.partitions {
c.syncDeleteDataPartition(dp)
}
}
func (vol *Vol) getTasksToDeleteMetaPartitions() (tasks []*proto.AdminTask) {
vol.mpsLock.RLock()
defer vol.mpsLock.RUnlock()
tasks = make([]*proto.AdminTask, 0)
for _, mp := range vol.MetaPartitions {
log.LogDebugf("get delete task from vol(%s) mp(%d)", vol.Name, mp.PartitionID)
for _, replica := range mp.Replicas {
log.LogDebugf("get delete task from vol(%s) mp(%d),replica(%v)", vol.Name, mp.PartitionID, replica.Addr)
tasks = append(tasks, replica.createTaskToDeleteReplica(mp.PartitionID))
}
}
return
}
func (vol *Vol) getTasksToDeleteDataPartitions() (tasks []*proto.AdminTask) {
tasks = make([]*proto.AdminTask, 0)
vol.dataPartitions.RLock()
defer vol.dataPartitions.RUnlock()
for _, dp := range vol.dataPartitions.partitions {
for _, replica := range dp.Replicas {
tasks = append(tasks, dp.createTaskToDeleteDataPartition(replica.Addr))
}
}
return
}
func (vol *Vol) getDataPartitionsCount() (count int) {
vol.volLock.RLock()
count = len(vol.dataPartitions.partitionMap)
vol.volLock.RUnlock()
return
}
func (vol *Vol) String() string {
return fmt.Sprintf("name[%v],dpNum[%v],mpNum[%v],cap[%v],status[%v]",
vol.Name, vol.dpReplicaNum, vol.mpReplicaNum, vol.Capacity, vol.Status)
}
func (vol *Vol) doSplitMetaPartition(c *Cluster, mp *MetaPartition, end uint64, metaPartitionInodeIdStep uint64, ignoreNoLeader bool) (nextMp *MetaPartition, err error) {
mp.Lock()
defer mp.Unlock()
if err = mp.canSplit(end, metaPartitionInodeIdStep, ignoreNoLeader); err != nil {
return
}
log.LogWarnf("action[splitMetaPartition],partition[%v],start[%v],end[%v],new end[%v]", mp.PartitionID, mp.Start, mp.End, end)
cmdMap := make(map[string]*RaftCmd, 0)
oldEnd := mp.End
mp.End = end
updateMpRaftCmd, err := c.buildMetaPartitionRaftCmd(opSyncUpdateMetaPartition, mp)
if err != nil {
return
}
cmdMap[updateMpRaftCmd.K] = updateMpRaftCmd
if nextMp, err = vol.doCreateMetaPartition(c, mp.End+1, defaultMaxMetaPartitionInodeID); err != nil {
Warn(c.Name, fmt.Sprintf("action[updateEnd] clusterID[%v] partitionID[%v] create meta partition err[%v]",
c.Name, mp.PartitionID, err))
log.LogErrorf("action[updateEnd] partitionID[%v] err[%v]", mp.PartitionID, err)
return
}
addMpRaftCmd, err := c.buildMetaPartitionRaftCmd(opSyncAddMetaPartition, nextMp)
if err != nil {
return
}
cmdMap[addMpRaftCmd.K] = addMpRaftCmd
if err = c.syncBatchCommitCmd(cmdMap); err != nil {
mp.End = oldEnd
return nil, errors.NewError(err)
}
mp.updateInodeIDRangeForAllReplicas()
mp.addUpdateMetaReplicaTask(c)
return
}
func (vol *Vol) splitMetaPartition(c *Cluster, mp *MetaPartition, end uint64, metaPartitionInodeIdStep uint64, ignoreNoLeader bool) (err error) {
if c.DisableAutoAllocate {
err = errors.NewErrorf("cluster auto allocate is disable")
return
}
if vol.Forbidden {
err = errors.NewErrorf("volume %v is forbidden", vol.Name)
return
}
vol.createMpMutex.Lock()
defer vol.createMpMutex.Unlock()
maxPartitionID := vol.maxPartitionID()
if maxPartitionID != mp.PartitionID {
err = fmt.Errorf("mp[%v] is not the last meta partition[%v]", mp.PartitionID, maxPartitionID)
return
}
nextMp, err := vol.doSplitMetaPartition(c, mp, end, metaPartitionInodeIdStep, ignoreNoLeader)
if err != nil {
return
}
vol.addMetaPartition(nextMp)
log.LogWarnf("action[splitMetaPartition],next partition[%v],start[%v],end[%v]", nextMp.PartitionID, nextMp.Start, nextMp.End)
return
}
func (vol *Vol) createMetaPartition(c *Cluster, start, end uint64) (err error) {
var mp *MetaPartition
if mp, err = vol.doCreateMetaPartition(c, start, end); err != nil {
return
}
if err = c.syncAddMetaPartition(mp); err != nil {
return errors.NewError(err)
}
vol.addMetaPartition(mp)
return
}
func (vol *Vol) doCreateMetaPartition(c *Cluster, start, end uint64) (mp *MetaPartition, err error) {
var (
hosts []string
partitionID uint64
peers []proto.Peer
wg sync.WaitGroup
)
errChannel := make(chan error, vol.mpReplicaNum)
if c.isFaultDomain(vol) {
if hosts, peers, err = c.getHostFromDomainZone(vol.domainId, TypeMetaPartition, vol.mpReplicaNum); err != nil {
log.LogErrorf("action[doCreateMetaPartition] getHostFromDomainZone err[%v]", err)
return nil, errors.NewError(err)
}
} else {
var excludeZone []string
zoneNum := c.decideZoneNum(vol.crossZone)
if hosts, peers, err = c.getHostFromNormalZone(TypeMetaPartition, excludeZone, nil, nil, int(vol.mpReplicaNum), zoneNum, vol.zoneName); err != nil {
log.LogErrorf("action[doCreateMetaPartition] getHostFromNormalZone err[%v]", err)
return nil, errors.NewError(err)
}
}
log.LogInfof("target meta hosts:%v,peers:%v", hosts, peers)
if partitionID, err = c.idAlloc.allocateMetaPartitionID(); err != nil {
return nil, errors.NewError(err)
}
mp = newMetaPartition(partitionID, start, end, vol.mpReplicaNum, vol.Name, vol.ID, vol.VersionMgr.getLatestVer())
mp.setHosts(hosts)
mp.setPeers(peers)
for _, host := range hosts {
wg.Add(1)
go func(host string) {
defer wg.Done()
// use a goroutine-local err to avoid racing on the outer err; failures are reported via errChannel
if err := c.syncCreateMetaPartitionToMetaNode(host, mp); err != nil {
errChannel <- err
return
}
mp.Lock()
defer mp.Unlock()
if err := mp.afterCreation(host, c); err != nil {
errChannel <- err
}
}(host)
}
wg.Wait()
select {
case err = <-errChannel:
for _, host := range hosts {
wg.Add(1)
go func(host string) {
defer wg.Done()
mr, err := mp.getMetaReplica(host)
if err != nil {
return
}
task := mr.createTaskToDeleteReplica(mp.PartitionID)
tasks := make([]*proto.AdminTask, 0)
tasks = append(tasks, task)
c.addMetaNodeTasks(tasks)
}(host)
}
wg.Wait()
return nil, errors.NewError(err)
default:
mp.Status = proto.ReadWrite
}
log.LogInfof("action[doCreateMetaPartition] success,volName[%v],partition[%v],start[%v],end[%v]", vol.Name, partitionID, start, end)
return
}
func setVolFromArgs(args *VolVarargs, vol *Vol) {
vol.zoneName = args.zoneName
vol.Capacity = args.capacity
vol.DeleteLockTime = args.deleteLockTime
vol.FollowerRead = args.followerRead
vol.authenticate = args.authenticate
vol.enablePosixAcl = args.enablePosixAcl
vol.DpReadOnlyWhenVolFull = args.dpReadOnlyWhenVolFull
vol.enableQuota = args.enableQuota
vol.enableTransaction = args.enableTransaction
vol.txTimeout = args.txTimeout
vol.txConflictRetryNum = args.txConflictRetryNum
vol.txConflictRetryInterval = args.txConflictRetryInterval
vol.txOpLimit = args.txOpLimit
vol.dpReplicaNum = args.dpReplicaNum
if proto.IsCold(vol.VolType) {
coldArgs := args.coldArgs
vol.CacheLRUInterval = coldArgs.cacheLRUInterval
vol.CacheLowWater = coldArgs.cacheLowWater
vol.CacheHighWater = coldArgs.cacheHighWater
vol.CacheTTL = coldArgs.cacheTtl
vol.CacheThreshold = coldArgs.cacheThreshold
vol.CacheAction = coldArgs.cacheAction
vol.CacheRule = coldArgs.cacheRule
vol.CacheCapacity = coldArgs.cacheCap
vol.EbsBlkSize = coldArgs.objBlockSize
}
vol.description = args.description
vol.dpSelectorName = args.dpSelectorName
vol.dpSelectorParm = args.dpSelectorParm
}
func getVolVarargs(vol *Vol) *VolVarargs {
args := &coldVolArgs{
objBlockSize: vol.EbsBlkSize,
cacheCap: vol.CacheCapacity,
cacheAction: vol.CacheAction,
cacheThreshold: vol.CacheThreshold,
cacheTtl: vol.CacheTTL,
cacheHighWater: vol.CacheHighWater,
cacheLowWater: vol.CacheLowWater,
cacheLRUInterval: vol.CacheLRUInterval,
cacheRule: vol.CacheRule,
}
return &VolVarargs{
zoneName: vol.zoneName,
description: vol.description,
capacity: vol.Capacity,
deleteLockTime: vol.DeleteLockTime,
followerRead: vol.FollowerRead,
authenticate: vol.authenticate,
dpSelectorName: vol.dpSelectorName,
dpSelectorParm: vol.dpSelectorParm,
enablePosixAcl: vol.enablePosixAcl,
enableQuota: vol.enableQuota,
dpReplicaNum: vol.dpReplicaNum,
enableTransaction: vol.enableTransaction,
txTimeout: vol.txTimeout,
txConflictRetryNum: vol.txConflictRetryNum,
txConflictRetryInterval: vol.txConflictRetryInterval,
txOpLimit: vol.txOpLimit,
coldArgs: args,
dpReadOnlyWhenVolFull: vol.DpReadOnlyWhenVolFull,
}
}
func (vol *Vol) initQuotaManager(c *Cluster) {
vol.quotaManager = &MasterQuotaManager{
MpQuotaInfoMap: make(map[uint64][]*proto.QuotaReportInfo),
IdQuotaInfoMap: make(map[uint32]*proto.QuotaInfo),
c: c,
vol: vol,
}
}
func (vol *Vol) loadQuotaManager(c *Cluster) (err error) {
vol.quotaManager = &MasterQuotaManager{
MpQuotaInfoMap: make(map[uint64][]*proto.QuotaReportInfo),
IdQuotaInfoMap: make(map[uint32]*proto.QuotaInfo),
c: c,
vol: vol,
}
result, err := c.fsm.store.SeekForPrefix([]byte(quotaPrefix + strconv.FormatUint(vol.ID, 10) + keySeparator))
if err != nil {
err = fmt.Errorf("loadQuotaManager get quota failed, err [%v]", err)
return err
}
for _, value := range result {
quotaInfo := &proto.QuotaInfo{}
if err = json.Unmarshal(value, quotaInfo); err != nil {
log.LogErrorf("loadQuotaManager Unmarshal fail err [%v]", err)
return err
}
log.LogDebugf("loadQuotaManager info [%v]", quotaInfo)
if vol.Name != quotaInfo.VolName {
panic(fmt.Sprintf("vol name do not match vol name [%v], quotaInfo vol name [%v]", vol.Name, quotaInfo.VolName))
}
vol.quotaManager.IdQuotaInfoMap[quotaInfo.QuotaId] = quotaInfo
}
return err
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"encoding/json"
"fmt"
"io"
"math"
"net/http"
"os"
"path"
"strconv"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
// APIResponse defines the structure of the response to an HTTP request
type APIResponse struct {
Code int `json:"code"`
Msg string `json:"msg"`
Data interface{} `json:"data,omitempty"`
}
// NewAPIResponse returns a new API response.
func NewAPIResponse(code int, msg string) *APIResponse {
return &APIResponse{
Code: code,
Msg: msg,
}
}
// Marshal is a wrapper function of json.Marshal
func (api *APIResponse) Marshal() ([]byte, error) {
return json.Marshal(api)
}
// register the APIs
func (m *MetaNode) registerAPIHandler() (err error) {
http.HandleFunc("/getPartitions", m.getPartitionsHandler)
http.HandleFunc("/getPartitionById", m.getPartitionByIDHandler)
http.HandleFunc("/getLeaderPartitions", m.getLeaderPartitionsHandler)
http.HandleFunc("/getInode", m.getInodeHandler)
http.HandleFunc("/getSplitKey", m.getSplitKeyHandler)
http.HandleFunc("/getExtentsByInode", m.getExtentsByInodeHandler)
http.HandleFunc("/getEbsExtentsByInode", m.getEbsExtentsByInodeHandler)
// get all inodes of the partitionID
http.HandleFunc("/getAllInodes", m.getAllInodesHandler)
// get dentry information
http.HandleFunc("/getDentry", m.getDentryHandler)
http.HandleFunc("/getDirectory", m.getDirectoryHandler)
http.HandleFunc("/getAllDentry", m.getAllDentriesHandler)
http.HandleFunc("/getAllTxInfo", m.getAllTxHandler)
http.HandleFunc("/getParams", m.getParamsHandler)
http.HandleFunc("/getSmuxStat", m.getSmuxStatHandler)
http.HandleFunc("/getRaftStatus", m.getRaftStatusHandler)
http.HandleFunc("/genClusterVersionFile", m.genClusterVersionFileHandler)
http.HandleFunc("/getInodeSnapshot", m.getInodeSnapshotHandler)
http.HandleFunc("/getDentrySnapshot", m.getDentrySnapshotHandler)
// get tx information
http.HandleFunc("/getTx", m.getTxHandler)
return
}
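// exampleQueryPartitionByID is an illustrative sketch (not part of the original
// code) of how a client could call one of the handlers registered above and
// decode the generic APIResponse envelope; the listen address below is a
// placeholder assumption, not the node's real profiling port.
func exampleQueryPartitionByID() {
resp, err := http.Get("http://127.0.0.1:17220/getPartitionById?pid=1") // placeholder address
if err != nil {
log.LogErrorf("[exampleQueryPartitionByID] request failed: %v", err)
return
}
defer resp.Body.Close()
reply := &APIResponse{}
if err := json.NewDecoder(resp.Body).Decode(reply); err != nil {
log.LogErrorf("[exampleQueryPartitionByID] decode failed: %v", err)
return
}
log.LogInfof("[exampleQueryPartitionByID] code(%v) msg(%v) data(%v)", reply.Code, reply.Msg, reply.Data)
}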
func (m *MetaNode) getParamsHandler(w http.ResponseWriter,
r *http.Request) {
resp := NewAPIResponse(http.StatusOK, http.StatusText(http.StatusOK))
params := make(map[string]interface{})
params[metaNodeDeleteBatchCountKey] = DeleteBatchCount()
resp.Data = params
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getPartitionsHandler] response %s", err)
}
}
func (m *MetaNode) getSmuxStatHandler(w http.ResponseWriter,
r *http.Request) {
resp := NewAPIResponse(http.StatusOK, http.StatusText(http.StatusOK))
resp.Data = smuxPool.GetStat()
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getSmuxStatHandler] response %s", err)
}
}
func (m *MetaNode) getPartitionsHandler(w http.ResponseWriter,
r *http.Request) {
resp := NewAPIResponse(http.StatusOK, http.StatusText(http.StatusOK))
resp.Data = m.metadataManager
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getPartitionsHandler] response %s", err)
}
}
func (m *MetaNode) getPartitionByIDHandler(w http.ResponseWriter, r *http.Request) {
r.ParseForm()
resp := NewAPIResponse(http.StatusBadRequest, "")
defer func() {
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getPartitionByIDHandler] response %s", err)
}
}()
pid, err := strconv.ParseUint(r.FormValue("pid"), 10, 64)
if err != nil {
resp.Msg = err.Error()
return
}
mp, err := m.metadataManager.GetPartition(pid)
if err != nil {
resp.Code = http.StatusNotFound
resp.Msg = err.Error()
return
}
msg := make(map[string]interface{})
leader, _ := mp.IsLeader()
_, leaderTerm := mp.LeaderTerm()
msg["leaderAddr"] = leader
msg["leader_term"] = leaderTerm
conf := mp.GetBaseConfig()
msg["partition_id"] = conf.PartitionId
msg["partition_type"] = conf.PartitionType
msg["vol_name"] = conf.VolName
msg["start"] = conf.Start
msg["end"] = conf.End
msg["peers"] = conf.Peers
msg["nodeId"] = conf.NodeId
msg["cursor"] = conf.Cursor
resp.Data = msg
resp.Code = http.StatusOK
resp.Msg = http.StatusText(http.StatusOK)
}
func (m *MetaNode) getLeaderPartitionsHandler(w http.ResponseWriter, r *http.Request) {
resp := NewAPIResponse(http.StatusOK, http.StatusText(http.StatusOK))
mps := m.metadataManager.GetLeaderPartitions()
resp.Data = mps
data, err := resp.Marshal()
if err != nil {
log.LogErrorf("json marshal error:%v", err)
resp.Code = http.StatusInternalServerError
resp.Msg = err.Error()
return
}
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getPartitionsHandler] response %s", err)
resp.Code = http.StatusInternalServerError
resp.Msg = err.Error()
}
}
func (m *MetaNode) getAllInodesHandler(w http.ResponseWriter, r *http.Request) {
var err error
defer func() {
if err != nil {
msg := fmt.Sprintf("[getAllInodesHandler] err(%v)", err)
if _, e := w.Write([]byte(msg)); e != nil {
log.LogErrorf("[getAllInodesHandler] failed to write response: err(%v) msg(%v)", e, msg)
}
}
}()
if err = r.ParseForm(); err != nil {
return
}
id, err := strconv.ParseUint(r.FormValue("pid"), 10, 64)
if err != nil {
return
}
mp, err := m.metadataManager.GetPartition(id)
if err != nil {
return
}
verSeq, err := m.getRealVerSeq(w, r)
if err != nil {
return
}
var inode *Inode
f := func(i BtreeItem) bool {
var (
data []byte
e error
)
if inode != nil {
if _, e = w.Write([]byte("\n")); e != nil {
log.LogErrorf("[getAllInodesHandler] failed to write response: %v", e)
return false
}
}
inode, _ = i.(*Inode).getInoByVer(verSeq, false)
if inode == nil {
return true
}
if data, e = inode.MarshalToJSON(); e != nil {
log.LogErrorf("[getAllInodesHandler] failed to marshal to json: %v", e)
return false
}
if _, e = w.Write(data); e != nil {
log.LogErrorf("[getAllInodesHandler] failed to write response: %v", e)
return false
}
return true
}
mp.GetInodeTree().Ascend(f)
}
func (m *MetaNode) getSplitKeyHandler(w http.ResponseWriter, r *http.Request) {
r.ParseForm()
log.LogDebugf("getSplitKeyHandler")
resp := NewAPIResponse(http.StatusBadRequest, "")
defer func() {
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getSplitKeyHandler] response %s", err)
}
}()
pid, err := strconv.ParseUint(r.FormValue("pid"), 10, 64)
if err != nil {
resp.Msg = err.Error()
return
}
log.LogDebugf("getSplitKeyHandler")
id, err := strconv.ParseUint(r.FormValue("ino"), 10, 64)
if err != nil {
resp.Msg = err.Error()
return
}
log.LogDebugf("getSplitKeyHandler")
verSeq, err := m.getRealVerSeq(w, r)
if err != nil {
resp.Msg = err.Error()
return
}
log.LogDebugf("getSplitKeyHandler")
verAll, _ := strconv.ParseBool(r.FormValue("verAll"))
mp, err := m.metadataManager.GetPartition(pid)
if err != nil {
resp.Code = http.StatusNotFound
resp.Msg = err.Error()
return
}
log.LogDebugf("getSplitKeyHandler")
req := &InodeGetSplitReq{
PartitionID: pid,
Inode: id,
VerSeq: verSeq,
VerAll: verAll,
}
log.LogDebugf("getSplitKeyHandler")
p := &Packet{}
err = mp.InodeGetSplitEk(req, p)
if err != nil {
resp.Code = http.StatusInternalServerError
resp.Msg = err.Error()
return
}
log.LogDebugf("getSplitKeyHandler")
resp.Code = http.StatusSeeOther
resp.Msg = p.GetResultMsg()
if len(p.Data) > 0 {
resp.Data = json.RawMessage(p.Data)
log.LogDebugf("getSplitKeyHandler data %v", resp.Data)
} else {
log.LogDebugf("getSplitKeyHandler")
}
return
}
func (m *MetaNode) getInodeHandler(w http.ResponseWriter, r *http.Request) {
r.ParseForm()
resp := NewAPIResponse(http.StatusBadRequest, "")
defer func() {
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getInodeHandler] response %s", err)
}
}()
pid, err := strconv.ParseUint(r.FormValue("pid"), 10, 64)
if err != nil {
resp.Msg = err.Error()
return
}
id, err := strconv.ParseUint(r.FormValue("ino"), 10, 64)
if err != nil {
resp.Msg = err.Error()
return
}
verSeq, err := m.getRealVerSeq(w, r)
if err != nil {
resp.Msg = err.Error()
return
}
verAll, _ := strconv.ParseBool(r.FormValue("verAll"))
mp, err := m.metadataManager.GetPartition(pid)
if err != nil {
resp.Code = http.StatusNotFound
resp.Msg = err.Error()
return
}
req := &InodeGetReq{
PartitionID: pid,
Inode: id,
VerSeq: verSeq,
VerAll: verAll,
}
p := &Packet{}
err = mp.InodeGet(req, p)
if err != nil {
resp.Code = http.StatusInternalServerError
resp.Msg = err.Error()
return
}
resp.Code = http.StatusSeeOther
resp.Msg = p.GetResultMsg()
if len(p.Data) > 0 {
resp.Data = json.RawMessage(p.Data)
}
return
}
func (m *MetaNode) getRaftStatusHandler(w http.ResponseWriter, r *http.Request) {
const (
paramRaftID = "id"
)
resp := NewAPIResponse(http.StatusOK, http.StatusText(http.StatusOK))
defer func() {
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getRaftStatusHandler] response %s", err)
}
}()
raftID, err := strconv.ParseUint(r.FormValue(paramRaftID), 10, 64)
if err != nil {
err = fmt.Errorf("parse param %v fail: %v", paramRaftID, err)
resp.Msg = err.Error()
resp.Code = http.StatusBadRequest
return
}
raftStatus := m.raftStore.RaftStatus(raftID)
resp.Data = raftStatus
}
func (m *MetaNode) getEbsExtentsByInodeHandler(w http.ResponseWriter,
r *http.Request) {
r.ParseForm()
resp := NewAPIResponse(http.StatusBadRequest, "")
defer func() {
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getEbsExtentsByInodeHandler] response %s", err)
}
}()
pid, err := strconv.ParseUint(r.FormValue("pid"), 10, 64)
if err != nil {
resp.Msg = err.Error()
return
}
id, err := strconv.ParseUint(r.FormValue("ino"), 10, 64)
if err != nil {
resp.Msg = err.Error()
return
}
mp, err := m.metadataManager.GetPartition(pid)
if err != nil {
resp.Code = http.StatusNotFound
resp.Msg = err.Error()
return
}
req := &proto.GetExtentsRequest{
PartitionID: pid,
Inode: id,
}
p := &Packet{}
if err = mp.ObjExtentsList(req, p); err != nil {
resp.Code = http.StatusInternalServerError
resp.Msg = err.Error()
return
}
resp.Code = http.StatusSeeOther
resp.Msg = p.GetResultMsg()
if len(p.Data) > 0 {
resp.Data = json.RawMessage(p.Data)
}
return
}
func (m *MetaNode) getExtentsByInodeHandler(w http.ResponseWriter,
r *http.Request) {
r.ParseForm()
resp := NewAPIResponse(http.StatusBadRequest, "")
defer func() {
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getExtentsByInodeHandler] response %s", err)
}
}()
pid, err := strconv.ParseUint(r.FormValue("pid"), 10, 64)
if err != nil {
resp.Msg = err.Error()
return
}
id, err := strconv.ParseUint(r.FormValue("ino"), 10, 64)
if err != nil {
resp.Msg = err.Error()
return
}
verSeq, err := m.getRealVerSeq(w, r)
if err != nil {
resp.Msg = err.Error()
return
}
verAll, _ := strconv.ParseBool(r.FormValue("verAll"))
mp, err := m.metadataManager.GetPartition(pid)
if err != nil {
resp.Code = http.StatusNotFound
resp.Msg = err.Error()
return
}
req := &proto.GetExtentsRequest{
PartitionID: pid,
Inode: id,
VerSeq: uint64(verSeq),
VerAll: verAll,
}
p := &Packet{}
if err = mp.ExtentsList(req, p); err != nil {
resp.Code = http.StatusInternalServerError
resp.Msg = err.Error()
return
}
resp.Code = http.StatusSeeOther
resp.Msg = p.GetResultMsg()
if len(p.Data) > 0 {
resp.Data = json.RawMessage(p.Data)
}
return
}
func (m *MetaNode) getDentryHandler(w http.ResponseWriter, r *http.Request) {
r.ParseForm()
name := r.FormValue("name")
resp := NewAPIResponse(http.StatusBadRequest, "")
defer func() {
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getDentryHandler] response %s", err)
}
}()
var (
pid uint64
pIno uint64
err error
)
if pid, err = strconv.ParseUint(r.FormValue("pid"), 10, 64); err == nil {
pIno, err = strconv.ParseUint(r.FormValue("parentIno"), 10, 64)
}
if err != nil {
resp.Msg = err.Error()
return
}
verSeq, err := m.getRealVerSeq(w, r)
if err != nil {
resp.Msg = err.Error()
return
}
verAll, _ := strconv.ParseBool(r.FormValue("verAll"))
mp, err := m.metadataManager.GetPartition(pid)
if err != nil {
resp.Code = http.StatusNotFound
resp.Msg = err.Error()
return
}
req := &LookupReq{
PartitionID: pid,
ParentID: pIno,
Name: name,
VerSeq: verSeq,
VerAll: verAll,
}
p := &Packet{}
if err = mp.Lookup(req, p); err != nil {
resp.Code = http.StatusSeeOther
resp.Msg = err.Error()
return
}
resp.Code = http.StatusSeeOther
resp.Msg = p.GetResultMsg()
if len(p.Data) > 0 {
resp.Data = json.RawMessage(p.Data)
}
return
}
func (m *MetaNode) getTxHandler(w http.ResponseWriter, r *http.Request) {
r.ParseForm()
resp := NewAPIResponse(http.StatusBadRequest, "")
defer func() {
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getTxHandler] response %s", err)
}
}()
var (
pid uint64
txId string
err error
)
if pid, err = strconv.ParseUint(r.FormValue("pid"), 10, 64); err == nil {
if txId = r.FormValue("txId"); txId == "" {
err = fmt.Errorf("no txId")
}
}
if err != nil {
resp.Msg = err.Error()
return
}
mp, err := m.metadataManager.GetPartition(pid)
if err != nil {
resp.Code = http.StatusNotFound
resp.Msg = err.Error()
return
}
req := &proto.TxGetInfoRequest{
Pid: pid,
TxID: txId,
}
p := &Packet{}
if err = mp.TxGetInfo(req, p); err != nil {
resp.Code = http.StatusSeeOther
resp.Msg = err.Error()
return
}
resp.Code = http.StatusSeeOther
resp.Msg = p.GetResultMsg()
if len(p.Data) > 0 {
resp.Data = json.RawMessage(p.Data)
}
return
}
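// getRealVerSeq parses the optional "verSeq" form value. An explicit verSeq of 0 is mapped to math.MaxUint64,
// which denotes the initial (oldest) snapshot version; if the parameter is absent, 0 (the latest version) is returned.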
func (m *MetaNode) getRealVerSeq(w http.ResponseWriter, r *http.Request) (verSeq uint64, err error) {
if r.FormValue("verSeq") != "" {
var ver int64
if ver, err = strconv.ParseInt(r.FormValue("verSeq"), 10, 64); err != nil {
return
}
verSeq = uint64(ver)
if verSeq == 0 {
verSeq = math.MaxUint64
}
}
return
}
func (m *MetaNode) getAllDentriesHandler(w http.ResponseWriter, r *http.Request) {
r.ParseForm()
resp := NewAPIResponse(http.StatusSeeOther, "")
shouldSkip := false
defer func() {
if !shouldSkip {
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getAllDentriesHandler] response %s", err)
}
}
}()
pid, err := strconv.ParseUint(r.FormValue("pid"), 10, 64)
if err != nil {
resp.Code = http.StatusBadRequest
resp.Msg = err.Error()
return
}
mp, err := m.metadataManager.GetPartition(pid)
if err != nil {
resp.Code = http.StatusNotFound
resp.Msg = err.Error()
return
}
verSeq, err := m.getRealVerSeq(w, r)
if err != nil {
resp.Msg = err.Error()
return
}
buff := bytes.NewBufferString(`{"code": 200, "msg": "OK", "data":[`)
if _, err := w.Write(buff.Bytes()); err != nil {
return
}
buff.Reset()
var (
val []byte
delimiter = []byte{',', '\n'}
isFirst = true
)
mp.GetDentryTree().Ascend(func(i BtreeItem) bool {
den, _ := i.(*Dentry).getDentryFromVerList(verSeq, false)
if den == nil || den.isDeleted() {
return true
}
if !isFirst {
if _, err = w.Write(delimiter); err != nil {
return false
}
} else {
isFirst = false
}
val, err = json.Marshal(den)
if err != nil {
w.WriteHeader(http.StatusInternalServerError)
w.Write([]byte(err.Error()))
return false
}
if _, err = w.Write(val); err != nil {
return false
}
return true
})
shouldSkip = true
buff.WriteString(`]}`)
if _, err = w.Write(buff.Bytes()); err != nil {
log.LogErrorf("[getAllDentriesHandler] response %s", err)
}
return
}
func (m *MetaNode) getAllTxHandler(w http.ResponseWriter, r *http.Request) {
r.ParseForm()
resp := NewAPIResponse(http.StatusOK, "")
shouldSkip := false
defer func() {
if !shouldSkip {
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getAllTxHandler] response %s", err)
}
}
}()
pid, err := strconv.ParseUint(r.FormValue("pid"), 10, 64)
if err != nil {
resp.Code = http.StatusBadRequest
resp.Msg = err.Error()
return
}
mp, err := m.metadataManager.GetPartition(pid)
if err != nil {
resp.Code = http.StatusNotFound
resp.Msg = err.Error()
return
}
buff := bytes.NewBufferString(`{"code": 200, "msg": "OK", "data":[`)
if _, err := w.Write(buff.Bytes()); err != nil {
return
}
buff.Reset()
var (
val []byte
delimiter = []byte{',', '\n'}
isFirst = true
)
f := func(i BtreeItem) bool {
if !isFirst {
if _, err = w.Write(delimiter); err != nil {
return false
}
} else {
isFirst = false
}
if ino, ok := i.(*TxRollbackInode); ok {
_, err = w.Write([]byte(ino.ToString()))
if err != nil {
return false
}
return true
}
if den, ok := i.(*TxRollbackDentry); ok {
_, err = w.Write([]byte(den.ToString()))
if err != nil {
return false
}
return true
}
val, err = json.Marshal(i)
if err != nil {
w.WriteHeader(http.StatusInternalServerError)
w.Write([]byte(err.Error()))
return false
}
if _, err = w.Write(val); err != nil {
return false
}
return true
}
txTree, rbInoTree, rbDenTree := mp.TxGetTree()
txTree.Ascend(f)
rbInoTree.Ascend(f)
rbDenTree.Ascend(f)
shouldSkip = true
buff.WriteString(`]}`)
if _, err = w.Write(buff.Bytes()); err != nil {
log.LogErrorf("[getAllTxHandler] response %s", err)
}
return
}
func (m *MetaNode) getDirectoryHandler(w http.ResponseWriter, r *http.Request) {
resp := NewAPIResponse(http.StatusBadRequest, "")
defer func() {
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[getDirectoryHandler] response %s", err)
}
}()
pid, err := strconv.ParseUint(r.FormValue("pid"), 10, 64)
if err != nil {
resp.Msg = err.Error()
return
}
pIno, err := strconv.ParseUint(r.FormValue("parentIno"), 10, 64)
if err != nil {
resp.Msg = err.Error()
return
}
verSeq, err := m.getRealVerSeq(w, r)
if err != nil {
resp.Msg = err.Error()
return
}
mp, err := m.metadataManager.GetPartition(pid)
if err != nil {
resp.Code = http.StatusNotFound
resp.Msg = err.Error()
return
}
req := ReadDirReq{
ParentID: pIno,
VerSeq: verSeq,
}
p := &Packet{}
if err = mp.ReadDir(&req, p); err != nil {
resp.Code = http.StatusInternalServerError
resp.Msg = err.Error()
return
}
resp.Code = http.StatusSeeOther
resp.Msg = p.GetResultMsg()
if len(p.Data) > 0 {
resp.Data = json.RawMessage(p.Data)
}
return
}
func (m *MetaNode) genClusterVersionFileHandler(w http.ResponseWriter, r *http.Request) {
r.ParseForm()
resp := NewAPIResponse(http.StatusOK, "Generate cluster version file success")
defer func() {
data, _ := resp.Marshal()
if _, err := w.Write(data); err != nil {
log.LogErrorf("[genClusterVersionFileHandler] response %s", err)
}
}()
paths := make([]string, 0)
paths = append(paths, m.metadataDir, m.raftDir)
for _, p := range paths {
if _, err := os.Stat(path.Join(p, config.ClusterVersionFile)); err == nil || os.IsExist(err) {
resp.Code = http.StatusCreated
resp.Msg = "Cluster version file already exists in " + p
return
}
}
for _, p := range paths {
if err := config.CheckOrStoreClusterUuid(p, m.clusterUuid, true); err != nil {
resp.Code = http.StatusInternalServerError
resp.Msg = "Failed to create cluster version file in " + p
return
}
}
return
}
func (m *MetaNode) getInodeSnapshotHandler(w http.ResponseWriter, r *http.Request) {
m.getSnapshotHandler(w, r, inodeFile)
}
func (m *MetaNode) getDentrySnapshotHandler(w http.ResponseWriter, r *http.Request) {
m.getSnapshotHandler(w, r, dentryFile)
}
func (m *MetaNode) getSnapshotHandler(w http.ResponseWriter, r *http.Request, file string) {
var err error
defer func() {
if err != nil {
msg := fmt.Sprintf("[getInodeSnapshotHandler] err(%v)", err)
log.LogErrorf("%s", msg)
if _, e := w.Write([]byte(msg)); e != nil {
log.LogErrorf("[getInodeSnapshotHandler] failed to write response: err(%v) msg(%v)", e, msg)
}
}
}()
if err = r.ParseForm(); err != nil {
return
}
id, err := strconv.ParseUint(r.FormValue("pid"), 10, 64)
if err != nil {
return
}
mp, err := m.metadataManager.GetPartition(id)
if err != nil {
return
}
filename := path.Join(mp.GetBaseConfig().RootDir, snapshotDir, file)
if _, err = os.Stat(filename); err != nil {
err = errors.NewErrorf("[getInodeSnapshotHandler] Stat: %s", err.Error())
return
}
fp, err := os.OpenFile(filename, os.O_RDONLY, 0o644)
if err != nil {
err = errors.NewErrorf("[getInodeSnapshotHandler] OpenFile: %s", err.Error())
return
}
defer fp.Close()
_, err = io.Copy(w, fp)
if err != nil {
err = errors.NewErrorf("[getInodeSnapshotHandler] copy: %s", err.Error())
return
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"sync"
"github.com/cubefs/cubefs/util/btree"
)
const defaultBTreeDegree = 32
type (
// BtreeItem type alias google btree Item
BtreeItem = btree.Item
)
// BTree is the wrapper of Google's btree.
type BTree struct {
sync.RWMutex
tree *btree.BTree
}
// NewBtree creates a new btree.
func NewBtree() *BTree {
return &BTree{
tree: btree.New(defaultBTreeDegree),
}
}
// Get returns the object of the given key in the btree.
func (b *BTree) Get(key BtreeItem) (item BtreeItem) {
b.RLock()
item = b.tree.Get(key)
b.RUnlock()
return
}
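// CopyGet returns the item of the given key via the underlying btree's CopyGet, holding the write lock instead of the read lock.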
func (b *BTree) CopyGet(key BtreeItem) (item BtreeItem) {
b.Lock()
item = b.tree.CopyGet(key)
b.Unlock()
return
}
// Find searches for the given key in the btree.
func (b *BTree) Find(key BtreeItem, fn func(i BtreeItem)) {
b.RLock()
item := b.tree.Get(key)
b.RUnlock()
if item == nil {
return
}
fn(item)
}
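// CopyFind is like Find but holds the write lock and looks the item up with the underlying btree's CopyGet before invoking fn.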
func (b *BTree) CopyFind(key BtreeItem, fn func(i BtreeItem)) {
b.Lock()
item := b.tree.CopyGet(key)
fn(item)
b.Unlock()
}
// Has checks if the key exists in the btree.
func (b *BTree) Has(key BtreeItem) (ok bool) {
b.RLock()
ok = b.tree.Has(key)
b.RUnlock()
return
}
// Delete deletes the object by the given key.
func (b *BTree) Delete(key BtreeItem) (item BtreeItem) {
b.Lock()
item = b.tree.Delete(key)
b.Unlock()
return
}
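// Execute runs fn against the underlying btree while holding the write lock and returns fn's result.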
func (b *BTree) Execute(fn func(tree *btree.BTree) interface{}) interface{} {
b.Lock()
defer b.Unlock()
return fn(b.tree)
}
// ReplaceOrInsert is the wrapper of google's btree ReplaceOrInsert.
func (b *BTree) ReplaceOrInsert(key BtreeItem, replace bool) (item BtreeItem, ok bool) {
b.Lock()
if replace {
item = b.tree.ReplaceOrInsert(key)
b.Unlock()
ok = true
return
}
item = b.tree.Get(key)
if item == nil {
item = b.tree.ReplaceOrInsert(key)
b.Unlock()
ok = true
return
}
ok = false
b.Unlock()
return
}
// Ascend is the wrapper of the google's btree Ascend.
// This function scans the entire btree; when the data is huge, it is not recommended to use it online.
// Instead, call GetTree to obtain a snapshot of the current btree and then scan the snapshot (see the sketch after this function).
func (b *BTree) Ascend(fn func(i BtreeItem) bool) {
b.RLock()
b.tree.Ascend(fn)
b.RUnlock()
}
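// exampleBTreeSnapshotScan is a minimal illustrative sketch (not part of the
// original source) of the pattern recommended above: clone the tree with
// GetTree and iterate over the snapshot, so the live tree is not read-locked
// for the whole scan. The inode IDs used here are arbitrary.
func exampleBTreeSnapshotScan() int {
t := NewBtree()
for ino := uint64(1); ino <= 3; ino++ {
t.ReplaceOrInsert(NewInode(ino, 0), true)
}
// GetTree clones the underlying btree (copy-on-write), so scanning the
// snapshot does not block concurrent writers of the live tree.
snap := t.GetTree()
count := 0
snap.Ascend(func(i BtreeItem) bool {
count++
return true
})
return count // 3
}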
// AscendRange is the wrapper of the google's btree AscendRange.
func (b *BTree) AscendRange(greaterOrEqual, lessThan BtreeItem, iterator func(i BtreeItem) bool) {
b.RLock()
b.tree.AscendRange(greaterOrEqual, lessThan, iterator)
b.RUnlock()
}
// AscendGreaterOrEqual is the wrapper of the google's btree AscendGreaterOrEqual
func (b *BTree) AscendGreaterOrEqual(pivot BtreeItem, iterator func(i BtreeItem) bool) {
b.RLock()
b.tree.AscendGreaterOrEqual(pivot, iterator)
b.RUnlock()
}
// GetTree returns the snapshot of a btree.
func (b *BTree) GetTree() *BTree {
b.Lock()
t := b.tree.Clone()
b.Unlock()
nb := NewBtree()
nb.tree = t
return nb
}
// Reset resets the current btree.
func (b *BTree) Reset() {
b.Lock()
b.tree.Clear(true)
b.Unlock()
}
// Len returns the total number of items in the btree.
func (b *BTree) Len() (size int) {
b.RLock()
size = b.tree.Len()
b.RUnlock()
return
}
// MaxItem returns the largest item in the btree.
func (b *BTree) MaxItem() BtreeItem {
b.RLock()
item := b.tree.Max()
b.RUnlock()
return item
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"strings"
"sync"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
// DataPartition defines the struct of data partition that will be used on the meta node.
type DataPartition struct {
PartitionID uint64
Status int8
ReplicaNum uint8
PartitionType string
Hosts []string
IsDiscard bool
}
// GetAllAddrs returns the addresses of all hosts of the data partition except the first one, joined by proto.AddrSplit.
func (dp *DataPartition) GetAllAddrs() (m string) {
return strings.Join(dp.Hosts[1:], proto.AddrSplit) + proto.AddrSplit
}
// DataPartitionsView defines the view of the data partitions.
type DataPartitionsView struct {
DataPartitions []*DataPartition
}
func NewDataPartitionsView() *DataPartitionsView {
return &DataPartitionsView{}
}
// Vol defines the view of the data partitions of a volume, protected by a read/write lock.
type Vol struct {
sync.RWMutex
dataPartitionView map[uint64]*DataPartition
volDeleteLockTime int64
}
// NewVol returns a new volume instance.
func NewVol() *Vol {
return &Vol{
dataPartitionView: make(map[uint64]*DataPartition),
}
}
// GetPartition returns the data partition based on the given partition ID.
func (v *Vol) GetPartition(partitionID uint64) *DataPartition {
v.RLock()
defer v.RUnlock()
return v.dataPartitionView[partitionID]
}
// UpdatePartitions updates the data partition.
func (v *Vol) UpdatePartitions(partitions *DataPartitionsView) {
for _, dp := range partitions.DataPartitions {
log.LogDebugf("action[UpdatePartitions] dp (id:%v,status:%v)", dp.PartitionID, dp.Status)
v.replaceOrInsert(dp)
}
}
func (v *Vol) replaceOrInsert(partition *DataPartition) {
v.Lock()
defer v.Unlock()
v.dataPartitionView[partition.PartitionID] = partition
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"encoding/binary"
"fmt"
"math"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
// Dentry wraps the necessary properties of the `dentry` information in the file system.
// Marshal key:
// +-------+----------+------+
// | item | ParentId | Name |
// +-------+----------+------+
// | bytes | 8 | rest |
// +-------+----------+------+
// Marshal value:
// +-------+-------+------+
// | item | Inode | Type |
// +-------+-------+------+
// | bytes | 8 | 4 |
// +-------+-------+------+
// Marshal entity:
// +-------+-----------+--------------+-----------+--------------+
// | item | KeyLength | MarshaledKey | ValLength | MarshaledVal |
// +-------+-----------+--------------+-----------+--------------+
// | bytes | 4 | KeyLength | 4 | ValLength |
// +-------+-----------+--------------+-----------+--------------+
type DentryMultiSnap struct {
VerSeq uint64
dentryList DentryBatch
}
type Dentry struct {
ParentId uint64 // FileID value of the parent inode.
Name string // Name of the current dentry.
Inode uint64 // FileID value of the current inode.
Type uint32
// snapshot
multiSnap *DentryMultiSnap
}
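// NewDentrySnap returns a DentryMultiSnap initialized with the given version sequence.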
func NewDentrySnap(seq uint64) *DentryMultiSnap {
return &DentryMultiSnap{
VerSeq: seq,
}
}
func (d *Dentry) getSnapListLen() int {
if d.multiSnap == nil {
return 0
}
return len(d.multiSnap.dentryList)
}
func (d *Dentry) addVersion(ver uint64) {
dn := d.CopyDirectly().(*Dentry)
dn.setVerSeq(d.getSeqFiled())
d.setVerSeq(ver)
d.multiSnap.dentryList = append([]*Dentry{dn}, d.multiSnap.dentryList...)
}
func (d *Dentry) setVerSeq(verSeq uint64) {
if verSeq == 0 {
return
}
if d.multiSnap == nil {
d.multiSnap = NewDentrySnap(verSeq)
} else {
d.multiSnap.VerSeq = verSeq
}
}
func (d *Dentry) getSeqFiled() (verSeq uint64) {
if d.multiSnap == nil {
return 0
}
return d.multiSnap.VerSeq
}
func isSeqEqual(ver_1 uint64, ver_2 uint64) bool {
if isInitSnapVer(ver_1) {
ver_1 = 0
}
if isInitSnapVer(ver_2) {
ver_2 = 0
}
return (ver_1 & math.MaxInt64) == (ver_2 & math.MaxInt64)
}
func (d *Dentry) getVerSeq() (verSeq uint64) {
if d.multiSnap == nil {
return 0
}
return d.multiSnap.VerSeq & math.MaxInt64
}
func (d *Dentry) isDeleted() bool {
if d.multiSnap == nil {
return false
}
return (d.multiSnap.VerSeq >> 63) != 0
}
func (d *Dentry) setDeleted() {
if d.multiSnap == nil {
log.LogErrorf("action[setDeleted] d %v be set deleted not found multiSnap", d)
return
}
log.LogDebugf("action[setDeleted] d %v be set deleted", d)
d.multiSnap.VerSeq |= uint64(1) << 63
}
func (d *Dentry) minimizeSeq() (verSeq uint64) {
cnt := d.getSnapListLen()
if cnt == 0 {
return d.getVerSeq()
}
return d.multiSnap.dentryList[cnt-1].getVerSeq()
}
func (d *Dentry) isEffective(verSeq uint64) bool {
if verSeq == 0 {
return false
}
if isInitSnapVer(verSeq) {
verSeq = 0
}
return verSeq >= d.minimizeSeq()
}
// getDentryFromVerList returns the dentry of exactly the requested version when isHit is true;
// otherwise it returns the newest dentry visible at verSeq.
func (d *Dentry) getDentryFromVerList(verSeq uint64, isHit bool) (den *Dentry, idx int) {
if verSeq == 0 || (verSeq >= d.getVerSeq() && !isInitSnapVer(verSeq)) {
if d.isDeleted() {
log.LogDebugf("action[getDentryFromVerList] tmp dentry %v, is deleted, seq [%v]", d, d.getVerSeq())
return
}
return d, 0
}
// Read the oldest snapshot: an oldest version of 0 must be distinguished from reading the latest uncommitted version (which also uses seq 0).
if isInitSnapVer(verSeq) {
if d.getVerSeq() == 0 {
return d, 0
}
denListLen := d.getSnapListLen()
if denListLen == 0 {
return
}
den = d.multiSnap.dentryList[denListLen-1]
if d.multiSnap.dentryList[denListLen-1].getVerSeq() != 0 || d.multiSnap.dentryList[denListLen-1].isDeleted() {
return nil, 0
}
return den, denListLen
}
if d.multiSnap == nil {
return
}
for id, lDen := range d.multiSnap.dentryList {
if verSeq < lDen.getVerSeq() {
log.LogDebugf("action[getDentryFromVerList] den in ver list %v, return nil, request seq [%v], history ver seq [%v]", lDen, verSeq, lDen.getVerSeq())
} else {
if lDen.isDeleted() {
log.LogDebugf("action[getDentryFromVerList] den in ver list %v, return nil due to latest is deleted", lDen)
return
}
if isHit && lDen.getVerSeq() != verSeq {
log.LogDebugf("action[getDentryFromVerList] den in ver list %v, return nil due to ver not equal %v vs %v", lDen, lDen.getVerSeq(), verSeq)
return
}
return lDen, id + 1
}
}
log.LogDebugf("action[getDentryFromVerList] den in ver list not found right dentry with seq [%v]", verSeq)
return
}
func (d *Dentry) getLastestVer(reqVerSeq uint64, commit bool, verlist []*proto.VolVersionInfo) (uint64, bool) {
if len(verlist) == 0 {
return 0, false
}
for id, info := range verlist {
if commit && id == len(verlist)-1 {
break
}
if info.Ver >= reqVerSeq { // include reqSeq itself
return info.Ver, true
}
}
log.LogDebugf("action[getLastestVer] inode[%v] reqVerseq [%v] not found, the largetst one %v",
d.Inode, reqVerSeq, verlist[len(verlist)-1].Ver)
return 0, false
}
func (d *Dentry) deleteTopLayer(mpVerSeq uint64) (rd *Dentry, dmore bool, clean bool) {
if d.isDeleted() {
log.LogDebugf("action[deleteTopLayer.delSeq_0] do noting dentry %v seq 0 be deleted before", d)
return nil, false, false
}
// If the dentry has no snapshot of its own and no snapshot was taken after the dentry's version, unlink it directly and create no snapshot;
// the operation just happens on the top layer and the requested snapshot is dropped.
if d.getSnapListLen() == 0 {
if d.getVerSeq() == mpVerSeq {
// operate dentry directly
log.LogDebugf("action[deleteTopLayer.delSeq_0] no snapshot depend on this dentry,could drop seq 0 dentry %v", d)
return d, true, true
}
}
if d.getVerSeq() < mpVerSeq {
dn := d.CopyDirectly()
dn.(*Dentry).setVerSeq(d.getVerSeq())
d.setVerSeq(mpVerSeq)
d.multiSnap.dentryList = append([]*Dentry{dn.(*Dentry)}, d.multiSnap.dentryList...)
log.LogDebugf("action[deleteTopLayer.delSeq_0] create version and push to dentry list. dentry %v", dn.(*Dentry))
} else {
d.setVerSeq(mpVerSeq)
}
d.setVerSeq(mpVerSeq)
d.setDeleted() // denParm create at the same version.no need to push to history list
log.LogDebugf("action[deleteTopLayer.delSeq_0] den %v be set deleted at version seq [%v]", d, mpVerSeq)
return d, true, false
}
func (d *Dentry) updateTopLayerSeq(delVerSeq uint64, verlist []*proto.VolVersionInfo) (rd *Dentry, dmore bool, clean bool) {
if !isSeqEqual(delVerSeq, d.getVerSeq()) {
// the top layer is depended on by snapshots and should not be dropped; do nothing
log.LogDebugf("action[updateTopLayerSeq.inSnapList_del_%v] den %v first layer do nothing", delVerSeq, d)
return d, false, false
}
for _, info := range verlist {
if info.Ver > d.getVerSeq() {
d.setVerSeq(info.Ver)
return d, false, false
}
}
return d, true, true
}
func (d *Dentry) cleanDeletedVersion(index int) (bDrop bool) {
if index == 0 {
if len(d.multiSnap.dentryList) == 0 && d.isDeleted() {
bDrop = true
}
return
}
delIdx := index - 1
if !d.multiSnap.dentryList[delIdx].isDeleted() {
return
}
// delete the previous dentry in the history list
log.LogDebugf("action[cleanDeletedVersion] dentry (%v) delete the last seq [%v] which was set deleted before",
d, d.multiSnap.dentryList[delIdx].getVerSeq())
d.multiSnap.dentryList = append(d.multiSnap.dentryList[:delIdx], d.multiSnap.dentryList[delIdx+1:]...)
if len(d.multiSnap.dentryList) == 0 && d.isDeleted() {
log.LogDebugf("ction[cleanDeleteVersion] dentry (%v) require to be deleted", d)
bDrop = true
}
return
}
// The latest dentry may have been deleted earlier and marked DentryDeleted.
// From the version carrying the DentryDeleted flag (inclusive) until a file with the same name is created again, the dentry is invisible.
// If another dentry is created with a larger verSeq, the deleted dentry is pushed onto the history list.
// The returned doMore is true when the caller needs to do the follow-up step, such as unlinking from the parent inode.
func (d *Dentry) deleteVerSnapshot(delVerSeq uint64, mpVerSeq uint64, verlist []*proto.VolVersionInfo) (rd *Dentry, dmore bool, clean bool) { // bool is doMore
log.LogDebugf("action[deleteVerSnapshot] enter.dentry %v delVerseq [%v] mpver [%v] verList %v", d, delVerSeq, mpVerSeq, verlist)
// create denParm version
if !isInitSnapVer(delVerSeq) && delVerSeq > mpVerSeq {
panic(fmt.Sprintf("Dentry version %v large than mp[%v]", delVerSeq, mpVerSeq))
}
if delVerSeq == 0 {
return d.deleteTopLayer(mpVerSeq)
} else {
var (
idx int
den *Dentry
endSeq uint64
)
if den, idx = d.getDentryFromVerList(delVerSeq, true); den == nil {
log.LogDebugf("action[deleteVerSnapshot.inSnapList_del_%v] den %v not found", delVerSeq, d)
return nil, false, false
}
if idx == 0 { // top layer
return d.updateTopLayerSeq(delVerSeq, verlist)
}
// If any live mp-level snapshot exists in the seq scope from den to its next ascending neighbor, keep the dentry snapshot; otherwise drop it.
startSeq := den.getVerSeq()
realIdx := idx - 1 // index in history list layer
if realIdx == 0 {
endSeq = d.getVerSeq()
} else {
endSeq = d.multiSnap.dentryList[realIdx-1].getVerSeq()
if d.multiSnap.dentryList[realIdx-1].isDeleted() {
log.LogInfof("action[deleteVerSnapshot.inSnapList_del_%v] inode[%v] layer %v name %v be deleted already!",
delVerSeq, d.Inode, realIdx, d.multiSnap.dentryList[realIdx-1].Name)
}
}
log.LogDebugf("action[deleteVerSnapshot.inSnapList_del_%v] inode[%v] try drop multiVersion idx %v effective seq scope [%v,%v) ", delVerSeq,
d.Inode, realIdx, den.getVerSeq(), endSeq)
for _, info := range verlist {
if info.Ver >= startSeq && info.Ver < endSeq { // the end version itself is not included
log.LogDebugf("action[deleteVerSnapshotInList.inSnapList_del_%v] inode[%v] dir layer idx %v include snapshot %v.don't drop", delVerSeq, den.Inode, realIdx, info.Ver)
// Some snapshot still depends on the version being deleted, so keep it.
// Every snapshot that depends on this version reaches here when its own deletion runs and finds the effective scope shrinking;
// once nothing depends on this version anymore, the final deletion cleans it up.
den.setVerSeq(info.Ver)
return den, false, false
}
if info.Ver >= endSeq {
break
}
log.LogDebugf("action[deleteVerSnapshotInList.inSnapList_del_%v] inode[%v] try drop scope [%v, %v), mp ver [%v] not suitable",
delVerSeq, den.Inode, den.getVerSeq(), endSeq, info.Ver)
}
log.LogDebugf("action[deleteVerSnapshotInList.inSnapList_del_%v] inode[%v] try drop multiVersion idx %v", delVerSeq, den.Inode, realIdx)
d.multiSnap.dentryList = append(d.multiSnap.dentryList[:realIdx], d.multiSnap.dentryList[realIdx+1:]...)
if d.cleanDeletedVersion(realIdx) {
return den, true, true
}
return den, false, false
}
}
func (d *Dentry) String() string {
str := fmt.Sprintf("dentry(name:[%v],parentId:[%v],inode:[%v],type:[%v],seq:[%v],isDeleted:[%v],dentryList_len[%v])",
d.Name, d.ParentId, d.Inode, d.Type, d.getVerSeq(), d.isDeleted(), d.getSnapListLen())
if d.getSnapListLen() > 0 {
for idx, den := range d.multiSnap.dentryList {
str += fmt.Sprintf("idx:%v,content(%v))", idx, den)
}
}
return str
}
type TxDentry struct {
// ParInode *Inode
Dentry *Dentry
TxInfo *proto.TransactionInfo
}
func NewTxDentry(parentID uint64, name string, ino uint64, mode uint32, parInode *Inode, txInfo *proto.TransactionInfo) *TxDentry {
dentry := &Dentry{
ParentId: parentID,
Name: name,
Inode: ino,
Type: mode,
}
txDentry := &TxDentry{
// ParInode: parInode,
Dentry: dentry,
TxInfo: txInfo,
}
return txDentry
}
func (td *TxDentry) Marshal() (result []byte, err error) {
buff := bytes.NewBuffer(make([]byte, 0))
//bs, err := td.ParInode.Marshal()
//if err != nil {
// return nil, err
//}
//if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
// return nil, err
//}
//if _, err := buff.Write(bs); err != nil {
// return nil, err
//}
bs, err := td.Dentry.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
bs, err = td.TxInfo.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
result = buff.Bytes()
return
}
func (td *TxDentry) Unmarshal(raw []byte) (err error) {
buff := bytes.NewBuffer(raw)
var dataLen uint32
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
data := make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
dentry := &Dentry{}
if err = dentry.Unmarshal(data); err != nil {
return
}
td.Dentry = dentry
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
data = make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
txInfo := proto.NewTransactionInfo(0, proto.TxTypeUndefined)
if err = txInfo.Unmarshal(data); err != nil {
return
}
td.TxInfo = txInfo
return
}
type TxUpdateDentry struct {
OldDentry *Dentry
NewDentry *Dentry
TxInfo *proto.TransactionInfo
}
func NewTxUpdateDentry(oldDentry *Dentry, newDentry *Dentry, txInfo *proto.TransactionInfo) *TxUpdateDentry {
txUpdateDentry := &TxUpdateDentry{
OldDentry: oldDentry,
NewDentry: newDentry,
TxInfo: txInfo,
}
return txUpdateDentry
}
func (td *TxUpdateDentry) Marshal() (result []byte, err error) {
buff := bytes.NewBuffer(make([]byte, 0))
bs, err := td.OldDentry.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
bs, err = td.NewDentry.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
bs, err = td.TxInfo.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
result = buff.Bytes()
return
}
func (td *TxUpdateDentry) Unmarshal(raw []byte) (err error) {
buff := bytes.NewBuffer(raw)
var dataLen uint32
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
data := make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
oldDentry := &Dentry{}
if err = oldDentry.Unmarshal(data); err != nil {
return
}
td.OldDentry = oldDentry
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
data = make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
newDentry := &Dentry{}
if err = newDentry.Unmarshal(data); err != nil {
return
}
td.NewDentry = newDentry
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
data = make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
txInfo := proto.NewTransactionInfo(0, proto.TxTypeUndefined)
if err = txInfo.Unmarshal(data); err != nil {
return
}
td.TxInfo = txInfo
return
}
type DentryBatch []*Dentry
// TODO(leonchang): pre-allocate the buffer and write into it directly, considering space and performance
// Marshal marshals a dentry into a byte array.
func (d *Dentry) Marshal() (result []byte, err error) {
keyBytes := d.MarshalKey()
valBytes := d.MarshalValue()
keyLen := uint32(len(keyBytes))
valLen := uint32(len(valBytes))
buff := bytes.NewBuffer(make([]byte, 0))
buff.Grow(int(keyLen + valLen + 8))
if err = binary.Write(buff, binary.BigEndian, keyLen); err != nil {
return
}
if _, err = buff.Write(keyBytes); err != nil {
return
}
if err = binary.Write(buff, binary.BigEndian, valLen); err != nil {
return nil, err
}
if _, err = buff.Write(valBytes); err != nil {
return
}
result = buff.Bytes()
return
}
// Unmarshal unmarshals the dentry from a byte array.
func (d *Dentry) Unmarshal(raw []byte) (err error) {
var (
keyLen uint32
valLen uint32
)
buff := bytes.NewBuffer(raw)
if err = binary.Read(buff, binary.BigEndian, &keyLen); err != nil {
return
}
keyBytes := make([]byte, keyLen)
if _, err = buff.Read(keyBytes); err != nil {
return
}
if err = d.UnmarshalKey(keyBytes); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &valLen); err != nil {
return
}
valBytes := make([]byte, valLen)
if _, err = buff.Read(valBytes); err != nil {
return
}
err = d.UnmarshalValue(valBytes)
return
}
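// exampleDentryRoundTrip is a minimal illustrative sketch (not part of the
// original source) of the entity layout documented above: the dentry is
// encoded as KeyLength|MarshaledKey|ValLength|MarshaledVal, and Unmarshal
// restores it from those bytes. The field values are arbitrary.
func exampleDentryRoundTrip() error {
src := &Dentry{ParentId: 1, Name: "file.txt", Inode: 100, Type: 0o644}
raw, err := src.Marshal()
if err != nil {
return err
}
dst := &Dentry{}
if err = dst.Unmarshal(raw); err != nil {
return err
}
if dst.ParentId != src.ParentId || dst.Name != src.Name || dst.Inode != src.Inode {
return fmt.Errorf("dentry round trip mismatch: %v vs %v", dst, src)
}
return nil
}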
// Marshal marshals the dentryBatch into a byte array.
func (d DentryBatch) Marshal() ([]byte, error) {
buff := bytes.NewBuffer(make([]byte, 0))
if err := binary.Write(buff, binary.BigEndian, uint32(len(d))); err != nil {
return nil, err
}
for _, dentry := range d {
bs, err := dentry.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
}
return buff.Bytes(), nil
}
// DentryBatchUnmarshal unmarshals a DentryBatch from a byte array.
func DentryBatchUnmarshal(raw []byte) (DentryBatch, error) {
buff := bytes.NewBuffer(raw)
var batchLen uint32
if err := binary.Read(buff, binary.BigEndian, &batchLen); err != nil {
return nil, err
}
result := make(DentryBatch, 0, int(batchLen))
var dataLen uint32
for j := 0; j < int(batchLen); j++ {
if err := binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return nil, err
}
data := make([]byte, int(dataLen))
if _, err := buff.Read(data); err != nil {
return nil, err
}
den := &Dentry{}
if err := den.Unmarshal(data); err != nil {
return nil, err
}
result = append(result, den)
}
return result, nil
}
// Less tests whether the current dentry is less than the given one.
// This method is necessary for B-Tree item implementation.
func (d *Dentry) Less(than BtreeItem) (less bool) {
dentry, ok := than.(*Dentry)
less = ok && ((d.ParentId < dentry.ParentId) || ((d.ParentId == dentry.ParentId) && (d.Name < dentry.Name)))
return
}
func (d *Dentry) CopyDirectly() BtreeItem {
newDentry := *d
newDentry.multiSnap = nil
return &newDentry
}
func (d *Dentry) Copy() BtreeItem {
newDentry := *d
if d.multiSnap != nil {
newDentry.multiSnap = &DentryMultiSnap{
VerSeq: d.multiSnap.VerSeq,
dentryList: d.multiSnap.dentryList,
}
}
return &newDentry
}
// MarshalKey marshals the dentry key (ParentId and Name) into a byte slice.
func (d *Dentry) MarshalKey() (k []byte) {
buff := bytes.NewBuffer(make([]byte, 0))
buff.Grow(32)
if err := binary.Write(buff, binary.BigEndian, &d.ParentId); err != nil {
panic(err)
}
buff.Write([]byte(d.Name))
k = buff.Bytes()
return
}
// UnmarshalKey unmarshals the dentry key (ParentId and Name) from bytes.
func (d *Dentry) UnmarshalKey(k []byte) (err error) {
buff := bytes.NewBuffer(k)
if err = binary.Read(buff, binary.BigEndian, &d.ParentId); err != nil {
return
}
d.Name = string(buff.Bytes())
return
}
func (d *Dentry) MarshalValue() []byte {
buff := bytes.NewBuffer(nil)
buff.Grow(24 + d.getSnapListLen()*20)
writeBinary := func(data interface{}) {
if err := binary.Write(buff, binary.BigEndian, data); err != nil {
panic(err)
}
}
writeBinary(&d.Inode)
writeBinary(&d.Type)
seq := d.getSeqFiled()
if seq == 0 {
return buff.Bytes()
}
writeBinary(&seq)
verCnt := uint32(d.getSnapListLen())
writeBinary(&verCnt)
if d.getSnapListLen() > 0 {
for _, dd := range d.multiSnap.dentryList {
writeBinary(&dd.Inode)
writeBinary(&dd.Type)
seq = dd.getSeqFiled()
writeBinary(&seq)
}
}
return buff.Bytes()
}
func (d *Dentry) UnmarshalValue(val []byte) (err error) {
buff := bytes.NewBuffer(val)
if err = binary.Read(buff, binary.BigEndian, &d.Inode); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &d.Type); err != nil {
return
}
if len(val) >= 24 {
var seq uint64
if err = binary.Read(buff, binary.BigEndian, &seq); err != nil {
return
}
d.multiSnap = NewDentrySnap(seq)
verCnt := uint32(0)
if err = binary.Read(buff, binary.BigEndian, &verCnt); err != nil {
return
}
for i := 0; i < int(verCnt); i++ {
// todo(leonchang) name and parentid should be removed to reduce space
den := &Dentry{
Name: d.Name,
ParentId: d.ParentId,
}
if err = binary.Read(buff, binary.BigEndian, &den.Inode); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &den.Type); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &seq); err != nil {
return
}
if seq > 0 {
den.multiSnap = NewDentrySnap(seq)
}
d.multiSnap.dentryList = append(d.multiSnap.dentryList, den)
}
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"encoding/binary"
"fmt"
"sync"
"github.com/cubefs/cubefs/util/btree"
)
type ExtentVal struct {
dataMap map[string][]byte
verSeq uint64
}
type Extend struct {
inode uint64
dataMap map[string][]byte
verSeq uint64
multiVers []*Extend
versionMu sync.RWMutex
mu sync.RWMutex
}
func (e *Extend) checkSequence() (err error) {
e.versionMu.RLock()
defer e.versionMu.RUnlock()
lastSeq := e.verSeq
for id, extend := range e.multiVers {
if lastSeq <= extend.verSeq {
return fmt.Errorf("id[%v] seq [%v] not less than last seq [%v]", id, extend.verSeq, lastSeq)
}
}
return
}
func (e *Extend) GetMinVer() uint64 {
if len(e.multiVers) == 0 {
return e.verSeq
}
return e.multiVers[len(e.multiVers)-1].verSeq
}
func (e *Extend) GetExtentByVersion(ver uint64) (extend *Extend) {
if ver == 0 {
return e
}
if isInitSnapVer(ver) {
if e.GetMinVer() != 0 {
return nil
}
return e.multiVers[len(e.multiVers)-1]
}
e.versionMu.RLock()
defer e.versionMu.RUnlock()
for i := 0; i < len(e.multiVers)-1; i++ {
if e.multiVers[i].verSeq <= ver {
return e.multiVers[i]
}
}
return
}
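// NewExtend returns an Extend for the given inode with an empty attribute map.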
func NewExtend(inode uint64) *Extend {
return &Extend{inode: inode, dataMap: make(map[string][]byte)}
}
func NewExtendFromBytes(raw []byte) (*Extend, error) {
var err error
buffer := bytes.NewBuffer(raw)
// decode inode
var inode uint64
if inode, err = binary.ReadUvarint(buffer); err != nil {
return nil, err
}
ext := NewExtend(inode)
// decode number of key-value pairs
var numKV uint64
if numKV, err = binary.ReadUvarint(buffer); err != nil {
return nil, err
}
readBytes := func() ([]byte, error) {
var length uint64
if length, err = binary.ReadUvarint(buffer); err != nil {
return nil, err
}
data := make([]byte, length)
if _, err = buffer.Read(data); err != nil {
return nil, err
}
return data, nil
}
for i := 0; i < int(numKV); i++ {
var k, v []byte
if k, err = readBytes(); err != nil {
return nil, err
}
if v, err = readBytes(); err != nil {
return nil, err
}
ext.Put(k, v, 0)
}
if buffer.Len() > 0 {
// read verSeq
verSeq, err := binary.ReadUvarint(buffer)
if err != nil {
return nil, err
}
ext.verSeq = verSeq
// read number of multiVers
numMultiVers, err := binary.ReadUvarint(buffer)
if err != nil {
return nil, err
}
if numMultiVers > 0 {
// read each multiVers
ext.multiVers = make([]*Extend, numMultiVers)
for i := uint64(0); i < numMultiVers; i++ {
// read multiVers length
mvLen, err := binary.ReadUvarint(buffer)
if err != nil {
return nil, err
}
mvBytes := make([]byte, mvLen)
if _, err = buffer.Read(mvBytes); err != nil {
return nil, err
}
// recursively decode multiVers
mv, err := NewExtendFromBytes(mvBytes)
if err != nil {
return nil, err
}
ext.multiVers[i] = mv
}
}
}
return ext, nil
}
func (e *Extend) Less(than btree.Item) bool {
ext, is := than.(*Extend)
return is && e.inode < ext.inode
}
func (e *Extend) Put(key, value []byte, verSeq uint64) {
e.mu.Lock()
defer e.mu.Unlock()
e.dataMap[string(key)] = value
e.verSeq = verSeq
}
func (e *Extend) Get(key []byte) (value []byte, exist bool) {
e.mu.RLock()
defer e.mu.RUnlock()
value, exist = e.dataMap[string(key)]
return
}
func (e *Extend) Remove(key []byte) {
e.mu.Lock()
defer e.mu.Unlock()
delete(e.dataMap, string(key))
return
}
func (e *Extend) Range(visitor func(key, value []byte) bool) {
e.mu.RLock()
defer e.mu.RUnlock()
for k, v := range e.dataMap {
if !visitor([]byte(k), v) {
return
}
}
}
func (e *Extend) Merge(o *Extend, override bool) {
e.mu.Lock()
defer e.mu.Unlock()
o.Range(func(key, value []byte) bool {
strKey := string(key)
if _, exist := e.dataMap[strKey]; override || !exist {
copied := make([]byte, len(value))
copy(copied, value)
e.dataMap[strKey] = copied
}
return true
})
}
func (e *Extend) Copy() btree.Item {
newExt := NewExtend(e.inode)
e.mu.RLock()
defer e.mu.RUnlock()
for k, v := range e.dataMap {
newExt.dataMap[k] = v
}
newExt.verSeq = e.verSeq
newExt.multiVers = e.multiVers
return newExt
}
func (e *Extend) Bytes() ([]byte, error) {
var err error
e.mu.RLock()
defer e.mu.RUnlock()
var n int
tmp := make([]byte, binary.MaxVarintLen64)
buffer := bytes.NewBuffer(nil)
// write inode with varint codec
n = binary.PutUvarint(tmp, e.inode)
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
// write number of key-value pairs
n = binary.PutUvarint(tmp, uint64(len(e.dataMap)))
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
// write key-value pairs
writeBytes := func(val []byte) error {
n = binary.PutUvarint(tmp, uint64(len(val)))
if _, err = buffer.Write(tmp[:n]); err != nil {
return err
}
if _, err = buffer.Write(val); err != nil {
return err
}
return nil
}
for k, v := range e.dataMap {
// key
if err = writeBytes([]byte(k)); err != nil {
return nil, err
}
// value
if err = writeBytes(v); err != nil {
return nil, err
}
}
if e.verSeq > 0 {
// write verSeq
verSeqBytes := make([]byte, binary.MaxVarintLen64)
verSeqLen := binary.PutUvarint(verSeqBytes, e.verSeq)
if _, err = buffer.Write(verSeqBytes[:verSeqLen]); err != nil {
return nil, err
}
// write number of multiVers
n = binary.PutUvarint(tmp, uint64(len(e.multiVers)))
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
// write each multiVers
for _, mv := range e.multiVers {
// write multiVers bytes
mvBytes, err := mv.Bytes()
if err != nil {
return nil, err
}
// write multiVers length
n = binary.PutUvarint(tmp, uint64(len(mvBytes)))
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
// write multiVers bytes
if _, err = buffer.Write(mvBytes); err != nil {
return nil, err
}
}
return buffer.Bytes(), nil
}
return buffer.Bytes(), nil
}
func (e *Extend) GetInode() (inode uint64) {
return e.inode
}
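// exampleExtendRoundTrip is a minimal illustrative sketch (not part of the
// original source) of the varint layout produced by Bytes: inode, the number
// of key-value pairs, then length-prefixed keys and values, which
// NewExtendFromBytes decodes again. The attribute key and value are arbitrary.
func exampleExtendRoundTrip() error {
src := NewExtend(1)
src.Put([]byte("user.owner"), []byte("cubefs"), 0)
raw, err := src.Bytes()
if err != nil {
return err
}
dst, err := NewExtendFromBytes(raw)
if err != nil {
return err
}
if v, ok := dst.Get([]byte("user.owner")); !ok || string(v) != "cubefs" {
return fmt.Errorf("extend round trip mismatch: %q", v)
}
return nil
}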
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"container/list"
"sync"
)
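// freeList is a FIFO list of inode IDs; the index map deduplicates entries and allows O(1) removal by inode ID.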
type freeList struct {
sync.Mutex
list *list.List
index map[uint64]*list.Element
}
func newFreeList() *freeList {
return &freeList{
list: list.New(),
index: make(map[uint64]*list.Element),
}
}
// Pop removes the first item on the list and returns it.
func (fl *freeList) Pop() (ino uint64) {
fl.Lock()
defer fl.Unlock()
item := fl.list.Front()
if item == nil {
return
}
val := fl.list.Remove(item)
ino = val.(uint64)
delete(fl.index, ino)
return
}
// Push inserts a new item at the back of the list.
func (fl *freeList) Push(ino uint64) {
fl.Lock()
defer fl.Unlock()
if _, ok := fl.index[ino]; !ok {
item := fl.list.PushBack(ino)
fl.index[ino] = item
}
}
func (fl *freeList) Remove(ino uint64) {
fl.Lock()
defer fl.Unlock()
if item, ok := fl.index[ino]; ok {
fl.list.Remove(item)
delete(fl.index, ino)
}
}
func (fl *freeList) Len() int {
fl.Lock()
defer fl.Unlock()
return len(fl.index)
}
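// exampleFreeListFIFO is a minimal illustrative sketch (not part of the
// original source): Push keeps FIFO order and deduplicates by inode ID via
// the index map, so Pop returns each queued inode at most once.
func exampleFreeListFIFO() []uint64 {
fl := newFreeList()
fl.Push(10)
fl.Push(20)
fl.Push(10) // duplicate: already indexed, so it is ignored
out := make([]uint64, 0, fl.Len())
for fl.Len() > 0 {
out = append(out, fl.Pop())
}
return out // [10 20]
}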
//go:build gofuzz
// +build gofuzz
// Copyright 2023 ADA Logics Ltd
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package metanode
import (
fuzz "github.com/AdaLogics/go-fuzz-headers"
)
type InodeParam struct {
Ino uint64
Type uint32
}
func FuzzNewInode(data []byte) int {
f := fuzz.NewConsumer(data)
param := InodeParam{}
err := f.GenerateStruct(&param)
if err != nil {
return 0
}
ino := NewInode(param.Ino, param.Type)
if ino == nil {
return 0
}
return 1
}
func FuzzNewExtend(data []byte) int {
f := fuzz.NewConsumer(data)
var ino uint64
err := f.GenerateStruct(&ino)
if err != nil {
return 0
}
extend := NewExtend(ino)
if extend == nil {
return 0
}
return 1
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"encoding/binary"
"encoding/json"
"fmt"
"io"
syslog "log"
"math"
"runtime/debug"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/timeutil"
)
const (
DeleteMarkFlag = 1 << 0
InodeDelTop = 1 << 1
)
var (
// InodeV1Flag uint64 = 0x01
V2EnableColdInodeFlag uint64 = 0x02
V3EnableSnapInodeFlag uint64 = 0x04
)
// Inode wraps necessary properties of `Inode` information in the file system.
// Marshal key:
// +-------+-------+
// | item | Inode |
// +-------+-------+
// | bytes | 8 |
// +-------+-------+
// Marshal value:
// +-------+------+------+-----+----+----+----+--------+------------------+
// | item | Type | Size | Gen | CT | AT | MT | ExtLen | MarshaledExtents |
// +-------+------+------+-----+----+----+----+--------+------------------+
// | bytes | 4 | 8 | 8 | 8 | 8 | 8 | 4 | ExtLen |
// +-------+------+------+-----+----+----+----+--------+------------------+
// Marshal entity:
// +-------+-----------+--------------+-----------+--------------+
// | item | KeyLength | MarshaledKey | ValLength | MarshaledVal |
// +-------+-----------+--------------+-----------+--------------+
// | bytes | 4 | KeyLength | 4 | ValLength |
// +-------+-----------+--------------+-----------+--------------+
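// exampleInodeRoundTrip is a minimal illustrative sketch (not part of the
// original source) of the entity layout documented above: the inode is
// encoded as KeyLength|MarshaledKey|ValLength|MarshaledVal, and Unmarshal
// restores it. The inode ID, mode, and size used here are arbitrary.
func exampleInodeRoundTrip() error {
src := NewInode(100, 0o644)
src.Size = 4096
raw, err := src.Marshal()
if err != nil {
return err
}
dst := NewInode(0, 0)
if err = dst.Unmarshal(raw); err != nil {
return err
}
if dst.Inode != src.Inode || dst.Size != src.Size {
return fmt.Errorf("inode round trip mismatch: %v vs %v", dst, src)
}
return nil
}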
type InodeMultiSnap struct {
verSeq uint64 // latest version at which the inode was created or modified
multiVersions InodeBatch
ekRefMap *sync.Map
}
type Inode struct {
sync.RWMutex
Inode uint64 // Inode ID
Type uint32
Uid uint32
Gid uint32
Size uint64
Generation uint64
CreateTime int64
AccessTime int64
ModifyTime int64
LinkTarget []byte // SymLink target name
NLink uint32 // NodeLink counts
Flag int32
Reserved uint64 // reserved space
// Extents *ExtentsTree
Extents *SortedExtents
ObjExtents *SortedObjExtents
// Snapshot
multiSnap *InodeMultiSnap
}
func (i *Inode) GetMultiVerString() string {
if i.multiSnap == nil {
return "nil"
}
return fmt.Sprintf("%v", i.multiSnap.multiVersions)
}
func (i *Inode) RangeMultiVer(visitor func(idx int, info *Inode) bool) {
if i.multiSnap == nil {
return
}
for k, v := range i.multiSnap.multiVersions {
if !visitor(k, v) {
break
}
}
}
func isInitSnapVer(seq uint64) bool {
return seq == math.MaxUint64
}
func NewMultiSnap(seq uint64) *InodeMultiSnap {
return &InodeMultiSnap{
verSeq: seq,
}
}
func (i *Inode) verUpdate(seq uint64) {
if seq == 0 && i.multiSnap == nil {
return
}
if i.multiSnap == nil {
i.multiSnap = NewMultiSnap(seq)
} else {
i.multiSnap.verSeq = seq
}
}
func (i *Inode) setVerNoCheck(seq uint64) {
i.verUpdate(seq)
}
func (i *Inode) setVer(seq uint64) {
if i.getVer() > seq {
syslog.Println(fmt.Sprintf("inode[%v] old seq [%v] cann't use seq [%v]", i.getVer(), seq, string(debug.Stack())))
log.LogFatalf("inode[%v] old seq [%v] cann't use seq [%v] stack %v", i.Inode, i.getVer(), seq, string(debug.Stack()))
}
i.verUpdate(seq)
}
func (i *Inode) insertEkRefMap(mpId uint64, ek *proto.ExtentKey) {
if i.multiSnap == nil {
i.multiSnap = NewMultiSnap(i.getVer())
}
if i.multiSnap.ekRefMap == nil {
i.multiSnap.ekRefMap = new(sync.Map)
}
storeEkSplit(mpId, i.Inode, i.multiSnap.ekRefMap, ek)
}
func (i *Inode) isEkInRefMap(mpId uint64, ek *proto.ExtentKey) (ok bool) {
if i.multiSnap == nil {
return
}
if i.multiSnap.ekRefMap == nil {
log.LogErrorf("[storeEkSplit] mpId [%v] inodeID %v ekRef nil", mpId, i.Inode)
return
}
log.LogDebugf("[storeEkSplit] mpId [%v] inode[%v] mp[%v] extent id[%v] ek [%v]", mpId, i.Inode, ek.PartitionId, ek.ExtentId, ek)
id := ek.PartitionId<<32 | ek.ExtentId
_, ok = i.multiSnap.ekRefMap.Load(id)
return
}
func (i *Inode) getVer() uint64 {
if i.multiSnap == nil {
return 0
}
return i.multiSnap.verSeq
}
func (i *Inode) getLayerLen() int {
if i.multiSnap == nil {
return 0
}
return len(i.multiSnap.multiVersions)
}
func (i *Inode) getLayerVer(layer int) uint64 {
if i.multiSnap == nil {
log.LogErrorf("getLayerVer. inode[%v] multi snap nil", i.Inode)
return 0
}
if layer > i.getLayerLen()-1 {
log.LogErrorf("getLayerVer. inode[%v] layer %v not exist, len %v", i.Inode, layer, i.getLayerLen())
return 0
}
if i.multiSnap.multiVersions[layer] == nil {
log.LogErrorf("getLayerVer. inode[%v] layer %v nil", i.Inode, layer)
return 0
}
return i.multiSnap.multiVersions[layer].getVer()
}
func (i *Inode) isEmptyVerList() bool {
return i.getLayerLen() == 0
}
func (i *Inode) isTailIndexInList(id int) bool {
return id == i.getLayerLen()-1
}
func (i *Inode) getTailVerInList() (verSeq uint64, found bool) {
mLen := i.getLayerLen()
if mLen > 0 {
return i.getLayerVer(mLen - 1), true
}
return 0, false
}
// GetAllExtsOfflineInode collects all existing extent info of an inode being cleaned via the freelist, handling the special case of split extent keys.
func (inode *Inode) GetAllExtsOfflineInode(mpID uint64) (extInfo map[uint64][]*proto.ExtentKey) {
log.LogDebugf("deleteMarkedInodes. GetAllExtsOfflineInode.mp[%v] inode[%v] inode.Extents: %v, ino verList: %v",
mpID, inode.Inode, inode.Extents, inode.GetMultiVerString())
extInfo = make(map[uint64][]*proto.ExtentKey)
if inode.getLayerLen() > 0 {
log.LogWarnf("deleteMarkedInodes. GetAllExtsOfflineInode.mp[%v] inode[%v] verlist len %v should not drop",
mpID, inode.Inode, inode.getLayerLen())
}
for i := 0; i < inode.getLayerLen()+1; i++ {
dIno := inode
if i > 0 {
dIno = inode.multiSnap.multiVersions[i-1]
}
log.LogDebugf("deleteMarkedInodes. GetAllExtsOfflineInode.mp[%v] inode[%v] dino[%v]", mpID, inode.Inode, dIno)
dIno.Extents.Range(func(_ int, ek proto.ExtentKey) bool {
if ek.IsSplit() {
var (
dOK bool
last bool
)
log.LogDebugf("deleteMarkedInodes DecSplitEk mpID %v inode[%v]", mpID, inode.Inode)
if dOK, last = dIno.DecSplitEk(mpID, &ek); !dOK {
return false
}
if !last {
log.LogDebugf("deleteMarkedInodes. GetAllExtsOfflineInode.mp[%v] inode[%v] ek [%v] be removed", mpID, inode.Inode, ek)
return true
}
log.LogDebugf("deleteMarkedInodes. GetAllExtsOfflineInode.mp[%v] inode[%v] ek [%v] be removed", mpID, inode.Inode, ek)
}
extInfo[ek.PartitionId] = append(extInfo[ek.PartitionId], &ek)
// NOTE: unnecessary to set ext
log.LogWritef("GetAllExtsOfflineInode. mp[%v] ino(%v) deleteExtent(%v)", mpID, inode.Inode, ek.String())
return true
})
// NOTE: clear all extents in this layer
dIno.Extents = NewSortedExtents()
}
return
}
type InodeBatch []*Inode
type TxInode struct {
Inode *Inode
TxInfo *proto.TransactionInfo
}
func NewTxInode(ino uint64, t uint32, txInfo *proto.TransactionInfo) *TxInode {
ti := &TxInode{
Inode: NewInode(ino, t),
TxInfo: txInfo,
}
return ti
}
func (ti *TxInode) Marshal() (result []byte, err error) {
buff := bytes.NewBuffer(make([]byte, 0))
bs, err := ti.Inode.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
bs, err = ti.TxInfo.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
result = buff.Bytes()
return
}
func (ti *TxInode) Unmarshal(raw []byte) (err error) {
buff := bytes.NewBuffer(raw)
var dataLen uint32
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
data := make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
ino := NewInode(0, 0)
if err = ino.Unmarshal(data); err != nil {
return
}
ti.Inode = ino
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
data = make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
txInfo := proto.NewTransactionInfo(0, proto.TxTypeUndefined)
if err = txInfo.Unmarshal(data); err != nil {
return
}
ti.TxInfo = txInfo
return
}
func (i *InodeBatch) Clone() InodeBatch {
var rB []*Inode
for _, inode := range []*Inode(*i) {
rB = append(rB, inode.Copy().(*Inode))
}
return rB
}
func (ino *Inode) getAllInodesInfo() (rsp []proto.InodeInfo) {
ino.RLock()
defer ino.RUnlock()
ino.RangeMultiVer(func(idx int, info *Inode) bool {
rspInodeInfo := &proto.InodeInfo{}
replyInfoNoCheck(rspInodeInfo, info)
rsp = append(rsp, *rspInodeInfo)
return true
})
return
}
func (ino *Inode) getAllLayerEks() (rsp []proto.LayerInfo) {
ino.RLock()
defer ino.RUnlock()
rspInodeInfo := &proto.InodeInfo{}
replyInfoNoCheck(rspInodeInfo, ino)
layerInfo := proto.LayerInfo{
LayerIdx: 0,
Info: rspInodeInfo,
Eks: ino.Extents.eks,
}
rsp = append(rsp, layerInfo)
ino.RangeMultiVer(func(idx int, info *Inode) bool {
rspInodeInfo := &proto.InodeInfo{}
replyInfo(rspInodeInfo, info, nil)
layerInfo := proto.LayerInfo{
LayerIdx: uint32(idx + 1),
Info: rspInodeInfo,
Eks: info.Extents.eks,
}
rsp = append(rsp, layerInfo)
return true
})
return
}
// String returns the string format of the inode.
func (i *Inode) String() string {
i.RLock()
defer i.RUnlock()
buff := bytes.NewBuffer(nil)
buff.Grow(128)
buff.WriteString("Inode{")
buff.WriteString(fmt.Sprintf("Inode[%d]", i.Inode))
buff.WriteString(fmt.Sprintf("Type[%d]", i.Type))
buff.WriteString(fmt.Sprintf("Uid[%d]", i.Uid))
buff.WriteString(fmt.Sprintf("Gid[%d]", i.Gid))
buff.WriteString(fmt.Sprintf("Size[%d]", i.Size))
buff.WriteString(fmt.Sprintf("Gen[%d]", i.Generation))
buff.WriteString(fmt.Sprintf("CT[%d]", i.CreateTime))
buff.WriteString(fmt.Sprintf("AT[%d]", i.AccessTime))
buff.WriteString(fmt.Sprintf("MT[%d]", i.ModifyTime))
buff.WriteString(fmt.Sprintf("LinkT[%s]", i.LinkTarget))
buff.WriteString(fmt.Sprintf("NLink[%d]", i.NLink))
buff.WriteString(fmt.Sprintf("Flag[%d]", i.Flag))
buff.WriteString(fmt.Sprintf("Reserved[%d]", i.Reserved))
buff.WriteString(fmt.Sprintf("Extents[%s]", i.Extents))
buff.WriteString(fmt.Sprintf("ObjExtents[%s]", i.ObjExtents))
buff.WriteString(fmt.Sprintf("verSeq[%v]", i.getVer()))
buff.WriteString(fmt.Sprintf("multiSnap.multiVersions.len[%v]", i.getLayerLen()))
buff.WriteString("}")
return buff.String()
}
// NewInode returns a new Inode instance with the specified inode ID and type.
// The CreateTime, AccessTime and ModifyTime will be set to the current time.
func NewInode(ino uint64, t uint32) *Inode {
ts := timeutil.GetCurrentTimeUnix()
i := &Inode{
Inode: ino,
Type: t,
Generation: 1,
CreateTime: ts,
AccessTime: ts,
ModifyTime: ts,
NLink: 1,
Extents: NewSortedExtents(),
ObjExtents: NewSortedObjExtents(),
multiSnap: nil,
}
if proto.IsDir(t) {
i.NLink = 2
}
return i
}
// Less tests whether the current Inode item is less than the given one.
// This method is necessary for the B-Tree item implementation.
func (i *Inode) Less(than BtreeItem) bool {
ino, ok := than.(*Inode)
return ok && i.Inode < ino.Inode
}
// Copy returns a copy of the inode.
func (i *Inode) Copy() BtreeItem {
newIno := NewInode(i.Inode, i.Type)
i.RLock()
newIno.Uid = i.Uid
newIno.Gid = i.Gid
newIno.Size = i.Size
newIno.Generation = i.Generation
newIno.CreateTime = i.CreateTime
newIno.ModifyTime = i.ModifyTime
newIno.AccessTime = i.AccessTime
if size := len(i.LinkTarget); size > 0 {
newIno.LinkTarget = make([]byte, size)
copy(newIno.LinkTarget, i.LinkTarget)
}
newIno.NLink = i.NLink
newIno.Flag = i.Flag
newIno.Reserved = i.Reserved
newIno.Extents = i.Extents.Clone()
newIno.ObjExtents = i.ObjExtents.Clone()
if i.multiSnap != nil {
newIno.multiSnap = &InodeMultiSnap{
verSeq: i.getVer(),
multiVersions: i.multiSnap.multiVersions.Clone(),
ekRefMap: i.multiSnap.ekRefMap,
}
}
i.RUnlock()
return newIno
}
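// CopyInodeOnly copies cInode's metadata fields but reuses the receiver's Extents,
// ObjExtents and multiSnap references.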
func (i *Inode) CopyInodeOnly(cInode *Inode) *Inode {
tmpInode := cInode.CopyDirectly().(*Inode)
tmpInode.Extents = i.Extents
tmpInode.ObjExtents = i.ObjExtents
tmpInode.multiSnap = i.multiSnap
return tmpInode
}
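// CopyDirectly copies the inode without taking the lock and without the multi-version (multiSnap) data.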
func (i *Inode) CopyDirectly() BtreeItem {
newIno := NewInode(i.Inode, i.Type)
newIno.Uid = i.Uid
newIno.Gid = i.Gid
newIno.Size = i.Size
newIno.Generation = i.Generation
newIno.CreateTime = i.CreateTime
newIno.ModifyTime = i.ModifyTime
newIno.AccessTime = i.AccessTime
if size := len(i.LinkTarget); size > 0 {
newIno.LinkTarget = make([]byte, size)
copy(newIno.LinkTarget, i.LinkTarget)
}
newIno.NLink = i.NLink
newIno.Flag = i.Flag
newIno.Reserved = i.Reserved
newIno.Extents = i.Extents.Clone()
newIno.ObjExtents = i.ObjExtents.Clone()
return newIno
}
// MarshalToJSON is the wrapper of json.Marshal.
func (i *Inode) MarshalToJSON() ([]byte, error) {
i.RLock()
defer i.RUnlock()
return json.Marshal(i)
}
// Marshal marshals the inode into a byte array.
func (i *Inode) Marshal() (result []byte, err error) {
keyBytes := i.MarshalKey()
valBytes := i.MarshalValue()
keyLen := uint32(len(keyBytes))
valLen := uint32(len(valBytes))
buff := bytes.NewBuffer(make([]byte, 0, 128))
if err = binary.Write(buff, binary.BigEndian, keyLen); err != nil {
return
}
if _, err = buff.Write(keyBytes); err != nil {
return
}
if err = binary.Write(buff, binary.BigEndian, valLen); err != nil {
return
}
if _, err = buff.Write(valBytes); err != nil {
return
}
result = buff.Bytes()
return
}
// Unmarshal unmarshals the inode.
func (i *Inode) Unmarshal(raw []byte) (err error) {
var (
keyLen uint32
valLen uint32
)
buff := bytes.NewBuffer(raw)
if err = binary.Read(buff, binary.BigEndian, &keyLen); err != nil {
return
}
keyBytes := make([]byte, keyLen)
if _, err = buff.Read(keyBytes); err != nil {
return
}
if err = i.UnmarshalKey(keyBytes); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &valLen); err != nil {
return
}
valBytes := make([]byte, valLen)
if _, err = buff.Read(valBytes); err != nil {
return
}
err = i.UnmarshalValue(valBytes)
return
}
// Marshal marshals the inodeBatch into a byte array.
func (i InodeBatch) Marshal() ([]byte, error) {
buff := bytes.NewBuffer(make([]byte, 0))
if err := binary.Write(buff, binary.BigEndian, uint32(len(i))); err != nil {
return nil, err
}
for _, inode := range i {
bs, err := inode.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
}
return buff.Bytes(), nil
}
// InodeBatchUnmarshal unmarshals an InodeBatch from bytes.
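// A minimal round-trip sketch (inode IDs and mode are arbitrary):
//
//	batch := InodeBatch{NewInode(10, 0o644), NewInode(11, 0o644)}
//	raw, err := batch.Marshal()
//	if err == nil {
//		decoded, _ := InodeBatchUnmarshal(raw)
//		_ = decoded
//	}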
func InodeBatchUnmarshal(raw []byte) (InodeBatch, error) {
buff := bytes.NewBuffer(raw)
var batchLen uint32
if err := binary.Read(buff, binary.BigEndian, &batchLen); err != nil {
return nil, err
}
result := make(InodeBatch, 0, int(batchLen))
var dataLen uint32
for j := 0; j < int(batchLen); j++ {
if err := binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return nil, err
}
data := make([]byte, int(dataLen))
if _, err := buff.Read(data); err != nil {
return nil, err
}
ino := NewInode(0, 0)
if err := ino.Unmarshal(data); err != nil {
return nil, err
}
result = append(result, ino)
}
return result, nil
}
// MarshalKey marshals the inode key (the inode ID) to bytes.
func (i *Inode) MarshalKey() (k []byte) {
k = make([]byte, 8)
binary.BigEndian.PutUint64(k, i.Inode)
return
}
// UnmarshalKey unmarshals the inode key (the inode ID) from bytes.
func (i *Inode) UnmarshalKey(k []byte) (err error) {
i.Inode = binary.BigEndian.Uint64(k)
return
}
// MarshalInodeValue marshals the fields of a single inode layer (excluding the multi-version list) into buff.
func (i *Inode) MarshalInodeValue(buff *bytes.Buffer) {
var err error
if err = binary.Write(buff, binary.BigEndian, &i.Type); err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, &i.Uid); err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, &i.Gid); err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, &i.Size); err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, &i.Generation); err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, &i.CreateTime); err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, &i.AccessTime); err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, &i.ModifyTime); err != nil {
panic(err)
}
// write SymLink
symSize := uint32(len(i.LinkTarget))
if err = binary.Write(buff, binary.BigEndian, &symSize); err != nil {
panic(err)
}
if _, err = buff.Write(i.LinkTarget); err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, &i.NLink); err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, &i.Flag); err != nil {
panic(err)
}
if i.ObjExtents != nil && len(i.ObjExtents.eks) > 0 {
i.Reserved |= V2EnableColdInodeFlag
}
i.Reserved |= V3EnableSnapInodeFlag
// log.LogInfof("action[MarshalInodeValue] inode[%v] Reserved %v", i.Inode, i.Reserved)
if err = binary.Write(buff, binary.BigEndian, &i.Reserved); err != nil {
panic(err)
}
// marshal ExtentsKey
extData, err := i.Extents.MarshalBinary(true)
if err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(extData))); err != nil {
panic(err)
}
if _, err = buff.Write(extData); err != nil {
panic(err)
}
if i.Reserved&V2EnableColdInodeFlag > 0 {
// marshal ObjExtentsKey
objExtData, err := i.ObjExtents.MarshalBinary()
if err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(objExtData))); err != nil {
panic(err)
}
if _, err = buff.Write(objExtData); err != nil {
panic(err)
}
}
if err = binary.Write(buff, binary.BigEndian, i.getVer()); err != nil {
panic(err)
}
return
}
// MarshalValue marshals the value to bytes.
func (i *Inode) MarshalValue() (val []byte) {
var err error
buff := bytes.NewBuffer(make([]byte, 0, 128))
buff.Grow(64)
i.RLock()
i.MarshalInodeValue(buff)
if i.getLayerLen() > 0 && i.getVer() == 0 {
log.LogFatalf("action[MarshalValue] inode[%v] current verseq [%v], hist len (%v) stack(%v)", i.Inode, i.getVer(), i.getLayerLen(), string(debug.Stack()))
}
if err = binary.Write(buff, binary.BigEndian, int32(i.getLayerLen())); err != nil {
i.RUnlock()
panic(err)
}
if i.multiSnap != nil {
for _, ino := range i.multiSnap.multiVersions {
ino.MarshalInodeValue(buff)
}
}
val = buff.Bytes()
i.RUnlock()
return
}
// UnmarshalInodeValue unmarshals the fields of a single inode layer from buff.
func (i *Inode) UnmarshalInodeValue(buff *bytes.Buffer) (err error) {
if err = binary.Read(buff, binary.BigEndian, &i.Type); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &i.Uid); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &i.Gid); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &i.Size); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &i.Generation); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &i.CreateTime); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &i.AccessTime); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &i.ModifyTime); err != nil {
return
}
// read symLink
symSize := uint32(0)
if err = binary.Read(buff, binary.BigEndian, &symSize); err != nil {
return
}
if symSize > 0 {
i.LinkTarget = make([]byte, symSize)
if _, err = io.ReadFull(buff, i.LinkTarget); err != nil {
return
}
}
if err = binary.Read(buff, binary.BigEndian, &i.NLink); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &i.Flag); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &i.Reserved); err != nil {
return
}
// unmarshal ExtentsKey
if i.Extents == nil {
i.Extents = NewSortedExtents()
}
if i.ObjExtents == nil {
i.ObjExtents = NewSortedObjExtents()
}
v3 := i.Reserved&V3EnableSnapInodeFlag > 0
v2 := i.Reserved&V2EnableColdInodeFlag > 0
if v2 || v3 {
extSize := uint32(0)
if err = binary.Read(buff, binary.BigEndian, &extSize); err != nil {
return
}
if extSize > 0 {
extBytes := make([]byte, extSize)
if _, err = io.ReadFull(buff, extBytes); err != nil {
return
}
var ekRef *sync.Map
if err, ekRef = i.Extents.UnmarshalBinary(extBytes, v3); err != nil {
return
}
// log.LogDebugf("inode[%v] ekRef %v", i.Inode, ekRef)
if ekRef != nil {
if i.multiSnap == nil {
i.multiSnap = NewMultiSnap(0)
}
// log.LogDebugf("inode[%v] ekRef %v", i.Inode, ekRef)
i.multiSnap.ekRefMap = ekRef
}
}
} else {
if err, _ = i.Extents.UnmarshalBinary(buff.Bytes(), false); err != nil {
return
}
return
}
if v2 {
// unmarshal ObjExtentsKey
ObjExtSize := uint32(0)
if err = binary.Read(buff, binary.BigEndian, &ObjExtSize); err != nil {
return
}
if ObjExtSize > 0 {
objExtBytes := make([]byte, ObjExtSize)
if _, err = io.ReadFull(buff, objExtBytes); err != nil {
return
}
if err = i.ObjExtents.UnmarshalBinary(objExtBytes); err != nil {
return
}
}
}
if v3 {
var seq uint64
if err = binary.Read(buff, binary.BigEndian, &seq); err != nil {
return
}
if seq != 0 {
i.setVer(seq)
}
}
return
}
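// GetSpaceSize returns the space occupied by the inode's extents; temp files (unlinked regular files) report zero.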
func (i *Inode) GetSpaceSize() (extSize uint64) {
if i.IsTempFile() {
return
}
extSize += i.Extents.LayerSize()
return
}
// UnmarshalValue unmarshals the value from bytes.
func (i *Inode) UnmarshalValue(val []byte) (err error) {
buff := bytes.NewBuffer(val)
if err = i.UnmarshalInodeValue(buff); err != nil {
return
}
if i.Reserved&V3EnableSnapInodeFlag > 0 {
var verCnt int32
if err = binary.Read(buff, binary.BigEndian, &verCnt); err != nil {
log.LogInfof("action[UnmarshalValue] err get ver cnt inode[%v] new seq [%v]", i.Inode, i.getVer())
return
}
if verCnt > 0 && i.getVer() == 0 {
err = fmt.Errorf("inode[%v] verCnt %v root ver [%v]", i.Inode, verCnt, i.getVer())
log.LogFatalf("UnmarshalValue. %v", err)
return
}
for idx := int32(0); idx < verCnt; idx++ {
ino := &Inode{Inode: i.Inode}
if err = ino.UnmarshalInodeValue(buff); err != nil {
return
}
// make sure the top-layer multiSnap exists before its ekRefMap is touched below
if i.multiSnap == nil {
i.multiSnap = &InodeMultiSnap{}
}
if ino.multiSnap != nil && ino.multiSnap.ekRefMap != nil {
if i.multiSnap.ekRefMap == nil {
i.multiSnap.ekRefMap = new(sync.Map)
}
// log.LogDebugf("UnmarshalValue. inode[%v] merge top layer multiSnap.ekRefMap with layer %v", i.Inode, idx)
proto.MergeSplitKey(i.Inode, i.multiSnap.ekRefMap, ino.multiSnap.ekRefMap)
}
// log.LogDebugf("action[UnmarshalValue] inode[%v] old seq [%v] hist len %v", ino.Inode, ino.getVer(), i.getLayerLen())
i.multiSnap.multiVersions = append(i.multiSnap.multiVersions, ino)
}
}
return
}
// AppendExtents appends the given extent keys to the inode's extent list.
func (i *Inode) AppendExtents(eks []proto.ExtentKey, ct int64, volType int) (delExtents []proto.ExtentKey) {
if proto.IsCold(volType) {
return
}
i.Lock()
defer i.Unlock()
for _, ek := range eks {
delItems := i.Extents.Append(ek)
size := i.Extents.Size()
if i.Size < size {
i.Size = size
}
delExtents = append(delExtents, delItems...)
}
i.Generation++
i.ModifyTime = ct
return
}
// AppendObjExtents appends the given object extent keys to the inode's object extent list.
func (i *Inode) AppendObjExtents(eks []proto.ObjExtentKey, ct int64) (err error) {
i.Lock()
defer i.Unlock()
for _, ek := range eks {
err = i.ObjExtents.Append(ek)
if err != nil {
return
}
size := i.ObjExtents.Size()
if i.Size < size {
i.Size = size
}
}
i.Generation++
i.ModifyTime = ct
return
}
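// PrintAllVersionInfo logs the version sequence of every snapshot layer of the inode.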
func (i *Inode) PrintAllVersionInfo() {
if i.multiSnap == nil {
return
}
log.LogInfof("action[PrintAllVersionInfo] inode[%v] verSeq [%v] hist len [%v]", i.Inode, i.getVer(), i.getLayerLen())
for id, info := range i.multiSnap.multiVersions {
log.LogInfof("action[PrintAllVersionInfo] layer [%v] verSeq [%v] inode[%v]", id, info.getVer(), info)
}
}
// MultiLayerClearExtByVer removes from the given layer the extent keys whose seq is newer than dVerSeq and returns them.
func (i *Inode) MultiLayerClearExtByVer(layer int, dVerSeq uint64) (delExtents []proto.ExtentKey) {
var ino *Inode
if layer == 0 {
ino = i
} else {
ino = i.multiSnap.multiVersions[layer-1]
}
ino.Extents.Lock()
defer ino.Extents.Unlock()
// filter in place: keep extents whose seq is not newer than dVerSeq and
// collect the newer ones so the caller can delete them
eks := ino.Extents.eks[:0]
for _, ek := range ino.Extents.eks {
if ek.GetSeq() > dVerSeq {
delExtents = append(delExtents, ek)
continue
}
eks = append(eks, ek)
}
ino.Extents.eks = eks
return
}
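// mergeExtentArr merges two extent-key slices that are sorted by FileOffset into one
// sorted slice, coalescing adjacent split extents with the same seq and releasing
// their split references via DecSplitEk.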
func (i *Inode) mergeExtentArr(mpId uint64, extentKeysLeft []proto.ExtentKey, extentKeysRight []proto.ExtentKey) []proto.ExtentKey {
lCnt := len(extentKeysLeft)
rCnt := len(extentKeysRight)
sortMergedExts := make([]proto.ExtentKey, 0, lCnt+rCnt)
lPos, rPos := 0, 0
doWork := func(keyArr *[]proto.ExtentKey, pos int) {
mLen := len(sortMergedExts)
if mLen > 0 && sortMergedExts[mLen-1].IsSequenceWithSameSeq(&(*keyArr)[pos]) {
sortMergedExts[mLen-1].Size += (*keyArr)[pos].Size
log.LogDebugf("[mergeExtentArr] mpId[%v]. ek left %v right %v", mpId, sortMergedExts[mLen-1], (*keyArr)[pos])
if !sortMergedExts[mLen-1].IsSplit() || !(*keyArr)[pos].IsSplit() {
log.LogErrorf("[mergeExtentArr] mpId[%v] ino[%v] ek merge left %v right %v not all split", mpId, i.Inode, sortMergedExts[mLen-1], (*keyArr)[pos])
}
i.DecSplitEk(mpId, &(*keyArr)[pos])
} else {
sortMergedExts = append(sortMergedExts, (*keyArr)[pos])
}
}
for {
if lPos == lCnt {
sortMergedExts = append(sortMergedExts, extentKeysRight[rPos:]...)
break
}
if rPos == rCnt {
sortMergedExts = append(sortMergedExts, extentKeysLeft[lPos:]...)
break
}
if extentKeysLeft[lPos].FileOffset < extentKeysRight[rPos].FileOffset {
doWork(&extentKeysLeft, lPos)
lPos++
} else {
doWork(&extentKeysRight, rPos)
rPos++
}
}
return sortMergedExts
}
// RestoreExts2NextLayer restores extent info to the next older version, or marks it for deletion if no suitable version exists.
// The list (multiSnap.multiVersions) records every point of modification on the inode; each extent must belong to exactly one layer.
// If the layer being deleted is the top layer, its version is moved down to the next layer; otherwise the extents it owns are exclusive and can be dropped.
func (i *Inode) RestoreExts2NextLayer(mpId uint64, delExtentsOrigin []proto.ExtentKey, curVer uint64, idx int) (delExtents []proto.ExtentKey, err error) {
log.LogInfof("action[RestoreMultiSnapExts] mpId [%v] curVer [%v] delExtents size [%v] hist len [%v]", mpId, curVer, len(delExtentsOrigin), i.getLayerLen())
// no versions left: all old versions have been deleted
if i.isEmptyVerList() {
log.LogWarnf("action[RestoreMultiSnapExts] mpId [%v] inode[%v] restore have no old version left", mpId, i.Inode)
return delExtentsOrigin, nil
}
lastSeq := i.multiSnap.multiVersions[idx].getVer()
specSnapExtent := make([]proto.ExtentKey, 0)
for _, delExt := range delExtentsOrigin {
// a delExt whose seq is larger than the next version's seq does not belong to any
// remaining version, so it can be deleted directly
log.LogDebugf("action[RestoreMultiSnapExts] mpId [%v] inode[%v] ext split [%v] with seq[%v] gSeq[%v] try to del.the last seq [%v], ek details[%v]",
mpId, i.Inode, delExt.IsSplit(), delExt.GetSeq(), curVer, lastSeq, delExt)
if delExt.GetSeq() > lastSeq {
delExtents = append(delExtents, delExt)
} else {
log.LogInfof("action[RestoreMultiSnapExts] mpId [%v] inode[%v] move to level 1 delExt [%v] specSnapExtent size [%v]", mpId, i.Inode, delExt, len(specSnapExtent))
specSnapExtent = append(specSnapExtent, delExt)
}
}
if len(specSnapExtent) == 0 {
log.LogInfof("action[RestoreMultiSnapExts] mpId [%v] inode[%v] no need to move to level 1", mpId, i.Inode)
return
}
if len(specSnapExtent) > 0 && i.isEmptyVerList() {
err = fmt.Errorf("mpId [%v] inode[%v] error not found prev snapshot index", mpId, i.Inode)
log.LogErrorf("action[RestoreMultiSnapExts] mpId [%v] inode[%v] %v", mpId, i.Inode, err)
return
}
i.multiSnap.multiVersions[idx].Extents.Lock()
i.multiSnap.multiVersions[idx].Extents.eks = i.mergeExtentArr(mpId, i.multiSnap.multiVersions[idx].Extents.eks, specSnapExtent)
i.multiSnap.multiVersions[idx].Extents.Unlock()
return
}
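// unlinkTopLayer unlinks the inode at the top (current) layer. Depending on the mp version
// and the volume version list it may restore extents to a lower layer or create a new version
// first. It returns the extents that can be deleted, whether further processing is needed,
// and the op status.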
func (inode *Inode) unlinkTopLayer(mpId uint64, ino *Inode, mpVer uint64, verlist *proto.VolVersionInfoList) (ext2Del []proto.ExtentKey, doMore bool, status uint8) {
// If the inode has no snapshot of its own and no snapshot was taken after its version, unlink it directly without creating a snapshot.
// Otherwise the inode just moves to the upper layer, which behaves as if the snapshot were dropped.
log.LogDebugf("action[unlinkTopLayer] mpid [%v] mpver [%v] check if have snapshot depends on the deleitng ino[%v] (with no snapshot itself) found seq [%v], verlist %v",
mpId, mpVer, ino, inode.getVer(), verlist)
status = proto.OpOk
delFunc := func() (done bool) {
if inode.NLink > 1 {
log.LogDebugf("action[unlinkTopLayer] inode[%v] be unlinked, file link is %v", ino.Inode, inode.NLink)
inode.DecNLink()
doMore = false
return true
}
// first layer need delete
var err error
if ext2Del, err = inode.RestoreExts2NextLayer(mpId, inode.Extents.eks, mpVer, 0); err != nil {
log.LogErrorf("action[getAndDelVerInList] ino[%v] RestoreMultiSnapExts split error %v", inode.Inode, err)
status = proto.OpNotExistErr
log.LogDebugf("action[unlinkTopLayer] mp[%v] iino[%v]", mpId, ino)
return
}
inode.Extents.eks = inode.Extents.eks[:0]
log.LogDebugf("action[getAndDelVerInList] mp[%v] ino[%v] verseq [%v] get del exts %v", mpId, inode.Inode, inode.getVer(), ext2Del)
inode.DecNLink() // dIno should be inode
doMore = true
return
}
// If the top-layer verSeq equals the mp version, the inode deletion only happens on the first layer.
// A client-side delete may also operate on the top layer and is allowed to remove an inode whose version is older than the mp version,
// because a delete has two steps: 1) delete the dentry, 2) unlink the inode, and the version may be updated between the two steps.
// To make sure the inode is unlinked by a normal deletion, the SDK carries the dentry verSeq to distinguish it from other unlink actions.
if mpVer == inode.getVer() {
if inode.getLayerLen() == 0 {
log.LogDebugf("action[unlinkTopLayer] no snapshot available depends on ino[%v] not found seq [%v] and return, verlist %v", ino, inode.getVer(), verlist)
inode.DecNLink()
log.LogDebugf("action[unlinkTopLayer] inode[%v] be unlinked", ino.Inode)
// operate inode directly
doMore = true
return
}
log.LogDebugf("action[unlinkTopLayer] need restore.ino[%v] withseq [%v] equal mp seq, verlist %v",
ino, inode.getVer(), verlist)
// need restore
if !proto.IsDir(inode.Type) {
delFunc()
return
}
log.LogDebugf("action[unlinkTopLayer] inode[%v] be unlinked, Dir", ino.Inode)
inode.DecNLink()
doMore = true
return
}
log.LogDebugf("action[unlinkTopLayer] need create version.ino[%v] withseq [%v] not equal mp seq [%v], verlist %v", ino, inode.getVer(), mpVer, verlist)
if proto.IsDir(inode.Type) { // a dir carries the whole info while a file inode is partitioned, which is quite different
_, err := verlist.GetNextOlderVer(mpVer)
if err == nil {
log.LogDebugf("action[unlinkTopLayer] inode[%v] cann't get next older ver [%v] err %v", inode.Inode, mpVer, err)
inode.CreateVer(mpVer)
}
inode.DecNLink()
log.LogDebugf("action[unlinkTopLayer] inode[%v] be unlinked, Dir create ver 1st layer", ino.Inode)
doMore = true
} else {
ver, err := verlist.GetNextOlderVer(mpVer)
if err != nil {
if err.Error() == "not found" {
delFunc()
return
}
log.LogErrorf("action[unlinkTopLayer] inode[%v] cann't get next older ver [%v] err %v", inode.Inode, mpVer, err)
return
}
inode.CreateVer(mpVer) // protect origin version
if inode.NLink == 1 {
inode.CreateUnlinkVer(mpVer, ver) // create an effective top-level version
}
inode.DecNLink()
log.LogDebugf("action[unlinkTopLayer] inode[%v] be unlinked, File create ver 1st layer", ino.Inode)
}
return
}
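// dirUnlinkVerInlist unlinks a directory snapshot layer: the layer is dropped only if no
// live snapshot in verlist still falls inside its effective version range.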
func (inode *Inode) dirUnlinkVerInlist(ino *Inode, mpVer uint64, verlist *proto.VolVersionInfoList) (ext2Del []proto.ExtentKey, doMore bool, status uint8) {
var idxWithTopLayer int
var dIno *Inode
status = proto.OpOk
if dIno, idxWithTopLayer = inode.getInoByVer(ino.getVer(), false); dIno == nil {
log.LogDebugf("action[dirUnlinkVerInlist] ino[%v] not found", ino)
return
}
var endSeq uint64
if idxWithTopLayer == 0 {
// the head (top) layer is depended on and must not be dropped; do nothing here
log.LogDebugf("action[dirUnlinkVerInlist] ino[%v] first layer do nothing", ino)
return
}
// if any live snapshot at the mp level exists in the seq range from dIno up to its next ascending neighbor, the dIno snapshot is kept; otherwise it is dropped
if inode.multiSnap == nil {
log.LogWarnf("action[dirUnlinkVerInlist] ino[%v] multiSnap should not be nil", inode)
inode.multiSnap = &InodeMultiSnap{}
}
mIdx := idxWithTopLayer - 1
if mIdx == 0 {
endSeq = inode.getVer()
} else {
endSeq = inode.multiSnap.multiVersions[mIdx-1].getVer()
}
log.LogDebugf("action[dirUnlinkVerInlist] inode[%v] try drop multiVersion idx %v effective seq scope [%v,%v) ",
inode.Inode, mIdx, dIno.getVer(), endSeq)
doWork := func() bool {
verlist.RWLock.RLock()
defer verlist.RWLock.RUnlock()
for vidx, info := range verlist.VerList {
if info.Ver >= dIno.getVer() && info.Ver < endSeq {
log.LogDebugf("action[dirUnlinkVerInlist] inode[%v] dir layer idx %v still have effective snapshot seq [%v].so don't drop", inode.Inode, mIdx, info.Ver)
return false
}
if info.Ver >= endSeq || vidx == len(verlist.VerList)-1 {
log.LogDebugf("action[dirUnlinkVerInlist] inode[%v] try drop multiVersion idx %v and return", inode.Inode, mIdx)
inode.Lock()
inode.multiSnap.multiVersions = append(inode.multiSnap.multiVersions[:mIdx], inode.multiSnap.multiVersions[mIdx+1:]...)
inode.Unlock()
return true
}
log.LogDebugf("action[dirUnlinkVerInlist] inode[%v] try drop scope [%v, %v), mp ver [%v] not suitable", inode.Inode, dIno.getVer(), endSeq, info.Ver)
return true
}
return true
}
if !doWork() {
return
}
doMore = true
dIno.DecNLink()
return
}
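// unlinkVerInList unlinks the snapshot-layer inode selected by ino's version; directories
// are delegated to dirUnlinkVerInlist.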
func (inode *Inode) unlinkVerInList(mpId uint64, ino *Inode, mpVer uint64, verlist *proto.VolVersionInfoList) (ext2Del []proto.ExtentKey, doMore bool, status uint8) {
log.LogDebugf("action[unlinkVerInList] mpId [%v] ino[%v] try search seq [%v] isdir %v", mpId, ino, ino.getVer(), proto.IsDir(inode.Type))
if proto.IsDir(inode.Type) { // snapshot dir deletion don't take link into consider, but considers the scope of snapshot contrast to verList
return inode.dirUnlinkVerInlist(ino, mpVer, verlist)
}
var dIno *Inode
status = proto.OpOk
// special case: the snapshot is the last one and is depended on by an upper version, so update its version to the right one
// search upward until the current uncommitted version in the verList
if ino.getVer() == inode.getVer() || (isInitSnapVer(ino.getVer()) && inode.getVer() == 0) {
if len(verlist.VerList) == 0 {
status = proto.OpNotExistErr
log.LogErrorf("action[unlinkVerInList] inode[%v] verlist should be larger than 0, return not found", inode.Inode)
return
}
// just move to the upper layer; the requested snapshot is dropped
nVerSeq, found := inode.getLastestVer(inode.getVer(), verlist)
if !found {
status = proto.OpNotExistErr
return
}
log.LogDebugf("action[unlinkVerInList] inode[%v] update current verseq [%v] to %v", inode.Inode, inode.getVer(), nVerSeq)
inode.setVer(nVerSeq)
return
} else {
// don't unlink if no version satisfies the request
if ext2Del, dIno = inode.getAndDelVerInList(mpId, ino.getVer(), mpVer, verlist); dIno == nil {
status = proto.OpNotExistErr
log.LogDebugf("action[unlinkVerInList] ino[%v]", ino)
return
}
}
dIno.DecNLink()
log.LogDebugf("action[unlinkVerInList] inode[%v] snapshot layer be unlinked", ino.Inode)
doMore = true
return
}
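// ShouldDelVer reports whether version delVer can be deleted on this inode, returning a
// "not found" error when no matching version exists.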
func (i *Inode) ShouldDelVer(delVer uint64, mpVer uint64) (ok bool, err error) {
if i.getVer() == 0 {
if delVer > 0 {
if isInitSnapVer(delVer) {
return true, nil
}
return false, fmt.Errorf("not found")
} else {
// an mp version larger than zero means a snapshot happened but has not yet taken effect on this inode
if mpVer > 0 {
return false, nil
}
return true, nil
}
} else {
if delVer > i.getVer() {
return false, fmt.Errorf("not found")
} else if delVer == i.getVer() {
return true, nil
}
}
if isInitSnapVer(delVer) {
tailVer, _ := i.getTailVerInList()
if tailVer == 0 {
return true, nil
}
return false, fmt.Errorf("not found")
}
if i.multiSnap == nil {
return false, fmt.Errorf("not found")
}
for _, inoVer := range i.multiSnap.multiVersions {
if inoVer.getVer() == delVer {
return true, nil
}
if inoVer.getVer() < delVer {
break
}
}
return false, fmt.Errorf("not found")
}
// The returned idx counts the top layer as index 0, so indexes into multiSnap.multiVersions are offset by 1.
//
// Note: searches all layers.
func (ino *Inode) getInoByVer(verSeq uint64, equal bool) (i *Inode, idx int) {
ino.RLock()
defer ino.RUnlock()
if verSeq == 0 || verSeq == ino.getVer() || (isInitSnapVer(verSeq) && ino.getVer() == 0) {
return ino, 0
}
if isInitSnapVer(verSeq) {
listLen := ino.getLayerLen()
if listLen == 0 {
log.LogDebugf("action[getInoByVer] ino[%v] no multiversion", ino.Inode)
return
}
i = ino.multiSnap.multiVersions[listLen-1]
if i.getVer() != 0 {
log.LogDebugf("action[getInoByVer] ino[%v] lay seq [%v]", ino.Inode, i.getVer())
return nil, 0
}
return i, listLen
}
if verSeq > 0 && ino.getVer() > verSeq {
if ino.multiSnap != nil {
for id, iTmp := range ino.multiSnap.multiVersions {
if verSeq == iTmp.getVer() {
log.LogDebugf("action[getInoByVer] ino[%v] get in multiversion id[%v]", ino.Inode, id)
return iTmp, id + 1
} else if verSeq > iTmp.getVer() {
if !equal {
log.LogDebugf("action[getInoByVer] ino[%v] get in multiversion id[%v], %v, %v", ino.Inode, id, verSeq, iTmp.getVer())
return iTmp, id + 1
}
log.LogDebugf("action[getInoByVer] ino[%v] get in multiversion id[%v]", ino.Inode, id)
return
}
}
}
} else {
if !equal {
log.LogDebugf("action[getInoByVer] ino[%v]", ino.Inode)
return ino, 0
}
}
return
}
// 1. Check whether the dVer layer is the last layer in the system: if so, drop it entirely; otherwise go to step 2.
// 2. If a system layer exists between dVer and the inode's next older layer (it may not exist), drop the extents related to dVer and update the version.
// 3. Otherwise, restore the extents to the inode's next layer.
func (i *Inode) getAndDelVerInList(mpId uint64, dVer uint64, mpVer uint64, verlist *proto.VolVersionInfoList) (delExtents []proto.ExtentKey, ino *Inode) {
var err error
verlist.RWLock.RLock()
defer verlist.RWLock.RUnlock()
log.LogDebugf("action[getAndDelVerInList] ino[%v] verseq [%v] request del ver [%v] hist len %v isTmpFile %v",
i.Inode, i.getVer(), dVer, i.getLayerLen(), i.IsTempFile())
// reading inode fields here is fine; the lock is only needed for writes
inoVerLen := i.getLayerLen()
if inoVerLen == 0 {
log.LogDebugf("action[getAndDelVerInList] ino[%v] RestoreMultiSnapExts no left", i.Inode)
return
}
// delete snapshot version
if isInitSnapVer(dVer) {
dVer = 0
}
lastVer := i.getVer()
for id, mIno := range i.multiSnap.multiVersions {
log.LogDebugf("action[getAndDelVerInList] ino[%v] multiSnap.multiVersions level %v verseq [%v]", i.Inode, id, mIno.getVer())
if mIno.getVer() < dVer {
log.LogDebugf("action[getAndDelVerInList] ino[%v] multiSnap.multiVersions level %v verseq [%v]", i.Inode, id, mIno.getVer())
return
}
if mIno.getVer() == dVer {
log.LogDebugf("action[getAndDelVerInList] ino[%v] ver [%v] step 3", i.Inode, mIno.getVer())
// 2. the next version should be determined from the verList, not only from the inode's own multi-version list
var nVerSeq uint64
if nVerSeq, err = verlist.GetNextNewerVer(dVer); err != nil {
log.LogDebugf("action[getAndDelVerInList] get next version failed, err %v", err)
return
}
if lastVer > nVerSeq {
mIno.setVer(nVerSeq)
return
}
if i.isTailIndexInList(id) {
i.multiSnap.multiVersions = i.multiSnap.multiVersions[:inoVerLen-1]
log.LogDebugf("action[getAndDelVerInList] ino[%v] idx %v be dropped", i.Inode, inoVerLen)
return mIno.Extents.eks, mIno
}
if nVerSeq, err = verlist.GetNextOlderVer(dVer); err != nil {
log.LogDebugf("action[getAndDelVerInList] get next version failed, err %v", err)
return
}
log.LogDebugf("action[getAndDelVerInList] ino[%v] ver [%v] nextVerseq [%v] step 3 ver ", i.Inode, mIno.getVer(), nVerSeq)
// 2. the system's next layer does not exist in the inode's version list: update the current layer to the next layer and filter out extents with this verSeq,
// i.e. change this layer's verSeq to the neighbor layer's info and skip the version-delete process
if nVerSeq > i.multiSnap.multiVersions[id+1].getVer() {
log.LogDebugf("action[getAndDelVerInList] ino[%v] get next version in verList update ver from %v to %v.And delete exts with ver [%v]",
i.Inode, i.multiSnap.multiVersions[id].getVer(), nVerSeq, dVer)
i.multiSnap.multiVersions[id].setVerNoCheck(nVerSeq)
i.multiSnap.multiVersions[id] = i.CopyInodeOnly(i.multiSnap.multiVersions[id+1])
delExtents = i.MultiLayerClearExtByVer(id+1, dVer)
ino = i.multiSnap.multiVersions[id]
if len(i.multiSnap.multiVersions[id].Extents.eks) != 0 {
log.LogDebugf("action[getAndDelVerInList] ino[%v] after clear self still have ext and left", i.Inode)
return
}
} else {
log.LogDebugf("action[getAndDelVerInList] ino[%v] ver [%v] nextver [%v] step 3 ver ", i.Inode, mIno.getVer(), nVerSeq)
// 3. the next layer exists: the deleted version and the next version are neighbors in the verlist, so restore the extents and then delete the layer
if delExtents, err = i.RestoreExts2NextLayer(mpId, mIno.Extents.eks, dVer, id+1); err != nil {
log.LogDebugf("action[getAndDelVerInList] ino[%v] RestoreMultiSnapExts split error %v", i.Inode, err)
return
}
}
// delete layer id
i.multiSnap.multiVersions = append(i.multiSnap.multiVersions[:id], i.multiSnap.multiVersions[id+1:]...)
log.LogDebugf("action[getAndDelVerInList] ino[%v] verseq [%v] get del exts %v", i.Inode, i.getVer(), delExtents)
return delExtents, mIno
}
lastVer = mIno.getVer()
}
return
}
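// getLastestVer returns the first version in verlist that is newer than reqVerSeq, if any.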
func (i *Inode) getLastestVer(reqVerSeq uint64, verlist *proto.VolVersionInfoList) (uint64, bool) {
verlist.RWLock.RLock()
defer verlist.RWLock.RUnlock()
if len(verlist.VerList) == 0 {
return 0, false
}
for _, info := range verlist.VerList {
if info.Ver > reqVerSeq {
return info.Ver, true
}
}
log.LogDebugf("action[getLastestVer] inode[%v] reqVerseq [%v] not found, the largetst one %v",
i.Inode, reqVerSeq, verlist.VerList[len(verlist.VerList)-1].Ver)
return 0, false
}
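// CreateUnlinkVer snapshots the current inode (including its extents) into the newest
// snapshot layer with version nVer, then clears the top layer's extents, marks it deleted
// and advances its version to mpVer.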
func (i *Inode) CreateUnlinkVer(mpVer uint64, nVer uint64) {
log.LogDebugf("action[CreateUnlinkVer] inode[%v] mpver [%v] nver [%v]", i.Inode, mpVer, nVer)
// inode copy not include multi ver array
ino := i.CopyDirectly().(*Inode)
ino.setVer(nVer)
i.Extents = NewSortedExtents()
i.ObjExtents = NewSortedObjExtents()
i.SetDeleteMark()
log.LogDebugf("action[CreateUnlinkVer] inode[%v] create new version [%v] and store old one [%v], hist len [%v]",
i.Inode, mpVer, i.getVer(), i.getLayerLen())
i.Lock()
if i.multiSnap == nil {
i.multiSnap = &InodeMultiSnap{}
}
if i.getLayerVer(0) == nVer {
i.multiSnap.multiVersions[0] = ino
} else {
i.multiSnap.multiVersions = append([]*Inode{ino}, i.multiSnap.multiVersions...)
}
i.setVer(mpVer)
i.Unlock()
}
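// CreateVer prepends a snapshot layer that records the current version (with empty extent
// lists) and advances the top-layer version to ver.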
func (i *Inode) CreateVer(ver uint64) {
// inode copy not include multi ver array
ino := i.CopyDirectly().(*Inode)
ino.Extents = NewSortedExtents()
ino.ObjExtents = NewSortedObjExtents()
ino.setVer(i.getVer())
i.setVer(ver)
i.Lock()
defer i.Unlock()
log.LogDebugf("action[CreateVer] inode[%v] create new version [%v] and store old one [%v], hist len [%v]",
i.Inode, ver, i.getVer(), i.getLayerLen())
if i.multiSnap == nil {
i.multiSnap = &InodeMultiSnap{}
}
i.multiSnap.multiVersions = append([]*Inode{ino}, i.multiSnap.multiVersions...)
}
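// buildMultiSnap lazily allocates the multiSnap structure and its ekRefMap.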
func (i *Inode) buildMultiSnap() {
if i.multiSnap == nil {
i.multiSnap = &InodeMultiSnap{}
}
if i.multiSnap.ekRefMap == nil {
i.multiSnap.ekRefMap = new(sync.Map)
}
}
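// SplitExtentWithCheck splits an extent under the current mp version: it creates a new
// version layer first if the mp version has advanced, then restores any displaced extents
// to the next lower layer.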
func (i *Inode) SplitExtentWithCheck(param *AppendExtParam) (delExtents []proto.ExtentKey, status uint8) {
var err error
param.ek.SetSeq(param.mpVer)
log.LogDebugf("action[SplitExtentWithCheck] mpId[%v].inode[%v],ek [%v],hist len %v", param.mpId, i.Inode, param.ek, i.getLayerLen())
if param.mpVer != i.getVer() {
log.LogDebugf("action[SplitExtentWithCheck] mpId[%v].CreateVer ver [%v]", param.mpId, param.mpVer)
i.CreateVer(param.mpVer)
}
i.Lock()
defer i.Unlock()
i.buildMultiSnap()
delExtents, status = i.Extents.SplitWithCheck(param.mpId, i.Inode, param.ek, i.multiSnap.ekRefMap)
if status != proto.OpOk {
log.LogErrorf("action[SplitExtentWithCheck] mpId[%v].status [%v]", param.mpId, status)
return
}
if len(delExtents) == 0 {
return
}
if err = i.CreateLowerVersion(i.getVer(), param.multiVersionList); err != nil {
return
}
if delExtents, err = i.RestoreExts2NextLayer(param.mpId, delExtents, param.mpVer, 0); err != nil {
log.LogErrorf("action[fsmAppendExtentWithCheck] mpId[%v].ino[%v] RestoreMultiSnapExts split error %v", param.mpId, i.Inode, err)
return
}
if proto.IsHot(param.volType) {
i.Generation++
i.ModifyTime = param.ct
}
return
}
// try to create version between curVer and seq of multiSnap.multiVersions[0] in verList
func (i *Inode) CreateLowerVersion(curVer uint64, verlist *proto.VolVersionInfoList) (err error) {
verlist.RWLock.RLock()
defer verlist.RWLock.RUnlock()
log.LogDebugf("CreateLowerVersion inode[%v] curver [%v]", i.Inode, curVer)
if len(verlist.VerList) <= 1 {
return
}
if i.isEmptyVerList() {
return
}
var nextVer uint64
for _, info := range verlist.VerList {
if info.Ver < curVer {
nextVer = info.Ver
}
if info.Ver >= curVer {
break
}
}
if nextVer <= i.getLayerVer(0) {
log.LogDebugf("CreateLowerVersion nextver [%v] layer 0 ver [%v]", nextVer, i.getLayerVer(0))
return
}
ino := i.CopyDirectly().(*Inode)
ino.Extents = NewSortedExtents()
ino.ObjExtents = NewSortedObjExtents()
ino.setVer(nextVer)
log.LogDebugf("action[CreateLowerVersion] inode[%v] create new version [%v] and store old one [%v], hist len [%v]",
i.Inode, ino, i.getVer(), i.getLayerLen())
if i.multiSnap == nil {
i.multiSnap = &InodeMultiSnap{}
}
i.multiSnap.multiVersions = append([]*Inode{ino}, i.multiSnap.multiVersions...)
return
}
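// AppendExtParam bundles the arguments of AppendExtentWithCheck and SplitExtentWithCheck.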
type AppendExtParam struct {
mpId uint64
mpVer uint64
multiVersionList *proto.VolVersionInfoList
ek proto.ExtentKey
ct int64
discardExtents []proto.ExtentKey
volType int
}
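// AppendExtentWithCheck appends an extent key under the current mp version; when
// multi-versioning is active, extents replaced by the append are restored to the next
// lower layer instead of being deleted outright.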
func (i *Inode) AppendExtentWithCheck(param *AppendExtParam) (delExtents []proto.ExtentKey, status uint8) {
param.ek.SetSeq(param.mpVer)
log.LogDebugf("action[AppendExtentWithCheck] mpId[%v].mpver [%v] inode[%v] and fsm ver [%v],ek [%v],hist len %v",
param.mpId, param.mpVer, i.Inode, i.getVer(), param.ek, i.getLayerLen())
if param.mpVer != i.getVer() {
log.LogInfof("action[AppendExtentWithCheck] mpId[%v].inode ver [%v]", param.mpId, i.getVer())
i.CreateVer(param.mpVer)
}
i.Lock()
defer i.Unlock()
refFunc := func(key *proto.ExtentKey) { i.insertEkRefMap(param.mpId, key) }
delExtents, status = i.Extents.AppendWithCheck(i.Inode, param.ek, refFunc, param.discardExtents)
if status != proto.OpOk {
log.LogErrorf("action[AppendExtentWithCheck] mpId[%v].status [%v]", param.mpId, status)
return
}
// multi version take effect
if i.getVer() > 0 && len(delExtents) > 0 {
var err error
if err = i.CreateLowerVersion(i.getVer(), param.multiVersionList); err != nil {
return
}
if delExtents, err = i.RestoreExts2NextLayer(param.mpId, delExtents, param.mpVer, 0); err != nil {
log.LogErrorf("action[AppendExtentWithCheck] mpId[%v].RestoreMultiSnapExts err %v", param.mpId, err)
return nil, proto.OpErr
}
}
if proto.IsHot(param.volType) {
size := i.Extents.Size()
if i.Size < size {
i.Size = size
}
i.Generation++
i.ModifyTime = param.ct
}
return
}
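// ExtentsTruncate truncates the extent list to length, updates size, mtime and generation,
// and returns the truncated extents.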
func (i *Inode) ExtentsTruncate(length uint64, ct int64, doOnLastKey func(*proto.ExtentKey), insertRefMap func(ek *proto.ExtentKey)) (delExtents []proto.ExtentKey) {
delExtents = i.Extents.Truncate(length, doOnLastKey, insertRefMap)
i.Size = length
i.ModifyTime = ct
i.Generation++
return
}
// IncNLink increases the nLink value by one.
func (i *Inode) IncNLink(verSeq uint64) {
if i.getVer() < verSeq {
i.CreateVer(verSeq)
}
i.Lock()
i.NLink++
i.Unlock()
}
// DecNLink decreases the nLink value by one.
func (i *Inode) DecNLink() {
i.Lock()
if proto.IsDir(i.Type) && i.NLink == 2 {
i.NLink--
}
if i.NLink > 0 {
i.NLink--
}
i.Unlock()
}
// DecNLinkByVer creates a new version when verSeq is newer than the inode's version, then decreases the nLink value by one.
func (i *Inode) DecNLinkByVer(verSeq uint64) {
if i.getVer() < verSeq {
i.CreateVer(verSeq)
}
i.DecNLink()
}
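// DecSplitExts decrements the split reference count of every split extent in delExtents;
// when the last reference is released, the split flag is cleared so the extent content can be removed.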
func (i *Inode) DecSplitExts(mpId uint64, delExtents interface{}) {
log.LogDebugf("[DecSplitExts] mpId [%v] inode[%v]", mpId, i.Inode)
cnt := len(delExtents.([]proto.ExtentKey))
for id := 0; id < cnt; id++ {
ek := &delExtents.([]proto.ExtentKey)[id]
if !ek.IsSplit() {
log.LogDebugf("[DecSplitExts] mpId [%v] ek not split %v", mpId, ek)
continue
}
if i.multiSnap == nil || i.multiSnap.ekRefMap == nil {
log.LogErrorf("[DecSplitExts] mpid [%v]. inode[%v] multiSnap.ekRefMap is nil", mpId, i.Inode)
return
}
ok, last := i.DecSplitEk(mpId, ek)
if !ok {
log.LogErrorf("[DecSplitExts] mpid [%v]. ek [%v] not found!", mpId, ek)
continue
}
if last {
log.LogDebugf("[DecSplitExts] mpid [%v] ek [%v] split flag be unset to remove all content", mpId, ek)
ek.SetSplit(false)
}
}
}
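// DecSplitEk decrements the reference count of a split extent keyed by PartitionId<<32|ExtentId;
// it reports whether the key was found and whether this was the last reference.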
func (i *Inode) DecSplitEk(mpId uint64, ext *proto.ExtentKey) (ok bool, last bool) {
log.LogDebugf("[DecSplitEk] mpId[%v] inode[%v] dp [%v] extent id[%v].key %v ext %v", mpId, i.Inode, ext.PartitionId, ext.ExtentId,
ext.PartitionId<<32|ext.ExtentId, ext)
if i.multiSnap == nil || i.multiSnap.ekRefMap == nil {
log.LogErrorf("DecSplitEk. multiSnap %v", i.multiSnap)
return
}
if val, ok := i.multiSnap.ekRefMap.Load(ext.PartitionId<<32 | ext.ExtentId); !ok {
log.LogErrorf("[DecSplitEk] mpId[%v]. dp [%v] inode[%v] ext not found", mpId, ext.PartitionId, i.Inode)
return false, false
} else {
if val.(uint32) == 0 {
log.LogErrorf("[DecSplitEk] mpId[%v]. dp [%v] inode[%v] ek ref is zero!", mpId, ext.PartitionId, i.Inode)
return false, false
}
if val.(uint32) == 1 {
log.LogDebugf("[DecSplitEk] mpId[%v] inode[%v] dp [%v] extent id[%v].key %v", mpId, i.Inode, ext.PartitionId, ext.ExtentId,
ext.PartitionId<<32|ext.ExtentId)
i.multiSnap.ekRefMap.Delete(ext.PartitionId<<32 | ext.ExtentId)
return true, true
}
i.multiSnap.ekRefMap.Store(ext.PartitionId<<32|ext.ExtentId, val.(uint32)-1)
log.LogDebugf("[DecSplitEk] mpId[%v]. extend dp [%v] inode[%v] ek [%v] val %v", mpId, ext.PartitionId, i.Inode, ext, val.(uint32)-1)
return true, false
}
}
// GetDecNLinkResult returns the nLink value as it would be after a DecNLink, without modifying the inode.
func (i *Inode) GetDecNLinkResult() (nLink uint32) {
i.Lock()
nLink = i.NLink
if proto.IsDir(i.Type) && nLink == 2 {
nLink--
}
if nLink > 0 {
nLink--
}
i.Unlock()
return
}
// GetNLink returns the nLink value.
func (i *Inode) GetNLink() uint32 {
i.RLock()
defer i.RUnlock()
return i.NLink
}
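// IsTempFile reports whether the inode is a non-directory whose link count has dropped to zero.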
func (i *Inode) IsTempFile() bool {
i.RLock()
ok := i.NLink == 0 && !proto.IsDir(i.Type)
i.RUnlock()
return ok
}
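// IsEmptyDir reports whether the inode is a directory with NLink <= 2.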
func (i *Inode) IsEmptyDir() bool {
i.RLock()
ok := proto.IsDir(i.Type) && i.NLink <= 2
i.RUnlock()
return ok
}
func (i *Inode) IsEmptyDirAndNoSnapshot() bool {
i.RLock()
ok := proto.IsDir(i.Type) && i.NLink <= 2 && i.getLayerLen() == 0
i.RUnlock()
return ok
}
func (i *Inode) IsTopLayerEmptyDir() bool {
i.RLock()
ok := proto.IsDir(i.Type) && i.NLink <= 2
i.RUnlock()
return ok
}
// SetDeleteMark sets the DeleteMarkFlag. TODO: markDelete or deleteMark? markDelete has been used in datanode.
func (i *Inode) SetDeleteMark() {
i.Lock()
i.Flag |= DeleteMarkFlag
i.Unlock()
}
// ShouldDelete returns if the inode has been marked as deleted.
func (i *Inode) ShouldDelete() (ok bool) {
i.RLock()
ok = i.Flag&DeleteMarkFlag == DeleteMarkFlag
i.RUnlock()
return
}
// ShouldDelayDelete returns true if the inode should be delay-deleted, i.e. all three conditions hold:
// 1. DeleteMarkFlag is unset
// 2. NLink == 0
// 3. AccessTime is within InodeNLink0DelayDeleteSeconds of now
func (i *Inode) ShouldDelayDelete() (ok bool) {
i.RLock()
ok = (i.Flag&DeleteMarkFlag != DeleteMarkFlag) &&
(i.NLink == 0) &&
time.Now().Unix()-i.AccessTime < InodeNLink0DelayDeleteSeconds
i.RUnlock()
return
}
// SetAttr sets the attributes of the inode.
func (i *Inode) SetAttr(req *SetattrRequest) {
log.LogDebugf("action[SetAttr] inode[%v] req seq [%v] inode seq [%v]", i.Inode, req.VerSeq, i.getVer())
if req.VerSeq != i.getVer() {
i.CreateVer(req.VerSeq)
}
i.Lock()
log.LogDebugf("action[SetAttr] inode[%v] req seq [%v] inode seq [%v]", i.Inode, req.VerSeq, i.getVer())
if req.Valid&proto.AttrMode != 0 {
i.Type = req.Mode
}
if req.Valid&proto.AttrUid != 0 {
i.Uid = req.Uid
}
if req.Valid&proto.AttrGid != 0 {
i.Gid = req.Gid
}
if req.Valid&proto.AttrAccessTime != 0 {
i.AccessTime = req.AccessTime
}
if req.Valid&proto.AttrModifyTime != 0 {
i.ModifyTime = req.ModifyTime
}
i.Unlock()
}
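// DoWriteFunc executes the given function while holding the inode's write lock.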
func (i *Inode) DoWriteFunc(fn func()) {
i.Lock()
defer i.Unlock()
fn()
}
// DoReadFunc executes the given function while holding the inode's read lock.
func (i *Inode) DoReadFunc(fn func()) {
i.RLock()
defer i.RUnlock()
fn()
}
// SetMtime sets mtime to the current time.
func (i *Inode) SetMtime() {
mtime := timeutil.GetCurrentTimeUnix()
i.Lock()
defer i.Unlock()
i.ModifyTime = mtime
}
// EmptyExtents clears the inode's extent list and returns the removed extents.
func (i *Inode) EmptyExtents(mtime int64) (delExtents []proto.ExtentKey) {
i.Lock()
defer i.Unlock()
// handing out eks is safe: Extents is reset right below and eks will only be visited by the delete routine
delExtents = i.Extents.eks
i.Extents = NewSortedExtents()
return delExtents
}
// CopyTinyExtents returns a copy of the inode's tiny extents.
func (i *Inode) CopyTinyExtents() (delExtents []proto.ExtentKey) {
i.RLock()
defer i.RUnlock()
return i.Extents.CopyTinyExtents()
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/json"
"fmt"
syslog "log"
"net"
"os"
"path"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/cmd/common"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/raftstore"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/atomicutil"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/loadutil"
"github.com/cubefs/cubefs/util/log"
)
const (
partitionPrefix = "partition_"
ExpiredPartitionPrefix = "expired_"
)
const sampleDuration = 1 * time.Second
// MetadataManager manages all the meta partitions.
type MetadataManager interface {
Start() error
Stop()
// CreatePartition(id string, start, end uint64, peers []proto.Peer) error
HandleMetadataOperation(conn net.Conn, p *Packet, remoteAddr string) error
GetPartition(id uint64) (MetaPartition, error)
GetLeaderPartitions() map[uint64]MetaPartition
checkVolVerList() (err error)
}
// MetadataManagerConfig defines the configures in the metadata manager.
type MetadataManagerConfig struct {
NodeID uint64
RootDir string
ZoneName string
RaftStore raftstore.RaftStore
}
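// verOp2Phase tracks the two-phase (prepare/commit) state of a per-volume version update.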
type verOp2Phase struct {
verSeq uint64
verPrepare uint64
status uint32
step uint32
isActiveReqToMaster bool
sync.Mutex
}
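// metadataManager is the default MetadataManager implementation; it owns every meta
// partition loaded on this meta node.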
type metadataManager struct {
nodeId uint64
zoneName string
rootDir string
raftStore raftstore.RaftStore
connPool *util.ConnectPool
state uint32
mu sync.RWMutex
partitions map[uint64]MetaPartition // Key: metaRangeId, Val: metaPartition
metaNode *MetaNode
flDeleteBatchCount atomic.Value
fileStatsEnable bool
curQuotaGoroutineNum int32
maxQuotaGoroutineNum int32
cpuUtil atomicutil.Float64
stopC chan struct{}
volUpdating *sync.Map // map[string]*verOp2Phase
verUpdateChan chan string
}
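// getPacketLabels builds the exporter metric labels (op, partition ID, volume) for a packet;
// heartbeat and create-partition packets keep empty partition and volume labels.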
func (m *metadataManager) getPacketLabels(p *Packet) (labels map[string]string) {
labels = make(map[string]string)
labels[exporter.Op] = p.GetOpMsg()
labels[exporter.PartId] = ""
labels[exporter.Vol] = ""
if p.Opcode == proto.OpMetaNodeHeartbeat || p.Opcode == proto.OpCreateMetaPartition {
return
}
mp, err := m.getPartition(p.PartitionID)
if err != nil {
log.LogInfof("[metaManager] getPacketLabels metric packet: %v", p)
return
}
if exporter.EnablePid {
labels[exporter.PartId] = fmt.Sprintf("%d", p.PartitionID)
}
labels[exporter.Vol] = mp.GetBaseConfig().VolName
return
}
// HandleMetadataOperation handles the metadata operations.
func (m *metadataManager) HandleMetadataOperation(conn net.Conn, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if log.EnableInfo() {
log.LogInfof("HandleMetadataOperation input info op (%s), data %s, remote %s", p.String(), string(p.Data), remoteAddr)
}
metric := exporter.NewTPCnt(p.GetOpMsg())
labels := m.getPacketLabels(p)
defer func() {
metric.SetWithLabels(err, labels)
if err != nil {
log.LogWarnf("HandleMetadataOperation output (%s), remote %s, err %s", p.String(), remoteAddr, err.Error())
return
}
if log.EnableInfo() {
log.LogInfof("HandleMetadataOperation out (%s), result (%s), remote %s, cost %s", p.String(),
p.GetResultMsg(), remoteAddr, time.Since(start).String())
}
}()
switch p.Opcode {
case proto.OpMetaCreateInode:
err = m.opCreateInode(conn, p, remoteAddr)
case proto.OpMetaLinkInode:
err = m.opMetaLinkInode(conn, p, remoteAddr)
case proto.OpMetaFreeInodesOnRaftFollower:
err = m.opFreeInodeOnRaftFollower(conn, p, remoteAddr)
case proto.OpMetaUnlinkInode:
err = m.opMetaUnlinkInode(conn, p, remoteAddr)
case proto.OpMetaBatchUnlinkInode:
err = m.opMetaBatchUnlinkInode(conn, p, remoteAddr)
case proto.OpMetaInodeGet:
err = m.opMetaInodeGet(conn, p, remoteAddr)
case proto.OpMetaEvictInode:
err = m.opMetaEvictInode(conn, p, remoteAddr)
case proto.OpMetaBatchEvictInode:
err = m.opBatchMetaEvictInode(conn, p, remoteAddr)
case proto.OpMetaSetattr:
err = m.opSetAttr(conn, p, remoteAddr)
case proto.OpMetaCreateDentry:
err = m.opCreateDentry(conn, p, remoteAddr)
case proto.OpMetaDeleteDentry:
err = m.opDeleteDentry(conn, p, remoteAddr)
case proto.OpMetaBatchDeleteDentry:
err = m.opBatchDeleteDentry(conn, p, remoteAddr)
case proto.OpMetaUpdateDentry:
err = m.opUpdateDentry(conn, p, remoteAddr)
case proto.OpMetaReadDir:
err = m.opReadDir(conn, p, remoteAddr)
case proto.OpMetaReadDirOnly:
err = m.opReadDirOnly(conn, p, remoteAddr)
case proto.OpMetaReadDirLimit:
err = m.opReadDirLimit(conn, p, remoteAddr)
case proto.OpCreateMetaPartition:
err = m.opCreateMetaPartition(conn, p, remoteAddr)
case proto.OpMetaNodeHeartbeat:
err = m.opMasterHeartbeat(conn, p, remoteAddr)
case proto.OpMetaExtentsAdd:
err = m.opMetaExtentsAdd(conn, p, remoteAddr)
case proto.OpMetaExtentAddWithCheck:
err = m.opMetaExtentAddWithCheck(conn, p, remoteAddr)
case proto.OpMetaExtentsList:
err = m.opMetaExtentsList(conn, p, remoteAddr)
case proto.OpMetaObjExtentsList:
err = m.opMetaObjExtentsList(conn, p, remoteAddr)
case proto.OpMetaExtentsDel:
err = m.opMetaExtentsDel(conn, p, remoteAddr)
case proto.OpMetaTruncate:
err = m.opMetaExtentsTruncate(conn, p, remoteAddr)
case proto.OpMetaLookup:
err = m.opMetaLookup(conn, p, remoteAddr)
case proto.OpDeleteMetaPartition:
err = m.opDeleteMetaPartition(conn, p, remoteAddr)
case proto.OpUpdateMetaPartition:
err = m.opUpdateMetaPartition(conn, p, remoteAddr)
case proto.OpLoadMetaPartition:
err = m.opLoadMetaPartition(conn, p, remoteAddr)
case proto.OpDecommissionMetaPartition:
err = m.opDecommissionMetaPartition(conn, p, remoteAddr)
case proto.OpAddMetaPartitionRaftMember:
err = m.opAddMetaPartitionRaftMember(conn, p, remoteAddr)
case proto.OpRemoveMetaPartitionRaftMember:
err = m.opRemoveMetaPartitionRaftMember(conn, p, remoteAddr)
case proto.OpMetaPartitionTryToLeader:
err = m.opMetaPartitionTryToLeader(conn, p, remoteAddr)
case proto.OpMetaBatchInodeGet:
err = m.opMetaBatchInodeGet(conn, p, remoteAddr)
case proto.OpMetaDeleteInode:
err = m.opMetaDeleteInode(conn, p, remoteAddr)
case proto.OpMetaBatchDeleteInode:
err = m.opMetaBatchDeleteInode(conn, p, remoteAddr)
case proto.OpMetaBatchExtentsAdd:
err = m.opMetaBatchExtentsAdd(conn, p, remoteAddr)
case proto.OpMetaBatchObjExtentsAdd:
err = m.opMetaBatchObjExtentsAdd(conn, p, remoteAddr)
case proto.OpMetaClearInodeCache:
err = m.opMetaClearInodeCache(conn, p, remoteAddr)
// operations for extend attributes
case proto.OpMetaSetXAttr:
err = m.opMetaSetXAttr(conn, p, remoteAddr)
case proto.OpMetaBatchSetXAttr:
err = m.opMetaBatchSetXAttr(conn, p, remoteAddr)
case proto.OpMetaGetXAttr:
err = m.opMetaGetXAttr(conn, p, remoteAddr)
case proto.OpMetaGetAllXAttr:
err = m.opMetaGetAllXAttr(conn, p, remoteAddr)
case proto.OpMetaBatchGetXAttr:
err = m.opMetaBatchGetXAttr(conn, p, remoteAddr)
case proto.OpMetaRemoveXAttr:
err = m.opMetaRemoveXAttr(conn, p, remoteAddr)
case proto.OpMetaListXAttr:
err = m.opMetaListXAttr(conn, p, remoteAddr)
case proto.OpMetaUpdateXAttr:
err = m.opMetaUpdateXAttr(conn, p, remoteAddr)
// operations for multipart session
case proto.OpCreateMultipart:
err = m.opCreateMultipart(conn, p, remoteAddr)
case proto.OpListMultiparts:
err = m.opListMultipart(conn, p, remoteAddr)
case proto.OpRemoveMultipart:
err = m.opRemoveMultipart(conn, p, remoteAddr)
case proto.OpAddMultipartPart:
err = m.opAppendMultipart(conn, p, remoteAddr)
case proto.OpGetMultipart:
err = m.opGetMultipart(conn, p, remoteAddr)
// operations for transactions
case proto.OpMetaTxCreateInode:
err = m.opTxCreateInode(conn, p, remoteAddr)
case proto.OpMetaTxCreateDentry:
err = m.opTxCreateDentry(conn, p, remoteAddr)
case proto.OpTxCommit:
err = m.opTxCommit(conn, p, remoteAddr)
case proto.OpMetaTxCreate:
err = m.opTxCreate(conn, p, remoteAddr)
case proto.OpMetaTxGet:
err = m.opTxGet(conn, p, remoteAddr)
case proto.OpTxCommitRM:
err = m.opTxCommitRM(conn, p, remoteAddr)
case proto.OpTxRollbackRM:
err = m.opTxRollbackRM(conn, p, remoteAddr)
case proto.OpTxRollback:
err = m.opTxRollback(conn, p, remoteAddr)
case proto.OpMetaTxDeleteDentry:
err = m.opTxDeleteDentry(conn, p, remoteAddr)
case proto.OpMetaTxUnlinkInode:
err = m.opTxMetaUnlinkInode(conn, p, remoteAddr)
case proto.OpMetaTxUpdateDentry:
err = m.opTxUpdateDentry(conn, p, remoteAddr)
case proto.OpMetaTxLinkInode:
err = m.opTxMetaLinkInode(conn, p, remoteAddr)
case proto.OpMetaBatchSetInodeQuota:
err = m.opMetaBatchSetInodeQuota(conn, p, remoteAddr)
case proto.OpMetaBatchDeleteInodeQuota:
err = m.opMetaBatchDeleteInodeQuota(conn, p, remoteAddr)
case proto.OpMetaGetInodeQuota:
err = m.opMetaGetInodeQuota(conn, p, remoteAddr)
case proto.OpQuotaCreateInode:
err = m.opQuotaCreateInode(conn, p, remoteAddr)
case proto.OpQuotaCreateDentry:
err = m.opQuotaCreateDentry(conn, p, remoteAddr)
case proto.OpMetaGetUniqID:
err = m.opMetaGetUniqID(conn, p, remoteAddr)
// multi version
case proto.OpVersionOperation:
err = m.opMultiVersionOp(conn, p, remoteAddr)
case proto.OpGetExpiredMultipart:
err = m.opGetExpiredMultipart(conn, p, remoteAddr)
default:
err = fmt.Errorf("%s unknown Opcode: %d, reqId: %d", remoteAddr,
p.Opcode, p.GetReqID())
}
if err != nil {
err = errors.NewErrorf("%s [%s] req: %d - %s", remoteAddr, p.GetOpMsg(),
p.GetReqID(), err.Error())
}
return
}
// Start starts the metadata manager.
func (m *metadataManager) Start() (err error) {
if atomic.CompareAndSwapUint32(&m.state, common.StateStandby, common.StateStart) {
defer func() {
var newState uint32
if err != nil {
newState = common.StateStandby
} else {
newState = common.StateRunning
}
atomic.StoreUint32(&m.state, newState)
}()
err = m.onStart()
}
return
}
// Stop stops the metadata manager.
func (m *metadataManager) Stop() {
if atomic.CompareAndSwapUint32(&m.state, common.StateRunning, common.StateShutdown) {
defer atomic.StoreUint32(&m.state, common.StateStopped)
m.onStop()
}
}
func (m *metadataManager) startCpuSample() {
// async sample cpu util
go func() {
for {
select {
case <-m.stopC:
return
default:
used, err := loadutil.GetCpuUtilPercent(sampleDuration)
if err == nil {
m.cpuUtil.Store(used)
}
}
}
}()
}
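// startSnapshotVersionPromote starts a goroutine that promotes prepared snapshot versions
// for volumes pushed onto verUpdateChan.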
func (m *metadataManager) startSnapshotVersionPromote() {
m.verUpdateChan = make(chan string, 1000)
go func() {
for {
select {
case volName := <-m.verUpdateChan:
m.checkAndPromoteVersion(volName)
case <-m.stopC:
return
}
}
}()
}
// onStart creates the connection pool and loads the partitions.
func (m *metadataManager) onStart() (err error) {
m.connPool = util.NewConnectPool()
err = m.loadPartitions()
if err != nil {
return
}
m.stopC = make(chan struct{})
// start sampler
m.startCpuSample()
m.startSnapshotVersionPromote()
return
}
// onStop stops every meta partition and the background samplers.
func (m *metadataManager) onStop() {
if m.partitions != nil {
for _, partition := range m.partitions {
partition.Stop()
}
// stop sampler
close(m.stopC)
}
return
}
// getPartition returns the meta partition with the specified partition ID.
func (m *metadataManager) getPartition(id uint64) (mp MetaPartition, err error) {
m.mu.RLock()
defer m.mu.RUnlock()
mp, ok := m.partitions[id]
if ok {
return
}
err = errors.New(fmt.Sprintf("unknown meta partition: %d", id))
return
}
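// loadPartitions fetches this node's partition list from the master, renames expired
// partition directories, and loads the remaining meta partitions concurrently.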
func (m *metadataManager) loadPartitions() (err error) {
var metaNodeInfo *proto.MetaNodeInfo
for i := 0; i < 3; i++ {
if metaNodeInfo, err = masterClient.NodeAPI().GetMetaNode(fmt.Sprintf("%s:%s", m.metaNode.localAddr,
m.metaNode.listen)); err != nil {
log.LogWarnf("loadPartitions: get MetaNode info fail: err(%v)", err)
continue
}
break
}
if err != nil {
log.LogErrorf("loadPartitions: get MetaNode info fail: err(%v)", err)
return
}
if len(metaNodeInfo.PersistenceMetaPartitions) == 0 {
log.LogWarnf("loadPartitions: length of PersistenceMetaPartitions is 0, ExpiredPartition check without effect")
}
// Check metadataDir directory
fileInfo, err := os.Stat(m.rootDir)
if err != nil {
// metadataDir does not exist yet: create it, and surface the failure instead of ignoring it
if err = os.MkdirAll(m.rootDir, 0o755); err != nil {
err = errors.NewErrorf("loadPartitions: create metadataDir %s fail: %v", m.rootDir, err)
}
return
}
if !fileInfo.IsDir() {
err = errors.New("metadataDir must be directory")
return
}
// scan the data directory
fileInfoList, err := os.ReadDir(m.rootDir)
if err != nil {
return
}
syslog.Println("Start loadPartitions!!!")
var wg sync.WaitGroup
for _, fileInfo := range fileInfoList {
if fileInfo.IsDir() && strings.HasPrefix(fileInfo.Name(), partitionPrefix) {
if isExpiredPartition(fileInfo.Name(), metaNodeInfo.PersistenceMetaPartitions) {
log.LogErrorf("loadPartitions: find expired partition[%s], rename it and you can delete it manually",
fileInfo.Name())
oldName := path.Join(m.rootDir, fileInfo.Name())
newName := path.Join(m.rootDir, ExpiredPartitionPrefix+fileInfo.Name())
os.Rename(oldName, newName)
continue
}
wg.Add(1)
go func(fileName string) {
var errload error
defer func() {
if r := recover(); r != nil {
log.LogWarnf("action[loadPartitions] recovered when load partition, skip it,"+
" partition: %s, error: %s, failed: %v", fileName, errload, r)
syslog.Printf("load meta partition %v fail: %v", fileName, r)
} else if errload != nil {
log.LogWarnf("action[loadPartitions] failed to load partition, skip it, partition: %s, error: %s",
fileName, errload)
}
}()
defer wg.Done()
if len(fileName) < 10 {
log.LogWarnf("ignore unknown partition dir: %s", fileName)
return
}
var id uint64
partitionId := fileName[len(partitionPrefix):]
id, errload = strconv.ParseUint(partitionId, 10, 64)
if errload != nil {
log.LogWarnf("action[loadPartitions] ignore path: %s, not partition", partitionId)
return
}
partitionConfig := &MetaPartitionConfig{
PartitionId: id,
NodeId: m.nodeId,
RaftStore: m.raftStore,
RootDir: path.Join(m.rootDir, fileName),
ConnPool: m.connPool,
}
partitionConfig.AfterStop = func() {
m.detachPartition(id)
}
// check snapshot dir or backup
snapshotDir := path.Join(partitionConfig.RootDir, snapshotDir)
if _, errload = os.Stat(snapshotDir); errload != nil {
backupDir := path.Join(partitionConfig.RootDir, snapshotBackup)
if _, errload = os.Stat(backupDir); errload == nil {
if errload = os.Rename(backupDir, snapshotDir); errload != nil {
errload = errors.Trace(errload,
fmt.Sprintf(": fail recover backup snapshot %s",
snapshotDir))
return
}
}
errload = nil
}
partition := NewMetaPartition(partitionConfig, m)
if partition == nil {
log.LogErrorf("action[loadPartitions]: NewMetaPartition is nil")
return
}
errload = m.attachPartition(id, partition)
if errload != nil {
log.LogErrorf("action[loadPartitions] load partition id=%d failed: %s.",
id, errload.Error())
}
}(fileInfo.Name())
}
}
wg.Wait()
syslog.Println("Finish loadPartitions!!!")
return
}
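// attachPartition starts the given meta partition and, on success, registers it
// in the partition map.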
func (m *metadataManager) attachPartition(id uint64, partition MetaPartition) (err error) {
syslog.Println(fmt.Sprintf("start load metaPartition %v", id))
partition.ForceSetMetaPartitionToLoadding()
if err = partition.Start(false); err != nil {
msg := fmt.Sprintf("load meta partition %v fail: %v", id, err)
log.LogError(msg)
syslog.Println(msg)
return
}
m.mu.Lock()
defer m.mu.Unlock()
m.partitions[id] = partition
msg := fmt.Sprintf("load meta partition %v success", id)
log.LogInfof(msg)
syslog.Println(msg)
return
}
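// detachPartition removes the partition with the given ID from the partition map;
// it returns an error if the partition is unknown.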
func (m *metadataManager) detachPartition(id uint64) (err error) {
m.mu.Lock()
defer m.mu.Unlock()
if _, has := m.partitions[id]; has {
delete(m.partitions, id)
} else {
err = fmt.Errorf("unknown partition: %d", id)
}
return
}
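// createPartition builds a new meta partition from the master's request, persists
// its metadata, starts it, and registers it; if a partition with the same ID is
// already attached, the new instance is discarded.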
func (m *metadataManager) createPartition(request *proto.CreateMetaPartitionRequest) (err error) {
partitionId := fmt.Sprintf("%d", request.PartitionID)
log.LogInfof("start create meta Partition, partition %s", partitionId)
mpc := &MetaPartitionConfig{
PartitionId: request.PartitionID,
VolName: request.VolName,
Start: request.Start,
End: request.End,
Cursor: request.Start,
UniqId: 0,
Peers: request.Members,
RaftStore: m.raftStore,
NodeId: m.nodeId,
RootDir: path.Join(m.rootDir, partitionPrefix+partitionId),
ConnPool: m.connPool,
VerSeq: request.VerSeq,
}
mpc.AfterStop = func() {
m.detachPartition(request.PartitionID)
}
partition := NewMetaPartition(mpc, m)
if partition == nil {
err = errors.NewErrorf("[createPartition] partition is nil")
return
}
if err = partition.RenameStaleMetadata(); err != nil {
err = errors.NewErrorf("[createPartition]->%s", err.Error())
}
if err = partition.PersistMetadata(); err != nil {
err = errors.NewErrorf("[createPartition]->%s", err.Error())
return
}
if err = partition.Start(true); err != nil {
os.RemoveAll(mpc.RootDir)
log.LogErrorf("load meta partition %v fail: %v", request.PartitionID, err)
err = errors.NewErrorf("[createPartition]->%s", err.Error())
return
}
m.mu.Lock()
defer m.mu.Unlock()
if oldMp, ok := m.partitions[request.PartitionID]; ok {
err = oldMp.IsEquareCreateMetaPartitionRequst(request)
partition.Stop()
partition.DeleteRaft()
os.RemoveAll(mpc.RootDir)
return
}
m.partitions[request.PartitionID] = partition
log.LogInfof("load meta partition %v success", request.PartitionID)
return
}
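// deletePartition resets the partition with the given ID and removes it from the
// partition map.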
func (m *metadataManager) deletePartition(id uint64) (err error) {
m.mu.Lock()
defer m.mu.Unlock()
mp, has := m.partitions[id]
if !has {
return
}
mp.Reset()
delete(m.partitions, id)
return
}
// Range scans all the meta partitions.
func (m *metadataManager) Range(needLock bool, f func(i uint64, p MetaPartition) bool) {
if needLock {
m.mu.RLock()
defer m.mu.RUnlock()
}
for k, v := range m.partitions {
if !f(k, v) {
return
}
}
}
// GetPartition returns the meta partition with the given ID.
func (m *metadataManager) GetPartition(id uint64) (mp MetaPartition, err error) {
mp, err = m.getPartition(id)
return
}
// MarshalJSON only marshals the base information of every partition.
func (m *metadataManager) MarshalJSON() (data []byte, err error) {
m.mu.RLock()
defer m.mu.RUnlock()
return json.Marshal(m.partitions)
}
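// QuotaGoroutineIsOver reports whether the number of running quota goroutines has
// reached the configured maximum.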
func (m *metadataManager) QuotaGoroutineIsOver() (lsOver bool) {
log.LogInfof("QuotaGoroutineIsOver cur [%v] max [%v]", m.curQuotaGoroutineNum, m.maxQuotaGoroutineNum)
if atomic.LoadInt32(&m.curQuotaGoroutineNum) >= m.maxQuotaGoroutineNum {
return true
}
return false
}
func (m *metadataManager) QuotaGoroutineInc(num int32) {
atomic.AddInt32(&m.curQuotaGoroutineNum, num)
}
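// GetLeaderPartitions returns the meta partitions for which this node is currently
// the raft leader, keyed by partition ID.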
func (m *metadataManager) GetLeaderPartitions() map[uint64]MetaPartition {
m.mu.RLock()
defer m.mu.RUnlock()
mps := make(map[uint64]MetaPartition)
for addr, mp := range m.partitions {
if _, leader := mp.IsLeader(); leader {
mps[addr] = mp
}
}
return mps
}
// NewMetadataManager returns a new metadata manager.
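//
// A minimal wiring sketch (values are illustrative only; raftStore and metaNode
// are assumed to have been prepared by the caller):
//
//	mm := NewMetadataManager(MetadataManagerConfig{
//		NodeID:    nodeID,
//		ZoneName:  "zone-default",
//		RootDir:   "/path/to/metanode/meta",
//		RaftStore: raftStore,
//	}, metaNode)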
func NewMetadataManager(conf MetadataManagerConfig, metaNode *MetaNode) MetadataManager {
return &metadataManager{
nodeId: conf.NodeID,
zoneName: conf.ZoneName,
rootDir: conf.RootDir,
raftStore: conf.RaftStore,
partitions: make(map[uint64]MetaPartition),
metaNode: metaNode,
maxQuotaGoroutineNum: defaultMaxQuotaGoroutine,
volUpdating: new(sync.Map),
}
}
// isExpiredPartition returns whether a partition is expired.
// A partition that does not exist in the master is treated as expired.
func isExpiredPartition(fileName string, partitions []uint64) (expiredPartition bool) {
if len(partitions) == 0 {
return true
}
partitionId := fileName[len(partitionPrefix):]
id, err := strconv.ParseUint(partitionId, 10, 64)
if err != nil {
log.LogWarnf("isExpiredPartition: %s, check error [%v], skip this check", partitionId, err)
return true
}
for _, existId := range partitions {
if existId == id {
return false
}
}
return true
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"encoding/json"
"fmt"
"net"
"os"
"runtime"
"sync"
"sync/atomic"
"time"
raftProto "github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
const (
MaxUsedMemFactor = 1.1
)
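// checkFollowerRead enables follower read on the partition if its volume is listed
// in volNames, and disables it otherwise.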
func (m *metadataManager) checkFollowerRead(volNames []string, partition MetaPartition) {
volName := partition.GetVolName()
for _, name := range volNames {
if name == volName {
partition.SetFollowerRead(true)
return
}
}
partition.SetFollowerRead(false)
return
}
func (m *metadataManager) checkForbiddenVolume(volNames []string, partition MetaPartition) {
volName := partition.GetVolName()
for _, name := range volNames {
if name == volName {
partition.SetForbidden(true)
return
}
}
partition.SetForbidden(false)
return
}
func (m *metadataManager) checkDisableAuditLogVolume(volNames []string, partition MetaPartition) {
volName := partition.GetVolName()
for _, name := range volNames {
if name == volName {
partition.SetEnableAuditLog(false)
return
}
}
partition.SetEnableAuditLog(true)
return
}
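// opMasterHeartbeat acks the master immediately, then asynchronously decodes the
// heartbeat task and replies with memory usage, CPU utilization, and per-partition
// reports.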
func (m *metadataManager) opMasterHeartbeat(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
// Ack the master first; the heartbeat is handled asynchronously below.
data := p.Data
m.responseAckOKToMaster(conn, p)
var (
req = &proto.HeartBeatRequest{}
resp = &proto.MetaNodeHeartbeatResponse{}
adminTask = &proto.AdminTask{
Request: req,
}
)
go func() {
start := time.Now()
decode := json.NewDecoder(bytes.NewBuffer(data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
resp.Status = proto.TaskFailed
resp.Result = err.Error()
goto end
}
m.fileStatsEnable = req.FileStatsEnable
// collect memory info
resp.Total = configTotalMem
resp.MemUsed, err = util.GetProcessMemory(os.Getpid())
if err != nil {
adminTask.Status = proto.TaskFailed
goto end
}
// report CPU utilization here
resp.CpuUtil = m.cpuUtil.Load()
m.Range(true, func(id uint64, partition MetaPartition) bool {
m.checkFollowerRead(req.FLReadVols, partition)
m.checkForbiddenVolume(req.ForbiddenVols, partition)
m.checkDisableAuditLogVolume(req.DisableAuditVols, partition)
partition.SetUidLimit(req.UidLimitInfo)
partition.SetTxInfo(req.TxInfo)
partition.setQuotaHbInfo(req.QuotaHbInfos)
mConf := partition.GetBaseConfig()
mpr := &proto.MetaPartitionReport{
PartitionID: mConf.PartitionId,
Start: mConf.Start,
End: mConf.End,
Status: proto.ReadWrite,
MaxInodeID: mConf.Cursor,
VolName: mConf.VolName,
Size: partition.DataSize(),
InodeCnt: uint64(partition.GetInodeTreeLen()),
DentryCnt: uint64(partition.GetDentryTreeLen()),
FreeListLen: uint64(partition.GetFreeListLen()),
UidInfo: partition.GetUidInfo(),
QuotaReportInfos: partition.getQuotaReportInfos(),
}
mpr.TxCnt, mpr.TxRbInoCnt, mpr.TxRbDenCnt = partition.TxGetCnt()
if mConf.Cursor >= mConf.End {
mpr.Status = proto.ReadOnly
}
if resp.MemUsed > uint64(float64(resp.Total)*MaxUsedMemFactor) {
mpr.Status = proto.ReadOnly
}
addr, isLeader := partition.IsLeader()
if addr == "" {
mpr.Status = proto.Unavailable
}
mpr.IsLeader = isLeader
resp.MetaPartitionReports = append(resp.MetaPartitionReports, mpr)
return true
})
resp.ZoneName = m.zoneName
resp.Status = proto.TaskSucceeds
end:
adminTask.Request = nil
adminTask.Response = resp
m.respondToMaster(adminTask)
if log.EnableInfo() {
log.LogInfof("%s pkt %s, resp success req:%v; respAdminTask: %v, cost %s",
remoteAddr, p.String(), req, adminTask, time.Since(start).String())
}
}()
return
}
func (m *metadataManager) opCreateMetaPartition(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
defer func() {
var buf []byte
status := proto.OpOk
if err != nil {
status = proto.OpErr
buf = []byte(err.Error())
}
p.PacketErrorWithBody(status, buf)
m.respondToClientWithVer(conn, p)
}()
req := &proto.CreateMetaPartitionRequest{}
adminTask := &proto.AdminTask{
Request: req,
}
decode := json.NewDecoder(bytes.NewBuffer(p.Data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
err = errors.NewErrorf("[opCreateMetaPartition]: Unmarshal AdminTask"+
" struct: %s", err.Error())
return
}
log.LogInfof("[%s] [remoteAddr=%s]accept a from"+
" master message: %v", p.String(), remoteAddr, adminTask)
// create a new meta partition.
if err = m.createPartition(req); err != nil {
err = errors.NewErrorf("[opCreateMetaPartition]->%s; request message: %v",
err.Error(), adminTask.Request)
return
}
log.LogInfof("%s [%s] create success req:%v; resp: %v", remoteAddr, p.String(),
req, adminTask)
return
}
// Handle OpCreate inode.
func (m *metadataManager) opCreateInode(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &CreateInoReq{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.CreateInode(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
// reply the operation result to the client through TCP
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opCreateInode] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opQuotaCreateInode(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.QuotaCreateInodeRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.QuotaCreateInode(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
// reply the operation result to the client through TCP
m.respondToClient(conn, p)
log.LogDebugf("%s [opQuotaCreateInode] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opTxMetaLinkInode(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.TxLinkInodeRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.TxCreateInodeLink(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [opTxMetaLinkInode] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaLinkInode(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &LinkInodeReq{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.CreateInodeLink(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opMetaLinkInode] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
// Free the given inodes on a raft follower.
func (m *metadataManager) opFreeInodeOnRaftFollower(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
mp, err := m.getPartition(p.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],err[%v]", p.GetOpMsgWithReqAndResult(), string(p.Data))
return
}
mp.(*metaPartition).internalDelete(p.Data[:p.Size])
p.PacketOkReply()
m.respondToClientWithVer(conn, p)
return
}
// Handle transactional dentry creation.
func (m *metadataManager) opTxCreateDentry(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.TxCreateDentryRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.TxCreateDentry(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [opTxCreateDentry] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
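// opTxCreate handles a transaction create request on the target meta partition.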
func (m *metadataManager) opTxCreate(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.TxCreateRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.TxCreate(req, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [opTxCreate] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opTxGet(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.TxGetInfoRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.Pid)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.TxGetInfo(req, p)
m.respondToClient(conn, p)
if log.EnableDebug() {
log.LogDebugf("%s [opTxGet] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
}
return
}
func (m *metadataManager) opTxCommitRM(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.TxApplyRMRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.TxCommitRM(req, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [opTxCommitRM] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opTxRollbackRM(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.TxApplyRMRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.TxRollbackRM(req, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [opTxRollbackRM] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
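// opTxCommit commits a transaction on the transaction manager's meta partition
// (looked up by TmID).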
func (m *metadataManager) opTxCommit(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.TxApplyRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.TmID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.TxCommit(req, p, remoteAddr)
m.respondToClient(conn, p)
log.LogDebugf("%s [opTxCommit] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opTxRollback(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.TxApplyRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.TmID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.TxRollback(req, p, remoteAddr)
m.respondToClient(conn, p)
log.LogDebugf("%s [opTxRollback] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
// Handle dentry creation.
func (m *metadataManager) opCreateDentry(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &CreateDentryReq{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.CreateDentry(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [opCreateDentry] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opQuotaCreateDentry(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.QuotaCreateDentryRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.QuotaCreateDentry(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [opQuotaCreateDentry] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
// Handle transactional dentry deletion.
func (m *metadataManager) opTxDeleteDentry(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.TxDeleteDentryRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.TxDeleteDentry(req, p, remoteAddr)
m.respondToClient(conn, p)
if log.EnableDebug() {
log.LogDebugf("%s [opTxDeleteDentry] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
}
return
}
// Handle dentry deletion.
func (m *metadataManager) opDeleteDentry(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &DeleteDentryReq{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.DeleteDentry(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [opDeleteDentry] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
// Handle batch dentry deletion.
func (m *metadataManager) opBatchDeleteDentry(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &BatchDeleteDentryReq{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.DeleteDentryBatch(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opDeleteDentry] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opTxUpdateDentry(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.TxUpdateDentryRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.TxUpdateDentry(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opTxUpdateDentry] req: %d - %v; resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opUpdateDentry(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &UpdateDentryReq{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.UpdateDentry(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opUpdateDentry] req: %d - %v; resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opTxMetaUnlinkInode(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.TxUnlinkInodeRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.TxUnlinkInode(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opDeleteInode] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaUnlinkInode(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &UnlinkInoReq{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.UnlinkInode(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opDeleteInode] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaBatchUnlinkInode(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &BatchUnlinkInoReq{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.UnlinkInodeBatch(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opDeleteInode] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opReadDirOnly(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.ReadDirOnlyRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.ReadDirOnly(req, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [%v]req: %v , resp: %v, body: %s", remoteAddr,
p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
// Handle OpReadDir
func (m *metadataManager) opReadDir(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.ReadDirRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.ReadDir(req, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [%v]req: %v , resp: %v, body: %s", remoteAddr,
p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
// Handle OpReadDirLimit
func (m *metadataManager) opReadDirLimit(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.ReadDirLimitRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.ReadDirLimit(req, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [%v]req: %v , resp: %v, body: %s", remoteAddr,
p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
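// opMetaInodeGet serves an inode get request; follower read is allowed, and the
// response is flagged when this metanode holds a newer volume version than the
// client's.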
func (m *metadataManager) opMetaInodeGet(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &InodeGetReq{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("Unmarshal [%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
log.LogDebugf("action[opMetaInodeGet] request %v", req)
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("getPartition [%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
if err = mp.InodeGet(req, p); err != nil {
err = errors.NewErrorf("InodeGet [%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
}
if err = m.respondToClient(conn, p); err != nil {
log.LogDebugf("%s [opMetaInodeGet] err [%v] req: %d - %v; resp: %v, body: %s",
remoteAddr, err, p.GetReqID(), req, p.GetResultMsg(), p.Data)
}
log.LogDebugf("%s [opMetaInodeGet] req: %d - %v; resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
if value, ok := m.volUpdating.Load(req.VolName); ok {
ver2Phase := value.(*verOp2Phase)
if ver2Phase.verSeq > req.VerSeq {
// Reuse ExtentType to flag that the version on this metanode differs from the client's.
// The response makes the client refresh every streamer's extents and its verSeq.
p.ExtentType |= proto.MultiVersionFlag
p.VerSeq = ver2Phase.verSeq
}
}
return
}
func (m *metadataManager) opBatchMetaEvictInode(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.BatchEvictInodeRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] request unmarshal: %v", p.GetOpMsgWithReqAndResult(), err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
if err = mp.EvictInodeBatch(req, p, remoteAddr); err != nil {
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
}
m.updatePackRspSeq(mp, p)
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opBatchMetaEvictInode] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaEvictInode(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.EvictInodeRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
if err = mp.EvictInode(req, p, remoteAddr); err != nil {
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
}
m.updatePackRspSeq(mp, p)
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opMetaEvictInode] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opSetAttr(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &SetattrRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
if err = mp.SetAttr(req, p.Data, p); err != nil {
err = errors.NewErrorf("[opSetAttr] req: %v, error: %s", req, err.Error())
}
m.updatePackRspSeq(mp, p)
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opSetAttr] req: %d - %v, resp: %v, body: %s", remoteAddr,
p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
// Lookup request
func (m *metadataManager) opMetaLookup(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.LookupRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.Lookup(req, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [opMetaLookup] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaExtentsAdd(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.AppendExtentKeyRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.ExtentAppend(req, p)
m.updatePackRspSeq(mp, p)
m.respondToClientWithVer(conn, p)
if err != nil {
log.LogErrorf("%s [opMetaExtentsAdd] ExtentAppend: %s, "+
"response to client: %s", remoteAddr, err.Error(), p.GetResultMsg())
}
log.LogDebugf("%s [opMetaExtentsAdd] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
// Append one extent with discard check
func (m *metadataManager) opMetaExtentAddWithCheck(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.AppendExtentKeyWithCheckRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
if err = mp.ExtentAppendWithCheck(req, p); err != nil {
log.LogErrorf("%s [opMetaExtentAddWithCheck] ExtentAppendWithCheck: %s", remoteAddr, err.Error())
}
m.updatePackRspSeq(mp, p)
if err = m.respondToClientWithVer(conn, p); err != nil {
log.LogErrorf("%s [opMetaExtentAddWithCheck] ExtentAppendWithCheck: %s, "+
"response to client: %s", remoteAddr, err.Error(), p.GetResultMsg())
}
log.LogDebugf("%s [opMetaExtentAddWithCheck] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaExtentsList(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.GetExtentsRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.ExtentsList(req, p)
m.respondToClient(conn, p)
if log.EnableDebug() {
log.LogDebugf("%s [opMetaExtentsList] req: %d - %v; resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), log.TruncMsg(string(p.Data)))
}
return
}
func (m *metadataManager) opMetaObjExtentsList(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.GetExtentsRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.ObjExtentsList(req, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [opMetaObjExtentsList] req: %d - %v; resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaExtentsDel(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
panic("not implemented yet")
// req := &proto.DelExtentKeyRequest{}
// if err = json.Unmarshal(p.Data, req); err != nil {
// p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
// m.respondToClientWithVer(conn, p)
// err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
// return
// }
// mp, err := m.getPartition(req.PartitionID)
// if err != nil {
// p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
// m.respondToClientWithVer(conn, p)
// err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
// return
// }
// if !m.serveProxy(conn, mp, p) {
// return
// }
// mp.ExtentsDelete(req, p)
// m.respondToClientWithVer(conn, p)
// log.LogDebugf("%s [OpMetaTruncate] req: %d - %v, resp body: %v, "+
// "resp body: %s", remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
// return
}
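// opMetaExtentsTruncate handles an extents truncate request on the owning meta partition.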
func (m *metadataManager) opMetaExtentsTruncate(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &ExtentsTruncateReq{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
mp.ExtentsTruncate(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [OpMetaTruncate] req: %d - %v, resp body: %v, "+
"resp body: %s", remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaClearInodeCache(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.ClearInodeCacheRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.ClearInodeCache(req, p)
m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opMetaClearInodeCache] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
// Delete a meta partition.
func (m *metadataManager) opDeleteMetaPartition(conn net.Conn,
p *Packet, remoteAddr string) (err error) {
req := &proto.DeleteMetaPartitionRequest{}
adminTask := &proto.AdminTask{
Request: req,
}
decode := json.NewDecoder(bytes.NewBuffer(p.Data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketOkReply()
m.respondToClientWithVer(conn, p)
return
}
// Ack the master request
conf := mp.GetBaseConfig()
mp.Stop()
mp.DeleteRaft()
m.deletePartition(mp.GetBaseConfig().PartitionId)
os.RemoveAll(conf.RootDir)
p.PacketOkReply()
m.respondToClientWithVer(conn, p)
runtime.GC()
log.LogInfof("%s [opDeleteMetaPartition] req: %d - %v, resp: %v",
remoteAddr, p.GetReqID(), req, err)
return
}
func (m *metadataManager) opUpdateMetaPartition(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := new(UpdatePartitionReq)
adminTask := &proto.AdminTask{
Request: req,
}
decode := json.NewDecoder(bytes.NewBuffer(p.Data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
m.responseAckOKToMaster(conn, p)
resp := &UpdatePartitionResp{
VolName: req.VolName,
PartitionID: req.PartitionID,
End: req.End,
}
err = mp.UpdatePartition(req, resp)
adminTask.Response = resp
adminTask.Request = nil
m.respondToMaster(adminTask)
log.LogInfof("%s [opUpdateMetaPartition] req[%v], response[%v].",
remoteAddr, req, adminTask)
return
}
func (m *metadataManager) opLoadMetaPartition(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.MetaPartitionLoadRequest{}
adminTask := &proto.AdminTask{
Request: req,
}
decode := json.NewDecoder(bytes.NewBuffer(p.Data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if err = mp.ResponseLoadMetaPartition(p); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
log.LogErrorf("%s [opLoadMetaPartition] req[%v], "+
"response marshal[%v]", remoteAddr, req, err.Error())
m.respondToClient(conn, p)
return
}
m.respondToClient(conn, p)
log.LogInfof("%s [opLoadMetaPartition] req[%v], response status[%s], "+
"response body[%s], error[%v]", remoteAddr, req, p.GetResultMsg(), p.Data,
err)
return
}
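// opDecommissionMetaPartition replaces a raft member: it adds req.AddPeer and then
// removes req.RemovePeer from the partition's raft group.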
func (m *metadataManager) opDecommissionMetaPartition(conn net.Conn,
p *Packet, remoteAddr string) (err error) {
var reqData []byte
req := &proto.MetaPartitionDecommissionRequest{}
adminTask := &proto.AdminTask{
Request: req,
}
decode := json.NewDecoder(bytes.NewBuffer(p.Data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return err
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return err
}
if !m.serveProxy(conn, mp, p) {
return nil
}
if req.AddPeer.ID == req.RemovePeer.ID {
err = errors.NewErrorf("[opDecommissionMetaPartition]: AddPeer[%v] same withRemovePeer[%v]", req.AddPeer, req.RemovePeer)
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
return
}
reqData, err = json.Marshal(req)
if err != nil {
err = errors.NewErrorf("[opDecommissionMetaPartition]: partitionID= %d, "+
"Marshal %s", req.PartitionID, err)
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
return
}
_, err = mp.ChangeMember(raftProto.ConfAddNode,
raftProto.Peer{ID: req.AddPeer.ID}, reqData)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
return err
}
_, err = mp.ChangeMember(raftProto.ConfRemoveNode,
raftProto.Peer{ID: req.RemovePeer.ID}, reqData)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
return err
}
p.PacketOkReply()
m.respondToClientWithVer(conn, p)
return
}
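// opAddMetaPartitionRaftMember adds req.AddPeer to the partition's raft group;
// if the peer already exists, the request succeeds without changes.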
func (m *metadataManager) opAddMetaPartitionRaftMember(conn net.Conn,
p *Packet, remoteAddr string) (err error) {
var reqData []byte
req := &proto.AddMetaPartitionRaftMemberRequest{}
adminTask := &proto.AdminTask{
Request: req,
}
defer func() {
if err != nil {
log.LogInfof("pkt %s remote %s reqId add raft member failed, req %v, err %s", p.String(), remoteAddr, adminTask, err.Error())
return
}
log.LogInfof("pkt %s, remote %s add raft member success, req %v", p.String(), remoteAddr, adminTask)
}()
decode := json.NewDecoder(bytes.NewBuffer(p.Data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
return err
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpTryOtherAddr, ([]byte)(proto.ErrMetaPartitionNotExists.Error()))
m.respondToClientWithVer(conn, p)
return err
}
if mp.IsExsitPeer(req.AddPeer) {
p.PacketOkReply()
m.respondToClientWithVer(conn, p)
return
}
log.LogInfof("[%s], remote %s start add raft member, req %v", p.String(), remoteAddr, adminTask)
if !m.serveProxy(conn, mp, p) {
return nil
}
reqData, err = json.Marshal(req)
if err != nil {
err = errors.NewErrorf("[opAddMetaPartitionRaftMember]: partitionID= %d, "+
"Marshal %s", req.PartitionId, err)
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
return
}
if req.AddPeer.ID == 0 {
err = errors.NewErrorf("[opAddMetaPartitionRaftMember]: partitionID= %d, "+
"Marshal %s", req.PartitionId, fmt.Sprintf("unavali AddPeerID %v", req.AddPeer.ID))
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
return
}
_, err = mp.ChangeMember(raftProto.ConfAddNode,
raftProto.Peer{ID: req.AddPeer.ID}, reqData)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
return err
}
p.PacketOkReply()
m.respondToClientWithVer(conn, p)
return
}
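// opRemoveMetaPartitionRaftMember removes req.RemovePeer from the partition's raft
// group after verifying that the member can be removed.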
func (m *metadataManager) opRemoveMetaPartitionRaftMember(conn net.Conn,
p *Packet, remoteAddr string) (err error) {
var reqData []byte
req := &proto.RemoveMetaPartitionRaftMemberRequest{}
adminTask := &proto.AdminTask{
Request: req,
}
defer func() {
if err != nil {
log.LogInfof("[%s], remote %s remove raft member failed, req %v, err %s", p.String(), remoteAddr, adminTask, err.Error())
return
}
log.LogInfof("[%s], remote %s remove raft member success, req %v", p.String(), remoteAddr, adminTask)
}()
decode := json.NewDecoder(bytes.NewBuffer(p.Data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
return err
}
log.LogInfof("[%s], remote %s remove raft member success, req %v", p.String(), remoteAddr, adminTask)
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
return err
}
if !mp.IsExsitPeer(req.RemovePeer) {
p.PacketOkReply()
m.respondToClient(conn, p)
return
}
if !m.serveProxy(conn, mp, p) {
return nil
}
reqData, err = json.Marshal(req)
if err != nil {
err = errors.NewErrorf("[opRemoveMetaPartitionRaftMember]: partitionID= %d, "+
"Marshal %s", req.PartitionId, err)
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
return
}
if err = mp.CanRemoveRaftMember(req.RemovePeer); err != nil {
err = errors.NewErrorf("[opRemoveMetaPartitionRaftMember]: partitionID= %d, "+
"Marshal %s", req.PartitionId, fmt.Sprintf("unavali RemovePeerID %v", req.RemovePeer.ID))
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
return
}
if req.RemovePeer.ID == 0 {
err = errors.NewErrorf("[opRemoveMetaPartitionRaftMember]: partitionID= %d, "+
"Marshal %s", req.PartitionId, fmt.Sprintf("unavali RemovePeerID %v", req.RemovePeer.ID))
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
return
}
_, err = mp.ChangeMember(raftProto.ConfRemoveNode,
raftProto.Peer{ID: req.RemovePeer.ID}, reqData)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
return err
}
p.PacketOkReply()
m.respondToClient(conn, p)
return
}
func (m *metadataManager) opMetaBatchInodeGet(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.BatchInodeGetRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
log.LogDebugf("action[opMetaBatchInodeGet] req %v", req)
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.InodeGetBatch(req, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [opMetaBatchInodeGet] req: %d - %v, resp: %v, "+
"body: %s", remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaPartitionTryToLeader(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
mp, err := m.getPartition(p.PartitionID)
if err != nil {
goto errDeal
}
if err = mp.TryToLeader(p.PartitionID); err != nil {
goto errDeal
}
p.PacketOkReply()
m.respondToClient(conn, p)
return
errDeal:
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
return
}
func (m *metadataManager) opMetaDeleteInode(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.DeleteInodeRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.DeleteInode(req, p, remoteAddr)
_ = m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opMetaDeleteInode] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaBatchDeleteInode(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
var req *proto.DeleteInodeBatchRequest
if err = json.Unmarshal(p.Data, &req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.DeleteInodeBatch(req, p, remoteAddr)
log.LogDebugf("%s [opMetaDeleteInode] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
_ = m.respondToClientWithVer(conn, p)
return
}
func (m *metadataManager) opMetaUpdateXAttr(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.UpdateXAttrRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.UpdateXAttr(req, p)
m.updatePackRspSeq(mp, p)
_ = m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opMetaSetXAttr] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaSetXAttr(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.SetXAttrRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.SetXAttr(req, p)
m.updatePackRspSeq(mp, p)
_ = m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opMetaSetXAttr] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaBatchSetXAttr(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.BatchSetXAttrRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.BatchSetXAttr(req, p)
m.updatePackRspSeq(mp, p)
_ = m.respondToClient(conn, p)
log.LogDebugf("%s [OpMetaBatchSetXAttr] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaGetXAttr(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.GetXAttrRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.GetXAttr(req, p)
_ = m.respondToClient(conn, p)
log.LogDebugf("%s [opMetaGetXAttr] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaGetAllXAttr(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.GetAllXAttrRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.GetAllXAttr(req, p)
_ = m.respondToClient(conn, p)
log.LogDebugf("%s [opMetaGetXAttr] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaBatchGetXAttr(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.BatchGetXAttrRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.BatchGetXAttr(req, p)
_ = m.respondToClient(conn, p)
log.LogDebugf("%s [opMetaBatchGetXAttr req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaRemoveXAttr(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.RemoveXAttrRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.RemoveXAttr(req, p)
m.updatePackRspSeq(mp, p)
_ = m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opMetaGetXAttr] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaListXAttr(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.ListXAttrRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.ListXAttr(req, p)
_ = m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opMetaGetXAttr] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaBatchExtentsAdd(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.AppendExtentKeysRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
err = mp.BatchExtentAppend(req, p)
m.updatePackRspSeq(mp, p)
_ = m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opMetaBatchExtentsAdd] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaBatchObjExtentsAdd(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.AppendObjExtentKeysRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.BatchObjExtentAppend(req, p)
_ = m.respondToClientWithVer(conn, p)
log.LogDebugf("%s [opMetaBatchObjExtentsAdd] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opCreateMultipart(conn net.Conn, p *Packet, remote string) (err error) {
req := &proto.CreateMultipartRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClientWithVer(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.CreateMultipart(req, p)
_ = m.respondToClientWithVer(conn, p)
return
}
func (m *metadataManager) opRemoveMultipart(conn net.Conn, p *Packet, remote string) (err error) {
req := &proto.RemoveMultipartRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.RemoveMultipart(req, p)
_ = m.respondToClient(conn, p)
return
}
func (m *metadataManager) opGetExpiredMultipart(conn net.Conn, p *Packet, remote string) (err error) {
req := &proto.GetExpiredMultipartRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[opGetExpiredMultipart] req: %v, resp: %v", req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[opGetMultipart] req: %v, resp: %v", req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.GetExpiredMultipart(req, p)
_ = m.respondToClient(conn, p)
return
}
func (m *metadataManager) opGetMultipart(conn net.Conn, p *Packet, remote string) (err error) {
req := &proto.GetMultipartRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[opGetMultipart] req: %v, resp: %v", req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[opGetMultipart] req: %v, resp: %v", req, err.Error())
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.GetMultipart(req, p)
_ = m.respondToClient(conn, p)
return
}
func (m *metadataManager) opAppendMultipart(conn net.Conn, p *Packet, remote string) (err error) {
req := &proto.AddMultipartPartRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
m.respondToClientWithVer(conn, p)
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
m.respondToClientWithVer(conn, p)
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.AppendMultipart(req, p)
_ = m.respondToClientWithVer(conn, p)
return
}
func (m *metadataManager) opListMultipart(conn net.Conn, p *Packet, remoteAddr string) (err error) {
req := &proto.ListMultipartRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[opListMultipart] req: %v, resp: %v", req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[opListMultipart] req: %v, resp: %v", req, err.Error())
return
}
if !mp.IsFollowerRead() && !m.serveProxy(conn, mp, p) {
return
}
err = mp.ListMultipart(req, p)
_ = m.respondToClient(conn, p)
return
}
// Handle OpMetaTxCreateInode inode.
func (m *metadataManager) opTxCreateInode(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.TxCreateInodeRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
return
}
if err = m.checkMultiVersionStatus(mp, p); err != nil {
err = errors.NewErrorf("[%v],req[%v],err[%v]", p.GetOpMsgWithReqAndResult(), req, string(p.Data))
m.respondToClientWithVer(conn, p)
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.TxCreateInode(req, p, remoteAddr)
m.updatePackRspSeq(mp, p)
m.respondToClient(conn, p)
log.LogDebugf("%s [opTxCreateInode] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) opMetaBatchSetInodeQuota(conn net.Conn, p *Packet, remote string) (err error) {
req := &proto.BatchSetMetaserverQuotaReuqest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[opMetaBatchSetInodeQuota] req: %v, resp: %v", req, err.Error())
return
}
log.LogInfof("[opMetaBatchSetInodeQuota] req [%v] decode req.", req)
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[opMetaBatchSetInodeQuota] req: %v, resp: %v", req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
resp := &proto.BatchSetMetaserverQuotaResponse{}
err = mp.batchSetInodeQuota(req, resp)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
return
}
var reply []byte
if reply, err = json.Marshal(resp); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
m.respondToClient(conn, p)
return
}
p.PacketOkWithBody(reply)
_ = m.respondToClient(conn, p)
log.LogInfof("[opMetaBatchSetInodeQuota] req [%v] resp [%v] success.", req, resp)
return
}
func (m *metadataManager) opMetaBatchDeleteInodeQuota(conn net.Conn, p *Packet, remote string) (err error) {
req := &proto.BatchDeleteMetaserverQuotaReuqest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[opMetaBatchDeleteInodeQuota] req: %v, resp: %v", req, err.Error())
return
}
log.LogInfof("[opMetaBatchDeleteInodeQuota] req [%v] decode req.", req)
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[opMetaBatchDeleteInodeQuota] req: %v, resp: %v", req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
resp := &proto.BatchDeleteMetaserverQuotaResponse{}
err = mp.batchDeleteInodeQuota(req, resp)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
return
}
var reply []byte
if reply, err = json.Marshal(resp); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
m.respondToClient(conn, p)
return
}
p.PacketOkWithBody(reply)
_ = m.respondToClient(conn, p)
log.LogInfof("[opMetaBatchDeleteInodeQuota] req [%v] resp [%v] success.", req, resp)
return err
}
func (m *metadataManager) opMetaGetInodeQuota(conn net.Conn, p *Packet, remote string) (err error) {
req := &proto.GetInodeQuotaRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[opGetMultipart] req: %v, resp: %v", req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[opGetMultipart] req: %v, resp: %v", req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.getInodeQuota(req.Inode, p)
_ = m.respondToClient(conn, p)
log.LogInfof("[opMetaGetInodeQuota] get inode[%v] quota success.", req.Inode)
return
}
func (m *metadataManager) opMetaGetUniqID(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
req := &proto.GetUniqIDRequest{}
if err = json.Unmarshal(p.Data, req); err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
mp, err := m.getPartition(req.PartitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, ([]byte)(err.Error()))
m.respondToClient(conn, p)
err = errors.NewErrorf("[%v] req: %v, resp: %v", p.GetOpMsgWithReqAndResult(), req, err.Error())
return
}
if !m.serveProxy(conn, mp, p) {
return
}
err = mp.GetUniqID(p, req.Num)
m.respondToClient(conn, p)
if err != nil {
log.LogErrorf("%s [opMetaGetUniqID] %s, "+
"response to client: %s", remoteAddr, err.Error(), p.GetResultMsg())
}
log.LogDebugf("%s [opMetaGetUniqID] req: %d - %v, resp: %v, body: %s",
remoteAddr, p.GetReqID(), req, p.GetResultMsg(), p.Data)
return
}
func (m *metadataManager) prepareCreateVersion(req *proto.MultiVersionOpRequest) (err error, opAgain bool) {
var ver2Phase *verOp2Phase
if value, ok := m.volUpdating.Load(req.VolumeID); ok {
ver2Phase = value.(*verOp2Phase)
if req.VerSeq < ver2Phase.verSeq {
err = fmt.Errorf("seq [%v] create less than loal %v", req.VerSeq, ver2Phase.verSeq)
return
} else if req.VerSeq == ver2Phase.verPrepare {
if ver2Phase.status == proto.VersionWorking {
opAgain = true
return
}
}
}
ver2Phase = &verOp2Phase{}
ver2Phase.step = uint32(req.Op)
ver2Phase.status = proto.VersionWorking
ver2Phase.verPrepare = req.VerSeq
m.volUpdating.Store(req.VolumeID, ver2Phase)
log.LogWarnf("action[prepareCreateVersion] volume %v update to ver [%v] step %v",
req.VolumeID, req.VerSeq, ver2Phase.step)
return
}
func (m *metadataManager) checkVolVerList() (err error) {
volumeArr := make(map[string]bool)
log.LogDebugf("checkVolVerList start")
m.Range(true, func(id uint64, partition MetaPartition) bool {
volumeArr[partition.GetVolName()] = true
return true
})
for volName := range volumeArr {
mpsVerlist := make(map[uint64]*proto.VolVersionInfoList)
// fetch the verlist first, otherwise the mp verlist may change while the followers are being processed
m.Range(true, func(id uint64, partition MetaPartition) bool {
if partition.GetVolName() != volName {
return true
}
log.LogDebugf("action[checkVolVerList] volumeName %v id[%v] dp verlist %v partition.GetBaseConfig().PartitionId %v",
volName, id, partition.GetVerList(), partition.GetBaseConfig().PartitionId)
mpsVerlist[id] = &proto.VolVersionInfoList{VerList: partition.GetVerList()}
return true
})
var info *proto.VolVersionInfoList
if info, err = masterClient.AdminAPI().GetVerList(volName); err != nil {
log.LogErrorf("action[checkVolVerList] volumeName %v err %v", volName, err)
return
}
log.LogDebugf("action[checkVolVerList] volumeName %v info %v", volName, info)
m.Range(true, func(id uint64, partition MetaPartition) bool {
if partition.GetVolName() != volName {
return true
}
log.LogDebugf("action[checkVolVerList] volumeName %v info %v id[%v] ", volName, info, id)
if _, exist := mpsVerlist[id]; exist {
if err = partition.checkByMasterVerlist(mpsVerlist[id], info); err != nil {
return true
}
}
if _, err = partition.checkVerList(info, false); err != nil {
log.LogErrorf("[checkVolVerList] volumeName %v err %v", volName, err)
}
return true
})
}
return
}
func (m *metadataManager) commitCreateVersion(VolumeID string, VerSeq uint64, Op uint8, synchronize bool) (err error) {
log.LogWarnf("action[commitCreateVersion] volume %v seq [%v]", VolumeID, VerSeq)
var wg sync.WaitGroup
// wg.Add(len(m.partitions))
resultCh := make(chan error, len(m.partitions))
m.Range(true, func(id uint64, partition MetaPartition) bool {
if partition.GetVolName() != VolumeID {
return true
}
if _, ok := partition.IsLeader(); !ok {
return true
}
wg.Add(1)
go func(mpId uint64, mp MetaPartition) {
defer wg.Done()
log.LogInfof("action[commitCreateVersion] volume %v mp %v do HandleVersionOp verseq [%v]", VolumeID, mpId, VerSeq)
if err := mp.HandleVersionOp(Op, VerSeq, nil, synchronize); err != nil {
log.LogErrorf("action[commitCreateVersion] volume %v mp %v do HandleVersionOp verseq [%v] err %v", VolumeID, mpId, VerSeq, err)
resultCh <- err
return
}
}(id, partition)
return true
})
wg.Wait()
select {
case err = <-resultCh:
if err != nil {
close(resultCh)
return
}
default:
log.LogInfof("action[commitCreateVersion] volume %v do HandleVersionOp verseq [%v] finished", VolumeID, VerSeq)
}
close(resultCh)
if Op == proto.DeleteVersion {
return
}
if Op == proto.CreateVersionPrepare {
return
}
if value, ok := m.volUpdating.Load(VolumeID); ok {
ver2Phase := value.(*verOp2Phase)
log.LogWarnf("action[commitCreateVersion] try commit volume %v prepare seq [%v] with commit seq [%v]",
VolumeID, ver2Phase.verPrepare, VerSeq)
if VerSeq < ver2Phase.verSeq {
err = fmt.Errorf("volname [%v] seq [%v] create less than loal %v", VolumeID, VerSeq, ver2Phase.verSeq)
log.LogErrorf("action[commitCreateVersion] err %v", err)
return
}
if ver2Phase.step != proto.CreateVersionPrepare {
err = fmt.Errorf("volname [%v] step not prepare", VolumeID)
log.LogErrorf("action[commitCreateVersion] err %v", err)
return
}
ver2Phase.verSeq = VerSeq
ver2Phase.step = proto.CreateVersionCommit
ver2Phase.status = proto.VersionWorkingFinished
log.LogWarnf("action[commitCreateVersion] commit volume %v prepare seq [%v] with commit seq [%v]",
VolumeID, ver2Phase.verPrepare, VerSeq)
return
}
err = fmt.Errorf("volname [%v] not found", VolumeID)
log.LogErrorf("action[commitCreateVersion] err %v", err)
return
}
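// exampleTwoPhaseVersionOp is a minimal sketch (illustrative only, not part of the
// original code) of the two-phase flow implemented above: prepareCreateVersion records
// the prepared sequence for the volume, then commitCreateVersion fans the version op out
// to every local leader partition of that volume and promotes the prepared state.
func (m *metadataManager) exampleTwoPhaseVersionOp(req *proto.MultiVersionOpRequest) (err error) {
var opAgain bool
// phase 1: remember the prepared verSeq; opAgain means the same prepare is already applied or in flight
if err, opAgain = m.prepareCreateVersion(req); err != nil || opAgain {
return
}
// phase 2: apply the version op on the local leader partitions synchronously
return m.commitCreateVersion(req.VolumeID, req.VerSeq, req.Op, true)
}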
func (m *metadataManager) updatePackRspSeq(mp MetaPartition, p *Packet) {
if mp.GetVerSeq() > p.VerSeq {
log.LogDebugf("action[checkmultiSnap.multiVersionstatus] mp ver [%v], packet ver [%v]", mp.GetVerSeq(), p.VerSeq)
p.VerSeq = mp.GetVerSeq() // used to response to client and try update verSeq of client
p.ExtentType |= proto.VersionListFlag
p.VerList = make([]*proto.VolVersionInfo, len(mp.GetVerList()))
copy(p.VerList, mp.GetVerList())
}
return
}
func (m *metadataManager) checkMultiVersionStatus(mp MetaPartition, p *Packet) (err error) {
if (p.ExtentType&proto.MultiVersionFlag == 0) && mp.GetVerSeq() > 0 {
log.LogWarnf("action[checkmultiSnap.multiVersionstatus] volname [%v] mp ver [%v], client use old ver before snapshot", mp.GetVolName(), mp.GetVerSeq())
return fmt.Errorf("client use old ver before snapshot")
}
// The meta node does not need to check verSeq as strictly as the data node: a file append or modAppendWrite on the meta node is invisible to other files.
// It only needs to guarantee that the verSeq written on meta nodes grows monotonically from the client's point of view.
log.LogDebugf("action[checkmultiSnap.multiVersionstatus] mp[%v] ver [%v], packet ver [%v] reqId %v", mp.GetBaseConfig().PartitionId, mp.GetVerSeq(), p.VerSeq, p.ReqID)
if mp.GetVerSeq() >= p.VerSeq {
if mp.GetVerSeq() > p.VerSeq {
log.LogDebugf("action[checkmultiSnap.multiVersionstatus] mp ver [%v], packet ver [%v]", mp.GetVerSeq(), p.VerSeq)
p.VerSeq = mp.GetVerSeq() // used to response to client and try update verSeq of client
p.ExtentType |= proto.VersionListFlag
p.VerList = make([]*proto.VolVersionInfo, len(mp.GetVerList()))
copy(p.VerList, mp.GetVerList())
}
return
}
if p.IsVersionList() {
_, err = mp.checkVerList(&proto.VolVersionInfoList{VerList: p.VerList}, true)
return
}
p.ResultCode = proto.OpAgainVerionList
// need return and tell client
err = fmt.Errorf("volname [%v] req seq [%v] but not found commit status", mp.GetVolName(), p.VerSeq)
if value, ok := m.volUpdating.Load(mp.GetVolName()); ok {
ver2Phase := value.(*verOp2Phase)
if ver2Phase.isActiveReqToMaster {
return
}
}
select {
case m.verUpdateChan <- mp.GetVolName():
default:
log.LogWarnf("channel is full, volname [%v] not be queued", mp.GetVolName())
}
return
}
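// exampleVersionedWrite is a minimal sketch (illustrative only, not part of the original
// code) of how the write handlers in this file combine the two helpers above:
// checkMultiVersionStatus rejects a request whose version sequence is behind the
// partition, and updatePackRspSeq piggybacks the newer sequence and version list on the
// reply so the client can catch up.
func (m *metadataManager) exampleVersionedWrite(conn net.Conn, p *Packet, mp MetaPartition) (err error) {
if err = m.checkMultiVersionStatus(mp, p); err != nil {
_ = m.respondToClientWithVer(conn, p)
return
}
// ... apply the mutation on mp here ...
m.updatePackRspSeq(mp, p)
_ = m.respondToClientWithVer(conn, p)
return
}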
func (m *metadataManager) checkAndPromoteVersion(volName string) (err error) {
log.LogInfof("action[checkmultiSnap.multiVersionstatus] volumeName %v", volName)
var info *proto.VolumeVerInfo
if value, ok := m.volUpdating.Load(volName); ok {
ver2Phase := value.(*verOp2Phase)
if atomic.LoadUint32(&ver2Phase.status) != proto.VersionWorkingAbnormal &&
atomic.LoadUint32(&ver2Phase.step) == proto.CreateVersionPrepare {
ver2Phase.Lock() // TryLock may be a better fit here once go1.18+ is adopted
defer ver2Phase.Unlock()
// check again in case another goroutine changed the state while we were blocked on the lock
if atomic.LoadUint32(&ver2Phase.status) == proto.VersionWorkingAbnormal ||
atomic.LoadUint32(&ver2Phase.step) != proto.CreateVersionPrepare {
log.LogWarnf("action[checkmultiSnap.multiVersionstatus] volumeName %v status [%v] step %v",
volName, atomic.LoadUint32(&ver2Phase.status), atomic.LoadUint32(&ver2Phase.step))
return
}
if info, err = masterClient.AdminAPI().GetVerInfo(volName); err != nil {
log.LogErrorf("action[checkmultiSnap.multiVersionstatus] volumeName %v status [%v] step %v err %v",
volName, atomic.LoadUint32(&ver2Phase.status), atomic.LoadUint32(&ver2Phase.step), err)
return
}
if info.VerSeqPrepare != ver2Phase.verPrepare {
atomic.StoreUint32(&ver2Phase.status, proto.VersionWorkingAbnormal)
err = fmt.Errorf("volumeName %v status [%v] step %v",
volName, atomic.LoadUint32(&ver2Phase.status), atomic.LoadUint32(&ver2Phase.step))
log.LogErrorf("action[checkmultiSnap.multiVersionstatus] err %v", err)
return
}
if info.VerPrepareStatus == proto.CreateVersionCommit {
if err = m.commitCreateVersion(volName, info.VerSeqPrepare, proto.CreateVersionCommit, false); err != nil {
log.LogErrorf("action[checkmultiSnap.multiVersionstatus] err %v", err)
return
}
}
}
} else {
log.LogErrorf("action[checkmultiSnap.multiVersionstatus] volumeName %v not found", volName)
}
return
}
func (m *metadataManager) opMultiVersionOp(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
// For ack to master
data := p.Data
m.responseAckOKToMaster(conn, p)
var (
req = &proto.MultiVersionOpRequest{}
resp = &proto.MultiVersionOpResponse{}
adminTask = &proto.AdminTask{
Request: req,
}
opAgain bool
)
log.LogDebugf("action[opMultiVersionOp] volume %v op [%v]", req.VolumeID, req.Op)
start := time.Now()
decode := json.NewDecoder(bytes.NewBuffer(data))
decode.UseNumber()
if err = decode.Decode(adminTask); err != nil {
resp.Status = proto.TaskFailed
resp.Result = err.Error()
log.LogErrorf("action[opMultiVersionOp] %v mp err %v do Decoder", req.VolumeID, err.Error())
goto end
}
resp.Status = proto.TaskSucceeds
resp.VolumeID = req.VolumeID
resp.Addr = req.Addr
resp.VerSeq = req.VerSeq
resp.Op = req.Op
if req.Op == proto.CreateVersionPrepare {
if err, opAgain = m.prepareCreateVersion(req); err != nil || opAgain {
log.LogErrorf("action[opMultiVersionOp] %v mp err %v do Decoder", req.VolumeID, err)
goto end
}
if err = m.commitCreateVersion(req.VolumeID, req.VerSeq, req.Op, true); err != nil {
log.LogErrorf("action[opMultiVersionOp] %v mp err %v do commitCreateVersion", req.VolumeID, err.Error())
goto end
}
} else if req.Op == proto.CreateVersionCommit || req.Op == proto.DeleteVersion {
if err = m.commitCreateVersion(req.VolumeID, req.VerSeq, req.Op, false); err != nil {
log.LogErrorf("action[opMultiVersionOp] %v mp err %v do commitCreateVersion", req.VolumeID, err.Error())
goto end
}
}
end:
if err != nil {
resp.Result = err.Error()
}
adminTask.Request = nil
adminTask.Response = resp
if errRsp := m.respondToMaster(adminTask); errRsp != nil {
log.LogInfof("action[opMultiVersionOp] %s pkt %s, resp success req:%v; respAdminTask: %v, resp: %v, errRsp %v err %v",
remoteAddr, p.String(), req, adminTask, resp, errRsp, err)
}
if log.EnableInfo() {
rspData, _ := json.Marshal(resp)
log.LogInfof("action[opMultiVersionOp] %s pkt %s, resp success req:%v; respAdminTask: %v, resp: %v, cost %s",
remoteAddr, p.String(), req, adminTask, string(rspData), time.Since(start).String())
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"net"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
const (
ForceClosedConnect = true
NoClosedConnect = false
)
var ErrForbiddenMetaPartition = errors.New("meta partition is forbidden")
func (m *metadataManager) IsForbiddenOp(mp MetaPartition, reqOp uint8) bool {
if !mp.IsForbidden() {
return false
}
switch reqOp {
case
// dentry
proto.OpMetaCreateDentry,
proto.OpMetaTxCreateDentry,
proto.OpQuotaCreateDentry,
proto.OpMetaDeleteDentry,
proto.OpMetaTxDeleteDentry,
proto.OpMetaBatchDeleteDentry,
proto.OpMetaUpdateDentry,
proto.OpMetaTxUpdateDentry,
// extend
proto.OpMetaUpdateXAttr,
proto.OpMetaSetXAttr,
proto.OpMetaBatchSetXAttr,
proto.OpMetaRemoveXAttr,
// extent
proto.OpMetaTruncate,
proto.OpMetaExtentsAdd,
proto.OpMetaExtentAddWithCheck,
proto.OpMetaObjExtentAdd,
proto.OpMetaBatchObjExtentsAdd,
proto.OpMetaBatchExtentsAdd,
proto.OpMetaExtentsDel,
// inode
proto.OpMetaCreateInode,
proto.OpQuotaCreateInode,
proto.OpMetaTxUnlinkInode,
proto.OpMetaUnlinkInode,
proto.OpMetaBatchUnlinkInode,
proto.OpMetaTxLinkInode,
proto.OpMetaLinkInode,
proto.OpMetaEvictInode,
proto.OpMetaBatchEvictInode,
proto.OpMetaSetattr,
proto.OpMetaBatchDeleteInode,
proto.OpMetaClearInodeCache,
proto.OpMetaTxCreateInode,
// multipart
proto.OpAddMultipartPart,
proto.OpRemoveMultipart,
proto.OpCreateMultipart,
// quota
proto.OpMetaBatchSetInodeQuota,
proto.OpMetaBatchDeleteInodeQuota:
return true
default:
return false
}
}
// The proxy is used during the leader change. When a leader of a partition changes, the proxy forwards the request to
// the new leader.
func (m *metadataManager) serveProxy(conn net.Conn, mp MetaPartition,
p *Packet) (ok bool) {
var (
mConn *net.TCPConn
leaderAddr string
err error
reqID = p.ReqID
reqOp = p.Opcode
)
// check forbidden
if m.IsForbiddenOp(mp, reqOp) {
err = ErrForbiddenMetaPartition
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
m.respondToClient(conn, p)
return false
}
if leaderAddr, ok = mp.IsLeader(); ok {
return
}
if leaderAddr == "" {
err = ErrNoLeader
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
goto end
}
mConn, err = m.connPool.GetConnect(leaderAddr)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
m.connPool.PutConnect(mConn, ForceClosedConnect)
goto end
}
// forward the packet to the leader over the pooled connection
if err = p.WriteToConn(mConn); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
m.connPool.PutConnect(mConn, ForceClosedConnect)
goto end
}
// read the leader's response from the pooled connection
if err = p.ReadFromConnWithVer(mConn, proto.NoReadDeadlineTime); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
m.connPool.PutConnect(mConn, ForceClosedConnect)
goto end
}
if reqID != p.ReqID || reqOp != p.Opcode {
log.LogErrorf("serveProxy: send and received packet mismatch: req(%v_%v) resp(%v_%v)",
reqID, reqOp, p.ReqID, p.Opcode)
}
m.connPool.PutConnect(mConn, NoClosedConnect)
end:
m.respondToClient(conn, p)
if err != nil {
log.LogErrorf("[serveProxy]: req: %d - %v, %v, packet(%v)", p.GetReqID(),
p.GetOpMsg(), err, p)
}
log.LogDebugf("[serveProxy] req: %d - %v, resp: %v, packet(%v)", p.GetReqID(), p.GetOpMsg(),
p.GetResultMsg(), p)
return
}
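// exampleProxyHandler is a minimal sketch (illustrative only, not part of the original
// code) of the handler pattern that serveProxy supports: look up the partition, let
// serveProxy forward the packet to the current leader when this node is not the leader
// (in which case it has already replied to the client), and run the operation locally
// only when serveProxy returns true.
func (m *metadataManager) exampleProxyHandler(conn net.Conn, p *Packet, partitionID uint64) (err error) {
mp, err := m.getPartition(partitionID)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
m.respondToClient(conn, p)
return
}
if !m.serveProxy(conn, mp, p) {
// forwarded to the leader (or rejected); serveProxy already responded
return
}
// leader path: execute the real operation against mp here, then reply
p.PacketOkReply()
m.respondToClient(conn, p)
return
}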
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"net"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
// Reply operation results to the master.
func (m *metadataManager) respondToMaster(task *proto.AdminTask) (err error) {
// handle panic
defer func() {
if r := recover(); r != nil {
switch data := r.(type) {
case error:
err = data
default:
err = errors.New(data.(string))
}
}
}()
if err = masterClient.NodeAPI().ResponseMetaNodeTask(task); err != nil {
err = errors.Trace(err, "try respondToMaster failed")
}
return
}
// Reply data through tcp connection to the client.
func (m *metadataManager) respondToClientWithVer(conn net.Conn, p *Packet) (err error) {
// Handle panic
defer func() {
if r := recover(); r != nil {
switch data := r.(type) {
case error:
err = data
default:
err = errors.New(data.(string))
}
}
}()
// process data and send the reply through the specified tcp connection.
if p.VerSeq > 0 {
p.ExtentType |= proto.MultiVersionFlag
}
err = p.WriteToConn(conn)
if err != nil {
log.LogErrorf("response to client[%s], "+
"request[%s], response packet[%s]",
err.Error(), p.GetOpMsg(), p.GetResultMsg())
}
return
}
// Reply data through tcp connection to the client.
func (m *metadataManager) respondToClient(conn net.Conn, p *Packet) (err error) {
// Handle panic
defer func() {
if r := recover(); r != nil {
switch data := r.(type) {
case error:
err = data
default:
err = errors.New(data.(string))
}
}
}()
// process data and send the reply through the specified tcp connection.
err = p.WriteToConn(conn)
if err != nil {
log.LogErrorf("response to client[%s], "+
"request[%s], response packet[%s]",
err.Error(), p.GetOpMsg(), p.GetResultMsg())
}
return
}
func (m *metadataManager) responseAckOKToMaster(conn net.Conn, p *Packet) {
go func() {
p.PacketOkReply()
if err := p.WriteToConn(conn); err != nil {
log.LogErrorf("ack master response: %s", err.Error())
}
}()
}
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"encoding/binary"
"sync"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
type MetaQuotaManager struct {
statisticTemp *sync.Map // key quotaId, value proto.QuotaUsedInfo
statisticBase *sync.Map // key quotaId, value proto.QuotaUsedInfo
statisticRebuildTemp *sync.Map // key quotaId, value proto.QuotaUsedInfo
statisticRebuildBase *sync.Map // key quotaId, value proto.QuotaUsedInfo
limitedMap *sync.Map
rbuilding bool
volName string
rwlock sync.RWMutex
mpID uint64
enable bool
}
type MetaQuotaInode struct {
inode *Inode
quotaIds []uint32
}
type TxMetaQuotaInode struct {
txinode *TxInode
quotaIds []uint32
}
func NewQuotaManager(volName string, mpId uint64) (mqMgr *MetaQuotaManager) {
mqMgr = &MetaQuotaManager{
statisticTemp: new(sync.Map),
statisticBase: new(sync.Map),
statisticRebuildTemp: new(sync.Map),
statisticRebuildBase: new(sync.Map),
limitedMap: new(sync.Map),
volName: volName,
mpID: mpId,
}
return
}
func (qInode *MetaQuotaInode) Marshal() (result []byte, err error) {
var inodeBytes []byte
quotaBytes := bytes.NewBuffer(make([]byte, 0, 128))
buff := bytes.NewBuffer(make([]byte, 0, 128))
inodeBytes, err = qInode.inode.Marshal()
if err != nil {
return
}
inodeLen := uint32(len(inodeBytes))
if err = binary.Write(buff, binary.BigEndian, inodeLen); err != nil {
return
}
buff.Write(inodeBytes)
for _, quotaId := range qInode.quotaIds {
if err = binary.Write(quotaBytes, binary.BigEndian, quotaId); err != nil {
return
}
}
buff.Write(quotaBytes.Bytes())
result = buff.Bytes()
log.LogDebugf("MetaQuotaInode Marshal inode[%v] inodeLen [%v] size [%v]", qInode.inode.Inode, inodeLen, len(result))
return
}
func (qInode *MetaQuotaInode) Unmarshal(raw []byte) (err error) {
var inodeLen uint32
var quotaId uint32
buff := bytes.NewBuffer(raw)
if err = binary.Read(buff, binary.BigEndian, &inodeLen); err != nil {
return
}
inodeBytes := make([]byte, inodeLen)
if _, err = buff.Read(inodeBytes); err != nil {
return
}
log.LogDebugf("MetaQuotaInode Unmarshal inodeLen [%v] size [%v]", inodeBytes, len(raw))
qInode.inode = NewInode(0, 0)
if err = qInode.inode.Unmarshal(inodeBytes); err != nil {
return
}
for {
if buff.Len() == 0 {
break
}
if err = binary.Read(buff, binary.BigEndian, "aId); err != nil {
return
}
qInode.quotaIds = append(qInode.quotaIds, quotaId)
}
return
}
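// exampleQuotaInodeRoundTrip is a minimal sketch (illustrative only, not part of the
// original code) of the serialization format used above: a big-endian uint32 length
// prefix, the marshaled inode bytes, then a flat list of big-endian uint32 quota IDs.
func exampleQuotaInodeRoundTrip() error {
src := &MetaQuotaInode{
inode: NewInode(1, 0),
quotaIds: []uint32{100, 101},
}
raw, err := src.Marshal()
if err != nil {
return err
}
// after Unmarshal, dst.inode and dst.quotaIds mirror src
dst := &MetaQuotaInode{}
return dst.Unmarshal(raw)
}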
func (qInode *TxMetaQuotaInode) Marshal() (result []byte, err error) {
var inodeBytes []byte
quotaBytes := bytes.NewBuffer(make([]byte, 0, 128))
buff := bytes.NewBuffer(make([]byte, 0, 128))
inodeBytes, err = qInode.txinode.Marshal()
if err != nil {
return
}
inodeLen := uint32(len(inodeBytes))
if err = binary.Write(buff, binary.BigEndian, inodeLen); err != nil {
return
}
buff.Write(inodeBytes)
for _, quotaId := range qInode.quotaIds {
if err = binary.Write(quotaBytes, binary.BigEndian, quotaId); err != nil {
return
}
}
buff.Write(quotaBytes.Bytes())
result = buff.Bytes()
log.LogDebugf("TxMetaQuotaInode Marshal inode[%v] inodeLen [%v] size [%v]", qInode.txinode.Inode.Inode, inodeLen, len(result))
return
}
func (qInode *TxMetaQuotaInode) Unmarshal(raw []byte) (err error) {
var inodeLen uint32
var quotaId uint32
buff := bytes.NewBuffer(raw)
if err = binary.Read(buff, binary.BigEndian, &inodeLen); err != nil {
return
}
inodeBytes := make([]byte, inodeLen)
if _, err = buff.Read(inodeBytes); err != nil {
return
}
log.LogDebugf("TxMetaQuotaInode Unmarshal inodeLen [%v] size [%v]", inodeBytes, len(raw))
qInode.txinode = NewTxInode(0, 0, nil)
if err = qInode.txinode.Unmarshal(inodeBytes); err != nil {
return
}
for {
if buff.Len() == 0 {
break
}
if err = binary.Read(buff, binary.BigEndian, "aId); err != nil {
return
}
qInode.quotaIds = append(qInode.quotaIds, quotaId)
}
return
}
func (mqMgr *MetaQuotaManager) setQuotaHbInfo(infos []*proto.QuotaHeartBeatInfo) {
mqMgr.rwlock.Lock()
defer mqMgr.rwlock.Unlock()
for _, info := range infos {
if mqMgr.volName != info.VolName {
continue
}
mqMgr.enable = info.Enable
mqMgr.limitedMap.Store(info.QuotaId, info.LimitedInfo)
log.LogDebugf("mp[%v] quotaId [%v] limitedInfo [%v]", mqMgr.mpID, info.QuotaId, info.LimitedInfo)
}
mqMgr.limitedMap.Range(func(key, value interface{}) bool {
quotaId := key.(uint32)
found := false
for _, info := range infos {
if mqMgr.volName != info.VolName {
continue
}
if info.QuotaId == quotaId {
found = true
break
}
}
if !found {
mqMgr.limitedMap.Delete(quotaId)
}
return true
})
return
}
func (mqMgr *MetaQuotaManager) getQuotaReportInfos() (infos []*proto.QuotaReportInfo) {
mqMgr.rwlock.Lock()
defer mqMgr.rwlock.Unlock()
var usedInfo proto.QuotaUsedInfo
mqMgr.statisticTemp.Range(func(key, value interface{}) bool {
usedInfo = value.(proto.QuotaUsedInfo)
if value, isFind := mqMgr.statisticBase.Load(key.(uint32)); isFind {
baseInfo := value.(proto.QuotaUsedInfo)
log.LogDebugf("[getQuotaReportInfos] statisticTemp mp[%v] key [%v] usedInfo [%v] baseInfo [%v]", mqMgr.mpID,
key.(uint32), usedInfo, baseInfo)
usedInfo.Add(&baseInfo)
if usedInfo.UsedFiles < 0 {
log.LogWarnf("[getQuotaReportInfos] statisticTemp mp[%v] key [%v] usedInfo [%v]", mqMgr.mpID, key.(uint32), usedInfo)
usedInfo.UsedFiles = 0
}
if usedInfo.UsedBytes < 0 {
log.LogWarnf("[getQuotaReportInfos] statisticTemp mp[%v] key [%v] usedInfo [%v]", mqMgr.mpID, key.(uint32), usedInfo)
usedInfo.UsedBytes = 0
}
}
mqMgr.statisticBase.Store(key.(uint32), usedInfo)
return true
})
mqMgr.statisticTemp = new(sync.Map)
mqMgr.statisticBase.Range(func(key, value interface{}) bool {
quotaId := key.(uint32)
if _, ok := mqMgr.limitedMap.Load(quotaId); !ok {
return true
}
usedInfo = value.(proto.QuotaUsedInfo)
reportInfo := &proto.QuotaReportInfo{
QuotaId: quotaId,
UsedInfo: usedInfo,
}
infos = append(infos, reportInfo)
log.LogDebugf("[getQuotaReportInfos] statisticBase mp[%v] key [%v] usedInfo [%v]", mqMgr.mpID, key.(uint32), usedInfo)
return true
})
return
}
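// exampleQuotaUsageFlow is a minimal sketch (illustrative only, not part of the original
// code) of the statistics flow above: the master registers quota limits through
// setQuotaHbInfo, usage deltas accumulate via updateUsedInfo, and getQuotaReportInfos
// folds the temporary deltas into the base counters and reports only registered quota IDs.
func exampleQuotaUsageFlow() []*proto.QuotaReportInfo {
mqMgr := NewQuotaManager("exampleVol", 1)
mqMgr.setQuotaHbInfo([]*proto.QuotaHeartBeatInfo{{
VolName: "exampleVol",
QuotaId: 100,
Enable: true,
}})
mqMgr.updateUsedInfo(4096, 1, 100) // +4 KiB and +1 file under quota 100
return mqMgr.getQuotaReportInfos()
}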
func (mqMgr *MetaQuotaManager) statisticRebuildStart() bool {
mqMgr.rwlock.Lock()
defer mqMgr.rwlock.Unlock()
if !mqMgr.enable {
return false
}
if mqMgr.rbuilding {
return false
}
mqMgr.rbuilding = true
return true
}
func (mqMgr *MetaQuotaManager) statisticRebuildFin(rebuild bool) {
mqMgr.rwlock.Lock()
defer mqMgr.rwlock.Unlock()
mqMgr.rbuilding = false
if !rebuild {
mqMgr.statisticRebuildBase = new(sync.Map)
mqMgr.statisticRebuildTemp = new(sync.Map)
return
}
mqMgr.statisticBase = mqMgr.statisticRebuildBase
mqMgr.statisticTemp = mqMgr.statisticRebuildTemp
mqMgr.statisticRebuildBase = new(sync.Map)
mqMgr.statisticRebuildTemp = new(sync.Map)
if log.EnableInfo() {
mqMgr.statisticTemp.Range(func(key, value interface{}) bool {
quotaId := key.(uint32)
usedInfo := value.(proto.QuotaUsedInfo)
log.LogInfof("statisticRebuildFin statisticTemp mp[%v] quotaId [%v] usedInfo [%v]", mqMgr.mpID, quotaId, usedInfo)
return true
})
mqMgr.statisticBase.Range(func(key, value interface{}) bool {
quotaId := key.(uint32)
usedInfo := value.(proto.QuotaUsedInfo)
log.LogInfof("statisticRebuildFin statisticBase mp[%v] quotaId [%v] usedInfo [%v]", mqMgr.mpID, quotaId, usedInfo)
return true
})
}
}
func (mqMgr *MetaQuotaManager) IsOverQuota(size bool, files bool, quotaId uint32) (status uint8) {
var limitedInfo proto.QuotaLimitedInfo
mqMgr.rwlock.RLock()
defer mqMgr.rwlock.RUnlock()
if !mqMgr.enable {
log.LogInfof("IsOverQuota quota [%v] is disable.", quotaId)
return
}
value, isFind := mqMgr.limitedMap.Load(quotaId)
if isFind {
limitedInfo = value.(proto.QuotaLimitedInfo)
if size && limitedInfo.LimitedBytes {
status = proto.OpNoSpaceErr
}
if files && limitedInfo.LimitedFiles {
status = proto.OpNoSpaceErr
}
}
log.LogInfof("IsOverQuota quotaId [%v] limitedInfo[%v] status [%v] isFind [%v]", quotaId, limitedInfo, status, isFind)
return
}
func (mqMgr *MetaQuotaManager) updateUsedInfo(size int64, files int64, quotaId uint32) {
var baseInfo proto.QuotaUsedInfo
var baseTemp proto.QuotaUsedInfo
mqMgr.rwlock.Lock()
defer mqMgr.rwlock.Unlock()
value, isFind := mqMgr.statisticTemp.Load(quotaId)
if isFind {
baseInfo = value.(proto.QuotaUsedInfo)
}
baseInfo.UsedBytes += size
baseInfo.UsedFiles += files
mqMgr.statisticTemp.Store(quotaId, baseInfo)
if mqMgr.rbuilding {
value, isFind = mqMgr.statisticRebuildTemp.Load(quotaId)
if isFind {
baseTemp = value.(proto.QuotaUsedInfo)
} else {
baseTemp.UsedBytes = 0
baseTemp.UsedFiles = 0
}
baseTemp.UsedBytes += size
baseTemp.UsedFiles += files
mqMgr.statisticRebuildTemp.Store(quotaId, baseTemp)
}
log.LogDebugf("updateUsedInfo mpId [%v] quotaId [%v] baseInfo [%v] baseTemp[%v]", mqMgr.mpID, quotaId, baseInfo, baseTemp)
return
}
func (mqMgr *MetaQuotaManager) EnableQuota() bool {
return mqMgr.enable
}
func (mqMgr *MetaQuotaManager) getUsedInfoForTest(quotaId uint32) (size int64, files int64) {
mqMgr.rwlock.Lock()
defer mqMgr.rwlock.Unlock()
var baseInfo proto.QuotaUsedInfo
value, isFind := mqMgr.statisticTemp.Load(quotaId)
if isFind {
baseInfo = value.(proto.QuotaUsedInfo)
}
return baseInfo.UsedBytes, baseInfo.UsedFiles
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"fmt"
syslog "log"
"os"
"strconv"
"strings"
"sync/atomic"
"time"
"github.com/xtaci/smux"
"github.com/cubefs/cubefs/cmd/common"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/raftstore"
masterSDK "github.com/cubefs/cubefs/sdk/master"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
var (
clusterInfo *proto.ClusterInfo
// masterClient *masterSDK.MasterClient
masterClient *masterSDK.MasterCLientWithResolver
configTotalMem uint64
serverPort string
smuxPortShift int
smuxPool *util.SmuxConnectPool
smuxPoolCfg = util.DefaultSmuxConnPoolConfig()
)
// The MetaNode manages the dentry and inode information of the meta partitions on a meta node.
// The data consistency is ensured by Raft.
type MetaNode struct {
nodeId uint64
listen string
bindIp bool
metadataDir string // root dir of the metaNode
raftDir string // root dir of the raftStore log
metadataManager MetadataManager
localAddr string
clusterId string
raftStore raftstore.RaftStore
raftHeartbeatPort string
raftReplicatePort string
raftRetainLogs uint64
raftSyncSnapFormatVersion uint32 // format version of snapshot that raft leader sent to follower
zoneName string
httpStopC chan uint8
smuxStopC chan uint8
metrics *MetaNodeMetrics
tickInterval int
raftRecvBufSize int
connectionCnt int64
clusterUuid string
clusterUuidEnable bool
serviceIDKey string
control common.Control
}
// Start starts up the meta node with the specified configuration.
// 1. Start and load each meta partition from the snapshot.
// 2. Restore raftStore fsm of each meta node range.
// 3. Start server and accept connection from the master and clients.
func (m *MetaNode) Start(cfg *config.Config) (err error) {
return m.control.Start(m, cfg, doStart)
}
// Shutdown stops the meta node.
func (m *MetaNode) Shutdown() {
m.control.Shutdown(m, doShutdown)
}
func (m *MetaNode) checkLocalPartitionMatchWithMaster() (err error) {
var metaNodeInfo *proto.MetaNodeInfo
for i := 0; i < 3; i++ {
if metaNodeInfo, err = masterClient.NodeAPI().GetMetaNode(fmt.Sprintf("%s:%s", m.localAddr, m.listen)); err != nil {
log.LogErrorf("checkLocalPartitionMatchWithMaster: get MetaNode info fail: err(%v)", err)
continue
}
break
}
if err != nil {
return
}
if len(metaNodeInfo.PersistenceMetaPartitions) == 0 {
return
}
lackPartitions := make([]uint64, 0)
for _, partitionID := range metaNodeInfo.PersistenceMetaPartitions {
_, err := m.metadataManager.GetPartition(partitionID)
if err != nil {
lackPartitions = append(lackPartitions, partitionID)
}
}
if len(lackPartitions) == 0 {
return
}
m.metrics.MetricMetaFailedPartition.SetWithLabels(float64(1), map[string]string{
"partids": fmt.Sprintf("%v", lackPartitions),
"node": m.localAddr + ":" + m.listen,
"nodeid": fmt.Sprintf("%d", m.nodeId),
})
log.LogErrorf("LackPartitions %v on metanode %v, please deal quickly", lackPartitions, m.localAddr+":"+m.listen)
return
}
func doStart(s common.Server, cfg *config.Config) (err error) {
m, ok := s.(*MetaNode)
if !ok {
return errors.New("Invalid node Type!")
}
if err = m.parseConfig(cfg); err != nil {
return
}
if err = m.register(); err != nil {
return
}
if err = m.startRaftServer(cfg); err != nil {
return
}
if err = m.newMetaManager(); err != nil {
return
}
if err = m.startServer(); err != nil {
return
}
if err = m.startSmuxServer(); err != nil {
return
}
if err = m.startMetaManager(); err != nil {
return
}
if err = m.registerAPIHandler(); err != nil {
return
}
go m.startUpdateNodeInfo()
exporter.Init(cfg.GetString("role"), cfg)
m.startStat()
// compare local partitions with the master; if any are missing, do not start
if err = m.checkLocalPartitionMatchWithMaster(); err != nil {
syslog.Println(err)
exporter.Warning(err.Error())
return
}
exporter.RegistConsul(m.clusterId, cfg.GetString("role"), cfg)
return
}
func doShutdown(s common.Server) {
m, ok := s.(*MetaNode)
if !ok {
return
}
m.stopUpdateNodeInfo()
// shutdown node and release the resource
m.stopStat()
m.stopServer()
m.stopSmuxServer()
m.stopMetaManager()
m.stopRaftServer()
masterClient.Stop()
}
// Sync blocks the invoker's goroutine until the meta node shuts down.
func (m *MetaNode) Sync() {
m.control.Sync()
}
func (m *MetaNode) parseConfig(cfg *config.Config) (err error) {
if cfg == nil {
err = errors.New("invalid configuration")
return
}
m.localAddr = cfg.GetString(cfgLocalIP)
m.listen = cfg.GetString(proto.ListenPort)
m.bindIp = cfg.GetBool(proto.BindIpKey)
serverPort = m.listen
m.metadataDir = cfg.GetString(cfgMetadataDir)
m.raftDir = cfg.GetString(cfgRaftDir)
m.raftHeartbeatPort = cfg.GetString(cfgRaftHeartbeatPort)
m.raftReplicatePort = cfg.GetString(cfgRaftReplicaPort)
m.tickInterval = int(cfg.GetFloat(cfgTickInterval))
m.raftRecvBufSize = int(cfg.GetInt(cfgRaftRecvBufSize))
m.zoneName = cfg.GetString(cfgZoneName)
deleteBatchCount := cfg.GetInt64(cfgDeleteBatchCount)
if deleteBatchCount > 1 {
updateDeleteBatchCount(uint64(deleteBatchCount))
}
m.serviceIDKey = cfg.GetString(cfgServiceIDKey)
total, _, err := util.GetMemInfo()
if err != nil {
log.LogErrorf("get total mem failed, err %s", err.Error())
}
ratioStr := cfg.GetString(cfgMemRatio)
if err == nil && ratioStr != "" {
ratio, _ := strconv.Atoi(ratioStr)
if ratio <= 0 || ratio >= 100 {
return fmt.Errorf("cfgMemRatio is not legal, shoule beteen 1-100, now %s", ratioStr)
}
configTotalMem = total * uint64(ratio) / 100
log.LogInfof("configTotalMem by ratio is: mem [%d], ratio[%d]", configTotalMem, ratio)
} else {
configTotalMem, _ = strconv.ParseUint(cfg.GetString(cfgTotalMem), 10, 64)
if configTotalMem == 0 {
return fmt.Errorf("bad totalMem config,Recommended to be configured as 80 percent of physical machine memory")
}
}
if err == nil && configTotalMem > total-util.GB {
return fmt.Errorf("bad totalMem config,Recommended to be configured as 80 percent of physical machine memory")
}
if m.metadataDir == "" {
return fmt.Errorf("bad metadataDir config")
}
if m.listen == "" {
return fmt.Errorf("bad listen config")
}
if m.raftDir == "" {
return fmt.Errorf("bad raftDir config")
}
if m.raftHeartbeatPort == "" {
return fmt.Errorf("bad raftHeartbeatPort config")
}
if m.raftReplicatePort == "" {
return fmt.Errorf("bad cfgRaftReplicaPort config")
}
raftRetainLogs := cfg.GetString(cfgRetainLogs)
if raftRetainLogs != "" {
if m.raftRetainLogs, err = strconv.ParseUint(raftRetainLogs, 10, 64); err != nil {
return fmt.Errorf("%v, err:%v", proto.ErrInvalidCfg, err.Error())
}
}
if m.raftRetainLogs <= 0 {
m.raftRetainLogs = DefaultRaftNumOfLogsToRetain
}
syslog.Println("conf raftRetainLogs=", m.raftRetainLogs)
log.LogInfof("[parseConfig] raftRetainLogs[%v]", m.raftRetainLogs)
if cfg.HasKey(cfgRaftSyncSnapFormatVersion) {
raftSyncSnapFormatVersion := uint32(cfg.GetInt64(cfgRaftSyncSnapFormatVersion))
if raftSyncSnapFormatVersion > SnapFormatVersion_1 {
m.raftSyncSnapFormatVersion = SnapFormatVersion_1
log.LogInfof("invalid config raftSyncSnapFormatVersion, using default[%v]", m.raftSyncSnapFormatVersion)
} else {
m.raftSyncSnapFormatVersion = raftSyncSnapFormatVersion
log.LogInfof("by config raftSyncSnapFormatVersion:[%v]", m.raftSyncSnapFormatVersion)
}
} else {
m.raftSyncSnapFormatVersion = SnapFormatVersion_1
log.LogInfof("using default raftSyncSnapFormatVersion[%v]", m.raftSyncSnapFormatVersion)
}
syslog.Println("conf raftSyncSnapFormatVersion=", m.raftSyncSnapFormatVersion)
log.LogInfof("[parseConfig] raftSyncSnapFormatVersion[%v]", m.raftSyncSnapFormatVersion)
constCfg := config.ConstConfig{
Listen: m.listen,
RaftHeartbetPort: m.raftHeartbeatPort,
RaftReplicaPort: m.raftReplicatePort,
}
ok := false
if ok, err = config.CheckOrStoreConstCfg(m.metadataDir, config.DefaultConstConfigFile, &constCfg); !ok {
log.LogErrorf("constCfg check failed %v %v %v %v", m.metadataDir, config.DefaultConstConfigFile, constCfg, err)
return fmt.Errorf("constCfg check failed %v %v %v %v", m.metadataDir, config.DefaultConstConfigFile, constCfg, err)
}
log.LogInfof("[parseConfig] load localAddr[%v].", m.localAddr)
log.LogInfof("[parseConfig] load listen[%v].", m.listen)
log.LogInfof("[parseConfig] load metadataDir[%v].", m.metadataDir)
log.LogInfof("[parseConfig] load raftDir[%v].", m.raftDir)
log.LogInfof("[parseConfig] load raftHeartbeatPort[%v].", m.raftHeartbeatPort)
log.LogInfof("[parseConfig] load raftReplicatePort[%v].", m.raftReplicatePort)
log.LogInfof("[parseConfig] load zoneName[%v].", m.zoneName)
if err = m.parseSmuxConfig(cfg); err != nil {
return fmt.Errorf("parseSmuxConfig fail err %v", err)
} else {
log.LogInfof("Start: init smux conn pool (%v).", smuxPoolCfg)
smuxPool = util.NewSmuxConnectPool(smuxPoolCfg)
}
addrs := cfg.GetSlice(proto.MasterAddr)
masters := make([]string, 0, len(addrs))
for _, addr := range addrs {
masters = append(masters, addr.(string))
}
updateInterval := cfg.GetInt(configNameResolveInterval)
if updateInterval <= 0 || updateInterval > 60 {
log.LogWarnf("name resolving interval[1-60] is set to default: %v", DefaultNameResolveInterval)
updateInterval = DefaultNameResolveInterval
}
// masterClient = masterSDK.NewMasterClient(masters, false)
masterClient = masterSDK.NewMasterCLientWithResolver(masters, false, updateInterval)
if masterClient == nil {
err = fmt.Errorf("parseConfig: masters addrs format err[%v]", masters)
log.LogErrorf("parseConfig: masters addrs format err[%v]", masters)
return err
}
if err = masterClient.Start(); err != nil {
return err
}
err = m.validConfig()
return
}
func (m *MetaNode) parseSmuxConfig(cfg *config.Config) error {
// SMux port
smuxPortShift = int(cfg.GetInt64(cfgSmuxPortShift))
if smuxPortShift == 0 {
smuxPortShift = util.DefaultSmuxPortShift
}
// SMux buffer
maxBuffer := cfg.GetInt64(cfgSmuxMaxBuffer)
if maxBuffer > 0 {
smuxPoolCfg.MaxReceiveBuffer = int(maxBuffer)
if smuxPoolCfg.MaxStreamBuffer > int(maxBuffer) {
smuxPoolCfg.MaxStreamBuffer = int(maxBuffer)
}
if err := smux.VerifyConfig(smuxPoolCfg.Config); err != nil {
return err
}
}
maxConn := cfg.GetInt64(cfgSmuxMaxConn)
if maxConn > 0 {
smuxPoolCfg.ConnsPerAddr = int(maxConn)
}
maxStreamPerConn := cfg.GetInt64(cfgSmuxStreamPerConn)
if maxStreamPerConn > 0 {
smuxPoolCfg.StreamsPerConn = int(maxStreamPerConn)
}
if err := util.VerifySmuxPoolConfig(smuxPoolCfg); err != nil {
return err
}
log.LogDebugf("[parseSmuxConfig] cfg %v.", smuxPoolCfg)
return nil
}
func (m *MetaNode) validConfig() (err error) {
if len(strings.TrimSpace(m.listen)) == 0 {
err = errors.New("illegal listen")
return
}
if m.metadataDir == "" {
m.metadataDir = defaultMetadataDir
}
if m.raftDir == "" {
m.raftDir = defaultRaftDir
}
if len(masterClient.Nodes()) == 0 {
err = errors.New("master address list is empty")
return
}
return
}
func (m *MetaNode) newMetaManager() (err error) {
if _, err = os.Stat(m.metadataDir); err != nil {
if err = os.MkdirAll(m.metadataDir, 0o755); err != nil {
return
}
}
if m.clusterUuidEnable {
if err = config.CheckOrStoreClusterUuid(m.metadataDir, m.clusterUuid, false); err != nil {
log.LogErrorf("CheckOrStoreClusterUuid failed: %v", err)
return fmt.Errorf("CheckOrStoreClusterUuid failed: %v", err)
}
}
constCfg := config.ConstConfig{
Listen: m.listen,
RaftHeartbetPort: m.raftHeartbeatPort,
RaftReplicaPort: m.raftReplicatePort,
}
ok := false
if ok, err = config.CheckOrStoreConstCfg(m.metadataDir, config.DefaultConstConfigFile, &constCfg); !ok {
log.LogErrorf("constCfg check failed %v %v %v %v", m.metadataDir, config.DefaultConstConfigFile, constCfg, err)
return fmt.Errorf("constCfg check failed %v %v %v %v", m.metadataDir, config.DefaultConstConfigFile, constCfg, err)
}
// load metadataManager
conf := MetadataManagerConfig{
NodeID: m.nodeId,
RootDir: m.metadataDir,
RaftStore: m.raftStore,
ZoneName: m.zoneName,
}
m.metadataManager = NewMetadataManager(conf, m)
return
}
func (m *MetaNode) startMetaManager() (err error) {
if err = m.metadataManager.Start(); err == nil {
log.LogInfof("[startMetaManager] manager start finish.")
}
return
}
func (m *MetaNode) stopMetaManager() {
if m.metadataManager != nil {
m.metadataManager.Stop()
}
}
func (m *MetaNode) register() (err error) {
step := 0
var nodeAddress string
for {
if step < 1 {
clusterInfo, err = getClusterInfo()
if err != nil {
log.LogErrorf("[register] %s", err.Error())
continue
}
if m.localAddr == "" {
m.localAddr = clusterInfo.Ip
}
m.clusterUuid = clusterInfo.ClusterUuid
m.clusterUuidEnable = clusterInfo.ClusterUuidEnable
m.clusterId = clusterInfo.Cluster
nodeAddress = m.localAddr + ":" + m.listen
step++
}
var nodeID uint64
if nodeID, err = masterClient.NodeAPI().AddMetaNodeWithAuthNode(nodeAddress, m.zoneName, m.serviceIDKey); err != nil {
log.LogErrorf("register: register to master fail: address(%v) err(%s)", nodeAddress, err)
time.Sleep(3 * time.Second)
continue
}
m.nodeId = nodeID
return
}
}
// NewServer creates a new meta node instance.
func NewServer() *MetaNode {
return &MetaNode{}
}
func getClusterInfo() (ci *proto.ClusterInfo, err error) {
ci, err = masterClient.AdminAPI().GetClusterInfo()
return
}
// AddConnection adds a connection.
func (m *MetaNode) AddConnection() {
atomic.AddInt64(&m.connectionCnt, 1)
}
// RemoveConnection removes a connection.
func (m *MetaNode) RemoveConnection() {
atomic.AddInt64(&m.connectionCnt, -1)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"fmt"
"time"
"github.com/cubefs/cubefs/util/exporter"
)
// metrics
const (
StatPeriod = time.Minute * time.Duration(1)
MetricMetaFailedPartition = "meta_failed_partition"
MetricMetaPartitionInodeCount = "mpInodeCount"
MetricMetaPartitionDentryCount = "mpDentryCount"
MetricConnectionCount = "connectionCnt"
)
type MetaNodeMetrics struct {
MetricConnectionCount *exporter.Gauge
MetricMetaFailedPartition *exporter.Gauge
MetricMetaPartitionInodeCount *exporter.Gauge
MetricMetaPartitionDentryCount *exporter.Gauge
metricStopCh chan struct{}
}
func (m *MetaNode) startStat() {
m.metrics = &MetaNodeMetrics{
metricStopCh: make(chan struct{}),
MetricConnectionCount: exporter.NewGauge(MetricConnectionCount),
MetricMetaFailedPartition: exporter.NewGauge(MetricMetaFailedPartition),
MetricMetaPartitionInodeCount: exporter.NewGauge(MetricMetaPartitionInodeCount),
MetricMetaPartitionDentryCount: exporter.NewGauge(MetricMetaPartitionDentryCount),
}
go m.collectPartitionMetrics()
}
func (m *MetaNode) updatePartitionMetrics(mp *metaPartition) {
labels := map[string]string{
"partid": fmt.Sprintf("%d", mp.config.PartitionId),
exporter.Vol: mp.config.VolName,
}
m.metrics.MetricMetaPartitionInodeCount.SetWithLabels(float64(mp.GetInodeTreeLen()), labels)
m.metrics.MetricMetaPartitionDentryCount.SetWithLabels(float64(mp.GetDentryTreeLen()), labels)
}
func (m *MetaNode) collectPartitionMetrics() {
ticker := time.NewTicker(StatPeriod)
for {
select {
case <-m.metrics.metricStopCh:
return
case <-ticker.C:
if manager, ok := m.metadataManager.(*metadataManager); ok {
manager.mu.RLock()
for _, p := range manager.partitions {
if mp, ok := p.(*metaPartition); ok {
m.updatePartitionMetrics(mp)
}
}
manager.mu.RUnlock()
}
m.metrics.MetricConnectionCount.Set(float64(m.connectionCnt))
}
}
}
func (m *MetaNode) stopStat() {
m.metrics.metricStopCh <- struct{}{}
}
// Code generated by MockGen. DO NOT EDIT.
// Source: raftstore/partition.go
// Package raftstoremock is a generated GoMock package.
package raftstoremock
import (
reflect "reflect"
proto "github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
raftstore "github.com/cubefs/cubefs/raftstore"
gomock "github.com/golang/mock/gomock"
)
// MockPartition is a mock of Partition interface.
type MockPartition struct {
ctrl *gomock.Controller
recorder *MockPartitionMockRecorder
}
// MockPartitionMockRecorder is the mock recorder for MockPartition.
type MockPartitionMockRecorder struct {
mock *MockPartition
}
// NewMockPartition creates a new mock instance.
func NewMockPartition(ctrl *gomock.Controller) *MockPartition {
mock := &MockPartition{ctrl: ctrl}
mock.recorder = &MockPartitionMockRecorder{mock}
return mock
}
// EXPECT returns an object that allows the caller to indicate expected use.
func (m *MockPartition) EXPECT() *MockPartitionMockRecorder {
return m.recorder
}
// AppliedIndex mocks base method.
func (m *MockPartition) AppliedIndex() uint64 {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "AppliedIndex")
ret0, _ := ret[0].(uint64)
return ret0
}
// AppliedIndex indicates an expected call of AppliedIndex.
func (mr *MockPartitionMockRecorder) AppliedIndex() *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AppliedIndex", reflect.TypeOf((*MockPartition)(nil).AppliedIndex))
}
// ChangeMember mocks base method.
func (m *MockPartition) ChangeMember(changeType proto.ConfChangeType, peer proto.Peer, context []byte) (interface{}, error) {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "ChangeMember", changeType, peer, context)
ret0, _ := ret[0].(interface{})
ret1, _ := ret[1].(error)
return ret0, ret1
}
// ChangeMember indicates an expected call of ChangeMember.
func (mr *MockPartitionMockRecorder) ChangeMember(changeType, peer, context interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ChangeMember", reflect.TypeOf((*MockPartition)(nil).ChangeMember), changeType, peer, context)
}
// CommittedIndex mocks base method.
func (m *MockPartition) CommittedIndex() uint64 {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "CommittedIndex")
ret0, _ := ret[0].(uint64)
return ret0
}
// CommittedIndex indicates an expected call of CommittedIndex.
func (mr *MockPartitionMockRecorder) CommittedIndex() *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CommittedIndex", reflect.TypeOf((*MockPartition)(nil).CommittedIndex))
}
// IsRestoring mocks base method.
func (m *MockPartition) IsRestoring() bool {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "IsRestoring")
ret0, _ := ret[0].(bool)
return ret0
}
// IsRestoring indicates an expected call of IsRestoring.
func (mr *MockPartitionMockRecorder) IsRestoring() *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsRestoring", reflect.TypeOf((*MockPartition)(nil).IsRestoring))
}
// Delete mocks base method.
func (m *MockPartition) Delete() error {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "Delete")
ret0, _ := ret[0].(error)
return ret0
}
// Delete indicates an expected call of Delete.
func (mr *MockPartitionMockRecorder) Delete() *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Delete", reflect.TypeOf((*MockPartition)(nil).Delete))
}
// IsOfflinePeer mocks base method.
func (m *MockPartition) IsOfflinePeer() bool {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "IsOfflinePeer")
ret0, _ := ret[0].(bool)
return ret0
}
// IsOfflinePeer indicates an expected call of IsOfflinePeer.
func (mr *MockPartitionMockRecorder) IsOfflinePeer() *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsOfflinePeer", reflect.TypeOf((*MockPartition)(nil).IsOfflinePeer))
}
// IsRaftLeader mocks base method.
func (m *MockPartition) IsRaftLeader() bool {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "IsRaftLeader")
ret0, _ := ret[0].(bool)
return ret0
}
// IsRaftLeader indicates an expected call of IsRaftLeader.
func (mr *MockPartitionMockRecorder) IsRaftLeader() *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsRaftLeader", reflect.TypeOf((*MockPartition)(nil).IsRaftLeader))
}
// LeaderTerm mocks base method.
func (m *MockPartition) LeaderTerm() (uint64, uint64) {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "LeaderTerm")
ret0, _ := ret[0].(uint64)
ret1, _ := ret[1].(uint64)
return ret0, ret1
}
// LeaderTerm indicates an expected call of LeaderTerm.
func (mr *MockPartitionMockRecorder) LeaderTerm() *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "LeaderTerm", reflect.TypeOf((*MockPartition)(nil).LeaderTerm))
}
// Status mocks base method.
func (m *MockPartition) Status() *raftstore.PartitionStatus {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "Status")
ret0, _ := ret[0].(*raftstore.PartitionStatus)
return ret0
}
// Status indicates an expected call of Status.
func (mr *MockPartitionMockRecorder) Status() *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Status", reflect.TypeOf((*MockPartition)(nil).Status))
}
// Stop mocks base method.
func (m *MockPartition) Stop() error {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "Stop")
ret0, _ := ret[0].(error)
return ret0
}
// Stop indicates an expected call of Stop.
func (mr *MockPartitionMockRecorder) Stop() *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Stop", reflect.TypeOf((*MockPartition)(nil).Stop))
}
// Submit mocks base method.
func (m *MockPartition) Submit(cmd []byte) (interface{}, error) {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "Submit", cmd)
ret0, _ := ret[0].(interface{})
ret1, _ := ret[1].(error)
return ret0, ret1
}
// Submit indicates an expected call of Submit.
func (mr *MockPartitionMockRecorder) Submit(cmd interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Submit", reflect.TypeOf((*MockPartition)(nil).Submit), cmd)
}
// Truncate mocks base method.
func (m *MockPartition) Truncate(index uint64) {
m.ctrl.T.Helper()
m.ctrl.Call(m, "Truncate", index)
}
// Truncate indicates an expected call of Truncate.
func (mr *MockPartitionMockRecorder) Truncate(index interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Truncate", reflect.TypeOf((*MockPartition)(nil).Truncate), index)
}
// TryToLeader mocks base method.
func (m *MockPartition) TryToLeader(nodeID uint64) error {
m.ctrl.T.Helper()
ret := m.ctrl.Call(m, "TryToLeader", nodeID)
ret0, _ := ret[0].(error)
return ret0
}
// TryToLeader indicates an expected call of TryToLeader.
func (mr *MockPartitionMockRecorder) TryToLeader(nodeID interface{}) *gomock.Call {
mr.mock.ctrl.T.Helper()
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "TryToLeader", reflect.TypeOf((*MockPartition)(nil).TryToLeader), nodeID)
}
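// Editorial sketch (not part of the generated file): typical use of this mock in a
// test. It assumes a *testing.T named t; NewController, EXPECT, Return, AnyTimes and
// gomock.Any are standard gomock APIs.
//
//	ctrl := gomock.NewController(t)
//	defer ctrl.Finish()
//	partition := NewMockPartition(ctrl)
//	partition.EXPECT().IsRaftLeader().Return(true)
//	partition.EXPECT().Submit(gomock.Any()).Return(nil, nil).AnyTimes()
//	// the mock can now be passed wherever the mocked Partition interface is expected
//	if !partition.IsRaftLeader() {
//		t.Fatal("expected the mocked partition to report leadership")
//	}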
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"encoding/binary"
"sort"
"sync"
"time"
"github.com/cubefs/cubefs/util/btree"
"github.com/cubefs/cubefs/util/log"
)
// Part defined necessary fields for multipart part management.
type Part struct {
ID uint16
UploadTime time.Time
MD5 string
Size uint64
Inode uint64
}
func (m *Part) Equal(o *Part) bool {
return m.ID == o.ID &&
m.Inode == o.Inode &&
m.Size == o.Size &&
m.MD5 == o.MD5
}
func (m Part) Bytes() ([]byte, error) {
var err error
buffer := bytes.NewBuffer(nil)
tmp := make([]byte, binary.MaxVarintLen64)
var n int
// ID
n = binary.PutUvarint(tmp, uint64(m.ID))
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
// upload time
n = binary.PutVarint(tmp, m.UploadTime.UnixNano())
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
// MD5
n = binary.PutUvarint(tmp, uint64(len(m.MD5)))
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
if _, err = buffer.WriteString(m.MD5); err != nil {
return nil, err
}
// size
n = binary.PutUvarint(tmp, m.Size)
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
// inode
n = binary.PutUvarint(tmp, m.Inode)
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
return buffer.Bytes(), nil
}
func PartFromBytes(raw []byte) *Part {
var offset, n int
// decode ID
var u64ID uint64
u64ID, n = binary.Uvarint(raw)
offset += n
// decode upload time
var uploadTimeI64 int64
uploadTimeI64, n = binary.Varint(raw[offset:])
offset += n
// decode MD5
var md5Len uint64
md5Len, n = binary.Uvarint(raw[offset:])
offset += n
md5Content := string(raw[offset : offset+int(md5Len)])
offset += int(md5Len)
// decode size
var sizeU64 uint64
sizeU64, n = binary.Uvarint(raw[offset:])
offset += n
// decode inode
var inode uint64
inode, n = binary.Uvarint(raw[offset:])
muPart := &Part{
ID: uint16(u64ID),
UploadTime: time.Unix(0, uploadTimeI64),
MD5: md5Content,
Size: sizeU64,
Inode: inode,
}
return muPart
}
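// Editorial sketch (not part of the original source): round-tripping a Part through the
// varint encoding above. Bytes writes ID, UploadTime (UnixNano), MD5 (length-prefixed),
// Size and Inode in that order, and PartFromBytes decodes them in the same order.
//
//	p := Part{ID: 1, UploadTime: time.Now(), MD5: "d41d8cd98f00b204e9800998ecf8427e", Size: 4096, Inode: 8193}
//	raw, err := p.Bytes()
//	if err != nil {
//		// handle the marshal error
//	}
//	decoded := PartFromBytes(raw)
//	// decoded.Equal(&p) is true: Equal compares ID, Inode, Size and MD5
//	// (UploadTime is serialized but not part of Equal).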
type Parts []*Part
func (m Parts) Len() int {
return len(m)
}
func (m Parts) sort() {
sort.SliceStable(m, func(i, j int) bool {
return m[i].ID < m[j].ID
})
}
func (m *Parts) Hash(part *Part) (has bool) {
i := sort.Search(len(*m), func(i int) bool {
return (*m)[i].ID >= part.ID
})
has = i < len(*m) && (*m)[i].ID == part.ID
return
}
func (m *Parts) UpdateOrStore(part *Part) (oldInode uint64, update, conflict bool) {
i := sort.Search(len(*m), func(i int) bool {
return (*m)[i].ID >= part.ID
})
if i >= 0 && i < len(*m) && (*m)[i].ID == part.ID {
oldPart := (*m)[i]
oldInode = oldPart.Inode
if part.Inode == oldInode {
log.LogWarnf("Request already success,the same partinode[%d] must not be overwritten.", oldInode)
return
}
if part.UploadTime.Before(oldPart.UploadTime) {
log.LogWarnf("Request part putTime[%v] is less than old part putTime[%v], partNumber[%v]",
part.UploadTime.UnixNano(), oldPart.UploadTime.UnixNano(), part.ID)
conflict = true
return
}
update = true
(*m)[i] = part
return
}
*m = append(*m, part)
update = false
m.sort()
return
}
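// Editorial note (not part of the original source): UpdateOrStore has three outcomes for
// an existing part ID — the same inode is a no-op, an older UploadTime than the stored
// part sets conflict=true and keeps the stored part, and anything else replaces the part
// in place with update=true; unknown IDs are appended and the slice is re-sorted. A
// hedged caller sketch:
//
//	oldInode, updated, conflict := parts.UpdateOrStore(part)
//	if conflict {
//		// a part with a newer upload time already exists; reject this request
//	} else if updated {
//		// oldInode is the inode of the replaced part; the caller can reclaim it
//	}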
// Deprecated: replaced by UpdateOrStore.
func (m *Parts) Insert(part *Part, replace bool) (success bool) {
i := sort.Search(len(*m), func(i int) bool {
return (*m)[i].ID >= part.ID
})
if i < len(*m) && (*m)[i].ID == part.ID {
if replace {
(*m)[i] = part
return true
}
return false
}
*m = append(*m, part)
m.sort()
return true
}
func (m *Parts) Remove(id uint16) {
i := sort.Search(len(*m), func(i int) bool {
return (*m)[i].ID >= id
})
if i < len(*m) && (*m)[i].ID == id {
if len(*m) > i+1 {
*m = append((*m)[:i], (*m)[i+1:]...)
} else {
*m = (*m)[:i]
}
}
}
func (m Parts) Search(id uint16) (part *Part, found bool) {
i := sort.Search(len(m), func(i int) bool {
return m[i].ID >= id
})
if i < len(m) && m[i].ID == id {
return m[i], true
}
return nil, false
}
func (m Parts) Bytes() ([]byte, error) {
var err error
var n int
buffer := bytes.NewBuffer(nil)
tmp := make([]byte, binary.MaxVarintLen64)
n = binary.PutUvarint(tmp, uint64(len(m)))
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
var marshaled []byte
for _, p := range m {
marshaled, err = p.Bytes()
if err != nil {
return nil, err
}
// write part length
n = binary.PutUvarint(tmp, uint64(len(marshaled)))
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
// write part bytes
if _, err = buffer.Write(marshaled); err != nil {
return nil, err
}
}
return buffer.Bytes(), nil
}
func PartsFromBytes(raw []byte) Parts {
var offset, n int
var numPartsU64 uint64
numPartsU64, n = binary.Uvarint(raw)
offset += n
muParts := make([]*Part, int(numPartsU64))
for i := 0; i < int(numPartsU64); i++ {
var partLengthU64 uint64
partLengthU64, n = binary.Uvarint(raw[offset:])
offset += n
part := PartFromBytes(raw[offset : offset+int(partLengthU64)])
muParts[i] = part
offset += int(partLengthU64)
}
return muParts
}
type MultipartExtend map[string]string
func NewMultipartExtend() MultipartExtend {
return make(map[string]string)
}
func (me MultipartExtend) Bytes() ([]byte, error) {
var n int
var err error
buffer := bytes.NewBuffer(nil)
tmp := make([]byte, binary.MaxVarintLen64)
n = binary.PutUvarint(tmp, uint64(len(me)))
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
marshalStr := func(src string) error {
n = binary.PutUvarint(tmp, uint64(len(src)))
if _, err = buffer.Write(tmp[:n]); err != nil {
return err
}
if _, err = buffer.WriteString(src); err != nil {
return err
}
return nil
}
for key, val := range me {
if err = marshalStr(key); err != nil {
return nil, err
}
if err = marshalStr(val); err != nil {
return nil, err
}
}
return buffer.Bytes(), nil
}
func MultipartExtendFromBytes(raw []byte) MultipartExtend {
var offset, n int
var el uint64
me := NewMultipartExtend()
unmarshalStr := func(data []byte) (string, int) {
var n int
var lengthU64 uint64
lengthU64, n = binary.Uvarint(data)
return string(data[n : n+int(lengthU64)]), n + int(lengthU64)
}
el, n = binary.Uvarint(raw)
if el == 0 {
return nil
}
offset += n
for i := 0; i < int(el); i++ {
var key, val string
key, n = unmarshalStr(raw[offset:])
offset += n
val, n = unmarshalStr(raw[offset:])
offset += n
me[key] = val
}
return me
}
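// Editorial sketch (not part of the original source): MultipartExtend is serialized as a
// uvarint pair count followed by length-prefixed key and value strings, and an empty map
// decodes to nil (see the early return above when the pair count is zero).
//
//	me := NewMultipartExtend()
//	me["x-amz-meta-owner"] = "alice" // hypothetical key/value for illustration
//	raw, _ := me.Bytes()
//	back := MultipartExtendFromBytes(raw) // back["x-amz-meta-owner"] == "alice"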
// Multipart defined necessary fields for multipart session management.
type Multipart struct {
// session fields
id string
key string
initTime time.Time
parts Parts
extend MultipartExtend
mu sync.RWMutex
}
func (m *Multipart) Less(than btree.Item) bool {
tm, is := than.(*Multipart)
return is && ((m.key < tm.key) || ((m.key == tm.key) && (m.id < tm.id)))
}
func (m *Multipart) Copy() btree.Item {
return &Multipart{
id: m.id,
key: m.key,
initTime: m.initTime,
parts: append(Parts{}, m.parts...),
extend: m.extend,
}
}
func (m *Multipart) ID() string {
return m.id
}
func (m *Multipart) UpdateOrStorePart(part *Part) (oldInode uint64, updated, conflict bool) {
m.mu.Lock()
defer m.mu.Unlock()
if m.parts == nil {
m.parts = PartsFromBytes(nil)
}
oldInode, updated, conflict = m.parts.UpdateOrStore(part)
return
}
// Deprecated: replaced by UpdateOrStorePart.
func (m *Multipart) InsertPart(part *Part, replace bool) (success bool) {
m.mu.Lock()
defer m.mu.Unlock()
if m.parts == nil {
m.parts = PartsFromBytes(nil)
}
success = m.parts.Insert(part, replace)
return
}
func (m *Multipart) Parts() []*Part {
m.mu.RLock()
defer m.mu.RUnlock()
return append([]*Part{}, m.parts...)
}
func (m *Multipart) Bytes() ([]byte, error) {
var n int
buffer := bytes.NewBuffer(nil)
var err error
tmp := make([]byte, binary.MaxVarintLen64)
// marshal id
marshalStr := func(src string) error {
n = binary.PutUvarint(tmp, uint64(len(src)))
if _, err = buffer.Write(tmp[:n]); err != nil {
return err
}
if _, err = buffer.WriteString(src); err != nil {
return err
}
return nil
}
// marshal id
if err = marshalStr(m.id); err != nil {
return nil, err
}
// marshal key
if err = marshalStr(m.key); err != nil {
return nil, err
}
// marshal init time
n = binary.PutVarint(tmp, m.initTime.UnixNano())
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
// marshal parts
var marshaledParts []byte
if marshaledParts, err = m.parts.Bytes(); err != nil {
return nil, err
}
n = binary.PutUvarint(tmp, uint64(len(marshaledParts)))
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
if _, err = buffer.Write(marshaledParts); err != nil {
return nil, err
}
// marshal extend
var extendBytes []byte
if extendBytes, err = m.extend.Bytes(); err != nil {
return nil, err
}
n = binary.PutUvarint(tmp, uint64(len(extendBytes)))
if _, err = buffer.Write(tmp[:n]); err != nil {
return nil, err
}
if _, err = buffer.Write(extendBytes); err != nil {
return nil, err
}
return buffer.Bytes(), nil
}
func MultipartFromBytes(raw []byte) *Multipart {
unmarshalStr := func(data []byte) (string, int) {
var n int
var lengthU64 uint64
lengthU64, n = binary.Uvarint(data)
return string(data[n : n+int(lengthU64)]), n + int(lengthU64)
}
var offset, n int
// decode id
var id string
id, n = unmarshalStr(raw)
offset += n
// decode key
var key string
key, n = unmarshalStr(raw[offset:])
offset += n
// decode init time
var initTimeI64 int64
initTimeI64, n = binary.Varint(raw[offset:])
offset += n
// decode parts
var partsLengthU64 uint64
partsLengthU64, n = binary.Uvarint(raw[offset:])
offset += n
parts := PartsFromBytes(raw[offset : offset+int(partsLengthU64)])
offset += int(partsLengthU64)
// decode multipart extend
var extendLengthU64 uint64
extendLengthU64, n = binary.Uvarint(raw[offset:])
offset += n
me := MultipartExtendFromBytes(raw[offset : offset+int(extendLengthU64)])
muSession := &Multipart{
id: id,
key: key,
initTime: time.Unix(0, initTimeI64),
parts: parts,
extend: me,
}
return muSession
}
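// Editorial note (not part of the original source): on the wire a Multipart is laid out
// as id and key (length-prefixed strings), initTime as a varint UnixNano, then the
// length-prefixed Parts blob and the length-prefixed MultipartExtend blob; the decoder
// above reads the fields back in exactly this order.
//
//	m := &Multipart{id: "upload-1", key: "a/b/object", initTime: time.Now()}
//	raw, err := m.Bytes()
//	if err == nil {
//		restored := MultipartFromBytes(raw)
//		_ = restored.ID() // "upload-1"
//	}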
package metanode
import (
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
const (
UpdateNodeInfoTicket = 1 * time.Minute
DefaultDeleteBatchCounts = 128
)
type NodeInfo struct {
deleteBatchCount uint64
}
var (
nodeInfo = &NodeInfo{}
nodeInfoStopC = make(chan struct{})
deleteWorkerSleepMs uint64 = 0
dirChildrenNumLimit uint32 = proto.DefaultDirChildrenNumLimit
)
func DeleteBatchCount() uint64 {
val := atomic.LoadUint64(&nodeInfo.deleteBatchCount)
if val == 0 {
val = DefaultDeleteBatchCounts
}
return val
}
func updateDeleteBatchCount(val uint64) {
atomic.StoreUint64(&nodeInfo.deleteBatchCount, val)
}
func updateDeleteWorkerSleepMs(val uint64) {
atomic.StoreUint64(&deleteWorkerSleepMs, val)
}
func updateDirChildrenNumLimit(val uint32) {
atomic.StoreUint32(&dirChildrenNumLimit, val)
}
func DeleteWorkerSleepMs() {
val := atomic.LoadUint64(&deleteWorkerSleepMs)
if val > 0 {
time.Sleep(time.Duration(val) * time.Millisecond)
}
}
func (m *MetaNode) startUpdateNodeInfo() {
ticker := time.NewTicker(UpdateNodeInfoTicket)
defer ticker.Stop()
for {
select {
case <-nodeInfoStopC:
log.LogInfo("metanode nodeinfo gorutine stopped")
return
case <-ticker.C:
m.updateNodeInfo()
m.metadataManager.checkVolVerList()
}
}
}
func (m *MetaNode) stopUpdateNodeInfo() {
nodeInfoStopC <- struct{}{}
}
func (m *MetaNode) updateNodeInfo() {
// clusterInfo, err := getClusterInfo()
clusterInfo, err := masterClient.AdminAPI().GetClusterInfo()
if err != nil {
log.LogErrorf("[updateNodeInfo] %s", err.Error())
return
}
updateDeleteBatchCount(clusterInfo.MetaNodeDeleteBatchCount)
updateDeleteWorkerSleepMs(clusterInfo.MetaNodeDeleteWorkerSleepMs)
if clusterInfo.DirChildrenNumLimit < proto.MinDirChildrenNumLimit {
log.LogWarnf("updateNodeInfo: DirChildrenNumLimit probably not enabled on master, set to default value(%v)",
proto.DefaultDirChildrenNumLimit)
atomic.StoreUint32(&dirChildrenNumLimit, proto.DefaultDirChildrenNumLimit)
} else {
atomic.StoreUint32(&dirChildrenNumLimit, clusterInfo.DirChildrenNumLimit)
log.LogInfof("updateNodeInfo: DirChildrenNumLimit(%v)", clusterInfo.DirChildrenNumLimit)
}
// updateDirChildrenNumLimit(clusterInfo.DirChildrenNumLimit)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/json"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
type Packet struct {
proto.Packet
}
// NewPacketToDeleteExtent returns a new packet to delete the extent.
func NewPacketToDeleteExtent(dp *DataPartition, ext *proto.ExtentKey) (p *Packet, invalid bool) {
p = new(Packet)
p.Magic = proto.ProtoMagic
p.Opcode = proto.OpMarkDelete
p.ExtentType = proto.NormalExtentType
p.PartitionID = dp.PartitionID
if storage.IsTinyExtent(ext.ExtentId) {
p.ExtentType = proto.TinyExtentType
}
log.LogDebugf("NewPacketToDeleteExtent. ext %v", ext)
if ext.IsSplit() {
var (
newOff = ext.ExtentOffset
newSize = ext.Size
)
if int(ext.ExtentOffset)%util.PageSize != 0 {
log.LogDebugf("NewPacketToDeleteExtent. ext %v", ext)
newOff = ext.ExtentOffset + util.PageSize - ext.ExtentOffset%util.PageSize
if ext.Size <= uint32(newOff-ext.ExtentOffset) {
invalid = true
log.LogDebugf("NewPacketToDeleteExtent. ext %v invalid to punch hole newOff %v",
ext, newOff)
return
}
newSize = ext.Size - uint32(newOff-ext.ExtentOffset)
}
if newSize%util.PageSize != 0 {
newSize = newSize - newSize%util.PageSize
}
if newSize == 0 {
invalid = true
log.LogDebugf("NewPacketToDeleteExtent. ext %v invalid to punch hole", ext)
return
}
ext.Size = newSize
ext.ExtentOffset = newOff
log.LogDebugf("ext [%v] delete be set split flag", ext)
p.Opcode = proto.OpSplitMarkDelete
} else {
log.LogDebugf("ext [%v] delete normal ext", ext)
}
p.Data, _ = json.Marshal(ext)
p.Size = uint32(len(p.Data))
p.ExtentID = ext.ExtentId
p.ReqID = proto.GenerateRequestID()
p.RemainingFollowers = uint8(len(dp.Hosts) - 1)
if len(dp.Hosts) == 1 {
p.RemainingFollowers = 127
}
p.Arg = ([]byte)(dp.GetAllAddrs())
p.ArgLen = uint32(len(p.Arg))
return
}
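// Editorial note (not part of the original source): for a split extent the delete turns
// into a page-aligned punch hole — the offset is rounded up to the next util.PageSize
// boundary and the size is trimmed down to a PageSize multiple, and anything that no
// longer covers a full page is skipped via invalid=true. Assuming a 4 KiB page size
// purely for illustration:
//
//	// ExtentOffset=5000, Size=10000
//	// newOff  = 5000 + 4096 - 5000%4096 = 8192
//	// newSize = 10000 - (8192-5000) = 6808, trimmed to 4096
//	// the packet then punches [8192, 8192+4096) with OpSplitMarkDelete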
// NewPacketToBatchDeleteExtent returns a new packet to batch delete the extent.
func NewPacketToBatchDeleteExtent(dp *DataPartition, exts []*proto.ExtentKey) *Packet {
p := new(Packet)
p.Magic = proto.ProtoMagic
p.Opcode = proto.OpBatchDeleteExtent
p.ExtentType = proto.NormalExtentType
p.PartitionID = uint64(dp.PartitionID)
p.Data, _ = json.Marshal(exts)
p.Size = uint32(len(p.Data))
p.ReqID = proto.GenerateRequestID()
p.RemainingFollowers = uint8(len(dp.Hosts) - 1)
if len(dp.Hosts) == 1 {
p.RemainingFollowers = 127
}
p.Arg = ([]byte)(dp.GetAllAddrs())
p.ArgLen = uint32(len(p.Arg))
return p
}
// NewPacketToDeleteExtent returns a new packet to delete the extent.
func NewPacketToFreeInodeOnRaftFollower(partitionID uint64, freeInodes []byte) *Packet {
p := new(Packet)
p.Magic = proto.ProtoMagic
p.Opcode = proto.OpMetaFreeInodesOnRaftFollower
p.PartitionID = partitionID
p.ExtentType = proto.NormalExtentType
p.ReqID = proto.GenerateRequestID()
p.Data = make([]byte, len(freeInodes))
copy(p.Data, freeInodes)
p.Size = uint32(len(p.Data))
return p
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"encoding/json"
"fmt"
"math"
"math/rand"
"os"
"path"
"reflect"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/blobstore/api/access"
"github.com/cubefs/cubefs/cmd/common"
raftproto "github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/raftstore"
"github.com/cubefs/cubefs/sdk/data/blobstore"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/timeutil"
)
// NOTE: if the operation is invoked by local machine
// the remote addr is "127.0.0.1"
const localAddrForAudit = "127.0.0.1"
var (
ErrIllegalHeartbeatAddress = errors.New("illegal heartbeat address")
ErrIllegalReplicateAddress = errors.New("illegal replicate address")
ErrSnapshotCrcMismatch = errors.New("snapshot crc not match")
)
// Errors
var (
ErrInodeIDOutOfRange = errors.New("inode ID out of range")
)
type sortedPeers []proto.Peer
func (sp sortedPeers) Len() int {
return len(sp)
}
func (sp sortedPeers) Less(i, j int) bool {
return sp[i].ID < sp[j].ID
}
func (sp sortedPeers) Swap(i, j int) {
sp[i], sp[j] = sp[j], sp[i]
}
// MetaMultiSnapshotInfo
type MetaMultiSnapshotInfo struct {
VerSeq uint64
Status int8
Ctime time.Time
}
// MetaPartitionConfig is used to create a meta partition.
type MetaPartitionConfig struct {
// Identity for raftStore group. RaftStore nodes in the same raftStore group must have the same groupID.
PartitionId uint64 `json:"partition_id"`
VolName string `json:"vol_name"`
Start uint64 `json:"start"` // Minimal Inode ID of this range. (Required during initialization)
End uint64 `json:"end"` // Maximal Inode ID of this range. (Required during initialization)
PartitionType int `json:"partition_type"`
Peers []proto.Peer `json:"peers"` // Peers information of the raftStore
Cursor uint64 `json:"-"` // Cursor ID of the inode that has been assigned
UniqId uint64 `json:"-"`
NodeId uint64 `json:"-"`
RootDir string `json:"-"`
VerSeq uint64 `json:"ver_seq"`
BeforeStart func() `json:"-"`
AfterStart func() `json:"-"`
BeforeStop func() `json:"-"`
AfterStop func() `json:"-"`
RaftStore raftstore.RaftStore `json:"-"`
ConnPool *util.ConnectPool `json:"-"`
Forbidden bool `json:"-"`
}
func (c *MetaPartitionConfig) checkMeta() (err error) {
if c.PartitionId <= 0 {
err = errors.NewErrorf("[checkMeta]: partition id at least 1, "+
"now partition id is: %d", c.PartitionId)
return
}
if c.Start < 0 {
err = errors.NewErrorf("[checkMeta]: start at least 0")
return
}
if c.End <= c.Start {
err = errors.NewErrorf("[checkMeta]: end=%v, "+
"start=%v; end <= start", c.End, c.Start)
return
}
if len(c.Peers) <= 0 {
err = errors.NewErrorf("[checkMeta]: must have peers, now peers is 0")
return
}
return
}
func (c *MetaPartitionConfig) sortPeers() {
sp := sortedPeers(c.Peers)
sort.Sort(sp)
}
// OpInode defines the interface for the inode operations.
type OpInode interface {
CreateInode(req *CreateInoReq, p *Packet, remoteAddr string) (err error)
UnlinkInode(req *UnlinkInoReq, p *Packet, remoteAddr string) (err error)
UnlinkInodeBatch(req *BatchUnlinkInoReq, p *Packet, remoteAddr string) (err error)
InodeGet(req *InodeGetReq, p *Packet) (err error)
InodeGetSplitEk(req *InodeGetSplitReq, p *Packet) (err error)
InodeGetBatch(req *InodeGetReqBatch, p *Packet) (err error)
CreateInodeLink(req *LinkInodeReq, p *Packet, remoteAddr string) (err error)
EvictInode(req *EvictInodeReq, p *Packet, remoteAddr string) (err error)
EvictInodeBatch(req *BatchEvictInodeReq, p *Packet, remoteAddr string) (err error)
SetAttr(req *SetattrRequest, reqData []byte, p *Packet) (err error)
GetInodeTree() *BTree
GetInodeTreeLen() int
DeleteInode(req *proto.DeleteInodeRequest, p *Packet, remoteAddr string) (err error)
DeleteInodeBatch(req *proto.DeleteInodeBatchRequest, p *Packet, remoteAddr string) (err error)
ClearInodeCache(req *proto.ClearInodeCacheRequest, p *Packet) (err error)
TxCreateInode(req *proto.TxCreateInodeRequest, p *Packet, remoteAddr string) (err error)
TxUnlinkInode(req *proto.TxUnlinkInodeRequest, p *Packet, remoteAddr string) (err error)
TxCreateInodeLink(req *proto.TxLinkInodeRequest, p *Packet, remoteAddr string) (err error)
QuotaCreateInode(req *proto.QuotaCreateInodeRequest, p *Packet, remoteAddr string) (err error)
}
type OpExtend interface {
SetXAttr(req *proto.SetXAttrRequest, p *Packet) (err error)
BatchSetXAttr(req *proto.BatchSetXAttrRequest, p *Packet) (err error)
GetXAttr(req *proto.GetXAttrRequest, p *Packet) (err error)
GetAllXAttr(req *proto.GetAllXAttrRequest, p *Packet) (err error)
BatchGetXAttr(req *proto.BatchGetXAttrRequest, p *Packet) (err error)
RemoveXAttr(req *proto.RemoveXAttrRequest, p *Packet) (err error)
ListXAttr(req *proto.ListXAttrRequest, p *Packet) (err error)
UpdateXAttr(req *proto.UpdateXAttrRequest, p *Packet) (err error)
}
// OpDentry defines the interface for the dentry operations.
type OpDentry interface {
CreateDentry(req *CreateDentryReq, p *Packet, remoteAddr string) (err error)
DeleteDentry(req *DeleteDentryReq, p *Packet, remoteAddr string) (err error)
DeleteDentryBatch(req *BatchDeleteDentryReq, p *Packet, remoteAddr string) (err error)
UpdateDentry(req *UpdateDentryReq, p *Packet, remoteAddr string) (err error)
ReadDir(req *ReadDirReq, p *Packet) (err error)
ReadDirLimit(req *ReadDirLimitReq, p *Packet) (err error)
ReadDirOnly(req *ReadDirOnlyReq, p *Packet) (err error)
Lookup(req *LookupReq, p *Packet) (err error)
GetDentryTree() *BTree
GetDentryTreeLen() int
TxCreateDentry(req *proto.TxCreateDentryRequest, p *Packet, remoteAddr string) (err error)
TxDeleteDentry(req *proto.TxDeleteDentryRequest, p *Packet, remoteAddr string) (err error)
TxUpdateDentry(req *proto.TxUpdateDentryRequest, p *Packet, remoteAddr string) (err error)
QuotaCreateDentry(req *proto.QuotaCreateDentryRequest, p *Packet, remoteAddr string) (err error)
}
type OpTransaction interface {
TxCreate(req *proto.TxCreateRequest, p *Packet) (err error)
TxCommitRM(req *proto.TxApplyRMRequest, p *Packet) error
TxRollbackRM(req *proto.TxApplyRMRequest, p *Packet) error
TxCommit(req *proto.TxApplyRequest, p *Packet, remoteAddr string) (err error)
TxRollback(req *proto.TxApplyRequest, p *Packet, remoteAddr string) (err error)
TxGetInfo(req *proto.TxGetInfoRequest, p *Packet) (err error)
TxGetCnt() (uint64, uint64, uint64)
TxGetTree() (*BTree, *BTree, *BTree)
}
// OpExtent defines the interface for the extent operations.
type OpExtent interface {
ExtentAppend(req *proto.AppendExtentKeyRequest, p *Packet) (err error)
ExtentAppendWithCheck(req *proto.AppendExtentKeyWithCheckRequest, p *Packet) (err error)
BatchObjExtentAppend(req *proto.AppendObjExtentKeysRequest, p *Packet) (err error)
ExtentsList(req *proto.GetExtentsRequest, p *Packet) (err error)
ObjExtentsList(req *proto.GetExtentsRequest, p *Packet) (err error)
ExtentsTruncate(req *ExtentsTruncateReq, p *Packet, remoteAddr string) (err error)
BatchExtentAppend(req *proto.AppendExtentKeysRequest, p *Packet) (err error)
// ExtentsDelete(req *proto.DelExtentKeyRequest, p *Packet) (err error)
}
type OpMultipart interface {
GetMultipart(req *proto.GetMultipartRequest, p *Packet) (err error)
CreateMultipart(req *proto.CreateMultipartRequest, p *Packet) (err error)
AppendMultipart(req *proto.AddMultipartPartRequest, p *Packet) (err error)
RemoveMultipart(req *proto.RemoveMultipartRequest, p *Packet) (err error)
ListMultipart(req *proto.ListMultipartRequest, p *Packet) (err error)
GetUidInfo() (info []*proto.UidReportSpaceInfo)
SetUidLimit(info []*proto.UidSpaceInfo)
SetTxInfo(info []*proto.TxInfo)
GetExpiredMultipart(req *proto.GetExpiredMultipartRequest, p *Packet) (err error)
}
// MultiVersion operation from master or client
type OpMultiVersion interface {
GetVerSeq() uint64
GetVerList() []*proto.VolVersionInfo
GetAllVerList() []*proto.VolVersionInfo
HandleVersionOp(op uint8, verSeq uint64, verList []*proto.VolVersionInfo, sync bool) (err error)
fsmVersionOp(reqData []byte) (err error)
GetAllVersionInfo(req *proto.MultiVersionOpRequest, p *Packet) (err error)
GetSpecVersionInfo(req *proto.MultiVersionOpRequest, p *Packet) (err error)
GetExtentByVer(ino *Inode, req *proto.GetExtentsRequest, rsp *proto.GetExtentsResponse)
checkVerList(info *proto.VolVersionInfoList, sync bool) (needUpdate bool, err error)
checkByMasterVerlist(mpVerList *proto.VolVersionInfoList, masterVerList *proto.VolVersionInfoList) (err error)
}
// OpMeta defines the interface for the metadata operations.
type OpMeta interface {
OpInode
OpDentry
OpExtent
OpPartition
OpExtend
OpMultipart
OpTransaction
OpQuota
OpMultiVersion
}
// OpPartition defines the interface for the partition operations.
type OpPartition interface {
GetVolName() (volName string)
IsLeader() (leaderAddr string, isLeader bool)
LeaderTerm() (leaderID, term uint64)
IsFollowerRead() bool
SetFollowerRead(bool)
GetCursor() uint64
GetUniqId() uint64
GetBaseConfig() MetaPartitionConfig
ResponseLoadMetaPartition(p *Packet) (err error)
PersistMetadata() (err error)
RenameStaleMetadata() (err error)
ChangeMember(changeType raftproto.ConfChangeType, peer raftproto.Peer, context []byte) (resp interface{}, err error)
Reset() (err error)
UpdatePartition(req *UpdatePartitionReq, resp *UpdatePartitionResp) (err error)
DeleteRaft() error
IsExsitPeer(peer proto.Peer) bool
TryToLeader(groupID uint64) error
CanRemoveRaftMember(peer proto.Peer) error
IsEquareCreateMetaPartitionRequst(request *proto.CreateMetaPartitionRequest) (err error)
GetUniqID(p *Packet, num uint32) (err error)
}
// MetaPartition defines the interface for the meta partition operations.
type MetaPartition interface {
Start(isCreate bool) error
Stop()
DataSize() uint64
GetFreeListLen() int
OpMeta
LoadSnapshot(path string) error
ForceSetMetaPartitionToLoadding()
ForceSetMetaPartitionToFininshLoad()
IsForbidden() bool
SetForbidden(status bool)
IsEnableAuditLog() bool
SetEnableAuditLog(status bool)
}
type UidManager struct {
accumDelta *sync.Map
accumBase *sync.Map
accumRebuildDelta *sync.Map // snapshot redoLog
accumRebuildBase *sync.Map // snapshot mirror
uidAcl *sync.Map
lastUpdateTime time.Time
enable bool
rbuilding bool
volName string
acLock sync.RWMutex
mpID uint64
}
func NewUidMgr(volName string, mpID uint64) (mgr *UidManager) {
mgr = &UidManager{
volName: volName,
mpID: mpID,
accumDelta: new(sync.Map),
accumBase: new(sync.Map),
accumRebuildDelta: new(sync.Map),
accumRebuildBase: new(sync.Map),
uidAcl: new(sync.Map),
}
var uid uint32
mgr.uidAcl.Store(uid, false)
log.LogDebugf("NewUidMgr init")
return
}
func (uMgr *UidManager) addUidSpace(uid uint32, inode uint64, eks []proto.ExtentKey) (status uint8) {
uMgr.acLock.Lock()
defer uMgr.acLock.Unlock()
status = proto.OpOk
if uMgr.getUidAcl(uid) {
log.LogWarnf("addUidSpace.volname [%v] mp[%v] uid %v be set full", uMgr.mpID, uMgr.volName, uid)
return proto.OpNoSpaceErr
}
if eks == nil {
return
}
var size int64
for _, ek := range eks {
size += int64(ek.Size)
}
if val, ok := uMgr.accumDelta.Load(uid); ok {
size += val.(int64)
}
uMgr.accumDelta.Store(uid, size)
if uMgr.rbuilding {
if val, ok := uMgr.accumRebuildDelta.Load(uid); ok {
size += val.(int64)
}
uMgr.accumRebuildDelta.Store(uid, size)
}
return
}
func (uMgr *UidManager) doMinusUidSpace(uid uint32, inode uint64, size uint64) {
uMgr.acLock.Lock()
defer uMgr.acLock.Unlock()
doWork := func(delta *sync.Map) {
var rsvSize int64
if val, ok := delta.Load(uid); ok {
delta.Store(uid, val.(int64)-int64(size))
} else {
rsvSize -= int64(size)
delta.Store(uid, rsvSize)
}
}
doWork(uMgr.accumDelta)
if uMgr.rbuilding {
doWork(uMgr.accumRebuildDelta)
}
}
func (uMgr *UidManager) minusUidSpace(uid uint32, inode uint64, eks []proto.ExtentKey) {
var size uint64
for _, ek := range eks {
size += uint64(ek.Size)
}
uMgr.doMinusUidSpace(uid, inode, size)
}
func (uMgr *UidManager) getUidAcl(uid uint32) (enable bool) {
if val, ok := uMgr.uidAcl.Load(uid); ok {
enable = val.(bool)
}
return
}
func (uMgr *UidManager) setUidAcl(info []*proto.UidSpaceInfo) {
uMgr.acLock.Lock()
defer uMgr.acLock.Unlock()
uMgr.uidAcl = new(sync.Map)
for _, uidInfo := range info {
if uidInfo.VolName != uMgr.volName {
continue
}
// log.LogDebugf("setUidAcl.volname [%v] uid %v be set enable %v", uMgr.volName, uidInfo.Uid, uidInfo.Limited)
uMgr.uidAcl.Store(uidInfo.Uid, uidInfo.Limited)
}
}
func (uMgr *UidManager) getAllUidSpace() (rsp []*proto.UidReportSpaceInfo) {
uMgr.acLock.RLock()
defer uMgr.acLock.RUnlock()
var ok bool
uMgr.accumDelta.Range(func(key, value interface{}) bool {
var size int64
size += value.(int64)
if baseInfo, ok := uMgr.accumBase.Load(key.(uint32)); ok {
size += baseInfo.(int64)
if size < 0 {
log.LogErrorf("getAllUidSpace. mp[%v] uid %v size small than 0 %v, old %v, new %v", uMgr.mpID, key.(uint32), size, value.(int64), baseInfo.(int64))
return false
}
}
uMgr.accumBase.Store(key.(uint32), size)
return true
})
uMgr.accumDelta = new(sync.Map)
uMgr.accumBase.Range(func(key, value interface{}) bool {
var size int64
if size, ok = value.(int64); !ok {
log.LogErrorf("getAllUidSpace. mp[%v] accumBase key %v size type %v", uMgr.mpID, reflect.TypeOf(key), reflect.TypeOf(value))
return false
}
rsp = append(rsp, &proto.UidReportSpaceInfo{
Uid: key.(uint32),
Size: uint64(size),
})
// log.LogDebugf("getAllUidSpace. mp[%v] accumBase uid %v size %v", uMgr.mpID, key.(uint32), size)
return true
})
return
}
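// Editorial note (not part of the original source): getAllUidSpace folds the per-uid
// deltas accumulated since the last report into accumBase and then resets accumDelta,
// so accumBase carries the running total per uid while accumDelta only holds changes
// made since the previous call; the returned report is therefore cumulative.
//
//	mgr := NewUidMgr("vol1", 10) // hypothetical volume name and mp id
//	mgr.addUidSpace(1000, 1, []proto.ExtentKey{{Size: 4096}})
//	infos := mgr.getAllUidSpace() // uid 1000 reports 4096 bytes
//	mgr.addUidSpace(1000, 2, []proto.ExtentKey{{Size: 4096}})
//	infos = mgr.getAllUidSpace() // uid 1000 now reports 8192 bytes
//	_ = infos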
func (uMgr *UidManager) accumRebuildStart() bool {
uMgr.acLock.Lock()
defer uMgr.acLock.Unlock()
log.LogDebugf("accumRebuildStart vol [%v] mp[%v] rbuilding [%v]", uMgr.volName, uMgr.mpID, uMgr.rbuilding)
if uMgr.rbuilding {
return false
}
uMgr.rbuilding = true
return true
}
func (uMgr *UidManager) accumRebuildFin(rebuild bool) {
uMgr.acLock.Lock()
defer uMgr.acLock.Unlock()
log.LogDebugf("accumRebuildFin rebuild volname [%v], mp:[%v],%v:%v, rebuild:[%v]", uMgr.volName, uMgr.mpID,
uMgr.accumRebuildBase, uMgr.accumRebuildDelta, rebuild)
uMgr.rbuilding = false
if !rebuild {
uMgr.accumRebuildBase = new(sync.Map)
uMgr.accumRebuildDelta = new(sync.Map)
return
}
uMgr.accumBase = uMgr.accumRebuildBase
uMgr.accumDelta = uMgr.accumRebuildDelta
uMgr.accumRebuildBase = new(sync.Map)
uMgr.accumRebuildDelta = new(sync.Map)
}
func (uMgr *UidManager) accumInoUidSize(ino *Inode, accum *sync.Map) {
size := ino.GetSpaceSize()
if val, ok := accum.Load(ino.Uid); ok {
size += uint64(val.(int64))
}
accum.Store(ino.Uid, int64(size))
}
type OpQuota interface {
setQuotaHbInfo(infos []*proto.QuotaHeartBeatInfo)
getQuotaReportInfos() (infos []*proto.QuotaReportInfo)
batchSetInodeQuota(req *proto.BatchSetMetaserverQuotaReuqest,
resp *proto.BatchSetMetaserverQuotaResponse) (err error)
batchDeleteInodeQuota(req *proto.BatchDeleteMetaserverQuotaReuqest,
resp *proto.BatchDeleteMetaserverQuotaResponse) (err error)
getInodeQuota(inode uint64, p *Packet) (err error)
}
// metaPartition manages the range of the inode IDs.
// When a new inode is requested, it allocates a new inode id for this inode if possible.
// States:
//
// +-----+             +-------+
// | New | → Restore → | Ready |
// +-----+             +-------+
type metaPartition struct {
config *MetaPartitionConfig
size uint64 // total size of all files in the partition
applyID uint64 // Inode/Dentry max applyID, this index will be updated after restoring from the dumped data.
storedApplyId uint64 // update after store snapshot to disk
dentryTree *BTree // btree for dentries
inodeTree *BTree // btree for inodes
extendTree *BTree // btree for inode extend (XAttr) management
multipartTree *BTree // collection for multipart management
txProcessor *TransactionProcessor // transaction processor
raftPartition raftstore.Partition
stopC chan bool
storeChan chan *storeMsg
state uint32
delInodeFp *os.File
freeList *freeList // free inode list
extDelCh chan []proto.ExtentKey
extReset chan struct{}
vol *Vol
manager *metadataManager
isLoadingMetaPartition bool
summaryLock sync.Mutex
ebsClient *blobstore.BlobStoreClient
volType int
isFollowerRead bool
uidManager *UidManager
xattrLock sync.Mutex
fileRange []int64
mqMgr *MetaQuotaManager
nonIdempotent sync.Mutex
uniqChecker *uniqChecker
verSeq uint64
multiVersionList *proto.VolVersionInfoList
versionLock sync.Mutex
verUpdateChan chan []byte
enableAuditLog bool
}
func (mp *metaPartition) IsForbidden() bool {
return mp.config.Forbidden
}
func (mp *metaPartition) SetForbidden(status bool) {
mp.config.Forbidden = status
}
func (mp *metaPartition) IsEnableAuditLog() bool {
return mp.enableAuditLog
}
func (mp *metaPartition) SetEnableAuditLog(status bool) {
mp.enableAuditLog = status
}
func (mp *metaPartition) acucumRebuildStart() bool {
return mp.uidManager.accumRebuildStart()
}
func (mp *metaPartition) acucumRebuildFin(rebuild bool) {
mp.uidManager.accumRebuildFin(rebuild)
}
func (mp *metaPartition) acucumUidSizeByStore(ino *Inode) {
mp.uidManager.accumInoUidSize(ino, mp.uidManager.accumRebuildBase)
}
func (mp *metaPartition) acucumUidSizeByLoad(ino *Inode) {
mp.uidManager.accumInoUidSize(ino, mp.uidManager.accumBase)
}
func (mp *metaPartition) GetVerList() []*proto.VolVersionInfo {
mp.multiVersionList.RWLock.RLock()
defer mp.multiVersionList.RWLock.RUnlock()
verList := make([]*proto.VolVersionInfo, len(mp.multiVersionList.VerList))
copy(verList, mp.multiVersionList.VerList)
return verList
}
// include TemporaryVerMap, otherwise temporary versions cannot be recycled after restart
func (mp *metaPartition) GetAllVerList() (verList []*proto.VolVersionInfo) {
mp.multiVersionList.RWLock.RLock()
defer mp.multiVersionList.RWLock.RUnlock()
verList = make([]*proto.VolVersionInfo, len(mp.multiVersionList.VerList))
copy(verList, mp.multiVersionList.VerList)
for _, verInfo := range mp.multiVersionList.TemporaryVerMap {
verList = append(verList, verInfo)
}
sort.SliceStable(verList, func(i, j int) bool {
if verList[i].Ver < verList[j].Ver {
return true
}
return false
})
return
}
func (mp *metaPartition) updateSize() {
timer := time.NewTicker(time.Minute * 2)
go func() {
for {
select {
case <-timer.C:
size := uint64(0)
mp.inodeTree.GetTree().Ascend(func(item BtreeItem) bool {
inode := item.(*Inode)
size += inode.Size
return true
})
mp.size = size
log.LogDebugf("[updateSize] update mp[%v] size(%d) success,inodeCount(%d),dentryCount(%d)", mp.config.PartitionId, size, mp.inodeTree.Len(), mp.dentryTree.Len())
case <-mp.stopC:
log.LogDebugf("[updateSize] stop update mp[%v] size,inodeCount(%d),dentryCount(%d)", mp.config.PartitionId, mp.inodeTree.Len(), mp.dentryTree.Len())
return
}
}
}()
}
func (mp *metaPartition) ForceSetMetaPartitionToLoadding() {
mp.isLoadingMetaPartition = true
}
func (mp *metaPartition) ForceSetMetaPartitionToFininshLoad() {
mp.isLoadingMetaPartition = false
}
func (mp *metaPartition) DataSize() uint64 {
return mp.size
}
func (mp *metaPartition) GetFreeListLen() int {
return mp.freeList.Len()
}
// Start starts a meta partition.
func (mp *metaPartition) Start(isCreate bool) (err error) {
if atomic.CompareAndSwapUint32(&mp.state, common.StateStandby, common.StateStart) {
defer func() {
var newState uint32
if err != nil {
newState = common.StateStandby
} else {
newState = common.StateRunning
}
atomic.StoreUint32(&mp.state, newState)
}()
if mp.config.BeforeStart != nil {
mp.config.BeforeStart()
}
if err = mp.onStart(isCreate); err != nil {
err = errors.NewErrorf("[Start]->%s", err.Error())
return
}
if mp.config.AfterStart != nil {
mp.config.AfterStart()
}
}
return
}
// Stop stops a meta partition.
func (mp *metaPartition) Stop() {
if atomic.CompareAndSwapUint32(&mp.state, common.StateRunning, common.StateShutdown) {
defer atomic.StoreUint32(&mp.state, common.StateStopped)
if mp.config.BeforeStop != nil {
mp.config.BeforeStop()
}
mp.onStop()
if mp.config.AfterStop != nil {
mp.config.AfterStop()
log.LogDebugf("[AfterStop]: partition id=%d execute ok.",
mp.config.PartitionId)
}
}
}
func (mp *metaPartition) versionInit(isCreate bool) (err error) {
if !isCreate {
return
}
var verList *proto.VolVersionInfoList
verList, err = masterClient.AdminAPI().GetVerList(mp.config.VolName)
if err != nil {
log.LogErrorf("action[onStart] GetVerList err[%v]", err)
return
}
for _, info := range verList.VerList {
if info.Status != proto.VersionNormal {
continue
}
mp.multiVersionList.VerList = append(mp.multiVersionList.VerList, info)
}
log.LogDebugf("action[onStart] mp[%v] verList %v", mp.config.PartitionId, mp.multiVersionList.VerList)
vlen := len(mp.multiVersionList.VerList)
if vlen > 0 {
mp.verSeq = mp.multiVersionList.VerList[vlen-1].Ver
}
return
}
func (mp *metaPartition) onStart(isCreate bool) (err error) {
defer func() {
if err == nil {
return
}
mp.onStop()
}()
if err = mp.versionInit(isCreate); err != nil {
return
}
if err = mp.load(isCreate); err != nil {
err = errors.NewErrorf("[onStart] load partition id=%d: %s",
mp.config.PartitionId, err.Error())
return
}
mp.startScheduleTask()
if err = mp.startFreeList(); err != nil {
err = errors.NewErrorf("[onStart] start free list id=%d: %s",
mp.config.PartitionId, err.Error())
return
}
// set EBS Client
if clusterInfo, err = masterClient.AdminAPI().GetClusterInfo(); err != nil {
log.LogErrorf("action[onStart] GetClusterInfo err[%v]", err)
return
}
var volumeInfo *proto.SimpleVolView
if volumeInfo, err = masterClient.AdminAPI().GetVolumeSimpleInfo(mp.config.VolName); err != nil {
log.LogErrorf("action[onStart] GetVolumeSimpleInfo err[%v]", err)
return
}
mp.vol.volDeleteLockTime = volumeInfo.DeleteLockTime
go mp.runVersionOp()
mp.volType = volumeInfo.VolType
var ebsClient *blobstore.BlobStoreClient
if clusterInfo.EbsAddr != "" && proto.IsCold(mp.volType) {
ebsClient, err = blobstore.NewEbsClient(
access.Config{
ConnMode: access.NoLimitConnMode,
Consul: access.ConsulConfig{
Address: clusterInfo.EbsAddr,
},
MaxSizePutOnce: int64(volumeInfo.ObjBlockSize),
Logger: &access.Logger{Filename: path.Join(log.LogDir, "ebs.log")},
},
)
if err != nil {
log.LogErrorf("action[onStart] err[%v]", err)
return
}
if ebsClient == nil {
err = errors.NewErrorf("[onStart] ebsClient is nil")
return
}
mp.ebsClient = ebsClient
}
go mp.startCheckerEvict()
log.LogDebugf("[before raft] get mp[%v] applied(%d),inodeCount(%d),dentryCount(%d)", mp.config.PartitionId, mp.applyID, mp.inodeTree.Len(), mp.dentryTree.Len())
if err = mp.startRaft(); err != nil {
err = errors.NewErrorf("[onStart] start raft id=%d: %s",
mp.config.PartitionId, err.Error())
return
}
log.LogDebugf("[after raft] get mp[%v] applied(%d),inodeCount(%d),dentryCount(%d)", mp.config.PartitionId, mp.applyID, mp.inodeTree.Len(), mp.dentryTree.Len())
mp.updateSize()
if proto.IsHot(mp.volType) {
log.LogInfof("hot vol not need cacheTTL")
go mp.multiVersionTTLWork(time.Minute)
return
}
// do cache TTL die out process
if err = mp.cacheTTLWork(); err != nil {
err = errors.NewErrorf("[onStart] start CacheTTLWork id=%d: %s",
mp.config.PartitionId, err.Error())
return
}
return
}
func (mp *metaPartition) startScheduleTask() {
mp.startSchedule(mp.applyID)
mp.startFileStats()
}
func (mp *metaPartition) onStop() {
mp.stopRaft()
mp.stop()
if mp.delInodeFp != nil {
mp.delInodeFp.Sync()
mp.delInodeFp.Close()
}
}
func (mp *metaPartition) startRaft() (err error) {
var (
heartbeatPort int
replicaPort int
peers []raftstore.PeerAddress
)
if heartbeatPort, replicaPort, err = mp.getRaftPort(); err != nil {
return
}
for _, peer := range mp.config.Peers {
addr := strings.Split(peer.Addr, ":")[0]
rp := raftstore.PeerAddress{
Peer: raftproto.Peer{
ID: peer.ID,
},
Address: addr,
HeartbeatPort: heartbeatPort,
ReplicaPort: replicaPort,
}
peers = append(peers, rp)
}
log.LogInfof("start partition id=%d,applyID:%v raft peers: %s",
mp.config.PartitionId, mp.applyID, peers)
pc := &raftstore.PartitionConfig{
ID: mp.config.PartitionId,
Applied: mp.applyID,
Peers: peers,
SM: mp,
}
mp.raftPartition, err = mp.config.RaftStore.CreatePartition(pc)
if err == nil {
mp.ForceSetMetaPartitionToFininshLoad()
}
return
}
func (mp *metaPartition) stopRaft() {
if mp.raftPartition != nil {
// TODO Unhandled errors
// mp.raftPartition.Stop()
}
return
}
func (mp *metaPartition) getRaftPort() (heartbeat, replica int, err error) {
raftConfig := mp.config.RaftStore.RaftConfig()
heartbeatAddrSplits := strings.Split(raftConfig.HeartbeatAddr, ":")
replicaAddrSplits := strings.Split(raftConfig.ReplicateAddr, ":")
if len(heartbeatAddrSplits) != 2 {
err = ErrIllegalHeartbeatAddress
return
}
if len(replicaAddrSplits) != 2 {
err = ErrIllegalReplicateAddress
return
}
heartbeat, err = strconv.Atoi(heartbeatAddrSplits[1])
if err != nil {
return
}
replica, err = strconv.Atoi(replicaAddrSplits[1])
if err != nil {
return
}
return
}
// NewMetaPartition creates a new meta partition with the specified configuration.
func NewMetaPartition(conf *MetaPartitionConfig, manager *metadataManager) MetaPartition {
mp := &metaPartition{
config: conf,
dentryTree: NewBtree(),
inodeTree: NewBtree(),
extendTree: NewBtree(),
multipartTree: NewBtree(),
stopC: make(chan bool),
storeChan: make(chan *storeMsg, 100),
freeList: newFreeList(),
extDelCh: make(chan []proto.ExtentKey, defaultDelExtentsCnt),
extReset: make(chan struct{}),
vol: NewVol(),
manager: manager,
uniqChecker: newUniqChecker(),
verSeq: conf.VerSeq,
multiVersionList: &proto.VolVersionInfoList{
TemporaryVerMap: make(map[uint64]*proto.VolVersionInfo),
},
enableAuditLog: true,
}
mp.txProcessor = NewTransactionProcessor(mp)
return mp
}
func (mp *metaPartition) GetVolName() (volName string) {
return mp.config.VolName
}
func (mp *metaPartition) GetVerSeq() uint64 {
return atomic.LoadUint64(&mp.verSeq)
}
// SetFollowerRead enables or disables follower read on this meta partition.
func (mp *metaPartition) SetFollowerRead(fRead bool) {
if mp.raftPartition == nil {
return
}
mp.isFollowerRead = fRead
return
}
// IsFollowerRead returns whether this meta partition currently serves follower reads.
func (mp *metaPartition) IsFollowerRead() (ok bool) {
if mp.raftPartition == nil {
return false
}
if !mp.isFollowerRead {
return false
}
if mp.raftPartition.IsRestoring() {
return false
}
return true
}
// IsLeader returns the raft leader address and if the current meta partition is the leader.
func (mp *metaPartition) IsLeader() (leaderAddr string, ok bool) {
if mp.raftPartition == nil {
return
}
leaderID, _ := mp.raftPartition.LeaderTerm()
if leaderID == 0 {
return
}
ok = leaderID == mp.config.NodeId
for _, peer := range mp.config.Peers {
if leaderID == peer.ID {
leaderAddr = peer.Addr
return
}
}
return
}
func (mp *metaPartition) LeaderTerm() (leaderID, term uint64) {
if mp.raftPartition == nil {
return
}
return mp.raftPartition.LeaderTerm()
}
func (mp *metaPartition) GetPeers() (peers []string) {
peers = make([]string, 0)
for _, peer := range mp.config.Peers {
if mp.config.NodeId == peer.ID {
continue
}
peers = append(peers, peer.Addr)
}
return
}
// GetCursor returns the cursor stored in the config.
func (mp *metaPartition) GetCursor() uint64 {
return atomic.LoadUint64(&mp.config.Cursor)
}
// GetUniqId returns the uniqid stored in the config.
func (mp *metaPartition) GetUniqId() uint64 {
return atomic.LoadUint64(&mp.config.UniqId)
}
// PersistMetadata is the wrapper of persistMetadata.
func (mp *metaPartition) PersistMetadata() (err error) {
mp.config.sortPeers()
err = mp.persistMetadata()
return
}
// RenameStaleMetadata is the wrapper of renameStaleMetadata; it backs up the stale partition data to partition.old.
func (mp *metaPartition) RenameStaleMetadata() (err error) {
err = mp.renameStaleMetadata()
return
}
func (mp *metaPartition) parseCrcFromFile() ([]uint32, error) {
data, err := os.ReadFile(path.Join(path.Join(mp.config.RootDir, snapshotDir), SnapshotSign))
if err != nil {
return nil, err
}
raw := string(data)
crcStrs := strings.Split(raw, " ")
crcs := make([]uint32, 0, len(crcStrs))
for _, crcStr := range crcStrs {
crc, err := strconv.ParseUint(crcStr, 10, 32)
if err != nil {
return nil, err
}
crcs = append(crcs, uint32(crc))
}
return crcs, nil
}
const (
CRC_COUNT_BASIC int = 4
CRC_COUNT_TX_STUFF int = 7
CRC_COUNT_UINQ_STUFF int = 8
CRC_COUNT_MULTI_VER int = 9
)
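// Editorial note: these counts mirror how many CRC values the SnapshotSign file
// carries, which follows the write order of storeFuncs in store():
//
//	index 0: inode tree       index 4: tx info             index 8: multi-version list
//	index 1: dentry tree      index 5: tx rollback inode
//	index 2: extend tree      index 6: tx rollback dentry
//	index 3: multipart tree   index 7: uniq checker
//
// Snapshots written before a feature existed simply carry fewer values, which
// is why LoadSnapshot() accepts exactly 4, 7, 8 or 9 CRCs.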
func (mp *metaPartition) LoadSnapshot(snapshotPath string) (err error) {
crcs, err := mp.parseCrcFromFile()
if err != nil {
return err
}
loadFuncs := []func(rootDir string, crc uint32) error{
mp.loadInode,
mp.loadDentry,
nil, // loading quota info from the extend tree requires mp.loadInode() to have completed, so mp.loadExtend() is skipped here and run after the other loaders finish
mp.loadMultipart,
}
crc_count := len(crcs)
if crc_count != CRC_COUNT_BASIC && crc_count != CRC_COUNT_TX_STUFF && crc_count != CRC_COUNT_UINQ_STUFF && crc_count != CRC_COUNT_MULTI_VER {
log.LogErrorf("action[LoadSnapshot] crc array length %d not match", len(crcs))
return ErrSnapshotCrcMismatch
}
// handle compatibility in upgrade scenarios
needLoadTxStuff := false
needLoadUniqStuff := false
if crc_count >= CRC_COUNT_TX_STUFF {
needLoadTxStuff = true
loadFuncs = append(loadFuncs, mp.loadTxInfo)
loadFuncs = append(loadFuncs, mp.loadTxRbInode)
loadFuncs = append(loadFuncs, mp.loadTxRbDentry)
}
if crc_count >= CRC_COUNT_UINQ_STUFF {
needLoadUniqStuff = true
loadFuncs = append(loadFuncs, mp.loadUniqChecker)
}
if crc_count == CRC_COUNT_MULTI_VER {
if err = mp.loadMultiVer(snapshotPath, crcs[CRC_COUNT_MULTI_VER-1]); err != nil {
return
}
} else {
mp.storeMultiVersion(snapshotPath, &storeMsg{multiVerList: mp.multiVersionList.VerList})
}
errs := make([]error, len(loadFuncs))
var wg sync.WaitGroup
wg.Add(len(loadFuncs))
for idx, f := range loadFuncs {
loadFunc := f
if f == nil {
wg.Done()
continue
}
i := idx
go func() {
defer func() {
if r := recover(); r != nil {
log.LogWarnf("action[LoadSnapshot] recovered when load partition partition: %v, failed: %v",
mp.config.PartitionId, r)
errs[i] = errors.NewErrorf("%v", r)
}
wg.Done()
}()
errs[i] = loadFunc(snapshotPath, crcs[i])
}()
}
wg.Wait()
log.LogDebugf("[load meta finish] get mp[%v] inodeCount(%d),dentryCount(%d)", mp.config.PartitionId, mp.inodeTree.Len(), mp.dentryTree.Len())
for _, err = range errs {
if err != nil {
return
}
}
if err = mp.loadExtend(snapshotPath, crcs[2]); err != nil {
return
}
if needLoadTxStuff {
if err = mp.loadTxID(snapshotPath); err != nil {
return
}
}
if needLoadUniqStuff {
if err = mp.loadUniqID(snapshotPath); err != nil {
return
}
}
if err = mp.loadApplyID(snapshotPath); err != nil {
return
}
return
}
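// Editorial sketch (not part of the original source): LoadSnapshot fans the
// per-tree loaders out into goroutines, collects one error per loader in a
// pre-sized slice, and recovers panics inside each goroutine so one corrupted
// snapshot file cannot crash the whole load. Reduced to its core, with loaders
// as a stand-in for the real load functions:
//
//	errs := make([]error, len(loaders))
//	var wg sync.WaitGroup
//	wg.Add(len(loaders))
//	for idx, f := range loaders {
//		i, loadFunc := idx, f // capture loop variables for the goroutine
//		go func() {
//			defer func() {
//				if r := recover(); r != nil {
//					errs[i] = fmt.Errorf("loader panic: %v", r)
//				}
//				wg.Done()
//			}()
//			errs[i] = loadFunc()
//		}()
//	}
//	wg.Wait() // afterwards, the first non-nil entry in errs is reported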
func (mp *metaPartition) load(isCreate bool) (err error) {
if err = mp.loadMetadata(); err != nil {
return
}
// 1. a newly created metaPartition has no snapshot to load
// 2. still store the snapshot files for the new mp, because
// mp.load() checks all the snapshot files when the metanode starts up
if isCreate {
if err = mp.storeSnapshotFiles(); err != nil {
err = errors.NewErrorf("[onStart] storeSnapshotFiles for partition id=%d: %s",
mp.config.PartitionId, err.Error())
}
return
}
snapshotPath := path.Join(mp.config.RootDir, snapshotDir)
if _, err = os.Stat(snapshotPath); err != nil {
log.LogErrorf("load snapshot failed, err: %s", err.Error())
return nil
}
return mp.LoadSnapshot(snapshotPath)
}
func (mp *metaPartition) store(sm *storeMsg) (err error) {
log.LogWarnf("metaPartition %d store apply %v", mp.config.PartitionId, sm.applyIndex)
tmpDir := path.Join(mp.config.RootDir, snapshotDirTmp)
if _, err = os.Stat(tmpDir); err == nil {
// TODO Unhandled errors
os.RemoveAll(tmpDir)
}
err = nil
if err = os.MkdirAll(tmpDir, 0o775); err != nil {
return
}
defer func() {
if err != nil {
// TODO Unhandled errors
os.RemoveAll(tmpDir)
}
}()
crcBuffer := bytes.NewBuffer(make([]byte, 0, 16))
storeFuncs := []func(dir string, sm *storeMsg) (uint32, error){
mp.storeInode,
mp.storeDentry,
mp.storeExtend,
mp.storeMultipart,
mp.storeTxInfo,
mp.storeTxRbInode,
mp.storeTxRbDentry,
mp.storeUniqChecker,
mp.storeMultiVersion,
}
for _, storeFunc := range storeFuncs {
var crc uint32
if crc, err = storeFunc(tmpDir, sm); err != nil {
return
}
if crcBuffer.Len() != 0 {
crcBuffer.WriteString(" ")
}
crcBuffer.WriteString(fmt.Sprintf("%d", crc))
}
log.LogWarnf("metaPartition %d store apply %v", mp.config.PartitionId, sm.applyIndex)
if err = mp.storeApplyID(tmpDir, sm); err != nil {
return
}
if err = mp.storeTxID(tmpDir, sm); err != nil {
return
}
if err = mp.storeUniqID(tmpDir, sm); err != nil {
return
}
// write crc to file
if err = os.WriteFile(path.Join(tmpDir, SnapshotSign), crcBuffer.Bytes(), 0o775); err != nil {
return
}
snapshotDir := path.Join(mp.config.RootDir, snapshotDir)
// check snapshot backup
backupDir := path.Join(mp.config.RootDir, snapshotBackup)
if _, err = os.Stat(backupDir); err == nil {
if err = os.RemoveAll(backupDir); err != nil {
return
}
}
err = nil
// rename snapshot
if _, err = os.Stat(snapshotDir); err == nil {
if err = os.Rename(snapshotDir, backupDir); err != nil {
return
}
}
err = nil
if err = os.Rename(tmpDir, snapshotDir); err != nil {
_ = os.Rename(backupDir, snapshotDir)
return
}
err = os.RemoveAll(backupDir)
if err != nil {
return
}
mp.storedApplyId = sm.applyIndex
return
}
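// Editorial note: store() keeps the on-disk snapshot swap as safe as plain
// directory renames allow. The sequence under RootDir (using the snapshotDirTmp,
// snapshotDir and snapshotBackup names defined elsewhere in this package) is:
//
//	1. write trees, sign file, applyID, txID and uniqID into the tmp dir
//	2. remove any stale backup dir
//	3. rename the current snapshot dir to the backup dir (keep the last good copy)
//	4. rename the tmp dir to the snapshot dir; on failure, restore the backup
//	5. remove the backup dir and record sm.applyIndex as storedApplyId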
// UpdatePeers updates the peers.
func (mp *metaPartition) UpdatePeers(peers []proto.Peer) {
mp.config.Peers = peers
}
// DeleteRaft deletes the raft partition.
func (mp *metaPartition) DeleteRaft() (err error) {
err = mp.raftPartition.Delete()
return
}
// nextInodeID returns a new inode ID and advances the cursor.
func (mp *metaPartition) nextInodeID() (inodeId uint64, err error) {
for {
cur := atomic.LoadUint64(&mp.config.Cursor)
end := mp.config.End
if cur >= end {
log.LogWarnf("nextInodeID: can't create inode again, cur %d, end %d", cur, end)
return 0, ErrInodeIDOutOfRange
}
newId := cur + 1
if atomic.CompareAndSwapUint64(&mp.config.Cursor, cur, newId) {
return newId, nil
}
}
}
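// Editorial sketch (not part of the original source): nextInodeID allocates IDs
// with a lock-free compare-and-swap loop over config.Cursor, bounded by
// config.End. The same idiom in isolation, where cursor, end and errOutOfRange
// stand in for the real fields and error:
//
//	for {
//		cur := atomic.LoadUint64(&cursor)
//		if cur >= end {
//			return 0, errOutOfRange
//		}
//		if atomic.CompareAndSwapUint64(&cursor, cur, cur+1) {
//			return cur + 1, nil // this goroutine won the slot
//		}
//		// another allocator advanced the cursor first; retry
//	}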
// ChangeMember changes the raft member with the specified one.
func (mp *metaPartition) ChangeMember(changeType raftproto.ConfChangeType, peer raftproto.Peer, context []byte) (resp interface{}, err error) {
resp, err = mp.raftPartition.ChangeMember(changeType, peer, context)
return
}
// GetBaseConfig returns the configuration stored in the meta partition. TODO remove? no usage?
func (mp *metaPartition) GetBaseConfig() MetaPartitionConfig {
return *mp.config
}
// UpdatePartition updates the meta partition. TODO remove? no usage?
func (mp *metaPartition) UpdatePartition(req *UpdatePartitionReq,
resp *UpdatePartitionResp) (err error) {
reqData, err := json.Marshal(req)
if err != nil {
resp.Status = proto.TaskFailed
resp.Result = err.Error()
return
}
r, err := mp.submit(opFSMUpdatePartition, reqData)
if err != nil {
resp.Status = proto.TaskFailed
resp.Result = err.Error()
return
}
if status := r.(uint8); status != proto.OpOk {
resp.Status = proto.TaskFailed
p := &Packet{}
p.ResultCode = status
err = errors.NewErrorf("[UpdatePartition]: %s", p.GetResultMsg())
resp.Result = p.GetResultMsg()
// return early so the failed status is not overwritten below
return
}
resp.Status = proto.TaskSucceeds
return
}
func (mp *metaPartition) DecommissionPartition(req []byte) (err error) {
_, err = mp.submit(opFSMDecommissionPartition, req)
return
}
func (mp *metaPartition) IsExsitPeer(peer proto.Peer) bool {
for _, hasExsitPeer := range mp.config.Peers {
if hasExsitPeer.Addr == peer.Addr && hasExsitPeer.ID == peer.ID {
return true
}
}
return false
}
func (mp *metaPartition) TryToLeader(groupID uint64) error {
return mp.raftPartition.TryToLeader(groupID)
}
// ResponseLoadMetaPartition loads the snapshot signature. TODO remove? no usage?
func (mp *metaPartition) ResponseLoadMetaPartition(p *Packet) (err error) {
resp := &proto.MetaPartitionLoadResponse{
PartitionID: mp.config.PartitionId,
DoCompare: true,
}
resp.MaxInode = mp.GetCursor()
resp.InodeCount = uint64(mp.GetInodeTreeLen())
resp.DentryCount = uint64(mp.GetDentryTreeLen())
resp.ApplyID = mp.getApplyID()
resp.CommittedID = mp.getCommittedID()
if err != nil {
err = errors.Trace(err,
"[ResponseLoadMetaPartition] check snapshot")
return
}
data, err := json.Marshal(resp)
if err != nil {
err = errors.Trace(err, "[ResponseLoadMetaPartition] marshal")
return
}
p.PacketOkWithBody(data)
return
}
// MarshalJSON is the wrapper of json.Marshal.
func (mp *metaPartition) MarshalJSON() ([]byte, error) {
return json.Marshal(mp.config)
}
// TODO remove? no usage?
// Reset resets the meta partition.
func (mp *metaPartition) Reset() (err error) {
mp.inodeTree.Reset()
mp.dentryTree.Reset()
mp.config.Cursor = 0
mp.config.UniqId = 0
mp.applyID = 0
mp.txProcessor.Reset()
// remove files
filenames := []string{applyIDFile, dentryFile, inodeFile, extendFile, multipartFile, verdataFile, txInfoFile, txRbInodeFile, txRbDentryFile, TxIDFile}
for _, filename := range filenames {
filepath := path.Join(mp.config.RootDir, filename)
if err = os.Remove(filepath); err != nil {
return
}
}
return
}
func (mp *metaPartition) canRemoveSelf() (canRemove bool, err error) {
var partition *proto.MetaPartitionInfo
if partition, err = masterClient.ClientAPI().GetMetaPartition(mp.config.PartitionId); err != nil {
log.LogErrorf("action[canRemoveSelf] err[%v]", err)
return
}
canRemove = false
var existInPeers bool
for _, peer := range partition.Peers {
if mp.config.NodeId == peer.ID {
existInPeers = true
}
}
if !existInPeers {
canRemove = true
return
}
if mp.config.NodeId == partition.OfflinePeerID {
canRemove = true
return
}
return
}
// multiVersionTTLWork periodically scans the temporary version list and removes
// versions that are due for deletion.
func (mp *metaPartition) multiVersionTTLWork(dur time.Duration) {
// first sleep a random time in [0, 60s) so that all mps do not start
// scanning at the same time.
rand.Seed(time.Now().Unix())
time.Sleep(time.Duration(rand.Intn(60)) * time.Second)
log.LogDebugf("[multiVersionTTLWork] start, mp[%v]", mp.config.PartitionId)
ttl := time.NewTicker(dur)
snapQueue := make(chan interface{}, 5)
for {
select {
case <-ttl.C:
log.LogDebugf("[multiVersionTTLWork] begin cache ttl, mp[%v]", mp.config.PartitionId)
mp.multiVersionList.RWLock.RLock()
volVersionInfoList := &proto.VolVersionInfoList{
TemporaryVerMap: make(map[uint64]*proto.VolVersionInfo),
}
// append (rather than copy into an empty slice) so VerList is actually cloned
volVersionInfoList.VerList = append(volVersionInfoList.VerList, mp.multiVersionList.VerList...)
for key, value := range mp.multiVersionList.TemporaryVerMap {
copiedValue := *value
volVersionInfoList.TemporaryVerMap[key] = &copiedValue
}
mp.multiVersionList.RWLock.RUnlock()
for _, version := range volVersionInfoList.TemporaryVerMap {
if version.Status == proto.VersionDeleting {
continue
}
snapQueue <- nil
version.Status = proto.VersionDeleting
go func(verSeq uint64) {
mp.delPartitionVersion(verSeq)
mp.multiVersionList.RWLock.Lock()
delete(mp.multiVersionList.TemporaryVerMap, verSeq)
mp.multiVersionList.RWLock.Unlock()
<-snapQueue
}(version.Ver)
}
case <-mp.stopC:
log.LogWarnf("[multiVersionTTLWork] stoped, mp[%v]", mp.config.PartitionId)
return
}
}
return
}
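// Editorial sketch (not part of the original source): the snapQueue channel in
// multiVersionTTLWork acts as a counting semaphore that caps concurrent version
// deletions at 5. The same pattern in generic form, where versionSeqs and
// process are placeholders:
//
//	sem := make(chan struct{}, 5) // capacity = max concurrent workers
//	for _, seq := range versionSeqs {
//		sem <- struct{}{} // acquire a slot; blocks while 5 deletions are running
//		go func(verSeq uint64) {
//			defer func() { <-sem }() // release the slot when done
//			process(verSeq)
//		}(seq)
//	}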
func (mp *metaPartition) delPartitionVersion(verSeq uint64) {
var wg sync.WaitGroup
wg.Add(3)
reqVerSeq := verSeq
if reqVerSeq == 0 {
reqVerSeq = math.MaxUint64
}
log.LogInfof("action[delPartitionVersion] mp[%v] verseq [%v]:%v", mp.config.PartitionId, verSeq, reqVerSeq)
go mp.delPartitionInodesVersion(reqVerSeq, &wg)
go mp.delPartitionExtendsVersion(reqVerSeq, &wg)
go mp.delPartitionDentriesVersion(reqVerSeq, &wg)
wg.Wait()
}
func (mp *metaPartition) delPartitionDentriesVersion(verSeq uint64, wg *sync.WaitGroup) {
defer wg.Done()
// begin
count := 0
needSleep := false
mp.dentryTree.GetTree().Ascend(func(i BtreeItem) bool {
if _, ok := mp.IsLeader(); !ok {
return false
}
den := i.(*Dentry)
p := &Packet{}
req := &proto.DeleteDentryRequest{
VolName: mp.config.VolName,
ParentID: den.ParentId,
PartitionID: mp.config.PartitionId,
Name: den.Name,
Verseq: verSeq,
}
mp.DeleteDentry(req, p, localAddrForAudit)
// if the result is OpAgain, the extDelCh may be full, so sleep 1s.
if p.ResultCode == proto.OpAgain {
needSleep = true
}
// sleep 1s after every 1000 dentries, or when asked to back off
count++
if count > 1000 || needSleep {
count %= 1000
needSleep = false
time.Sleep(time.Second)
}
return true
})
}
func (mp *metaPartition) delPartitionExtendsVersion(verSeq uint64, wg *sync.WaitGroup) {
defer wg.Done()
// begin
count := 0
needSleep := false
mp.extendTree.GetTree().Ascend(func(treeItem BtreeItem) bool {
if _, ok := mp.IsLeader(); !ok {
return false
}
e := treeItem.(*Extend)
p := &Packet{}
req := &proto.RemoveXAttrRequest{
VolName: mp.config.VolName,
PartitionId: mp.config.PartitionId,
Inode: e.inode,
VerSeq: verSeq,
}
mp.RemoveXAttr(req, p)
// if the result is OpAgain, the extDelCh may be full, so sleep 1s.
if p.ResultCode == proto.OpAgain {
needSleep = true
}
// sleep 1s after every 1000 extends, or when asked to back off
count++
if count > 1000 || needSleep {
count %= 1000
needSleep = false
time.Sleep(time.Second)
}
return true
})
}
func (mp *metaPartition) delPartitionInodesVersion(verSeq uint64, wg *sync.WaitGroup) {
defer wg.Done()
// begin
count := 0
needSleep := false
mp.inodeTree.GetTree().Ascend(func(i BtreeItem) bool {
if _, ok := mp.IsLeader(); !ok {
return false
}
inode := i.(*Inode)
// skip directories
if proto.IsDir(inode.Type) {
return true
}
inode.RLock()
// skip inodes that do not need this version deleted
if ok, _ := inode.ShouldDelVer(verSeq, mp.verSeq); !ok {
inode.RUnlock()
return true
}
p := &Packet{}
req := &proto.UnlinkInodeRequest{
Inode: inode.Inode,
VerSeq: verSeq,
}
inode.RUnlock()
mp.UnlinkInode(req, p, localAddrForAudit)
// if the result is OpAgain, the extDelCh may be full, so sleep 1s.
if p.ResultCode == proto.OpAgain {
needSleep = true
}
// sleep 1s after every 1000 inodes, or when asked to back off
count++
if count > 1000 || needSleep {
count %= 1000
needSleep = false
time.Sleep(time.Second)
}
return true
})
return
}
// cacheTTLWork only happens for cold (data lake) volumes.
func (mp *metaPartition) cacheTTLWork() (err error) {
// check volume type, only Cold volume will do the cache ttl.
volView, mcErr := masterClient.ClientAPI().GetVolumeWithoutAuthKey(mp.config.VolName)
if mcErr != nil {
err = fmt.Errorf("cacheTTLWork: can't get volume info: partitoinID(%v) volume(%v)",
mp.config.PartitionId, mp.config.VolName)
return
}
if volView.VolType != proto.VolumeTypeCold {
return
}
if mp.verSeq > 0 {
log.LogWarnf("[doCacheTTL] volume [%v] enable snapshot.exit cache ttl, mp[%v]", mp.GetVolName(), mp.config.PartitionId)
return
}
// do cache ttl work
go mp.doCacheTTL(volView.CacheTTL)
return
}
func (mp *metaPartition) doCacheTTL(cacheTTL int) (err error) {
// first sleep a random time in [0, 1200s (20m)) so that all mps do not
// start the scan work at the same time.
rand.Seed(time.Now().Unix())
time.Sleep(time.Duration(rand.Intn(1200)) * time.Second)
ttl := time.NewTicker(time.Duration(util.OneDaySec()) * time.Second)
for {
select {
case <-ttl.C:
if mp.verSeq > 0 {
log.LogWarnf("[doCacheTTL] volume [%v] enable snapshot.exit cache ttl, mp[%v] cacheTTL[%v]",
mp.GetVolName(), mp.config.PartitionId, cacheTTL)
return
}
log.LogDebugf("[doCacheTTL] begin cache ttl, mp[%v] cacheTTL[%v]", mp.config.PartitionId, cacheTTL)
// only leader can do TTL work
if _, ok := mp.IsLeader(); !ok {
log.LogDebugf("[doCacheTTL] partitionId=%d is not leader, skip", mp.config.PartitionId)
continue
}
// get the last cacheTTL
volView, mcErr := masterClient.ClientAPI().GetVolumeWithoutAuthKey(mp.config.VolName)
if mcErr != nil {
err = fmt.Errorf("[doCacheTTL]: can't get volume info: partitoinID(%v) volume(%v)",
mp.config.PartitionId, mp.config.VolName)
return
}
cacheTTL = volView.CacheTTL
mp.InodeTTLScan(cacheTTL)
case <-mp.stopC:
log.LogWarnf("[doCacheTTL] stoped, mp[%v]", mp.config.PartitionId)
return
}
}
}
func (mp *metaPartition) InodeTTLScan(cacheTTL int) {
curTime := timeutil.GetCurrentTimeUnix()
// begin
count := 0
needSleep := false
mp.inodeTree.GetTree().Ascend(func(i BtreeItem) bool {
inode := i.(*Inode)
// skip directories
if proto.IsDir(inode.Type) {
return true
}
inode.RLock()
// skip inodes with no extents or already marked deleted
if len(inode.Extents.eks) == 0 || inode.ShouldDelete() {
inode.RUnlock()
return true
}
if (curTime - inode.AccessTime) > int64(cacheTTL)*util.OneDaySec() {
log.LogDebugf("[InodeTTLScan] mp[%v] do inode ttl delete[%v]", mp.config.PartitionId, inode.Inode)
count++
// make request
p := &Packet{}
req := &proto.EmptyExtentKeyRequest{
Inode: inode.Inode,
}
ino := NewInode(req.Inode, 0)
curTime = timeutil.GetCurrentTimeUnix()
if inode.ModifyTime < curTime {
ino.ModifyTime = curTime
}
mp.ExtentsOp(p, ino, opFSMExtentsEmpty)
// if the result is OpAgain, the extDelCh may be full, so sleep 1s.
if p.ResultCode == proto.OpAgain {
needSleep = true
}
}
inode.RUnlock()
// every 1000 inode sleep 1s
if count > 1000 || needSleep {
count %= 1000
needSleep = false
time.Sleep(time.Second)
}
return true
})
}
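// Editorial note: InodeTTLScan treats cacheTTL as a number of days and expires
// cached extents based on the inode's last access time, i.e. an inode is
// emptied once
//
//	now - inode.AccessTime > int64(cacheTTL) * util.OneDaySec()
//
// with both sides in seconds. With cacheTTL = 30, for example, a file that has
// not been accessed for a bit over 30 days is emptied on the next daily scan.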
func (mp *metaPartition) initTxInfo(txInfo *proto.TransactionInfo) error {
txInfo.TxID = mp.txProcessor.txManager.nextTxID()
txInfo.CreateTime = time.Now().Unix()
txInfo.State = proto.TxStatePreCommit
if mp.txProcessor.txManager.opLimiter.Allow() {
return nil
}
return fmt.Errorf("tx create is limited")
}
func (mp *metaPartition) storeSnapshotFiles() (err error) {
msg := &storeMsg{
applyIndex: mp.applyID,
txId: mp.txProcessor.txManager.txIdAlloc.getTransactionID(),
inodeTree: NewBtree(),
dentryTree: NewBtree(),
extendTree: NewBtree(),
multipartTree: NewBtree(),
txTree: NewBtree(),
txRbInodeTree: NewBtree(),
txRbDentryTree: NewBtree(),
uniqId: mp.GetUniqId(),
uniqChecker: newUniqChecker(),
multiVerList: mp.multiVersionList.VerList,
}
return mp.store(msg)
}
func (mp *metaPartition) startCheckerEvict() {
timer := time.NewTimer(opCheckerInterval)
for {
select {
case <-timer.C:
if _, ok := mp.IsLeader(); ok {
left, evict, err := mp.uniqCheckerEvict()
if evict != 0 {
log.LogInfof("[uniqChecker] after doEvict partition-%d, left:%d, evict:%d, err:%v", mp.config.PartitionId, left, evict, err)
} else {
log.LogDebugf("[uniqChecker] after doEvict partition-%d, left:%d, evict:%d, err:%v", mp.config.PartitionId, left, evict, err)
}
}
timer.Reset(opCheckerInterval)
case <-mp.stopC:
return
}
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"container/list"
"encoding/binary"
"fmt"
"io"
"io/ioutil"
"os"
"path"
"strings"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/synclist"
)
const (
prefixDelExtent = "EXTENT_DEL"
prefixDelExtentV2 = "EXTENT_DEL_V2"
prefixMultiVer = verdataFile
maxDeleteExtentSize = 10 * MB
)
var extentsFileHeader = []byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08}
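// Editorial note: every EXTENT_DEL_* file starts with this 8-byte header. The
// header doubles as the delete cursor: it is read back as a big-endian uint64
// giving the offset up to which extent keys have already been processed, and
// its initial value (8) points just past the header itself. Marshaled extent
// keys are appended after it. A minimal cursor decode, assuming the same layout:
//
//	buf := make([]byte, 8)
//	if _, err := fp.ReadAt(buf, 0); err != nil {
//		return err
//	}
//	cursor := binary.BigEndian.Uint64(buf) // next unprocessed offset in the file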
// startToDeleteExtents starts the workers that append and consume the meta partition's delete-extents files.
func (mp *metaPartition) startToDeleteExtents() {
fileList := synclist.New()
go mp.appendDelExtentsToFile(fileList)
go mp.deleteExtentsFromList(fileList)
}
// create extent delete file
func (mp *metaPartition) createExtentDeleteFile(prefix string, idx int64, fileList *synclist.SyncList) (fp *os.File, fileName string, fileSize int64, err error) {
fileName = fmt.Sprintf("%s_%d", prefix, idx)
fp, err = os.OpenFile(path.Join(mp.config.RootDir, fileName),
os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644)
if err != nil {
log.LogErrorf("[metaPartition] createExtentDeletFile openFile %v %v error %v", mp.config.RootDir, fileName, err)
return
}
if _, err = fp.Write(extentsFileHeader); err != nil {
log.LogErrorf("[metaPartition] createExtentDeletFile Write %v %v error %v", mp.config.RootDir, fileName, err)
}
fileSize = int64(len(extentsFileHeader))
fileList.PushBack(fileName)
return
}
// append delete extents from extDelCh to EXTENT_DEL_N files
func (mp *metaPartition) appendDelExtentsToFile(fileList *synclist.SyncList) {
defer func() {
if r := recover(); r != nil {
log.LogErrorf(fmt.Sprintf("[metaPartition] appendDelExtentsToFile pid(%v) panic (%v)", mp.config.PartitionId, r))
}
}()
var (
fileName string
fileSize int64
idx int64
fp *os.File
err error
)
LOOP:
// scan existing EXTENT_DEL_* files to fill fileList
finfos, err := ioutil.ReadDir(mp.config.RootDir)
if err != nil {
panic(err)
}
finfos = sortDelExtFileInfo(finfos)
for _, info := range finfos {
fileList.PushBack(info.Name())
fileSize = info.Size()
}
// check
lastItem := fileList.Back()
if lastItem != nil {
fileName = lastItem.Value.(string)
}
if lastItem == nil || !strings.HasPrefix(fileName, prefixDelExtentV2) {
// if no EXTENT_DEL_* file exists, or the last one is not in the V2 format, create a new one
log.LogDebugf("action[appendDelExtentsToFile] verseq [%v]", mp.verSeq)
fp, fileName, fileSize, err = mp.createExtentDeleteFile(prefixDelExtentV2, idx, fileList)
log.LogDebugf("action[appendDelExtentsToFile] verseq [%v] fileName %v", mp.verSeq, fileName)
if err != nil {
panic(err)
}
} else {
// exist, open last file
fp, err = os.OpenFile(path.Join(mp.config.RootDir, fileName),
os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644)
if err != nil {
panic(err)
}
// continue from last item
idx = getDelExtFileIdx(fileName)
}
log.LogDebugf("action[appendDelExtentsToFile] verseq [%v] fileName %v", mp.verSeq, fileName)
// TODO Unhandled errors
defer fp.Close()
buf := make([]byte, 0)
for {
select {
case <-mp.stopC:
return
case <-mp.extReset:
// TODO Unhandled errors
fp.Close()
// reset fileList
fileList.Init()
goto LOOP
case eks := <-mp.extDelCh:
var data []byte
buf = buf[:0]
if len(eks) == 0 {
goto LOOP
}
log.LogDebugf("[appendDelExtentsToFile] mp(%v) del eks [%v]", mp.config.PartitionId, eks)
for _, ek := range eks {
data, err = ek.MarshalBinaryWithCheckSum(true)
if err != nil {
log.LogWarnf("[appendDelExtentsToFile] partitionId=%d,"+
" extentKey marshal: %s", mp.config.PartitionId, err.Error())
break
}
buf = append(buf, data...)
}
if err != nil {
err = mp.sendExtentsToChan(eks)
if err != nil {
log.LogErrorf("[appendDelExtentsToFile] mp[%v] sendExtentsToChan fail, err(%s)", mp.config.PartitionId, err.Error())
}
continue
}
if fileSize >= maxDeleteExtentSize {
// TODO Unhandled errors
// close old File
fp.Close()
idx += 1
fp, fileName, fileSize, err = mp.createExtentDeleteFile(prefixDelExtentV2, idx, fileList)
if err != nil {
panic(err)
}
log.LogDebugf("appendDelExtentsToFile. volname [%v] mp[%v] createExtentDeleteFile %v",
mp.GetVolName(), mp.config.PartitionId, fileName)
}
// write delete extents into file
if _, err = fp.Write(buf); err != nil {
panic(err)
}
fileSize += int64(len(buf))
log.LogDebugf("action[appendDelExtentsToFile] filesize now %v", fileSize)
}
}
}
func (mp *metaPartition) batchDeleteExtentsByDp(dpId uint64, extents []*proto.ExtentKey) (err error) {
dp := mp.vol.GetPartition(dpId)
if dp == nil {
log.LogErrorf("[batchDeleteExtentsByDp] mp(%v) dp(%v) not found", mp.config.PartitionId, dpId)
err = fmt.Errorf("dp %v is not found", dpId)
return
}
if dp.IsDiscard {
log.LogDebugf("[batchDeleteExtentsByDp] mp(%v) dp(%v) is discard", mp.config.PartitionId, dpId)
return
}
log.LogDebugf("[batchDeleteExtentsByDp] mp(%v) delete eks from dp(%v)", mp.config.PartitionId, dpId)
err = mp.doBatchDeleteExtentsByPartition(dpId, extents)
return
}
// deleteExtentsFromList reads extents recorded in the EXTENT_DEL_* files and deletes them from their data partitions.
func (mp *metaPartition) deleteExtentsFromList(fileList *synclist.SyncList) {
defer func() {
if r := recover(); r != nil {
log.LogErrorf(fmt.Sprintf("deleteExtentsFromList(%v) deleteExtentsFromList panic (%v)", mp.config.PartitionId, r))
}
}()
var (
element *list.Element
fileName string
file string
fileInfo os.FileInfo
err error
)
for {
// DeleteWorkerSleepMs()
time.Sleep(1 * time.Minute)
select {
case <-mp.stopC:
return
default:
}
element = fileList.Front()
if element == nil {
continue
}
fileName = element.Value.(string)
file = path.Join(mp.config.RootDir, fileName)
if fileInfo, err = os.Stat(file); err != nil {
log.LogDebugf("[deleteExtentsFromList] mp(%v) skip file(%v)", mp.config.PartitionId, fileName)
fileList.Remove(element)
continue
}
log.LogDebugf("[deleteExtentsFromList] mp(%v) reading file(%v)", mp.config.PartitionId, fileName)
// if not leader, ignore delete
if _, ok := mp.IsLeader(); !ok {
log.LogDebugf("[deleteExtentsFromList] partitionId=%d, "+
"not raft leader,please ignore", mp.config.PartitionId)
continue
}
// leader do delete extent for EXTENT_DEL_* file
// read delete extents from file
buf := make([]byte, 8)
fp, err := os.OpenFile(file, os.O_RDWR, 0o644)
if err != nil {
if !os.IsNotExist(err) {
log.LogErrorf("[deleteExtentsFromList] volname [%v] mp[%v] openFile %v error: %v", mp.GetVolName(), mp.config.PartitionId, file, err)
} else {
log.LogDebugf("[deleteExtentsFromList] mp(%v) delete extents file(%v) deleted", mp.config.PartitionId, fileName)
}
fileList.Remove(element)
continue
}
// read the delete-extents cursor from the 8-byte file header
if _, err = fp.ReadAt(buf, 0); err != nil {
log.LogWarnf("[deleteExtentsFromList] partitionId=%d, "+
"failed to read the 8-byte cursor, retry later", mp.config.PartitionId)
// TODO Unhandled errors
fp.Close()
continue
}
extentV2 := false
extentKeyLen := uint64(proto.ExtentLength)
if strings.HasPrefix(fileName, prefixDelExtentV2) {
extentV2 = true
extentKeyLen = uint64(proto.ExtentV2Length)
}
cursor := binary.BigEndian.Uint64(buf)
stat, err := fp.Stat()
if err != nil {
log.LogErrorf("[deleteExtentsFromList] mp(%v) stat file(%v) err(%v)", mp.config.PartitionId, fileName, err)
continue
}
log.LogDebugf("[deleteExtentsFromList] volname [%v] mp[%v] o openFile %v file len %v cursor %v", mp.GetVolName(), mp.config.PartitionId, file,
stat.Size(), cursor)
log.LogDebugf("action[deleteExtentsFromList] get cursor %v", cursor)
if fileInfo.Size() == int64(cursor) {
log.LogDebugf("[deleteExtentsFromList] mp(%v) reach the end of file(%v), sleep", mp.config.PartitionId, fileName)
fp.Close()
continue
} else if fileInfo.Size() > int64(cursor) && fileInfo.Size() < int64(cursor)+int64(extentKeyLen) {
log.LogErrorf("[deleteExtentsFromList] mp(%d), file(%v) corrupted!", mp.config.PartitionId, fileName)
fileList.Remove(element)
fp.Close()
continue
}
var deleteCnt uint64
errExts := make([]proto.ExtentKey, 0)
needDeleteExtents := make(map[uint64][]*proto.ExtentKey)
buf = make([]byte, util.MB)
err = func() (err error) {
// read extents from cursor
defer fp.Close()
// NOTE: read 1 MB at once
rLen, err := fp.ReadAt(buf, int64(cursor))
log.LogDebugf("[deleteExtentsFromList] mp(%v) read len(%v) cursor(%v), err(%v)", mp.config.PartitionId, rLen, cursor, err)
if err != nil {
if err == io.EOF {
err = nil
if rLen == 0 {
log.LogDebugf("[deleteExtentsFromList] mp(%v) file list cnt(%v)", mp.config.PartitionId, fileList.Len())
if fileList.Len() <= 1 {
log.LogDebugf("[deleteExtentsFromList] mp(%v) skip delete file(%v), free list count(%v)", mp.config.PartitionId, fileName, fileList.Len())
return
}
status := mp.raftPartition.Status()
_, isLeader := mp.IsLeader()
if isLeader && !status.RestoringSnapshot {
// delete old delete extents file for metapartition
if _, err = mp.submit(opFSMInternalDelExtentFile, []byte(fileName)); err != nil {
log.LogErrorf("[deleteExtentsFromList] mp(%v), delete old file(%v), err(%v)", mp.config.PartitionId, fileName, err)
return
}
log.LogDebugf("[deleteExtentsFromList] mp(%v), delete old file(%v)", mp.config.PartitionId, fileName)
return
}
log.LogDebugf("[deleteExtentsFromList] partitionId=%d,delete"+
" old file status: %s", mp.config.PartitionId, status.State)
}
} else {
log.LogErrorf("[deleteExtentsFromList] mp(%v) failed to read file(%v), err(%v)", mp.config.PartitionId, fileName, err)
return
}
}
cursor += uint64(rLen)
buff := bytes.NewBuffer(buf[:rLen])
batchCount := DeleteBatchCount() * 5
for buff.Len() != 0 && deleteCnt < batchCount {
lastUnread := buff.Len()
// NOTE: adjust the cursor back for the partially read key
if uint64(buff.Len()) < extentKeyLen {
cursor -= uint64(lastUnread)
break
}
if extentV2 && uint64(buff.Len()) < uint64(proto.ExtentV3Length) {
if r := bytes.Compare(buff.Bytes()[:4], proto.ExtentKeyHeaderV3); r == 0 {
cursor -= uint64(lastUnread)
break
}
}
// NOTE: read ek
ek := proto.ExtentKey{}
if extentV2 {
if err = ek.UnmarshalBinaryWithCheckSum(buff); err != nil {
if err == proto.InvalidKeyHeader || err == proto.InvalidKeyCheckSum {
log.LogErrorf("[deleteExtentsFromList] invalid extent key header %v, %v, %v", fileName, mp.config.PartitionId, err)
return
}
log.LogErrorf("[deleteExtentsFromList] mp: %v Unmarshal extentkey from %v unresolved error: %v", mp.config.PartitionId, fileName, err)
return
}
} else {
// for delete records there is no need to parse the version
if err = ek.UnmarshalBinary(buff, false); err != nil {
log.LogErrorf("[deleteExtentsFromList] mp(%v) failed to unmarshal extent", mp.config.PartitionId)
return
}
}
// NOTE: add to current batch
dpId := ek.PartitionId
eks := needDeleteExtents[dpId]
if eks == nil {
eks = make([]*proto.ExtentKey, 0)
}
eks = append(eks, &ek)
needDeleteExtents[dpId] = eks
// NOTE: limit batch count
deleteCnt++
log.LogDebugf("[deleteExtentsFromList] mp(%v) append extent(%v) to batch, count limit(%v), cnt(%v)", mp.config.PartitionId, ek, batchCount, deleteCnt)
}
log.LogDebugf("[deleteExtentsFromList] mp(%v) reach the end of buffer", mp.config.PartitionId)
return
}()
if err != nil {
log.LogErrorf("[deleteExtentsFromList] mp(%v) failed to read delete file(%v), err(%v)", mp.config.PartitionId, fileName, err)
continue
}
if deleteCnt == 0 {
log.LogDebugf("[deleteExtentsFromList] mp(%v) delete cnt is 0, sleep", mp.config.PartitionId)
continue
}
successCnt := 0
for dpId, eks := range needDeleteExtents {
log.LogDebugf("[deleteExtentsFromList] mp(%v) delete dp(%v) eks count(%v)", mp.config.PartitionId, dpId, len(eks))
err = mp.batchDeleteExtentsByDp(dpId, eks)
if err != nil {
log.LogErrorf("[deleteExtentsFromList] mp(%v) failed to delete dp(%v) extents", mp.config.PartitionId, dpId)
err = nil
for _, ek := range eks {
errExts = append(errExts, *ek)
}
} else {
successCnt += len(eks)
}
}
log.LogDebugf("[deleteExtentsFromList] mp(%v) delete success cnt(%v), err cnt(%v)", mp.config.PartitionId, successCnt, len(errExts))
if successCnt == 0 {
log.LogErrorf("[deleteExtentsFromList] no extents delete successfully, sleep")
continue
}
if len(errExts) != 0 {
log.LogDebugf("[deleteExtentsFromList] mp(%v) sync errExts(%v)", mp.config.PartitionId, errExts)
err = mp.sendExtentsToChan(errExts)
if err != nil {
log.LogErrorf("[deleteExtentsFromList] sendExtentsToChan by raft error, mp[%v], err(%v), ek(%v)", mp.config.PartitionId, err, len(errExts))
}
}
buff := bytes.NewBuffer([]byte{})
buff.WriteString(fmt.Sprintf("%s %d", fileName, cursor))
log.LogDebugf("[deleteExtentsFromList] mp(%v) delete eks(%v) from file(%v)", mp.config.PartitionId, deleteCnt, fileName)
if _, err = mp.submit(opFSMInternalDelExtentCursor, buff.Bytes()); err != nil {
log.LogWarnf("[deleteExtentsFromList] partitionId=%d, %s",
mp.config.PartitionId, err.Error())
}
log.LogDebugf("[deleteExtentsFromList] mp(%v) file(%v), cursor(%v), size(%v)", mp.config.PartitionId, fileName, cursor, len(buf))
}
}
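// Editorial note: the reader above never rewrites the EXTENT_DEL_* file header
// directly; cursor advances are replicated through raft as a plain
// "<fileName> <cursor>" payload (opFSMInternalDelExtentCursor), and a fully
// consumed file is dropped via opFSMInternalDelExtentFile with the file name as
// payload. A sketch of the cursor submit, matching the code above:
//
//	payload := fmt.Sprintf("%s %d", fileName, cursor) // e.g. "EXTENT_DEL_V2_0 4096"
//	if _, err := mp.submit(opFSMInternalDelExtentCursor, []byte(payload)); err != nil {
//		// the next pass simply re-reads from the old cursor
//	}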
// func (mp *metaPartition) checkBatchDeleteExtents(allExtents map[uint64][]*proto.ExtentKey) {
// for partitionID, deleteExtents := range allExtents {
// needDeleteExtents := make([]proto.ExtentKey, len(deleteExtents))
// for index, ek := range deleteExtents {
// newEx := proto.ExtentKey{
// FileOffset: ek.FileOffset,
// PartitionId: ek.PartitionId,
// ExtentId: ek.ExtentId,
// ExtentOffset: ek.ExtentOffset,
// Size: ek.Size,
// CRC: ek.CRC,
// }
// needDeleteExtents[index] = newEx
// log.LogWritef("mp[%v] deleteExtents(%v)", mp.config.PartitionId, newEx.String())
// }
// err := mp.doBatchDeleteExtentsByPartition(partitionID, deleteExtents)
// if err != nil {
// log.LogWarnf(fmt.Sprintf("metaPartition(%v) dataPartitionID(%v)"+
// " batchDeleteExtentsByPartition failed(%v)", mp.config.PartitionId, partitionID, err))
// mp.extDelCh <- needDeleteExtents
// }
// DeleteWorkerSleepMs()
// }
// return
// }
package metanode
import (
"fmt"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/exporter"
)
type FileSizeRange uint32
const (
Size1K uint64 = 1 << 10
Size1M uint64 = 1 << 20
Size16M = 16 * Size1M
Size32M = 32 * Size1M
Size64M = 64 * Size1M
Size128M = 128 * Size1M
Size256M = 256 * Size1M
)
const (
LessThan1K FileSizeRange = iota
LessThan1M
LessThan16M
LessThan32M
LessThan64M
LessThan128M
LessThan256M
BiggerThan256M
MaxRangeType
)
const (
fileStatsCheckPeriod = time.Second * 30
)
func toString(fileSize FileSizeRange) string {
switch fileSize {
case LessThan1K:
return "<1K"
case LessThan1M:
return "<1M"
case LessThan16M:
return "<16M"
case LessThan32M:
return "<32M"
case LessThan64M:
return "<64M"
case LessThan128M:
return "<128M"
case LessThan256M:
return "<256M"
case BiggerThan256M:
return ">256M"
default:
return "unknown"
}
}
func (mp *metaPartition) setMetrics(fileRange []int64) {
for i, val := range fileRange {
labels := map[string]string{
"partid": fmt.Sprintf("%d", mp.config.PartitionId),
"volName": mp.config.VolName,
"sizeRange": toString(FileSizeRange(i)),
}
exporter.NewGauge("fileStats").SetWithLabels(float64(val), labels)
}
}
func (mp *metaPartition) fileStats(ino *Inode) {
if !mp.manager.fileStatsEnable {
return
}
fileRange := mp.fileRange
if ino.NLink > 0 && proto.IsRegular(ino.Type) {
if 0 <= ino.Size && ino.Size < Size1K {
fileRange[LessThan1K] += 1
} else if Size1K <= ino.Size && ino.Size < Size1M {
fileRange[LessThan1M] += 1
} else if Size1M <= ino.Size && ino.Size < Size16M {
fileRange[LessThan16M] += 1
} else if Size16M <= ino.Size && ino.Size < Size32M {
fileRange[LessThan32M] += 1
} else if Size32M <= ino.Size && ino.Size < Size64M {
fileRange[LessThan64M] += 1
} else if Size64M <= ino.Size && ino.Size < Size128M {
fileRange[LessThan128M] += 1
} else if Size128M <= ino.Size && ino.Size < Size256M {
fileRange[LessThan256M] += 1
} else {
fileRange[BiggerThan256M] += 1
}
}
}
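// Editorial note: fileStats classifies a regular file (NLink > 0) into exactly
// one of the size buckets above and increments the matching fileRange counter;
// directories and unlinked inodes are skipped. setMetrics() then publishes the
// counters as the "fileStats" gauge, labeled by partition id, volume name and
// size range.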
func (mp *metaPartition) startFileStats() {
checkTicker := time.NewTicker(fileStatsCheckPeriod)
go func(stopC chan bool) {
lastEnable := false
isLeader := false
for {
select {
case <-stopC:
// if this mp is closed, clear the metric
if lastEnable {
fileRange := make([]int64, MaxRangeType)
mp.setMetrics(fileRange)
}
checkTicker.Stop()
return
case <-checkTicker.C:
if !mp.manager.fileStatsEnable {
// if fileStatsEnable changes from true to false, clear the metric
if lastEnable {
fileRange := make([]int64, MaxRangeType)
mp.setMetrics(fileRange)
}
lastEnable = false
continue
}
lastEnable = true
// Clear the metric if the status changes from leader to follower
if _, isLeader = mp.IsLeader(); isLeader {
mp.setMetrics(mp.fileRange)
} else {
fileRange := make([]int64, MaxRangeType)
mp.setMetrics(fileRange)
}
}
}
}(mp.stopC)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"fmt"
"net"
"os"
"path"
"runtime/debug"
"sort"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/fileutil"
"github.com/cubefs/cubefs/util/log"
)
const (
AsyncDeleteInterval = 10 * time.Second
UpdateVolTicket = 2 * time.Minute
BatchCounts = 128
OpenRWAppendOpt = os.O_CREATE | os.O_RDWR | os.O_APPEND
TempFileValidTime = 86400 // units: sec
DeleteInodeFileExtension = "INODE_DEL"
DeleteWorkerCnt = 10
InodeNLink0DelayDeleteSeconds = 24 * 3600
DeleteInodeFileRollingSize = 500 * util.MB
)
func (mp *metaPartition) openDeleteInodeFile() (err error) {
if mp.delInodeFp, err = os.OpenFile(path.Join(mp.config.RootDir,
DeleteInodeFileExtension), OpenRWAppendOpt, 0o644); err != nil {
log.LogErrorf("[openDeleteInodeFile] failed to open delete inode file, err(%v)", err)
return
}
return
}
func (mp *metaPartition) startFreeList() (err error) {
if err = mp.openDeleteInodeFile(); err != nil {
return
}
// start vol update ticket
go mp.updateVolWorker()
go mp.deleteWorker()
mp.startToDeleteExtents()
return
}
func (mp *metaPartition) updateVolView(convert func(view *proto.DataPartitionsView) *DataPartitionsView) (err error) {
volName := mp.config.VolName
dataView, err := masterClient.ClientAPI().EncodingGzip().GetDataPartitions(volName)
if err != nil {
err = fmt.Errorf("updateVolWorker: get data partitions view fail: volume(%v) err(%v)",
volName, err)
log.LogErrorf(err.Error())
return
}
mp.vol.UpdatePartitions(convert(dataView))
volView, err := masterClient.AdminAPI().GetVolumeSimpleInfo(volName)
if err != nil {
err = fmt.Errorf("updateVolWorker: get volumeinfo fail: volume(%v) err(%v)", volName, err)
log.LogErrorf(err.Error())
return
}
mp.vol.volDeleteLockTime = volView.DeleteLockTime
return nil
}
func (mp *metaPartition) updateVolWorker() {
t := time.NewTicker(UpdateVolTicket)
convert := func(view *proto.DataPartitionsView) *DataPartitionsView {
newView := &DataPartitionsView{
DataPartitions: make([]*DataPartition, len(view.DataPartitions)),
}
for i := 0; i < len(view.DataPartitions); i++ {
if len(view.DataPartitions[i].Hosts) < 1 {
log.LogErrorf("updateVolWorker dp id(%v) is invalid, DataPartitionResponse detail[%v]",
view.DataPartitions[i].PartitionID, view.DataPartitions[i])
continue
}
newView.DataPartitions[i] = &DataPartition{
PartitionID: view.DataPartitions[i].PartitionID,
Status: view.DataPartitions[i].Status,
Hosts: view.DataPartitions[i].Hosts,
ReplicaNum: view.DataPartitions[i].ReplicaNum,
IsDiscard: view.DataPartitions[i].IsDiscard,
}
}
return newView
}
mp.updateVolView(convert)
for {
select {
case <-mp.stopC:
t.Stop()
return
case <-t.C:
mp.updateVolView(convert)
}
}
}
const (
MinDeleteBatchCounts = 100
MaxSleepCnt = 10
)
func (mp *metaPartition) deleteWorker() {
var (
idx int
isLeader bool
)
buffSlice := make([]uint64, 0, DeleteBatchCount())
var sleepCnt uint64
for {
buffSlice = buffSlice[:0]
select {
case <-mp.stopC:
log.LogDebugf("[metaPartition] deleteWorker stop partition: %v", mp.config)
return
default:
}
if _, isLeader = mp.IsLeader(); !isLeader {
time.Sleep(AsyncDeleteInterval)
continue
}
// add sleep time value
DeleteWorkerSleepMs()
isForceDeleted := sleepCnt%MaxSleepCnt == 0
if !isForceDeleted && mp.freeList.Len() < MinDeleteBatchCounts {
time.Sleep(AsyncDeleteInterval)
sleepCnt++
continue
}
// do nothing.
if mp.freeList.Len() == 0 {
time.Sleep(time.Minute)
continue
}
batchCount := DeleteBatchCount()
delayDeleteInos := make([]uint64, 0)
for idx = 0; idx < int(batchCount); idx++ {
// batch get free inode from the freeList
ino := mp.freeList.Pop()
if ino == 0 {
break
}
log.LogDebugf("action[deleteWorker]: remove inode(%v)", ino)
// check inode nlink == 0 and deleteMarkFlag unset
if inode, ok := mp.inodeTree.Get(&Inode{Inode: ino}).(*Inode); ok {
inTx, _ := mp.txProcessor.txResource.isInodeInTransction(inode)
if inode.ShouldDelayDelete() || inTx {
log.LogDebugf("[metaPartition] deleteWorker delay to remove inode: %v as NLink is 0, inTx %v", inode, inTx)
delayDeleteInos = append(delayDeleteInos, ino)
continue
}
}
buffSlice = append(buffSlice, ino)
}
// delay
for _, delayDeleteIno := range delayDeleteInos {
mp.freeList.Push(delayDeleteIno)
}
log.LogDebugf("metaPartition. buff slice [%v]", buffSlice)
mp.persistDeletedInodes(buffSlice)
mp.deleteMarkedInodes(buffSlice)
sleepCnt++
}
}
// batchDeleteExtentsByPartition deletes extents grouped by data partition and splits the inodes into those whose extents were all deleted (shouldCommit) and those to push back to the free list.
func (mp *metaPartition) batchDeleteExtentsByPartition(partitionDeleteExtents map[uint64][]*proto.ExtentKey,
allInodes []*Inode) (shouldCommit []*Inode, shouldPushToFreeList []*Inode) {
occurErrors := make(map[uint64]error)
shouldCommit = make([]*Inode, 0, len(allInodes))
shouldPushToFreeList = make([]*Inode, 0)
var (
wg sync.WaitGroup
lock sync.Mutex
)
// wait for every partition to finish its batch extent deletion
for partitionID, extents := range partitionDeleteExtents {
dp := mp.vol.GetPartition(partitionID)
// NOTE: skip the dp if it is missing or discarded
if dp == nil || dp.IsDiscard {
log.LogWarnf("action[batchDeleteExtentsByPartition] dp(%v) is missing or discarded, skip extents count(%v)", partitionID, len(extents))
continue
}
log.LogDebugf("batchDeleteExtentsByPartition partitionID %v extents %v", partitionID, extents)
wg.Add(1)
go func(partitionID uint64, extents []*proto.ExtentKey) {
defer wg.Done()
perr := mp.doBatchDeleteExtentsByPartition(partitionID, extents)
lock.Lock()
occurErrors[partitionID] = perr
lock.Unlock()
}(partitionID, extents)
}
wg.Wait()
// iterate over all inodes; an inode whose extents were all deleted successfully goes to shouldCommit, otherwise it goes back to the free list
for i := 0; i < len(allInodes); i++ {
successDeleteExtentCnt := 0
inode := allInodes[i]
inode.Extents.Range(func(_ int, ek proto.ExtentKey) bool {
if occurErrors[ek.PartitionId] != nil {
log.LogWarnf("deleteInode inode[%v] error(%v)", inode.Inode, occurErrors[ek.PartitionId])
return false
}
successDeleteExtentCnt++
return true
})
if successDeleteExtentCnt == inode.Extents.Len() {
shouldCommit = append(shouldCommit, inode)
log.LogDebugf("action[batchDeleteExtentsByPartition]: delete inode(%v) success", inode)
} else {
shouldPushToFreeList = append(shouldPushToFreeList, inode)
log.LogDebugf("action[batchDeleteExtentsByPartition]: delete inode(%v) fail", inode)
}
}
return
}
// Delete the marked inodes.
func (mp *metaPartition) deleteMarkedInodes(inoSlice []uint64) {
defer func() {
if r := recover(); r != nil {
stack := string(debug.Stack())
log.LogErrorf(fmt.Sprintf("metaPartition(%v) deleteMarkedInodes panic (%v)\nstack:%v",
mp.config.PartitionId, r, stack))
}
}()
if len(inoSlice) == 0 {
return
}
log.LogDebugf("[deleteMarkedInodes] . mp[%v] inoSlice [%v]", mp.config.PartitionId, inoSlice)
shouldCommit := make([]*Inode, 0, DeleteBatchCount())
shouldRePushToFreeList := make([]*Inode, 0)
deleteExtentsByPartition := make(map[uint64][]*proto.ExtentKey)
allInodes := make([]*Inode, 0)
for _, ino := range inoSlice {
ref := &Inode{Inode: ino}
inode, ok := mp.inodeTree.Get(ref).(*Inode)
if !ok {
log.LogDebugf("[deleteMarkedInodes] . mp[%v] inode[%v] not found", mp.config.PartitionId, ino)
continue
}
if !inode.ShouldDelete() {
log.LogWarnf("[deleteMarkedInodes] : inode should not be deleted, ino %s", inode.String())
continue
}
log.LogDebugf("[deleteMarkedInodes] . mp[%v] inode[%v] inode.Extents: %v, ino verList: %v",
mp.config.PartitionId, ino, inode.Extents, inode.GetMultiVerString())
if inode.getLayerLen() > 0 {
log.LogErrorf("[deleteMarkedInodes] deleteMarkedInodes. mp[%v] inode[%v] verlist len %v should not drop",
mp.config.PartitionId, ino, inode.getLayerLen())
return
}
extInfo := inode.GetAllExtsOfflineInode(mp.config.PartitionId)
for dpID, inodeExts := range extInfo {
exts, ok := deleteExtentsByPartition[dpID]
if !ok {
exts = make([]*proto.ExtentKey, 0)
}
exts = append(exts, inodeExts...)
log.LogWritef("[deleteMarkedInodes] mp[%v] ino(%v) deleteExtent(%v)", mp.config.PartitionId, inode.Inode, len(inodeExts))
deleteExtentsByPartition[dpID] = exts
}
allInodes = append(allInodes, inode)
}
if proto.IsCold(mp.volType) {
// delete ebs obj extents
shouldCommit, shouldRePushToFreeList = mp.doBatchDeleteObjExtentsInEBS(allInodes)
log.LogInfof("[deleteMarkedInodes] metaPartition(%v) deleteInodeCnt(%d) shouldRePush(%d)",
mp.config.PartitionId, len(shouldCommit), len(shouldRePushToFreeList))
for _, inode := range shouldRePushToFreeList {
mp.freeList.Push(inode.Inode)
}
allInodes = shouldCommit
}
log.LogInfof("[deleteMarkedInodes] metaPartition(%v) deleteExtentsByPartition(%v) allInodes(%v)",
mp.config.PartitionId, deleteExtentsByPartition, allInodes)
shouldCommit, shouldRePushToFreeList = mp.batchDeleteExtentsByPartition(deleteExtentsByPartition, allInodes)
bufSlice := make([]byte, 0, 8*len(shouldCommit))
for _, inode := range shouldCommit {
bufSlice = append(bufSlice, inode.MarshalKey()...)
}
err := mp.syncToRaftFollowersFreeInode(bufSlice)
if err != nil {
log.LogWarnf("[deleteMarkedInodes] raft commit inode list: %v, "+
"response %s", shouldCommit, err.Error())
}
for _, inode := range shouldCommit {
if err == nil {
mp.internalDeleteInode(inode)
} else {
mp.freeList.Push(inode.Inode)
}
}
log.LogInfof("[deleteMarkedInodes] metaPartition(%v) deleteInodeCnt(%v) inodeCnt(%v)", mp.config.PartitionId, len(shouldCommit), mp.inodeTree.Len())
for _, inode := range shouldRePushToFreeList {
mp.freeList.Push(inode.Inode)
}
// try again.
if len(shouldRePushToFreeList) > 0 && deleteWorkerSleepMs == 0 {
time.Sleep(time.Duration(1000) * time.Millisecond)
}
}
func (mp *metaPartition) syncToRaftFollowersFreeInode(hasDeleteInodes []byte) (err error) {
if len(hasDeleteInodes) == 0 {
return
}
_, err = mp.submit(opFSMInternalDeleteInode, hasDeleteInodes)
return
}
func (mp *metaPartition) notifyRaftFollowerToFreeInodes(wg *sync.WaitGroup, target string, hasDeleteInodes []byte) (err error) {
var conn *net.TCPConn
conn, err = mp.config.ConnPool.GetConnect(target)
defer func() {
wg.Done()
if err != nil {
log.LogWarnf(err.Error())
mp.config.ConnPool.PutConnect(conn, ForceClosedConnect)
} else {
mp.config.ConnPool.PutConnect(conn, NoClosedConnect)
}
}()
if err != nil {
return
}
request := NewPacketToFreeInodeOnRaftFollower(mp.config.PartitionId, hasDeleteInodes)
if err = request.WriteToConn(conn); err != nil {
return
}
if err = request.ReadFromConnWithVer(conn, proto.NoReadDeadlineTime); err != nil {
return
}
if request.ResultCode != proto.OpOk {
err = fmt.Errorf("request(%v) error(%v)", request.GetUniqueLogId(), string(request.Data[:request.Size]))
}
return
}
func (mp *metaPartition) doDeleteMarkedInodes(ext *proto.ExtentKey) (err error) {
// get the data node view
dp := mp.vol.GetPartition(ext.PartitionId)
log.LogDebugf("action[doDeleteMarkedInodes] dp(%v) status (%v)", dp.PartitionID, dp.Status)
if dp == nil {
if proto.IsCold(mp.volType) {
log.LogInfof("[doDeleteMarkedInodes] ext(%s) is already been deleted, not delete any more", ext.String())
return
}
err = errors.NewErrorf("unknown dataPartitionID=%d in vol",
ext.PartitionId)
return
}
// delete the data node
if len(dp.Hosts) < 1 {
log.LogErrorf("doBatchDeleteExtentsByPartition dp id(%v) is invalid, detail[%v]", ext.PartitionId, dp)
err = errors.NewErrorf("dp id(%v) is invalid", ext.PartitionId)
return
}
// NOTE: if all replicas in the dp are dead,
// skip sending the request to the dp leader
if dp.Status == proto.Unavailable {
return
}
addr := util.ShiftAddrPort(dp.Hosts[0], smuxPortShift)
conn, err := smuxPool.GetConnect(addr)
log.LogInfof("doDeleteMarkedInodes mp (%v) GetConnect (%v), ext(%s)", mp.config.PartitionId, addr, ext.String())
defer func() {
smuxPool.PutConnect(conn, ForceClosedConnect)
log.LogInfof("doDeleteMarkedInodes mp (%v) PutConnect (%v), ext(%s)", mp.config.PartitionId, addr, ext.String())
}()
if err != nil {
err = errors.NewErrorf("get conn from pool %s, "+
"extent(%s))",
err.Error(), ext.String())
return
}
var (
p *Packet
invalid bool
)
if p, invalid = NewPacketToDeleteExtent(dp, ext); invalid {
p.ResultCode = proto.OpOk
return
}
if err = p.WriteToConn(conn); err != nil {
err = errors.NewErrorf("write to dataNode %s, %s", p.GetUniqueLogId(),
err.Error())
return
}
if err = p.ReadFromConnWithVer(conn, proto.ReadDeadlineTime); err != nil {
err = errors.NewErrorf("read response from dataNode %s, %s",
p.GetUniqueLogId(), err.Error())
return
}
if p.ResultCode == proto.OpTryOtherAddr && proto.IsCold(mp.volType) {
log.LogInfof("[doBatchDeleteExtentsByPartition] deleteOp retrun tryOtherAddr code means dp is deleted for LF vol, ext(%s)", ext.String())
return
}
if p.ResultCode != proto.OpOk {
err = errors.NewErrorf("[deleteMarkedInodes] %s response: %s", p.GetUniqueLogId(),
p.GetResultMsg())
}
return
}
func (mp *metaPartition) doBatchDeleteExtentsByPartition(partitionID uint64, exts []*proto.ExtentKey) (err error) {
// get the data node view
dp := mp.vol.GetPartition(partitionID)
if dp == nil {
if proto.IsCold(mp.volType) {
log.LogInfof("[doBatchDeleteExtentsByPartition] dp(%d) is already been deleted, not delete any more", partitionID)
return
}
err = errors.NewErrorf("unknown dataPartitionID=%d in vol",
partitionID)
return
}
for _, ext := range exts {
if ext.PartitionId != partitionID {
err = errors.NewErrorf("BatchDeleteExtent do batchDelete on PartitionID(%v) but unexpect Extent(%v)", partitionID, ext)
return
}
}
// delete the data node
if len(dp.Hosts) < 1 {
log.LogErrorf("doBatchDeleteExtentsByPartition dp id(%v) is invalid, detail[%v]", partitionID, dp)
err = errors.NewErrorf("dp id(%v) is invalid", partitionID)
return
}
addr := util.ShiftAddrPort(dp.Hosts[0], smuxPortShift)
conn, err := smuxPool.GetConnect(addr)
log.LogInfof("doBatchDeleteExtentsByPartition mp (%v) GetConnect (%v)", mp.config.PartitionId, addr)
ResultCode := proto.OpOk
defer func() {
smuxPool.PutConnect(conn, ForceClosedConnect)
log.LogInfof("doBatchDeleteExtentsByPartition mp (%v) PutConnect (%v)", mp.config.PartitionId, addr)
}()
if err != nil {
err = errors.NewErrorf("get conn from pool %s, "+
"extents partitionId=%d",
err.Error(), partitionID)
return
}
p := NewPacketToBatchDeleteExtent(dp, exts)
if err = p.WriteToConn(conn); err != nil {
err = errors.NewErrorf("write to dataNode %s, %s", p.GetUniqueLogId(),
err.Error())
return
}
if err = p.ReadFromConnWithVer(conn, proto.BatchDeleteExtentReadDeadLineTime); err != nil {
err = errors.NewErrorf("read response from dataNode %s, %s",
p.GetUniqueLogId(), err.Error())
return
}
ResultCode = p.ResultCode
if ResultCode == proto.OpTryOtherAddr && proto.IsCold(mp.volType) {
log.LogInfof("[doBatchDeleteExtentsByPartition] deleteOp retrun tryOtherAddr code means dp is deleted for LF vol, dp(%d)", partitionID)
return
}
if p.ResultCode != proto.OpOk {
err = errors.NewErrorf("[deleteMarkedInodes] %s response: %s", p.GetUniqueLogId(),
p.GetResultMsg())
}
return
}
const maxDelCntOnce = 512
func (mp *metaPartition) doBatchDeleteObjExtentsInEBS(allInodes []*Inode) (shouldCommit []*Inode, shouldPushToFreeList []*Inode) {
shouldCommit = make([]*Inode, 0, len(allInodes))
shouldPushToFreeList = make([]*Inode, 0)
var (
wg sync.WaitGroup
lock sync.Mutex
)
for _, inode := range allInodes {
wg.Add(1)
inode.RLock()
inode.ObjExtents.RLock()
go func(ino *Inode, oeks []proto.ObjExtentKey) {
defer wg.Done()
log.LogDebugf("[doBatchDeleteObjExtentsInEBS] ino(%d) delObjEks[%d]", ino.Inode, len(oeks))
err := mp.deleteObjExtents(oeks)
lock.Lock()
if err != nil {
shouldPushToFreeList = append(shouldPushToFreeList, ino)
log.LogErrorf("[doBatchDeleteObjExtentsInEBS] delete ebs eks fail, ino(%d), cnt(%d), err(%s)", ino.Inode, len(oeks), err.Error())
} else {
shouldCommit = append(shouldCommit, ino)
}
lock.Unlock()
ino.ObjExtents.RUnlock()
ino.RUnlock()
}(inode, inode.ObjExtents.eks)
}
wg.Wait()
return
}
func (mp *metaPartition) deleteObjExtents(oeks []proto.ObjExtentKey) (err error) {
total := len(oeks)
for i := 0; i < total; i += maxDelCntOnce {
max := util.Min(i+maxDelCntOnce, total)
err = mp.ebsClient.Delete(oeks[i:max])
if err != nil {
log.LogErrorf("[deleteObjExtents] delete ebs eks fail, cnt(%d), err(%s)", max-i, err.Error())
return err
}
}
return err
}
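// Editorial sketch (not part of the original source): deleteObjExtents trims
// each call to the blobstore client to at most maxDelCntOnce keys. The slicing
// idiom on its own, where items and send stand in for the object-extent keys
// and the client call:
//
//	for i := 0; i < len(items); i += maxDelCntOnce {
//		end := util.Min(i+maxDelCntOnce, len(items))
//		if err := send(items[i:end]); err != nil { // at most maxDelCntOnce entries
//			return err
//		}
//	}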
func (mp *metaPartition) recycleInodeDelFile() {
// NOTE: get all files
dentries, err := os.ReadDir(mp.config.RootDir)
if err != nil {
log.LogErrorf("[recycleInodeDelFile] mp(%v) failed to read dir(%v)", mp.config.PartitionId, mp.config.RootDir)
return
}
inodeDelFiles := make([]string, 0)
for _, dentry := range dentries {
if strings.HasPrefix(dentry.Name(), DeleteInodeFileExtension) && strings.HasSuffix(dentry.Name(), ".old") {
inodeDelFiles = append(inodeDelFiles, dentry.Name())
}
}
// NOTE: sort files
sort.Slice(inodeDelFiles, func(i, j int) bool {
// NOTE: date format satisfies dictionary order
return inodeDelFiles[i] < inodeDelFiles[j]
})
// NOTE: check disk space and recycle files
for len(inodeDelFiles) > 0 {
diskSpaceLeft := int64(0)
stat, err := fileutil.Statfs(mp.config.RootDir)
if err != nil {
log.LogErrorf("[recycleInodeDelFile] mp(%v) failed to get fs info", mp.config.PartitionId)
return
}
diskSpaceLeft = int64(stat.Bavail * uint64(stat.Bsize))
if diskSpaceLeft >= 50*util.GB && len(inodeDelFiles) < 5 {
log.LogDebugf("[recycleInodeDelFile] mp(%v) not need to recycle, return", mp.config.PartitionId)
return
}
// NOTE: delete a file and pop an item
oldestFile := inodeDelFiles[len(inodeDelFiles)-1]
inodeDelFiles = inodeDelFiles[:len(inodeDelFiles)-1]
err = os.Remove(path.Join(mp.config.RootDir, oldestFile))
if err != nil {
log.LogErrorf("[recycleInodeDelFile] mp(%v) failed to remove file(%v)", mp.config.PartitionId, oldestFile)
return
}
}
}
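// persistDeletedInode appends the inode number to the delete-inode file. When
// the file reaches DeleteInodeFileRollingSize it is synced, closed and renamed
// to a timestamped *.old file, old rolled files are recycled, and a fresh file
// is opened before the write.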
func (mp *metaPartition) persistDeletedInode(ino uint64, currentSize *uint64) {
if *currentSize >= DeleteInodeFileRollingSize {
fileName := fmt.Sprintf("%v.%v.%v", DeleteInodeFileExtension, time.Now().Format(log.FileNameDateFormat), "old")
if err := mp.delInodeFp.Sync(); err != nil {
log.LogErrorf("[persistDeletedInode] failed to sync delete inode file, err(%v), inode(%v)", err, ino)
return
}
mp.delInodeFp.Close()
mp.delInodeFp = nil
// NOTE: it is ok if the rename fails,
// we will re-open the file below
fileName = path.Join(mp.config.RootDir, fileName)
err := os.Rename(path.Join(mp.config.RootDir, DeleteInodeFileExtension), fileName)
if err != nil {
log.LogErrorf("[persistDeletedInode] failed to rename delete inode file, err(%v)", err)
} else {
*currentSize = 0
mp.recycleInodeDelFile()
}
if err = mp.openDeleteInodeFile(); err != nil {
log.LogErrorf("[persistDeletedInode] failed to open delete inode file, err(%v), inode(%v)", err, ino)
return
}
}
// NOTE: approximate the written size as sizeof(uint64)
*currentSize += 8
if _, err := mp.delInodeFp.WriteString(fmt.Sprintf("%v\n", ino)); err != nil {
log.LogErrorf("[persistDeletedInode] failed to persist ino(%v), err(%v)", ino, err)
return
}
}
func (mp *metaPartition) persistDeletedInodes(inos []uint64) {
log.LogDebugf("persistDeletedInodes. inos [%v]", inos)
if mp.delInodeFp == nil {
// NOTE: try to re-open the file
if err := mp.openDeleteInodeFile(); err != nil {
log.LogErrorf("[persistDeletedInodes] delete inode file is not open, err(%v), inodes(%v)", err, inos)
return
}
log.LogWarnf("[persistDeletedInodes] re-open file success")
}
info, err := mp.delInodeFp.Stat()
if err != nil {
log.LogErrorf("[persistDeletedInodes] failed to get size of delete inode file, err(%v), inodes(%v)", err, inos)
return
}
currSize := uint64(info.Size())
for _, ino := range inos {
mp.persistDeletedInode(ino, &currSize)
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/binary"
"encoding/json"
"errors"
"fmt"
"io"
"math"
"net"
"os"
"path"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft"
raftproto "github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
// Apply applies the given operational commands.
func (mp *metaPartition) Apply(command []byte, index uint64) (resp interface{}, err error) {
msg := &MetaItem{}
defer func() {
if err == nil {
mp.uploadApplyID(index)
}
}()
if err = msg.UnmarshalJson(command); err != nil {
return
}
mp.nonIdempotent.Lock()
defer mp.nonIdempotent.Unlock()
switch msg.Op {
case opFSMCreateInode:
ino := NewInode(0, 0)
if err = ino.Unmarshal(msg.V); err != nil {
return
}
if mp.config.Cursor < ino.Inode {
mp.config.Cursor = ino.Inode
}
resp = mp.fsmCreateInode(ino)
case opFSMCreateInodeQuota:
qinode := &MetaQuotaInode{}
if err = qinode.Unmarshal(msg.V); err != nil {
return
}
ino := qinode.inode
if mp.config.Cursor < ino.Inode {
mp.config.Cursor = ino.Inode
}
if len(qinode.quotaIds) > 0 {
mp.setInodeQuota(qinode.quotaIds, ino.Inode)
}
resp = mp.fsmCreateInode(ino)
if resp == proto.OpOk {
for _, quotaId := range qinode.quotaIds {
mp.mqMgr.updateUsedInfo(0, 1, quotaId)
}
}
case opFSMUnlinkInode:
ino := NewInode(0, 0)
if err = ino.Unmarshal(msg.V); err != nil {
return
}
status := mp.inodeInTx(ino.Inode)
if status != proto.OpOk {
resp = &InodeResponse{Status: status}
return
}
resp = mp.fsmUnlinkInode(ino, 0)
case opFSMUnlinkInodeOnce:
var inoOnce *InodeOnce
if inoOnce, err = InodeOnceUnmarshal(msg.V); err != nil {
return
}
ino := NewInode(inoOnce.Inode, 0)
ino.setVer(inoOnce.VerSeq)
resp = mp.fsmUnlinkInode(ino, inoOnce.UniqID)
case opFSMUnlinkInodeBatch:
inodes, err := InodeBatchUnmarshal(msg.V)
if err != nil {
return nil, err
}
resp = mp.fsmUnlinkInodeBatch(inodes)
case opFSMExtentTruncate:
ino := NewInode(0, 0)
if err = ino.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmExtentsTruncate(ino)
case opFSMCreateLinkInode:
ino := NewInode(0, 0)
if err = ino.Unmarshal(msg.V); err != nil {
return
}
status := mp.inodeInTx(ino.Inode)
if status != proto.OpOk {
resp = &InodeResponse{Status: status}
return
}
resp = mp.fsmCreateLinkInode(ino, 0)
case opFSMCreateLinkInodeOnce:
var inoOnce *InodeOnce
if inoOnce, err = InodeOnceUnmarshal(msg.V); err != nil {
return
}
ino := NewInode(inoOnce.Inode, 0)
resp = mp.fsmCreateLinkInode(ino, inoOnce.UniqID)
case opFSMEvictInode:
ino := NewInode(0, 0)
if err = ino.Unmarshal(msg.V); err != nil {
return
}
status := mp.inodeInTx(ino.Inode)
if status != proto.OpOk {
resp = &InodeResponse{Status: status}
return
}
resp = mp.fsmEvictInode(ino)
case opFSMEvictInodeBatch:
inodes, err := InodeBatchUnmarshal(msg.V)
if err != nil {
return nil, err
}
resp = mp.fsmBatchEvictInode(inodes)
case opFSMSetAttr:
req := &SetattrRequest{}
err = json.Unmarshal(msg.V, req)
if err != nil {
return
}
err = mp.fsmSetAttr(req)
case opFSMCreateDentry:
den := &Dentry{}
if err = den.Unmarshal(msg.V); err != nil {
return
}
status := mp.dentryInTx(den.ParentId, den.Name)
if status != proto.OpOk {
resp = status
return
}
resp = mp.fsmCreateDentry(den, false)
case opFSMDeleteDentry:
den := &Dentry{}
if err = den.Unmarshal(msg.V); err != nil {
return
}
status := mp.dentryInTx(den.ParentId, den.Name)
if status != proto.OpOk {
resp = status
return
}
resp = mp.fsmDeleteDentry(den, false)
case opFSMDeleteDentryBatch:
db, err := DentryBatchUnmarshal(msg.V)
if err != nil {
return nil, err
}
resp = mp.fsmBatchDeleteDentry(db)
case opFSMUpdateDentry:
den := &Dentry{}
if err = den.Unmarshal(msg.V); err != nil {
return
}
status := mp.dentryInTx(den.ParentId, den.Name)
if status != proto.OpOk {
resp = &DentryResponse{Status: status}
return
}
resp = mp.fsmUpdateDentry(den)
case opFSMUpdatePartition:
req := &UpdatePartitionReq{}
if err = json.Unmarshal(msg.V, req); err != nil {
return
}
resp, err = mp.fsmUpdatePartition(req.End)
case opFSMExtentsAdd:
ino := NewInode(0, 0)
if err = ino.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmAppendExtents(ino)
case opFSMExtentsAddWithCheck:
ino := NewInode(0, 0)
if err = ino.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmAppendExtentsWithCheck(ino, false)
case opFSMExtentSplit:
ino := NewInode(0, 0)
if err = ino.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmAppendExtentsWithCheck(ino, true)
case opFSMObjExtentsAdd:
ino := NewInode(0, 0)
if err = ino.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmAppendObjExtents(ino)
case opFSMExtentsEmpty:
ino := NewInode(0, 0)
if err = ino.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmExtentsEmpty(ino)
case opFSMClearInodeCache:
ino := NewInode(0, 0)
if err = ino.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmClearInodeCache(ino)
case opFSMSentToChan:
resp = mp.fsmSendToChan(msg.V, true)
case opFSMStoreTick:
inodeTree := mp.inodeTree.GetTree()
dentryTree := mp.dentryTree.GetTree()
extendTree := mp.extendTree.GetTree()
multipartTree := mp.multipartTree.GetTree()
txTree := mp.txProcessor.txManager.txTree.GetTree()
txRbInodeTree := mp.txProcessor.txResource.txRbInodeTree.GetTree()
txRbDentryTree := mp.txProcessor.txResource.txRbDentryTree.GetTree()
txId := mp.txProcessor.txManager.txIdAlloc.getTransactionID()
quotaRebuild := mp.mqMgr.statisticRebuildStart()
uidRebuild := mp.acucumRebuildStart()
uniqChecker := mp.uniqChecker.clone()
msg := &storeMsg{
command: opFSMStoreTick,
applyIndex: index,
txId: txId,
inodeTree: inodeTree,
dentryTree: dentryTree,
extendTree: extendTree,
multipartTree: multipartTree,
txTree: txTree,
txRbInodeTree: txRbInodeTree,
txRbDentryTree: txRbDentryTree,
quotaRebuild: quotaRebuild,
uidRebuild: uidRebuild,
uniqChecker: uniqChecker,
multiVerList: mp.GetAllVerList(),
}
log.LogDebugf("opFSMStoreTick: quotaRebuild [%v] uidRebuild [%v]", quotaRebuild, uidRebuild)
mp.storeChan <- msg
case opFSMInternalDeleteInode:
err = mp.internalDelete(msg.V)
case opFSMInternalDeleteInodeBatch:
err = mp.internalDeleteBatch(msg.V)
case opFSMInternalDelExtentFile:
err = mp.delOldExtentFile(msg.V)
case opFSMInternalDelExtentCursor:
err = mp.setExtentDeleteFileCursor(msg.V)
case opFSMSetXAttr:
var extend *Extend
if extend, err = NewExtendFromBytes(msg.V); err != nil {
return
}
err = mp.fsmSetXAttr(extend)
case opFSMRemoveXAttr:
var extend *Extend
if extend, err = NewExtendFromBytes(msg.V); err != nil {
return
}
err = mp.fsmRemoveXAttr(extend)
case opFSMUpdateXAttr:
var extend *Extend
if extend, err = NewExtendFromBytes(msg.V); err != nil {
return
}
err = mp.fsmSetXAttr(extend)
case opFSMCreateMultipart:
multipart := MultipartFromBytes(msg.V)
resp = mp.fsmCreateMultipart(multipart)
case opFSMRemoveMultipart:
multipart := MultipartFromBytes(msg.V)
resp = mp.fsmRemoveMultipart(multipart)
case opFSMAppendMultipart:
multipart := MultipartFromBytes(msg.V)
resp = mp.fsmAppendMultipart(multipart)
case opFSMSyncCursor:
cursor := binary.BigEndian.Uint64(msg.V)
if cursor > mp.config.Cursor {
mp.config.Cursor = cursor
}
case opFSMSyncTxID:
txID := binary.BigEndian.Uint64(msg.V)
if txID > mp.txProcessor.txManager.txIdAlloc.getTransactionID() {
mp.txProcessor.txManager.txIdAlloc.setTransactionID(txID)
}
case opFSMTxInit:
txInfo := proto.NewTransactionInfo(0, 0)
if err = txInfo.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmTxInit(txInfo)
case opFSMTxCreateInode:
txIno := NewTxInode(0, 0, nil)
if err = txIno.Unmarshal(msg.V); err != nil {
return
}
if mp.config.Cursor < txIno.Inode.Inode {
mp.config.Cursor = txIno.Inode.Inode
}
resp = mp.fsmTxCreateInode(txIno, []uint32{})
case opFSMTxCreateInodeQuota:
qinode := &TxMetaQuotaInode{}
if err = qinode.Unmarshal(msg.V); err != nil {
return
}
txIno := qinode.txinode
if mp.config.Cursor < txIno.Inode.Inode {
mp.config.Cursor = txIno.Inode.Inode
}
if len(qinode.quotaIds) > 0 {
mp.setInodeQuota(qinode.quotaIds, txIno.Inode.Inode)
}
resp = mp.fsmTxCreateInode(txIno, qinode.quotaIds)
if resp == proto.OpOk {
for _, quotaId := range qinode.quotaIds {
mp.mqMgr.updateUsedInfo(0, 1, quotaId)
}
}
case opFSMTxCreateDentry:
txDen := NewTxDentry(0, "", 0, 0, nil, nil)
if err = txDen.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmTxCreateDentry(txDen)
case opFSMTxSetState:
req := &proto.TxSetStateRequest{}
if err = json.Unmarshal(msg.V, req); err != nil {
return
}
resp = mp.fsmTxSetState(req)
case opFSMTxCommitRM:
req := &proto.TransactionInfo{}
if err = req.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmTxCommitRM(req)
case opFSMTxRollbackRM:
req := &proto.TransactionInfo{}
if err = req.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmTxRollbackRM(req)
case opFSMTxCommit:
req := &proto.TxApplyRequest{}
if err = json.Unmarshal(msg.V, req); err != nil {
return
}
resp = mp.fsmTxCommit(req.TxID)
case opFSMTxRollback:
req := &proto.TxApplyRequest{}
if err = json.Unmarshal(msg.V, req); err != nil {
return
}
resp = mp.fsmTxRollback(req.TxID)
case opFSMTxDelete:
req := &proto.TxApplyRequest{}
if err = json.Unmarshal(msg.V, req); err != nil {
return
}
resp = mp.fsmTxDelete(req.TxID)
case opFSMTxDeleteDentry:
txDen := NewTxDentry(0, "", 0, 0, nil, nil)
if err = txDen.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmTxDeleteDentry(txDen)
case opFSMTxUnlinkInode:
txIno := NewTxInode(0, 0, nil)
if err = txIno.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmTxUnlinkInode(txIno)
case opFSMTxUpdateDentry:
// txDen := NewTxDentry(0, "", 0, 0, nil)
txUpdateDen := NewTxUpdateDentry(nil, nil, nil)
if err = txUpdateDen.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmTxUpdateDentry(txUpdateDen)
case opFSMTxCreateLinkInode:
txIno := NewTxInode(0, 0, nil)
if err = txIno.Unmarshal(msg.V); err != nil {
return
}
resp = mp.fsmTxCreateLinkInode(txIno)
case opFSMSetInodeQuotaBatch:
req := &proto.BatchSetMetaserverQuotaReuqest{}
if err = json.Unmarshal(msg.V, req); err != nil {
return
}
resp = mp.fsmSetInodeQuotaBatch(req)
case opFSMDeleteInodeQuotaBatch:
req := &proto.BatchDeleteMetaserverQuotaReuqest{}
if err = json.Unmarshal(msg.V, req); err != nil {
return
}
resp = mp.fsmDeleteInodeQuotaBatch(req)
case opFSMUniqID:
resp = mp.fsmUniqID(msg.V)
case opFSMUniqCheckerEvict:
req := &fsmEvictUniqCheckerRequest{}
if err = json.Unmarshal(msg.V, req); err != nil {
return
}
err = mp.fsmUniqCheckerEvict(req)
case opFSMVersionOp:
err = mp.fsmVersionOp(msg.V)
default:
// do nothing
}
return
}
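// runVersionOp consumes version-update payloads from verUpdateChan and submits
// them to raft as opFSMVersionOp commands until the partition is stopped.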
func (mp *metaPartition) runVersionOp() {
mp.verUpdateChan = make(chan []byte, 100)
for {
select {
case verData := <-mp.verUpdateChan:
mp.submit(opFSMVersionOp, verData)
case <-mp.stopC:
log.LogWarnf("runVersionOp exit!")
return
}
}
}
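// fsmVersionOp applies a volume version operation (CreateVersionPrepare,
// CreateVersionCommit, DeleteVersion or SyncBatchVersionList) to the
// partition's multi-version list and updates mp.verSeq accordingly.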
func (mp *metaPartition) fsmVersionOp(reqData []byte) (err error) {
mp.multiVersionList.RWLock.Lock()
defer mp.multiVersionList.RWLock.Unlock()
var opData VerOpData
if err = json.Unmarshal(reqData, &opData); err != nil {
log.LogErrorf("action[fsmVersionOp] mp[%v] unmarshal error %v", mp.config.PartitionId, err)
return
}
log.LogInfof("action[fsmVersionOp] volname [%v] mp[%v] seq [%v], op [%v]", mp.config.VolName, mp.config.PartitionId, opData.VerSeq, opData.Op)
if opData.Op == proto.CreateVersionPrepare {
cnt := len(mp.multiVersionList.VerList)
if cnt > 0 {
lastVersion := mp.multiVersionList.VerList[cnt-1]
if lastVersion.Ver > opData.VerSeq {
log.LogWarnf("action[HandleVersionOp] createVersionPrepare reqeust seq [%v] less than last exist snapshot seq [%v]", opData.VerSeq, lastVersion.Ver)
return
} else if lastVersion.Ver == opData.VerSeq {
log.LogWarnf("action[HandleVersionOp] CreateVersionPrepare request seq [%v] already exist status [%v]", opData.VerSeq, lastVersion.Status)
return
}
}
newVer := &proto.VolVersionInfo{
Status: proto.VersionPrepare,
Ver: opData.VerSeq,
}
mp.verSeq = opData.VerSeq
mp.multiVersionList.VerList = append(mp.multiVersionList.VerList, newVer)
log.LogInfof("action[fsmVersionOp] updateVerList mp[%v] seq [%v], op [%v], seqArray size %v", mp.config.PartitionId, opData.VerSeq, opData.Op, len(mp.multiVersionList.VerList))
} else if opData.Op == proto.CreateVersionCommit {
cnt := len(mp.multiVersionList.VerList)
if cnt > 0 {
if mp.multiVersionList.VerList[cnt-1].Ver > opData.VerSeq {
log.LogWarnf("action[fsmVersionOp] mp[%v] reqeust seq [%v] less than last exist snapshot seq [%v]", mp.config.PartitionId,
opData.VerSeq, mp.multiVersionList.VerList[cnt-1].Ver)
return
}
if mp.multiVersionList.VerList[cnt-1].Ver == opData.VerSeq {
if mp.multiVersionList.VerList[cnt-1].Status != proto.VersionPrepare {
log.LogWarnf("action[fsmVersionOp] mp[%v] reqeust seq [%v] Equal last exist snapshot seq [%v] but with status [%v]", mp.config.PartitionId,
mp.multiVersionList.VerList[cnt-1].Ver, opData.VerSeq, mp.multiVersionList.VerList[cnt-1].Status)
}
mp.multiVersionList.VerList[cnt-1].Status = proto.VersionNormal
return
}
}
newVer := &proto.VolVersionInfo{
Status: proto.VersionNormal,
Ver: opData.VerSeq,
}
mp.verSeq = opData.VerSeq
mp.multiVersionList.VerList = append(mp.multiVersionList.VerList, newVer)
log.LogInfof("action[fsmVersionOp] updateVerList mp[%v] seq [%v], op [%v], seqArray size %v", mp.config.PartitionId, opData.VerSeq, opData.Op, len(mp.multiVersionList.VerList))
} else if opData.Op == proto.DeleteVersion {
for i, ver := range mp.multiVersionList.VerList {
if i == len(mp.multiVersionList.VerList)-1 {
log.LogWarnf("action[fsmVersionOp] mp[%v] seq [%v], op [%v], seqArray size %v newest ver [%v] reque ver [%v]",
mp.config.PartitionId, opData.VerSeq, opData.Op, len(mp.multiVersionList.VerList), ver.Ver, opData.VerSeq)
break
}
if ver.Ver == opData.VerSeq {
log.LogInfof("action[fsmVersionOp] updateVerList mp[%v] seq [%v], op [%v], VerList %v",
mp.config.PartitionId, opData.VerSeq, opData.Op, mp.multiVersionList.VerList)
// mp.multiVersionList = append(mp.multiVersionList[:i], mp.multiVersionList[i+1:]...)
mp.multiVersionList.VerList = append(mp.multiVersionList.VerList[:i], mp.multiVersionList.VerList[i+1:]...)
log.LogInfof("action[fsmVersionOp] updateVerList mp[%v] seq [%v], op [%v], VerList %v",
mp.config.PartitionId, opData.VerSeq, opData.Op, mp.multiVersionList.VerList)
break
}
}
} else if opData.Op == proto.SyncBatchVersionList {
log.LogInfof("action[fsmVersionOp] mp[%v] before update:with seq [%v] verlist %v opData.VerList %v",
mp.config.PartitionId, mp.verSeq, mp.multiVersionList.VerList, opData.VerList)
lastVer := mp.multiVersionList.GetLastVer()
for _, info := range opData.VerList {
if info.Ver > lastVer {
mp.multiVersionList.VerList = append(mp.multiVersionList.VerList, info)
log.LogInfof("action[fsmVersionOp] updateVerList mp[%v] after update:with seq [%v] verlist %v",
mp.config.PartitionId, mp.verSeq, mp.multiVersionList.VerList)
}
}
mp.verSeq = mp.multiVersionList.GetLastVer()
log.LogInfof("action[fsmVersionOp] updateVerList mp[%v] after update:with seq [%v] verlist %v",
mp.config.PartitionId, mp.verSeq, mp.multiVersionList.VerList)
} else {
log.LogErrorf("action[fsmVersionOp] mp[%v] with seq [%v] process op type %v seq [%v] not found",
mp.config.PartitionId, mp.verSeq, opData.Op, opData.VerSeq)
}
return
}
// ApplyMemberChange apply changes to the raft member.
func (mp *metaPartition) ApplyMemberChange(confChange *raftproto.ConfChange, index uint64) (resp interface{}, err error) {
defer func() {
if err == nil {
mp.uploadApplyID(index)
}
}()
// change memory status
var (
updated bool
)
switch confChange.Type {
case raftproto.ConfAddNode:
req := &proto.AddMetaPartitionRaftMemberRequest{}
if err = json.Unmarshal(confChange.Context, req); err != nil {
return
}
updated, err = mp.confAddNode(req, index)
case raftproto.ConfRemoveNode:
req := &proto.RemoveMetaPartitionRaftMemberRequest{}
if err = json.Unmarshal(confChange.Context, req); err != nil {
return
}
updated, err = mp.confRemoveNode(req, index)
case raftproto.ConfUpdateNode:
// updated, err = mp.confUpdateNode(req, index)
default:
// do nothing
}
if err != nil {
return
}
if updated {
mp.config.sortPeers()
if err = mp.persistMetadata(); err != nil {
log.LogErrorf("action[ApplyMemberChange] err[%v].", err)
return
}
}
return
}
// Snapshot returns the snapshot of the current meta partition.
func (mp *metaPartition) Snapshot() (snap raftproto.Snapshot, err error) {
snap, err = newMetaItemIterator(mp)
return
}
func (mp *metaPartition) ApplySnapshot(peers []raftproto.Peer, iter raftproto.SnapIterator) (err error) {
var (
data []byte
index int
appIndexID uint64
txID uint64
uniqID uint64
cursor uint64
inodeTree = NewBtree()
dentryTree = NewBtree()
extendTree = NewBtree()
multipartTree = NewBtree()
txTree = NewBtree()
txRbInodeTree = NewBtree()
txRbDentryTree = NewBtree()
uniqChecker = newUniqChecker()
verList []*proto.VolVersionInfo
)
blockUntilStoreSnapshot := func() {
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
log.LogWarnf("ApplySnapshot: start to block until store snapshot to disk, mp[%v], appid %d", mp.config.PartitionId, appIndexID)
start := time.Now()
for {
select {
case <-ticker.C:
if time.Since(start) > time.Minute*20 {
msg := fmt.Sprintf("ApplySnapshot: wait store snapshot timeout after 20 minutes, mp %d, appId %d, storeId %d",
mp.config.PartitionId, appIndexID, mp.storedApplyId)
log.LogErrorf(msg)
err = fmt.Errorf(msg)
return
}
msg := fmt.Sprintf("ApplySnapshot: start check storedApplyId, mp %d appId %d, storeAppId %d, cost %s",
mp.config.PartitionId, appIndexID, mp.storedApplyId, time.Since(start).String())
if time.Since(start) > time.Minute {
log.LogWarnf("still block after one minute, msg %s", msg)
} else {
log.LogInfo(msg)
}
if mp.storedApplyId >= appIndexID {
log.LogWarnf("ApplySnapshot: store snapshot success, msg %s", msg)
return
}
case <-mp.stopC:
log.LogWarnf("ApplySnapshot: revice stop signal, exit now, partition(%d), applyId(%d)", mp.config.PartitionId, mp.applyID)
err = errors.New("server has been shutdown when block")
return
}
}
}
defer func() {
if err == io.EOF {
mp.applyID = appIndexID
mp.config.UniqId = uniqID
mp.txProcessor.txManager.txIdAlloc.setTransactionID(txID)
mp.inodeTree = inodeTree
mp.dentryTree = dentryTree
mp.extendTree = extendTree
mp.multipartTree = multipartTree
mp.config.Cursor = cursor
mp.txProcessor.txManager.txTree = txTree
mp.txProcessor.txResource.txRbInodeTree = txRbInodeTree
mp.txProcessor.txResource.txRbDentryTree = txRbDentryTree
mp.uniqChecker = uniqChecker
mp.multiVersionList.VerList = make([]*proto.VolVersionInfo, len(verList))
copy(mp.multiVersionList.VerList, verList)
mp.verSeq = mp.multiVersionList.GetLastVer()
log.LogInfof("mp[%v] updateVerList (%v) seq [%v]", mp.config.PartitionId, mp.multiVersionList.VerList, mp.verSeq)
err = nil
// store message
mp.storeChan <- &storeMsg{
command: opFSMStoreTick,
applyIndex: mp.applyID,
txId: mp.txProcessor.txManager.txIdAlloc.getTransactionID(),
inodeTree: mp.inodeTree.GetTree(),
dentryTree: mp.dentryTree.GetTree(),
extendTree: mp.extendTree.GetTree(),
multipartTree: mp.multipartTree.GetTree(),
txTree: mp.txProcessor.txManager.txTree.GetTree(),
txRbInodeTree: mp.txProcessor.txResource.txRbInodeTree.GetTree(),
txRbDentryTree: mp.txProcessor.txResource.txRbDentryTree.GetTree(),
uniqChecker: uniqChecker.clone(),
multiVerList: mp.GetVerList(),
}
select {
case mp.extReset <- struct{}{}:
log.LogDebugf("ApplySnapshot: finish with EOF: partitionID(%v) applyID(%v), txID(%v), uniqID(%v), cursor(%v)",
mp.config.PartitionId, mp.applyID, mp.txProcessor.txManager.txIdAlloc.getTransactionID(), mp.config.UniqId, mp.config.Cursor)
blockUntilStoreSnapshot()
return
case <-mp.stopC:
log.LogWarnf("ApplySnapshot: revice stop signal, exit now, partition(%d), applyId(%d)", mp.config.PartitionId, mp.applyID)
err = errors.New("server has been shutdown")
return
}
}
log.LogErrorf("ApplySnapshot: stop with error: partitionID(%v) err(%v)", mp.config.PartitionId, err)
}()
leaderSnapFormatVer := uint32(math.MaxUint32)
for {
data, err = iter.Next()
if err != nil {
return
}
if index == 0 {
appIndexID = binary.BigEndian.Uint64(data)
log.LogDebugf("ApplySnapshot: partitionID(%v), temporary uint64 appIndexID:%v", mp.config.PartitionId, appIndexID)
}
snap := NewMetaItem(0, nil, nil)
if err = snap.UnmarshalBinary(data); err != nil {
if index == 0 {
// for compatibility: if the leader sends a snapshot in format version_0, index=0 is the applyId as a raw uint64,
// which makes snap.UnmarshalBinary fail, so just skip index=0 and continue with the other fields
log.LogInfof("ApplySnapshot: snap.UnmarshalBinary failed in index=0, partitionID(%v), assuming snapshot format version_0",
mp.config.PartitionId)
index++
leaderSnapFormatVer = SnapFormatVersion_0
continue
}
log.LogInfof("ApplySnapshot: snap.UnmarshalBinary failed, partitionID(%v) index(%v)", mp.config.PartitionId, index)
err = errors.New("unmarshal snap data failed")
return
}
if index == 0 {
if snap.Op != opFSMSnapFormatVersion {
// check whether the snapshot format matches, if snap.UnmarshalBinary has no err for index 0, it should be opFSMSnapFormatVersion
err = fmt.Errorf("ApplySnapshot: snapshot format not match, partitionID(%v), index:%v, expect snap.Op:%v, actual snap.Op:%v",
mp.config.PartitionId, index, opFSMSnapFormatVersion, snap.Op)
log.LogWarn(err.Error())
return
}
// check whether the snapshot format version number matches
leaderSnapFormatVer = binary.BigEndian.Uint32(snap.V)
if leaderSnapFormatVer != mp.manager.metaNode.raftSyncSnapFormatVersion {
log.LogWarnf("ApplySnapshot: snapshot format not match, partitionID(%v), index:%v, expect ver:%v, actual ver:%v",
mp.config.PartitionId, index, mp.manager.metaNode.raftSyncSnapFormatVersion, leaderSnapFormatVer)
}
index++
continue
}
index++
switch snap.Op {
case opFSMApplyId:
appIndexID = binary.BigEndian.Uint64(snap.V)
log.LogDebugf("ApplySnapshot: partitionID(%v) appIndexID:%v", mp.config.PartitionId, appIndexID)
case opFSMTxId:
txID = binary.BigEndian.Uint64(snap.V)
log.LogDebugf("ApplySnapshot: partitionID(%v) txID:%v", mp.config.PartitionId, txID)
case opFSMCursor:
cursor = binary.BigEndian.Uint64(snap.V)
log.LogDebugf("ApplySnapshot: partitionID(%v) cursor:%v", mp.config.PartitionId, cursor)
case opFSMUniqIDSnap:
uniqID = binary.BigEndian.Uint64(snap.V)
log.LogDebugf("ApplySnapshot: partitionID(%v) uniqId:%v", mp.config.PartitionId, uniqID)
case opFSMCreateInode:
ino := NewInode(0, 0)
// TODO Unhandled errors
ino.UnmarshalKey(snap.K)
ino.UnmarshalValue(snap.V)
if cursor < ino.Inode {
cursor = ino.Inode
}
inodeTree.ReplaceOrInsert(ino, true)
log.LogDebugf("ApplySnapshot: create inode: partitonID(%v) inode[%v].", mp.config.PartitionId, ino)
case opFSMCreateDentry:
dentry := &Dentry{}
if err = dentry.UnmarshalKey(snap.K); err != nil {
return
}
if err = dentry.UnmarshalValue(snap.V); err != nil {
return
}
dentryTree.ReplaceOrInsert(dentry, true)
log.LogDebugf("ApplySnapshot: create dentry: partitionID(%v) dentry(%v)", mp.config.PartitionId, dentry)
case opFSMSetXAttr:
var extend *Extend
if extend, err = NewExtendFromBytes(snap.V); err != nil {
return
}
extendTree.ReplaceOrInsert(extend, true)
log.LogDebugf("ApplySnapshot: set extend attributes: partitionID(%v) extend(%v)",
mp.config.PartitionId, extend)
case opFSMCreateMultipart:
multipart := MultipartFromBytes(snap.V)
multipartTree.ReplaceOrInsert(multipart, true)
log.LogDebugf("ApplySnapshot: create multipart: partitionID(%v) multipart(%v)", mp.config.PartitionId, multipart)
case opFSMTxSnapshot:
txInfo := proto.NewTransactionInfo(0, proto.TxTypeUndefined)
txInfo.Unmarshal(snap.V)
txTree.ReplaceOrInsert(txInfo, true)
log.LogDebugf("ApplySnapshot: create transaction: partitionID(%v) txInfo(%v)", mp.config.PartitionId, txInfo)
case opFSMTxRbInodeSnapshot:
txRbInode := NewTxRollbackInode(nil, []uint32{}, nil, 0)
txRbInode.Unmarshal(snap.V)
txRbInodeTree.ReplaceOrInsert(txRbInode, true)
log.LogDebugf("ApplySnapshot: create txRbInode: partitionID(%v) txRbinode[%v]", mp.config.PartitionId, txRbInode)
case opFSMTxRbDentrySnapshot:
txRbDentry := NewTxRollbackDentry(nil, nil, 0)
txRbDentry.Unmarshal(snap.V)
txRbDentryTree.ReplaceOrInsert(txRbDentry, true)
log.LogDebugf("ApplySnapshot: create txRbDentry: partitionID(%v) txRbDentry(%v)", mp.config.PartitionId, txRbDentry)
case opFSMVerListSnapShot:
json.Unmarshal(snap.V, &verList)
log.LogDebugf("ApplySnapshot: create verList: partitionID(%v) snap.V(%v) verList(%v)", mp.config.PartitionId, snap.V, verList)
case opExtentFileSnapshot:
fileName := string(snap.K)
fileName = path.Join(mp.config.RootDir, fileName)
if err = os.WriteFile(fileName, snap.V, 0o644); err != nil {
log.LogErrorf("ApplySnapshot: write snap extent delete file fail: partitionID(%v) err(%v)",
mp.config.PartitionId, err)
}
log.LogDebugf("ApplySnapshot: write snap extent delete file: partitonID(%v) filename(%v).",
mp.config.PartitionId, fileName)
case opFSMUniqCheckerSnap:
if err = uniqChecker.UnMarshal(snap.V); err != nil {
log.LogErrorf("ApplyUniqChecker: write snap uniqChecker fail")
return
}
log.LogDebugf("ApplySnapshot: write snap uniqChecker")
default:
if leaderSnapFormatVer != math.MaxUint32 && leaderSnapFormatVer > mp.manager.metaNode.raftSyncSnapFormatVersion {
log.LogWarnf("ApplySnapshot: unknown op=%d, leaderSnapFormatVer:%v, mySnapFormatVer:%v, skip it",
snap.Op, leaderSnapFormatVer, mp.manager.metaNode.raftSyncSnapFormatVersion)
} else {
err = fmt.Errorf("unknown Op=%d", snap.Op)
return
}
}
}
}
// HandleFatalEvent handles the fatal errors.
func (mp *metaPartition) HandleFatalEvent(err *raft.FatalError) {
// Panic while fatal event happen.
exporter.Warning(fmt.Sprintf("action[HandleFatalEvent] err[%v].", err))
log.LogFatalf("action[HandleFatalEvent] err[%v].", err)
panic(err.Err)
}
// HandleLeaderChange handles the leader changes.
func (mp *metaPartition) HandleLeaderChange(leader uint64) {
exporter.Warning(fmt.Sprintf("metaPartition(%v) changeLeader to (%v)", mp.config.PartitionId, leader))
if mp.config.NodeId == leader {
localIp := mp.manager.metaNode.localAddr
if localIp == "" {
localIp = "127.0.0.1"
}
conn, err := net.DialTimeout("tcp", net.JoinHostPort(localIp, serverPort), time.Second)
if err != nil {
log.LogErrorf(fmt.Sprintf("HandleLeaderChange serverPort not exsit ,error %v", err))
exporter.Warning(fmt.Sprintf("mp[%v] HandleLeaderChange serverPort not exsit ,error %v", mp.config.PartitionId, err))
go mp.raftPartition.TryToLeader(mp.config.PartitionId)
return
}
log.LogDebugf("[metaPartition] HandleLeaderChange close conn %v, nodeId: %v, leader: %v", serverPort, mp.config.NodeId, leader)
exporter.Warning(fmt.Sprintf("[metaPartition]mp[%v] HandleLeaderChange close conn %v, nodeId: %v, leader: %v", mp.config.PartitionId, serverPort, mp.config.NodeId, leader))
conn.(*net.TCPConn).SetLinger(0)
conn.Close()
}
if mp.config.NodeId != leader {
log.LogDebugf("[metaPartition] pid: %v HandleLeaderChange become unleader nodeId: %v, leader: %v", mp.config.PartitionId, mp.config.NodeId, leader)
exporter.Warning(fmt.Sprintf("[metaPartition] pid: %v HandleLeaderChange become unleader nodeId: %v, leader: %v", mp.config.PartitionId, mp.config.NodeId, leader))
mp.storeChan <- &storeMsg{
command: stopStoreTick,
}
return
}
mp.storeChan <- &storeMsg{
command: startStoreTick,
}
log.LogDebugf("[metaPartition] pid: %v HandleLeaderChange become leader conn %v, nodeId: %v, leader: %v", mp.config.PartitionId, serverPort, mp.config.NodeId, leader)
exporter.Warning(fmt.Sprintf("[metaPartition] pid: %v HandleLeaderChange become leader conn %v, nodeId: %v, leader: %v", mp.config.PartitionId, serverPort, mp.config.NodeId, leader))
if mp.config.Start == 0 && mp.config.Cursor == 0 {
id, err := mp.nextInodeID()
if err != nil {
log.LogFatalf("[HandleLeaderChange] init root inode id: %s.", err.Error())
exporter.Warning(fmt.Sprintf("[HandleLeaderChange] pid %v init root inode id: %s.", mp.config.PartitionId, err.Error()))
}
ino := NewInode(id, proto.Mode(os.ModePerm|os.ModeDir))
go mp.initInode(ino)
}
}
// submit marshals the given operation (op code and request data) and submits it to the raft store.
func (mp *metaPartition) submit(op uint32, data []byte) (resp interface{}, err error) {
log.LogDebugf("submit. op [%v]", op)
snap := NewMetaItem(0, nil, nil)
snap.Op = op
if data != nil {
snap.V = data
}
cmd, err := snap.MarshalJson()
if err != nil {
return
}
// submit to the raft store
resp, err = mp.raftPartition.Submit(cmd)
log.LogDebugf("submit. op [%v] done", op)
return
}
func (mp *metaPartition) uploadApplyID(applyId uint64) {
atomic.StoreUint64(&mp.applyID, applyId)
}
func (mp *metaPartition) getApplyID() (applyId uint64) {
return atomic.LoadUint64(&mp.applyID)
}
func (mp *metaPartition) getCommittedID() (committedId uint64) {
status := mp.raftPartition.Status()
return status.Commit
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/binary"
"encoding/json"
"fmt"
"io/ioutil"
"os"
"path"
"strings"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
func (mp *metaPartition) initInode(ino *Inode) {
for {
time.Sleep(10 * time.Nanosecond)
select {
case <-mp.stopC:
return
default:
// check first root inode
if mp.hasInode(ino) {
return
}
if !mp.raftPartition.IsRaftLeader() {
continue
}
// qinode := &MetaQuotaInode{
// inode: ino,
// quotaIds: make([]uint32, 0, 0),
// }
// data, err := qinode.Marshal()
// if err != nil {
// log.LogFatalf("[initInode] marshal: %s", err.Error())
// }
data, err := ino.Marshal()
if err != nil {
log.LogFatalf("[initInode] marshal: %s", err.Error())
}
// put first root inode
resp, err := mp.submit(opFSMCreateInode, data)
if err != nil {
log.LogFatalf("[initInode] raft sync: %s", err.Error())
}
p := &Packet{}
p.ResultCode = resp.(uint8)
log.LogDebugf("[initInode] raft sync: response status = %v.",
p.GetResultMsg())
return
}
}
}
// Not implemented.
func (mp *metaPartition) decommissionPartition() (err error) {
return
}
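// fsmUpdatePartition updates the end of the partition's inode range. It returns
// OpAgain if the new end is smaller than the current cursor, and OpDiskErr if
// persisting the metadata fails; in both cases the old end is restored.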
func (mp *metaPartition) fsmUpdatePartition(end uint64) (status uint8,
err error) {
status = proto.OpOk
oldEnd := mp.config.End
mp.config.End = end
if end < mp.config.Cursor {
status = proto.OpAgain
mp.config.End = oldEnd
return
}
if err = mp.PersistMetadata(); err != nil {
status = proto.OpDiskErr
mp.config.End = oldEnd
}
return
}
func (mp *metaPartition) confAddNode(req *proto.AddMetaPartitionRaftMemberRequest, index uint64) (updated bool, err error) {
var (
heartbeatPort int
replicaPort int
)
if heartbeatPort, replicaPort, err = mp.getRaftPort(); err != nil {
return
}
addPeer := false
for _, peer := range mp.config.Peers {
if peer.ID == req.AddPeer.ID {
addPeer = true
break
}
}
updated = !addPeer
if !updated {
return
}
mp.config.Peers = append(mp.config.Peers, req.AddPeer)
addr := strings.Split(req.AddPeer.Addr, ":")[0]
mp.config.RaftStore.AddNodeWithPort(req.AddPeer.ID, addr, heartbeatPort, replicaPort)
return
}
func (mp *metaPartition) confRemoveNode(req *proto.RemoveMetaPartitionRaftMemberRequest, index uint64) (updated bool, err error) {
var canRemoveSelf bool
if canRemoveSelf, err = mp.canRemoveSelf(); err != nil {
return
}
peerIndex := -1
data, _ := json.Marshal(req)
log.LogInfof("Start RemoveRaftNode PartitionID(%v) nodeID(%v) do RaftLog (%v) ",
req.PartitionId, mp.config.NodeId, string(data))
for i, peer := range mp.config.Peers {
if peer.ID == req.RemovePeer.ID {
updated = true
peerIndex = i
break
}
}
if !updated {
log.LogInfof("NoUpdate RemoveRaftNode PartitionID(%v) nodeID(%v) do RaftLog (%v) ",
req.PartitionId, mp.config.NodeId, string(data))
return
}
mp.config.Peers = append(mp.config.Peers[:peerIndex], mp.config.Peers[peerIndex+1:]...)
if mp.config.NodeId == req.RemovePeer.ID && !mp.isLoadingMetaPartition && canRemoveSelf {
mp.Stop()
mp.DeleteRaft()
mp.manager.deletePartition(mp.GetBaseConfig().PartitionId)
os.RemoveAll(mp.config.RootDir)
updated = false
}
log.LogInfof("Fininsh RemoveRaftNode PartitionID(%v) nodeID(%v) do RaftLog (%v) ",
req.PartitionId, mp.config.NodeId, string(data))
return
}
func (mp *metaPartition) delOldExtentFile(buf []byte) (err error) {
fileName := string(buf)
log.LogWarnf("[delOldExtentFile] del extent file(%s), mp[%v]", fileName, mp.config.PartitionId)
infos, err := ioutil.ReadDir(mp.config.RootDir)
if err != nil {
return
}
infos = sortDelExtFileInfo(infos)
tgtIdx := getDelExtFileIdx(fileName)
for _, f := range infos {
idx := getDelExtFileIdx(f.Name())
if idx > tgtIdx {
break
}
log.LogWarnf("[delOldExtentFile] del extent file(%s), mp[%v]", f.Name(), mp.config.PartitionId)
os.Remove(path.Join(mp.config.RootDir, f.Name()))
}
return
}
func (mp *metaPartition) setExtentDeleteFileCursor(buf []byte) (err error) {
str := string(buf)
var (
fileName string
cursor int64
)
_, err = fmt.Sscanf(str, "%s %d", &fileName, &cursor)
log.LogInfof("[setExtentDeleteFileCursor] &fileName_&cursor(%s), mp[%v]", str, mp.config.PartitionId)
if err != nil {
return
}
fp, err := os.OpenFile(path.Join(mp.config.RootDir, fileName), os.O_CREATE|os.O_RDWR,
0o644)
if err != nil {
log.LogErrorf("[setExtentDeleteFileCursor] openFile %s failed: %s",
fileName, err.Error())
return
}
if err = binary.Write(fp, binary.BigEndian, cursor); err != nil {
log.LogErrorf("[setExtentDeleteFileCursor] write file %s cursor"+
" failed: %s", fileName, err.Error())
}
// TODO Unhandled errors
fp.Close()
return
}
func (mp *metaPartition) CanRemoveRaftMember(peer proto.Peer) error {
downReplicas := mp.config.RaftStore.RaftServer().GetDownReplicas(mp.config.PartitionId)
exists := false
for _, p := range mp.config.Peers {
if p.ID == peer.ID {
exists = true
break
}
}
if !exists {
return nil
}
hasDownReplicasExcludePeer := make([]uint64, 0)
for _, nodeID := range downReplicas {
if nodeID.NodeID == peer.ID {
continue
}
hasDownReplicasExcludePeer = append(hasDownReplicasExcludePeer, nodeID.NodeID)
}
sumReplicas := len(mp.config.Peers)
if sumReplicas%2 == 1 {
if sumReplicas-len(hasDownReplicasExcludePeer) > (sumReplicas/2 + 1) {
return nil
}
} else {
if sumReplicas-len(hasDownReplicasExcludePeer) >= (sumReplicas/2 + 1) {
return nil
}
}
return fmt.Errorf("downReplicas(%v) too much,so donnot offline (%v)", downReplicas, peer)
}
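// quorumAllowsRemoval is a minimal illustrative sketch (a hypothetical helper,
// not referenced by the code above) of the majority check performed in
// CanRemoveRaftMember: a peer may be removed only while the replicas not
// reported down (the peer itself excluded from the down list) still amount to
// a majority of the configured replica set, e.g. 3 of 3 for a 3-replica set
// and at least 3 of 4 for a 4-replica set.
func quorumAllowsRemoval(sumReplicas, downReplicasExcludingPeer int) bool {
alive := sumReplicas - downReplicasExcludingPeer
if sumReplicas%2 == 1 {
// odd-sized replica set: strictly more than half+1 must remain up
return alive > sumReplicas/2+1
}
// even-sized replica set: at least half+1 must remain up
return alive >= sumReplicas/2+1
}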
func (mp *metaPartition) IsEquareCreateMetaPartitionRequst(request *proto.CreateMetaPartitionRequest) (err error) {
if len(mp.config.Peers) != len(request.Members) {
return fmt.Errorf("Exsit unavali Partition(%v) partitionHosts(%v) requestHosts(%v)", mp.config.PartitionId, mp.config.Peers, request.Members)
}
if mp.config.Start != request.Start || mp.config.End != request.End {
return fmt.Errorf("Exsit unavali Partition(%v) range(%v-%v) requestRange(%v-%v)", mp.config.PartitionId, mp.config.Start, mp.config.End, request.Start, request.End)
}
for index, peer := range mp.config.Peers {
requestPeer := request.Members[index]
if requestPeer.ID != peer.ID || requestPeer.Addr != peer.Addr {
return fmt.Errorf("Exsit unavali Partition(%v) partitionHosts(%v) requestHosts(%v)", mp.config.PartitionId, mp.config.Peers, request.Members)
}
}
if mp.config.VolName != request.VolName {
return fmt.Errorf("Exsit unavali Partition(%v) VolName(%v) requestVolName(%v)", mp.config.PartitionId, mp.config.VolName, request.VolName)
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"strings"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/btree"
"github.com/cubefs/cubefs/util/log"
)
type DentryResponse struct {
Status uint8
Msg *Dentry
}
func NewDentryResponse() *DentryResponse {
return &DentryResponse{
Msg: &Dentry{},
}
}
func (mp *metaPartition) fsmTxCreateDentry(txDentry *TxDentry) (status uint8) {
done := mp.txProcessor.txManager.txInRMDone(txDentry.TxInfo.TxID)
if done {
log.LogWarnf("fsmTxCreateDentry: tx is already finish. txId %s", txDentry.TxInfo.TxID)
status = proto.OpTxInfoNotExistErr
return
}
txDI := proto.NewTxDentryInfo("", txDentry.Dentry.ParentId, txDentry.Dentry.Name, 0)
txDenInfo, ok := txDentry.TxInfo.TxDentryInfos[txDI.GetKey()]
if !ok {
status = proto.OpTxDentryInfoNotExistErr
return
}
rbDentry := NewTxRollbackDentry(txDentry.Dentry, txDenInfo, TxDelete)
status = mp.txProcessor.txResource.addTxRollbackDentry(rbDentry)
if status == proto.OpExistErr {
return proto.OpOk
}
if status != proto.OpOk {
return
}
defer func() {
if status != proto.OpOk {
mp.txProcessor.txResource.deleteTxRollbackDentry(txDenInfo.ParentId, txDenInfo.Name, txDenInfo.TxID)
}
}()
return mp.fsmCreateDentry(txDentry.Dentry, false)
}
// Insert a dentry into the dentry tree.
func (mp *metaPartition) fsmCreateDentry(dentry *Dentry,
forceUpdate bool) (status uint8) {
status = proto.OpOk
var parIno *Inode
if !forceUpdate {
item := mp.inodeTree.CopyGet(NewInode(dentry.ParentId, 0))
if item == nil {
log.LogErrorf("action[fsmCreateDentry] mp[%v] ParentId [%v] get nil, dentry name [%v], inode[%v]", mp.config.PartitionId, dentry.ParentId, dentry.Name, dentry.Inode)
status = proto.OpNotExistErr
return
}
parIno = item.(*Inode)
if parIno.ShouldDelete() {
log.LogErrorf("action[fsmCreateDentry] mp[%v] ParentId [%v] get [%v] but should del, dentry name [%v], inode[%v]", mp.config.PartitionId, dentry.ParentId, parIno, dentry.Name, dentry.Inode)
status = proto.OpNotExistErr
return
}
if !proto.IsDir(parIno.Type) {
log.LogErrorf("action[fsmCreateDentry] mp[%v] ParentId [%v] get [%v] but should del, dentry name [%v], inode[%v]", mp.config.PartitionId, dentry.ParentId, parIno, dentry.Name, dentry.Inode)
status = proto.OpArgMismatchErr
return
}
}
if item, ok := mp.dentryTree.ReplaceOrInsert(dentry, false); !ok {
// do not allow directories and files to overwrite each
// other when renaming
d := item.(*Dentry)
if d.isDeleted() {
log.LogDebugf("action[fsmCreateDentry] mp[%v] newest dentry %v be set deleted flag", mp.config.PartitionId, d)
d.Inode = dentry.Inode
if d.getVerSeq() == dentry.getVerSeq() {
d.setVerSeq(dentry.getSeqFiled())
} else {
if d.getSnapListLen() > 0 && d.multiSnap.dentryList[0].isDeleted() {
d.setVerSeq(dentry.getSeqFiled())
} else {
d.addVersion(dentry.getSeqFiled())
}
}
d.Type = dentry.Type
d.ParentId = dentry.ParentId
log.LogDebugf("action[fsmCreateDentry.ver] mp[%v] latest dentry already deleted.Now create new one [%v]", mp.config.PartitionId, dentry)
if !forceUpdate {
parIno.IncNLink(mp.verSeq)
parIno.SetMtime()
}
return
} else if proto.OsModeType(dentry.Type) != proto.OsModeType(d.Type) && !proto.IsSymlink(dentry.Type) && !proto.IsSymlink(d.Type) {
log.LogErrorf("action[fsmCreateDentry] ParentId [%v] get [%v] but should del, dentry name [%v], inode[%v], type[%v,%v],dir[%v,%v]",
dentry.ParentId, parIno, dentry.Name, dentry.Inode, dentry.Type, d.Type, proto.IsSymlink(dentry.Type), proto.IsSymlink(d.Type))
status = proto.OpArgMismatchErr
return
} else if dentry.ParentId == d.ParentId && strings.Compare(dentry.Name, d.Name) == 0 && dentry.Inode == d.Inode {
log.LogDebugf("action[fsmCreateDentry.ver] mp[%v] no need repeat create new one [%v]", mp.config.PartitionId, dentry)
return
}
log.LogErrorf("action[fsmCreateDentry.ver] mp[%v] dentry already exist [%v] and diff with the request [%v]", mp.config.PartitionId, d, dentry)
status = proto.OpExistErr
return
}
if !forceUpdate {
parIno.IncNLink(mp.verSeq)
parIno.SetMtime()
}
return
}
func (mp *metaPartition) getDentryList(dentry *Dentry) (denList []proto.DetryInfo) {
item := mp.dentryTree.Get(dentry)
if item != nil {
if item.(*Dentry).getSnapListLen() == 0 {
return
}
for _, den := range item.(*Dentry).multiSnap.dentryList {
denList = append(denList, proto.DetryInfo{
Inode: den.Inode,
Mode: den.Type,
IsDel: den.isDeleted(),
VerSeq: den.getVerSeq(),
})
}
}
return
}
// Query a dentry from the dentry tree with specified dentry info.
func (mp *metaPartition) getDentry(dentry *Dentry) (*Dentry, uint8) {
status := proto.OpOk
item := mp.dentryTree.Get(dentry)
if item == nil {
status = proto.OpNotExistErr
return nil, status
}
log.LogDebugf("action[getDentry] get dentry[%v] by req dentry %v", item.(*Dentry), dentry)
den := mp.getDentryByVerSeq(item.(*Dentry), dentry.getSeqFiled())
if den != nil {
return den, proto.OpOk
}
return den, proto.OpNotExistErr
}
func (mp *metaPartition) fsmTxDeleteDentry(txDentry *TxDentry) (resp *DentryResponse) {
resp = NewDentryResponse()
resp.Status = proto.OpOk
if mp.txProcessor.txManager.txInRMDone(txDentry.TxInfo.TxID) {
log.LogWarnf("fsmTxDeleteDentry: tx is already finish. txId %s", txDentry.TxInfo.TxID)
resp.Status = proto.OpTxInfoNotExistErr
return
}
tmpDen := txDentry.Dentry
txDI := proto.NewTxDentryInfo("", tmpDen.ParentId, tmpDen.Name, 0)
txDenInfo, ok := txDentry.TxInfo.TxDentryInfos[txDI.GetKey()]
if !ok {
resp.Status = proto.OpTxDentryInfoNotExistErr
return
}
rbDentry := NewTxRollbackDentry(tmpDen, txDenInfo, TxAdd)
resp.Status = mp.txProcessor.txResource.addTxRollbackDentry(rbDentry)
if resp.Status == proto.OpExistErr {
resp.Status = proto.OpOk
return
}
if resp.Status != proto.OpOk {
return
}
defer func() {
if resp.Status != proto.OpOk {
mp.txProcessor.txResource.deleteTxRollbackDentry(txDenInfo.ParentId, txDenInfo.Name, txDenInfo.TxID)
}
}()
item := mp.dentryTree.Get(tmpDen)
if item == nil || item.(*Dentry).Inode != tmpDen.Inode {
log.LogWarnf("fsmTxDeleteDentry: got wrong dentry, want %v, got %v", tmpDen, item)
resp.Status = proto.OpNotExistErr
return
}
mp.dentryTree.Delete(tmpDen)
// the parent link count does not change
resp.Msg = item.(*Dentry)
return
}
// Delete dentry from the dentry tree.
func (mp *metaPartition) fsmDeleteDentry(denParm *Dentry, checkInode bool) (resp *DentryResponse) {
log.LogDebugf("action[fsmDeleteDentry] mp[%v] delete param (%v) seq [%v]", mp.config.PartitionId, denParm, denParm.getSeqFiled())
resp = NewDentryResponse()
resp.Status = proto.OpOk
var (
denFound *Dentry
item interface{}
doMore = true
clean bool
)
if checkInode {
log.LogDebugf("action[fsmDeleteDentry] mp[%v] delete param %v", mp.config.PartitionId, denParm)
item = mp.dentryTree.Execute(func(tree *btree.BTree) interface{} {
d := tree.CopyGet(denParm)
if d == nil {
return nil
}
den := d.(*Dentry)
if den.Inode != denParm.Inode {
return nil
}
if mp.verSeq == 0 {
log.LogDebugf("action[fsmDeleteDentry] mp[%v] volume snapshot not enabled,delete directly", mp.config.PartitionId)
denFound = den
return mp.dentryTree.tree.Delete(den)
}
denFound, doMore, clean = den.deleteVerSnapshot(denParm.getSeqFiled(), mp.verSeq, mp.GetVerList())
return den
})
} else {
log.LogDebugf("action[fsmDeleteDentry] mp[%v] denParm dentry %v", mp.config.PartitionId, denParm)
if mp.verSeq == 0 {
item = mp.dentryTree.Delete(denParm)
if item != nil {
denFound = item.(*Dentry)
}
} else {
item = mp.dentryTree.Get(denParm)
if item != nil {
denFound, doMore, clean = item.(*Dentry).deleteVerSnapshot(denParm.getSeqFiled(), mp.verSeq, mp.GetVerList())
}
}
}
if item != nil && (clean == true || (item.(*Dentry).getSnapListLen() == 0 && item.(*Dentry).isDeleted())) {
log.LogDebugf("action[fsmDeleteDentry] mp[%v] dnetry %v really be deleted", mp.config.PartitionId, item.(*Dentry))
item = mp.dentryTree.Delete(item.(*Dentry))
}
if !doMore { // not the top layer, do nothing to the parent inode
if denFound != nil {
resp.Msg = denFound
}
log.LogDebugf("action[fsmDeleteDentry] mp[%v] there's nothing to do more denParm %v", mp.config.PartitionId, denParm)
return
}
if denFound == nil {
resp.Status = proto.OpNotExistErr
log.LogErrorf("action[fsmDeleteDentry] mp[%v] not found dentry %v", mp.config.PartitionId, denParm)
return
} else {
mp.inodeTree.CopyFind(NewInode(denParm.ParentId, 0),
func(item BtreeItem) {
if item != nil { // the parent inode may already be gone; skip if so
ino := item.(*Inode)
if !ino.ShouldDelete() {
log.LogDebugf("action[fsmDeleteDentry] mp[%v] den %v delete parent's link", mp.config.PartitionId, denParm)
if denParm.getSeqFiled() == 0 {
item.(*Inode).DecNLink()
}
log.LogDebugf("action[fsmDeleteDentry] mp[%v] inode[%v] be unlinked by child name %v", mp.config.PartitionId, item.(*Inode).Inode, denParm.Name)
item.(*Inode).SetMtime()
}
}
})
}
resp.Msg = denFound
return
}
// batch Delete dentry from the dentry tree.
func (mp *metaPartition) fsmBatchDeleteDentry(db DentryBatch) []*DentryResponse {
result := make([]*DentryResponse, 0, len(db))
for _, dentry := range db {
status := mp.dentryInTx(dentry.ParentId, dentry.Name)
if status != proto.OpOk {
result = append(result, &DentryResponse{Status: status})
continue
}
result = append(result, mp.fsmDeleteDentry(dentry, true))
}
return result
}
func (mp *metaPartition) fsmTxUpdateDentry(txUpDateDentry *TxUpdateDentry) (resp *DentryResponse) {
resp = NewDentryResponse()
resp.Status = proto.OpOk
if mp.txProcessor.txManager.txInRMDone(txUpDateDentry.TxInfo.TxID) {
log.LogWarnf("fsmTxUpdateDentry: tx is already finish. txId %s", txUpDateDentry.TxInfo.TxID)
resp.Status = proto.OpTxInfoNotExistErr
return
}
newDen := txUpDateDentry.NewDentry
oldDen := txUpDateDentry.OldDentry
txDI := proto.NewTxDentryInfo("", oldDen.ParentId, oldDen.Name, 0)
txDenInfo, ok := txUpDateDentry.TxInfo.TxDentryInfos[txDI.GetKey()]
if !ok {
resp.Status = proto.OpTxDentryInfoNotExistErr
return
}
item := mp.dentryTree.CopyGet(oldDen)
if item == nil || item.(*Dentry).Inode != oldDen.Inode {
resp.Status = proto.OpNotExistErr
log.LogWarnf("fsmTxUpdateDentry: find dentry is not right, want %v, got %v", oldDen, item)
return
}
rbDentry := NewTxRollbackDentry(txUpDateDentry.OldDentry, txDenInfo, TxUpdate)
resp.Status = mp.txProcessor.txResource.addTxRollbackDentry(rbDentry)
if resp.Status == proto.OpExistErr {
resp.Status = proto.OpOk
return
}
if resp.Status != proto.OpOk {
return
}
d := item.(*Dentry)
d.Inode, newDen.Inode = newDen.Inode, d.Inode
resp.Msg = newDen
return
}
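// fsmUpdateDentry swaps the inode referenced by an existing dentry with the one
// carried in the request (used by rename). If the stored dentry belongs to an
// older version than the partition's current one, a copy is pushed onto its
// snapshot list before the swap.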
func (mp *metaPartition) fsmUpdateDentry(dentry *Dentry) (
resp *DentryResponse) {
resp = NewDentryResponse()
resp.Status = proto.OpOk
mp.dentryTree.CopyFind(dentry, func(item BtreeItem) {
if item == nil {
resp.Status = proto.OpNotExistErr
return
}
d := item.(*Dentry)
if dentry.Inode == d.Inode {
return
}
if d.getVerSeq() < mp.GetVerSeq() {
dn := d.CopyDirectly()
dn.(*Dentry).setVerSeq(d.getVerSeq())
d.setVerSeq(mp.GetVerSeq())
d.multiSnap.dentryList = append([]*Dentry{dn.(*Dentry)}, d.multiSnap.dentryList...)
}
d.Inode, dentry.Inode = dentry.Inode, d.Inode
resp.Msg = dentry
})
return
}
func (mp *metaPartition) getDentryTree() *BTree {
return mp.dentryTree.GetTree()
}
func (mp *metaPartition) getDentryByVerSeq(dy *Dentry, verSeq uint64) (d *Dentry) {
d, _ = dy.getDentryFromVerList(verSeq, false)
return
}
func (mp *metaPartition) readDirOnly(req *ReadDirOnlyReq) (resp *ReadDirOnlyResp) {
resp = &ReadDirOnlyResp{}
begDentry := &Dentry{
ParentId: req.ParentID,
}
endDentry := &Dentry{
ParentId: req.ParentID + 1,
}
mp.dentryTree.AscendRange(begDentry, endDentry, func(i BtreeItem) bool {
if proto.IsDir(i.(*Dentry).Type) {
d := mp.getDentryByVerSeq(i.(*Dentry), req.VerSeq)
if d == nil {
return true
}
resp.Children = append(resp.Children, proto.Dentry{
Inode: d.Inode,
Type: d.Type,
Name: d.Name,
})
}
return true
})
return
}
func (mp *metaPartition) readDir(req *ReadDirReq) (resp *ReadDirResp) {
resp = &ReadDirResp{}
begDentry := &Dentry{
ParentId: req.ParentID,
}
endDentry := &Dentry{
ParentId: req.ParentID + 1,
}
mp.dentryTree.AscendRange(begDentry, endDentry, func(i BtreeItem) bool {
d := mp.getDentryByVerSeq(i.(*Dentry), req.VerSeq)
if d == nil {
return true
}
resp.Children = append(resp.Children, proto.Dentry{
Inode: d.Inode,
Type: d.Type,
Name: d.Name,
})
return true
})
return
}
// Read dentries from the btree, limited by count.
// If req.Marker == "" and req.Limit == 0, it behaves like readDir.
// If req.Marker != "" and req.Limit == 0, it returns the dentries from pid:marker to pid+1.
// If req.Marker == "" and req.Limit != 0, it returns at most limit dentries of pid.
// If req.Marker != "" and req.Limit != 0, it returns at most limit dentries starting from pid:marker.
// An illustrative paging sketch follows the function below.
func (mp *metaPartition) readDirLimit(req *ReadDirLimitReq) (resp *ReadDirLimitResp) {
log.LogDebugf("action[readDirLimit] mp[%v] req %v", mp.config.PartitionId, req)
resp = &ReadDirLimitResp{}
startDentry := &Dentry{
ParentId: req.ParentID,
}
if len(req.Marker) > 0 {
startDentry.Name = req.Marker
}
endDentry := &Dentry{
ParentId: req.ParentID + 1,
}
mp.dentryTree.AscendRange(startDentry, endDentry, func(i BtreeItem) bool {
if !proto.IsDir(i.(*Dentry).Type) && (req.VerOpt&uint8(proto.FlagsSnapshotDel) > 0) {
if req.VerOpt&uint8(proto.FlagsSnapshotDelDir) > 0 {
return true
}
if !i.(*Dentry).isEffective(req.VerSeq) {
return true
}
}
d := mp.getDentryByVerSeq(i.(*Dentry), req.VerSeq)
if d == nil {
return true
}
resp.Children = append(resp.Children, proto.Dentry{
Inode: d.Inode,
Type: d.Type,
Name: d.Name,
})
// Limit == 0 means no limit.
if req.Limit > 0 && uint64(len(resp.Children)) >= req.Limit {
return false
}
return true
})
log.LogDebugf("action[readDirLimit] mp[%v] resp %v", mp.config.PartitionId, resp)
return
}
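// readDirPagedSketch is an illustrative, hypothetical helper (not part of the
// original code) showing how the Marker/Limit semantics documented above can be
// used to walk a large directory page by page: resume each call from the last
// name returned. It assumes the range scan includes the marker entry itself, so
// pages after the first drop the leading entry that was already returned.
func (mp *metaPartition) readDirPagedSketch(parentID, verSeq, pageSize uint64) (all []proto.Dentry) {
marker := ""
for {
resp := mp.readDirLimit(&ReadDirLimitReq{
ParentID: parentID,
Marker:   marker,
Limit:    pageSize,
VerSeq:   verSeq,
})
children := resp.Children
if marker != "" && len(children) > 0 && children[0].Name == marker {
// the marker entry was already returned on the previous page
children = children[1:]
}
if len(children) == 0 {
return
}
all = append(all, children...)
if pageSize == 0 || uint64(len(resp.Children)) < pageSize {
// a short (or unlimited) page means the directory is exhausted
return
}
marker = children[len(children)-1].Name
}
}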
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"fmt"
"math"
"github.com/cubefs/cubefs/util/log"
)
type ExtendOpResult struct {
Status uint8
Extend *Extend
}
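// fsmSetXAttr inserts the extended attributes, or merges them into the existing
// entry. If the partition's version has advanced past the stored entry, the
// previous attributes are first snapshotted into the entry's multiVers list.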
func (mp *metaPartition) fsmSetXAttr(extend *Extend) (err error) {
extend.verSeq = mp.GetVerSeq()
treeItem := mp.extendTree.CopyGet(extend)
var e *Extend
if treeItem == nil {
mp.extendTree.ReplaceOrInsert(extend, true)
} else {
// attr multi-version: copy all attributes to simplify management
e = treeItem.(*Extend)
if e.verSeq != extend.verSeq {
if extend.verSeq < e.verSeq {
return fmt.Errorf("seq error assign %v but less than %v", extend.verSeq, e.verSeq)
}
e.multiVers = append([]*Extend{e.Copy().(*Extend)}, e.multiVers...)
e.verSeq = extend.verSeq
}
e.Merge(extend, true)
}
return
}
// TODO(leon chang): check snapshot delete relation with attr
func (mp *metaPartition) fsmRemoveXAttr(reqExtend *Extend) (err error) {
treeItem := mp.extendTree.CopyGet(reqExtend)
if treeItem == nil {
return
}
e := treeItem.(*Extend)
if mp.GetVerSeq() == 0 || (e.verSeq == mp.GetVerSeq() && reqExtend.verSeq == 0) {
reqExtend.Range(func(key, value []byte) bool {
e.Remove(key)
return true
})
return
}
if reqExtend.verSeq == 0 {
reqExtend.verSeq = mp.GetVerSeq()
}
if reqExtend.verSeq == math.MaxUint64 {
reqExtend.verSeq = 0
}
e.versionMu.Lock()
defer e.versionMu.Unlock()
if reqExtend.verSeq < e.GetMinVer() {
return
}
mp.multiVersionList.RWLock.RLock()
defer mp.multiVersionList.RWLock.RUnlock()
if reqExtend.verSeq > e.verSeq {
e.multiVers = append([]*Extend{e.Copy().(*Extend)}, e.multiVers...)
e.verSeq = reqExtend.verSeq
reqExtend.Range(func(key, value []byte) bool {
e.Remove(key)
return true
})
} else if reqExtend.verSeq == e.verSeq {
var globalNewVer uint64
if globalNewVer, err = mp.multiVersionList.GetNextNewerVer(reqExtend.verSeq); err != nil {
log.LogErrorf("fsmRemoveXAttr. mp[%v] seq [%v] req ver [%v] not found newer seq", mp.config.PartitionId, mp.verSeq, reqExtend.verSeq)
return err
}
e.verSeq = globalNewVer
} else {
innerLastVer := e.verSeq
for id, ele := range e.multiVers {
if ele.verSeq > reqExtend.verSeq {
innerLastVer = ele.verSeq
continue
} else if ele.verSeq < reqExtend.verSeq {
return
} else {
var globalNewVer uint64
if globalNewVer, err = mp.multiVersionList.GetNextNewerVer(ele.verSeq); err != nil {
return err
}
if globalNewVer < innerLastVer {
log.LogDebugf("mp[%v] inode[%v] extent layer %v update seq [%v] to %v",
mp.config.PartitionId, ele.inode, id, ele.verSeq, globalNewVer)
ele.verSeq = globalNewVer
return
}
e.multiVers = append(e.multiVers[:id], e.multiVers[id+1:]...)
return
}
}
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"encoding/binary"
"encoding/json"
"fmt"
"io"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/timeutil"
)
type InodeResponse struct {
Status uint8
Msg *Inode
}
func NewInodeResponse() *InodeResponse {
return &InodeResponse{}
}
// Create an inode and attach it to the inode tree.
func (mp *metaPartition) fsmTxCreateInode(txIno *TxInode, quotaIds []uint32) (status uint8) {
status = proto.OpOk
if mp.txProcessor.txManager.txInRMDone(txIno.TxInfo.TxID) {
log.LogWarnf("fsmTxCreateInode: tx is already finish. txId %s", txIno.TxInfo.TxID)
return proto.OpTxInfoNotExistErr
}
// inodeInfo := mp.txProcessor.txManager.getTxInodeInfo(txIno.TxInfo.TxID, txIno.Inode.Inode)
inodeInfo, ok := txIno.TxInfo.TxInodeInfos[txIno.Inode.Inode]
if !ok {
status = proto.OpTxInodeInfoNotExistErr
return
}
rbInode := NewTxRollbackInode(txIno.Inode, quotaIds, inodeInfo, TxDelete)
status = mp.txProcessor.txResource.addTxRollbackInode(rbInode)
if status != proto.OpOk {
return
}
defer func() {
if status != proto.OpOk {
mp.txProcessor.txResource.deleteTxRollbackInode(txIno.Inode.Inode, txIno.TxInfo.TxID)
}
}()
// 3.insert inode in inode tree
return mp.fsmCreateInode(txIno.Inode)
}
// Create an inode and attach it to the inode tree.
func (mp *metaPartition) fsmCreateInode(ino *Inode) (status uint8) {
if status = mp.uidManager.addUidSpace(ino.Uid, ino.Inode, nil); status != proto.OpOk {
return
}
status = proto.OpOk
if _, ok := mp.inodeTree.ReplaceOrInsert(ino, false); !ok {
status = proto.OpExistErr
}
return
}
func (mp *metaPartition) fsmTxCreateLinkInode(txIno *TxInode) (resp *InodeResponse) {
resp = NewInodeResponse()
resp.Status = proto.OpOk
if mp.txProcessor.txManager.txInRMDone(txIno.TxInfo.TxID) {
log.LogWarnf("fsmTxCreateLinkInode: tx is already finish. txId %s", txIno.TxInfo.TxID)
resp.Status = proto.OpTxInfoNotExistErr
return
}
// 2.register rollback item
inodeInfo, ok := txIno.TxInfo.TxInodeInfos[txIno.Inode.Inode]
if !ok {
resp.Status = proto.OpTxInodeInfoNotExistErr
return
}
rbInode := NewTxRollbackInode(txIno.Inode, []uint32{}, inodeInfo, TxDelete)
resp.Status = mp.txProcessor.txResource.addTxRollbackInode(rbInode)
if resp.Status == proto.OpExistErr {
resp.Status = proto.OpOk
resp.Msg = txIno.Inode
return
}
if resp.Status != proto.OpOk {
return
}
defer func() {
if resp.Status != proto.OpOk {
mp.txProcessor.txResource.deleteTxRollbackInode(txIno.Inode.Inode, txIno.TxInfo.TxID)
}
}()
return mp.fsmCreateLinkInode(txIno.Inode, 0)
}
func (mp *metaPartition) fsmCreateLinkInode(ino *Inode, uniqID uint64) (resp *InodeResponse) {
resp = NewInodeResponse()
resp.Status = proto.OpOk
item := mp.inodeTree.CopyGet(ino)
if item == nil {
resp.Status = proto.OpNotExistErr
return
}
i := item.(*Inode)
if i.ShouldDelete() {
resp.Status = proto.OpNotExistErr
return
}
resp.Msg = i
if !mp.uniqChecker.legalIn(uniqID) {
log.LogWarnf("fsmCreateLinkInode repeated, ino[%v] uniqID %v nlink %v", ino.Inode, uniqID, ino.GetNLink())
return
}
i.IncNLink(ino.getVer())
return
}
func (mp *metaPartition) getInodeByVer(ino *Inode) (i *Inode) {
item := mp.inodeTree.Get(ino)
if item == nil {
log.LogDebugf("action[getInodeByVer] not found ino[%v] verseq [%v]", ino.Inode, ino.getVer())
return
}
i, _ = item.(*Inode).getInoByVer(ino.getVer(), false)
return
}
func (mp *metaPartition) getInodeTopLayer(ino *Inode) (resp *InodeResponse) {
resp = NewInodeResponse()
resp.Status = proto.OpOk
item := mp.inodeTree.Get(ino)
if item == nil {
resp.Status = proto.OpNotExistErr
log.LogDebugf("action[getInodeTopLayer] not found ino[%v] verseq [%v]", ino.Inode, ino.getVer())
return
}
i := item.(*Inode)
ctime := timeutil.GetCurrentTimeUnix()
/*
* FIXME: not protected by lock yet, since nothing is depending on atime.
* Shall add inode lock in the future.
*/
if ctime > i.AccessTime {
i.AccessTime = ctime
}
resp.Msg = i
return
}
func (mp *metaPartition) getInode(ino *Inode, listAll bool) (resp *InodeResponse) {
resp = NewInodeResponse()
resp.Status = proto.OpOk
i := mp.getInodeByVer(ino)
if i == nil || (!listAll && i.ShouldDelete()) {
log.LogDebugf("action[getInode] ino %v not found", ino)
resp.Status = proto.OpNotExistErr
return
}
ctime := timeutil.GetCurrentTimeUnix()
/*
* FIXME: not protected by lock yet, since nothing is depending on atime.
* Shall add inode lock in the future.
*/
if ctime > i.AccessTime {
i.AccessTime = ctime
}
resp.Msg = i
return
}
func (mp *metaPartition) hasInode(ino *Inode) (ok bool) {
item := mp.inodeTree.Get(ino)
if item == nil {
return
}
i := mp.getInodeByVer(ino)
if i == nil || i.ShouldDelete() {
return
}
ok = true
return
}
// Ascend is the wrapper of inodeTree.Ascend
func (mp *metaPartition) Ascend(f func(i BtreeItem) bool) {
mp.inodeTree.Ascend(f)
}
func (mp *metaPartition) fsmTxUnlinkInode(txIno *TxInode) (resp *InodeResponse) {
resp = NewInodeResponse()
resp.Status = proto.OpOk
if proto.IsDir(txIno.Inode.Type) && txIno.TxInfo.TxType == proto.TxTypeRemove && txIno.Inode.NLink > 2 {
resp.Status = proto.OpNotEmpty
log.LogWarnf("fsmTxUnlinkInode: dir is not empty, can't remove it, txinode[%v]", txIno)
return
}
if mp.txProcessor.txManager.txInRMDone(txIno.TxInfo.TxID) {
log.LogWarnf("fsmTxUnlinkInode: tx is already finish. txId %s", txIno.TxInfo.TxID)
resp.Status = proto.OpTxInfoNotExistErr
return
}
inodeInfo, ok := txIno.TxInfo.TxInodeInfos[txIno.Inode.Inode]
if !ok {
resp.Status = proto.OpTxInodeInfoNotExistErr
return
}
var quotaIds []uint32
quotaIds, _ = mp.isExistQuota(txIno.Inode.Inode)
rbInode := NewTxRollbackInode(txIno.Inode, quotaIds, inodeInfo, TxAdd)
resp.Status = mp.txProcessor.txResource.addTxRollbackInode(rbInode)
if resp.Status == proto.OpExistErr {
resp.Status = proto.OpOk
item := mp.inodeTree.Get(txIno.Inode)
if item != nil {
resp.Msg = item.(*Inode)
}
return
}
if resp.Status != proto.OpOk {
return
}
defer func() {
if resp.Status != proto.OpOk {
mp.txProcessor.txResource.deleteTxRollbackInode(txIno.Inode.Inode, txIno.TxInfo.TxID)
}
}()
resp = mp.fsmUnlinkInode(txIno.Inode, 0)
if resp.Status != proto.OpOk {
return
}
if txIno.TxInfo.TxType == proto.TxTypeRename {
mp.fsmEvictInode(txIno.Inode)
}
return
}
// A normal unlink uses seq 0; a snapshot unlink uses the snapshot version.
// fsmUnlinkInode deletes the specified inode from the inode tree.
func (mp *metaPartition) fsmUnlinkInode(ino *Inode, uniqID uint64) (resp *InodeResponse) {
log.LogDebugf("action[fsmUnlinkInode] mp[%v] ino[%v]", mp.config.PartitionId, ino)
var ext2Del []proto.ExtentKey
resp = NewInodeResponse()
resp.Status = proto.OpOk
item := mp.inodeTree.CopyGet(ino)
if item == nil {
log.LogDebugf("action[fsmUnlinkInode] mp[%v] ino[%v]", mp.config.PartitionId, ino)
resp.Status = proto.OpNotExistErr
return
}
inode := item.(*Inode)
if ino.getVer() == 0 && inode.ShouldDelete() {
log.LogDebugf("action[fsmUnlinkInode] mp[%v] ino[%v]", mp.config.PartitionId, ino)
resp.Status = proto.OpNotExistErr
return
}
resp.Msg = inode
if !mp.uniqChecker.legalIn(uniqID) {
log.LogWarnf("fsmUnlinkInode repeat, mp[%v] ino[%v] uniqID %v nlink %v", mp.config.PartitionId, ino.Inode, uniqID, ino.GetNLink())
return
}
log.LogDebugf("action[fsmUnlinkInode] mp[%v] get inode[%v]", mp.config.PartitionId, inode)
var (
doMore bool
status = proto.OpOk
)
if ino.getVer() == 0 {
ext2Del, doMore, status = inode.unlinkTopLayer(mp.config.PartitionId, ino, mp.verSeq, mp.multiVersionList)
} else { // means drop snapshot
log.LogDebugf("action[fsmUnlinkInode] mp[%v] req drop assigned snapshot reqseq [%v] inode seq [%v]", mp.config.PartitionId, ino.getVer(), inode.getVer())
if ino.getVer() > inode.getVer() && !isInitSnapVer(ino.getVer()) {
log.LogDebugf("action[fsmUnlinkInode] mp[%v] inode[%v] unlink not exist snapshot and return do nothing.reqseq [%v] larger than inode seq [%v]",
mp.config.PartitionId, ino.Inode, ino.getVer(), inode.getVer())
return
} else {
ext2Del, doMore, status = inode.unlinkVerInList(mp.config.PartitionId, ino, mp.verSeq, mp.multiVersionList)
}
}
if !doMore {
resp.Status = status
return
}
if inode.IsEmptyDirAndNoSnapshot() {
if ino.NLink < 2 { // snapshot deletion
log.LogDebugf("action[fsmUnlinkInode] mp[%v] ino[%v] really be deleted, empty dir", mp.config.PartitionId, inode)
mp.inodeTree.Delete(inode)
mp.updateUsedInfo(0, -1, inode.Inode)
}
} else if inode.IsTempFile() {
// all snapshots between creation and the last deletion have been cleaned
if inode.NLink == 0 && inode.getLayerLen() == 0 {
mp.updateUsedInfo(-1*int64(inode.Size), -1, inode.Inode)
log.LogDebugf("action[fsmUnlinkInode] mp[%v] unlink inode[%v] and push to freeList", mp.config.PartitionId, inode)
inode.AccessTime = time.Now().Unix()
mp.freeList.Push(inode.Inode)
mp.uidManager.doMinusUidSpace(inode.Uid, inode.Inode, inode.Size)
log.LogDebugf("action[fsmUnlinkInode] mp[%v] ino[%v]", mp.config.PartitionId, inode)
}
}
if len(ext2Del) > 0 {
log.LogDebugf("action[fsmUnlinkInode] mp[%v] ino[%v] DecSplitExts ext2Del %v", mp.config.PartitionId, ino, ext2Del)
inode.DecSplitExts(mp.config.PartitionId, ext2Del)
mp.extDelCh <- ext2Del
}
log.LogDebugf("action[fsmUnlinkInode] mp[%v] ino[%v] left", mp.config.PartitionId, inode)
return
}
// fsmUnlinkInodeBatch deletes the specified inodes from the inode tree.
func (mp *metaPartition) fsmUnlinkInodeBatch(ib InodeBatch) (resp []*InodeResponse) {
for _, ino := range ib {
status := mp.inodeInTx(ino.Inode)
if status != proto.OpOk {
resp = append(resp, &InodeResponse{Status: status})
continue
}
resp = append(resp, mp.fsmUnlinkInode(ino, 0))
}
return
}
func (mp *metaPartition) internalHasInode(ino *Inode) bool {
return mp.inodeTree.Has(ino)
}
func (mp *metaPartition) internalDelete(val []byte) (err error) {
if len(val) == 0 {
return
}
buf := bytes.NewBuffer(val)
ino := NewInode(0, 0)
for {
err = binary.Read(buf, binary.BigEndian, &ino.Inode)
if err != nil {
if err == io.EOF {
err = nil
return
}
return
}
log.LogDebugf("internalDelete: received internal delete: partitionID(%v) inode[%v]",
mp.config.PartitionId, ino.Inode)
mp.internalDeleteInode(ino)
}
}
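// Illustrative sketch (not part of the original source; the helper name is hypothetical):
// internalDelete above reads `val` as a packed sequence of big-endian uint64 inode IDs
// until EOF, so a matching payload could be built on the caller side like this.
func exampleBuildInternalDeleteVal(inodes []uint64) []byte {
	buf := bytes.NewBuffer(nil)
	for _, ino := range inodes {
		// each entry is exactly 8 bytes, mirroring the binary.Read in internalDelete
		_ = binary.Write(buf, binary.BigEndian, ino)
	}
	return buf.Bytes()
}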
func (mp *metaPartition) internalDeleteBatch(val []byte) error {
if len(val) == 0 {
return nil
}
inodes, err := InodeBatchUnmarshal(val)
if err != nil {
return nil
}
for _, ino := range inodes {
log.LogDebugf("internalDelete: received internal delete: partitionID(%v) inode[%v]",
mp.config.PartitionId, ino.Inode)
mp.internalDeleteInode(ino)
}
return nil
}
func (mp *metaPartition) internalDeleteInode(ino *Inode) {
log.LogDebugf("action[internalDeleteInode] ino[%v] really be deleted", ino)
mp.inodeTree.Delete(ino)
mp.freeList.Remove(ino.Inode)
mp.extendTree.Delete(&Extend{inode: ino.Inode}) // Also delete extend attribute.
return
}
func (mp *metaPartition) fsmAppendExtents(ino *Inode) (status uint8) {
status = proto.OpOk
item := mp.inodeTree.CopyGet(ino)
if item == nil {
status = proto.OpNotExistErr
return
}
ino2 := item.(*Inode)
if ino2.ShouldDelete() {
status = proto.OpNotExistErr
return
}
oldSize := int64(ino2.Size)
eks := ino.Extents.CopyExtents()
if status = mp.uidManager.addUidSpace(ino2.Uid, ino2.Inode, eks); status != proto.OpOk {
return
}
delExtents := ino2.AppendExtents(eks, ino.ModifyTime, mp.volType)
mp.updateUsedInfo(int64(ino2.Size)-oldSize, 0, ino2.Inode)
log.LogInfof("fsmAppendExtents mpId[%v].inode[%v] deleteExtents(%v)", mp.config.PartitionId, ino2.Inode, delExtents)
mp.uidManager.minusUidSpace(ino2.Uid, ino2.Inode, delExtents)
log.LogInfof("fsmAppendExtents mpId[%v].inode[%v] DecSplitExts deleteExtents(%v)", mp.config.PartitionId, ino2.Inode, delExtents)
ino2.DecSplitExts(mp.config.PartitionId, delExtents)
mp.extDelCh <- delExtents
return
}
func (mp *metaPartition) fsmAppendExtentsWithCheck(ino *Inode, isSplit bool) (status uint8) {
var (
delExtents []proto.ExtentKey
discardExtentKey []proto.ExtentKey
)
if mp.verSeq < ino.getVer() {
status = proto.OpArgMismatchErr
log.LogErrorf("fsmAppendExtentsWithCheck.mp[%v] param ino[%v] mp seq [%v]", mp.config.PartitionId, ino, mp.verSeq)
return
}
status = proto.OpOk
item := mp.inodeTree.CopyGet(ino)
if item == nil {
status = proto.OpNotExistErr
return
}
fsmIno := item.(*Inode)
if fsmIno.ShouldDelete() {
status = proto.OpNotExistErr
return
}
oldSize := int64(fsmIno.Size)
eks := ino.Extents.CopyExtents()
if len(eks) < 1 {
return
}
if len(eks) > 1 {
discardExtentKey = eks[1:]
}
if status = mp.uidManager.addUidSpace(fsmIno.Uid, fsmIno.Inode, eks[:1]); status != proto.OpOk {
log.LogErrorf("fsmAppendExtentsWithCheck.mp[%v] addUidSpace status [%v]", mp.config.PartitionId, status)
return
}
log.LogDebugf("action[fsmAppendExtentsWithCheck] mp[%v] ver [%v] ino[%v] isSplit %v ek [%v] hist len %v discardExtentKey %v",
mp.config.PartitionId, mp.verSeq, fsmIno.Inode, isSplit, eks[0], fsmIno.getLayerLen(), discardExtentKey)
appendExtParam := &AppendExtParam{
mpId: mp.config.PartitionId,
mpVer: mp.verSeq,
ek: eks[0],
ct: ino.ModifyTime,
discardExtents: discardExtentKey,
volType: mp.volType,
multiVersionList: mp.multiVersionList,
}
if !isSplit {
delExtents, status = fsmIno.AppendExtentWithCheck(appendExtParam)
if status == proto.OpOk {
log.LogInfof("action[fsmAppendExtentsWithCheck] mp[%v] DecSplitExts delExtents [%v]", mp.config.PartitionId, delExtents)
fsmIno.DecSplitExts(appendExtParam.mpId, delExtents)
mp.extDelCh <- delExtents
}
// on conflict, delete eks[0] to clear the garbage data
if status == proto.OpConflictExtentsErr {
log.LogInfof("action[fsmAppendExtentsWithCheck] mp[%v] OpConflictExtentsErr [%v]", mp.config.PartitionId, eks[:1])
if !storage.IsTinyExtent(eks[0].ExtentId) && eks[0].ExtentOffset >= util.ExtentSize {
eks[0].SetSplit(true)
}
mp.extDelCh <- eks[:1]
}
} else {
// only the ek itself will be moved to the layer below;
// the inode verseq is set to the mp version before submit, in case the mp version
// is updated while the request is in flight, which would lead to inconsistency between raft replicas
delExtents, status = fsmIno.SplitExtentWithCheck(appendExtParam)
log.LogInfof("action[fsmAppendExtentsWithCheck] mp[%v] DecSplitExts delExtents [%v]", mp.config.PartitionId, delExtents)
fsmIno.DecSplitExts(mp.config.PartitionId, delExtents)
mp.extDelCh <- delExtents
mp.uidManager.minusUidSpace(fsmIno.Uid, fsmIno.Inode, delExtents)
}
// on conflict, delete eks[0] to clear the garbage data
if status == proto.OpConflictExtentsErr {
mp.extDelCh <- eks[:1]
mp.uidManager.minusUidSpace(fsmIno.Uid, fsmIno.Inode, eks[:1])
log.LogDebugf("fsmAppendExtentsWithCheck mp[%v] delExtents inode[%v] ek(%v)", mp.config.PartitionId, fsmIno.Inode, delExtents)
}
mp.updateUsedInfo(int64(fsmIno.Size)-oldSize, 0, fsmIno.Inode)
log.LogInfof("fsmAppendExtentWithCheck mp[%v] inode[%v] ek(%v) deleteExtents(%v) discardExtents(%v) status(%v)",
mp.config.PartitionId, fsmIno.Inode, eks[0], delExtents, discardExtentKey, status)
return
}
func (mp *metaPartition) fsmAppendObjExtents(ino *Inode) (status uint8) {
status = proto.OpOk
item := mp.inodeTree.CopyGet(ino)
if item == nil {
status = proto.OpNotExistErr
return
}
inode := item.(*Inode)
if inode.ShouldDelete() {
status = proto.OpNotExistErr
return
}
eks := ino.ObjExtents.CopyExtents()
err := inode.AppendObjExtents(eks, ino.ModifyTime)
// a non-nil err means the object extent keys overlap.
if err != nil {
log.LogErrorf("fsmAppendExtents inode[%v] err(%v)", inode.Inode, err)
status = proto.OpConflictExtentsErr
}
return
}
func (mp *metaPartition) fsmExtentsTruncate(ino *Inode) (resp *InodeResponse) {
var err error
resp = NewInodeResponse()
log.LogDebugf("fsmExtentsTruncate. req ino[%v]", ino)
resp.Status = proto.OpOk
item := mp.inodeTree.Get(ino)
if item == nil {
resp.Status = proto.OpNotExistErr
return
}
i := item.(*Inode)
if i.ShouldDelete() {
resp.Status = proto.OpNotExistErr
return
}
if proto.IsDir(i.Type) {
resp.Status = proto.OpArgMismatchErr
return
}
doOnLastKey := func(lastKey *proto.ExtentKey) {
var eks []proto.ExtentKey
eks = append(eks, *lastKey)
mp.uidManager.minusUidSpace(i.Uid, i.Inode, eks)
}
insertSplitKey := func(ek *proto.ExtentKey) {
i.insertEkRefMap(mp.config.PartitionId, ek)
}
if i.getVer() != mp.verSeq {
i.CreateVer(mp.verSeq)
}
i.Lock()
defer i.Unlock()
if err = i.CreateLowerVersion(i.getVer(), mp.multiVersionList); err != nil {
return
}
oldSize := int64(i.Size)
delExtents := i.ExtentsTruncate(ino.Size, ino.ModifyTime, doOnLastKey, insertSplitKey)
if len(delExtents) == 0 {
return
}
if delExtents, err = i.RestoreExts2NextLayer(mp.config.PartitionId, delExtents, mp.verSeq, 0); err != nil {
panic("RestoreExts2NextLayer should not be error")
}
mp.updateUsedInfo(int64(i.Size)-oldSize, 0, i.Inode)
// now we should delete the extent
log.LogInfof("fsmExtentsTruncate.mp (%v) inode[%v] DecSplitExts exts(%v)", mp.config.PartitionId, i.Inode, delExtents)
i.DecSplitExts(mp.config.PartitionId, delExtents)
mp.extDelCh <- delExtents
mp.uidManager.minusUidSpace(i.Uid, i.Inode, delExtents)
return
}
func (mp *metaPartition) fsmEvictInode(ino *Inode) (resp *InodeResponse) {
resp = NewInodeResponse()
log.LogDebugf("action[fsmEvictInode] inode[%v]", ino)
resp.Status = proto.OpOk
item := mp.inodeTree.CopyGet(ino)
if item == nil {
resp.Status = proto.OpNotExistErr
return
}
i := item.(*Inode)
if i.ShouldDelete() {
log.LogDebugf("action[fsmEvictInode] inode[%v] already be mark delete", ino)
return
}
if proto.IsDir(i.Type) {
if i.IsEmptyDirAndNoSnapshot() {
i.SetDeleteMark()
}
return
}
if i.IsTempFile() {
log.LogDebugf("action[fsmEvictInode] inode[%v] already linke zero and be set mark delete and be put to freelist", ino)
if i.isEmptyVerList() {
i.SetDeleteMark()
mp.freeList.Push(i.Inode)
}
}
return
}
func (mp *metaPartition) fsmBatchEvictInode(ib InodeBatch) (resp []*InodeResponse) {
for _, ino := range ib {
status := mp.inodeInTx(ino.Inode)
if status != proto.OpOk {
resp = append(resp, &InodeResponse{Status: status})
return
}
resp = append(resp, mp.fsmEvictInode(ino))
}
return
}
func (mp *metaPartition) checkAndInsertFreeList(ino *Inode) {
if proto.IsDir(ino.Type) {
return
}
if ino.ShouldDelete() {
mp.freeList.Push(ino.Inode)
} else if ino.IsTempFile() {
ino.AccessTime = time.Now().Unix()
mp.freeList.Push(ino.Inode)
}
}
func (mp *metaPartition) fsmSetAttr(req *SetattrRequest) (err error) {
log.LogDebugf("action[fsmSetAttr] req %v", req)
ino := NewInode(req.Inode, req.Mode)
item := mp.inodeTree.CopyGet(ino)
if item == nil {
return
}
ino = item.(*Inode)
if ino.ShouldDelete() {
return
}
ino.SetAttr(req)
return
}
// fsmExtentsEmpty is only used in the datalake scenario
func (mp *metaPartition) fsmExtentsEmpty(ino *Inode) (status uint8) {
status = proto.OpOk
item := mp.inodeTree.CopyGet(ino)
if item == nil {
status = proto.OpNotExistErr
return
}
i := item.(*Inode)
if i.ShouldDelete() {
status = proto.OpNotExistErr
return
}
if proto.IsDir(i.Type) {
status = proto.OpArgMismatchErr
return
}
log.LogDebugf("action[fsmExtentsEmpty] mp[%v] ino[%v],eks len [%v]", mp.config.PartitionId, ino.Inode, len(i.Extents.eks))
tinyEks := i.CopyTinyExtents()
log.LogDebugf("action[fsmExtentsEmpty] mp[%v] ino[%v],eks tiny len [%v]", mp.config.PartitionId, ino.Inode, len(tinyEks))
if len(tinyEks) > 0 {
mp.extDelCh <- tinyEks
mp.uidManager.minusUidSpace(i.Uid, i.Inode, tinyEks)
log.LogDebugf("fsmExtentsEmpty mp[%v] inode[%d] tinyEks(%v)", mp.config.PartitionId, ino.Inode, tinyEks)
}
i.EmptyExtents(ino.ModifyTime)
return
}
// fsmDelVerExtents, like fsmExtentsEmpty, is only used in the datalake scenario
func (mp *metaPartition) fsmDelVerExtents(ino *Inode) (status uint8) {
status = proto.OpOk
item := mp.inodeTree.CopyGet(ino)
if item == nil {
status = proto.OpNotExistErr
return
}
i := item.(*Inode)
if i.ShouldDelete() {
status = proto.OpNotExistErr
return
}
if proto.IsDir(i.Type) {
status = proto.OpArgMismatchErr
return
}
log.LogDebugf("action[fsmExtentsEmpty] mp[%v] ino[%v],eks len [%v]", mp.config.PartitionId, ino.Inode, len(i.Extents.eks))
tinyEks := i.CopyTinyExtents()
log.LogDebugf("action[fsmExtentsEmpty] mp[%v] ino[%v],eks tiny len [%v]", mp.config.PartitionId, ino.Inode, len(tinyEks))
if len(tinyEks) > 0 {
mp.extDelCh <- tinyEks
log.LogDebugf("fsmExtentsEmpty mp[%v] inode[%d] tinyEks(%v)", mp.config.PartitionId, ino.Inode, tinyEks)
}
i.EmptyExtents(ino.ModifyTime)
return
}
func (mp *metaPartition) fsmClearInodeCache(ino *Inode) (status uint8) {
status = proto.OpOk
item := mp.inodeTree.Get(ino)
if item == nil {
status = proto.OpNotExistErr
return
}
ino2 := item.(*Inode)
if ino2.ShouldDelete() {
status = proto.OpNotExistErr
return
}
delExtents := ino2.EmptyExtents(ino.ModifyTime)
log.LogInfof("fsmClearInodeCache.mp[%v] inode[%v] DecSplitExts delExtents(%v)", mp.config.PartitionId, ino2.Inode, delExtents)
if len(delExtents) > 0 {
ino2.DecSplitExts(mp.config.PartitionId, delExtents)
mp.extDelCh <- delExtents
}
return
}
// attention: an unmarshal error will discard the extents
func (mp *metaPartition) fsmSendToChan(val []byte, v3 bool) (status uint8) {
sortExtents := NewSortedExtents()
// ek for del don't need version info
err, _ := sortExtents.UnmarshalBinary(val, v3)
if err != nil {
panic(fmt.Errorf("[fsmDelExtents] unmarshal sortExtents error, mp[%v], err(%s)", mp.config.PartitionId, err.Error()))
}
log.LogInfof("fsmDelExtents mp[%v] delExtents(%v)", mp.config.PartitionId, len(sortExtents.eks))
mp.extDelCh <- sortExtents.eks
return
}
func (mp *metaPartition) fsmSetInodeQuotaBatch(req *proto.BatchSetMetaserverQuotaReuqest) (resp *proto.BatchSetMetaserverQuotaResponse) {
var files int64
var bytes int64
resp = &proto.BatchSetMetaserverQuotaResponse{}
resp.InodeRes = make(map[uint64]uint8, 0)
for _, ino := range req.Inodes {
var isExist bool
var err error
extend := NewExtend(ino)
treeItem := mp.extendTree.Get(extend)
inode := NewInode(ino, 0)
retMsg := mp.getInode(inode, false)
if retMsg.Status != proto.OpOk {
log.LogErrorf("fsmSetInodeQuotaBatch get inode[%v] fail.", ino)
resp.InodeRes[ino] = retMsg.Status
continue
}
inode = retMsg.Msg
log.LogDebugf("fsmSetInodeQuotaBatch msg [%v] inode[%v]", retMsg, inode)
quotaInfos := &proto.MetaQuotaInfos{
QuotaInfoMap: make(map[uint32]*proto.MetaQuotaInfo),
}
quotaInfo := &proto.MetaQuotaInfo{
RootInode: req.IsRoot,
}
if treeItem == nil {
quotaInfos.QuotaInfoMap[req.QuotaId] = quotaInfo
mp.extendTree.ReplaceOrInsert(extend, true)
} else {
extend = treeItem.(*Extend)
value, exist := extend.Get([]byte(proto.QuotaKey))
if exist {
if err = json.Unmarshal(value, "aInfos.QuotaInfoMap); err != nil {
log.LogErrorf("set quota Unmarshal quotaInfos fail [%v]", err)
resp.InodeRes[ino] = proto.OpErr
continue
}
oldQuotaInfo, ok := quotaInfos.QuotaInfoMap[req.QuotaId]
if ok {
isExist = true
quotaInfo = oldQuotaInfo
}
}
quotaInfos.QuotaInfoMap[req.QuotaId] = quotaInfo
}
value, err := json.Marshal(quotaInfos.QuotaInfoMap)
if err != nil {
log.LogErrorf("set quota marsha1 quotaInfos [%v] fail [%v]", quotaInfos, err)
resp.InodeRes[ino] = proto.OpErr
continue
}
extend.Put([]byte(proto.QuotaKey), value, mp.verSeq)
resp.InodeRes[ino] = proto.OpOk
if !isExist {
files += 1
bytes += int64(inode.Size)
}
}
mp.mqMgr.updateUsedInfo(bytes, files, req.QuotaId)
log.LogInfof("fsmSetInodeQuotaBatch quotaId [%v] resp [%v] success.", req.QuotaId, resp)
return
}
func (mp *metaPartition) fsmDeleteInodeQuotaBatch(req *proto.BatchDeleteMetaserverQuotaReuqest) (resp *proto.BatchDeleteMetaserverQuotaResponse) {
var files int64
var bytes int64
resp = &proto.BatchDeleteMetaserverQuotaResponse{}
resp.InodeRes = make(map[uint64]uint8, 0)
for _, ino := range req.Inodes {
var err error
extend := NewExtend(ino)
treeItem := mp.extendTree.Get(extend)
inode := NewInode(ino, 0)
retMsg := mp.getInode(inode, false)
if retMsg.Status != proto.OpOk {
log.LogErrorf("fsmDeleteInodeQuotaBatch get inode[%v] fail.", ino)
resp.InodeRes[ino] = retMsg.Status
continue
}
inode = retMsg.Msg
log.LogDebugf("fsmDeleteInodeQuotaBatch msg [%v] inode[%v]", retMsg, inode)
quotaInfos := &proto.MetaQuotaInfos{
QuotaInfoMap: make(map[uint32]*proto.MetaQuotaInfo),
}
if treeItem == nil {
log.LogDebugf("fsmDeleteInodeQuotaBatch inode[%v] not has extend ", ino)
resp.InodeRes[ino] = proto.OpOk
continue
} else {
extend = treeItem.(*Extend)
value, exist := extend.Get([]byte(proto.QuotaKey))
if exist {
if err = json.Unmarshal(value, "aInfos.QuotaInfoMap); err != nil {
log.LogErrorf("fsmDeleteInodeQuotaBatch ino[%v] Unmarshal quotaInfos fail [%v]", ino, err)
resp.InodeRes[ino] = proto.OpErr
continue
}
_, ok := quotaInfos.QuotaInfoMap[req.QuotaId]
if ok {
delete(quotaInfos.QuotaInfoMap, req.QuotaId)
if len(quotaInfos.QuotaInfoMap) == 0 {
extend.Remove([]byte(proto.QuotaKey))
} else {
value, err = json.Marshal(quotaInfos.QuotaInfoMap)
if err != nil {
log.LogErrorf("fsmDeleteInodeQuotaBatch marsha1 quotaInfos [%v] fail [%v]", quotaInfos, err)
resp.InodeRes[ino] = proto.OpErr
continue
}
extend.Put([]byte(proto.QuotaKey), value, mp.verSeq)
}
} else {
log.LogDebugf("fsmDeleteInodeQuotaBatch QuotaInfoMap can not find inode[%v] quota [%v]", ino, req.QuotaId)
resp.InodeRes[ino] = proto.OpOk
continue
}
} else {
resp.InodeRes[ino] = proto.OpOk
continue
}
}
files -= 1
bytes -= int64(inode.Size)
}
mp.mqMgr.updateUsedInfo(bytes, files, req.QuotaId)
log.LogInfof("fsmDeleteInodeQuotaBatch quotaId [%v] resp [%v] success.", req.QuotaId, resp)
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import "github.com/cubefs/cubefs/proto"
func (mp *metaPartition) fsmCreateMultipart(multipart *Multipart) (status uint8) {
_, ok := mp.multipartTree.ReplaceOrInsert(multipart, false)
if !ok {
return proto.OpExistErr
}
return proto.OpOk
}
func (mp *metaPartition) fsmRemoveMultipart(multipart *Multipart) (status uint8) {
deletedItem := mp.multipartTree.Delete(multipart)
if deletedItem == nil {
return proto.OpNotExistErr
}
return proto.OpOk
}
func (mp *metaPartition) fsmAppendMultipart(multipart *Multipart) (resp proto.AppendMultipartResponse) {
storedItem := mp.multipartTree.CopyGet(multipart)
if storedItem == nil {
resp.Status = proto.OpNotExistErr
return
}
storedMultipart, is := storedItem.(*Multipart)
if !is {
resp.Status = proto.OpNotExistErr
return
}
for _, part := range multipart.Parts() {
oldInode, updated, conflict := storedMultipart.UpdateOrStorePart(part)
if conflict {
resp.Status = proto.OpUploadPartConflictErr
return
}
if updated {
resp.OldInode = oldInode
resp.Update = true
}
}
resp.Status = proto.OpOk
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
func (mp *metaPartition) fsmTxRollback(txID string) (status uint8) {
status = mp.txProcessor.txManager.rollbackTxInfo(txID)
return
}
func (mp *metaPartition) fsmTxDelete(txID string) (status uint8) {
status = mp.txProcessor.txManager.deleteTxInfo(txID)
return
}
func (mp *metaPartition) fsmTxInodeRollback(req *proto.TxInodeApplyRequest) (status uint8) {
status, _ = mp.txProcessor.txResource.rollbackInode(req)
return
}
func (mp *metaPartition) fsmTxDentryRollback(req *proto.TxDentryApplyRequest) (status uint8) {
status, _ = mp.txProcessor.txResource.rollbackDentry(req)
return
}
func (mp *metaPartition) fsmTxSetState(req *proto.TxSetStateRequest) (status uint8) {
status, _ = mp.txProcessor.txManager.txSetState(req)
return
}
func (mp *metaPartition) fsmTxInit(txInfo *proto.TransactionInfo) (status uint8) {
status = proto.OpOk
err := mp.txProcessor.txManager.registerTransaction(txInfo)
if err != nil {
log.LogErrorf("fsmTxInit: register transaction failed, txInfo %s, err %s", txInfo.String(), err.Error())
return proto.OpTxInternalErr
}
return
}
func (mp *metaPartition) fsmTxCommit(txID string) (status uint8) {
status, _ = mp.txProcessor.txManager.commitTxInfo(txID)
return
}
func (mp *metaPartition) fsmTxInodeCommit(txID string, inode uint64) (status uint8) {
// var err error
status, _ = mp.txProcessor.txResource.commitInode(txID, inode)
return
}
func (mp *metaPartition) fsmTxDentryCommit(txID string, pId uint64, name string) (status uint8) {
// var err error
status, _ = mp.txProcessor.txResource.commitDentry(txID, pId, name)
return
}
func (mp *metaPartition) fsmTxCommitRM(txInfo *proto.TransactionInfo) (status uint8) {
status = proto.OpOk
ifo := mp.txProcessor.txManager.copyGetTx(txInfo.TxID)
if ifo == nil || ifo.Finish() {
log.LogWarnf("fsmTxCommitRM: tx already commit or rollback before, tx %v, ifo %v", txInfo, ifo)
return
}
mpId := mp.config.PartitionId
for _, ifo := range txInfo.TxInodeInfos {
if ifo.MpID != mpId {
continue
}
mp.fsmTxInodeCommit(ifo.TxID, ifo.Ino)
}
for _, ifo := range txInfo.TxDentryInfos {
if ifo.MpID != mpId {
continue
}
mp.fsmTxDentryCommit(ifo.TxID, ifo.ParentId, ifo.Name)
}
ifo.SetFinish()
return proto.OpOk
}
func (mp *metaPartition) fsmTxRollbackRM(txInfo *proto.TransactionInfo) (status uint8) {
status = proto.OpOk
ifo := mp.txProcessor.txManager.copyGetTx(txInfo.TxID)
if ifo == nil || ifo.Finish() {
log.LogWarnf("fsmTxRollbackRM: tx already commit or rollback before, tx %v, ifo %v", txInfo, ifo)
return
}
mpId := mp.config.PartitionId
for _, ifo := range txInfo.TxInodeInfos {
if ifo.MpID != mpId {
continue
}
req := &proto.TxInodeApplyRequest{
TxID: ifo.TxID,
Inode: ifo.Ino,
}
mp.fsmTxInodeRollback(req)
}
// delete from rb tree
for _, ifo := range txInfo.TxDentryInfos {
if ifo.MpID != mpId {
continue
}
req := &proto.TxDentryApplyRequest{
TxID: ifo.TxID,
Pid: ifo.ParentId,
Name: ifo.Name,
}
mp.fsmTxDentryRollback(req)
}
ifo.SetFinish()
return proto.OpOk
}
func (mp *metaPartition) inodeInTx(inode uint64) uint8 {
inTx, txId := mp.txProcessor.txResource.isInodeInTransction(NewInode(inode, 0))
if inTx {
log.LogWarnf("inodeInTx: inode is in transaction, inode %d, txId %s", inode, txId)
return proto.OpTxConflictErr
}
return proto.OpOk
}
func (mp *metaPartition) dentryInTx(parIno uint64, name string) uint8 {
inTx, txId := mp.txProcessor.txResource.isDentryInTransction(&Dentry{
ParentId: parIno,
Name: name,
})
if inTx {
log.LogWarnf("inodeInTx: inode is in transaction, parent inode %d, name %s, txId %s", parIno, name, txId)
return proto.OpTxConflictErr
}
return proto.OpOk
}
func (mp *metaPartition) txInodeInRb(inode uint64, newTxId string) (rbInode *TxRollbackInode) {
rbIno := mp.txProcessor.txResource.getTxRbInode(inode)
if rbIno != nil && rbIno.txInodeInfo.TxID == newTxId {
return rbIno
}
return nil
}
func (mp *metaPartition) txDentryInRb(parIno uint64, name, newTxId string) bool {
inTx, txId := mp.txProcessor.txResource.isDentryInTransction(&Dentry{
ParentId: parIno,
Name: name,
})
return inTx && txId == newTxId
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/binary"
"github.com/cubefs/cubefs/proto"
)
type fsmEvictUniqCheckerRequest struct {
Idx int
UniqID uint64
}
type UniqIdResp struct {
Start uint64
End uint64
Status uint8
}
func (mp *metaPartition) fsmUniqID(val []byte) (resp *UniqIdResp) {
resp = &UniqIdResp{
Status: proto.OpOk,
}
num := binary.BigEndian.Uint32(val)
resp.Start, resp.End = mp.allocateUniqID(num)
return resp
}
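// Illustrative sketch (not part of the original source; the helper name is hypothetical):
// fsmUniqID above decodes `val` as a single 4-byte big-endian count of unique IDs to
// allocate, so a matching payload is simply:
func exampleBuildUniqIDVal(num uint32) []byte {
	val := make([]byte, 4)
	binary.BigEndian.PutUint32(val, num)
	return val
}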
func (mp *metaPartition) fsmUniqCheckerEvict(req *fsmEvictUniqCheckerRequest) error {
mp.uniqChecker.doEvict(req.UniqID)
return nil
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"encoding/binary"
"encoding/json"
"fmt"
"io"
"os"
"path"
"reflect"
"strings"
"sync"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
// MetaItem defines the structure of the metadata operations.
type MetaItem struct {
Op uint32 `json:"Op"`
K []byte `json:"k"`
V []byte `json:"v"`
}
// MarshalJson marshals MetaItem to JSON data.
func (s *MetaItem) MarshalJson() ([]byte, error) {
return json.Marshal(s)
}
// MarshalBinary marshals MetaItem to binary data.
// Binary frame structure:
// +------+----+------+------+------+------+
// | Item | Op | LenK | K | LenV | V |
// +------+----+------+------+------+------+
// | byte | 4 | 4 | LenK | 4 | LenV |
// +------+----+------+------+------+------+
func (s *MetaItem) MarshalBinary() (result []byte, err error) {
buff := bytes.NewBuffer(make([]byte, 0))
buff.Grow(4 + len(s.K) + len(s.V))
if err = binary.Write(buff, binary.BigEndian, s.Op); err != nil {
return
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(s.K))); err != nil {
return
}
if _, err = buff.Write(s.K); err != nil {
return
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(s.V))); err != nil {
return
}
if _, err = buff.Write(s.V); err != nil {
return
}
result = buff.Bytes()
return
}
// UnmarshalJson unmarshals JSON data to MetaItem.
func (s *MetaItem) UnmarshalJson(data []byte) error {
return json.Unmarshal(data, s)
}
// UnmarshalBinary unmarshals this MetaItem entity from binary data.
// Binary frame structure:
// +------+----+------+------+------+------+
// | Item | Op | LenK | K | LenV | V |
// +------+----+------+------+------+------+
// | byte | 4 | 4 | LenK | 4 | LenV |
// +------+----+------+------+------+------+
func (s *MetaItem) UnmarshalBinary(raw []byte) (err error) {
var (
lenK uint32
lenV uint32
)
buff := bytes.NewBuffer(raw)
if err = binary.Read(buff, binary.BigEndian, &s.Op); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &lenK); err != nil {
return
}
s.K = make([]byte, lenK)
if _, err = buff.Read(s.K); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &lenV); err != nil {
return
}
s.V = make([]byte, lenV)
if _, err = buff.Read(s.V); err != nil {
return
}
return
}
// NewMetaItem returns a new MetaItem.
func NewMetaItem(op uint32, key, value []byte) *MetaItem {
return &MetaItem{
Op: op,
K: key,
V: value,
}
}
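// Illustrative round-trip of the binary frame documented above (a sketch, not part of
// the original source; the function name and the sample key/value are made up):
// marshal a MetaItem and decode it back to verify the Op/LenK/K/LenV/V layout.
func exampleMetaItemRoundTrip() error {
	src := NewMetaItem(opFSMCreateInode, []byte("sample-key"), []byte("sample-value"))
	raw, err := src.MarshalBinary()
	if err != nil {
		return err
	}
	dst := new(MetaItem)
	if err = dst.UnmarshalBinary(raw); err != nil {
		return err
	}
	if dst.Op != src.Op || !bytes.Equal(dst.K, src.K) || !bytes.Equal(dst.V, src.V) {
		return fmt.Errorf("MetaItem round-trip mismatch: got %v, want %v", dst, src)
	}
	return nil
}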
type fileData struct {
filename string
data []byte
}
const (
// initial version
SnapFormatVersion_0 uint32 = iota
// version since the transaction feature: formatVersion, txId and cursor were added to the MetaItemIterator struct
SnapFormatVersion_1
)
// MetaItemIterator defines the iterator of the MetaItem.
type MetaItemIterator struct {
fileRootDir string
SnapFormatVersion uint32
applyID uint64
uniqID uint64
txId uint64
cursor uint64
inodeTree *BTree
dentryTree *BTree
extendTree *BTree
multipartTree *BTree
txTree *BTree
txRbInodeTree *BTree
txRbDentryTree *BTree
uniqChecker *uniqChecker
verList []*proto.VolVersionInfo
filenames []string
dataCh chan interface{}
errorCh chan error
err error
closeCh chan struct{}
closeOnce sync.Once
}
// SnapItemWrapper key definition
const (
SiwKeySnapFormatVer uint32 = iota
SiwKeyApplyId
SiwKeyTxId
SiwKeyCursor
SiwKeyUniqId
SiwKeyVerList
)
type SnapItemWrapper struct {
key uint32
value interface{}
}
func (siw *SnapItemWrapper) MarshalKey() (k []byte) {
k = make([]byte, 8)
binary.BigEndian.PutUint32(k, siw.key)
return
}
func (siw *SnapItemWrapper) UnmarshalKey(k []byte) (err error) {
siw.key = binary.BigEndian.Uint32(k)
return
}
// newMetaItemIterator returns a new MetaItemIterator.
func newMetaItemIterator(mp *metaPartition) (si *MetaItemIterator, err error) {
si = new(MetaItemIterator)
si.fileRootDir = mp.config.RootDir
si.SnapFormatVersion = mp.manager.metaNode.raftSyncSnapFormatVersion
mp.nonIdempotent.Lock()
si.applyID = mp.getApplyID()
si.txId = mp.txProcessor.txManager.txIdAlloc.getTransactionID()
si.cursor = mp.GetCursor()
si.uniqID = mp.GetUniqId()
si.inodeTree = mp.inodeTree.GetTree()
si.dentryTree = mp.dentryTree.GetTree()
si.extendTree = mp.extendTree.GetTree()
si.multipartTree = mp.multipartTree.GetTree()
si.txTree = mp.txProcessor.txManager.txTree.GetTree()
si.txRbInodeTree = mp.txProcessor.txResource.txRbInodeTree.GetTree()
si.txRbDentryTree = mp.txProcessor.txResource.txRbDentryTree.GetTree()
si.uniqChecker = mp.uniqChecker.clone()
si.verList = mp.GetAllVerList()
mp.nonIdempotent.Unlock()
si.dataCh = make(chan interface{})
si.errorCh = make(chan error, 1)
si.closeCh = make(chan struct{})
// collect extent del files
filenames := make([]string, 0)
var fileInfos []os.DirEntry
if fileInfos, err = os.ReadDir(mp.config.RootDir); err != nil {
return
}
for _, fileInfo := range fileInfos {
if !fileInfo.IsDir() && strings.HasPrefix(fileInfo.Name(), prefixDelExtent) {
filenames = append(filenames, fileInfo.Name())
}
if !fileInfo.IsDir() && strings.HasPrefix(fileInfo.Name(), prefixDelExtentV2) {
filenames = append(filenames, fileInfo.Name())
}
}
si.filenames = filenames
// start data producer
go func(iter *MetaItemIterator) {
defer func() {
close(iter.dataCh)
close(iter.errorCh)
}()
produceItem := func(item interface{}) (success bool) {
select {
case iter.dataCh <- item:
return true
case <-iter.closeCh:
return false
}
}
produceError := func(err error) {
select {
case iter.errorCh <- err:
default:
}
}
checkClose := func() (closed bool) {
select {
case <-iter.closeCh:
return true
default:
return false
}
}
if si.SnapFormatVersion == SnapFormatVersion_0 {
// process index ID
produceItem(si.applyID)
log.LogDebugf("newMetaItemIterator: SnapFormatVersion_0, partitionId(%v), applyID(%v)",
mp.config.PartitionId, si.applyID)
} else if si.SnapFormatVersion == SnapFormatVersion_1 {
// process snapshot format version
snapFormatVerWrapper := SnapItemWrapper{SiwKeySnapFormatVer, si.SnapFormatVersion}
produceItem(snapFormatVerWrapper)
// process apply index ID
applyIdWrapper := SnapItemWrapper{SiwKeyApplyId, si.applyID}
produceItem(applyIdWrapper)
// process txId
txIdWrapper := SnapItemWrapper{SiwKeyTxId, si.txId}
produceItem(txIdWrapper)
// process cursor
cursorWrapper := SnapItemWrapper{SiwKeyCursor, si.cursor}
produceItem(cursorWrapper)
verListWrapper := SnapItemWrapper{SiwKeyVerList, si.verList}
produceItem(verListWrapper)
log.LogDebugf("newMetaItemIterator: SnapFormatVersion_1, partitionId(%v) applyID(%v) txId(%v) cursor(%v) uniqID(%v) verList(%v)",
mp.config.PartitionId, si.applyID, si.txId, si.cursor, si.uniqID, si.verList)
if si.uniqID != 0 {
// process uniqId
uniqIdWrapper := SnapItemWrapper{SiwKeyUniqId, si.uniqID}
produceItem(uniqIdWrapper)
}
} else {
panic(fmt.Sprintf("invalid raftSyncSnapFormatVersione: %v", si.SnapFormatVersion))
}
// process inodes
iter.inodeTree.Ascend(func(i BtreeItem) bool {
return produceItem(i)
})
if checkClose() {
return
}
// process dentries
iter.dentryTree.Ascend(func(i BtreeItem) bool {
return produceItem(i)
})
if checkClose() {
return
}
// process extends
iter.extendTree.Ascend(func(i BtreeItem) bool {
return produceItem(i)
})
if checkClose() {
return
}
// process multiparts
iter.multipartTree.Ascend(func(i BtreeItem) bool {
return produceItem(i)
})
if checkClose() {
return
}
if si.SnapFormatVersion == SnapFormatVersion_1 {
iter.txTree.Ascend(func(i BtreeItem) bool {
return produceItem(i)
})
if checkClose() {
return
}
iter.txRbInodeTree.Ascend(func(i BtreeItem) bool {
return produceItem(i)
})
if checkClose() {
return
}
iter.txRbDentryTree.Ascend(func(i BtreeItem) bool {
return produceItem(i)
})
if checkClose() {
return
}
if si.uniqID != 0 {
produceItem(si.uniqChecker)
if checkClose() {
return
}
}
}
// process extent del files
var err error
var raw []byte
for _, filename := range iter.filenames {
if raw, err = os.ReadFile(path.Join(iter.fileRootDir, filename)); err != nil {
produceError(err)
return
}
if !produceItem(&fileData{filename: filename, data: raw}) {
return
}
}
}(si)
return
}
// ApplyIndex returns the applyID of the iterator.
func (si *MetaItemIterator) ApplyIndex() uint64 {
return si.applyID
}
// Close closes the iterator.
func (si *MetaItemIterator) Close() {
si.closeOnce.Do(func() {
close(si.closeCh)
})
return
}
// Next returns the next item.
func (si *MetaItemIterator) Next() (data []byte, err error) {
if si.err != nil {
err = si.err
return
}
var item interface{}
var open bool
select {
case item, open = <-si.dataCh:
case err, open = <-si.errorCh:
}
if item == nil || !open {
err, si.err = io.EOF, io.EOF
si.Close()
return
}
if err != nil {
si.err = err
si.Close()
return
}
var snap *MetaItem
switch typedItem := item.(type) {
case uint64:
applyIDBuf := make([]byte, 8)
binary.BigEndian.PutUint64(applyIDBuf, si.applyID)
data = applyIDBuf
return
case SnapItemWrapper:
if typedItem.key == SiwKeySnapFormatVer {
snapFormatVerBuf := make([]byte, 8)
binary.BigEndian.PutUint32(snapFormatVerBuf, si.SnapFormatVersion)
snap = NewMetaItem(opFSMSnapFormatVersion, typedItem.MarshalKey(), snapFormatVerBuf)
} else if typedItem.key == SiwKeyApplyId {
applyIDBuf := make([]byte, 8)
binary.BigEndian.PutUint64(applyIDBuf, si.applyID)
snap = NewMetaItem(opFSMApplyId, typedItem.MarshalKey(), applyIDBuf)
} else if typedItem.key == SiwKeyTxId {
txIDBuf := make([]byte, 8)
binary.BigEndian.PutUint64(txIDBuf, si.txId)
snap = NewMetaItem(opFSMTxId, typedItem.MarshalKey(), txIDBuf)
} else if typedItem.key == SiwKeyCursor {
cursor := typedItem.value.(uint64)
cursorBuf := make([]byte, 8)
binary.BigEndian.PutUint64(cursorBuf, cursor)
snap = NewMetaItem(opFSMCursor, typedItem.MarshalKey(), cursorBuf)
} else if typedItem.key == SiwKeyUniqId {
uniqId := typedItem.value.(uint64)
uniqIdBuf := make([]byte, 8)
binary.BigEndian.PutUint64(uniqIdBuf, uniqId)
snap = NewMetaItem(opFSMUniqIDSnap, typedItem.MarshalKey(), uniqIdBuf)
} else if typedItem.key == SiwKeyVerList {
var verListBuf []byte
if verListBuf, err = json.Marshal(typedItem.value.([]*proto.VolVersionInfo)); err != nil {
return
}
snap = NewMetaItem(opFSMVerListSnapShot, typedItem.MarshalKey(), verListBuf)
log.LogInfof("snapshot.fileRootDir %v verList %v", si.fileRootDir, verListBuf)
} else {
panic(fmt.Sprintf("MetaItemIterator.Next: unknown SnapItemWrapper key: %v", typedItem.key))
}
case *Inode:
snap = NewMetaItem(opFSMCreateInode, typedItem.MarshalKey(), typedItem.MarshalValue())
case *Dentry:
snap = NewMetaItem(opFSMCreateDentry, typedItem.MarshalKey(), typedItem.MarshalValue())
case *Extend:
var raw []byte
if raw, err = typedItem.Bytes(); err != nil {
si.err = err
si.Close()
return
}
snap = NewMetaItem(opFSMSetXAttr, nil, raw)
case *Multipart:
var raw []byte
if raw, err = typedItem.Bytes(); err != nil {
si.err = err
si.Close()
return
}
snap = NewMetaItem(opFSMCreateMultipart, nil, raw)
case *proto.TransactionInfo:
val, _ := typedItem.Marshal()
snap = NewMetaItem(opFSMTxSnapshot, []byte(typedItem.TxID), val)
case *TxRollbackInode:
val, _ := typedItem.Marshal()
snap = NewMetaItem(opFSMTxRbInodeSnapshot, typedItem.inode.MarshalKey(), val)
case *TxRollbackDentry:
val, _ := typedItem.Marshal()
snap = NewMetaItem(opFSMTxRbDentrySnapshot, []byte(typedItem.txDentryInfo.GetKey()), val)
case *fileData:
snap = NewMetaItem(opExtentFileSnapshot, []byte(typedItem.filename), typedItem.data)
case *uniqChecker:
var raw []byte
if raw, _, err = typedItem.Marshal(); err != nil {
si.err = err
si.Close()
return
}
snap = NewMetaItem(opFSMUniqCheckerSnap, nil, raw)
default:
panic(fmt.Sprintf("unknown item type: %v", reflect.TypeOf(item).Name()))
}
if data, err = snap.MarshalBinary(); err != nil {
si.err = err
si.Close()
return
}
return
}
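// Illustrative consumer loop (a sketch, not part of the original source, with a
// hypothetical function name): a snapshot sender would drain the iterator by calling
// Next until io.EOF and then release it with Close.
func exampleDrainMetaItemIterator(si *MetaItemIterator) (count int, err error) {
	defer si.Close()
	for {
		var data []byte
		if data, err = si.Next(); err != nil {
			if err == io.EOF {
				err = nil
			}
			return
		}
		_ = data // each entry is a MarshalBinary()-encoded MetaItem frame
		count++
	}
}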
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/json"
"fmt"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/auditlog"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
func (mp *metaPartition) TxCreateDentry(req *proto.TxCreateDentryRequest, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogDentryOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.Name, req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Inode, 0)
}()
}
if req.ParentID == req.Inode {
err = fmt.Errorf("parentId is equal inodeId")
p.PacketErrorWithBody(proto.OpExistErr, []byte(err.Error()))
return
}
for _, quotaId := range req.QuotaIds {
status := mp.mqMgr.IsOverQuota(false, true, quotaId)
if status != 0 {
err = errors.New("create dentry is over quota")
reply := []byte(err.Error())
p.PacketErrorWithBody(status, reply)
return
}
}
var parIno *Inode
item := mp.inodeTree.Get(NewInode(req.ParentID, 0))
if item == nil {
err = fmt.Errorf("parent inode not exists")
p.PacketErrorWithBody(proto.OpNotExistErr, []byte(err.Error()))
return
}
parIno = item.(*Inode)
quota := atomic.LoadUint32(&dirChildrenNumLimit)
if parIno.NLink >= quota {
err = fmt.Errorf("parent dir quota limitation reached")
p.PacketErrorWithBody(proto.OpDirQuota, []byte(err.Error()))
return
}
txInfo := req.TxInfo.GetCopy()
txDentry := NewTxDentry(req.ParentID, req.Name, req.Inode, req.Mode, parIno, txInfo)
val, err := txDentry.Marshal()
if err != nil {
return
}
status, err := mp.submit(opFSMTxCreateDentry, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
p.ResultCode = status.(uint8)
return
}
// CreateDentry creates a new dentry.
func (mp *metaPartition) CreateDentry(req *CreateDentryReq, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogDentryOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.Name, req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Inode, req.ParentID)
}()
}
if req.ParentID == req.Inode {
err = fmt.Errorf("parentId is equal inodeId")
p.PacketErrorWithBody(proto.OpExistErr, []byte(err.Error()))
return
}
item := mp.inodeTree.CopyGet(NewInode(req.ParentID, 0))
if item == nil {
err = fmt.Errorf("parent inode not exists")
p.PacketErrorWithBody(proto.OpNotExistErr, []byte(err.Error()))
return
} else {
parIno := item.(*Inode)
quota := atomic.LoadUint32(&dirChildrenNumLimit)
if parIno.NLink >= quota {
err = fmt.Errorf("parent dir quota limitation reached")
p.PacketErrorWithBody(proto.OpDirQuota, []byte(err.Error()))
return
}
}
dentry := &Dentry{
ParentId: req.ParentID,
Name: req.Name,
Inode: req.Inode,
Type: req.Mode,
multiSnap: NewDentrySnap(mp.GetVerSeq()),
}
val, err := dentry.Marshal()
if err != nil {
return
}
resp, err := mp.submit(opFSMCreateDentry, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
p.ResultCode = resp.(uint8)
return
}
func (mp *metaPartition) QuotaCreateDentry(req *proto.QuotaCreateDentryRequest, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogDentryOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.Name, req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Inode, req.ParentID)
}()
}
if req.ParentID == req.Inode {
err = fmt.Errorf("parentId is equal inodeId")
p.PacketErrorWithBody(proto.OpExistErr, []byte(err.Error()))
return
}
for _, quotaId := range req.QuotaIds {
status := mp.mqMgr.IsOverQuota(false, true, quotaId)
if status != 0 {
err = errors.New("create dentry is over quota")
reply := []byte(err.Error())
p.PacketErrorWithBody(status, reply)
return
}
}
item := mp.inodeTree.CopyGet(NewInode(req.ParentID, 0))
if item == nil {
err = fmt.Errorf("parent inode not exists")
p.PacketErrorWithBody(proto.OpNotExistErr, []byte(err.Error()))
return
} else {
parIno := item.(*Inode)
quota := atomic.LoadUint32(&dirChildrenNumLimit)
if parIno.NLink >= quota {
err = fmt.Errorf("parent dir quota limitation reached")
p.PacketErrorWithBody(proto.OpDirQuota, []byte(err.Error()))
return
}
}
dentry := &Dentry{
ParentId: req.ParentID,
Name: req.Name,
Inode: req.Inode,
Type: req.Mode,
}
dentry.setVerSeq(mp.verSeq)
log.LogDebugf("action[CreateDentry] mp[%v] with seq [%v],dentry [%v]", mp.config.PartitionId, mp.verSeq, dentry)
val, err := dentry.Marshal()
if err != nil {
return
}
resp, err := mp.submit(opFSMCreateDentry, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
p.ResultCode = resp.(uint8)
return
}
func (mp *metaPartition) TxDeleteDentry(req *proto.TxDeleteDentryRequest, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogDentryOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.Name, req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Ino, req.ParentID)
}()
}
txInfo := req.TxInfo.GetCopy()
den := &Dentry{
ParentId: req.ParentID,
Name: req.Name,
}
defer func() {
if p.ResultCode == proto.OpOk {
var reply []byte
resp := &proto.TxDeleteDentryResponse{
Inode: req.Ino,
}
reply, err = json.Marshal(resp)
p.PacketOkWithBody(reply)
}
}()
dentry, status := mp.getDentry(den)
if status != proto.OpOk {
if mp.txDentryInRb(req.ParentID, req.Name, req.TxInfo.TxID) {
p.ResultCode = proto.OpOk
log.LogWarnf("TxDeleteDentry: dentry is already been deleted before, req %v", req)
return
}
err = fmt.Errorf("dentry[%v] not exists", den)
log.LogWarn(err)
p.PacketErrorWithBody(status, []byte(err.Error()))
return
}
if dentry.Inode != req.Ino {
err = fmt.Errorf("target name ino is not right, par %d, name %s, want %d, got %d",
req.PartitionID, req.Name, req.Ino, dentry.Inode)
log.LogWarn(err)
p.PacketErrorWithBody(proto.OpExistErr, []byte(err.Error()))
return
}
parIno := NewInode(req.ParentID, 0)
inoResp := mp.getInode(parIno, false)
if inoResp.Status != proto.OpOk {
err = fmt.Errorf("parIno[%v] not exists", parIno.Inode)
p.PacketErrorWithBody(inoResp.Status, []byte(err.Error()))
return
}
txDentry := &TxDentry{
// ParInode: inoResp.Msg,
Dentry: dentry,
TxInfo: txInfo,
}
val, err := txDentry.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
r, err := mp.submit(opFSMTxDeleteDentry, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
retMsg := r.(*DentryResponse)
p.ResultCode = retMsg.Status
return
}
// DeleteDentry deletes a dentry.
func (mp *metaPartition) DeleteDentry(req *DeleteDentryReq, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogDentryOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.Name, req.GetFullPath(), err, time.Since(start).Milliseconds(), 0, req.ParentID)
}()
}
if req.InodeCreateTime > 0 {
if mp.vol.volDeleteLockTime > 0 && req.InodeCreateTime+mp.vol.volDeleteLockTime*60*60 > time.Now().Unix() {
err = errors.NewErrorf("the current Inode[%v] is still locked for deletion", req.Name)
log.LogDebugf("DeleteDentry: the current Inode is still locked for deletion, inode[%v] createTime(%v) mw.volDeleteLockTime(%v) now(%v)", req.Name, req.InodeCreateTime, mp.vol.volDeleteLockTime, time.Now().Unix())
p.PacketErrorWithBody(proto.OpNotPerm, []byte(err.Error()))
return
}
}
dentry := &Dentry{
ParentId: req.ParentID,
Name: req.Name,
}
dentry.setVerSeq(req.Verseq)
log.LogDebugf("action[DeleteDentry] den param(%v)", dentry)
val, err := dentry.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
if mp.verSeq == 0 && dentry.getSeqFiled() > 0 {
err = fmt.Errorf("snapshot not enabled")
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
log.LogDebugf("action[DeleteDentry] submit!")
r, err := mp.submit(opFSMDeleteDentry, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
retMsg := r.(*DentryResponse)
p.ResultCode = retMsg.Status
dentry = retMsg.Msg
if p.ResultCode == proto.OpOk {
var reply []byte
resp := &DeleteDentryResp{
Inode: dentry.Inode,
}
reply, err = json.Marshal(resp)
p.PacketOkWithBody(reply)
}
return
}
// DeleteDentryBatch deletes a batch of dentries.
func (mp *metaPartition) DeleteDentryBatch(req *BatchDeleteDentryReq, p *Packet, remoteAddr string) (err error) {
db := make(DentryBatch, 0, len(req.Dens))
start := time.Now()
for i, d := range req.Dens {
db = append(db, &Dentry{
ParentId: req.ParentID,
Name: d.Name,
Inode: d.Inode,
Type: d.Type,
})
den := &d
fullPath := ""
if len(req.FullPaths) > i {
fullPath = req.FullPaths[i]
}
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogDentryOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), den.Name, fullPath, err, time.Since(start).Milliseconds(), den.Inode, req.ParentID)
}()
}
}
val, err := db.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
r, err := mp.submit(opFSMDeleteDentryBatch, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return err
}
retMsg := r.([]*DentryResponse)
p.ResultCode = proto.OpOk
bddr := &BatchDeleteDentryResp{}
for _, m := range retMsg {
if m.Status != proto.OpOk {
p.ResultCode = proto.OpErr
}
if dentry := m.Msg; dentry != nil {
bddr.Items = append(bddr.Items, &struct {
Inode uint64 `json:"ino"`
Status uint8 `json:"status"`
}{
Inode: dentry.Inode,
Status: m.Status,
})
} else {
bddr.Items = append(bddr.Items, &struct {
Inode uint64 `json:"ino"`
Status uint8 `json:"status"`
}{
Status: m.Status,
})
}
}
reply, err := json.Marshal(bddr)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return err
}
p.PacketOkWithBody(reply)
return
}
func (mp *metaPartition) TxUpdateDentry(req *proto.TxUpdateDentryRequest, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogDentryOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.Name, req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Inode, req.ParentID)
}()
}
if req.ParentID == req.Inode {
err = fmt.Errorf("parentId is equal inodeId")
p.PacketErrorWithBody(proto.OpExistErr, []byte(err.Error()))
return
}
txInfo := req.TxInfo.GetCopy()
defer func() {
if p.ResultCode == proto.OpOk {
var reply []byte
m := &proto.TxUpdateDentryResponse{
Inode: req.OldIno,
}
reply, _ = json.Marshal(m)
p.PacketOkWithBody(reply)
}
}()
newDentry := &Dentry{
ParentId: req.ParentID,
Name: req.Name,
Inode: req.Inode,
}
oldDentry, status := mp.getDentry(newDentry)
if status != proto.OpOk {
if mp.txDentryInRb(req.ParentID, req.Name, req.TxInfo.TxID) {
p.ResultCode = proto.OpOk
log.LogWarnf("TxDeleteDentry: dentry is already been deleted before, req %v", req)
return
}
err = fmt.Errorf("oldDentry[%v] not exists", oldDentry)
p.PacketErrorWithBody(status, []byte(err.Error()))
return
}
if oldDentry.Inode != req.OldIno {
err = fmt.Errorf("oldDentry is alredy updated, req %v, old [%v]", req, oldDentry)
p.PacketErrorWithBody(proto.OpNotExistErr, []byte(err.Error()))
return
}
txDentry := &TxUpdateDentry{
OldDentry: oldDentry,
NewDentry: newDentry,
TxInfo: txInfo,
}
val, err := txDentry.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
resp, err := mp.submit(opFSMTxUpdateDentry, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
msg := resp.(*DentryResponse)
p.ResultCode = msg.Status
return
}
// UpdateDentry updates a dentry.
func (mp *metaPartition) UpdateDentry(req *UpdateDentryReq, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogDentryOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.Name, req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Inode, req.ParentID)
}()
}
if req.ParentID == req.Inode {
err = fmt.Errorf("parentId is equal inodeId")
p.PacketErrorWithBody(proto.OpExistErr, []byte(err.Error()))
return
}
dentry := &Dentry{
ParentId: req.ParentID,
Name: req.Name,
Inode: req.Inode,
}
dentry.setVerSeq(mp.verSeq)
val, err := dentry.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
resp, err := mp.submit(opFSMUpdateDentry, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
msg := resp.(*DentryResponse)
p.ResultCode = msg.Status
if msg.Status == proto.OpOk {
var reply []byte
m := &UpdateDentryResp{
Inode: msg.Msg.Inode,
}
reply, err = json.Marshal(m)
p.PacketOkWithBody(reply)
}
return
}
func (mp *metaPartition) ReadDirOnly(req *ReadDirOnlyReq, p *Packet) (err error) {
resp := mp.readDirOnly(req)
reply, err := json.Marshal(resp)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(reply)
return
}
// ReadDir reads the directory based on the given request.
func (mp *metaPartition) ReadDir(req *ReadDirReq, p *Packet) (err error) {
resp := mp.readDir(req)
reply, err := json.Marshal(resp)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(reply)
return
}
func (mp *metaPartition) ReadDirLimit(req *ReadDirLimitReq, p *Packet) (err error) {
log.LogInfof("action[ReadDirLimit] read seq [%v], request[%v]", req.VerSeq, req)
resp := mp.readDirLimit(req)
reply, err := json.Marshal(resp)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(reply)
return
}
// Lookup looks up the given dentry from the request.
func (mp *metaPartition) Lookup(req *LookupReq, p *Packet) (err error) {
dentry := &Dentry{
ParentId: req.ParentID,
Name: req.Name,
}
dentry.setVerSeq(req.VerSeq)
var denList []proto.DetryInfo
if req.VerAll {
denList = mp.getDentryList(dentry)
}
dentry, status := mp.getDentry(dentry)
var reply []byte
if status == proto.OpOk || req.VerAll {
var resp *LookupResp
if status == proto.OpOk {
resp = &LookupResp{
Inode: dentry.Inode,
Mode: dentry.Type,
VerSeq: dentry.getSeqFiled(),
LayAll: denList,
}
} else {
resp = &LookupResp{
Inode: 0,
Mode: 0,
VerSeq: 0,
LayAll: denList,
}
}
reply, err = json.Marshal(resp)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
p.PacketErrorWithBody(status, reply)
return
}
// GetDentryTree returns the dentry tree stored in the meta partition.
func (mp *metaPartition) GetDentryTree() *BTree {
return mp.dentryTree.GetTree()
}
// GetDentryTreeLen returns the dentry tree length.
func (mp *metaPartition) GetDentryTreeLen() int {
if mp.dentryTree == nil {
return 0
}
return mp.dentryTree.Len()
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/json"
"strconv"
"strings"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
func (mp *metaPartition) UpdateXAttr(req *proto.UpdateXAttrRequest, p *Packet) (err error) {
newValueList := strings.Split(req.Value, ",")
if len(newValueList) < 3 {
err = errors.New("Wrong number of parameters")
log.LogErrorf("action[UpdateXAttr],Wrong number of parameters")
p.PacketErrorWithBody(proto.OpArgMismatchErr, []byte(err.Error()))
return
}
filesInc, err := strconv.ParseInt(newValueList[0], 10, 64)
if err != nil {
log.LogErrorf("action[UpdateXAttr],The parameter must be an integer: err(%v)", err)
p.PacketErrorWithBody(proto.OpArgMismatchErr, []byte(err.Error()))
return
}
dirsInc, err := strconv.ParseInt(newValueList[1], 10, 64)
if err != nil {
log.LogErrorf("action[UpdateXAttr],The parameter must be an integer: err(%v)", err)
p.PacketErrorWithBody(proto.OpArgMismatchErr, []byte(err.Error()))
return
}
bytesInc, err := strconv.ParseInt(newValueList[2], 10, 64)
if err != nil {
log.LogErrorf("action[UpdateXAttr],The parameter must be an integer: err(%v)", err)
p.PacketErrorWithBody(proto.OpArgMismatchErr, []byte(err.Error()))
return
}
mp.xattrLock.Lock()
defer mp.xattrLock.Unlock()
treeItem := mp.extendTree.Get(NewExtend(req.Inode))
if treeItem != nil {
extend := treeItem.(*Extend)
if value, exist := extend.Get([]byte(req.Key)); exist {
oldValueList := strings.Split(string(value), ",")
oldFiles, _ := strconv.ParseInt(oldValueList[0], 10, 64)
oldDirs, _ := strconv.ParseInt(oldValueList[1], 10, 64)
oldBytes, _ := strconv.ParseInt(oldValueList[2], 10, 64)
newFiles := oldFiles + filesInc
newDirs := oldDirs + dirsInc
newBytes := oldBytes + bytesInc
newValue := strconv.FormatInt(newFiles, 10) + "," +
strconv.FormatInt(newDirs, 10) + "," +
strconv.FormatInt(newBytes, 10)
extend := NewExtend(req.Inode)
extend.Put([]byte(req.Key), []byte(newValue), mp.verSeq)
if _, err = mp.putExtend(opFSMUpdateXAttr, extend); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkReply()
return
} else {
extend.Put([]byte(req.Key), []byte(req.Value), mp.verSeq)
if _, err = mp.putExtend(opFSMUpdateXAttr, extend); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkReply()
return
}
} else {
extend := NewExtend(req.Inode)
extend.Put([]byte(req.Key), []byte(req.Value), mp.verSeq)
if _, err = mp.putExtend(opFSMUpdateXAttr, extend); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkReply()
return
}
}
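// exampleBuildUpdateXAttrValue is an illustrative sketch, not part of the
// original code: it shows the "filesInc,dirsInc,bytesInc" value format that
// UpdateXAttr expects, with the three deltas joined by commas and parsed back
// via strconv.ParseInt on the server side.
func exampleBuildUpdateXAttrValue(filesInc, dirsInc, bytesInc int64) string {
return strconv.FormatInt(filesInc, 10) + "," +
strconv.FormatInt(dirsInc, 10) + "," +
strconv.FormatInt(bytesInc, 10)
}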
func (mp *metaPartition) SetXAttr(req *proto.SetXAttrRequest, p *Packet) (err error) {
extend := NewExtend(req.Inode)
extend.Put([]byte(req.Key), []byte(req.Value), mp.verSeq)
if _, err = mp.putExtend(opFSMSetXAttr, extend); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkReply()
return
}
func (mp *metaPartition) BatchSetXAttr(req *proto.BatchSetXAttrRequest, p *Packet) (err error) {
extend := NewExtend(req.Inode)
for key, val := range req.Attrs {
extend.Put([]byte(key), []byte(val), mp.verSeq)
}
if _, err = mp.putExtend(opFSMSetXAttr, extend); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkReply()
return
}
func (mp *metaPartition) GetXAttr(req *proto.GetXAttrRequest, p *Packet) (err error) {
response := &proto.GetXAttrResponse{
VolName: req.VolName,
PartitionId: req.PartitionId,
Inode: req.Inode,
Key: req.Key,
}
treeItem := mp.extendTree.Get(NewExtend(req.Inode))
if treeItem != nil {
if extend := treeItem.(*Extend).GetExtentByVersion(req.VerSeq); extend != nil {
if value, exist := extend.Get([]byte(req.Key)); exist {
response.Value = string(value)
}
}
}
var encoded []byte
encoded, err = json.Marshal(response)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(encoded)
return
}
func (mp *metaPartition) GetAllXAttr(req *proto.GetAllXAttrRequest, p *Packet) (err error) {
response := &proto.GetAllXAttrResponse{
VolName: req.VolName,
PartitionId: req.PartitionId,
Inode: req.Inode,
Attrs: make(map[string]string),
}
treeItem := mp.extendTree.Get(NewExtend(req.Inode))
if treeItem != nil {
if extend := treeItem.(*Extend).GetExtentByVersion(req.VerSeq); extend != nil {
for key, val := range extend.dataMap {
response.Attrs[key] = string(val)
}
}
}
var encoded []byte
encoded, err = json.Marshal(response)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(encoded)
return
}
func (mp *metaPartition) BatchGetXAttr(req *proto.BatchGetXAttrRequest, p *Packet) (err error) {
response := &proto.BatchGetXAttrResponse{
VolName: req.VolName,
PartitionId: req.PartitionId,
XAttrs: make([]*proto.XAttrInfo, 0, len(req.Inodes)),
}
for _, inode := range req.Inodes {
treeItem := mp.extendTree.Get(NewExtend(inode))
if treeItem != nil {
info := &proto.XAttrInfo{
Inode: inode,
XAttrs: make(map[string]string),
}
var extend *Extend
if extend = treeItem.(*Extend).GetExtentByVersion(req.VerSeq); extend != nil {
for _, key := range req.Keys {
if val, exist := extend.Get([]byte(key)); exist {
info.XAttrs[key] = string(val)
}
}
}
response.XAttrs = append(response.XAttrs, info)
}
}
var encoded []byte
if encoded, err = json.Marshal(response); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(encoded)
return
}
func (mp *metaPartition) RemoveXAttr(req *proto.RemoveXAttrRequest, p *Packet) (err error) {
extend := NewExtend(req.Inode)
extend.Put([]byte(req.Key), nil, req.VerSeq)
if _, err = mp.putExtend(opFSMRemoveXAttr, extend); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkReply()
return
}
func (mp *metaPartition) ListXAttr(req *proto.ListXAttrRequest, p *Packet) (err error) {
response := &proto.ListXAttrResponse{
VolName: req.VolName,
PartitionId: req.PartitionId,
Inode: req.Inode,
XAttrs: make([]string, 0),
}
treeItem := mp.extendTree.Get(NewExtend(req.Inode))
if treeItem != nil {
if extend := treeItem.(*Extend).GetExtentByVersion(req.VerSeq); extend != nil {
extend.Range(func(key, value []byte) bool {
response.XAttrs = append(response.XAttrs, string(key))
return true
})
}
}
var encoded []byte
encoded, err = json.Marshal(response)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(encoded)
return
}
func (mp *metaPartition) putExtend(op uint32, extend *Extend) (resp interface{}, err error) {
var marshaled []byte
if marshaled, err = extend.Bytes(); err != nil {
return
}
resp, err = mp.submit(op, marshaled)
return
}
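// setXAttrExample is an illustrative sketch, not part of the original code:
// it shows how the xattr handlers above package a single key/value pair into
// an Extend tagged with the partition's verSeq before submitting it to raft
// through putExtend.
func setXAttrExample(mp *metaPartition, inode uint64, key, value string) error {
extend := NewExtend(inode)
extend.Put([]byte(key), []byte(value), mp.verSeq)
_, err := mp.putExtend(opFSMSetXAttr, extend)
return err
}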
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/json"
"fmt"
"os"
"sort"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/auditlog"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
func (mp *metaPartition) CheckQuota(inodeId uint64, p *Packet) (iParm *Inode, inode *Inode, err error) {
iParm = NewInode(inodeId, 0)
status := mp.isOverQuota(inodeId, true, false)
if status != 0 {
log.LogErrorf("CheckQuota dir quota fail inode[%v] status [%v]", inodeId, status)
err = errors.New("CheckQuota dir quota is over quota")
reply := []byte(err.Error())
p.PacketErrorWithBody(status, reply)
return
}
item := mp.inodeTree.Get(iParm)
if item == nil {
err = fmt.Errorf("inode[%v] not exist", iParm)
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
inode = item.(*Inode)
mp.uidManager.acLock.Lock()
if mp.uidManager.getUidAcl(inode.Uid) {
log.LogWarnf("CheckQuota UidSpace.volname [%v] mp[%v] uid %v be set full", mp.uidManager.mpID, mp.uidManager.volName, inode.Uid)
mp.uidManager.acLock.Unlock()
status = proto.OpNoSpaceErr
err = errors.New("CheckQuota UidSpace is over quota")
reply := []byte(err.Error())
p.PacketErrorWithBody(status, reply)
return
}
mp.uidManager.acLock.Unlock()
return
}
// ExtentAppend appends an extent.
func (mp *metaPartition) ExtentAppend(req *proto.AppendExtentKeyRequest, p *Packet) (err error) {
if !proto.IsHot(mp.volType) {
err = fmt.Errorf("only support hot vol")
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
ino := NewInode(req.Inode, 0)
if _, _, err = mp.CheckQuota(req.Inode, p); err != nil {
log.LogErrorf("ExtentAppend fail status [%v]", err)
return
}
ext := req.Extent
ino.Extents.Append(ext)
val, err := ino.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
resp, err := mp.submit(opFSMExtentsAdd, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
p.PacketErrorWithBody(resp.(uint8), nil)
return
}
// ExtentAppendWithCheck appends an extent with a discard-extents check.
// Format: one valid extent key followed by zero or more discard keys.
func (mp *metaPartition) ExtentAppendWithCheck(req *proto.AppendExtentKeyWithCheckRequest, p *Packet) (err error) {
status := mp.isOverQuota(req.Inode, true, false)
if status != 0 {
log.LogErrorf("ExtentAppendWithCheck fail status [%v]", status)
err = errors.New("ExtentAppendWithCheck is over quota")
reply := []byte(err.Error())
p.PacketErrorWithBody(status, reply)
return
}
var (
inoParm *Inode
i *Inode
)
if inoParm, i, err = mp.CheckQuota(req.Inode, p); err != nil {
log.LogErrorf("ExtentAppendWithCheck CheckQuota fail err [%v]", err)
return
}
// Check the volume type: for a cold volume, a cbfs extent can be modified/added only when the corresponding objextent exists.
if proto.IsCold(mp.volType) {
i.RLock()
exist, idx := i.ObjExtents.FindOffsetExist(req.Extent.FileOffset)
if !exist {
i.RUnlock()
err = fmt.Errorf("ebs's objextent not exist with offset[%v]", req.Extent.FileOffset)
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
if i.ObjExtents.eks[idx].Size != uint64(req.Extent.Size) {
err = fmt.Errorf("ebs's objextent size[%v] isn't equal to the append size[%v]", i.ObjExtents.eks[idx].Size, req.Extent.Size)
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
i.RUnlock()
return
}
i.RUnlock()
}
ext := req.Extent
// The extent key's verSeq is not set here because Marshal does not include it;
// the inode's verSeq is used instead.
inoParm.setVer(mp.verSeq)
inoParm.Extents.Append(ext)
log.LogDebugf("ExtentAppendWithCheck: ino(%v) mp[%v] verSeq (%v)", req.Inode, req.PartitionID, mp.verSeq)
// Store discard extents right after the append extent key.
if len(req.DiscardExtents) != 0 {
inoParm.Extents.eks = append(inoParm.Extents.eks, req.DiscardExtents...)
}
val, err := inoParm.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
var opFlag uint32 = opFSMExtentsAddWithCheck
if req.IsSplit {
opFlag = opFSMExtentSplit
}
resp, err := mp.submit(opFlag, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
log.LogDebugf("ExtentAppendWithCheck: ino(%v) mp[%v] verSeq (%v) req.VerSeq(%v) rspcode(%v)", req.Inode, req.PartitionID, mp.verSeq, req.VerSeq, resp.(uint8))
if mp.verSeq > req.VerSeq {
// Reuse ExtentType to flag a version inconsistency between metanode and client;
// the response makes the client refresh every streamer's extents and their verSeq.
p.ExtentType |= proto.MultiVersionFlag
p.VerSeq = mp.verSeq
}
p.PacketErrorWithBody(resp.(uint8), nil)
return
}
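// exampleAppendWithCheckReq is an illustrative sketch, not part of the
// original code: it shows the request layout ExtentAppendWithCheck expects,
// one valid extent key plus zero or more discard keys stored right after it.
// Other request fields (volume name, partition id, VerSeq, IsSplit) are
// omitted here.
func exampleAppendWithCheckReq(ino uint64, valid proto.ExtentKey, discard []proto.ExtentKey) *proto.AppendExtentKeyWithCheckRequest {
return &proto.AppendExtentKeyWithCheckRequest{
Inode: ino,
Extent: valid,
DiscardExtents: discard,
}
}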
func (mp *metaPartition) SetTxInfo(info []*proto.TxInfo) {
for _, txInfo := range info {
if txInfo.Volume != mp.config.VolName {
continue
}
mp.txProcessor.mask = txInfo.Mask
mp.txProcessor.txManager.setLimit(txInfo.OpLimitVal)
log.LogInfof("SetTxInfo mp[%v] mask %v limit %v", mp.config.PartitionId, proto.GetMaskString(txInfo.Mask), txInfo.OpLimitVal)
}
}
type VerOpData struct {
Op uint8
VerSeq uint64
VerList []*proto.VolVersionInfo
}
func (mp *metaPartition) checkByMasterVerlist(mpVerList *proto.VolVersionInfoList, masterVerList *proto.VolVersionInfoList) (err error) {
currMasterSeq := masterVerList.GetLastVer()
verMapMaster := make(map[uint64]*proto.VolVersionInfo)
for _, ver := range masterVerList.VerList {
verMapMaster[ver.Ver] = ver
}
log.LogDebugf("checkVerList. volname [%v] mp[%v] masterVerList %v mpVerList.VerList %v", mp.config.VolName, mp.config.PartitionId, masterVerList, mpVerList.VerList)
mp.multiVersionList.RWLock.Lock()
defer mp.multiVersionList.RWLock.Unlock()
vlen := len(mpVerList.VerList)
for id, info2 := range mpVerList.VerList {
if id == vlen-1 {
break
}
log.LogDebugf("checkVerList. volname [%v] mp[%v] ver info %v currMasterseq [%v]", mp.config.VolName, mp.config.PartitionId, info2, currMasterSeq)
_, exist := verMapMaster[info2.Ver]
if !exist {
if _, ok := mp.multiVersionList.TemporaryVerMap[info2.Ver]; !ok {
log.LogInfof("checkVerList. volname [%v] mp[%v] ver info %v be consider as TemporaryVer", mp.config.VolName, mp.config.PartitionId, info2)
mp.multiVersionList.TemporaryVerMap[info2.Ver] = info2
}
}
}
for verSeq := range mp.multiVersionList.TemporaryVerMap {
for index, verInfo := range mp.multiVersionList.VerList {
if verInfo.Ver == verSeq {
log.LogInfof("checkVerList.updateVerList volname [%v] mp[%v] ver info %v be consider as TemporaryVer and do deletion verlist %v",
mp.config.VolName, mp.config.PartitionId, verInfo, mp.multiVersionList.VerList)
if index == len(mp.multiVersionList.VerList)-1 {
log.LogInfof("checkVerList.updateVerList volname [%v] mp[%v] last ver info %v should not be consider as TemporaryVer and do deletion verlist %v",
mp.config.VolName, mp.config.PartitionId, verInfo, mp.multiVersionList.VerList)
return
} else {
mp.multiVersionList.VerList = append(mp.multiVersionList.VerList[:index], mp.multiVersionList.VerList[index+1:]...)
}
log.LogInfof("checkVerList.updateVerList volname [%v] mp[%v] verlist %v", mp.config.VolName, mp.config.PartitionId, mp.multiVersionList.VerList)
break
}
}
}
return
}
func (mp *metaPartition) checkVerList(reqVerListInfo *proto.VolVersionInfoList, sync bool) (needUpdate bool, err error) {
mp.multiVersionList.RWLock.RLock()
verMapLocal := make(map[uint64]*proto.VolVersionInfo)
verMapReq := make(map[uint64]*proto.VolVersionInfo)
for _, ver := range reqVerListInfo.VerList {
verMapReq[ver.Ver] = ver
}
var VerList []*proto.VolVersionInfo
for _, info2 := range mp.multiVersionList.VerList {
log.LogDebugf("checkVerList. volname [%v] mp[%v] ver info %v", mp.config.VolName, mp.config.PartitionId, info2)
vms, exist := verMapReq[info2.Ver]
if !exist {
log.LogWarnf("checkVerList. volname [%v] mp[%v] version info(%v) not exist in master (%v)",
mp.config.VolName, mp.config.PartitionId, info2, reqVerListInfo.VerList)
} else if info2.Status != proto.VersionNormal && info2.Status != vms.Status {
log.LogWarnf("checkVerList. volname [%v] mp[%v] ver [%v] status abnormal %v", mp.config.VolName, mp.config.PartitionId, info2.Ver, info2.Status)
info2.Status = vms.Status
needUpdate = true
}
if _, ok := verMapLocal[info2.Ver]; !ok {
verMapLocal[info2.Ver] = info2
VerList = append(VerList, info2)
}
}
mp.multiVersionList.RWLock.RUnlock()
for _, vInfo := range reqVerListInfo.VerList {
if vInfo.Status != proto.VersionNormal && vInfo.Status != proto.VersionPrepare {
log.LogDebugf("checkVerList. volname [%v] mp[%v] master info %v", mp.config.VolName, mp.config.PartitionId, vInfo)
continue
}
ver, exist := verMapLocal[vInfo.Ver]
if !exist {
expStr := fmt.Sprintf("checkVerList.volname [%v] mp[%v] not found %v in mp list and append version %v",
mp.config.VolName, mp.config.PartitionId, vInfo.Ver, vInfo)
log.LogWarnf("[checkVerList] volname [%v]", expStr)
if vInfo.Ver < mp.multiVersionList.GetLastVer() {
continue
}
exporter.Warning(expStr)
VerList = append(VerList, vInfo)
needUpdate = true
verMapLocal[vInfo.Ver] = vInfo
continue
}
if ver.Status != vInfo.Status {
warn := fmt.Sprintf("checkVerList.volname [%v] mp[%v] ver [%v] inoraml.local status [%v] update to %v",
mp.config.VolName, mp.config.PartitionId, vInfo.Status, vInfo.Ver, vInfo.Status)
log.LogWarn(warn)
ver.Status = vInfo.Status
}
}
if needUpdate {
var lastSeq uint64
sort.SliceStable(VerList, func(i, j int) bool {
return VerList[i].Ver < VerList[j].Ver
})
if len(VerList) > 0 {
lastSeq = VerList[len(VerList)-1].Ver
}
if err = mp.HandleVersionOp(proto.SyncBatchVersionList, lastSeq, VerList, sync); err != nil {
return
}
}
return
}
func (mp *metaPartition) HandleVersionOp(op uint8, verSeq uint64, verList []*proto.VolVersionInfo, sync bool) (err error) {
verData := &VerOpData{
Op: op,
VerSeq: verSeq,
VerList: verList,
}
data, _ := json.Marshal(verData)
if sync {
_, err = mp.submit(opFSMVersionOp, data)
return
}
select {
case mp.verUpdateChan <- data:
log.LogDebugf("mp[%v] verseq [%v] op [%v] be pushed to queue", mp.config.PartitionId, verSeq, op)
default:
err = fmt.Errorf("mp[%v] version update channel full, verdata %v not be executed", mp.config.PartitionId, string(data))
}
return
}
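// exampleVerOpPayload is an illustrative sketch, not part of the original
// code: it shows the JSON payload HandleVersionOp builds and submits to raft
// for a synchronous batch version-list update.
func exampleVerOpPayload(verSeq uint64, verList []*proto.VolVersionInfo) []byte {
data, _ := json.Marshal(&VerOpData{
Op: proto.SyncBatchVersionList,
VerSeq: verSeq,
VerList: verList,
})
return data
}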
func (mp *metaPartition) GetAllVersionInfo(req *proto.MultiVersionOpRequest, p *Packet) (err error) {
return
}
func (mp *metaPartition) GetSpecVersionInfo(req *proto.MultiVersionOpRequest, p *Packet) (err error) {
return
}
func (mp *metaPartition) GetExtentByVer(ino *Inode, req *proto.GetExtentsRequest, rsp *proto.GetExtentsResponse) {
log.LogInfof("action[GetExtentByVer] read ino[%v] readseq [%v] ino seq [%v] hist len %v", ino.Inode, req.VerSeq, ino.getVer(), ino.getLayerLen())
reqVer := req.VerSeq
if isInitSnapVer(req.VerSeq) {
reqVer = 0
}
ino.DoReadFunc(func() {
ino.Extents.Range(func(_ int, ek proto.ExtentKey) bool {
if ek.GetSeq() <= reqVer {
rsp.Extents = append(rsp.Extents, ek)
log.LogInfof("action[GetExtentByVer] fresh layer.read ino[%v] readseq [%v] ino seq [%v] include ek [%v]", ino.Inode, reqVer, ino.getVer(), ek)
} else {
log.LogInfof("action[GetExtentByVer] fresh layer.read ino[%v] readseq [%v] ino seq [%v] exclude ek [%v]", ino.Inode, reqVer, ino.getVer(), ek)
}
return true
})
ino.RangeMultiVer(func(idx int, snapIno *Inode) bool {
log.LogInfof("action[GetExtentByVer] read ino[%v] readseq [%v] snapIno ino seq [%v]", ino.Inode, reqVer, snapIno.getVer())
for _, ek := range snapIno.Extents.eks {
if reqVer >= ek.GetSeq() {
log.LogInfof("action[GetExtentByVer] get extent ino[%v] readseq [%v] snapIno ino seq [%v], include ek (%v)", ino.Inode, reqVer, snapIno.getVer(), ek.String())
rsp.Extents = append(rsp.Extents, ek)
} else {
log.LogInfof("action[GetExtentByVer] not get extent ino[%v] readseq [%v] snapIno ino seq [%v], exclude ek (%v)", ino.Inode, reqVer, snapIno.getVer(), ek.String())
}
}
if reqVer >= snapIno.getVer() {
log.LogInfof("action[GetExtentByVer] finish read ino[%v] readseq [%v] snapIno ino seq [%v]", ino.Inode, reqVer, snapIno.getVer())
return false
}
return true
})
sort.SliceStable(rsp.Extents, func(i, j int) bool {
return rsp.Extents[i].FileOffset < rsp.Extents[j].FileOffset
})
})
return
}
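// extentVisibleAtVer is an illustrative sketch, not part of the original
// code: it restates the visibility rule GetExtentByVer applies, an extent
// key recorded at sequence ek.GetSeq() is returned for a snapshot read at
// reqVer only when ek.GetSeq() <= reqVer.
func extentVisibleAtVer(ek proto.ExtentKey, reqVer uint64) bool {
return ek.GetSeq() <= reqVer
}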
func (mp *metaPartition) SetUidLimit(info []*proto.UidSpaceInfo) {
mp.uidManager.volName = mp.config.VolName
mp.uidManager.setUidAcl(info)
}
func (mp *metaPartition) GetUidInfo() (info []*proto.UidReportSpaceInfo) {
return mp.uidManager.getAllUidSpace()
}
// ExtentsList returns the list of extents.
func (mp *metaPartition) ExtentsList(req *proto.GetExtentsRequest, p *Packet) (err error) {
log.LogDebugf("action[ExtentsList] inode[%v] verseq [%v]", req.Inode, req.VerSeq)
// Note: there is no need to set reqSeq here; extent filtering is done in the next step.
ino := NewInode(req.Inode, 0)
retMsg := mp.getInodeTopLayer(ino)
// Note: getInode should not set verSeq because extents must be filtered from the newest layer down to req.VerSeq.
ino = retMsg.Msg
var (
reply []byte
status = retMsg.Status
)
if status == proto.OpOk {
resp := &proto.GetExtentsResponse{}
log.LogInfof("action[ExtentsList] inode[%v] request verseq [%v] ino ver [%v] extent size %v ino.Size %v ino[%v] hist len %v",
req.Inode, req.VerSeq, ino.getVer(), len(ino.Extents.eks), ino.Size, ino, ino.getLayerLen())
if req.VerSeq > 0 && ino.getVer() > 0 && (req.VerSeq < ino.getVer() || isInitSnapVer(req.VerSeq)) {
mp.GetExtentByVer(ino, req, resp)
vIno := ino.Copy().(*Inode)
vIno.setVerNoCheck(req.VerSeq)
if vIno = mp.getInodeByVer(vIno); vIno != nil {
resp.Generation = vIno.Generation
resp.Size = vIno.Size
}
} else {
ino.DoReadFunc(func() {
resp.Generation = ino.Generation
resp.Size = ino.Size
ino.Extents.Range(func(_ int, ek proto.ExtentKey) bool {
resp.Extents = append(resp.Extents, ek)
log.LogInfof("action[ExtentsList] append ek [%v]", ek)
return true
})
})
}
if req.VerAll {
resp.LayerInfo = retMsg.Msg.getAllLayerEks()
}
reply, err = json.Marshal(resp)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
p.PacketErrorWithBody(status, reply)
return
}
// ObjExtentsList returns the list of obj extents and extents.
func (mp *metaPartition) ObjExtentsList(req *proto.GetExtentsRequest, p *Packet) (err error) {
ino := NewInode(req.Inode, 0)
ino.setVer(req.VerSeq)
retMsg := mp.getInode(ino, false)
ino = retMsg.Msg
var (
reply []byte
status = retMsg.Status
)
if status == proto.OpOk {
resp := &proto.GetObjExtentsResponse{}
ino.DoReadFunc(func() {
resp.Generation = ino.Generation
resp.Size = ino.Size
ino.Extents.Range(func(_ int, ek proto.ExtentKey) bool {
resp.Extents = append(resp.Extents, ek)
return true
})
ino.ObjExtents.Range(func(ek proto.ObjExtentKey) bool {
resp.ObjExtents = append(resp.ObjExtents, ek)
return true
})
})
reply, err = json.Marshal(resp)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
p.PacketErrorWithBody(status, reply)
return
}
// ExtentsTruncate truncates an extent.
func (mp *metaPartition) ExtentsTruncate(req *ExtentsTruncateReq, p *Packet, remoteAddr string) (err error) {
if !proto.IsHot(mp.volType) {
err = fmt.Errorf("only support hot vol")
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
fileSize := uint64(0)
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Inode, fileSize)
}()
}
ino := NewInode(req.Inode, proto.Mode(os.ModePerm))
item := mp.inodeTree.CopyGet(ino)
if item == nil {
err = fmt.Errorf("inode[%v] is not exist", req.Inode)
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
i := item.(*Inode)
status := mp.isOverQuota(req.Inode, req.Size > i.Size, false)
if status != 0 {
log.LogErrorf("ExtentsTruncate fail status [%v]", status)
err = errors.New("ExtentsTruncate is over quota")
reply := []byte(err.Error())
p.PacketErrorWithBody(status, reply)
return
}
ino.Size = req.Size
fileSize = ino.Size
ino.setVer(mp.verSeq)
val, err := ino.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
resp, err := mp.submit(opFSMExtentTruncate, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
msg := resp.(*InodeResponse)
p.PacketErrorWithBody(msg.Status, nil)
return
}
func (mp *metaPartition) BatchExtentAppend(req *proto.AppendExtentKeysRequest, p *Packet) (err error) {
if !proto.IsHot(mp.volType) {
err = fmt.Errorf("only support hot vol")
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
var ino *Inode
if ino, _, err = mp.CheckQuota(req.Inode, p); err != nil {
log.LogErrorf("BatchExtentAppend fail err [%v]", err)
return
}
extents := req.Extents
for _, extent := range extents {
ino.Extents.Append(extent)
}
val, err := ino.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
resp, err := mp.submit(opFSMExtentsAdd, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
p.PacketErrorWithBody(resp.(uint8), nil)
return
}
func (mp *metaPartition) BatchObjExtentAppend(req *proto.AppendObjExtentKeysRequest, p *Packet) (err error) {
var ino *Inode
if ino, _, err = mp.CheckQuota(req.Inode, p); err != nil {
log.LogErrorf("BatchObjExtentAppend fail status [%v]", err)
return
}
objExtents := req.Extents
for _, objExtent := range objExtents {
err = ino.ObjExtents.Append(objExtent)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
}
val, err := ino.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
resp, err := mp.submit(opFSMObjExtentsAdd, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
p.PacketErrorWithBody(resp.(uint8), nil)
return
}
// func (mp *metaPartition) ExtentsDelete(req *proto.DelExtentKeyRequest, p *Packet) (err error) {
// ino := NewInode(req.Inode, 0)
// inode := mp.inodeTree.Get(ino).(*Inode)
// inode.Extents.Delete(req.Extents)
// curTime := timeutil.GetCurrentTimeUnix()
// if inode.ModifyTime < curTime {
// inode.ModifyTime = curTime
// }
// val, err := inode.Marshal()
// if err != nil {
// p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
// return
// }
// resp, err := mp.submit(opFSMExtentsDel, val)
// if err != nil {
// p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
// return
// }
// p.PacketErrorWithBody(resp.(uint8), nil)
// return
// }
// ExtentsOp submits an extent operation on the given inode to raft.
// The extents-empty op is only used in the data-lake scenario.
func (mp *metaPartition) ExtentsOp(p *Packet, ino *Inode, op uint32) (err error) {
val, err := ino.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
resp, err := mp.submit(op, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
p.PacketErrorWithBody(resp.(uint8), nil)
return
}
func (mp *metaPartition) sendExtentsToChan(eks []proto.ExtentKey) (err error) {
if len(eks) == 0 {
return
}
sortExts := NewSortedExtentsFromEks(eks)
val, err := sortExts.MarshalBinary(true)
if err != nil {
return fmt.Errorf("[delExtents] marshal binary fail, %s", err.Error())
}
_, err = mp.submit(opFSMSentToChan, val)
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/binary"
"encoding/json"
"fmt"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/auditlog"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
func replyInfoNoCheck(info *proto.InodeInfo, ino *Inode) bool {
ino.RLock()
defer ino.RUnlock()
info.Inode = ino.Inode
info.Mode = ino.Type
info.Size = ino.Size
info.Nlink = ino.NLink
info.Uid = ino.Uid
info.Gid = ino.Gid
info.Generation = ino.Generation
info.VerSeq = ino.getVer()
if length := len(ino.LinkTarget); length > 0 {
info.Target = make([]byte, length)
copy(info.Target, ino.LinkTarget)
}
info.CreateTime = time.Unix(ino.CreateTime, 0)
info.AccessTime = time.Unix(ino.AccessTime, 0)
info.ModifyTime = time.Unix(ino.ModifyTime, 0)
return true
}
func replyInfo(info *proto.InodeInfo, ino *Inode, quotaInfos map[uint32]*proto.MetaQuotaInfo) bool {
ino.RLock()
defer ino.RUnlock()
if ino.Flag&DeleteMarkFlag > 0 {
return false
}
info.Inode = ino.Inode
info.Mode = ino.Type
info.Size = ino.Size
info.Nlink = ino.NLink
info.Uid = ino.Uid
info.Gid = ino.Gid
info.Generation = ino.Generation
info.VerSeq = ino.getVer()
if length := len(ino.LinkTarget); length > 0 {
info.Target = make([]byte, length)
copy(info.Target, ino.LinkTarget)
}
info.CreateTime = time.Unix(ino.CreateTime, 0)
info.AccessTime = time.Unix(ino.AccessTime, 0)
info.ModifyTime = time.Unix(ino.ModifyTime, 0)
info.QuotaInfos = quotaInfos
return true
}
func txReplyInfo(inode *Inode, txInfo *proto.TransactionInfo, quotaInfos map[uint32]*proto.MetaQuotaInfo) (resp *proto.TxCreateInodeResponse) {
inoInfo := &proto.InodeInfo{
Inode: inode.Inode,
Mode: inode.Type,
Nlink: inode.NLink,
Size: inode.Size,
Uid: inode.Uid,
Gid: inode.Gid,
Generation: inode.Generation,
ModifyTime: time.Unix(inode.ModifyTime, 0),
CreateTime: time.Unix(inode.CreateTime, 0),
AccessTime: time.Unix(inode.AccessTime, 0),
QuotaInfos: quotaInfos,
Target: nil,
}
if length := len(inode.LinkTarget); length > 0 {
inoInfo.Target = make([]byte, length)
copy(inoInfo.Target, inode.LinkTarget)
}
resp = &proto.TxCreateInodeResponse{
Info: inoInfo,
TxInfo: txInfo,
}
return
}
// CreateInode returns a new inode.
func (mp *metaPartition) CreateInode(req *CreateInoReq, p *Packet, remoteAddr string) (err error) {
var (
status = proto.OpNotExistErr
reply []byte
resp interface{}
qinode *MetaQuotaInode
inoID uint64
)
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.GetFullPath(), err, time.Since(start).Milliseconds(), inoID, 0)
}()
}
inoID, err = mp.nextInodeID()
if err != nil {
p.PacketErrorWithBody(proto.OpInodeFullErr, []byte(err.Error()))
return
}
ino := NewInode(inoID, req.Mode)
ino.Uid = req.Uid
ino.Gid = req.Gid
ino.setVer(mp.verSeq)
ino.LinkTarget = req.Target
val, err := ino.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return err
}
resp, err = mp.submit(opFSMCreateInode, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return err
}
if resp.(uint8) == proto.OpOk {
resp := &CreateInoResp{
Info: &proto.InodeInfo{},
}
if replyInfo(resp.Info, ino, make(map[uint32]*proto.MetaQuotaInfo, 0)) {
status = proto.OpOk
reply, err = json.Marshal(resp)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
}
p.PacketErrorWithBody(status, reply)
log.LogInfof("CreateInode req [%v] qinode[%v] success.", req, qinode)
return
}
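// exampleCreateInoReq is an illustrative sketch, not part of the original
// code: it fills in only the request fields CreateInode actually reads
// (mode, uid, gid and the optional link target); the field types are assumed
// from how they are used above, and fields such as volume name, partition id
// and full path are omitted.
func exampleCreateInoReq(mode, uid, gid uint32, target []byte) *CreateInoReq {
return &CreateInoReq{
Mode: mode,
Uid: uid,
Gid: gid,
Target: target,
}
}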
func (mp *metaPartition) QuotaCreateInode(req *proto.QuotaCreateInodeRequest, p *Packet, remoteAddr string) (err error) {
var (
status = proto.OpNotExistErr
reply []byte
resp interface{}
qinode *MetaQuotaInode
inoID uint64
)
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.GetFullPath(), err, time.Since(start).Milliseconds(), inoID, 0)
}()
}
inoID, err = mp.nextInodeID()
if err != nil {
p.PacketErrorWithBody(proto.OpInodeFullErr, []byte(err.Error()))
return
}
ino := NewInode(inoID, req.Mode)
ino.Uid = req.Uid
ino.Gid = req.Gid
ino.LinkTarget = req.Target
for _, quotaId := range req.QuotaIds {
status = mp.mqMgr.IsOverQuota(false, true, quotaId)
if status != 0 {
err = errors.New("create inode is over quota")
reply = []byte(err.Error())
p.PacketErrorWithBody(status, reply)
return
}
}
qinode = &MetaQuotaInode{
inode: ino,
quotaIds: req.QuotaIds,
}
val, err := qinode.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return err
}
resp, err = mp.submit(opFSMCreateInodeQuota, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return err
}
if resp.(uint8) == proto.OpOk {
resp := &CreateInoResp{
Info: &proto.InodeInfo{},
}
quotaInfos := make(map[uint32]*proto.MetaQuotaInfo)
for _, quotaId := range req.QuotaIds {
quotaInfos[quotaId] = &proto.MetaQuotaInfo{
RootInode: false,
}
}
if replyInfo(resp.Info, ino, quotaInfos) {
status = proto.OpOk
reply, err = json.Marshal(resp)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
}
p.PacketErrorWithBody(status, reply)
log.LogInfof("QuotaCreateInode req [%v] qinode[%v] success.", req, qinode)
return
}
func (mp *metaPartition) TxUnlinkInode(req *proto.TxUnlinkInodeRequest, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Inode, 0)
}()
}
txInfo := req.TxInfo.GetCopy()
var status uint8
var respIno *Inode
defer func() {
var reply []byte
if status == proto.OpOk {
resp := &proto.TxUnlinkInodeResponse{
Info: &proto.InodeInfo{},
}
if respIno != nil {
replyInfo(resp.Info, respIno, make(map[uint32]*proto.MetaQuotaInfo, 0))
if reply, err = json.Marshal(resp); err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
p.PacketErrorWithBody(status, reply)
}
}()
ino := NewInode(req.Inode, 0)
inoResp := mp.getInode(ino, true)
if inoResp.Status != proto.OpOk {
if rbIno := mp.txInodeInRb(req.Inode, req.TxInfo.TxID); rbIno != nil {
respIno = rbIno.inode
status = proto.OpOk
item := mp.inodeTree.Get(NewInode(req.Inode, 0))
if item != nil {
respIno = item.(*Inode)
}
p.ResultCode = status
log.LogWarnf("TxUnlinkInode: inode is already unlink before, req %v, rbino[%v], item %v", req, respIno, item)
return nil
}
err = fmt.Errorf("ino[%v] not exists", ino.Inode)
p.PacketErrorWithBody(inoResp.Status, []byte(err.Error()))
return
}
respIno = inoResp.Msg
createTime := respIno.CreateTime
deleteLockTime := mp.vol.volDeleteLockTime * 60 * 60
if deleteLockTime > 0 && createTime+deleteLockTime > time.Now().Unix() {
err = fmt.Errorf("the current Inode[%v] is still locked for deletion", req.Inode)
log.LogDebugf("TxUnlinkInode: the current Inode is still locked for deletion, inode[%v] createTime(%v) mw.volDeleteLockTime(%v) now(%v)", respIno.Inode, createTime, deleteLockTime, time.Now())
p.PacketErrorWithBody(proto.OpNotPerm, []byte(err.Error()))
return
}
ti := &TxInode{
Inode: inoResp.Msg,
TxInfo: txInfo,
}
val, err := ti.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
r, err := mp.submit(opFSMTxUnlinkInode, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
msg := r.(*InodeResponse)
status = msg.Status
if msg.Msg != nil {
respIno = msg.Msg
}
p.ResultCode = status
return
}
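// inodeDeleteLocked is an illustrative sketch, not part of the original
// code: it restates the delete-lock rule used by TxUnlinkInode, an inode may
// not be unlinked until volDeleteLockTime hours have passed since it was
// created.
func inodeDeleteLocked(createTime, deleteLockHours, now int64) bool {
lockSeconds := deleteLockHours * 60 * 60
return lockSeconds > 0 && createTime+lockSeconds > now
}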
// UnlinkInode unlinks an inode (decreases its link count).
func (mp *metaPartition) UnlinkInode(req *UnlinkInoReq, p *Packet, remoteAddr string) (err error) {
var (
msg *InodeResponse
reply []byte
r interface{}
val []byte
)
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Inode, 0)
}()
}
makeRspFunc := func() {
status := msg.Status
if status == proto.OpOk {
resp := &UnlinkInoResp{
Info: &proto.InodeInfo{},
}
replyInfo(resp.Info, msg.Msg, make(map[uint32]*proto.MetaQuotaInfo, 0))
if reply, err = json.Marshal(resp); err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
p.PacketErrorWithBody(status, reply)
}
ino := NewInode(req.Inode, 0)
if item := mp.inodeTree.Get(ino); item == nil {
err = fmt.Errorf("mp[%v] inode[%v] reqeust cann't found", mp.config.PartitionId, ino)
log.LogErrorf("action[UnlinkInode] %v", err)
p.PacketErrorWithBody(proto.OpNotExistErr, []byte(err.Error()))
return
}
if req.UniqID > 0 {
val = InodeOnceUnlinkMarshal(req)
r, err = mp.submit(opFSMUnlinkInodeOnce, val)
} else {
ino.setVer(req.VerSeq)
log.LogDebugf("action[UnlinkInode] mp[%v] verseq [%v] ino[%v]", mp.config.PartitionId, req.VerSeq, ino)
val, err = ino.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
log.LogDebugf("action[UnlinkInode] mp[%v] ino[%v] submit", mp.config.PartitionId, ino)
r, err = mp.submit(opFSMUnlinkInode, val)
}
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
msg = r.(*InodeResponse)
makeRspFunc()
return
}
// UnlinkInodeBatch unlinks a batch of inodes.
func (mp *metaPartition) UnlinkInodeBatch(req *BatchUnlinkInoReq, p *Packet, remoteAddr string) (err error) {
if len(req.Inodes) == 0 {
return nil
}
var inodes InodeBatch
start := time.Now()
for i, id := range req.Inodes {
inodes = append(inodes, NewInode(id, 0))
ino := id
fullPath := ""
if len(req.FullPaths) > i {
fullPath = req.FullPaths[i]
}
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), fullPath, err, time.Since(start).Milliseconds(), ino, 0)
}()
}
}
val, err := inodes.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
r, err := mp.submit(opFSMUnlinkInodeBatch, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
result := &BatchUnlinkInoResp{}
status := proto.OpOk
for _, ir := range r.([]*InodeResponse) {
if ir.Status != proto.OpOk {
status = ir.Status
}
info := &proto.InodeInfo{}
replyInfo(info, ir.Msg, make(map[uint32]*proto.MetaQuotaInfo, 0))
result.Items = append(result.Items, &struct {
Info *proto.InodeInfo `json:"info"`
Status uint8 `json:"status"`
}{
Info: info,
Status: ir.Status,
})
}
reply, err := json.Marshal(result)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
p.PacketErrorWithBody(status, reply)
return
}
// InodeGetSplitEk returns the split extent key info of an inode.
func (mp *metaPartition) InodeGetSplitEk(req *InodeGetSplitReq, p *Packet) (err error) {
ino := NewInode(req.Inode, 0)
ino.setVer(req.VerSeq)
getAllVerInfo := req.VerAll
retMsg := mp.getInode(ino, getAllVerInfo)
log.LogDebugf("action[InodeGetSplitEk] %v seq [%v] retMsg.status [%v], getAllVerInfo %v",
ino.Inode, req.VerSeq, retMsg.Status, getAllVerInfo)
ino = retMsg.Msg
var (
reply []byte
status = proto.OpNotExistErr
)
if retMsg.Status == proto.OpOk {
resp := &proto.InodeGetSplitResponse{
Info: &proto.InodeSplitInfo{
Inode: ino.Inode,
VerSeq: ino.getVer(),
},
}
multiSnap := retMsg.Msg.multiSnap
if multiSnap != nil && multiSnap.ekRefMap != nil {
multiSnap.ekRefMap.Range(func(key, value interface{}) bool {
dpID, extID := proto.ParseFromId(key.(uint64))
resp.Info.SplitArr = append(resp.Info.SplitArr, proto.SimpleExtInfo{
ID: key.(uint64),
PartitionID: uint32(dpID),
ExtentID: uint32(extID),
})
return true
})
}
log.LogDebugf("action[InodeGetSplitEk] %v seq [%v] retMsg.status [%v], getAllVerInfo %v",
ino.Inode, req.VerSeq, retMsg.Status, getAllVerInfo)
status = proto.OpOk
reply, err = json.Marshal(resp)
if err != nil {
log.LogDebugf("action[InodeGetSplitEk] %v seq [%v] retMsg.status [%v], getAllVerInfo %v",
ino.Inode, req.VerSeq, retMsg.Status, getAllVerInfo)
status = proto.OpErr
reply = []byte(err.Error())
}
log.LogDebugf("action[InodeGetSplitEk] %v seq [%v] retMsg.status [%v], getAllVerInfo %v",
ino.Inode, req.VerSeq, retMsg.Status, getAllVerInfo)
}
log.LogDebugf("action[InodeGetSplitEk] %v seq [%v] retMsg.status [%v], getAllVerInfo %v",
ino.Inode, req.VerSeq, retMsg.Status, getAllVerInfo)
p.PacketErrorWithBody(status, reply)
return
}
// InodeGet executes the inodeGet command from the client.
func (mp *metaPartition) InodeGet(req *InodeGetReq, p *Packet) (err error) {
ino := NewInode(req.Inode, 0)
ino.setVer(req.VerSeq)
getAllVerInfo := req.VerAll
retMsg := mp.getInode(ino, getAllVerInfo)
log.LogDebugf("action[Inode] %v seq [%v] retMsg.status [%v], getAllVerInfo %v",
ino.Inode, req.VerSeq, retMsg.Status, getAllVerInfo)
ino = retMsg.Msg
var (
reply []byte
status = proto.OpNotExistErr
quotaInfos map[uint32]*proto.MetaQuotaInfo
)
if mp.mqMgr.EnableQuota() {
quotaInfos, err = mp.getInodeQuotaInfos(req.Inode)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
p.PacketErrorWithBody(status, reply)
return
}
}
ino = retMsg.Msg
if retMsg.Status == proto.OpOk {
resp := &proto.InodeGetResponse{
Info: &proto.InodeInfo{},
}
if getAllVerInfo {
replyInfoNoCheck(resp.Info, retMsg.Msg)
} else {
if !replyInfo(resp.Info, retMsg.Msg, quotaInfos) {
p.PacketErrorWithBody(status, reply)
return
}
}
status = proto.OpOk
if getAllVerInfo {
inode := mp.getInodeTopLayer(ino)
log.LogDebugf("req ino[%v], toplayer ino[%v]", retMsg.Msg, inode)
resp.LayAll = inode.Msg.getAllInodesInfo()
}
reply, err = json.Marshal(resp)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
p.PacketErrorWithBody(status, reply)
return
}
// InodeGetBatch executes the inodeBatchGet command from the client.
func (mp *metaPartition) InodeGetBatch(req *InodeGetReqBatch, p *Packet) (err error) {
resp := &proto.BatchInodeGetResponse{}
ino := NewInode(0, 0)
for _, inoId := range req.Inodes {
var quotaInfos map[uint32]*proto.MetaQuotaInfo
ino.Inode = inoId
ino.setVer(req.VerSeq)
retMsg := mp.getInode(ino, false)
if mp.mqMgr.EnableQuota() {
quotaInfos, err = mp.getInodeQuotaInfos(inoId)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
}
if retMsg.Status == proto.OpOk {
inoInfo := &proto.InodeInfo{}
if replyInfo(inoInfo, retMsg.Msg, quotaInfos) {
resp.Infos = append(resp.Infos, inoInfo)
}
}
}
data, err := json.Marshal(resp)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(data)
return
}
func (mp *metaPartition) TxCreateInodeLink(req *proto.TxLinkInodeRequest, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Inode, 0)
}()
}
txInfo := req.TxInfo.GetCopy()
ino := NewInode(req.Inode, 0)
inoResp := mp.getInode(ino, true)
if inoResp.Status != proto.OpOk {
err = fmt.Errorf("ino[%v] not exists", ino.Inode)
p.PacketErrorWithBody(inoResp.Status, []byte(err.Error()))
return
}
ti := &TxInode{
Inode: inoResp.Msg,
TxInfo: txInfo,
}
val, err := ti.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
resp, err := mp.submit(opFSMTxCreateLinkInode, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
retMsg := resp.(*InodeResponse)
status := retMsg.Status
var reply []byte
if retMsg.Status == proto.OpOk {
resp := &proto.TxLinkInodeResponse{
Info: &proto.InodeInfo{},
}
if replyInfo(resp.Info, retMsg.Msg, make(map[uint32]*proto.MetaQuotaInfo, 0)) {
status = proto.OpOk
reply, err = json.Marshal(resp)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
}
p.PacketErrorWithBody(status, reply)
return
}
// CreateInodeLink creates a hard link to an inode by increasing its link count.
func (mp *metaPartition) CreateInodeLink(req *LinkInodeReq, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Inode, 0)
}()
}
var r interface{}
var val []byte
if req.UniqID > 0 {
val = InodeOnceLinkMarshal(req)
r, err = mp.submit(opFSMCreateLinkInodeOnce, val)
} else {
ino := NewInode(req.Inode, 0)
ino.setVer(mp.verSeq)
val, err = ino.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
r, err = mp.submit(opFSMCreateLinkInode, val)
}
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
retMsg := r.(*InodeResponse)
status := proto.OpNotExistErr
var reply []byte
if retMsg.Status == proto.OpOk {
resp := &LinkInodeResp{
Info: &proto.InodeInfo{},
}
if replyInfo(resp.Info, retMsg.Msg, make(map[uint32]*proto.MetaQuotaInfo, 0)) {
status = proto.OpOk
reply, err = json.Marshal(resp)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
}
p.PacketErrorWithBody(status, reply)
return
}
// EvictInode evicts an inode.
func (mp *metaPartition) EvictInode(req *EvictInodeReq, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Inode, 0)
}()
}
ino := NewInode(req.Inode, 0)
val, err := ino.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
resp, err := mp.submit(opFSMEvictInode, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
msg := resp.(*InodeResponse)
p.PacketErrorWithBody(msg.Status, nil)
return
}
// EvictInodeBatch evicts a batch of inodes.
func (mp *metaPartition) EvictInodeBatch(req *BatchEvictInodeReq, p *Packet, remoteAddr string) (err error) {
if len(req.Inodes) == 0 {
return nil
}
start := time.Now()
var inodes InodeBatch
for i, id := range req.Inodes {
inodes = append(inodes, NewInode(id, 0))
ino := id
fullPath := ""
if len(req.FullPaths) > i {
fullPath = req.FullPaths[i]
}
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), fullPath, err, time.Since(start).Milliseconds(), ino, 0)
}()
}
}
val, err := inodes.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
resp, err := mp.submit(opFSMEvictInodeBatch, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
status := proto.OpOk
for _, m := range resp.([]*InodeResponse) {
if m.Status != proto.OpOk {
status = m.Status
}
}
p.PacketErrorWithBody(status, nil)
return
}
// SetAttr sets the inode attributes.
func (mp *metaPartition) SetAttr(req *SetattrRequest, reqData []byte, p *Packet) (err error) {
if mp.verSeq != 0 {
req.VerSeq = mp.GetVerSeq()
reqData, err = json.Marshal(req)
if err != nil {
log.LogErrorf("setattr: marshal err(%v)", err)
return
}
}
_, err = mp.submit(opFSMSetAttr, reqData)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
log.LogDebugf("action[SetAttr] inode[%v] ver [%v] exit", req.Inode, req.VerSeq)
p.PacketOkReply()
return
}
// GetInodeTree returns the inode tree.
func (mp *metaPartition) GetInodeTree() *BTree {
return mp.inodeTree.GetTree()
}
// GetInodeTreeLen returns the inode tree length.
func (mp *metaPartition) GetInodeTreeLen() int {
if mp.inodeTree == nil {
return 0
}
return mp.inodeTree.Len()
}
func (mp *metaPartition) DeleteInode(req *proto.DeleteInodeRequest, p *Packet, remoteAddr string) (err error) {
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.GetFullPath(), err, time.Since(start).Milliseconds(), req.Inode, 0)
}()
}
bytes := make([]byte, 8)
binary.BigEndian.PutUint64(bytes, req.Inode)
_, err = mp.submit(opFSMInternalDeleteInode, bytes)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
p.PacketOkReply()
return
}
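// encodeInodeID is an illustrative sketch, not part of the original code: it
// shows the 8-byte big-endian encoding DeleteInode submits to raft for a
// single inode id.
func encodeInodeID(ino uint64) []byte {
buf := make([]byte, 8)
binary.BigEndian.PutUint64(buf, ino)
return buf
}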
func (mp *metaPartition) DeleteInodeBatch(req *proto.DeleteInodeBatchRequest, p *Packet, remoteAddr string) (err error) {
if len(req.Inodes) == 0 {
return nil
}
start := time.Now()
var inodes InodeBatch
for i, id := range req.Inodes {
inodes = append(inodes, NewInode(id, 0))
ino := id
fullPath := ""
if len(req.FullPaths) > i {
fullPath = req.FullPaths[i]
}
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), fullPath, err, time.Since(start).Milliseconds(), ino, 0)
}()
}
}
encoded, err := inodes.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
_, err = mp.submit(opFSMInternalDeleteInodeBatch, encoded)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
p.PacketOkReply()
return
}
// ClearInodeCache clears an inode's cbfs extents but keeps its ebs extents.
func (mp *metaPartition) ClearInodeCache(req *proto.ClearInodeCacheRequest, p *Packet) (err error) {
if len(mp.extDelCh) > defaultDelExtentsCnt-100 {
err = fmt.Errorf("extent del chan full")
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
ino := NewInode(req.Inode, 0)
val, err := ino.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
resp, err := mp.submit(opFSMClearInodeCache, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
p.PacketErrorWithBody(resp.(uint8), nil)
return
}
// TxCreateInode returns a new inode.
func (mp *metaPartition) TxCreateInode(req *proto.TxCreateInodeRequest, p *Packet, remoteAddr string) (err error) {
var (
status = proto.OpNotExistErr
reply []byte
resp interface{}
inoID uint64
)
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogInodeOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.GetFullPath(), err, time.Since(start).Milliseconds(), inoID, 0)
}()
}
inoID, err = mp.nextInodeID()
if err != nil {
p.PacketErrorWithBody(proto.OpInodeFullErr, []byte(err.Error()))
return
}
req.TxInfo.SetCreateInodeId(inoID)
createTxReq := &proto.TxCreateRequest{
VolName: req.VolName,
PartitionID: req.PartitionID,
TransactionInfo: req.TxInfo,
}
err = mp.TxCreate(createTxReq, p)
if err != nil || p.ResultCode != proto.OpOk {
return
}
createResp := &proto.TxCreateResponse{}
err = json.Unmarshal(p.Data, createResp)
if err != nil || createResp.TxInfo == nil {
err = fmt.Errorf("TxCreateInode: unmarshal txInfo failed, data %s, err %v", string(p.Data), err)
log.LogWarn(err)
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
txIno := NewTxInode(inoID, req.Mode, createResp.TxInfo)
txIno.Inode.Uid = req.Uid
txIno.Inode.Gid = req.Gid
txIno.Inode.LinkTarget = req.Target
if log.EnableDebug() {
log.LogDebugf("NewTxInode: TxInode: %v", txIno)
}
if defaultQuotaSwitch {
for _, quotaId := range req.QuotaIds {
status = mp.mqMgr.IsOverQuota(false, true, quotaId)
if status != 0 {
err = errors.New("tx create inode is over quota")
reply = []byte(err.Error())
p.PacketErrorWithBody(status, reply)
return
}
}
qinode := &TxMetaQuotaInode{
txinode: txIno,
quotaIds: req.QuotaIds,
}
val, err := qinode.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return err
}
resp, err = mp.submit(opFSMTxCreateInodeQuota, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return err
}
} else {
val, err := txIno.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return err
}
resp, err = mp.submit(opFSMTxCreateInode, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return err
}
}
if resp == proto.OpOk {
quotaInfos := make(map[uint32]*proto.MetaQuotaInfo)
for _, quotaId := range req.QuotaIds {
quotaInfos[quotaId] = &proto.MetaQuotaInfo{
RootInode: false,
}
}
resp := txReplyInfo(txIno.Inode, createResp.TxInfo, quotaInfos)
status = proto.OpOk
reply, err = json.Marshal(resp)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
p.PacketErrorWithBody(status, reply)
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/json"
"strings"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
)
func (mp *metaPartition) GetExpiredMultipart(req *proto.GetExpiredMultipartRequest, p *Packet) (err error) {
expiredMultiPartInfos := make([]*proto.ExpiredMultipartInfo, 0)
walkTreeFunc := func(i BtreeItem) bool {
multipart := i.(*Multipart)
if len(req.Prefix) > 0 && !strings.HasPrefix(multipart.key, req.Prefix) {
// skip and continue
return true
}
if multipart.initTime.Unix()+int64(req.Days*24*60*60) <= time.Now().Local().Unix() {
info := &proto.ExpiredMultipartInfo{
Path: multipart.key,
MultipartId: multipart.id,
Inodes: make([]uint64, 0),
}
for _, part := range multipart.Parts() {
info.Inodes = append(info.Inodes, part.Inode)
}
expiredMultiPartInfos = append(expiredMultiPartInfos, info)
}
return true
}
mp.multipartTree.Ascend(walkTreeFunc)
resp := &proto.GetExpiredMultipartResponse{
Infos: expiredMultiPartInfos,
}
var reply []byte
if reply, err = json.Marshal(resp); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(reply)
return
}
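// multipartExpired is an illustrative sketch, not part of the original code:
// it restates the expiry rule used by GetExpiredMultipart, a multipart upload
// is expired once the requested number of whole days has passed since its
// init time.
func multipartExpired(initTime time.Time, days int64, now time.Time) bool {
return initTime.Unix()+days*24*60*60 <= now.Unix()
}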
func (mp *metaPartition) GetMultipart(req *proto.GetMultipartRequest, p *Packet) (err error) {
item := mp.multipartTree.Get(&Multipart{key: req.Path, id: req.MultipartId})
if item == nil {
p.PacketErrorWithBody(proto.OpNotExistErr, nil)
return
}
multipart := item.(*Multipart)
resp := &proto.GetMultipartResponse{
Info: &proto.MultipartInfo{
ID: multipart.id,
Path: multipart.key,
InitTime: multipart.initTime,
Parts: make([]*proto.MultipartPartInfo, 0, len(multipart.parts)),
Extend: multipart.extend,
},
}
for _, part := range multipart.Parts() {
resp.Info.Parts = append(resp.Info.Parts, &proto.MultipartPartInfo{
ID: part.ID,
Inode: part.Inode,
MD5: part.MD5,
Size: part.Size,
UploadTime: part.UploadTime,
})
}
var reply []byte
if reply, err = json.Marshal(resp); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(reply)
return
}
func (mp *metaPartition) AppendMultipart(req *proto.AddMultipartPartRequest, p *Packet) (err error) {
if req.Part == nil {
p.PacketOkReply()
return
}
item := mp.multipartTree.Get(&Multipart{key: req.Path, id: req.MultipartId})
if item == nil {
p.PacketErrorWithBody(proto.OpNotExistErr, nil)
return
}
multipart := &Multipart{
id: req.MultipartId,
key: req.Path,
parts: Parts{
&Part{
ID: req.Part.ID,
UploadTime: req.Part.UploadTime,
MD5: req.Part.MD5,
Size: req.Part.Size,
Inode: req.Part.Inode,
},
},
}
var resp interface{}
if resp, err = mp.putMultipart(opFSMAppendMultipart, multipart); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
appendMultipartResp := resp.(proto.AppendMultipartResponse)
if appendMultipartResp.Status != proto.OpOk {
p.PacketErrorWithBody(appendMultipartResp.Status, nil)
return
}
var reply []byte
if reply, err = json.Marshal(appendMultipartResp); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(reply)
return
}
func (mp *metaPartition) RemoveMultipart(req *proto.RemoveMultipartRequest, p *Packet) (err error) {
multipart := &Multipart{
id: req.MultipartId,
key: req.Path,
}
var resp interface{}
if resp, err = mp.putMultipart(opFSMRemoveMultipart, multipart); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
status := resp.(uint8)
if status != proto.OpOk {
p.PacketErrorWithBody(status, nil)
return
}
p.PacketOkReply()
return
}
func (mp *metaPartition) CreateMultipart(req *proto.CreateMultipartRequest, p *Packet) (err error) {
var multipartId string
for {
multipartId = util.CreateMultipartID(mp.config.PartitionId).String()
storedItem := mp.multipartTree.Get(&Multipart{key: req.Path, id: multipartId})
if storedItem == nil {
break
}
}
multipart := &Multipart{
id: multipartId,
key: req.Path,
initTime: time.Now().Local(),
extend: req.Extend,
}
if _, err = mp.putMultipart(opFSMCreateMultipart, multipart); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
resp := &proto.CreateMultipartResponse{
Info: &proto.MultipartInfo{
ID: multipartId,
Path: req.Path,
},
}
var reply []byte
if reply, err = json.Marshal(resp); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(reply)
return
}
func (mp *metaPartition) ListMultipart(req *proto.ListMultipartRequest, p *Packet) (err error) {
max := int(req.Max)
keyMarker := req.Marker
multipartIdMarker := req.MultipartIdMarker
prefix := req.Prefix
matches := make([]*Multipart, 0, max)
walkTreeFunc := func(i BtreeItem) bool {
multipart := i.(*Multipart)
// prefix is enabled
if len(prefix) > 0 && !strings.HasPrefix(multipart.key, prefix) {
// skip and continue
return true
}
matches = append(matches, multipart)
return !(len(matches) >= max)
}
if len(keyMarker) > 0 {
mp.multipartTree.AscendGreaterOrEqual(&Multipart{key: keyMarker, id: multipartIdMarker}, walkTreeFunc)
} else {
mp.multipartTree.Ascend(walkTreeFunc)
}
multipartInfos := make([]*proto.MultipartInfo, len(matches))
convertPartFunc := func(part *Part) *proto.MultipartPartInfo {
return &proto.MultipartPartInfo{
ID: part.ID,
Inode: part.Inode,
MD5: part.MD5,
Size: part.Size,
UploadTime: part.UploadTime,
}
}
convertMultipartFunc := func(multipart *Multipart) *proto.MultipartInfo {
partInfos := make([]*proto.MultipartPartInfo, len(multipart.parts))
for i := 0; i < len(multipart.parts); i++ {
partInfos[i] = convertPartFunc(multipart.parts[i])
}
return &proto.MultipartInfo{
ID: multipart.id,
Path: multipart.key,
InitTime: multipart.initTime,
Parts: partInfos,
}
}
for i := 0; i < len(matches); i++ {
multipartInfos[i] = convertMultipartFunc(matches[i])
}
resp := &proto.ListMultipartResponse{
Multiparts: multipartInfos,
}
var reply []byte
if reply, err = json.Marshal(resp); err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(reply)
return
}
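// ListMultipart walks the multipart tree from a (key, multipartId) marker and
// stops its callback once max entries have been collected, so callers page
// through large listings by feeding the last returned entry back as the next
// marker. A hedged sketch of that loop; listPage is a hypothetical helper that
// sends the request and decodes the ListMultipartResponse:
//
//    marker, idMarker := "", ""
//    for {
//        req := &proto.ListMultipartRequest{Max: 100, Marker: marker, MultipartIdMarker: idMarker}
//        resp := listPage(req) // hypothetical transport helper
//        if len(resp.Multiparts) < 100 {
//            break // short page: listing is complete
//        }
//        last := resp.Multiparts[len(resp.Multiparts)-1]
//        marker, idMarker = last.Path, last.ID
//    }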
// putMultipart replicates the specified multipart operation to raft.
func (mp *metaPartition) putMultipart(op uint32, multipart *Multipart) (resp interface{}, err error) {
var encoded []byte
if encoded, err = multipart.Bytes(); err != nil {
return
}
resp, err = mp.submit(op, encoded)
return
}
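// Every handler in this file follows the same shape: build a Multipart carrying
// only the fields the FSM needs, replicate it through putMultipart with the
// matching opFSM* code, then interpret the raft reply. A minimal sketch of that
// call pattern, mirroring RemoveMultipart with hypothetical values:
//
//    m := &Multipart{id: "hypothetical-id", key: "/bucket/object"}
//    resp, err := mp.putMultipart(opFSMRemoveMultipart, m)
//    if err == nil && resp.(uint8) != proto.OpOk {
//        // replicated through raft, but the FSM rejected the operation
//    }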
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/json"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
func (mp *metaPartition) batchSetInodeQuota(req *proto.BatchSetMetaserverQuotaReuqest,
resp *proto.BatchSetMetaserverQuotaResponse) (err error) {
if len(req.Inodes) == 0 {
return nil
}
val, err := json.Marshal(req)
if err != nil {
log.LogErrorf("batchSetInodeQuota marshal req [%v] failed [%v]", req, err)
return
}
r, err := mp.submit(opFSMSetInodeQuotaBatch, val)
if err != nil {
log.LogErrorf("batchSetInodeQuota submit req [%v] failed [%v]", req, err)
return
}
resp.InodeRes = r.(*proto.BatchSetMetaserverQuotaResponse).InodeRes
log.LogInfof("batchSetInodeQuota quotaId [%v] mp[%v] btreeLen [%v] resp [%v] success", req.QuotaId, mp.config.PartitionId,
mp.extendTree.Len(), resp)
return
}
func (mp *metaPartition) batchDeleteInodeQuota(req *proto.BatchDeleteMetaserverQuotaReuqest,
resp *proto.BatchDeleteMetaserverQuotaResponse) (err error) {
if len(req.Inodes) == 0 {
return nil
}
val, err := json.Marshal(req)
if err != nil {
log.LogErrorf("batchDeleteInodeQuota marshal req [%v] failed [%v]", req, err)
return
}
r, err := mp.submit(opFSMDeleteInodeQuotaBatch, val)
if err != nil {
log.LogErrorf("batchDeleteInodeQuota submit req [%v] failed [%v]", req, err)
return
}
resp.InodeRes = r.(*proto.BatchDeleteMetaserverQuotaResponse).InodeRes
log.LogInfof("batchSetInodeQuota quotaId [%v] mp[%v] btreeLen [%v] resp [%v] success", req.QuotaId, mp.config.PartitionId,
mp.extendTree.Len(), resp)
return
}
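// Both batch quota entry points share one flow: marshal the whole request, hand
// it to raft via submit, then copy the per-inode results out of the FSM's typed
// response. A hedged sketch of the set path with hypothetical IDs:
//
//    req := &proto.BatchSetMetaserverQuotaReuqest{QuotaId: 1, Inodes: []uint64{100, 101}}
//    resp := &proto.BatchSetMetaserverQuotaResponse{}
//    if err := mp.batchSetInodeQuota(req, resp); err == nil {
//        _ = resp.InodeRes // per-inode result codes filled in by the FSM
//    }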
func (mp *metaPartition) setQuotaHbInfo(infos []*proto.QuotaHeartBeatInfo) {
mp.mqMgr.setQuotaHbInfo(infos)
return
}
func (mp *metaPartition) getQuotaReportInfos() (infos []*proto.QuotaReportInfo) {
return mp.mqMgr.getQuotaReportInfos()
}
func (mp *metaPartition) statisticExtendByLoad(extend *Extend) {
mqMgr := mp.mqMgr
ino := NewInode(extend.GetInode(), 0)
retMsg := mp.getInode(ino, true)
if retMsg.Status != proto.OpOk {
log.LogErrorf("statisticExtendByLoad get inode[%v] fail [%v].", extend.GetInode(), retMsg.Status)
return
}
ino = retMsg.Msg
if ino.NLink == 0 {
return
}
quotaIds, isFind := mp.isExistQuota(extend.GetInode())
if isFind {
mqMgr.rwlock.Lock()
defer mqMgr.rwlock.Unlock()
for _, quotaId := range quotaIds {
var baseInfo proto.QuotaUsedInfo
value, isFind := mqMgr.statisticBase.Load(quotaId)
if isFind {
baseInfo = value.(proto.QuotaUsedInfo)
}
baseInfo.UsedBytes += int64(ino.Size)
baseInfo.UsedFiles += 1
mqMgr.statisticBase.Store(quotaId, baseInfo)
log.LogDebugf("[statisticExtendByLoad] quotaId [%v] baseInfo [%v]", quotaId, baseInfo)
}
}
log.LogInfof("statisticExtendByLoad ino[%v] isFind [%v].", ino.Inode, isFind)
return
}
func (mp *metaPartition) statisticExtendByStore(extend *Extend, inodeTree *BTree) {
mqMgr := mp.mqMgr
ino := NewInode(extend.GetInode(), 0)
retMsg := mp.getInode(ino, true)
if retMsg.Status != proto.OpOk {
log.LogErrorf("statisticExtendByStore get inode[%v] fail [%v].", extend.GetInode(), retMsg.Status)
return
}
ino = retMsg.Msg
if ino.NLink == 0 {
return
}
value, exist := extend.Get([]byte(proto.QuotaKey))
if !exist {
log.LogDebugf("statisticExtendByStore get quota key failed, mp[%v] inode[%v]", mp.config.PartitionId, extend.GetInode())
return
}
quotaInfos := &proto.MetaQuotaInfos{
QuotaInfoMap: make(map[uint32]*proto.MetaQuotaInfo),
}
if err := json.Unmarshal(value, "aInfos.QuotaInfoMap); err != nil {
log.LogErrorf("statisticExtendByStore inode[%v] Unmarshal quotaInfos fail [%v]", extend.GetInode(), err)
return
}
mqMgr.rwlock.Lock()
defer mqMgr.rwlock.Unlock()
for quotaId := range quotaInfos.QuotaInfoMap {
var baseInfo proto.QuotaUsedInfo
value, isFind := mqMgr.statisticRebuildBase.Load(quotaId)
if isFind {
baseInfo = value.(proto.QuotaUsedInfo)
}
baseInfo.UsedBytes += int64(ino.Size)
baseInfo.UsedFiles += 1
mqMgr.statisticRebuildBase.Store(quotaId, baseInfo)
log.LogDebugf("[statisticExtendByStore] mp[%v] quotaId [%v] inode[%v] baseInfo [%v]",
mp.config.PartitionId, quotaId, extend.GetInode(), baseInfo)
}
log.LogDebugf("statisticExtendByStore mp[%v] inode[%v] success.", mp.config.PartitionId, extend.GetInode())
return
}
func (mp *metaPartition) updateUsedInfo(size int64, files int64, ino uint64) {
quotaIds, isFind := mp.isExistQuota(ino)
if isFind {
log.LogInfof("updateUsedInfo ino[%v] quotaIds [%v] size [%v] files [%v]", ino, quotaIds, size, files)
for _, quotaId := range quotaIds {
mp.mqMgr.updateUsedInfo(size, files, quotaId)
}
}
return
}
func (mp *metaPartition) isExistQuota(ino uint64) (quotaIds []uint32, isFind bool) {
extend := NewExtend(ino)
treeItem := mp.extendTree.Get(extend)
if treeItem == nil {
isFind = false
return
}
extend = treeItem.(*Extend)
value, exist := extend.Get([]byte(proto.QuotaKey))
if !exist {
isFind = false
return
}
quotaInfos := &proto.MetaQuotaInfos{
QuotaInfoMap: make(map[uint32]*proto.MetaQuotaInfo),
}
if err := json.Unmarshal(value, "aInfos.QuotaInfoMap); err != nil {
log.LogErrorf("set quota inode[%v] Unmarshal quotaInfos fail [%v]", ino, err)
isFind = false
return
}
isFind = true
quotaInfos.RLock()
for quotaId := range quotaInfos.QuotaInfoMap {
quotaIds = append(quotaIds, quotaId)
}
quotaInfos.RUnlock()
log.LogInfof("isExistQuota inode:[%v] quotaIds [%v] isFind[%v]", ino, quotaIds, isFind)
return
}
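// Quota membership lives in the inode's extend entry: the value stored under
// proto.QuotaKey is the JSON encoding of a quotaId -> MetaQuotaInfo map, and
// isExistQuota simply unmarshals that map and returns its keys. A sketch of how
// that value is produced (hypothetical quota IDs 1 and 2, mirroring
// setInodeQuota below):
//
//    quotaInfos := &proto.MetaQuotaInfos{QuotaInfoMap: map[uint32]*proto.MetaQuotaInfo{
//        1: {RootInode: false},
//        2: {RootInode: false},
//    }}
//    value, _ := json.Marshal(quotaInfos.QuotaInfoMap) // bytes kept under proto.QuotaKey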
func (mp *metaPartition) isOverQuota(ino uint64, size bool, files bool) (status uint8) {
quotaIds, isFind := mp.isExistQuota(ino)
if isFind {
for _, quotaId := range quotaIds {
status = mp.mqMgr.IsOverQuota(size, files, quotaId)
if status != 0 {
log.LogWarnf("isOverQuota ino[%v] quotaId [%v] size [%v] files[%v] status[%v]", ino, quotaId, size, files, status)
return
}
}
}
return
}
func (mp *metaPartition) getInodeQuota(inode uint64, p *Packet) (err error) {
extend := NewExtend(inode)
quotaInfos := &proto.MetaQuotaInfos{
QuotaInfoMap: make(map[uint32]*proto.MetaQuotaInfo),
}
var (
value []byte
exist bool
)
treeItem := mp.extendTree.CopyGet(extend)
if treeItem == nil {
goto handleRsp
}
extend = treeItem.(*Extend)
value, exist = extend.Get([]byte(proto.QuotaKey))
if exist {
if err = json.Unmarshal(value, "aInfos.QuotaInfoMap); err != nil {
log.LogErrorf("getInodeQuota inode[%v] Unmarshal quotaInfos fail [%v]", inode, err)
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
}
handleRsp:
response := &proto.GetInodeQuotaResponse{}
log.LogInfof("getInodeQuota indoe %v ,map %v", inode, quotaInfos.QuotaInfoMap)
response.MetaQuotaInfoMap = quotaInfos.QuotaInfoMap
encoded, err := json.Marshal(response)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
return
}
p.PacketOkWithBody(encoded)
return
}
func (mp *metaPartition) getInodeQuotaInfos(inode uint64) (quotaInfos map[uint32]*proto.MetaQuotaInfo, err error) {
log.LogInfof("getInodeQuotaInfos mp[%v] treeLen[%v]", mp.config.PartitionId, mp.extendTree.Len())
treeItem := mp.extendTree.Get(NewExtend(inode))
if treeItem == nil {
return
}
extend := treeItem.(*Extend)
info := &proto.MetaQuotaInfos{
QuotaInfoMap: make(map[uint32]*proto.MetaQuotaInfo),
}
value, exist := extend.Get([]byte(proto.QuotaKey))
if exist {
if err = json.Unmarshal(value, &info.QuotaInfoMap); err != nil {
log.LogErrorf("getInodeQuota inode[%v] Unmarshal quotaInfos fail [%v]", inode, err)
return
}
quotaInfos = info.QuotaInfoMap
}
log.LogInfof("getInodeQuotaInfos inode[%v] quotaInfos [%v] exist [%v]", inode, quotaInfos, exist)
return
}
func (mp *metaPartition) setInodeQuota(quotaIds []uint32, inode uint64) {
extend := NewExtend(inode)
quotaInfos := &proto.MetaQuotaInfos{
QuotaInfoMap: make(map[uint32]*proto.MetaQuotaInfo),
}
for _, quotaId := range quotaIds {
quotaInfo := &proto.MetaQuotaInfo{
RootInode: false,
}
quotaInfos.QuotaInfoMap[quotaId] = quotaInfo
}
value, err := json.Marshal(quotaInfos.QuotaInfoMap)
if err != nil {
log.LogErrorf("setInodeQuota marsha1 quotaInfos [%v] fail [%v]", quotaInfos, err)
return
}
extend.Put([]byte(proto.QuotaKey), value, mp.verSeq)
treeItem := mp.extendTree.CopyGet(extend)
var e *Extend
if treeItem == nil {
mp.extendTree.ReplaceOrInsert(extend, true)
} else {
e = treeItem.(*Extend)
e.Merge(extend, true)
}
log.LogInfof("setInodeQuota inode[%v] quota [%v] success.", inode, quotaIds)
return
}
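// setInodeQuota writes the quota set as a single xattr and merges it into any
// existing extend entry for the inode via Extend.Merge, so a later lookup sees
// the new quota IDs. A hedged usage sketch with hypothetical values:
//
//    var ino uint64 = 100 // hypothetical inode
//    mp.setInodeQuota([]uint32{1, 2}, ino)
//    ids, ok := mp.isExistQuota(ino) // ok == true, ids contains 1 and 2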
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/json"
"fmt"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/auditlog"
"github.com/cubefs/cubefs/util/log"
)
func (mp *metaPartition) TxCreate(req *proto.TxCreateRequest, p *Packet) error {
var err error
txInfo := req.TransactionInfo.GetCopy()
// 1. init tx in tm
ifo, err := mp.txInit(txInfo, p)
if err != nil || ifo == nil {
return err
}
if ifo.TmID != int64(mp.config.PartitionId) {
p.PacketOkReply()
return nil
}
if ifo.State != proto.TxStatePreCommit {
log.LogWarnf("TxCreate: tx is already init, txInfo %s", ifo.String())
p.PacketOkReply()
return nil
}
// 2. add tx to other rm
mp.txInitToRm(ifo, p)
if p.ResultCode != proto.OpOk {
return nil
}
resp := &proto.TxCreateResponse{
TxInfo: ifo,
}
status := proto.OpOk
reply, err := json.Marshal(resp)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
p.PacketErrorWithBody(status, reply)
return nil
}
func (mp *metaPartition) txInitToRm(txInfo *proto.TransactionInfo, p *Packet) {
mpIfos := txInfo.GroupByMp()
statusCh := make(chan uint8, len(mpIfos))
wg := sync.WaitGroup{}
for mpId, ifo := range mpIfos {
if mp.config.PartitionId == mpId {
continue
}
req := &proto.TxCreateRequest{
VolName: mp.config.VolName,
PartitionID: mpId,
TransactionInfo: txInfo,
}
pkt, _ := buildTxPacket(req, mpId, proto.OpMetaTxCreate)
members := ifo.Members
wg.Add(1)
go func() {
defer wg.Done()
status := mp.txProcessor.txManager.txSendToMpWithAddrs(members, pkt)
if status != proto.OpOk {
log.LogWarnf("txInitRm: send to rm failed, addr %s, pkt %s, status %s",
members, string(pkt.Data), proto.GetStatusStr(status))
}
statusCh <- status
}()
}
wg.Wait()
close(statusCh)
for status := range statusCh {
if !canRetry(status) {
p.ResultCode = status
return
}
if status != proto.OpOk {
p.ResultCode = status
return
}
}
p.ResultCode = proto.OpOk
return
}
func canRetry(status uint8) bool {
if status == proto.OpOk || status == proto.OpAgain || status == proto.OpErr {
return true
}
return false
}
func (mp *metaPartition) txInit(txInfo *proto.TransactionInfo, p *Packet) (ifo *proto.TransactionInfo, err error) {
if uint64(txInfo.TmID) == mp.config.PartitionId {
err = mp.initTxInfo(txInfo)
if err != nil {
log.LogWarnf("init tx limited, ifo %v", txInfo)
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
}
val, err := txInfo.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return nil, err
}
status, err := mp.submit(opFSMTxInit, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return nil, err
}
if status.(uint8) != proto.OpOk {
p.ResultCode = status.(uint8)
return nil, fmt.Errorf("init tx by raft failed, %v", proto.GetStatusStr(p.ResultCode))
}
ifo = mp.txProcessor.txManager.getTransaction(txInfo.TxID)
if ifo == nil {
log.LogWarnf("TxCreate: tx is still not exist, info %s", txInfo.String())
p.ResultCode = proto.OpTxInfoNotExistErr
return nil, nil
}
return ifo, nil
}
// TxCommitRM used to commit tx for single TM or RM
func (mp *metaPartition) TxCommitRM(req *proto.TxApplyRMRequest, p *Packet) error {
txInfo := req.TransactionInfo.GetCopy()
ifo := mp.txProcessor.txManager.getTransaction(txInfo.TxID)
if ifo == nil {
log.LogWarnf("TxCommitRM: can't find tx, already rollback or commit, ifo %v", req.TransactionInfo)
p.PacketErrorWithBody(proto.OpTxInfoNotExistErr, []byte(fmt.Sprintf("tx %s is not exist", txInfo.TxID)))
return nil
}
if ifo.Finish() {
log.LogWarnf("TxCommitRM: tx already commit before in rm, tx %v", ifo)
p.ResultCode = proto.OpOk
return nil
}
val, err := ifo.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return err
}
status, err := mp.submit(opFSMTxCommitRM, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return err
}
p.ResultCode = status.(uint8)
return nil
}
// TxRollbackRM used to rollback tx for single TM or RM
func (mp *metaPartition) TxRollbackRM(req *proto.TxApplyRMRequest, p *Packet) error {
txInfo := req.TransactionInfo.GetCopy()
ifo := mp.txProcessor.txManager.getTransaction(txInfo.TxID)
if ifo == nil {
log.LogWarnf("TxRollbackRM: can't find tx, already rollback or commit, ifo %v", req.TransactionInfo)
p.PacketErrorWithBody(proto.OpTxInfoNotExistErr, []byte(fmt.Sprintf("tx %s is not exist", txInfo.TxID)))
return nil
}
if ifo.Finish() {
log.LogWarnf("TxRollbackRM: tx already commit before in rm, tx %v", ifo)
p.ResultCode = proto.OpOk
return nil
}
val, err := txInfo.Marshal()
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return err
}
status, err := mp.submit(opFSMTxRollbackRM, val)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return err
}
p.ResultCode = status.(uint8)
return nil
}
func (mp *metaPartition) TxCommit(req *proto.TxApplyRequest, p *Packet, remoteAddr string) error {
var err error
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogTxOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.TxID, err, time.Since(start).Milliseconds())
}()
}
status, err := mp.txProcessor.txManager.commitTx(req.TxID, false)
if err != nil {
p.PacketErrorWithBody(status, []byte(err.Error()))
return err
}
p.ResultCode = status
return err
}
func (mp *metaPartition) TxRollback(req *proto.TxApplyRequest, p *Packet, remoteAddr string) error {
var err error
start := time.Now()
if mp.IsEnableAuditLog() {
defer func() {
auditlog.LogTxOp(remoteAddr, mp.GetVolName(), p.GetOpMsg(), req.TxID, err, time.Since(start).Milliseconds())
}()
}
status, err := mp.txProcessor.txManager.rollbackTx(req.TxID, false)
if err != nil {
p.PacketErrorWithBody(status, []byte(err.Error()))
return err
}
p.ResultCode = status
return err
}
func (mp *metaPartition) TxGetCnt() (uint64, uint64, uint64) {
txCnt := mp.txProcessor.txManager.txTree.Len()
rbInoCnt := mp.txProcessor.txResource.txRbInodeTree.Len()
rbDenCnt := mp.txProcessor.txResource.txRbDentryTree.Len()
return uint64(txCnt), uint64(rbInoCnt), uint64(rbDenCnt)
}
func (mp *metaPartition) TxGetTree() (*BTree, *BTree, *BTree) {
tx := mp.txProcessor.txManager.txTree.GetTree()
rbIno := mp.txProcessor.txResource.txRbInodeTree.GetTree()
rbDen := mp.txProcessor.txResource.txRbDentryTree.GetTree()
return tx, rbIno, rbDen
}
func (mp *metaPartition) TxGetInfo(req *proto.TxGetInfoRequest, p *Packet) (err error) {
var status uint8
txItem := proto.NewTxInfoBItem(req.TxID)
var txInfo *proto.TransactionInfo
if item := mp.txProcessor.txManager.txTree.Get(txItem); item != nil {
txInfo = item.(*proto.TransactionInfo)
status = proto.OpOk
} else {
status = proto.OpTxInfoNotExistErr
}
var reply []byte
if status == proto.OpOk {
resp := &proto.TxGetInfoResponse{
TxInfo: txInfo,
}
reply, err = json.Marshal(resp)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
p.PacketErrorWithBody(status, reply)
return err
}
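// The handlers above share one reply convention: marshal the response struct to
// JSON and write it back together with the status via PacketErrorWithBody, and
// downgrade the status to OpErr with the error text as the body if marshaling
// fails. A condensed sketch of that pattern (resp stands for any response type
// in this file):
//
//    reply, err := json.Marshal(resp)
//    if err != nil {
//        status = proto.OpErr
//        reply = []byte(err.Error())
//    }
//    p.PacketErrorWithBody(status, reply)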
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/binary"
"encoding/json"
"fmt"
"sync/atomic"
"github.com/cubefs/cubefs/proto"
)
func (mp *metaPartition) GetUniqID(p *Packet, num uint32) (err error) {
idBuf := make([]byte, 4)
binary.BigEndian.PutUint32(idBuf, num)
resp, err := mp.submit(opFSMUniqID, idBuf)
if err != nil {
p.PacketErrorWithBody(proto.OpAgain, []byte(err.Error()))
return
}
var (
status = proto.OpErr
reply []byte
)
idResp := resp.(*UniqIdResp)
if idResp.Status == proto.OpOk {
resp := &GetUniqIDResp{
Start: idResp.Start,
}
status = proto.OpOk
reply, err = json.Marshal(resp)
if err != nil {
status = proto.OpErr
reply = []byte(err.Error())
}
}
p.PacketErrorWithBody(status, reply)
return
}
func (mp *metaPartition) allocateUniqID(num uint32) (start, end uint64) {
for {
// cur is the last allocated id
cur := mp.GetUniqId()
start = cur + 1
end = cur + uint64(num)
if atomic.CompareAndSwapUint64(&mp.config.UniqId, cur, end) {
return start, end
}
}
}
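// allocateUniqID reserves a contiguous block of IDs with one compare-and-swap
// on mp.config.UniqId; the loop only repeats when another goroutine advanced
// the counter between the read and the CAS. A minimal sketch of claiming a
// block of 8 IDs (hypothetical count):
//
//    start, end := mp.allocateUniqID(8)
//    for id := start; id <= end; id++ {
//        // every id in [start, end] is owned exclusively by this caller
//    }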
func (mp *metaPartition) uniqCheckerEvict() (left int, evict int, err error) {
checker := mp.uniqChecker
left, idx, op := checker.evictIndex()
if op == nil {
return left, 0, nil
}
fsmReq := &fsmEvictUniqCheckerRequest{
Idx: idx,
UniqID: op.uniqid,
}
reqBytes, err := json.Marshal(fsmReq)
if err != nil {
panic(err)
}
_, err = mp.submit(opFSMUniqCheckerEvict, reqBytes)
return left, idx + 1, err
}
var (
inodeOnceSize = 16
newInodeOnceSize = 24
)
type InodeOnce struct {
UniqID uint64
Inode uint64 // Inode ID
VerSeq uint64
}
func (i *InodeOnce) Marshal() (val []byte) {
val = make([]byte, newInodeOnceSize)
binary.BigEndian.PutUint64(val[0:8], i.UniqID)
binary.BigEndian.PutUint64(val[8:16], i.Inode)
binary.BigEndian.PutUint64(val[16:24], i.VerSeq)
return val
}
func InodeOnceUnlinkMarshal(req *UnlinkInoReq) []byte {
inoOnce := &InodeOnce{
UniqID: req.UniqID,
Inode: req.Inode,
VerSeq: req.VerSeq,
}
return inoOnce.Marshal()
}
func InodeOnceLinkMarshal(req *LinkInodeReq) []byte {
inoOnce := &InodeOnce{
UniqID: req.UniqID,
Inode: req.Inode,
}
return inoOnce.Marshal()
}
func InodeOnceUnmarshal(val []byte) (i *InodeOnce, err error) {
i = &InodeOnce{}
if len(val) < inodeOnceSize {
return i, fmt.Errorf("size incorrect")
}
i.UniqID = binary.BigEndian.Uint64(val[0:8])
i.Inode = binary.BigEndian.Uint64(val[8:16])
if len(val) == 24 {
i.VerSeq = binary.BigEndian.Uint64(val[16:24])
}
return
}
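// InodeOnce is a fixed-width big-endian record: bytes 0-7 hold UniqID, bytes
// 8-15 hold Inode, and the 24-byte form adds VerSeq in bytes 16-23; a legacy
// 16-byte value simply leaves VerSeq at zero. A round-trip sketch with
// hypothetical values:
//
//    val := (&InodeOnce{UniqID: 7, Inode: 1024, VerSeq: 3}).Marshal()
//    ino, _ := InodeOnceUnmarshal(val) // ino.UniqID == 7, ino.Inode == 1024, ino.VerSeq == 3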
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bufio"
"encoding/binary"
"encoding/json"
"fmt"
"hash/crc32"
"io"
"os"
"path"
"strings"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
mmap "github.com/edsrzf/mmap-go"
)
const (
snapshotDir = "snapshot"
snapshotDirTmp = ".snapshot"
snapshotBackup = ".snapshot_backup"
inodeFile = "inode"
dentryFile = "dentry"
extendFile = "extend"
multipartFile = "multipart"
txInfoFile = "tx_info"
txRbInodeFile = "tx_rb_inode"
txRbDentryFile = "tx_rb_dentry"
applyIDFile = "apply"
TxIDFile = "transactionID"
SnapshotSign = ".sign"
metadataFile = "meta"
metadataFileTmp = ".meta"
uniqIDFile = "uniqID"
uniqCheckerFile = "uniqChecker"
verdataFile = "multiVer"
StaleMetadataSuffix = ".old"
StaleMetadataTimeFormat = "20060102150405.000000000"
verdataInitFile = "multiVerInitFile"
)
func (mp *metaPartition) loadMetadata() (err error) {
metaFile := path.Join(mp.config.RootDir, metadataFile)
fp, err := os.OpenFile(metaFile, os.O_RDONLY, 0o644)
if err != nil {
err = errors.NewErrorf("[loadMetadata]: OpenFile %s", err.Error())
return
}
defer fp.Close()
data, err := io.ReadAll(fp)
if err != nil || len(data) == 0 {
err = errors.NewErrorf("[loadMetadata]: ReadFile %s, data: %s", err.Error(),
string(data))
return
}
mConf := &MetaPartitionConfig{}
if err = json.Unmarshal(data, mConf); err != nil {
err = errors.NewErrorf("[loadMetadata]: Unmarshal MetaPartitionConfig %s",
err.Error())
return
}
if err = mConf.checkMeta(); err != nil {
return
}
mp.config.PartitionId = mConf.PartitionId
mp.config.VolName = mConf.VolName
mp.config.Start = mConf.Start
mp.config.End = mConf.End
mp.config.Peers = mConf.Peers
mp.config.Cursor = mp.config.Start
mp.config.UniqId = 0
mp.uidManager = NewUidMgr(mp.config.VolName, mp.config.PartitionId)
mp.mqMgr = NewQuotaManager(mp.config.VolName, mp.config.PartitionId)
log.LogInfof("loadMetadata: load complete: partitionID(%v) volume(%v) range(%v,%v) cursor(%v)",
mp.config.PartitionId, mp.config.VolName, mp.config.Start, mp.config.End, mp.config.Cursor)
return
}
func (mp *metaPartition) loadInode(rootDir string, crc uint32) (err error) {
var numInodes uint64
defer func() {
if err == nil {
log.LogInfof("loadInode: load complete: partitonID(%v) volume(%v) numInodes(%v)",
mp.config.PartitionId, mp.config.VolName, numInodes)
}
}()
filename := path.Join(rootDir, inodeFile)
if _, err = os.Stat(filename); err != nil {
err = errors.NewErrorf("[loadInode] Stat: %s", err.Error())
return
}
fp, err := os.OpenFile(filename, os.O_RDONLY, 0o644)
if err != nil {
err = errors.NewErrorf("[loadInode] OpenFile: %s", err.Error())
return
}
defer fp.Close()
reader := bufio.NewReaderSize(fp, 4*1024*1024)
inoBuf := make([]byte, 4)
crcCheck := crc32.NewIEEE()
for {
inoBuf = inoBuf[:4]
// first read length
_, err = io.ReadFull(reader, inoBuf)
if err != nil {
if err == io.EOF {
err = nil
if res := crcCheck.Sum32(); res != crc {
log.LogErrorf("[loadInode]: check crc mismatch, expected[%d], actual[%d]", crc, res)
return ErrSnapshotCrcMismatch
}
return
}
err = errors.NewErrorf("[loadInode] ReadHeader: %s", err.Error())
return
}
// length crc
if _, err = crcCheck.Write(inoBuf); err != nil {
return err
}
length := binary.BigEndian.Uint32(inoBuf)
// next read body
if uint32(cap(inoBuf)) >= length {
inoBuf = inoBuf[:length]
} else {
inoBuf = make([]byte, length)
}
_, err = io.ReadFull(reader, inoBuf)
if err != nil {
err = errors.NewErrorf("[loadInode] ReadBody: %s", err.Error())
return
}
ino := NewInode(0, 0)
if err = ino.Unmarshal(inoBuf); err != nil {
err = errors.NewErrorf("[loadInode] Unmarshal: %s", err.Error())
return
}
mp.acucumUidSizeByLoad(ino)
// data crc
if _, err = crcCheck.Write(inoBuf); err != nil {
return err
}
mp.size += ino.Size
mp.fsmCreateInode(ino)
mp.checkAndInsertFreeList(ino)
if mp.config.Cursor < ino.Inode {
mp.config.Cursor = ino.Inode
}
numInodes += 1
}
}
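// Inode snapshot records are length-prefixed: a 4-byte big-endian length, then
// the marshaled inode, with both the prefix and the body fed to the running CRC
// so the final sum can be compared against the crc recorded at store time. A
// minimal sketch of producing one record, mirroring storeInode further below:
//
//    data, _ := ino.Marshal()
//    lenBuf := make([]byte, 4)
//    binary.BigEndian.PutUint32(lenBuf, uint32(len(data)))
//    // file bytes: lenBuf then data; CRC input: lenBuf then data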
// Load dentry from the dentry snapshot.
func (mp *metaPartition) loadDentry(rootDir string, crc uint32) (err error) {
var numDentries uint64
defer func() {
if err == nil {
log.LogInfof("loadDentry: load complete: partitonID(%v) volume(%v) numDentries(%v)",
mp.config.PartitionId, mp.config.VolName, numDentries)
}
}()
filename := path.Join(rootDir, dentryFile)
if _, err = os.Stat(filename); err != nil {
err = errors.NewErrorf("[loadDentry] Stat: %s", err.Error())
return
}
fp, err := os.OpenFile(filename, os.O_RDONLY, 0o644)
if err != nil {
err = errors.NewErrorf("[loadDentry] OpenFile: %s", err.Error())
return
}
defer fp.Close()
reader := bufio.NewReaderSize(fp, 4*1024*1024)
dentryBuf := make([]byte, 4)
crcCheck := crc32.NewIEEE()
for {
dentryBuf = dentryBuf[:4]
// First Read 4byte header length
_, err = io.ReadFull(reader, dentryBuf)
if err != nil {
if err == io.EOF {
err = nil
if res := crcCheck.Sum32(); res != crc {
log.LogErrorf("[loadDentry]: check crc mismatch, expected[%d], actual[%d]", crc, res)
return ErrSnapshotCrcMismatch
}
return
}
err = errors.NewErrorf("[loadDentry] ReadHeader: %s", err.Error())
return
}
if _, err = crcCheck.Write(dentryBuf); err != nil {
return err
}
length := binary.BigEndian.Uint32(dentryBuf)
// next read body
if uint32(cap(dentryBuf)) >= length {
dentryBuf = dentryBuf[:length]
} else {
dentryBuf = make([]byte, length)
}
_, err = io.ReadFull(reader, dentryBuf)
if err != nil {
err = errors.NewErrorf("[loadDentry]: ReadBody: %s", err.Error())
return
}
dentry := &Dentry{}
if err = dentry.Unmarshal(dentryBuf); err != nil {
err = errors.NewErrorf("[loadDentry] Unmarshal: %s", err.Error())
return
}
if status := mp.fsmCreateDentry(dentry, true); status != proto.OpOk {
err = errors.NewErrorf("[loadDentry] createDentry dentry: %v, resp code: %d", dentry, status)
return
}
if _, err = crcCheck.Write(dentryBuf); err != nil {
return err
}
numDentries += 1
}
}
func (mp *metaPartition) loadExtend(rootDir string, crc uint32) (err error) {
filename := path.Join(rootDir, extendFile)
if _, err = os.Stat(filename); err != nil {
err = errors.NewErrorf("[loadExtend] Stat: %s", err.Error())
return err
}
fp, err := os.OpenFile(filename, os.O_RDONLY, 0o644)
if err != nil {
err = errors.NewErrorf("[loadExtend] OpenFile: %s", err.Error())
return err
}
defer func() {
_ = fp.Close()
}()
var mem mmap.MMap
if mem, err = mmap.Map(fp, mmap.RDONLY, 0); err != nil {
return err
}
defer func() {
_ = mem.Unmap()
}()
var offset, n int
// read number of extends
var numExtends uint64
numExtends, n = binary.Uvarint(mem)
offset += n
varintTmp := make([]byte, binary.MaxVarintLen64)
// write number of extends
n = binary.PutUvarint(varintTmp, numExtends)
crcCheck := crc32.NewIEEE()
if _, err = crcCheck.Write(varintTmp[:n]); err != nil {
return
}
for i := uint64(0); i < numExtends; i++ {
// read length
var numBytes uint64
numBytes, n = binary.Uvarint(mem[offset:])
offset += n
var extend *Extend
if extend, err = NewExtendFromBytes(mem[offset : offset+int(numBytes)]); err != nil {
return err
}
if _, err = crcCheck.Write(mem[offset-n : offset]); err != nil {
return err
}
// log.LogDebugf("loadExtend: new extend from bytes: partitionID (%v) volume(%v) inode[%v]",
// mp.config.PartitionId, mp.config.VolName, extend.inode)
_ = mp.fsmSetXAttr(extend)
if _, err = crcCheck.Write(mem[offset : offset+int(numBytes)]); err != nil {
return
}
offset += int(numBytes)
mp.statisticExtendByLoad(extend)
}
log.LogInfof("loadExtend: load complete: partitionID(%v) volume(%v) numExtends(%v) filename(%v)",
mp.config.PartitionId, mp.config.VolName, numExtends, filename)
if res := crcCheck.Sum32(); res != crc {
log.LogErrorf("loadExtend: check crc mismatch, expected[%d], actual[%d]", crc, res)
return ErrSnapshotCrcMismatch
}
return nil
}
func (mp *metaPartition) loadMultipart(rootDir string, crc uint32) (err error) {
filename := path.Join(rootDir, multipartFile)
if _, err = os.Stat(filename); err != nil {
err = errors.NewErrorf("[loadMultipart] Stat: %s", err.Error())
return err
}
fp, err := os.OpenFile(filename, os.O_RDONLY, 0o644)
if err != nil {
err = errors.NewErrorf("[loadMultipart] OpenFile: %s", err.Error())
return err
}
defer func() {
_ = fp.Close()
}()
var mem mmap.MMap
if mem, err = mmap.Map(fp, mmap.RDONLY, 0); err != nil {
return err
}
defer func() {
_ = mem.Unmap()
}()
var offset, n int
// read number of multipart
var numMultiparts uint64
numMultiparts, n = binary.Uvarint(mem)
varintTmp := make([]byte, binary.MaxVarintLen64)
// write number of multipart
n = binary.PutUvarint(varintTmp, numMultiparts)
crcCheck := crc32.NewIEEE()
if _, err = crcCheck.Write(varintTmp[:n]); err != nil {
return
}
offset += n
for i := uint64(0); i < numMultiparts; i++ {
// read length
var numBytes uint64
numBytes, n = binary.Uvarint(mem[offset:])
offset += n
if _, err = crcCheck.Write(mem[offset-n : offset]); err != nil {
return err
}
var multipart *Multipart
multipart = MultipartFromBytes(mem[offset : offset+int(numBytes)])
log.LogDebugf("loadMultipart: create multipart from bytes: partitionID(%v) multipartID(%v)", mp.config.PartitionId, multipart.id)
mp.fsmCreateMultipart(multipart)
offset += int(numBytes)
if _, err = crcCheck.Write(mem[offset-int(numBytes) : offset]); err != nil {
return err
}
}
log.LogInfof("loadMultipart: load complete: partitionID(%v) numMultiparts(%v) filename(%v)",
mp.config.PartitionId, numMultiparts, filename)
if res := crcCheck.Sum32(); res != crc {
log.LogErrorf("[loadMultipart] check crc mismatch, expected[%d], actual[%d]", crc, res)
return ErrSnapshotCrcMismatch
}
return nil
}
func (mp *metaPartition) loadApplyID(rootDir string) (err error) {
filename := path.Join(rootDir, applyIDFile)
if _, err = os.Stat(filename); err != nil {
err = errors.NewErrorf("[loadApplyID]: Stat %s", err.Error())
return
}
data, err := os.ReadFile(filename)
if err != nil {
err = errors.NewErrorf("[loadApplyID] ReadFile: %s", err.Error())
return
}
if len(data) == 0 {
err = errors.NewErrorf("[loadApplyID]: ApplyID is empty")
return
}
var cursor uint64
if strings.Contains(string(data), "|") {
_, err = fmt.Sscanf(string(data), "%d|%d", &mp.applyID, &cursor)
} else {
_, err = fmt.Sscanf(string(data), "%d", &mp.applyID)
}
if err != nil {
err = errors.NewErrorf("[loadApplyID] ReadApplyID: %s", err.Error())
return
}
mp.storedApplyId = mp.applyID
if cursor > mp.GetCursor() {
atomic.StoreUint64(&mp.config.Cursor, cursor)
}
log.LogInfof("loadApplyID: load complete: partitionID(%v) volume(%v) applyID(%v) cursor(%v) filename(%v)",
mp.config.PartitionId, mp.config.VolName, mp.applyID, mp.config.Cursor, filename)
return
}
func (mp *metaPartition) loadTxRbDentry(rootDir string, crc uint32) (err error) {
var numTxRbDentry uint64
defer func() {
if err == nil {
log.LogInfof("loadTxRbDentry: load complete: partitonID(%v) volume(%v) numInodes(%v)",
mp.config.PartitionId, mp.config.VolName, numTxRbDentry)
}
}()
filename := path.Join(rootDir, txRbDentryFile)
if _, err = os.Stat(filename); err != nil {
err = errors.NewErrorf("[loadTxRbDentry] Stat: %s", err.Error())
return
}
fp, err := os.OpenFile(filename, os.O_RDONLY, 0o644)
if err != nil {
err = errors.NewErrorf("[loadTxRbDentry] OpenFile: %s", err.Error())
return
}
defer fp.Close()
reader := bufio.NewReaderSize(fp, 4*1024*1024)
txBuf := make([]byte, 4)
crcCheck := crc32.NewIEEE()
for {
txBuf = txBuf[:4]
// first read length
_, err = io.ReadFull(reader, txBuf)
if err != nil {
if err == io.EOF {
err = nil
if res := crcCheck.Sum32(); res != crc {
log.LogErrorf("[loadTxRbDentry]: check crc mismatch, expected[%d], actual[%d]", crc, res)
return ErrSnapshotCrcMismatch
}
return
}
err = errors.NewErrorf("[loadTxRbDentry] ReadHeader: %s", err.Error())
return
}
// length crc
if _, err = crcCheck.Write(txBuf); err != nil {
return err
}
length := binary.BigEndian.Uint32(txBuf)
// next read body
if uint32(cap(txBuf)) >= length {
txBuf = txBuf[:length]
} else {
txBuf = make([]byte, length)
}
_, err = io.ReadFull(reader, txBuf)
if err != nil {
err = errors.NewErrorf("[loadTxRbDentry] ReadBody: %s", err.Error())
return
}
txRbDentry := NewTxRollbackDentry(nil, nil, 0)
if err = txRbDentry.Unmarshal(txBuf); err != nil {
err = errors.NewErrorf("[loadTxRbDentry] Unmarshal: %s", err.Error())
return
}
// data crc
if _, err = crcCheck.Write(txBuf); err != nil {
return err
}
// mp.txProcessor.txResource.txRollbackDentries[txRbDentry.txDentryInfo.GetKey()] = txRbDentry
mp.txProcessor.txResource.txRbDentryTree.ReplaceOrInsert(txRbDentry, true)
numTxRbDentry++
}
}
func (mp *metaPartition) loadTxRbInode(rootDir string, crc uint32) (err error) {
var numTxRbInode uint64
defer func() {
if err == nil {
log.LogInfof("loadTxRbInode: load complete: partitonID(%v) volume(%v) numInodes(%v)",
mp.config.PartitionId, mp.config.VolName, numTxRbInode)
}
}()
filename := path.Join(rootDir, txRbInodeFile)
if _, err = os.Stat(filename); err != nil {
err = errors.NewErrorf("[loadTxRbInode] Stat: %s", err.Error())
return
}
fp, err := os.OpenFile(filename, os.O_RDONLY, 0o644)
if err != nil {
err = errors.NewErrorf("[loadTxRbInode] OpenFile: %s", err.Error())
return
}
defer fp.Close()
reader := bufio.NewReaderSize(fp, 4*1024*1024)
txBuf := make([]byte, 4)
crcCheck := crc32.NewIEEE()
for {
txBuf = txBuf[:4]
// first read length
_, err = io.ReadFull(reader, txBuf)
if err != nil {
if err == io.EOF {
err = nil
if res := crcCheck.Sum32(); res != crc {
log.LogErrorf("[loadTxRbInode]: check crc mismatch, expected[%d], actual[%d]", crc, res)
return ErrSnapshotCrcMismatch
}
return
}
err = errors.NewErrorf("[loadTxRbInode] ReadHeader: %s", err.Error())
return
}
// length crc
if _, err = crcCheck.Write(txBuf); err != nil {
return err
}
length := binary.BigEndian.Uint32(txBuf)
// next read body
if uint32(cap(txBuf)) >= length {
txBuf = txBuf[:length]
} else {
txBuf = make([]byte, length)
}
_, err = io.ReadFull(reader, txBuf)
if err != nil {
err = errors.NewErrorf("[loadTxRbInode] ReadBody: %s", err.Error())
return
}
txRbInode := NewTxRollbackInode(nil, []uint32{}, nil, 0)
if err = txRbInode.Unmarshal(txBuf); err != nil {
err = errors.NewErrorf("[loadTxRbInode] Unmarshal: %s", err.Error())
return
}
// data crc
if _, err = crcCheck.Write(txBuf); err != nil {
return err
}
mp.txProcessor.txResource.txRbInodeTree.ReplaceOrInsert(txRbInode, true)
numTxRbInode++
}
}
func (mp *metaPartition) loadTxInfo(rootDir string, crc uint32) (err error) {
var numTxInfos uint64
defer func() {
if err == nil {
log.LogInfof("loadTxInfo: load complete: partitonID(%v) volume(%v) numInodes(%v)",
mp.config.PartitionId, mp.config.VolName, numTxInfos)
}
}()
filename := path.Join(rootDir, txInfoFile)
if _, err = os.Stat(filename); err != nil {
err = errors.NewErrorf("[loadTxInfo] Stat: %s", err.Error())
return
}
fp, err := os.OpenFile(filename, os.O_RDONLY, 0o644)
if err != nil {
err = errors.NewErrorf("[loadTxInfo] OpenFile: %s", err.Error())
return
}
defer fp.Close()
reader := bufio.NewReaderSize(fp, 4*1024*1024)
txBuf := make([]byte, 4)
crcCheck := crc32.NewIEEE()
for {
txBuf = txBuf[:4]
// first read length
_, err = io.ReadFull(reader, txBuf)
if err != nil {
if err == io.EOF {
err = nil
if res := crcCheck.Sum32(); res != crc {
log.LogErrorf("[loadTxInfo]: check crc mismatch, expected[%d], actual[%d]", crc, res)
return ErrSnapshotCrcMismatch
}
return
}
err = errors.NewErrorf("[loadTxInfo] ReadHeader: %s", err.Error())
return
}
// length crc
if _, err = crcCheck.Write(txBuf); err != nil {
return err
}
length := binary.BigEndian.Uint32(txBuf)
// next read body
if uint32(cap(txBuf)) >= length {
txBuf = txBuf[:length]
} else {
txBuf = make([]byte, length)
}
_, err = io.ReadFull(reader, txBuf)
if err != nil {
err = errors.NewErrorf("[loadTxInfo] ReadBody: %s", err.Error())
return
}
txInfo := proto.NewTransactionInfo(0, proto.TxTypeUndefined)
if err = txInfo.Unmarshal(txBuf); err != nil {
err = errors.NewErrorf("[loadTxInfo] Unmarshal: %s", err.Error())
return
}
// data crc
if _, err = crcCheck.Write(txBuf); err != nil {
return err
}
mp.txProcessor.txManager.addTxInfo(txInfo)
numTxInfos++
}
}
func (mp *metaPartition) loadTxID(rootDir string) (err error) {
filename := path.Join(rootDir, TxIDFile)
if _, err = os.Stat(filename); err != nil {
err = nil
return
}
data, err := os.ReadFile(filename)
if err != nil {
err = errors.NewErrorf("[loadTxID] OpenFile: %s", err.Error())
return
}
if len(data) == 0 {
err = errors.NewErrorf("[loadTxID]: TxID is empty")
return
}
var txId uint64
_, err = fmt.Sscanf(string(data), "%d", &txId)
if err != nil {
err = errors.NewErrorf("[loadTxID] ReadTxID: %s", err.Error())
return
}
if txId > mp.txProcessor.txManager.txIdAlloc.getTransactionID() {
mp.txProcessor.txManager.txIdAlloc.setTransactionID(txId)
}
log.LogInfof("loadTxID: load complete: partitionID(%v) volume(%v) txId(%v) filename(%v)",
mp.config.PartitionId, mp.config.VolName, mp.txProcessor.txManager.txIdAlloc.getTransactionID(), filename)
return
}
func (mp *metaPartition) loadUniqID(rootDir string) (err error) {
filename := path.Join(rootDir, uniqIDFile)
if _, err = os.Stat(filename); err != nil {
err = nil
return
}
data, err := os.ReadFile(filename)
if err != nil {
err = errors.NewErrorf("[loadUniqID] OpenFile: %s", err.Error())
return
}
if len(data) == 0 {
err = errors.NewErrorf("[loadUniqID]: uniqID is empty")
return
}
var uniqId uint64
_, err = fmt.Sscanf(string(data), "%d", &uniqId)
if err != nil {
err = errors.NewErrorf("[loadUniqID] Read uniqID: %s", err.Error())
return
}
if uniqId > mp.GetUniqId() {
atomic.StoreUint64(&mp.config.UniqId, uniqId)
}
log.LogInfof("loadUniqID: load complete: partitionID(%v) volume(%v) uniqID(%v) filename(%v)",
mp.config.PartitionId, mp.config.VolName, mp.GetUniqId(), filename)
return
}
func (mp *metaPartition) loadUniqChecker(rootDir string, crc uint32) (err error) {
log.LogInfof("loadUniqChecker partition(%v) begin", mp.config.PartitionId)
filename := path.Join(rootDir, uniqCheckerFile)
if _, err = os.Stat(filename); err != nil {
log.LogErrorf("loadUniqChecker get file %s err(%s)", filename, err)
err = nil
return
}
data, err := os.ReadFile(filename)
if err != nil {
log.LogErrorf("loadUniqChecker read file %s err(%s)", filename, err)
err = errors.NewErrorf("[loadUniqChecker] OpenFile: %v", err.Error())
return
}
if err = mp.uniqChecker.UnMarshal(data); err != nil {
log.LogErrorf("loadUniqChecker UnMarshal err(%s)", err)
err = errors.NewErrorf("[loadUniqChecker] Unmarshal: %v", err.Error())
return
}
crcCheck := crc32.NewIEEE()
if _, err = crcCheck.Write(data); err != nil {
log.LogErrorf("loadUniqChecker write to crcCheck failed: %s", err)
return err
}
if res := crcCheck.Sum32(); res != crc {
log.LogErrorf("[loadUniqChecker]: check crc mismatch, expected[%d], actual[%d]", crc, res)
return ErrSnapshotCrcMismatch
}
log.LogInfof("loadUniqChecker partition(%v) complete", mp.config.PartitionId)
return
}
func (mp *metaPartition) loadMultiVer(rootDir string, crc uint32) (err error) {
filename := path.Join(rootDir, verdataFile)
if _, err = os.Stat(filename); err != nil {
err = nil
return
}
data, err := os.ReadFile(filename)
if err != nil {
if os.IsNotExist(err) {
err = nil
return
}
err = errors.NewErrorf("[loadMultiVer] OpenFile: %s", err.Error())
return
}
if len(data) == 0 {
err = errors.NewErrorf("[loadMultiVer]: ApplyID is empty")
return
}
var (
verData string
applyId uint64
)
if strings.Contains(string(data), "|") {
_, err = fmt.Sscanf(string(data), "%d|%s", &applyId, &verData)
} else {
_, err = fmt.Sscanf(string(data), "%d", &applyId)
}
if err != nil {
err = errors.NewErrorf("[loadMultiVer] ReadVerList: %s", err.Error())
return
}
var verList []*proto.VolVersionInfo
if err = json.Unmarshal([]byte(verData), &verList); err != nil {
err = errors.NewErrorf("[loadMultiVer] ReadVerList: %s verData(%v) applyId %v", verList, verData, applyId)
return
}
var byteData []byte
if byteData, err = json.Marshal(verList); err != nil {
return
}
sign := crc32.NewIEEE()
if _, err = sign.Write(byteData); err != nil {
return
}
if crc != sign.Sum32() {
return fmt.Errorf("partitionID(%v) volume(%v) calc crc %v not equal with disk %v", mp.config.PartitionId, mp.config.VolName, sign.Sum32(), crc)
}
mp.multiVersionList.VerList = verList
mp.verSeq = mp.multiVersionList.GetLastVer()
log.LogInfof("loadMultiVer: updateVerList load complete: partitionID(%v) volume(%v) applyID(%v) filename(%v) verlist (%v) crc (%v) mp Ver(%v)",
mp.config.PartitionId, mp.config.VolName, mp.applyID, filename, mp.multiVersionList.VerList, crc, mp.verSeq)
return
}
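// The multiVer file pairs the apply index with the JSON-encoded version list
// ("%d|%s", see storeMultiVersion below), and the snapshot CRC covers only the
// JSON payload, so loadMultiVer re-marshals the parsed list and checks the sum
// against that. A sketch of building the on-disk line (verList and applyIndex
// stand for the version list and the store message's apply index):
//
//    verData, _ := json.Marshal(verList)                       // CRC is computed over these bytes
//    line := fmt.Sprintf("%d|%s", applyIndex, string(verData)) // applyIndex | versions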
func (mp *metaPartition) storeMultiVersion(rootDir string, sm *storeMsg) (crc uint32, err error) {
filename := path.Join(rootDir, verdataFile)
fp, err := os.OpenFile(filename, os.O_RDWR|os.O_APPEND|os.O_TRUNC|os.
O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
err = fp.Sync()
fp.Close()
}()
var verData []byte
if verData, err = json.Marshal(sm.multiVerList); err != nil {
return
}
sign := crc32.NewIEEE()
if _, err = sign.Write(verData); err != nil {
return
}
crc = sign.Sum32()
if _, err = fp.WriteString(fmt.Sprintf("%d|%s", sm.applyIndex, string(verData))); err != nil {
return
}
log.LogInfof("storeMultiVersion: store complete: partitionID(%v) volume(%v) applyID(%v) verData(%v) crc(%v)",
mp.config.PartitionId, mp.config.VolName, sm.applyIndex, string(verData), crc)
return
}
func (mp *metaPartition) renameStaleMetadata() (err error) {
if _, err = os.Stat(mp.config.RootDir); err != nil {
if os.IsNotExist(err) {
return nil
}
}
curTime := time.Now().Format(StaleMetadataTimeFormat)
staleMetaDirName := mp.config.RootDir + "_" + curTime + StaleMetadataSuffix
if err = os.Rename(mp.config.RootDir, staleMetaDirName); err != nil {
return err
}
return nil
}
func (mp *metaPartition) persistMetadata() (err error) {
if err = mp.config.checkMeta(); err != nil {
err = errors.NewErrorf("[persistMetadata]->%s", err.Error())
return
}
// TODO Unhandled errors
os.MkdirAll(mp.config.RootDir, 0o755)
filename := path.Join(mp.config.RootDir, metadataFileTmp)
fp, err := os.OpenFile(filename, os.O_RDWR|os.O_TRUNC|os.O_APPEND|os.O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
// TODO Unhandled errors
fp.Sync()
fp.Close()
os.Remove(filename)
}()
data, err := json.Marshal(mp.config)
if err != nil {
return
}
if _, err = fp.Write(data); err != nil {
return
}
if err = os.Rename(filename, path.Join(mp.config.RootDir, metadataFile)); err != nil {
return
}
log.LogInfof("persistMetata: persist complete: partitionID(%v) volume(%v) range(%v,%v) cursor(%v)",
mp.config.PartitionId, mp.config.VolName, mp.config.Start, mp.config.End, mp.config.Cursor)
return
}
func (mp *metaPartition) storeApplyID(rootDir string, sm *storeMsg) (err error) {
filename := path.Join(rootDir, applyIDFile)
fp, err := os.OpenFile(filename, os.O_RDWR|os.O_APPEND|os.O_TRUNC|os.
O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
err = fp.Sync()
fp.Close()
}()
cursor := mp.GetCursor()
if _, err = fp.WriteString(fmt.Sprintf("%d|%d", sm.applyIndex, cursor)); err != nil {
return
}
log.LogWarnf("storeApplyID: store complete: partitionID(%v) volume(%v) applyID(%v) cursor(%v)",
mp.config.PartitionId, mp.config.VolName, sm.applyIndex, cursor)
return
}
func (mp *metaPartition) storeTxID(rootDir string, sm *storeMsg) (err error) {
filename := path.Join(rootDir, TxIDFile)
fp, err := os.OpenFile(filename, os.O_RDWR|os.O_APPEND|os.O_TRUNC|os.
O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
err = fp.Sync()
fp.Close()
}()
if _, err = fp.WriteString(fmt.Sprintf("%d", sm.txId)); err != nil {
return
}
log.LogInfof("storeTxID: store complete: partitionID(%v) volume(%v) txId(%v)",
mp.config.PartitionId, mp.config.VolName, sm.txId)
return
}
func (mp *metaPartition) storeTxRbDentry(rootDir string, sm *storeMsg) (crc uint32, err error) {
filename := path.Join(rootDir, txRbDentryFile)
fp, err := os.OpenFile(filename, os.O_RDWR|os.O_TRUNC|os.O_APPEND|os.O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
err = fp.Sync()
// TODO Unhandled errors
fp.Close()
}()
var data []byte
lenBuf := make([]byte, 4)
sign := crc32.NewIEEE()
sm.txRbDentryTree.Ascend(func(i BtreeItem) bool {
rbDentry := i.(*TxRollbackDentry)
if data, err = rbDentry.Marshal(); err != nil {
return false
}
binary.BigEndian.PutUint32(lenBuf, uint32(len(data)))
if _, err = fp.Write(lenBuf); err != nil {
return false
}
if _, err = sign.Write(lenBuf); err != nil {
return false
}
if _, err = fp.Write(data); err != nil {
return false
}
if _, err = sign.Write(data); err != nil {
return false
}
return true
})
crc = sign.Sum32()
log.LogInfof("storeTxRbDentry: store complete: partitoinID(%v) volume(%v) numRbDentry(%v) crc(%v)",
mp.config.PartitionId, mp.config.VolName, sm.txRbDentryTree.Len(), crc)
return
}
func (mp *metaPartition) storeTxRbInode(rootDir string, sm *storeMsg) (crc uint32, err error) {
filename := path.Join(rootDir, txRbInodeFile)
fp, err := os.OpenFile(filename, os.O_RDWR|os.O_TRUNC|os.O_APPEND|os.O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
err = fp.Sync()
// TODO Unhandled errors
fp.Close()
}()
var data []byte
lenBuf := make([]byte, 4)
sign := crc32.NewIEEE()
sm.txRbInodeTree.Ascend(func(i BtreeItem) bool {
rbInode := i.(*TxRollbackInode)
if data, err = rbInode.Marshal(); err != nil {
return false
}
binary.BigEndian.PutUint32(lenBuf, uint32(len(data)))
if _, err = fp.Write(lenBuf); err != nil {
return false
}
if _, err = sign.Write(lenBuf); err != nil {
return false
}
if _, err = fp.Write(data); err != nil {
return false
}
if _, err = sign.Write(data); err != nil {
return false
}
return true
})
crc = sign.Sum32()
log.LogInfof("storeTxRbInode: store complete: partitoinID(%v) volume(%v) numRbinode[%v] crc(%v)",
mp.config.PartitionId, mp.config.VolName, sm.txRbInodeTree.Len(), crc)
return
}
func (mp *metaPartition) storeTxInfo(rootDir string, sm *storeMsg) (crc uint32, err error) {
filename := path.Join(rootDir, txInfoFile)
fp, err := os.OpenFile(filename, os.O_RDWR|os.O_TRUNC|os.O_APPEND|os.O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
err = fp.Sync()
// TODO Unhandled errors
fp.Close()
}()
var data []byte
lenBuf := make([]byte, 4)
sign := crc32.NewIEEE()
sm.txTree.Ascend(func(i BtreeItem) bool {
tx := i.(*proto.TransactionInfo)
if data, err = tx.Marshal(); err != nil {
return false
}
binary.BigEndian.PutUint32(lenBuf, uint32(len(data)))
if _, err = fp.Write(lenBuf); err != nil {
return false
}
if _, err = sign.Write(lenBuf); err != nil {
return false
}
if _, err = fp.Write(data); err != nil {
return false
}
if _, err = sign.Write(data); err != nil {
return false
}
return true
})
crc = sign.Sum32()
log.LogInfof("storeTxInfo: store complete: partitoinID(%v) volume(%v) numTxs(%v) crc(%v)",
mp.config.PartitionId, mp.config.VolName, sm.txTree.Len(), crc)
return
}
func (mp *metaPartition) storeInode(rootDir string,
sm *storeMsg) (crc uint32, err error) {
filename := path.Join(rootDir, inodeFile)
fp, err := os.OpenFile(filename, os.O_RDWR|os.O_TRUNC|os.O_APPEND|os.
O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
err = fp.Sync()
// TODO Unhandled errors
fp.Close()
}()
size := uint64(0)
var data []byte
lenBuf := make([]byte, 4)
sign := crc32.NewIEEE()
sm.inodeTree.Ascend(func(i BtreeItem) bool {
ino := i.(*Inode)
if sm.uidRebuild {
mp.acucumUidSizeByStore(ino)
}
if data, err = ino.Marshal(); err != nil {
return false
}
size += ino.Size
mp.fileStats(ino)
// set length
binary.BigEndian.PutUint32(lenBuf, uint32(len(data)))
if _, err = fp.Write(lenBuf); err != nil {
return false
}
if _, err = sign.Write(lenBuf); err != nil {
return false
}
// set body
if _, err = fp.Write(data); err != nil {
return false
}
if _, err = sign.Write(data); err != nil {
return false
}
return true
})
mp.acucumRebuildFin(sm.uidRebuild)
crc = sign.Sum32()
mp.size = size
log.LogInfof("storeInode: store complete: partitoinID(%v) volume(%v) numInodes(%v) crc(%v), size (%d)",
mp.config.PartitionId, mp.config.VolName, sm.inodeTree.Len(), crc, size)
return
}
func (mp *metaPartition) storeDentry(rootDir string,
sm *storeMsg) (crc uint32, err error) {
filename := path.Join(rootDir, dentryFile)
fp, err := os.OpenFile(filename, os.O_RDWR|os.O_TRUNC|os.O_APPEND|os.
O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
err = fp.Sync()
// TODO Unhandled errors
fp.Close()
}()
var data []byte
lenBuf := make([]byte, 4)
sign := crc32.NewIEEE()
sm.dentryTree.Ascend(func(i BtreeItem) bool {
dentry := i.(*Dentry)
data, err = dentry.Marshal()
if err != nil {
return false
}
// set length
binary.BigEndian.PutUint32(lenBuf, uint32(len(data)))
if _, err = fp.Write(lenBuf); err != nil {
return false
}
if _, err = sign.Write(lenBuf); err != nil {
return false
}
if _, err = fp.Write(data); err != nil {
return false
}
if _, err = sign.Write(data); err != nil {
return false
}
return true
})
crc = sign.Sum32()
log.LogInfof("storeDentry: store complete: partitoinID(%v) volume(%v) numDentries(%v) crc(%v)",
mp.config.PartitionId, mp.config.VolName, sm.dentryTree.Len(), crc)
return
}
func (mp *metaPartition) storeExtend(rootDir string, sm *storeMsg) (crc uint32, err error) {
extendTree := sm.extendTree
fp := path.Join(rootDir, extendFile)
var f *os.File
f, err = os.OpenFile(fp, os.O_RDWR|os.O_TRUNC|os.O_APPEND|os.O_CREATE, 0o755)
if err != nil {
return
}
log.LogDebugf("storeExtend: store start: partitoinID(%v) volume(%v) numInodes(%v) extends(%v)",
mp.config.PartitionId, mp.config.VolName, sm.inodeTree.Len(), sm.extendTree.Len())
defer func() {
closeErr := f.Close()
if err == nil && closeErr != nil {
err = closeErr
}
}()
writer := bufio.NewWriterSize(f, 4*1024*1024)
crc32 := crc32.NewIEEE()
varintTmp := make([]byte, binary.MaxVarintLen64)
var n int
// write number of extends
n = binary.PutUvarint(varintTmp, uint64(extendTree.Len()))
if _, err = writer.Write(varintTmp[:n]); err != nil {
return
}
if _, err = crc32.Write(varintTmp[:n]); err != nil {
return
}
extendTree.Ascend(func(i BtreeItem) bool {
e := i.(*Extend)
var raw []byte
if sm.quotaRebuild {
mp.statisticExtendByStore(e, sm.inodeTree)
}
if raw, err = e.Bytes(); err != nil {
return false
}
// write length
n = binary.PutUvarint(varintTmp, uint64(len(raw)))
if _, err = writer.Write(varintTmp[:n]); err != nil {
return false
}
if _, err = crc32.Write(varintTmp[:n]); err != nil {
return false
}
// write raw
if _, err = writer.Write(raw); err != nil {
return false
}
if _, err = crc32.Write(raw); err != nil {
return false
}
return true
})
log.LogInfof("storeExtend: write data ok: partitoinID(%v) volume(%v) numInodes(%v) extends(%v) quotaRebuild(%v)",
mp.config.PartitionId, mp.config.VolName, sm.inodeTree.Len(), sm.extendTree.Len(), sm.quotaRebuild)
mp.mqMgr.statisticRebuildFin(sm.quotaRebuild)
if err != nil {
return
}
if err = writer.Flush(); err != nil {
return
}
if err = f.Sync(); err != nil {
return
}
crc = crc32.Sum32()
log.LogInfof("storeExtend: store complete: partitoinID(%v) volume(%v) numExtends(%v) crc(%v)",
mp.config.PartitionId, mp.config.VolName, extendTree.Len(), crc)
return
}
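// storeExtend, like storeMultipart below, frames its snapshot with uvarints
// rather than fixed 4-byte lengths: first the item count, then for each item a
// uvarint length followed by the raw bytes, with every written byte also fed to
// the checksum. A minimal sketch of emitting one item in that framing (raw is
// assumed to hold the item's marshaled bytes):
//
//    varintTmp := make([]byte, binary.MaxVarintLen64)
//    n := binary.PutUvarint(varintTmp, uint64(len(raw)))
//    writer.Write(varintTmp[:n]) // uvarint length, also fed to the checksum
//    writer.Write(raw)           // item bytes, also fed to the checksum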
func (mp *metaPartition) storeMultipart(rootDir string, sm *storeMsg) (crc uint32, err error) {
multipartTree := sm.multipartTree
fp := path.Join(rootDir, multipartFile)
var f *os.File
f, err = os.OpenFile(fp, os.O_RDWR|os.O_TRUNC|os.O_APPEND|os.O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
closeErr := f.Close()
if err == nil && closeErr != nil {
err = closeErr
}
}()
writer := bufio.NewWriterSize(f, 4*1024*1024)
crc32 := crc32.NewIEEE()
varintTmp := make([]byte, binary.MaxVarintLen64)
var n int
// write number of multiparts
n = binary.PutUvarint(varintTmp, uint64(multipartTree.Len()))
if _, err = writer.Write(varintTmp[:n]); err != nil {
return
}
if _, err = crc32.Write(varintTmp[:n]); err != nil {
return
}
multipartTree.Ascend(func(i BtreeItem) bool {
m := i.(*Multipart)
var raw []byte
if raw, err = m.Bytes(); err != nil {
return false
}
// write length
n = binary.PutUvarint(varintTmp, uint64(len(raw)))
if _, err = writer.Write(varintTmp[:n]); err != nil {
return false
}
if _, err = crc32.Write(varintTmp[:n]); err != nil {
return false
}
// write raw
if _, err = writer.Write(raw); err != nil {
return false
}
if _, err = crc32.Write(raw); err != nil {
return false
}
return true
})
if err != nil {
return
}
if err = writer.Flush(); err != nil {
return
}
if err = f.Sync(); err != nil {
return
}
crc = crc32.Sum32()
log.LogInfof("storeMultipart: store complete: partitoinID(%v) volume(%v) numMultiparts(%v) crc(%v)",
mp.config.PartitionId, mp.config.VolName, multipartTree.Len(), crc)
return
}
func (mp *metaPartition) storeUniqID(rootDir string, sm *storeMsg) (err error) {
filename := path.Join(rootDir, uniqIDFile)
fp, err := os.OpenFile(filename, os.O_RDWR|os.O_APPEND|os.O_TRUNC|os.
O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
err = fp.Sync()
fp.Close()
}()
if _, err = fp.WriteString(fmt.Sprintf("%d", sm.uniqId)); err != nil {
return
}
log.LogInfof("storeUniqID: store complete: partitionID(%v) volume(%v) uniqID(%v)",
mp.config.PartitionId, mp.config.VolName, sm.uniqId)
return
}
func (mp *metaPartition) storeUniqChecker(rootDir string, sm *storeMsg) (crc uint32, err error) {
filename := path.Join(rootDir, uniqCheckerFile)
fp, err := os.OpenFile(filename, os.O_RDWR|os.O_TRUNC|os.O_APPEND|os.
O_CREATE, 0o755)
if err != nil {
return
}
defer func() {
err = fp.Sync()
fp.Close()
}()
var data []byte
if data, crc, err = sm.uniqChecker.Marshal(); err != nil {
return
}
if _, err = fp.Write(data); err != nil {
return
}
log.LogInfof("storeUniqChecker: store complete: PartitionID(%v) volume(%v) crc(%v)",
mp.config.PartitionId, mp.config.VolName, crc)
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"encoding/binary"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/cmd/common"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
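// storeMsg carries a snapshot of the in-memory metadata trees (inodes, dentries,
// extends, multiparts and transaction trees) together with the raft apply index at
// which the snapshot was taken; it is consumed by the background store routine.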
type storeMsg struct {
command uint32
applyIndex uint64
txId uint64
inodeTree *BTree
dentryTree *BTree
extendTree *BTree
multipartTree *BTree
txTree *BTree
txRbInodeTree *BTree
txRbDentryTree *BTree
quotaRebuild bool
uidRebuild bool
uniqId uint64
uniqChecker *uniqChecker
multiVerList []*proto.VolVersionInfo
}
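// startSchedule runs the background persistence loop of the meta partition: it picks
// the pending store message with the highest apply index, dumps it to disk, truncates
// the raft log after a successful dump, and periodically syncs the inode cursor and
// transaction ID through raft while this node is the leader.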
func (mp *metaPartition) startSchedule(curIndex uint64) {
timer := time.NewTimer(time.Hour * 24 * 365)
timer.Stop()
timerCursor := time.NewTimer(intervalToSyncCursor)
scheduleState := common.StateStopped
lastCursor := mp.GetCursor()
dumpFunc := func(msg *storeMsg) {
log.LogWarnf("[startSchedule] partitionId=%d: nowAppID"+
"=%d, applyID=%d", mp.config.PartitionId, curIndex,
msg.applyIndex)
if err := mp.store(msg); err == nil {
// truncate raft log
if mp.raftPartition != nil {
log.LogWarnf("[startSchedule] start trunc, partitionId=%d: nowAppID"+
"=%d, applyID=%d", mp.config.PartitionId, curIndex,
msg.applyIndex)
mp.raftPartition.Truncate(curIndex)
} else {
// may happen while dentries are still being loaded at startup
log.LogWarnf("[startSchedule] raftPartition is nil so skip" +
" truncate raft log")
}
curIndex = msg.applyIndex
} else {
// push the message back to the store channel to retry later
mp.storeChan <- msg
err = errors.NewErrorf("[startSchedule]: dump partition id=%d: %v",
mp.config.PartitionId, err.Error())
log.LogErrorf(err.Error())
exporter.Warning(err.Error())
}
if _, ok := mp.IsLeader(); ok {
timer.Reset(intervalToPersistData)
}
atomic.StoreUint32(&scheduleState, common.StateStopped)
}
go func(stopC chan bool) {
var msgs []*storeMsg
readyChan := make(chan struct{}, 1)
for {
if len(msgs) > 0 {
if atomic.LoadUint32(&scheduleState) == common.StateStopped {
atomic.StoreUint32(&scheduleState, common.StateRunning)
readyChan <- struct{}{}
}
}
select {
case <-stopC:
timer.Stop()
return
case <-readyChan:
var (
maxIdx uint64
maxMsg *storeMsg
)
for _, msg := range msgs {
if curIndex >= msg.applyIndex {
continue
}
if maxIdx < msg.applyIndex {
maxIdx = msg.applyIndex
maxMsg = msg
}
}
if maxMsg != nil {
go dumpFunc(maxMsg)
} else {
if _, ok := mp.IsLeader(); ok {
timer.Reset(intervalToPersistData)
}
atomic.StoreUint32(&scheduleState, common.StateStopped)
}
msgs = msgs[:0]
case msg := <-mp.storeChan:
switch msg.command {
case startStoreTick:
timer.Reset(intervalToPersistData)
case stopStoreTick:
timer.Stop()
case opFSMStoreTick:
msgs = append(msgs, msg)
default:
// do nothing
}
case <-timer.C:
log.LogDebugf("[startSchedule] intervalToPersistData curIndex: %v,apply:%v", curIndex, mp.applyID)
if mp.applyID <= curIndex {
timer.Reset(intervalToPersistData)
continue
}
if _, err := mp.submit(opFSMStoreTick, nil); err != nil {
log.LogErrorf("[startSchedule] raft submit: %s", err.Error())
if _, ok := mp.IsLeader(); ok {
timer.Reset(intervalToPersistData)
}
}
case <-timerCursor.C:
if _, ok := mp.IsLeader(); !ok {
timerCursor.Reset(intervalToSyncCursor)
continue
}
curCursor := mp.GetCursor()
if curCursor == lastCursor {
log.LogDebugf("[startSchedule] partitionId=%d: curCursor[%v]=lastCursor[%v]",
mp.config.PartitionId, curCursor, lastCursor)
timerCursor.Reset(intervalToSyncCursor)
continue
}
buf := make([]byte, 8)
binary.BigEndian.PutUint64(buf, curCursor)
if _, err := mp.submit(opFSMSyncCursor, buf); err != nil {
log.LogErrorf("[startSchedule] raft submit: %s", err.Error())
}
binary.BigEndian.PutUint64(buf, mp.txProcessor.txManager.txIdAlloc.getTransactionID())
if _, err := mp.submit(opFSMSyncTxID, buf); err != nil {
log.LogErrorf("[startSchedule] raft submit: %s", err.Error())
}
lastCursor = curCursor
timerCursor.Reset(intervalToSyncCursor)
}
}
}(mp.stopC)
}
func (mp *metaPartition) stop() {
if mp.stopC != nil {
close(mp.stopC)
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"fmt"
"os"
"strconv"
"github.com/cubefs/cubefs/raftstore"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
// startRaftServer initializes the address resolver and the raftStore server instance.
func (m *MetaNode) startRaftServer(cfg *config.Config) (err error) {
_, err = os.Stat(m.raftDir)
if err != nil {
if !os.IsNotExist(err) {
return
}
if err = os.MkdirAll(m.raftDir, 0o755); err != nil {
err = errors.NewErrorf("create raft server dir: %s", err.Error())
return
}
}
if m.clusterUuidEnable {
if err = config.CheckOrStoreClusterUuid(m.raftDir, m.clusterUuid, false); err != nil {
log.LogErrorf("CheckOrStoreClusterUuid failed: %v", err)
return fmt.Errorf("CheckOrStoreClusterUuid failed: %v", err)
}
}
heartbeatPort, _ := strconv.Atoi(m.raftHeartbeatPort)
replicaPort, _ := strconv.Atoi(m.raftReplicatePort)
raftConf := &raftstore.Config{
NodeID: m.nodeId,
RaftPath: m.raftDir,
IPAddr: m.localAddr,
HeartbeatPort: heartbeatPort,
ReplicaPort: replicaPort,
TickInterval: m.tickInterval,
RecvBufSize: m.raftRecvBufSize,
NumOfLogsToRetain: m.raftRetainLogs,
}
m.raftStore, err = raftstore.NewRaftStore(raftConf, cfg)
if err != nil {
err = errors.NewErrorf("new raftStore: %s", err.Error())
}
return
}
func (m *MetaNode) stopRaftServer() {
if m.raftStore != nil {
m.raftStore.Stop()
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"fmt"
"io"
"net"
"github.com/xtaci/smux"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
// startServer binds and listens on the configured TCP port.
func (m *MetaNode) startServer() (err error) {
// initialize and start the server.
m.httpStopC = make(chan uint8)
addr := fmt.Sprintf(":%s", m.listen)
if m.bindIp {
addr = fmt.Sprintf("%s:%s", m.localAddr, m.listen)
}
ln, err := net.Listen("tcp", addr)
if err != nil {
return
}
go func(stopC chan uint8) {
defer ln.Close()
for {
conn, err := ln.Accept()
select {
case <-stopC:
return
default:
}
if err != nil {
continue
}
go m.serveConn(conn, stopC)
}
}(m.httpStopC)
log.LogInfof("start server over...")
return
}
func (m *MetaNode) stopServer() {
if m.httpStopC != nil {
defer func() {
if r := recover(); r != nil {
log.LogErrorf("action[StopTcpServer],err:%v", r)
}
}()
close(m.httpStopC)
}
}
// serveConn reads data from the specified TCP connection until the connection is closed by the remote end or the TCP service is shut down.
func (m *MetaNode) serveConn(conn net.Conn, stopC chan uint8) {
defer func() {
conn.Close()
m.RemoveConnection()
}()
m.AddConnection()
c := conn.(*net.TCPConn)
c.SetKeepAlive(true)
c.SetNoDelay(true)
remoteAddr := conn.RemoteAddr().String()
for {
select {
case <-stopC:
return
default:
}
p := &Packet{}
if err := p.ReadFromConnWithVer(conn, proto.NoReadDeadlineTime); err != nil {
if err != io.EOF {
log.LogError("serve MetaNode: ", err.Error())
}
return
}
if err := m.handlePacket(conn, p, remoteAddr); err != nil {
log.LogErrorf("serve handlePacket fail: %v", err)
}
}
}
func (m *MetaNode) handlePacket(conn net.Conn, p *Packet,
remoteAddr string) (err error) {
// Handle request
err = m.metadataManager.HandleMetadataOperation(conn, p, remoteAddr)
return
}
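// startSmuxServer listens on the smux port derived from the TCP listen port by
// smuxPortShift and serves each accepted connection in its own goroutine.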
func (m *MetaNode) startSmuxServer() (err error) {
// initialize and start the server.
m.smuxStopC = make(chan uint8)
ipPort := fmt.Sprintf(":%s", m.listen)
if m.bindIp {
ipPort = fmt.Sprintf("%s:%s", m.localAddr, m.listen)
}
addr := util.ShiftAddrPort(ipPort, smuxPortShift)
ln, err := net.Listen("tcp", addr)
if err != nil {
return
}
go func(stopC chan uint8) {
defer ln.Close()
for {
conn, err := ln.Accept()
select {
case <-stopC:
return
default:
}
if err != nil {
continue
}
go m.serveSmuxConn(conn, stopC)
}
}(m.smuxStopC)
log.LogInfof("start Smux Server over...")
return
}
func (m *MetaNode) stopSmuxServer() {
if smuxPool != nil {
smuxPool.Close()
log.LogDebugf("action[stopSmuxServer] stop smux conn pool")
}
if m.smuxStopC != nil {
defer func() {
if r := recover(); r != nil {
log.LogErrorf("action[stopSmuxServer],err:%v", r)
}
}()
close(m.smuxStopC)
}
}
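// serveSmuxConn upgrades the accepted TCP connection to an smux session and serves
// every accepted stream in a separate goroutine until the service is stopped.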
func (m *MetaNode) serveSmuxConn(conn net.Conn, stopC chan uint8) {
defer func() {
conn.Close()
m.RemoveConnection()
}()
m.AddConnection()
c := conn.(*net.TCPConn)
c.SetKeepAlive(true)
c.SetNoDelay(true)
remoteAddr := conn.RemoteAddr().String()
var sess *smux.Session
var err error
sess, err = smux.Server(conn, smuxPoolCfg.Config)
if err != nil {
log.LogErrorf("action[serveSmuxConn] failed to serve smux connection, err(%v)", err)
return
}
defer sess.Close()
for {
select {
case <-stopC:
return
default:
}
stream, err := sess.AcceptStream()
if err != nil {
if util.FilterSmuxAcceptError(err) != nil {
log.LogErrorf("action[startSmuxService] failed to accept, err: %s", err)
} else {
log.LogInfof("action[startSmuxService] accept done, err: %s", err)
}
break
}
go m.serveSmuxStream(stream, remoteAddr, stopC)
}
}
func (m *MetaNode) serveSmuxStream(stream *smux.Stream, remoteAddr string, stopC chan uint8) {
for {
select {
case <-stopC:
return
default:
}
p := &Packet{}
if err := p.ReadFromConnWithVer(stream, proto.NoReadDeadlineTime); err != nil {
if err != io.EOF {
log.LogError("serve MetaNode: ", err.Error())
}
return
}
if err := m.handlePacket(stream, p, remoteAddr); err != nil {
log.LogErrorf("serve handlePacket fail: %v", err)
}
}
}
package metanode
import (
"bytes"
"encoding/json"
"sync"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util/log"
)
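// SortedExtents maintains an inode's extent keys ordered by FileOffset;
// all accesses are guarded by the embedded RWMutex.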
type SortedExtents struct {
sync.RWMutex
eks []proto.ExtentKey
}
func NewSortedExtents() *SortedExtents {
return &SortedExtents{
eks: make([]proto.ExtentKey, 0),
}
}
// attention: only used for deleted eks
func NewSortedExtentsFromEks(eks []proto.ExtentKey) *SortedExtents {
return &SortedExtents{
eks: eks,
}
}
func (se *SortedExtents) String() string {
se.RLock()
data, err := json.Marshal(se.eks)
se.RUnlock()
if err != nil {
return ""
}
return string(data)
}
func (se *SortedExtents) MarshalBinary(v3 bool) ([]byte, error) {
var data []byte
se.RLock()
defer se.RUnlock()
data = make([]byte, 0, proto.ExtentLength*len(se.eks))
for _, ek := range se.eks {
ekdata, err := ek.MarshalBinary(v3)
if err != nil {
return nil, err
}
data = append(data, ekdata...)
}
return data, nil
}
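// UnmarshalBinary decodes the extent keys in their original order. For split extents
// it also returns a map that counts, per extent ID, how many split keys were decoded.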
func (se *SortedExtents) UnmarshalBinary(data []byte, v3 bool) (err error, splitMap *sync.Map) {
se.Lock()
defer se.Unlock()
buf := bytes.NewBuffer(data)
for {
var ek proto.ExtentKey
if buf.Len() == 0 {
break
}
if err = ek.UnmarshalBinary(buf, v3); err != nil {
return
}
// Don't use se.Append here, since we need to retain the raw ek order.
se.eks = append(se.eks, ek)
if ek.IsSplit() {
if splitMap == nil {
splitMap = new(sync.Map)
}
val, ok := splitMap.Load(ek.GenerateId())
if !ok {
splitMap.Store(ek.GenerateId(), uint32(1))
continue
}
splitMap.Store(ek.GenerateId(), val.(uint32)+1)
}
}
return
}
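// Append inserts ek into the sorted extent list and returns the fully covered keys
// that reference a different physical extent than ek and therefore must be deleted.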
func (se *SortedExtents) Append(ek proto.ExtentKey) (deleteExtents []proto.ExtentKey) {
endOffset := ek.FileOffset + uint64(ek.Size)
se.Lock()
defer se.Unlock()
if len(se.eks) <= 0 {
se.eks = append(se.eks, ek)
return
}
lastKey := se.eks[len(se.eks)-1]
if lastKey.FileOffset+uint64(lastKey.Size) <= ek.FileOffset {
se.eks = append(se.eks, ek)
return
}
firstKey := se.eks[0]
if firstKey.FileOffset >= endOffset {
eks := se.doCopyExtents()
se.eks = se.eks[:0]
se.eks = append(se.eks, ek)
se.eks = append(se.eks, eks...)
return
}
var startIndex, endIndex int
invalidExtents := make([]proto.ExtentKey, 0)
for idx, key := range se.eks {
if ek.FileOffset > key.FileOffset {
startIndex = idx + 1
continue
}
if endOffset >= key.FileOffset+uint64(key.Size) {
invalidExtents = append(invalidExtents, key)
continue
}
break
}
endIndex = startIndex + len(invalidExtents)
upperExtents := make([]proto.ExtentKey, len(se.eks)-endIndex)
copy(upperExtents, se.eks[endIndex:])
se.eks = se.eks[:startIndex]
se.eks = append(se.eks, ek)
se.eks = append(se.eks, upperExtents...)
// check whether ek and key refer to the same extent file whose size was extended
deleteExtents = make([]proto.ExtentKey, 0, len(invalidExtents))
for _, key := range invalidExtents {
if key.PartitionId != ek.PartitionId || key.ExtentId != ek.ExtentId {
deleteExtents = append(deleteExtents, key)
}
}
return
}
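// storeEkSplit marks ek as a split extent and increases its reference count in ekRef,
// keyed by (PartitionId<<32 | ExtentId).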
func storeEkSplit(mpId uint64, inodeID uint64, ekRef *sync.Map, ek *proto.ExtentKey) (id uint64) {
if ekRef == nil {
log.LogErrorf("[storeEkSplit] mpId [%v] inodeID %v ekRef nil", mpId, inodeID)
return
}
log.LogDebugf("[storeEkSplit] mpId [%v] inode[%v] dp [%v] extent id[%v] ek [%v]", mpId, inodeID, ek.PartitionId, ek.ExtentId, ek)
id = ek.PartitionId<<32 | ek.ExtentId
var v uint32
if val, ok := ekRef.Load(id); !ok {
if ek.IsSplit() {
log.LogErrorf("[storeEkSplit] mpId [%v]inode id[%v] ek [%v] already be set split", mpId, inodeID, ek)
}
v = 1
} else {
v = val.(uint32) + 1
}
ek.SetSplit(true)
ekRef.Store(id, v)
log.LogDebugf("[storeEkSplit] mpId [%v] inode[%v] dp [%v] extent id[%v].key %v, cnt %v", mpId, inodeID, ek.PartitionId, ek.ExtentId,
ek.PartitionId<<32|ek.ExtentId, v)
return
}
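// SplitWithCheck splits the existing extent key that covers the range of ekSplit.
// Depending on whether the split range sits at the head, the tail or the middle of the
// matched key, the key is shrunk and ekSplit (plus a possible remainder key) is inserted;
// the carved-out range is returned in delExtents for deletion.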
func (se *SortedExtents) SplitWithCheck(mpId uint64, inodeID uint64, ekSplit proto.ExtentKey, ekRef *sync.Map) (delExtents []proto.ExtentKey, status uint8) {
status = proto.OpOk
endOffset := ekSplit.FileOffset + uint64(ekSplit.Size)
log.LogDebugf("[SplitWithCheck] mpId [%v]. inode[%v] ekSplit ek [%v]", mpId, inodeID, ekSplit)
se.Lock()
defer se.Unlock()
if len(se.eks) <= 0 {
log.LogErrorf("[SplitWithCheck] mpId [%v]. inode[%v] eks empty cann't find ek [%v]", mpId, inodeID, ekSplit)
status = proto.OpArgMismatchErr
return
}
lastKey := se.eks[len(se.eks)-1]
if lastKey.FileOffset+uint64(lastKey.Size) <= ekSplit.FileOffset {
log.LogErrorf("[SplitWithCheck] mpId [%v]. inode[%v] eks do split not found", mpId, inodeID)
status = proto.OpArgMismatchErr
return
}
firstKey := se.eks[0]
if firstKey.FileOffset >= endOffset {
log.LogErrorf("[SplitWithCheck] mpId [%v]. inode[%v] eks do split not found", mpId, inodeID)
status = proto.OpArgMismatchErr
return
}
var startIndex int
for idx, key := range se.eks {
if ekSplit.FileOffset >= key.FileOffset {
startIndex = idx + 1
continue
}
if endOffset >= key.FileOffset+uint64(key.Size) {
continue
}
break
}
if startIndex == 0 {
status = proto.OpArgMismatchErr
log.LogErrorf("[SplitWithCheck] mpId [%v]. inode[%v] should have no valid extent request [%v]", mpId, inodeID, ekSplit)
return
}
key := &se.eks[startIndex-1]
if !storage.IsTinyExtent(key.ExtentId) && (key.PartitionId != ekSplit.PartitionId || key.ExtentId != ekSplit.ExtentId) {
status = proto.OpArgMismatchErr
log.LogErrorf("SplitWithCheck. mpId [%v] inode[%v] key found with mismatch extent info [%v] request [%v]", mpId, inodeID, key, ekSplit)
return
}
keySize := key.Size
key.AddModGen()
if !key.IsSplit() {
storeEkSplit(mpId, inodeID, ekRef, key)
}
if ekSplit.FileOffset+uint64(ekSplit.Size) > key.FileOffset+uint64(key.Size) {
status = proto.OpArgMismatchErr
log.LogErrorf("SplitWithCheck. mpId [%v] inode[%v] request [%v] out scope of exist key [%v]", mpId, inodeID, ekSplit, key)
return
}
// Makes the request idempotent, just in case client retries.
if ekSplit.IsEqual(key) {
log.LogWarnf("SplitWithCheck. mpId [%v] request key %v is a repeat request", mpId, key)
return
}
delKey := *key
delKey.ExtentOffset = key.ExtentOffset + (ekSplit.FileOffset - key.FileOffset)
delKey.Size = ekSplit.Size
storeEkSplit(mpId, inodeID, ekRef, &delKey)
if ekSplit.Size == 0 {
log.LogErrorf("SplitWithCheck. mpId [%v] inode[%v] delKey %v,key %v, eksplit %v", mpId, inodeID, delKey, key, ekSplit)
}
delKey.FileOffset = ekSplit.FileOffset
delExtents = append(delExtents, delKey)
log.LogDebugf("SplitWithCheck. mpId [%v] inode[%v] key offset %v, split FileOffset %v, startIndex %v,key [%v], ekSplit[%v] delkey [%v]", mpId, inodeID,
key.FileOffset, ekSplit.FileOffset, startIndex, key, ekSplit, delKey)
if key.FileOffset == ekSplit.FileOffset { // at the begin
keyDup := *key
eks := make([]proto.ExtentKey, len(se.eks)-startIndex)
copy(eks, se.eks[startIndex:])
se.eks = se.eks[:startIndex-1]
var keyBefore *proto.ExtentKey
if len(se.eks) > 0 {
keyBefore = &se.eks[len(se.eks)-1]
log.LogDebugf("SplitWithCheck. mpId [%v].keyBefore. ek [%v] and ekSplit %v", mpId, keyBefore, ekSplit)
}
if keyBefore != nil && keyBefore.IsSequenceWithSameSeq(&ekSplit) {
log.LogDebugf("SplitWithCheck. mpId [%v]. inode[%v] keyBefore [%v], ekSplit [%v]", mpId, inodeID, keyBefore, ekSplit)
log.LogDebugf("SplitWithCheck. mpId [%v].merge.head. ek [%v] and %v", mpId, keyBefore, ekSplit)
keyBefore.Size += ekSplit.Size
} else {
se.eks = append(se.eks, ekSplit)
storeEkSplit(mpId, inodeID, ekRef, &ekSplit)
}
keyDup.FileOffset = keyDup.FileOffset + uint64(ekSplit.Size)
keyDup.ExtentOffset = keyDup.ExtentOffset + uint64(ekSplit.Size)
keyDup.Size = keySize - ekSplit.Size
if keyDup.Size == 0 {
log.LogErrorf("SplitWithCheck. mpId [%v] inode[%v] delKey %v,keyDup %v, eksplit %v", mpId, inodeID, delKey, keyDup, ekSplit)
}
se.eks = append(se.eks, keyDup)
se.eks = append(se.eks, eks...)
} else if key.FileOffset+uint64(key.Size) == ekSplit.FileOffset+uint64(ekSplit.Size) { // in the end
key.Size = keySize - ekSplit.Size
if key.Size == 0 {
log.LogErrorf("SplitWithCheck. mpId [%v].inode[%v] delKey %v,key %v, eksplit %v", mpId, inodeID, delKey, key, ekSplit)
}
eks := make([]proto.ExtentKey, len(se.eks[startIndex:]))
copy(eks, se.eks[startIndex:])
se.eks = se.eks[:startIndex]
if len(eks) > 0 && ekSplit.IsSequenceWithSameSeq(&eks[0]) {
log.LogDebugf("SplitWithCheck.mpId [%v].merge.end. ek [%v] and %v", mpId, ekSplit, eks[0])
eks[0].FileOffset = ekSplit.FileOffset
eks[0].ExtentOffset = ekSplit.ExtentOffset
eks[0].Size += ekSplit.Size
} else {
se.eks = append(se.eks, ekSplit)
storeEkSplit(mpId, inodeID, ekRef, &ekSplit)
}
se.eks = append(se.eks, eks...)
} else { // in the middle
key.Size = uint32(ekSplit.FileOffset - key.FileOffset)
if key.Size == 0 {
log.LogErrorf("SplitWithCheck. mpId [%v].inode[%v] delKey %v,key %v, eksplit %v", mpId, inodeID, delKey, key, ekSplit)
}
eks := make([]proto.ExtentKey, len(se.eks[startIndex:]))
copy(eks, se.eks[startIndex:])
se.eks = se.eks[:startIndex]
se.eks = append(se.eks, ekSplit)
storeEkSplit(mpId, inodeID, ekRef, &ekSplit)
mKey := &proto.ExtentKey{
FileOffset: ekSplit.FileOffset + uint64(ekSplit.Size),
PartitionId: key.PartitionId,
ExtentId: key.ExtentId,
ExtentOffset: key.ExtentOffset + uint64(key.Size) + uint64(ekSplit.Size),
Size: keySize - key.Size - ekSplit.Size,
// crc
SnapInfo: &proto.ExtSnapInfo{
VerSeq: key.GetSeq(),
ModGen: 0,
IsSplit: true,
},
}
se.eks = append(se.eks, *mKey)
storeEkSplit(mpId, inodeID, ekRef, mKey)
if keySize-key.Size-ekSplit.Size == 0 {
log.LogErrorf("SplitWithCheck. mpId [%v].inode[%v] keySize %v,key %v, eksplit %v", mpId, inodeID, keySize, key, ekSplit)
}
se.eks = append(se.eks, eks...)
}
return
}
func (se *SortedExtents) CheckAndAddRef(lastKey *proto.ExtentKey, currEk *proto.ExtentKey, addRefFunc func(*proto.ExtentKey)) (ok bool) {
if !lastKey.IsSameExtent(currEk) {
return
}
log.LogDebugf("action[AppendWithCheck.CheckAndAddRef] ek [%v],lastKey %v", currEk, lastKey)
if lastKey.FileOffset+uint64(lastKey.Size) <= currEk.FileOffset {
if !lastKey.IsSplit() {
addRefFunc(lastKey)
}
addRefFunc(currEk)
ok = true
return
}
if lastKey.FileOffset == currEk.FileOffset &&
lastKey.PartitionId == currEk.PartitionId &&
lastKey.ExtentId == currEk.ExtentId &&
lastKey.ExtentOffset == currEk.ExtentOffset && lastKey.Size < currEk.Size && lastKey.GetSeq() < currEk.GetSeq() {
log.LogDebugf("action[AppendWithCheck.CheckAndAddRef] split append key %v", currEk)
currEk.FileOffset = lastKey.FileOffset + uint64(lastKey.Size)
currEk.ExtentOffset = currEk.ExtentOffset + uint64(lastKey.Size)
currEk.Size = currEk.Size - lastKey.Size
log.LogDebugf("action[AppendWithCheck.CheckAndAddRef] after split append key %v", currEk)
if !lastKey.IsSplit() {
addRefFunc(lastKey)
}
addRefFunc(currEk)
ok = true
return
}
return
}
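// AppendWithCheck appends ek with conflict detection: the extent keys fully covered by
// ek must match the discard list supplied by the client (clientDiscardExts), otherwise
// OpConflictExtentsErr is returned.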
func (se *SortedExtents) AppendWithCheck(inodeID uint64, ek proto.ExtentKey, addRefFunc func(*proto.ExtentKey), clientDiscardExts []proto.ExtentKey) (deleteExtents []proto.ExtentKey, status uint8) {
status = proto.OpOk
endOffset := ek.FileOffset + uint64(ek.Size)
se.Lock()
defer se.Unlock()
log.LogDebugf("action[AppendWithCheck] ek [%v], clientDiscardExts [%v] se.eks [%v]", ek, clientDiscardExts, se.eks)
if len(se.eks) <= 0 {
se.eks = append(se.eks, ek)
return
}
idx := len(se.eks) - 1
tailKey := &se.eks[idx]
log.LogDebugf("action[AppendWithCheck] ek [%v],tailKey %v, clientDiscardExts [%v] se.eks [%v]", ek, tailKey, clientDiscardExts, se.eks)
if ok := se.CheckAndAddRef(tailKey, &ek, addRefFunc); ok {
se.eks = append(se.eks, ek)
return
}
firstKey := se.eks[0]
if firstKey.FileOffset >= endOffset {
se.insert(ek, 0)
return
}
var startIndex, endIndex int
invalidExtents := make([]proto.ExtentKey, 0)
for idx, key := range se.eks {
if ek.FileOffset > key.FileOffset {
startIndex = idx + 1
continue
}
if endOffset >= key.FileOffset+uint64(key.Size) {
invalidExtents = append(invalidExtents, key)
continue
}
break
}
// Makes the request idempotent, just in case client retries.
if len(invalidExtents) == 1 && invalidExtents[0].Equals(&ek) {
log.LogDebugf("action[AppendWithCheck] ek [%v]", ek)
return
}
// check whether ek and key refer to the same extent file whose size was extended
deleteExtents = make([]proto.ExtentKey, 0, len(invalidExtents))
for _, key := range invalidExtents {
if key.PartitionId != ek.PartitionId || key.ExtentId != ek.ExtentId || key.ExtentOffset != ek.ExtentOffset {
deleteExtents = append(deleteExtents, key)
}
}
log.LogDebugf("action[AppendWithCheck] invalidExtents(%v) deleteExtents(%v) discardExtents(%v)", invalidExtents, deleteExtents, clientDiscardExts)
if clientDiscardExts != nil {
if len(deleteExtents) != len(clientDiscardExts) {
log.LogErrorf("action[AppendWithCheck] OpConflictExtentsErr error. inode[%v] deleteExtents [%v] clientDiscardExts [%v]", inodeID, deleteExtents, clientDiscardExts)
return deleteExtents, proto.OpConflictExtentsErr
}
for i := 0; i < len(clientDiscardExts); i++ {
if deleteExtents[i].PartitionId != clientDiscardExts[i].PartitionId || deleteExtents[i].ExtentId != clientDiscardExts[i].ExtentId || deleteExtents[i].ExtentOffset != clientDiscardExts[i].ExtentOffset {
log.LogDebugf("action[AppendWithCheck] OpConflictExtentsErr error. inode[%v] idx %v deleteExtents[%v] clientDiscardExts [%v]", inodeID, i, deleteExtents[i], clientDiscardExts[i])
return deleteExtents, proto.OpConflictExtentsErr
}
}
} else if len(deleteExtents) != 0 {
log.LogDebugf("action[AppendWithCheck] OpConflictExtentsErr error. inode[%v] deleteExtents [%v]", inodeID, deleteExtents)
return deleteExtents, proto.OpConflictExtentsErr
}
defer func() {
if startIndex == 0 {
return
}
se.CheckAndAddRef(&se.eks[startIndex-1], &se.eks[startIndex], addRefFunc)
}()
if len(invalidExtents) == 0 {
se.insert(ek, startIndex)
return
}
endIndex = startIndex + len(invalidExtents)
se.instertWithDiscard(ek, startIndex, endIndex)
return
}
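// Truncate drops all extent keys at or beyond offset and returns them for deletion;
// if the last remaining key crosses the offset, it is shrunk and its tail is returned as well.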
func (se *SortedExtents) Truncate(offset uint64, doOnLastKey func(*proto.ExtentKey), insertRefMap func(ek *proto.ExtentKey)) (deleteExtents []proto.ExtentKey) {
var endIndex int
se.Lock()
defer se.Unlock()
endIndex = -1
for idx, key := range se.eks {
if key.FileOffset >= offset {
endIndex = idx
break
}
}
if endIndex < 0 {
deleteExtents = make([]proto.ExtentKey, 0)
} else {
deleteExtents = make([]proto.ExtentKey, len(se.eks)-endIndex)
copy(deleteExtents, se.eks[endIndex:])
se.eks = se.eks[:endIndex]
}
numKeys := len(se.eks)
if numKeys > 0 {
lastKey := &se.eks[numKeys-1]
if lastKey.FileOffset+uint64(lastKey.Size) > offset {
if doOnLastKey != nil {
doOnLastKey(&proto.ExtentKey{Size: uint32(lastKey.FileOffset + uint64(lastKey.Size) - offset)})
}
rsKey := &proto.ExtentKey{}
*rsKey = *lastKey
lastKey.Size = uint32(offset - lastKey.FileOffset)
if insertRefMap != nil {
insertRefMap(lastKey)
}
rsKey.Size -= lastKey.Size
rsKey.FileOffset += uint64(lastKey.Size)
rsKey.ExtentOffset += uint64(lastKey.Size)
if insertRefMap != nil {
insertRefMap(rsKey)
}
deleteExtents = append([]proto.ExtentKey{*rsKey}, deleteExtents...)
log.LogDebugf("SortedExtents.Truncate rsKey %v, deleteExtents %v", rsKey, deleteExtents)
}
}
return
}
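// insert places ek at position startIdx and shifts the following keys one slot to the right.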
func (se *SortedExtents) insert(ek proto.ExtentKey, startIdx int) {
se.eks = append(se.eks, ek)
size := len(se.eks)
for idx := size - 1; idx > startIdx; idx-- {
se.eks[idx] = se.eks[idx-1]
}
se.eks[startIdx] = ek
}
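// instertWithDiscard replaces the keys in [startIdx, endIdx) with ek and compacts the slice.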
func (se *SortedExtents) instertWithDiscard(ek proto.ExtentKey, startIdx, endIdx int) {
upperSize := len(se.eks) - endIdx
se.eks[startIdx] = ek
for idx := 0; idx < upperSize; idx++ {
se.eks[startIdx+1+idx] = se.eks[endIdx+idx]
}
se.eks = se.eks[:startIdx+1+upperSize]
}
func (se *SortedExtents) Len() int {
se.RLock()
defer se.RUnlock()
return len(se.eks)
}
// LayerSize returns the sum of the sizes of all extent keys (not the file size).
func (se *SortedExtents) LayerSize() (layerSize uint64) {
se.RLock()
defer se.RUnlock()
last := len(se.eks)
if last <= 0 {
return uint64(0)
}
for _, ek := range se.eks {
layerSize += uint64(ek.Size)
}
return
}
// Returns the file size
func (se *SortedExtents) Size() uint64 {
se.RLock()
defer se.RUnlock()
last := len(se.eks)
if last <= 0 {
return uint64(0)
}
return se.eks[last-1].FileOffset + uint64(se.eks[last-1].Size)
}
func (se *SortedExtents) Range(f func(index int, ek proto.ExtentKey) bool) {
se.RLock()
defer se.RUnlock()
for i, ek := range se.eks {
if !f(i, ek) {
break
}
}
}
func (se *SortedExtents) Clone() *SortedExtents {
newSe := NewSortedExtents()
se.RLock()
defer se.RUnlock()
newSe.eks = se.doCopyExtents()
return newSe
}
func (se *SortedExtents) CopyExtents() []proto.ExtentKey {
se.RLock()
defer se.RUnlock()
return se.doCopyExtents()
}
func (se *SortedExtents) CopyTinyExtents() []proto.ExtentKey {
se.RLock()
defer se.RUnlock()
return se.doCopyTinyExtents()
}
func (se *SortedExtents) doCopyExtents() []proto.ExtentKey {
eks := make([]proto.ExtentKey, len(se.eks))
copy(eks, se.eks)
return eks
}
func (se *SortedExtents) doCopyTinyExtents() []proto.ExtentKey {
eks := make([]proto.ExtentKey, 0)
for _, ek := range se.eks {
if storage.IsTinyExtent(ek.ExtentId) {
eks = append(eks, ek)
}
}
return eks
}
// discard code
func (se *SortedExtents) Delete(delEks []proto.ExtentKey) (curEks []proto.ExtentKey) {
se.Lock()
defer se.Unlock()
curEks = make([]proto.ExtentKey, 0, len(se.eks))
for _, key := range se.eks {
delFlag := false
for _, delKey := range delEks {
if key.FileOffset == delKey.ExtentOffset && key.ExtentId == delKey.ExtentId &&
key.ExtentOffset == delKey.ExtentOffset && key.PartitionId == delKey.PartitionId &&
key.Size == delKey.Size {
delFlag = true
break
}
}
if !delFlag {
curEks = append(curEks, key)
}
}
se.eks = curEks
return
}
package metanode
import (
"bytes"
"encoding/json"
"fmt"
"math"
"sync"
"github.com/cubefs/cubefs/proto"
)
type SortedObjExtents struct {
sync.RWMutex
eks []proto.ObjExtentKey
}
func NewSortedObjExtents() *SortedObjExtents {
return &SortedObjExtents{
eks: make([]proto.ObjExtentKey, 0),
}
}
func (se *SortedObjExtents) String() string {
se.RLock()
data, err := json.Marshal(se.eks)
se.RUnlock()
if err != nil {
return ""
}
return string(data)
}
func (se *SortedObjExtents) MarshalBinary() ([]byte, error) {
var data []byte
se.RLock()
defer se.RUnlock()
for _, ek := range se.eks {
ekdata, err := ek.MarshalBinary()
if err != nil {
return nil, err
}
data = append(data, ekdata...)
}
return data, nil
}
func (se *SortedObjExtents) UnmarshalBinary(data []byte) error {
var ek proto.ObjExtentKey
se.Lock()
defer se.Unlock()
buf := bytes.NewBuffer(data)
for {
if buf.Len() == 0 {
break
}
if err := ek.UnmarshalBinary(buf); err != nil {
return err
}
// Don't use se.Append here, since we need to retain the raw ek order.
se.eks = append(se.eks, ek)
}
return nil
}
// Append returns an error if the new ObjExtentKey overlaps with the existing keys.
func (se *SortedObjExtents) Append(ek proto.ObjExtentKey) (err error) {
se.Lock()
defer se.Unlock()
// 1. list is empty
if len(se.eks) <= 0 {
se.eks = append(se.eks, ek)
return
}
// 2. the last key's (FileOffset+Size) equals the new key's FileOffset, so it can be appended directly
lastKey := se.eks[len(se.eks)-1]
if (lastKey.FileOffset + lastKey.Size) == ek.FileOffset {
se.eks = append(se.eks, ek)
return
}
// otherwise, look for an existing key equal to the new one; if none is found, return an error.
for i := len(se.eks) - 1; i >= 0; i-- {
if ek.IsEquals(&se.eks[i]) {
return
}
if se.eks[i].FileOffset < ek.FileOffset {
break
}
}
err = fmt.Errorf("obj extentkeys exist overlay! the new obj extent key must be appended to the last position with offset [%d], new(%s)",
lastKey.FileOffset, ek.String())
return
}
func (se *SortedObjExtents) Clone() *SortedObjExtents {
newSe := NewSortedObjExtents()
se.RLock()
defer se.RUnlock()
newSe.eks = se.doCopyExtents()
return newSe
}
func (se *SortedObjExtents) doCopyExtents() []proto.ObjExtentKey {
eks := make([]proto.ObjExtentKey, len(se.eks))
copy(eks, se.eks)
return eks
}
func (se *SortedObjExtents) CopyExtents() []proto.ObjExtentKey {
se.RLock()
defer se.RUnlock()
return se.doCopyExtents()
}
// Returns the file size
func (se *SortedObjExtents) Size() uint64 {
se.RLock()
defer se.RUnlock()
last := len(se.eks)
if last <= 0 {
return uint64(0)
}
// TODO: maybe we should use ebs location's Size?
return se.eks[last-1].FileOffset + se.eks[last-1].Size
}
func (se *SortedObjExtents) Range(f func(ek proto.ObjExtentKey) bool) {
se.RLock()
defer se.RUnlock()
for _, ek := range se.eks {
if !f(ek) {
break
}
}
}
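// FindOffsetExist binary-searches the extent keys for one whose FileOffset equals
// fileOffset and returns whether it exists together with its index.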
func (se *SortedObjExtents) FindOffsetExist(fileOffset uint64) (bool, int) {
se.RLock()
defer se.RUnlock()
if len(se.eks) <= 0 {
return false, 0
}
left, right, mid := 0, len(se.eks)-1, 0
for {
mid = int(math.Floor(float64((left + right) / 2)))
if se.eks[mid].FileOffset > fileOffset {
right = mid - 1
} else if se.eks[mid].FileOffset < fileOffset {
left = mid + 1
} else {
return true, mid
}
if left > right {
break
}
}
return false, 0
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"encoding/binary"
"encoding/json"
"errors"
"fmt"
"net"
"strconv"
"strings"
"sync"
"time"
"golang.org/x/time/rate"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/btree"
"github.com/cubefs/cubefs/util/log"
)
// Rollback Type
const (
TxNoOp uint8 = iota
TxUpdate
TxDelete
TxAdd
)
func (i *TxRollbackInode) ToString() string {
content := fmt.Sprintf("{inode:[ino:%v, type:%v, nlink:%v], quotaIds:%v, rbType:%v"+
"txInodeInfo:[Ino:%v, MpID:%v, CreateTime:%v, Timeout:%v, TxID:%v, MpMembers:%v]}",
i.inode.Inode, i.inode.Type, i.inode.NLink, i.quotaIds, i.rbType, i.txInodeInfo.Ino, i.txInodeInfo.MpID,
i.txInodeInfo.CreateTime, i.txInodeInfo.Timeout, i.txInodeInfo.TxID, i.txInodeInfo.MpMembers)
return content
}
type TxRollbackInode struct {
inode *Inode
txInodeInfo *proto.TxInodeInfo
rbType uint8 // Rollback Type
quotaIds []uint32
}
// Less tests whether the current TxRollbackInode item is less than the given one.
func (i *TxRollbackInode) Less(than btree.Item) bool {
ti, ok := than.(*TxRollbackInode)
if !ok {
return false
}
if i.txInodeInfo != nil && ti.txInodeInfo != nil {
return i.txInodeInfo.Ino < ti.txInodeInfo.Ino
}
return i.inode.Inode < ti.inode.Inode
}
// Copy returns a copy of the TxRollbackInode.
func (i *TxRollbackInode) Copy() btree.Item {
item := i.inode.Copy()
txInodeInfo := *i.txInodeInfo
quotaIds := make([]uint32, len(i.quotaIds))
copy(quotaIds, i.quotaIds)
return &TxRollbackInode{
inode: item.(*Inode),
quotaIds: quotaIds,
txInodeInfo: &txInodeInfo,
rbType: i.rbType,
}
}
func (i *TxRollbackInode) Marshal() (result []byte, err error) {
buff := bytes.NewBuffer(make([]byte, 0, 256))
bs, err := i.inode.Marshal()
if err != nil {
return
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return
}
if _, err = buff.Write(bs); err != nil {
return
}
bs, err = i.txInodeInfo.Marshal()
if err != nil {
return
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err = buff.Write(bs); err != nil {
return
}
if err = binary.Write(buff, binary.BigEndian, &i.rbType); err != nil {
return
}
quotaBytes := bytes.NewBuffer(make([]byte, 0, 8))
for _, quotaId := range i.quotaIds {
if err = binary.Write(quotaBytes, binary.BigEndian, quotaId); err != nil {
return
}
}
_, err = buff.Write(quotaBytes.Bytes())
return buff.Bytes(), err
}
func (i *TxRollbackInode) Unmarshal(raw []byte) (err error) {
buff := bytes.NewBuffer(raw)
var dataLen uint32
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
data := make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
ino := NewInode(0, 0)
if err = ino.Unmarshal(data); err != nil {
return
}
i.inode = ino
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
data = make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
txInodeInfo := proto.NewTxInodeInfo("", 0, 0)
if err = txInodeInfo.Unmarshal(data); err != nil {
return
}
i.txInodeInfo = txInodeInfo
if err = binary.Read(buff, binary.BigEndian, &i.rbType); err != nil {
return
}
var quotaId uint32
for {
if buff.Len() == 0 {
break
}
if err = binary.Read(buff, binary.BigEndian, "aId); err != nil {
return
}
i.quotaIds = append(i.quotaIds, quotaId)
}
return
}
func NewTxRollbackInode(inode *Inode, quotaIds []uint32, txInodeInfo *proto.TxInodeInfo, rbType uint8) *TxRollbackInode {
return &TxRollbackInode{
inode: inode,
quotaIds: quotaIds,
txInodeInfo: txInodeInfo,
rbType: rbType,
}
}
type TxRollbackDentry struct {
dentry *Dentry
txDentryInfo *proto.TxDentryInfo
rbType uint8 // Rollback Type
}
func (d *TxRollbackDentry) ToString() string {
content := fmt.Sprintf("{dentry:[ParentId:%v, Name:%v, Inode:%v, Type:%v], rbType:%v, "+
"txDentryInfo:[ParentId:%v, Name:%v, MpMembers:%v, TxID:%v, MpID:%v, CreateTime:%v, Timeout:%v]}",
d.dentry.ParentId, d.dentry.Name, d.dentry.Inode, d.dentry.Type, d.rbType, d.txDentryInfo.ParentId, d.txDentryInfo.Name,
d.txDentryInfo.MpMembers, d.txDentryInfo.TxID, d.txDentryInfo.MpID, d.txDentryInfo.CreateTime, d.txDentryInfo.Timeout)
return content
}
// Less tests whether the current TxRollbackDentry item is less than the given one.
func (d *TxRollbackDentry) Less(than btree.Item) bool {
td, ok := than.(*TxRollbackDentry)
return ok && d.txDentryInfo.GetKey() < td.txDentryInfo.GetKey()
}
// Copy returns a copy of the TxRollbackDentry.
func (d *TxRollbackDentry) Copy() btree.Item {
item := d.dentry.Copy()
txDentryInfo := *d.txDentryInfo
return &TxRollbackDentry{
dentry: item.(*Dentry),
txDentryInfo: &txDentryInfo,
rbType: d.rbType,
}
}
func (d *TxRollbackDentry) Marshal() (result []byte, err error) {
buff := bytes.NewBuffer(make([]byte, 0, 512))
bs, err := d.dentry.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
log.LogDebugf("TxRollbackDentry Marshal dentry %v", d.dentry)
log.LogDebugf("TxRollbackDentry Marshal txDentryInfo %v", d.ToString())
bs, err = d.txDentryInfo.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, &d.rbType); err != nil {
return
}
return buff.Bytes(), nil
}
func (d *TxRollbackDentry) Unmarshal(raw []byte) (err error) {
buff := bytes.NewBuffer(raw)
var dataLen uint32
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
log.LogDebugf("TxRollbackDentry Unmarshal len %v", dataLen)
data := make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
dentry := &Dentry{}
if err = dentry.Unmarshal(data); err != nil {
return
}
log.LogDebugf("TxRollbackDentry Unmarshal dentry %v", dentry)
d.dentry = dentry
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
data = make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
txDentryInfo := proto.NewTxDentryInfo("", 0, "", 0)
if err = txDentryInfo.Unmarshal(data); err != nil {
return
}
d.txDentryInfo = txDentryInfo
if err = binary.Read(buff, binary.BigEndian, &d.rbType); err != nil {
return
}
return
}
func NewTxRollbackDentry(dentry *Dentry, txDentryInfo *proto.TxDentryInfo, rbType uint8) *TxRollbackDentry {
return &TxRollbackDentry{
dentry: dentry,
txDentryInfo: txDentryInfo,
rbType: rbType,
}
}
// TM (transaction manager)
type TransactionManager struct {
// need persistence and sync to all the raft members of the mp
txIdAlloc *TxIDAllocator
txTree *BTree
txProcessor *TransactionProcessor
blacklist *util.Set
opLimiter *rate.Limiter
sync.RWMutex
}
// RM (transaction resource manager)
type TransactionResource struct {
txRbInodeTree *BTree // key: inode id
txRbDentryTree *BTree // key: parentId_name
txProcessor *TransactionProcessor
sync.RWMutex
}
type TransactionProcessor struct {
txManager *TransactionManager // TM
txResource *TransactionResource // RM
mp *metaPartition
mask proto.TxOpMask
}
func (p *TransactionProcessor) Reset() {
p.txManager.Reset()
p.txResource.Reset()
}
func (p *TransactionProcessor) Pause() bool {
return p.mask == proto.TxPause
}
func NewTransactionManager(txProcessor *TransactionProcessor) *TransactionManager {
txMgr := &TransactionManager{
txIdAlloc: newTxIDAllocator(),
txTree: NewBtree(),
txProcessor: txProcessor,
blacklist: util.NewSet(),
opLimiter: rate.NewLimiter(rate.Inf, 128),
}
return txMgr
}
func NewTransactionResource(txProcessor *TransactionProcessor) *TransactionResource {
txRsc := &TransactionResource{
txRbInodeTree: NewBtree(),
txRbDentryTree: NewBtree(),
txProcessor: txProcessor,
}
return txRsc
}
func NewTransactionProcessor(mp *metaPartition) *TransactionProcessor {
txProcessor := &TransactionProcessor{
mp: mp,
}
txProcessor.txManager = NewTransactionManager(txProcessor)
txProcessor.txResource = NewTransactionResource(txProcessor)
if mp.config != nil {
go txProcessor.txManager.processExpiredTransactions()
}
return txProcessor
}
func (tm *TransactionManager) setLimit(val int) string {
if val > 0 {
tm.opLimiter.SetLimit(rate.Limit(val))
return fmt.Sprintf("%v", val)
}
tm.opLimiter.SetLimit(rate.Inf)
return "unlimited"
}
func (tm *TransactionManager) Reset() {
tm.blacklist.Clear()
tm.Lock()
tm.txIdAlloc.Reset()
tm.txTree.Reset()
tm.opLimiter.SetLimit(0)
tm.Unlock()
}
var test = false
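// processExpiredTransactions is the background loop of the transaction manager: while
// this node is the leader it scans the transaction tree every few seconds, driving
// commit, rollback, deletion or orphan cleanup according to each transaction's state,
// and clears the address blacklist once per minute.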
func (tm *TransactionManager) processExpiredTransactions() {
mpId := tm.txProcessor.mp.config.PartitionId
log.LogInfof("processExpiredTransactions for mp[%v] started", mpId)
clearInterval := time.Second * 60
clearTimer := time.NewTimer(clearInterval)
txCheckVal := time.Second * 3
txCheckTimer := time.NewTimer(txCheckVal)
defer func() {
log.LogWarnf("processExpiredTransactions for mp[%v] exit", mpId)
txCheckTimer.Stop()
clearTimer.Stop()
return
}()
for {
select {
case <-tm.txProcessor.mp.stopC:
log.LogDebugf("[processExpiredTransactions] deleteWorker stop partition: %v", mpId)
return
default:
}
if _, ok := tm.txProcessor.mp.IsLeader(); !ok && !test {
log.LogDebugf("processExpiredTransactions: not leader sleep 1s, mp %d", mpId)
time.Sleep(time.Second * 10)
continue
}
select {
case <-tm.txProcessor.mp.stopC:
log.LogWarnf("processExpiredTransactions for mp[%v] stopped", mpId)
return
case <-clearTimer.C:
tm.blacklist.Clear()
clearTimer.Reset(clearInterval)
log.LogDebugf("processExpiredTransactions: blacklist cleared, mp %d", mpId)
case <-txCheckTimer.C:
if tm.txProcessor.Pause() {
txCheckTimer.Reset(txCheckVal)
continue
}
tm.processTx()
txCheckTimer.Reset(txCheckVal)
}
}
}
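// processTx walks the transaction tree once and dispatches expired or finished
// transactions to commit, rollback, delete or orphan-clearing goroutines, with at most
// 32 of them in flight at a time.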
func (tm *TransactionManager) processTx() {
mpId := tm.txProcessor.mp.config.PartitionId
start := time.Now()
log.LogDebugf("processTx: mp[%v] mask %v", mpId, proto.GetMaskString(tm.txProcessor.mask))
defer func() {
log.LogDebugf("processTx: mp %d total cost %s", mpId, time.Since(start).String())
}()
limitCh := make(chan struct{}, 32)
var wg sync.WaitGroup
get := func() {
wg.Add(1)
limitCh <- struct{}{}
}
put := func() {
<-limitCh
wg.Done()
}
idx := 0
f := func(i BtreeItem) bool {
idx++
if idx%100 == 0 {
if _, ok := tm.txProcessor.mp.IsLeader(); !ok {
log.LogWarnf("processExpiredTransactions for mp[%v] already not leader and break tx tree traverse",
tm.txProcessor.mp.config.PartitionId)
return false
}
}
tx := i.(*proto.TransactionInfo)
rollbackFunc := func(skipSetStat bool) {
defer put()
status, err := tm.rollbackTx(tx.TxID, skipSetStat)
if err != nil || status != proto.OpOk {
log.LogWarnf("processExpiredTransactions: transaction (%v) expired, rolling back failed, status(%v), err(%v)",
tx, status, err)
return
}
if log.EnableDebug() {
log.LogDebugf("processExpiredTransactions: transaction (%v) expired, rolling back done", tx)
}
}
commitFunc := func() {
defer put()
status, err := tm.commitTx(tx.TxID, true)
if err != nil || status != proto.OpOk {
log.LogWarnf("processExpiredTransactions: transaction (%v) expired, commit failed, status(%v), err(%v)",
tx, status, err)
return
}
if log.EnableDebug() {
log.LogDebugf("processExpiredTransactions: transaction (%v) expired, commit done", tx)
}
}
delFunc := func() {
defer put()
status, err := tm.delTxFromRM(tx.TxID)
if err != nil || status != proto.OpOk {
log.LogWarnf("processExpiredTransactions: delTxFromRM (%v) expired, commit failed, status(%v), err(%v)",
tx, status, err)
return
}
if log.EnableDebug() {
log.LogDebugf("processExpiredTransactions: transaction (%v) delTxFromRM, commit done", tx)
}
}
clearOrphan := func() {
defer put()
tm.clearOrphanTx(tx)
if log.EnableDebug() {
log.LogDebugf("processExpiredTransactions: transaction (%v) clearOrphanTx", tx)
}
}
if tx.TmID != int64(mpId) {
if tx.CanDelete() {
if log.EnableDebug() {
log.LogDebugf("processExpiredTransactions: transaction (%v) can be deleted", tx)
}
get()
go delFunc()
return true
}
if tx.NeedClearOrphan() {
if log.EnableDebug() {
log.LogDebugf("processExpiredTransactions: orphan transaction (%v) can be clear", tx)
}
get()
go clearOrphan()
return true
}
if log.EnableDebug() {
log.LogDebugf("processExpiredTransactions: RM transaction (%v) is ongoing", tx)
}
return true
}
if tx.State == proto.TxStateCommit {
if log.EnableDebug() {
log.LogDebugf("processExpiredTransactions: transaction (%v) continue to commit...", tx)
}
get()
go commitFunc()
return true
}
if tx.State == proto.TxStateRollback {
if log.EnableDebug() {
log.LogDebugf("processExpiredTransactions: transaction (%v) continue to roll back...", tx)
}
get()
go rollbackFunc(true)
return true
}
if tx.State == proto.TxStatePreCommit {
if !tx.IsExpired() {
return true
}
if log.EnableDebug() {
log.LogDebugf("processExpiredTransactions: transaction (%v) expired, rolling back...", tx)
}
get()
go rollbackFunc(false)
return true
}
if tx.IsDone() {
if !tx.CanDelete() {
if log.EnableDebug() {
log.LogDebugf("processExpiredTransactions: transaction (%v) is ongoing", tx)
}
return true
}
if log.EnableDebug() {
log.LogDebugf("processExpiredTransactions: transaction (%v) can be deleted", tx)
}
get()
go delFunc()
return true
}
log.LogCriticalf("processExpiredTransactions: transaction (%v) is in state failed", tx)
return true
}
tm.txTree.GetTree().Ascend(f)
wg.Wait()
}
func (tm *TransactionManager) nextTxID() string {
id := tm.txIdAlloc.allocateTransactionID()
txId := fmt.Sprintf("%d_%d", tm.txProcessor.mp.config.PartitionId, id)
log.LogDebugf("nextTxID: txId:%v", txId)
return txId
}
func (tm *TransactionManager) txInRMDone(txId string) bool {
ifo := tm.getTransaction(txId)
if ifo == nil || ifo.Finish() {
log.LogWarnf("txInRMDone: tx in rm already done, txId %s, ifo %v", txId, ifo)
return true
}
return false
}
func (tm *TransactionManager) getTransaction(txID string) (txInfo *proto.TransactionInfo) {
txItem := proto.NewTxInfoBItem(txID)
item := tm.txTree.Get(txItem)
if item == nil {
return nil
}
txInfo = item.(*proto.TransactionInfo)
return
}
func (tm *TransactionManager) copyGetTx(txId string) (txInfo *proto.TransactionInfo) {
txItem := proto.NewTxInfoBItem(txId)
item := tm.txTree.CopyGet(txItem)
if item == nil {
return nil
}
txInfo = item.(*proto.TransactionInfo)
return
}
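// updateTxIdCursor parses a transaction ID of the form "<partitionID>_<seq>" and
// advances the local ID allocator if the sequence is larger than the current cursor.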
func (tm *TransactionManager) updateTxIdCursor(txId string) (err error) {
arr := strings.Split(txId, "_")
if len(arr) != 2 {
return fmt.Errorf("updateTxId: tx[%v] is invalid", txId)
}
id, err := strconv.ParseUint(arr[1], 10, 64)
if err != nil {
return fmt.Errorf("updateTxId: tx[%v] is invalid", txId)
}
if id > tm.txIdAlloc.getTransactionID() {
tm.txIdAlloc.setTransactionID(id)
}
return nil
}
func (tm *TransactionManager) addTxInfo(txInfo *proto.TransactionInfo) {
tm.txTree.ReplaceOrInsert(txInfo, true)
}
// registerTransaction is invoked on the TM to register a transaction initiated by a client request.
func (tm *TransactionManager) registerTransaction(txInfo *proto.TransactionInfo) (err error) {
if uint64(txInfo.TmID) == tm.txProcessor.mp.config.PartitionId {
if err := tm.updateTxIdCursor(txInfo.TxID); err != nil {
log.LogErrorf("updateTxIdCursor failed, txInfo %s, err %s", txInfo.String(), err.Error())
return err
}
for _, inode := range txInfo.TxInodeInfos {
inode.SetCreateTime(txInfo.CreateTime)
inode.SetTimeout(txInfo.Timeout)
inode.SetTxId(txInfo.TxID)
}
for _, dentry := range txInfo.TxDentryInfos {
dentry.SetCreateTime(txInfo.CreateTime)
dentry.SetTimeout(txInfo.Timeout)
dentry.SetTxId(txInfo.TxID)
}
}
if info := tm.getTransaction(txInfo.TxID); info != nil {
log.LogWarnf("tx is already exist, txId %s, info %v", txInfo.TxID, info.String())
return nil
}
tm.addTxInfo(txInfo)
if log.EnableDebug() {
log.LogDebugf("registerTransaction: txInfo(%v)", txInfo)
}
return
}
func (tm *TransactionManager) deleteTxInfo(txId string) (status uint8) {
tm.Lock()
defer tm.Unlock()
status = proto.OpOk
txItem := proto.NewTxInfoBItem(txId)
item := tm.txTree.Delete(txItem)
if log.EnableDebug() {
log.LogDebugf("deleteTxInfo: tx[%v] is deleted, item %v", txId, item)
}
return
}
func (tm *TransactionManager) rollbackTxInfo(txId string) (status uint8) {
tm.Lock()
defer tm.Unlock()
status = proto.OpOk
tx := tm.getTransaction(txId)
if tx == nil {
status = proto.OpTxInfoNotExistErr
log.LogWarnf("rollbackTxInfo: rollback tx[%v] failed, not found", txId)
return
}
tx.State = proto.TxStateRollbackDone
tx.DoneTime = time.Now().Unix()
log.LogDebugf("rollbackTxInfo: tx[%v] is rolled back", tx)
return
}
func (tm *TransactionManager) commitTxInfo(txId string) (status uint8, err error) {
tm.Lock()
defer tm.Unlock()
status = proto.OpOk
tx := tm.getTransaction(txId)
if tx == nil {
status = proto.OpTxInfoNotExistErr
err = fmt.Errorf("commitTxInfo: commit tx[%v] failed, not found", txId)
return
}
tx.State = proto.TxStateCommitDone
tx.DoneTime = time.Now().Unix()
log.LogDebugf("commitTxInfo: tx[%v] is committed", tx)
return
}
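// buildTxPacket builds a meta packet with the given opcode and partition ID and
// marshals data into its body.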
func buildTxPacket(data interface{}, mp uint64, op uint8) (pkt *proto.Packet, err error) {
pkt = proto.NewPacketReqID()
pkt.Opcode = op
pkt.PartitionID = mp
err = pkt.MarshalData(data)
if err != nil {
errInfo := fmt.Sprintf("buildTxPacket: marshal txInfo [%v] failed", data)
err = errors.New(errInfo)
log.LogErrorf("%v", errInfo)
return nil, err
}
return
}
func (tm *TransactionManager) setTransactionState(txId string, state int32) (status uint8, err error) {
var val []byte
var resp interface{}
status = proto.OpOk
stateReq := &proto.TxSetStateRequest{
TxID: txId,
State: state,
}
val, _ = json.Marshal(stateReq)
resp, err = tm.txProcessor.mp.submit(opFSMTxSetState, val)
if err != nil {
log.LogWarnf("setTransactionState: set transaction[%v] state to [%v] failed, err[%v]", txId, state, err)
return proto.OpAgain, err
}
status = resp.(uint8)
if status != proto.OpOk {
errInfo := fmt.Sprintf("setTransactionState: set transaction[%v] state to [%v] failed", txId, state)
err = errors.New(errInfo)
log.LogWarnf("%v", errInfo)
}
return
}
func (tm *TransactionManager) delTxFromRM(txId string) (status uint8, err error) {
req := proto.TxApplyRequest{
TxID: txId,
}
val, err := json.Marshal(req)
if err != nil {
return
}
resp, err := tm.txProcessor.mp.submit(opFSMTxDelete, val)
if err != nil {
log.LogWarnf("delTxFromRM: delTxFromRM transaction[%v] failed, err[%v]", txId, err)
return proto.OpAgain, err
}
status = resp.(uint8)
if log.EnableDebug() {
log.LogDebugf("delTxFromRM: tx[%v] is deleted successfully, status (%s)", txId, proto.GetStatusStr(status))
}
return
}
func (tm *TransactionManager) clearOrphanTx(tx *proto.TransactionInfo) {
log.LogWarnf("clearOrphanTx: start to clearOrphanTx, tx %v", tx)
// check txInfo whether exist in tm
req := &proto.TxGetInfoRequest{
Pid: uint64(tx.TmID),
TxID: tx.TxID,
}
pkt, err := buildTxPacket(req, req.Pid, proto.OpMetaTxGet)
if err != nil {
return
}
mps := tx.GroupByMp()
tmpMp, ok := mps[req.Pid]
if !ok {
log.LogErrorf("clearOrphanTx: can't get tm Mp info from tx, tx %v", tx)
return
}
status := tm.txSendToMpWithAddrs(tmpMp.Members, pkt)
if status != proto.OpTxInfoNotExistErr {
log.LogWarnf("clearOrphanTx: tx is still exist, tx %v, status %s", tx, proto.GetStatusStr(status))
return
}
log.LogWarnf("clearOrphanTx: find tx in tm already not exist, start clear it from rm, tx %v", tx)
aReq := &proto.TxApplyRMRequest{
PartitionID: req.Pid,
TransactionInfo: tx,
}
newPkt := &Packet{}
err = tm.txProcessor.mp.TxRollbackRM(aReq, newPkt)
log.LogWarnf("clearOrphanTx: finally rollback tx in rm, tx %v, status %s, err %v",
tx, newPkt.GetResultMsg(), err)
return
}
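// commitTx drives the commit on the TM side: it moves the transaction to TxStateCommit,
// notifies all involved RMs, and finally applies the commit through raft.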
func (tm *TransactionManager) commitTx(txId string, skipSetStat bool) (status uint8, err error) {
tx := tm.getTransaction(txId)
if tx == nil {
status = proto.OpTxInfoNotExistErr
log.LogWarnf("commitTx: tx[%v] not found, already success", txId)
return
}
if tx.State == proto.TxStateCommitDone {
status = proto.OpOk
log.LogWarnf("commitTx: tx[%v] is already commit", txId)
return
}
// 1.set transaction to TxStateCommit
if !skipSetStat && tx.State != proto.TxStateCommit {
status, err = tm.setTransactionState(txId, proto.TxStateCommit)
if status != proto.OpOk {
log.LogWarnf("commitTx: set transaction[%v] state to TxStateCommit failed", tx)
return
}
}
// 2. notify all related RMs that a transaction is completed
status = tm.sendToRM(tx, proto.OpTxCommitRM)
if status != proto.OpOk {
return
}
// 3. TM commit the transaction
req := proto.TxApplyRequest{
TxID: txId,
}
val, err := json.Marshal(req)
if err != nil {
return
}
resp, err := tm.txProcessor.mp.submit(opFSMTxCommit, val)
if err != nil {
log.LogWarnf("commitTx: commit transaction[%v] failed, err[%v]", txId, err)
return proto.OpAgain, err
}
status = resp.(uint8)
log.LogDebugf("commitTx: tx[%v] is commited successfully", txId)
return
}
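// sendToRM notifies every meta partition involved in the transaction (including the
// local one) with the given commit or rollback opcode and aggregates the returned
// statuses, tolerating "already done" results.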
func (tm *TransactionManager) sendToRM(txInfo *proto.TransactionInfo, op uint8) (status uint8) {
status = proto.OpOk
mpIfos := txInfo.GroupByMp()
statusCh := make(chan uint8, len(mpIfos))
wg := sync.WaitGroup{}
mp := tm.txProcessor.mp
for mpId, ifo := range mpIfos {
req := &proto.TxApplyRMRequest{
VolName: mp.config.VolName,
PartitionID: mpId,
TransactionInfo: txInfo,
}
wg.Add(1)
pkt, _ := buildTxPacket(req, mpId, op)
if mp.config.PartitionId == mpId {
pt := &Packet{*pkt}
go func() {
defer wg.Done()
var err error
if op == proto.OpTxCommitRM {
err = mp.TxCommitRM(req, pt)
} else {
err = mp.TxRollbackRM(req, pt)
}
statusCh <- pt.ResultCode
if pt.ResultCode != proto.OpOk {
log.LogWarnf("sendToRM: invoke TxCommitRM failed, ifo %v, pkt %s, err %v", txInfo, pt.GetResultMsg(), err)
}
}()
continue
}
members := ifo.Members
go func() {
defer wg.Done()
status := tm.txSendToMpWithAddrs(members, pkt)
if status != proto.OpOk {
log.LogWarnf("sendToRM: send to rm failed, addr %s, pkt %s, status %s",
members, string(pkt.Data), proto.GetStatusStr(status))
}
statusCh <- status
}()
}
wg.Wait()
close(statusCh)
updateStatus := func(st uint8) uint8 {
if st == proto.OpTxConflictErr || st == proto.OpTxInfoNotExistErr {
log.LogWarnf("sendToRM: might have already been committed, tx[%v], status (%s)", txInfo, proto.GetStatusStr(st))
return proto.OpOk
} else if st == proto.OpTxRbInodeNotExistErr || st == proto.OpTxRbDentryNotExistErr {
log.LogWarnf("sendToRM: already done before or not add, tx[%v], status (%s)", txInfo, proto.GetStatusStr(st))
return proto.OpOk
} else {
return st
}
}
for st := range statusCh {
t := updateStatus(st)
if t != proto.OpOk {
return t
}
}
return status
}
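// rollbackTx mirrors commitTx for the rollback path: it moves the transaction to
// TxStateRollback, notifies all involved RMs, and applies the rollback through raft.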
func (tm *TransactionManager) rollbackTx(txId string, skipSetStat bool) (status uint8, err error) {
status = proto.OpOk
tx := tm.getTransaction(txId)
if tx == nil {
log.LogWarnf("commitTx: tx[%v] not found, already success", txId)
return
}
if tx.State == proto.TxStateRollbackDone {
status = proto.OpOk
log.LogWarnf("commitTx: tx[%v] is already rollback", txId)
return
}
// 1.set transaction to TxStateRollback
if !skipSetStat && tx.State != proto.TxStateRollback {
status, err = tm.setTransactionState(txId, proto.TxStateRollback)
if status != proto.OpOk {
log.LogWarnf("commitTransaction: set transaction[%v] state to TxStateCommit failed", tx)
return
}
}
// 2. notify all related RMs that a transaction is completed
status = tm.sendToRM(tx, proto.OpTxRollbackRM)
if status != proto.OpOk {
return
}
req := proto.TxApplyRequest{
TxID: txId,
}
val, err := json.Marshal(req)
if err != nil {
return
}
resp, err := tm.txProcessor.mp.submit(opFSMTxRollback, val)
if err != nil {
log.LogWarnf("commitTx: rollback transaction[%v] failed, err[%v]", txId, err)
return proto.OpAgain, err
}
status = resp.(uint8)
log.LogDebugf("commitTx: tx[%v] is rollback successfully, msg %s", txId, proto.GetStatusStr(status))
return
}
func (tm *TransactionManager) sendPacketToMP(addr string, p *proto.Packet) (err error) {
var (
mConn *net.TCPConn
reqID = p.ReqID
reqOp = p.Opcode
)
connPool := tm.txProcessor.mp.manager.connPool
defer func() {
connPool.PutConnect(mConn, err != nil)
if err != nil {
p.PacketErrorWithBody(proto.OpErr, []byte(err.Error()))
log.LogErrorf("[sendPacketToMP]: req: %d - %v, %v, packet(%v)", p.GetReqID(),
p.GetOpMsg(), err, p)
return
}
}()
mConn, err = connPool.GetConnect(addr)
if err != nil {
return
}
if err = p.WriteToConn(mConn); err != nil {
return
}
// read the response from the remote meta partition
if err = p.ReadFromConn(mConn, proto.ReadDeadlineTime); err != nil {
return
}
if reqID != p.ReqID || reqOp != p.Opcode {
err = fmt.Errorf("sendPacketToMP: send and received packet mismatch: req(%v_%v) resp(%v_%v)",
reqID, reqOp, p.ReqID, p.Opcode)
return
}
if log.EnableDebug() {
log.LogDebugf("[sendPacketToMP] req: %d - %v, resp: %v, packet(%v)", p.GetReqID(), p.GetOpMsg(),
p.GetResultMsg(), p)
}
return
}
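// txSendToMpWithAddrs tries the comma-separated member addresses one by one, skipping
// blacklisted addresses first and falling back to them afterwards; it returns the first
// definitive status, or OpAgain if every attempt failed.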
func (tm *TransactionManager) txSendToMpWithAddrs(addrStr string, p *proto.Packet) (status uint8) {
addrs := strings.Split(addrStr, ",")
var err error
skippedAddrs := make([]string, 0)
for _, addr := range addrs {
if tm.blacklist.Has(addr) {
log.LogWarnf("txSendToMpWithAddrs: addr[%v] is already blacklisted, retry another addr, p %s", addr, string(p.Data))
skippedAddrs = append(skippedAddrs, addr)
continue
}
newPkt := p.GetCopy()
err = tm.sendPacketToMP(addr, newPkt)
if err != nil {
tm.blacklist.Add(addr)
log.LogWarnf("txSendToMpWithAddrs: send to %v failed, err(%s), add to blacklist and retry another addr, p %s",
addr, err.Error(), string(p.Data))
continue
}
status := newPkt.ResultCode
if status == proto.OpErr || status == proto.OpAgain {
log.LogWarnf("txSendToMpWithAddrs: sendPacketToMp failed, addr %s, msg %s, data %s, status(%s)",
addr, newPkt.GetResultMsg(), string(p.Data), proto.GetStatusStr(status))
continue
}
if status == proto.OpOk {
if log.EnableDebug() {
log.LogDebugf("txSendToMpWithAddrs: send to %v done with status[%v], tx[%s]",
addr, status, string(p.Data))
}
err = nil
return status
}
log.LogWarnf("txSendToMpWithAddrs: sendPacketToMp failed, addr %s, msg %s, data %s, status %s",
addr, newPkt.GetResultMsg(), string(p.Data), proto.GetStatusStr(status))
return status
}
// try use skipped addr
for _, addr := range skippedAddrs {
newPkt := p.GetCopy()
err = tm.sendPacketToMP(addr, newPkt)
if err != nil {
log.LogWarnf("txSendToMpWithAddrs: send to %v failed, err(%s), add to blacklist and retry another addr, p %s",
addr, err.Error(), string(p.Data))
continue
}
status := newPkt.ResultCode
if status == proto.OpErr || status == proto.OpAgain {
log.LogWarnf("txSendToMpWithAddrs: sendPacketToMp failed, addr %s, msg %s, data %s, status(%s)",
addr, newPkt.GetResultMsg(), string(p.Data), proto.GetStatusStr(status))
continue
}
if status == proto.OpOk {
if log.EnableDebug() {
log.LogDebugf("txSendToMpWithAddrs: send to %v done with status[%v], tx[%s]",
addr, status, string(p.Data))
}
err = nil
return status
}
log.LogWarnf("txSendToMpWithAddrs: sendPacketToMp failed, addr %s, msg %s, data %s, status %s",
addr, newPkt.GetResultMsg(), string(p.Data), proto.GetStatusStr(status))
return status
}
log.LogWarnf("txSendToMpWithAddrs: after retry still failed, return opAgain, pkt %s, addrs %v, err %v, status %s",
string(p.Data), addrs, err, proto.GetStatusStr(status))
return proto.OpAgain
}
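// txSetState validates and applies a transaction state change requested by the
// TM: the target state must lie within [TxStateCommit, TxStateFailed], and a
// transaction may only be moved to commit or rollback from the pre-commit state
// (or when it is already in the requested state).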
func (tm *TransactionManager) txSetState(req *proto.TxSetStateRequest) (status uint8, err error) {
tm.Lock()
defer tm.Unlock()
status = proto.OpOk
txItem := proto.NewTxInfoBItem(req.TxID)
item := tm.txTree.CopyGet(txItem)
if item == nil {
status = proto.OpTxInfoNotExistErr
errInfo := fmt.Sprintf("txSetState: set state failed, req[%v] tx not existed", req)
err = errors.New(errInfo)
log.LogErrorf("%v", errInfo)
return
}
txInfo := item.(*proto.TransactionInfo)
if req.State == proto.TxStateCommit && txInfo.State == proto.TxStateCommitDone {
log.LogWarnf("txSetState: tx is already success before set commit state, tx %v", txInfo)
status = proto.OpOk
return
}
if req.State < proto.TxStateCommit || req.State > proto.TxStateFailed {
status = proto.OpTxSetStateErr
errInfo := fmt.Sprintf("txSetState: set state failed, wrong state, req[%v]", req)
err = errors.New(errInfo)
log.LogErrorf("%v", errInfo)
return
}
if req.State == proto.TxStateCommit && txInfo.State != proto.TxStateCommit && txInfo.State != proto.TxStatePreCommit {
status = proto.OpTxSetStateErr
errInfo := fmt.Sprintf("txSetState: set state failed, wrong state, tx state[%v], req state[%v], tx[%v]",
txInfo.State, req.State, req.TxID)
err = errors.New(errInfo)
log.LogErrorf("%v", errInfo)
return
}
if req.State == proto.TxStateRollback && txInfo.State != proto.TxStateRollback && txInfo.State != proto.TxStatePreCommit {
status = proto.OpTxSetStateErr
errInfo := fmt.Sprintf("txSetState: set state failed, wrong state, tx state[%v], req state[%v], tx[%v]",
txInfo.State, req.State, req.TxID)
err = errors.New(errInfo)
log.LogErrorf("%v", errInfo)
return
}
log.LogDebugf("txSetState: set tx state from [%v] to [%v], tx[%v]", txInfo.State, req.State, req.TxID)
txInfo.State = req.State
return
}
func (tr *TransactionResource) Reset() {
tr.Lock()
defer tr.Unlock()
tr.txRbInodeTree.Reset()
tr.txRbDentryTree.Reset()
tr.txProcessor = nil
}
// check whether the item (inode or dentry) is being modified by an ongoing transaction
func (tr *TransactionResource) isInodeInTransction(ino *Inode) (inTx bool, txID string) {
// return true only if the specified inode is in an ongoing transaction (not yet expired)
tr.Lock()
defer tr.Unlock()
if rbInode := tr.getTxRbInode(ino.Inode); rbInode != nil {
inTx = true
if rbInode.txInodeInfo != nil {
txID = rbInode.txInodeInfo.TxID
}
return
}
return false, ""
}
func (tr *TransactionResource) isDentryInTransction(dentry *Dentry) (inTx bool, txID string) {
tr.Lock()
defer tr.Unlock()
if rbDentry := tr.getTxRbDentry(dentry.ParentId, dentry.Name); rbDentry != nil {
inTx = true
if rbDentry.txDentryInfo != nil {
txID = rbDentry.txDentryInfo.TxID
}
return
}
return false, ""
}
func (tr *TransactionResource) getTxRbInode(ino uint64) (rbInode *TxRollbackInode) {
keyNode := &TxRollbackInode{
inode: NewInode(ino, 0),
}
item := tr.txRbInodeTree.Get(keyNode)
if item == nil {
return nil
}
rbInode = item.(*TxRollbackInode)
return
}
func (tr *TransactionResource) copyGetTxRbInode(ino uint64) (rbInode *TxRollbackInode) {
keyNode := &TxRollbackInode{
inode: NewInode(ino, 0),
}
item := tr.txRbInodeTree.CopyGet(keyNode)
if item == nil {
return nil
}
rbInode = item.(*TxRollbackInode)
return
}
func (tr *TransactionResource) deleteTxRollbackInode(ino uint64, txId string) (status uint8) {
tr.Lock()
defer tr.Unlock()
keyNode := &TxRollbackInode{
txInodeInfo: proto.NewTxInodeInfo("", ino, 0),
}
item := tr.txRbInodeTree.Get(keyNode)
if item == nil {
log.LogWarnf("deleteTxRollbackInode: rollback inode may be already been deleted, inode %d, txId %s",
ino, txId)
return proto.OpTxRbInodeNotExistErr
}
if item.(*TxRollbackInode).txInodeInfo.TxID != txId {
log.LogWarnf("deleteTxRollbackInode: rollback inode has already been updated by another transaction, txId %s, item %v",
txId, item)
return proto.OpTxRbInodeNotExistErr
}
tr.txRbInodeTree.Delete(item)
return proto.OpOk
}
// RM adds a `TxRollbackInode` into `txRollbackInodes`
func (tr *TransactionResource) addTxRollbackInode(rbInode *TxRollbackInode) (status uint8) {
tr.Lock()
defer tr.Unlock()
oldRbInode := tr.getTxRbInode(rbInode.inode.Inode)
if oldRbInode != nil {
if oldRbInode.txInodeInfo.TxID == rbInode.txInodeInfo.TxID {
log.LogWarnf("addTxRollbackInode: rollback inode [ino(%v) txID(%v)] is already exists",
rbInode.inode.Inode, rbInode.txInodeInfo.TxID)
return proto.OpExistErr
} else {
log.LogErrorf("addTxRollbackInode: rollback inode [ino(%v) txID(%v)] "+
"is conflicted with inode [ino(%v) txID(%v)]",
rbInode.inode.Inode, rbInode.txInodeInfo.TxID, oldRbInode.inode.Inode, oldRbInode.txInodeInfo.TxID)
return proto.OpTxConflictErr
}
}
tr.txRbInodeTree.ReplaceOrInsert(rbInode, true)
log.LogDebugf("addTxRollbackInode: rollback inode [ino(%v) txID(%v)] is added", rbInode.inode.Inode, rbInode.txInodeInfo.TxID)
return proto.OpOk
}
func (tr *TransactionResource) getTxRbDentry(pId uint64, name string) *TxRollbackDentry {
keyNode := &TxRollbackDentry{
txDentryInfo: proto.NewTxDentryInfo("", pId, name, 0),
}
item := tr.txRbDentryTree.Get(keyNode)
if item == nil {
return nil
}
return item.(*TxRollbackDentry)
}
func (tr *TransactionResource) deleteTxRollbackDentry(pid uint64, name, txId string) (status uint8) {
tr.Lock()
defer tr.Unlock()
keyNode := &TxRollbackDentry{
txDentryInfo: proto.NewTxDentryInfo("", pid, name, 0),
}
item := tr.txRbDentryTree.Get(keyNode)
if item == nil {
log.LogWarnf("deleteTxRollbackDentry: rollback dentry may be already been deleted, pid %d, name %s, txId %s",
pid, name, txId)
return proto.OpTxRbDentryNotExistErr
}
if item.(*TxRollbackDentry).txDentryInfo.TxID != txId {
log.LogWarnf("deleteTxRollbackDentry: rollback dentry is already been update by other, txId %s, item %v",
txId, name)
return proto.OpTxRbDentryNotExistErr
}
tr.txRbDentryTree.Delete(item)
return proto.OpOk
}
// RM adds a `TxRollbackDentry` into `txRollbackDentries`
func (tr *TransactionResource) addTxRollbackDentry(rbDentry *TxRollbackDentry) (status uint8) {
tr.Lock()
defer tr.Unlock()
oldRbDentry := tr.getTxRbDentry(rbDentry.txDentryInfo.ParentId, rbDentry.dentry.Name)
if oldRbDentry != nil {
if oldRbDentry.txDentryInfo.TxID == rbDentry.txDentryInfo.TxID {
log.LogWarnf("addTxRollbackDentry: rollback dentry [pino(%v) name(%v) txID(%v)] is already exists",
rbDentry.dentry.ParentId, rbDentry.dentry.Name, rbDentry.txDentryInfo.TxID)
return proto.OpExistErr
}
log.LogWarnf("addTxRollbackDentry: rollback dentry [pino(%v) name(%v) txID(%v) rbType(%v)] "+
"is conflicted with dentry [pino(%v) name(%v) txID(%v) rbType(%v)]",
rbDentry.dentry.ParentId, rbDentry.dentry.Name, rbDentry.txDentryInfo.TxID, rbDentry.rbType,
oldRbDentry.dentry.ParentId, oldRbDentry.dentry.Name, oldRbDentry.txDentryInfo.TxID, oldRbDentry.rbType)
return proto.OpTxConflictErr
}
tr.txRbDentryTree.ReplaceOrInsert(rbDentry, true)
log.LogDebugf("addTxRollbackDentry: rollback dentry [pino(%v) name(%v) txID(%v) rbType(%v)] is added",
rbDentry.dentry.ParentId, rbDentry.dentry.Name, rbDentry.txDentryInfo.TxID, rbDentry.rbType)
return proto.OpOk
}
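// rollbackInodeInternal applies the rollback action recorded in rbInode: with
// rbType TxAdd the saved inode is put back into the inode tree (or its link
// count is bumped when a live copy still exists), with rbType TxDelete the
// inode is unlinked and evicted again. The rollback record is then removed
// from txRbInodeTree.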
func (tr *TransactionResource) rollbackInodeInternal(rbInode *TxRollbackInode) (status uint8, err error) {
status = proto.OpOk
mp := tr.txProcessor.mp
switch rbInode.rbType {
case TxAdd:
var ino *Inode
item := mp.inodeTree.CopyGet(rbInode.inode)
if item != nil {
ino = item.(*Inode)
}
if item == nil || ino.IsTempFile() || ino.ShouldDelete() {
mp.freeList.Remove(rbInode.inode.Inode)
if mp.uidManager != nil {
mp.uidManager.addUidSpace(rbInode.inode.Uid, rbInode.inode.Inode, rbInode.inode.Extents.eks)
}
if mp.mqMgr != nil && len(rbInode.quotaIds) > 0 && item == nil {
mp.setInodeQuota(rbInode.quotaIds, rbInode.inode.Inode)
for _, quotaId := range rbInode.quotaIds {
mp.mqMgr.updateUsedInfo(int64(rbInode.inode.Size), 1, quotaId)
}
}
mp.inodeTree.ReplaceOrInsert(rbInode.inode, true)
} else {
ino.IncNLink(mp.verSeq)
}
case TxDelete:
if rsp := tr.txProcessor.mp.getInode(rbInode.inode, false); rsp.Status == proto.OpOk {
if tr.txProcessor.mp.uidManager != nil {
tr.txProcessor.mp.uidManager.doMinusUidSpace(rbInode.inode.Uid, rbInode.inode.Inode, rbInode.inode.Size)
}
if tr.txProcessor.mp.mqMgr != nil && len(rbInode.quotaIds) > 0 {
for _, quotaId := range rbInode.quotaIds {
tr.txProcessor.mp.mqMgr.updateUsedInfo(-1*int64(rbInode.inode.Size), -1, quotaId)
}
}
tr.txProcessor.mp.fsmUnlinkInode(rbInode.inode, 0)
tr.txProcessor.mp.fsmEvictInode(rbInode.inode)
}
default:
status = proto.OpTxRollbackUnknownRbType
err = fmt.Errorf("rollbackInode: unknown rbType %d", rbInode.rbType)
return
}
tr.txRbInodeTree.Delete(rbInode)
return
}
// RM rolls back an inode; retried if an error occurs
func (tr *TransactionResource) rollbackInode(req *proto.TxInodeApplyRequest) (status uint8, err error) {
tr.Lock()
defer tr.Unlock()
status = proto.OpOk
rbInode := tr.getTxRbInode(req.Inode)
if rbInode == nil {
status = proto.OpTxRbInodeNotExistErr
errInfo := fmt.Sprintf("rollbackInode: roll back inode[%v] failed, txID[%v], rb inode not found", req.Inode, req.TxID)
err = errors.New(errInfo)
log.LogErrorf("%v", errInfo)
return
}
if rbInode.txInodeInfo.TxID != req.TxID {
status = proto.OpTxConflictErr
errInfo := fmt.Sprintf("rollbackInode: txID %v is not matching txInodeInfo txID %v", req.TxID, rbInode.txInodeInfo.TxID)
err = errors.New(errInfo)
log.LogErrorf("%v", errInfo)
return
}
status, err = tr.rollbackInodeInternal(rbInode)
if err != nil {
log.LogErrorf("rollbackInode: inode[%v] roll back failed in tx[%v], rbType[%v]", req.Inode, req.TxID, rbInode.rbType)
} else {
log.LogDebugf("rollbackInode: inode[%v] is rolled back in tx[%v], rbType[%v]", req.Inode, req.TxID, rbInode.rbType)
}
return
}
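// rollbackDentryInternal applies the rollback action recorded in rbDentry:
// rbType TxAdd re-creates the dentry, TxDelete removes it, and TxUpdate restores
// the previous dentry. The rollback record is then removed from txRbDentryTree.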
func (tr *TransactionResource) rollbackDentryInternal(rbDentry *TxRollbackDentry) (status uint8, err error) {
defer func() {
if status != proto.OpOk {
log.LogErrorf("rollbackDentryInternal: rollback dentry failed, ifo %v", rbDentry.txDentryInfo)
}
}()
status = proto.OpOk
switch rbDentry.rbType {
case TxAdd:
// the flag must be true so that the link count does not change.
status = tr.txProcessor.mp.fsmCreateDentry(rbDentry.dentry, true)
case TxDelete:
resp := tr.txProcessor.mp.fsmDeleteDentry(rbDentry.dentry, true)
status = resp.Status
case TxUpdate:
resp := tr.txProcessor.mp.fsmUpdateDentry(rbDentry.dentry)
status = resp.Status
default:
status = proto.OpTxRollbackUnknownRbType
err = fmt.Errorf("rollbackDentry: unknown rbType %d", rbDentry.rbType)
return
}
tr.txRbDentryTree.Delete(rbDentry)
return
}
// RM rolls back a dentry; retried if an error occurs
func (tr *TransactionResource) rollbackDentry(req *proto.TxDentryApplyRequest) (status uint8, err error) {
tr.Lock()
defer tr.Unlock()
status = proto.OpOk
rbDentry := tr.getTxRbDentry(req.Pid, req.Name)
if rbDentry == nil {
status = proto.OpTxRbDentryNotExistErr
errInfo := fmt.Sprintf("rollbackDentry: roll back dentry[%v_%v] failed, rb inode not found, txID[%v]",
req.Pid, req.Name, req.TxID)
err = errors.New(errInfo)
log.LogWarnf("%v", errInfo)
return
}
if rbDentry.txDentryInfo.TxID != req.TxID {
status = proto.OpTxConflictErr
errInfo := fmt.Sprintf("rollbackDentry: txID %v is not matching txInodeInfo txID %v", req.TxID, rbDentry.txDentryInfo.TxID)
err = errors.New(errInfo)
log.LogWarnf("%v", errInfo)
return
}
status, err = tr.rollbackDentryInternal(rbDentry)
if err != nil {
log.LogErrorf("rollbackDentry: denKey[%v] roll back failed in tx[%v], rbType[%v]",
rbDentry.txDentryInfo.GetKey(), req.TxID, rbDentry.rbType)
} else {
log.LogDebugf("rollbackDentry: denKey[%v] is rolled back in tx[%v], rbType[%v]",
rbDentry.txDentryInfo.GetKey(), req.TxID, rbDentry.rbType)
}
return
}
// RM simply removes the inode from the TransactionResource
func (tr *TransactionResource) commitInode(txID string, inode uint64) (status uint8, err error) {
tr.Lock()
defer tr.Unlock()
status = proto.OpOk
rbInode := tr.getTxRbInode(inode)
if rbInode == nil {
status = proto.OpTxRbInodeNotExistErr
errInfo := fmt.Sprintf("commitInode: commit inode[%v] failed, rb inode not found", inode)
err = errors.New(errInfo)
log.LogWarnf("%v", errInfo)
return
}
if rbInode.txInodeInfo.TxID != txID {
status = proto.OpTxConflictErr
errInfo := fmt.Sprintf("commitInode: txID %v is not matching txInodeInfo txID %v", txID, rbInode.txInodeInfo.TxID)
err = errors.New(errInfo)
log.LogErrorf("%v", errInfo)
return
}
tr.txRbInodeTree.Delete(rbInode)
log.LogDebugf("commitInode: inode[%v] is committed", inode)
return
}
// RM simply removes the dentry from the TransactionResource
func (tr *TransactionResource) commitDentry(txID string, pId uint64, name string) (status uint8, err error) {
tr.Lock()
defer tr.Unlock()
status = proto.OpOk
rbDentry := tr.getTxRbDentry(pId, name)
if rbDentry == nil {
status = proto.OpTxRbDentryNotExistErr
errInfo := fmt.Sprintf("commitDentry: commit dentry[%v_%v] failed, rb dentry not found", pId, name)
err = errors.New(errInfo)
log.LogWarnf("%v", errInfo)
return
}
if rbDentry.txDentryInfo.TxID != txID {
status = proto.OpTxConflictErr
errInfo := fmt.Sprintf("commitDentry: txID %v is not matching txDentryInfo txID %v", txID, rbDentry.txDentryInfo.TxID)
err = errors.New(errInfo)
log.LogWarnf("%v", errInfo)
return
}
tr.txRbDentryTree.Delete(rbDentry)
// unlink parent inode
if rbDentry.rbType == TxAdd {
parInode := NewInode(pId, 0)
st := tr.txProcessor.mp.fsmUnlinkInode(parInode, 0)
if st.Status != proto.OpOk {
log.LogWarnf("commitDentry: try unlink parent inode failed, txId %s, inode[%v]", txID, parInode)
return
}
}
log.LogDebugf("commitDentry: dentry[%v] is committed", rbDentry.txDentryInfo.GetKey())
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"sync"
"sync/atomic"
)
// TxIDAllocator generates and allocates transaction IDs
type TxIDAllocator struct {
mpTxID uint64
txIDLock sync.RWMutex
}
// func newTxIDAllocator(mpID uint64, partition raftstore.Partition) (alloc *TxIDAllocator) {
func newTxIDAllocator() (alloc *TxIDAllocator) {
alloc = new(TxIDAllocator)
return
}
func (alloc *TxIDAllocator) Reset() {
atomic.StoreUint64(&alloc.mpTxID, 0)
}
func (alloc *TxIDAllocator) setTransactionID(id uint64) {
atomic.StoreUint64(&alloc.mpTxID, id)
}
func (alloc *TxIDAllocator) getTransactionID() uint64 {
return atomic.LoadUint64(&alloc.mpTxID)
}
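// allocateTransactionID returns the next transaction ID. The mutex serializes
// concurrent allocations so the load-increment-store sequence stays atomic,
// while plain reads and restores go through the atomic helpers above.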
func (alloc *TxIDAllocator) allocateTransactionID() (mpTxID uint64) {
alloc.txIDLock.Lock()
defer alloc.txIDLock.Unlock()
mpTxID = atomic.LoadUint64(&alloc.mpTxID) + 1
alloc.setTransactionID(mpTxID)
return
}
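// Illustrative usage sketch (hypothetical caller, not part of this package's API):
//
//	alloc := newTxIDAllocator()
//	alloc.setTransactionID(lastPersistedTxID) // restore the counter after a restart
//	next := alloc.allocateTransactionID()     // monotonically increasing per allocator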
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package metanode
import (
"bytes"
"encoding/binary"
"hash/crc32"
"sync"
"time"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/timeutil"
)
const (
checkerVersionSize = 4
CrcUint32Size = 4
checkerVersion = 1
checkerRecordV1Len = 16
opKeepTime = 300
opKeepOps = 1024
opRebuildSec = 86400
opCheckerInterval = time.Second * 10
opCheckerSliceCap = 1024
)
type uniqOp struct {
uniqid uint64
atime int64
}
type uniqChecker struct {
sync.Mutex
op map[uint64]struct{}
inQue *uniqOpQueue
rtime int64
keepTime int64
keepOps int
}
func newUniqChecker() *uniqChecker {
return &uniqChecker{
op: make(map[uint64]struct{}),
inQue: newUniqOpQueue(),
keepTime: opKeepTime,
keepOps: opKeepOps,
rtime: timeutil.GetCurrentTimeUnix(),
}
}
func (checker *uniqChecker) clone() *uniqChecker {
checker.Lock()
inQue := checker.inQue.clone()
checker.Unlock()
return &uniqChecker{inQue: inQue}
}
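// Marshal encodes the checker as a 4-byte big-endian version followed by one
// 16-byte record (uniqid, atime) per queued op, and returns the buffer together
// with its CRC32 (IEEE) checksum.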
func (checker *uniqChecker) Marshal() (buf []byte, crc uint32, err error) {
buffer := bytes.NewBuffer(make([]byte, 0, checkerVersionSize+checker.inQue.len()*checkerRecordV1Len))
if err = binary.Write(buffer, binary.BigEndian, int32(checkerVersion)); err != nil {
return
}
checker.inQue.scan(func(op *uniqOp) bool {
if err = binary.Write(buffer, binary.BigEndian, op.uniqid); err != nil {
return false
}
if err = binary.Write(buffer, binary.BigEndian, op.atime); err != nil {
return false
}
return true
})
sign := crc32.NewIEEE()
if _, err = sign.Write(buffer.Bytes()); err != nil {
return
}
crc = sign.Sum32()
buf = buffer.Bytes()
return
}
func (checker *uniqChecker) UnMarshal(data []byte) (err error) {
if len(data) < checkerVersionSize {
err = errors.New("invalid uniqChecker file length")
log.LogErrorf("uniqChecker UnMarshal err(%v)", err)
return
}
buff := bytes.NewBuffer(data)
var version int32
if err = binary.Read(buff, binary.BigEndian, &version); err != nil {
log.LogErrorf("uniqChecker unmarshal read version err(%v)", err)
return
}
var uniqid uint64
var atime int64
now := time.Now().Unix()
for buff.Len() != 0 {
if err = binary.Read(buff, binary.BigEndian, &uniqid); err != nil {
log.LogErrorf("uniqChecker unmarshal read uniqid err(%v)", err)
return
}
if err = binary.Read(buff, binary.BigEndian, &atime); err != nil {
log.LogErrorf("uniqChecker unmarshal read atime err(%v)", err)
return
}
// skip records whose atime is too far ahead of the local time
if atime > now+86400 {
log.LogWarnf("uniqChecker skip invalid atime %v uniqid %v", atime, uniqid)
continue
}
checker.inQue.append(&uniqOp{uniqid, atime})
checker.op[uniqid] = struct{}{}
}
return
}
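// legalIn reports whether bid has not been seen before. A zero uniqid always
// passes; any other id is recorded on first sight and rejected on repeats, which
// lets replayed requests be detected.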
func (checker *uniqChecker) legalIn(bid uint64) bool {
// ignore zero uniqid
if bid == 0 {
return true
}
checker.Lock()
defer checker.Unlock()
if _, ok := checker.op[bid]; ok {
return false
} else {
checker.op[bid] = struct{}{}
checker.inQue.append(&uniqOp{bid, time.Now().Unix()})
}
return true
}
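// evictIndex scans the queue for expired ops. An op older than keepTime is
// evictable, but once evicting more would leave no more than keepOps entries the
// threshold is relaxed to 10*keepTime so recent entries are kept longer. It
// returns the number of remaining entries, the index of the last evictable op,
// and that op; the lock is briefly released every 10000 hits to avoid starving
// other callers.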
func (checker *uniqChecker) evictIndex() (left int, idx int, op *uniqOp) {
checker.Lock()
defer checker.Unlock()
inQueCnt := checker.inQue.len()
if inQueCnt <= checker.keepOps {
return inQueCnt, -1, nil
}
var c int
var lastOp *uniqOp
nowtime := time.Now().Unix()
checker.inQue.scan(func(op *uniqOp) bool {
kt := checker.keepTime
if inQueCnt-c <= checker.keepOps {
kt = 10 * checker.keepTime
}
if nowtime-op.atime >= kt {
lastOp = op
c++
if c%10000 == 0 {
checker.Unlock()
time.Sleep(100 * time.Microsecond)
checker.Lock()
}
return true
}
return false
})
return inQueCnt - c, c - 1, lastOp
}
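// doEvict removes every op up to and including evictBid from both the map and
// the queue, and at most once per opRebuildSec rebuilds the map from the queue
// to release memory held by deleted keys.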
func (checker *uniqChecker) doEvict(evictBid uint64) {
checker.Lock()
defer checker.Unlock()
cnt := 0
// evict from map
if _, ok := checker.op[evictBid]; ok {
checker.inQue.scan(func(op *uniqOp) bool {
cnt++
delete(checker.op, op.uniqid)
if op.uniqid == evictBid {
return false
}
return true
})
}
if cnt == 0 {
return
}
// truncate from queue
checker.inQue.truncate(cnt - 1)
// periodically rebuild the map to reduce memory usage
n := timeutil.GetCurrentTimeUnix()
if n-checker.rtime > opRebuildSec {
checker.op = make(map[uint64]struct{}, checker.inQue.len())
checker.inQue.scan(func(op *uniqOp) bool {
checker.op[op.uniqid] = struct{}{}
return true
})
checker.rtime = n
}
}
type uniqOpSlice struct {
s []*uniqOp
}
// uniqOpQueue is an append-only queue; items in the queue must not be modified
type uniqOpQueue struct {
cnt int
ss []*uniqOpSlice
cur *uniqOpSlice
}
func newUniqOpQueue() *uniqOpQueue {
s := &uniqOpSlice{s: make([]*uniqOp, 0, opCheckerSliceCap)}
return &uniqOpQueue{
cnt: 0,
ss: []*uniqOpSlice{s},
cur: s,
}
}
func (b *uniqOpQueue) append(v *uniqOp) {
if cap(b.cur.s)-len(b.cur.s) == 0 {
b.cur = &uniqOpSlice{s: make([]*uniqOp, 0, opCheckerSliceCap)}
b.ss = append(b.ss, b.cur)
}
b.cur.s = append(b.cur.s, v)
b.cnt++
}
func (b *uniqOpQueue) index(idx int) *uniqOp {
for _, s := range b.ss {
l := len(s.s)
if idx >= l {
idx = idx - l
} else {
return s.s[idx]
}
}
return nil
}
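// truncate drops the first idx+1 ops from the queue; if that covers every
// queued op the queue is simply reset.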
func (b *uniqOpQueue) truncate(idx int) {
if idx >= b.cnt-1 {
b.reset()
return
}
b.cnt = b.cnt - idx - 1
var tidx int
var s *uniqOpSlice
for tidx, s = range b.ss {
l := len(s.s)
if idx >= l {
idx = idx - l
} else {
b.ss[tidx].s = s.s[idx+1:]
break
}
}
b.ss = b.ss[tidx:]
}
func (b *uniqOpQueue) scan(fn func(op *uniqOp) bool) {
for _, s := range b.ss {
for _, op := range s.s {
if !fn(op) {
return
}
}
}
}
func (b *uniqOpQueue) len() int {
return b.cnt
}
func (b *uniqOpQueue) reset() {
b.cur = &uniqOpSlice{s: make([]*uniqOp, 0, opCheckerSliceCap)}
b.ss = []*uniqOpSlice{b.cur}
b.cnt = 0
}
func (b *uniqOpQueue) clone() *uniqOpQueue {
ss := make([]*uniqOpSlice, 0, len(b.ss))
for _, s := range b.ss {
ss = append(ss, &uniqOpSlice{s.s[:]})
}
return &uniqOpQueue{
cnt: b.cnt,
ss: ss,
cur: ss[len(ss)-1],
}
}
package metanode
import (
"fmt"
"os"
"sort"
"strconv"
"strings"
)
type DelExtFile []os.FileInfo
func (del DelExtFile) Len() int {
return len(del)
}
func (del DelExtFile) Swap(i, j int) {
del[i], del[j] = del[j], del[i]
}
func (del DelExtFile) Less(i, j int) bool {
idx1 := getDelExtFileIdx(del[i].Name())
idx2 := getDelExtFileIdx(del[j].Name())
return idx1 < idx2
}
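// getDelExtFileIdx extracts the numeric index after the last '_' of a deleted-
// extent file name (expected to look like <prefixDelExtent>_<idx>) and panics
// when the name does not carry a parseable index.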
func getDelExtFileIdx(name string) int64 {
arr := strings.Split(name, "_")
size := len(arr)
if size < 2 {
panic(fmt.Errorf("file name is not legal, %s", name))
}
idx, err := strconv.ParseInt(arr[size-1], 10, 64)
if err != nil {
panic(fmt.Errorf("file name is not legal, %s", name))
}
return idx
}
func sortDelExtFileInfo(files []os.FileInfo) []os.FileInfo {
newFiles := make([]os.FileInfo, 0)
for _, info := range files {
if info.IsDir() {
continue
}
if strings.HasPrefix(info.Name(), prefixDelExtent) {
newFiles = append(newFiles, info)
}
}
if len(newFiles) <= 1 {
return newFiles
}
sort.Sort(DelExtFile(newFiles))
return newFiles
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"encoding/json"
"fmt"
"strconv"
"time"
"github.com/cubefs/cubefs/util"
)
// api
const (
// Admin APIs
AdminGetMasterApiList = "/admin/getMasterApiList"
AdminSetApiQpsLimit = "/admin/setApiQpsLimit"
AdminGetApiQpsLimit = "/admin/getApiQpsLimit"
AdminRemoveApiQpsLimit = "/admin/rmApiQpsLimit"
AdminGetCluster = "/admin/getCluster"
AdminSetClusterInfo = "/admin/setClusterInfo"
AdminGetMonitorPushAddr = "/admin/getMonitorPushAddr"
AdminGetDataPartition = "/dataPartition/get"
AdminLoadDataPartition = "/dataPartition/load"
AdminCreateDataPartition = "/dataPartition/create"
AdminCreatePreLoadDataPartition = "/dataPartition/createPreLoad"
AdminDecommissionDataPartition = "/dataPartition/decommission"
AdminDiagnoseDataPartition = "/dataPartition/diagnose"
AdminResetDataPartitionDecommissionStatus = "/dataPartition/resetDecommissionStatus"
AdminQueryDataPartitionDecommissionStatus = "/dataPartition/queryDecommissionStatus"
AdminDeleteDataReplica = "/dataReplica/delete"
AdminAddDataReplica = "/dataReplica/add"
AdminDeleteVol = "/vol/delete"
AdminUpdateVol = "/vol/update"
AdminVolShrink = "/vol/shrink"
AdminVolExpand = "/vol/expand"
AdminVolForbidden = "/vol/forbidden"
AdminVolEnableAuditLog = "/vol/auditlog"
AdminCreateVol = "/admin/createVol"
AdminGetVol = "/admin/getVol"
AdminClusterFreeze = "/cluster/freeze"
AdminClusterForbidMpDecommission = "/cluster/forbidMetaPartitionDecommission"
AdminClusterStat = "/cluster/stat"
AdminSetCheckDataReplicasEnable = "/cluster/setCheckDataReplicasEnable"
AdminGetIP = "/admin/getIp"
AdminCreateMetaPartition = "/metaPartition/create"
AdminSetMetaNodeThreshold = "/threshold/set"
AdminListVols = "/vol/list"
AdminSetNodeInfo = "/admin/setNodeInfo"
AdminGetNodeInfo = "/admin/getNodeInfo"
AdminGetAllNodeSetGrpInfo = "/admin/getDomainInfo"
AdminGetNodeSetGrpInfo = "/admin/getDomainNodeSetGrpInfo"
AdminGetIsDomainOn = "/admin/getIsDomainOn"
AdminUpdateNodeSetCapcity = "/admin/updateNodeSetCapcity"
AdminUpdateNodeSetId = "/admin/updateNodeSetId"
AdminUpdateNodeSetNodeSelector = "/admin/updateNodeSetNodeSelector"
AdminUpdateDomainDataUseRatio = "/admin/updateDomainDataRatio"
AdminUpdateZoneExcludeRatio = "/admin/updateZoneExcludeRatio"
AdminSetNodeRdOnly = "/admin/setNodeRdOnly"
AdminSetDpRdOnly = "/admin/setDpRdOnly"
AdminSetConfig = "/admin/setConfig"
AdminGetConfig = "/admin/getConfig"
AdminDataPartitionChangeLeader = "/dataPartition/changeleader"
AdminChangeMasterLeader = "/master/changeleader"
AdminOpFollowerPartitionsRead = "/master/opFollowerPartitionRead"
AdminUpdateDecommissionLimit = "/admin/updateDecommissionLimit"
AdminQueryDecommissionLimit = "/admin/queryDecommissionLimit"
// #nosec G101
AdminQueryDecommissionToken = "/admin/queryDecommissionToken"
AdminSetFileStats = "/admin/setFileStatsEnable"
AdminGetFileStats = "/admin/getFileStatsEnable"
AdminGetClusterValue = "/admin/getClusterValue"
AdminSetClusterUuidEnable = "/admin/setClusterUuidEnable"
AdminGetClusterUuid = "/admin/getClusterUuid"
AdminGenerateClusterUuid = "/admin/generateClusterUuid"
AdminSetDpDiscard = "/admin/setDpDiscard"
AdminGetDiscardDp = "/admin/getDiscardDp"
AdminSetConLcNodeNum = "/admin/setConLcNodeNum"
AdminGetAllLcNodeInfo = "/admin/getAllLcNodeInfo"
AdminLcNode = "/admin/lcnode"
AdminUpdateDecommissionDiskFactor = "/admin/updateDecommissionDiskFactor"
AdminQueryDecommissionDiskLimit = "/admin/queryDecommissionDiskLimit"
AdminEnableAutoDecommissionDisk = "/admin/enableAutoDecommissionDisk"
AdminQueryAutoDecommissionDisk = "/admin/queryAutoDecommissionDisk"
// graphql master api
AdminClusterAPI = "/api/cluster"
AdminUserAPI = "/api/user"
AdminVolumeAPI = "/api/volume"
// graphql console api
ConsoleIQL = "/iql"
ConsoleLoginAPI = "/login"
ConsoleMonitorAPI = "/cfs_monitor"
ConsoleFile = "/file"
ConsoleFileDown = "/file/down"
ConsoleFileUpload = "/file/upload"
// Client APIs
ClientDataPartitions = "/client/partitions"
ClientVol = "/client/vol"
ClientMetaPartition = "/metaPartition/get"
ClientVolStat = "/client/volStat"
ClientMetaPartitions = "/client/metaPartitions"
// qos api
QosGetStatus = "/qos/getStatus"
QosGetClientsLimitInfo = "/qos/getClientsInfo"
QosGetZoneLimitInfo = "/qos/getZoneLimit" // include disk enable
QosUpdate = "/qos/update" // include disk enable
QosUpdateMagnify = "/qos/updateMagnify"
QosUpdateClientParam = "/qos/updateClientParam"
QosUpdateZoneLimit = "/qos/updateZoneLimit" // include disk enable
QosUpload = "/admin/qosUpload"
QosUpdateMasterLimit = "/qos/masterLimit"
// acl api
AdminACL = "/admin/aclOp"
// uid api
AdminUid = "/admin/uidOp"
// raft node APIs
AddRaftNode = "/raftNode/add"
RemoveRaftNode = "/raftNode/remove"
RaftStatus = "/get/raftStatus"
// node APIs
AddDataNode = "/dataNode/add"
DecommissionDataNode = "/dataNode/decommission"
QueryDataNodeDecoProgress = "/dataNode/queryDecommissionProgress"
QueryDataNodeDecoFailedDps = "/dataNode/queryDecommissionFailedDps"
MigrateDataNode = "/dataNode/migrate"
CancelDecommissionDataNode = "/dataNode/cancelDecommission"
DecommissionDisk = "/disk/decommission"
RecommissionDisk = "/disk/recommission"
QueryDiskDecoProgress = "/disk/queryDecommissionProgress"
MarkDecoDiskFixed = "/disk/MarkDecommissionDiskFixed"
CancelDecommissionDisk = "/disk/cancelDecommission"
QueryDecommissionDiskDecoFailedDps = "/disk/queryDecommissionFailedDps"
QueryBadDisks = "/disk/queryBadDisks"
RestoreStoppedAutoDecommissionDisk = "/disk/restoreStoppedAutoDecommissionDisk"
QueryAllDecommissionDisk = "/disk/queryAllDecommissionDisk"
GetDataNode = "/dataNode/get"
AddMetaNode = "/metaNode/add"
DecommissionMetaNode = "/metaNode/decommission"
MigrateMetaNode = "/metaNode/migrate"
GetMetaNode = "/metaNode/get"
AdminUpdateMetaNode = "/metaNode/update"
AdminUpdateDataNode = "/dataNode/update"
AdminGetInvalidNodes = "/invalid/nodes"
AdminLoadMetaPartition = "/metaPartition/load"
AdminDiagnoseMetaPartition = "/metaPartition/diagnose"
AdminDecommissionMetaPartition = "/metaPartition/decommission"
AdminChangeMetaPartitionLeader = "/metaPartition/changeleader"
AdminBalanceMetaPartitionLeader = "/metaPartition/balanceLeader"
AdminAddMetaReplica = "/metaReplica/add"
AdminDeleteMetaReplica = "/metaReplica/delete"
AdminPutDataPartitions = "/dataPartitions/set"
// admin multi version snapshot
AdminCreateVersion = "/multiVer/create"
AdminDelVersion = "/multiVer/del"
AdminGetVersionInfo = "/multiVer/get"
AdminGetAllVersionInfo = "/multiVer/getAll"
AdminGetVolVer = "/vol/getVer"
AdminSetVerStrategy = "/vol/SetVerStrategy"
// S3 lifecycle configuration APIS
SetBucketLifecycle = "/s3/setLifecycle"
GetBucketLifecycle = "/s3/getLifecycle"
DeleteBucketLifecycle = "/s3/deleteLifecycle"
AddLcNode = "/lcNode/add"
QueryDisableDisk = "/dataNode/queryDisableDisk"
// Operation response
GetMetaNodeTaskResponse = "/metaNode/response" // Method: 'POST', ContentType: 'application/json'
GetDataNodeTaskResponse = "/dataNode/response" // Method: 'POST', ContentType: 'application/json'
GetLcNodeTaskResponse = "/lcNode/response" // Method: 'POST', ContentType: 'application/json'
GetTopologyView = "/topo/get"
UpdateZone = "/zone/update"
GetAllZones = "/zone/list"
GetAllNodeSets = "/nodeSet/list"
GetNodeSet = "/nodeSet/get"
UpdateNodeSet = "/nodeSet/update"
// Header keys
SkipOwnerValidation = "Skip-Owner-Validation"
ForceDelete = "Force-Delete"
// APIs for user management
UserCreate = "/user/create"
UserDelete = "/user/delete"
UserUpdate = "/user/update"
UserUpdatePolicy = "/user/updatePolicy"
UserRemovePolicy = "/user/removePolicy"
UserDeleteVolPolicy = "/user/deleteVolPolicy"
UserGetInfo = "/user/info"
UserGetAKInfo = "/user/akInfo"
UserTransferVol = "/user/transferVol"
UserList = "/user/list"
UsersOfVol = "/vol/users"
// graphql api for header
HeadAuthorized = "Authorization"
ParamAuthorized = "_authorization"
UserKey = "_user_key"
UserInfoKey = "_user_info_key"
// quota
QuotaCreate = "/quota/create"
QuotaUpdate = "/quota/update"
QuotaDelete = "/quota/delete"
QuotaList = "/quota/list"
QuotaGet = "/quota/get"
// QuotaBatchModifyPath = "/quota/batchModifyPath"
QuotaListAll = "/quota/listAll"
// s3 qos api
S3QoSSet = "/s3/qos/set"
S3QoSGet = "/s3/qos/get"
S3QoSDelete = "/s3/qos/delete"
)
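// GApiInfo maps lower-cased API names to their request paths; the entries that
// are commented out below are not exposed through this table.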
var GApiInfo map[string]string = map[string]string{
"admingetmasterapilist": AdminGetMasterApiList,
"adminsetapiqpslimit": AdminSetApiQpsLimit,
"admingetcluster": AdminGetCluster,
"adminsetclusterinfo": AdminSetClusterInfo,
"admingetdatapartition": AdminGetDataPartition,
"adminloaddatapartition": AdminLoadDataPartition,
"admincreatedatapartition": AdminCreateDataPartition,
"admincreatepreloaddatapartition": AdminCreatePreLoadDataPartition,
"admindecommissiondatapartition": AdminDecommissionDataPartition,
"admindiagnosedatapartition": AdminDiagnoseDataPartition,
"admindeletedatareplica": AdminDeleteDataReplica,
"adminadddatareplica": AdminAddDataReplica,
"admindeletevol": AdminDeleteVol,
"adminupdatevol": AdminUpdateVol,
"adminvolshrink": AdminVolShrink,
"adminvolexpand": AdminVolExpand,
"admincreatevol": AdminCreateVol,
"admingetvol": AdminGetVol,
"adminclusterfreeze": AdminClusterFreeze,
"adminclusterforbidmpdecommission": AdminClusterForbidMpDecommission,
"adminclusterstat": AdminClusterStat,
"admingetip": AdminGetIP,
"admincreatemetapartition": AdminCreateMetaPartition,
"adminsetmetanodethreshold": AdminSetMetaNodeThreshold,
"adminlistvols": AdminListVols,
"adminsetnodeinfo": AdminSetNodeInfo,
"admingetnodeinfo": AdminGetNodeInfo,
"admingetallnodesetgrpinfo": AdminGetAllNodeSetGrpInfo,
"admingetnodesetgrpinfo": AdminGetNodeSetGrpInfo,
"admingetisdomainon": AdminGetIsDomainOn,
"adminupdatenodesetcapcity": AdminUpdateNodeSetCapcity,
"adminupdatenodesetid": AdminUpdateNodeSetId,
"adminupdatedomaindatauseratio": AdminUpdateDomainDataUseRatio,
"adminupdatezoneexcluderatio": AdminUpdateZoneExcludeRatio,
"adminsetnoderdonly": AdminSetNodeRdOnly,
"adminsetdprdonly": AdminSetDpRdOnly,
"admindatapartitionchangeleader": AdminDataPartitionChangeLeader,
"adminsetdpdiscard": AdminSetDpDiscard,
"admingetdiscarddp": AdminGetDiscardDp,
//"adminclusterapi": AdminClusterAPI,
//"adminuserapi": AdminUserAPI,
//"adminvolumeapi": AdminVolumeAPI,
//"consoleiql": ConsoleIQL,
//"consoleloginapi": ConsoleLoginAPI,
//"consolemonitorapi": ConsoleMonitorAPI,
//"consolefile": ConsoleFile,
//"consolefiledown": ConsoleFileDown,
//"consolefileupload": ConsoleFileUpload,
"clientdatapartitions": ClientDataPartitions,
"clientvol": ClientVol,
"clientmetapartition": ClientMetaPartition,
"clientvolstat": ClientVolStat,
"clientmetapartitions": ClientMetaPartitions,
"qosgetstatus": QosGetStatus,
"qosgetclientslimitinfo": QosGetClientsLimitInfo,
"qosgetzonelimitinfo": QosGetZoneLimitInfo,
"qosupdate": QosUpdate,
//"qosupdatemagnify": QosUpdateMagnify,
"qosupdateclientparam": QosUpdateClientParam,
"qosupdatezonelimit": QosUpdateZoneLimit,
"qosupload": QosUpload,
"qosupdatemasterlimit": QosUpdateMasterLimit,
"addraftnode": AddRaftNode,
"removeraftnode": RemoveRaftNode,
"raftstatus": RaftStatus,
"adddatanode": AddDataNode,
"decommissiondatanode": DecommissionDataNode,
"migratedatanode": MigrateDataNode,
"canceldecommissiondatanode": CancelDecommissionDataNode,
"decommissiondisk": DecommissionDisk,
"getdatanode": GetDataNode,
"addmetanode": AddMetaNode,
"decommissionmetanode": DecommissionMetaNode,
"migratemetanode": MigrateMetaNode,
"getmetanode": GetMetaNode,
"adminupdatemetanode": AdminUpdateMetaNode,
"adminupdatedatanode": AdminUpdateDataNode,
"admingetinvalidnodes": AdminGetInvalidNodes,
"adminloadmetapartition": AdminLoadMetaPartition,
"admindiagnosemetapartition": AdminDiagnoseMetaPartition,
"admindecommissionmetapartition": AdminDecommissionMetaPartition,
"adminchangemetapartitionleader": AdminChangeMetaPartitionLeader,
"adminbalancemetapartitionleader": AdminBalanceMetaPartitionLeader,
"adminaddmetareplica": AdminAddMetaReplica,
"admindeletemetareplica": AdminDeleteMetaReplica,
"getmetanodetaskresponse": GetMetaNodeTaskResponse,
"getdatanodetaskresponse": GetDataNodeTaskResponse,
"gettopologyview": GetTopologyView,
"updatezone": UpdateZone,
"getallzones": GetAllZones,
"usercreate": UserCreate,
"userdelete": UserDelete,
"userupdate": UserUpdate,
"userupdatepolicy": UserUpdatePolicy,
"userremovepolicy": UserRemovePolicy,
"userdeletevolpolicy": UserDeleteVolPolicy,
"usergetinfo": UserGetInfo,
"usergetakinfo": UserGetAKInfo,
"usertransfervol": UserTransferVol,
"userlist": UserList,
"usersofvol": UsersOfVol,
}
const (
TimeFormat = "2006-01-02 15:04:05"
DefaultDirChildrenNumLimit = 20000000
MinDirChildrenNumLimit = 1000000
)
// HTTPReply uniform response structure
type HTTPReply struct {
Code int32 `json:"code"`
Msg string `json:"msg"`
Data interface{} `json:"data"`
}
type HTTPReplyRaw struct {
Code int32 `json:"code"`
Msg string `json:"msg"`
Data json.RawMessage `json:"data"`
}
func (raw *HTTPReplyRaw) Unmarshal(body []byte) error {
r := new(HTTPReplyRaw)
if err := json.Unmarshal(body, r); err != nil {
return fmt.Errorf("httpreply unmarshal [%s]", err.Error())
}
*raw = *r
return nil
}
func (raw *HTTPReplyRaw) Success() error {
if code := raw.Code; code != ErrCodeSuccess {
err := ParseErrorCode(code)
return fmt.Errorf("httpreply code[%d] err[%s] msg[%s]", code, err.Error(), raw.Msg)
}
return nil
}
func (raw *HTTPReplyRaw) Bytes() []byte {
return raw.Data
}
func (raw *HTTPReplyRaw) String() string {
return string(raw.Bytes())
}
func (raw *HTTPReplyRaw) Int64() (int64, error) {
return strconv.ParseInt(string(raw.Data), 10, 64)
}
func (raw *HTTPReplyRaw) Uint64() (uint64, error) {
return strconv.ParseUint(string(raw.Data), 10, 64)
}
func (raw *HTTPReplyRaw) Result(result interface{}) error {
return json.Unmarshal(raw.Data, result)
}
func UnmarshalHTTPReply(body []byte, result interface{}) error {
raw := new(HTTPReplyRaw)
if err := raw.Unmarshal(body); err != nil {
return err
}
if err := raw.Success(); err != nil {
return err
}
if result == nil {
return nil
}
switch v := result.(type) {
case *string:
*v = raw.String()
case *int64:
val, err := raw.Int64()
if err != nil {
return err
}
*v = val
case *uint64:
val, err := raw.Uint64()
if err != nil {
return err
}
*v = val
default:
return raw.Result(result)
}
return nil
}
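// Illustrative usage sketch (assumption: body holds a master HTTPReply whose
// Data field encodes a ClusterInfo; the variable names are hypothetical):
//
//	var info ClusterInfo
//	if err := UnmarshalHTTPReply(body, &info); err != nil {
//		return err
//	}
//	// Pass *string, *int64 or *uint64 instead to receive raw scalar replies.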
// RegisterMetaNodeResp defines the response to register a meta node.
type RegisterMetaNodeResp struct {
ID uint64
}
type AclIpInfo struct {
Ip string
CTime int64
}
type AclRsp struct {
Info string
OK bool
List []*AclIpInfo
Reserve string
}
type UidSpaceRsp struct {
Info string
OK bool
UidSpaceArr []*UidSpaceInfo
Reserve string
}
type VolumeVerStrategy struct {
KeepVerCnt int
Periodic int
Enable bool
ForceUpdate bool
UTime time.Time
}
func (v *VolumeVerStrategy) GetPeriodic() int {
return v.Periodic
}
func (v *VolumeVerStrategy) GetPeriodicSecond() int {
// return v.Periodic*24*3600
return v.Periodic * 3600
}
func (v *VolumeVerStrategy) TimeUp(curTime time.Time) bool {
return v.UTime.Add(time.Second * time.Duration(v.GetPeriodicSecond())).Before(curTime)
}
type VolumeVerInfo struct {
Name string
VerSeq uint64
VerSeqPrepare uint64
VerPrepareStatus uint8
Enabled bool
}
// ClusterInfo defines the cluster information.
type ClusterInfo struct {
Cluster string
Ip string
MetaNodeDeleteBatchCount uint64
MetaNodeDeleteWorkerSleepMs uint64
DataNodeDeleteLimitRate uint64
DataNodeAutoRepairLimitRate uint64
DpMaxRepairErrCnt uint64
DirChildrenNumLimit uint32
EbsAddr string
ServicePath string
ClusterUuid string
ClusterUuidEnable bool
}
// CreateDataPartitionRequest defines the request to create a data partition.
type CreateDataPartitionRequest struct {
PartitionTyp int
PartitionId uint64
PartitionSize int
ReplicaNum int
VolumeId string
IsRandomWrite bool
Members []Peer
Hosts []string
CreateType int
LeaderSize int
DecommissionedDisks []string
IsMultiVer bool
VerSeq uint64
}
// CreateDataPartitionResponse defines the response to the request of creating a data partition.
type CreateDataPartitionResponse struct {
PartitionId uint64
Status uint8
Result string
}
// DeleteDataPartitionRequest defines the request to delete a data partition.
type DeleteDataPartitionRequest struct {
DataPartitionType string
PartitionId uint64
PartitionSize int
}
// DeleteDataPartitionResponse defines the response to the request of deleting a data partition.
type DeleteDataPartitionResponse struct {
Status uint8
Result string
PartitionId uint64
}
// DataPartitionDecommissionRequest defines the request of decommissioning a data partition.
type DataPartitionDecommissionRequest struct {
PartitionId uint64
RemovePeer Peer
AddPeer Peer
}
// AddDataPartitionRaftMemberRequest defines the request to add a raft member to a data partition.
type AddDataPartitionRaftMemberRequest struct {
PartitionId uint64
AddPeer Peer
}
// RemoveDataPartitionRaftMemberRequest defines the request to remove a raft member from a data partition.
type RemoveDataPartitionRaftMemberRequest struct {
PartitionId uint64
RemovePeer Peer
Force bool
}
// AddMetaPartitionRaftMemberRequest defines the request to add a raft member to a meta partition.
type AddMetaPartitionRaftMemberRequest struct {
PartitionId uint64
AddPeer Peer
}
// RemoveMetaPartitionRaftMemberRequest defines the request to remove a raft member from a meta partition.
type RemoveMetaPartitionRaftMemberRequest struct {
PartitionId uint64
RemovePeer Peer
}
// LoadDataPartitionRequest defines the request of loading a data partition.
type LoadDataPartitionRequest struct {
PartitionId uint64
}
// LoadDataPartitionResponse defines the response to the request of loading a data partition.
type LoadDataPartitionResponse struct {
PartitionId uint64
Used uint64
PartitionSnapshot []*File
Status uint8
PartitionStatus int
Result string
VolName string
}
type StopDataPartitionRepairRequest struct {
PartitionId uint64
Stop bool
}
// StopDataPartitionRepairResponse defines the response to the request of stopping the repair of a data partition.
type StopDataPartitionRepairResponse struct {
Status uint8
Result string
PartitionId uint64
}
// File defines the file struct.
type File struct {
Name string
Crc uint32
Size uint32
Modified int64
ApplyID uint64
}
// LoadMetaPartitionMetricRequest defines the request of loading the meta partition metrics.
type LoadMetaPartitionMetricRequest struct {
PartitionID uint64
Start uint64
End uint64
}
// LoadMetaPartitionMetricResponse defines the response to the request of loading the meta partition metrics.
type LoadMetaPartitionMetricResponse struct {
Start uint64
End uint64
MaxInode uint64
Status uint8
Result string
}
type UidLimitToMetaNode struct {
UidLimitInfo []*UidSpaceInfo
}
type QosToDataNode struct {
EnableDiskQos bool
QosIopsReadLimit uint64
QosIopsWriteLimit uint64
QosFlowReadLimit uint64
QosFlowWriteLimit uint64
}
// MultiVersionOpRequest defines the request of a multi-version snapshot operation.
type MultiVersionOpRequest struct {
VolumeID string
VerSeq uint64
Op uint8
Addr string
VolVerList []*VolVersionInfo
}
// MultiVersionOpResponse defines the response to a multi-version snapshot operation request.
type MultiVersionOpResponse struct {
VolumeID string
Addr string
Op uint8
VerSeq uint64
Status uint8
Result string
}
type QuotaHeartBeatInfos struct {
QuotaHbInfos []*QuotaHeartBeatInfo
}
type TxInfo struct {
Volume string
Mask TxOpMask
OpLimitVal int
}
type TxInfos struct {
TxInfo []*TxInfo
}
// HeartBeatRequest defines the heartbeat request.
type HeartBeatRequest struct {
CurrTime int64
MasterAddr string
FLReadVols []string
QosToDataNode
FileStatsEnable bool
UidLimitToMetaNode
QuotaHeartBeatInfos
TxInfos
ForbiddenVols []string
DisableAuditVols []string
DecommissionDisks []string // NOTE: for datanode
}
// DataPartitionReport defines the partition report.
type DataPartitionReport struct {
VolName string
PartitionID uint64
PartitionStatus int
Total uint64
Used uint64
DiskPath string
IsLeader bool
ExtentCount int
NeedCompare bool
DecommissionRepairProgress float64
}
type DataNodeQosResponse struct {
IopsRLimit uint64
IopsWLimit uint64
FlowRlimit uint64
FlowWlimit uint64
Status uint8
Result string
}
type BadDiskStat struct {
DiskPath string
TotalPartitionCnt int
DiskErrPartitionList []uint64
}
// DataNodeHeartbeatResponse defines the response to the data node heartbeat.
type DataNodeHeartbeatResponse struct {
Total uint64
Used uint64
Available uint64
TotalPartitionSize uint64 // volCnt * volsize
RemainingCapacity uint64 // remaining capacity to create partition
CreatedPartitionCnt uint32
MaxCapacity uint64 // maximum capacity to create partition
StartTime int64
ZoneName string
PartitionReports []*DataPartitionReport
Status uint8
Result string
BadDisks []string // Keep this old field for compatibility
BadDiskStats []BadDiskStat // key: disk path
CpuUtil float64 `json:"cpuUtil"`
IoUtils map[string]float64 `json:"ioUtil"`
}
// MetaPartitionReport defines the meta partition report.
type MetaPartitionReport struct {
PartitionID uint64
Start uint64
End uint64
Status int
Size uint64
MaxInodeID uint64
IsLeader bool
VolName string
InodeCnt uint64
DentryCnt uint64
TxCnt uint64
TxRbInoCnt uint64
TxRbDenCnt uint64
FreeListLen uint64
UidInfo []*UidReportSpaceInfo
QuotaReportInfos []*QuotaReportInfo
}
// MetaNodeHeartbeatResponse defines the response to the meta node heartbeat request.
type MetaNodeHeartbeatResponse struct {
ZoneName string
Total uint64
MemUsed uint64
MetaPartitionReports []*MetaPartitionReport
Status uint8
Result string
CpuUtil float64 `json:"cpuUtil"`
}
// LcNodeHeartbeatResponse defines the response to the lc node heartbeat.
type LcNodeHeartbeatResponse struct {
Status uint8
Result string
LcTaskCountLimit int
LcScanningTasks map[string]*LcNodeRuleTaskResponse
SnapshotScanningTasks map[string]*SnapshotVerDelTaskResponse
}
// DeleteFileRequest defines the request to delete a file.
type DeleteFileRequest struct {
VolId uint64
Name string
}
// DeleteFileResponse defines the response to the request of deleting a file.
type DeleteFileResponse struct {
Status uint8
Result string
VolId uint64
Name string
}
// DeleteMetaPartitionRequest defines the request of deleting a meta partition.
type DeleteMetaPartitionRequest struct {
PartitionID uint64
}
// DeleteMetaPartitionResponse defines the response to the request of deleting a meta partition.
type DeleteMetaPartitionResponse struct {
PartitionID uint64
Status uint8
Result string
}
// UpdateMetaPartitionRequest defines the request to update a meta partition.
type UpdateMetaPartitionRequest struct {
PartitionID uint64
VolName string
Start uint64
End uint64
}
// UpdateMetaPartitionResponse defines the response to the request of updating the meta partition.
type UpdateMetaPartitionResponse struct {
PartitionID uint64
VolName string
End uint64
Status uint8
Result string
}
// MetaPartitionDecommissionRequest defines the request of decommissioning a meta partition.
type MetaPartitionDecommissionRequest struct {
PartitionID uint64
VolName string
RemovePeer Peer
AddPeer Peer
}
// MetaPartitionDecommissionResponse defines the response to the request of decommissioning a meta partition.
type MetaPartitionDecommissionResponse struct {
PartitionID uint64
VolName string
Status uint8
Result string
}
// MetaPartitionLoadRequest defines the request to load meta partition.
type MetaPartitionLoadRequest struct {
PartitionID uint64
}
// MetaPartitionLoadResponse defines the response to the request of loading meta partition.
type MetaPartitionLoadResponse struct {
PartitionID uint64
DoCompare bool
ApplyID uint64
CommittedID uint64
MaxInode uint64
DentryCount uint64
InodeCount uint64
Addr string
}
// DataPartitionResponse defines the response from a data node to the master that is related to a data partition.
type DataPartitionResponse struct {
PartitionType int
PartitionID uint64
Status int8
ReplicaNum uint8
Hosts []string
LeaderAddr string
Epoch uint64
IsRecover bool
PartitionTTL int64
IsDiscard bool
}
// DataPartitionsView defines the view of a data partition
type DataPartitionsView struct {
DataPartitions []*DataPartitionResponse
}
func NewDataPartitionsView() (dataPartitionsView *DataPartitionsView) {
dataPartitionsView = new(DataPartitionsView)
dataPartitionsView.DataPartitions = make([]*DataPartitionResponse, 0)
return
}
// MetaPartitionView defines the view of a meta partition
type MetaPartitionView struct {
PartitionID uint64
Start uint64
End uint64
MaxInodeID uint64
InodeCount uint64
DentryCount uint64
FreeListLen uint64
TxCnt uint64
TxRbInoCnt uint64
TxRbDenCnt uint64
IsRecover bool
Members []string
LeaderAddr string
Status int8
}
type DataNodeDisksRequest struct{}
type DataNodeDisksResponse struct{}
type OSSSecure struct {
AccessKey string
SecretKey string
}
// VolView defines the view of a volume
type VolView struct {
Name string
Owner string
Status uint8
FollowerRead bool
MetaPartitions []*MetaPartitionView
DataPartitions []*DataPartitionResponse
DomainOn bool
OSSSecure *OSSSecure
CreateTime int64
DeleteLockTime int64
CacheTTL int
VolType int
}
func (v *VolView) SetOwner(owner string) {
v.Owner = owner
}
func (v *VolView) SetOSSSecure(accessKey, secretKey string) {
v.OSSSecure = &OSSSecure{AccessKey: accessKey, SecretKey: secretKey}
}
func NewVolView(name string, status uint8, followerRead bool, createTime int64, cacheTTL int, volType int, deleteLockTime int64) (view *VolView) {
view = new(VolView)
view.Name = name
view.FollowerRead = followerRead
view.CreateTime = createTime
view.DeleteLockTime = deleteLockTime
view.Status = status
view.MetaPartitions = make([]*MetaPartitionView, 0)
view.DataPartitions = make([]*DataPartitionResponse, 0)
view.CacheTTL = cacheTTL
view.VolType = volType
return
}
func NewMetaPartitionView(partitionID, start, end uint64, status int8) (mpView *MetaPartitionView) {
mpView = new(MetaPartitionView)
mpView.PartitionID = partitionID
mpView.Start = start
mpView.End = end
mpView.Status = status
mpView.Members = make([]string, 0)
return
}
const (
QosStateNormal uint8 = 0x01
QosStateHitLimit uint8 = 0x02
MinIopsLimit uint64 = 3
MinFLowLimit uint64 = 128 * util.KB
)
const (
IopsReadType uint32 = 0x01
IopsWriteType uint32 = 0x02
FlowReadType uint32 = 0x03
FlowWriteType uint32 = 0x04
)
const (
QosDefaultBurst = 16000000
QosDefaultClientCnt uint32 = 100
QosDefaultDiskMaxFLowLimit int = 0x7FFFFFFF
QosDefaultDiskMaxIoLimit int = 100000
)
func QosTypeString(factorType uint32) string {
switch factorType {
case IopsReadType:
return "IopsRead"
case IopsWriteType:
return "IopsWrite"
case FlowReadType:
return "FlowRead"
case FlowWriteType:
return "FlowWrite"
default:
return "unkown"
}
}
type ClientLimitInfo struct {
UsedLimit uint64
UsedBuffer uint64
Used uint64
Need uint64
}
type ClientReportLimitInfo struct {
ID uint64
FactorMap map[uint32]*ClientLimitInfo
Host string
Status uint8
_ string // reserved
}
func NewClientReportLimitInfo() *ClientReportLimitInfo {
return &ClientReportLimitInfo{
FactorMap: make(map[uint32]*ClientLimitInfo),
}
}
type LimitRsp2Client struct {
ID uint64
Enable bool
ReqPeriod uint32
HitTriggerCnt uint8
FactorMap map[uint32]*ClientLimitInfo
Magnify map[uint32]uint32
_ string // reserved
}
func NewLimitRsp2Client() *LimitRsp2Client {
limit := &LimitRsp2Client{
FactorMap: make(map[uint32]*ClientLimitInfo),
Magnify: make(map[uint32]uint32),
}
return limit
}
type UidSimpleInfo struct {
UID uint32
Limited bool
}
// SimpleVolView defines the simple view of a volume
type SimpleVolView struct {
ID uint64
Name string
Owner string
ZoneName string
DpReplicaNum uint8
MpReplicaNum uint8
InodeCount uint64
DentryCount uint64
MaxMetaPartitionID uint64
Status uint8
Capacity uint64 // GB
RwDpCnt int
MpCnt int
DpCnt int
FollowerRead bool
NeedToLowerReplica bool
Authenticate bool
CrossZone bool
DefaultPriority bool
DomainOn bool
CreateTime string
DeleteLockTime int64
EnableToken bool
EnablePosixAcl bool
EnableQuota bool
EnableTransaction string
TxTimeout int64
TxConflictRetryNum int64
TxConflictRetryInterval int64
TxOpLimit int
Description string
DpSelectorName string
DpSelectorParm string
DefaultZonePrior bool
DpReadOnlyWhenVolFull bool
VolType int
ObjBlockSize int
CacheCapacity uint64
CacheAction int
CacheThreshold int
CacheHighWater int
CacheLowWater int
CacheLruInterval int
CacheTtl int
CacheRule string
PreloadCapacity uint64
Uids []UidSimpleInfo
// multi version snapshot
LatestVer uint64
Forbidden bool
EnableAuditLog bool
}
type NodeSetInfo struct {
ID uint64
ZoneName string
Capacity int
DataUseRatio float64
MetaUseRatio float64
MetaUsed uint64
MetaTotal uint64
MetaNodes []*MetaNodeInfo
DataUsed uint64
DataTotal uint64
DataNodes []*DataNodeInfo
}
type SimpleNodeSetGrpInfo struct {
ID uint64
Status uint8
NodeSetInfo []NodeSetInfo
}
type SimpleNodeSetGrpInfoList struct {
DomainId uint64
Status uint8
SimpleNodeSetGrpInfo []*SimpleNodeSetGrpInfo
}
type DomainNodeSetGrpInfoList struct {
DomainOn bool
DataRatioLimit float64
ZoneExcludeRatioLimit float64
NeedDomain bool
ExcludeZones []string
DomainNodeSetGrpInfo []*SimpleNodeSetGrpInfoList
}
// MasterAPIAccessResp defines the response for getting meta partition
type MasterAPIAccessResp struct {
APIResp APIAccessResp `json:"api_resp"`
Data []byte `json:"data"`
}
type VolInfo struct {
Name string
Owner string
CreateTime int64
Status uint8
TotalSize uint64
UsedSize uint64
DpReadOnlyWhenVolFull bool
}
func NewVolInfo(name, owner string, createTime int64, status uint8, totalSize, usedSize uint64, dpReadOnlyWhenVolFull bool) *VolInfo {
return &VolInfo{
Name: name,
Owner: owner,
CreateTime: createTime,
Status: status,
TotalSize: totalSize,
UsedSize: usedSize,
DpReadOnlyWhenVolFull: dpReadOnlyWhenVolFull,
}
}
// ZoneView defines the view of a zone
type ZoneView struct {
Name string
Status string
DataNodesetSelector string
MetaNodesetSelector string
NodeSet map[uint64]*NodeSetView
}
type NodeSetView struct {
DataNodeLen int
MetaNodeLen int
MetaNodes []NodeView
DataNodes []NodeView
}
// TopologyView provides the topology view of the cluster
type TopologyView struct {
Zones []*ZoneView
}
const (
PartitionTypeNormal = 0
PartitionTypeCache = 1
PartitionTypePreLoad = 2
)
func GetDpType(volType int, isPreload bool) int {
if volType == VolumeTypeHot {
return PartitionTypeNormal
}
if isPreload {
return PartitionTypePreLoad
}
return PartitionTypeCache
}
func IsCacheDp(typ int) bool {
return typ == PartitionTypeCache
}
func IsNormalDp(typ int) bool {
return typ == PartitionTypeNormal
}
func IsPreLoadDp(typ int) bool {
return typ == PartitionTypePreLoad
}
const (
VolumeTypeHot = 0
VolumeTypeCold = 1
)
func IsCold(typ int) bool {
return typ == VolumeTypeCold
}
func IsHot(typ int) bool {
return typ == VolumeTypeHot
}
const (
NoCache = 0
RCache = 1
RWCache = 2
)
const (
LFClient = 1 // low frequency client
)
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"fmt"
"time"
)
const (
TaskFailed = 2
TaskStart = 0
TaskSucceeds = 1
TaskRunning = 3
ResponseInterval = 5
ResponseTimeOut = 100
MaxSendCount = 5
)
// AdminTask defines the administration task.
type AdminTask struct {
ID string
PartitionID uint64
OpCode uint8
OperatorAddr string
Status int8
SendTime int64
CreateTime int64
SendCount uint8
Request interface{}
Response interface{}
}
// ToString returns the string format of the task.
func (t *AdminTask) ToString() (msg string) {
msg = fmt.Sprintf("ID[%v] OpCode[%d] Status[%d] LastSendTime[%v] SendCount[%v] Request[%v] Response[%v]",
t.ID, t.OpCode, t.Status, t.SendTime, t.SendCount, t.Request, t.Response)
return
}
func (t *AdminTask) IdString() string {
return fmt.Sprintf("id:%s_sendTime_%d_createTime_%d", t.ID, t.SendTime, t.CreateTime)
}
// CheckTaskNeedSend checks if the task needs to be sent out.
func (t *AdminTask) CheckTaskNeedSend() (needRetry bool) {
if (int)(t.SendCount) < MaxSendCount && time.Now().Unix()-t.SendTime > (int64)(ResponseInterval) {
needRetry = true
}
return
}
// CheckTaskTimeOut checks if the task is timed out.
func (t *AdminTask) CheckTaskTimeOut() (notResponse bool) {
if (int)(t.SendCount) >= MaxSendCount || (t.SendTime > 0 && (time.Now().Unix()-t.SendTime > int64(ResponseTimeOut))) {
notResponse = true
}
return
}
// SetStatus sets the status of the task.
func (t *AdminTask) SetStatus(status int8) {
t.Status = status
}
// IsTaskSuccessful returns whether the task has been executed successfully.
func (t *AdminTask) IsTaskSuccessful() (isSuccess bool) {
if t.Status == TaskSucceeds {
isSuccess = true
}
return
}
// IsTaskFailed returns if the task failed.
func (t *AdminTask) IsTaskFailed() (isFail bool) {
if t.Status == TaskFailed {
isFail = true
}
return
}
// IsUrgentTask returns if the task is urgent.
func (t *AdminTask) IsUrgentTask() bool {
return t.isCreateTask() || t.isLoadTask() || t.isUpdateMetaPartitionTask()
}
// isUpdateMetaPartitionTask checks if the task is to update the meta partition.
func (t *AdminTask) isUpdateMetaPartitionTask() bool {
return t.OpCode == OpUpdateMetaPartition
}
func (t *AdminTask) isLoadTask() bool {
return t.OpCode == OpLoadDataPartition
}
func (t *AdminTask) isCreateTask() bool {
return t.OpCode == OpCreateDataPartition || t.OpCode == OpCreateMetaPartition
}
// IsHeartbeatTask returns if the task is a heartbeat task.
func (t *AdminTask) IsHeartbeatTask() bool {
return t.OpCode == OpDataNodeHeartbeat || t.OpCode == OpMetaNodeHeartbeat || t.OpCode == OpLcNodeHeartbeat
}
// NewAdminTask returns a new adminTask.
func NewAdminTask(opCode uint8, opAddr string, request interface{}) (t *AdminTask) {
t = new(AdminTask)
t.OpCode = opCode
t.Request = request
t.OperatorAddr = opAddr
t.ID = fmt.Sprintf("addr[%v]_op[%v]", t.OperatorAddr, t.OpCode)
t.CreateTime = time.Now().Unix()
return
}
// NewAdminTaskEx returns a new adminTask.
func NewAdminTaskEx(opCode uint8, opAddr string, request interface{}, reqID string) (t *AdminTask) {
t = new(AdminTask)
t.OpCode = opCode
t.Request = request
t.OperatorAddr = opAddr
t.ID = fmt.Sprintf("addr[%v]_op[%v]_reqID[%v]", t.OperatorAddr, t.OpCode, reqID)
t.CreateTime = time.Now().Unix()
return
}
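// Illustrative sketch, not part of the original API: how a caller might drive
// an AdminTask with the helpers above. The opcode and address are placeholders.
func exampleAdminTaskLifecycle() {
	t := NewAdminTask(OpDataNodeHeartbeat, "192.168.0.1:17310", nil)
	t.SetStatus(TaskStart)
	if t.CheckTaskNeedSend() {
		// send the task to t.OperatorAddr here (transport omitted in this sketch)
		t.SendCount++
		t.SendTime = time.Now().Unix()
	}
	if t.CheckTaskTimeOut() {
		t.SetStatus(TaskFailed)
	} else if t.IsTaskSuccessful() {
		_ = t.Response // the receiver has filled in the response
	}
}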
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"encoding/base64"
"encoding/binary"
"encoding/json"
"fmt"
"io"
"net/http"
"net/url"
"regexp"
"time"
"github.com/cubefs/cubefs/util/caps"
"github.com/cubefs/cubefs/util/cryptoutil"
"github.com/cubefs/cubefs/util/keystore"
)
// ServiceID defines the type of tickets
type ServiceID uint32
// MsgType defines the type of req/resp for message
type MsgType uint32
// Nonce defines the nonce to mitigate the replay attack
type Nonce uint64
const (
APIRsc = "API"
APIAccess = "access"
capSeparator = ":"
reqLiveLength = 10
ClientMessage = "Token"
OwnerVOLRsc = "OwnerVOL"
NoneOwnerVOLRsc = "NoneOwnerVOL"
VOLAccess = "*"
)
// api
const (
// Client APIs
ClientGetTicket = "/client/getticket"
// Admin APIs
AdminCreateKey = "/admin/createkey"
AdminDeleteKey = "/admin/deletekey"
AdminGetKey = "/admin/getkey"
AdminAddCaps = "/admin/addcaps"
AdminDeleteCaps = "/admin/deletecaps"
AdminGetCaps = "/admin/getcaps"
// raft node APIs
AdminAddRaftNode = "/admin/addraftnode"
AdminRemoveRaftNode = "/admin/removeraftnode"
// Object node APIs
OSAddCaps = "/os/addcaps"
OSDeleteCaps = "/os/deletecaps"
OSGetCaps = "/os/getcaps"
)
const (
// AuthServiceID defines ticket for authnode access (not supported)
AuthServiceID = "AuthService"
// MasterServiceID defines ticket for master access
MasterServiceID = "MasterService"
// MetaServiceID defines ticket for metanode access (not supported)
MetaServiceID = "MetanodeService"
// DataServiceID defines ticket for datanode access (not supported)
DataServiceID = "DatanodeService"
// ObjectServiceID defines ticket for objectnode access
ObjectServiceID = "ObjectService"
)
const (
MasterNode = "master"
MetaNode = "metanode"
DataNode = "datanode"
)
const (
// MsgAuthBase defines the starting value for auth messages
MsgAuthBase MsgType = 0x100000
// MsgAuthTicketReq request type for an auth ticket
MsgAuthTicketReq MsgType = MsgAuthBase + 0x10000
// MsgAuthTicketResp response type for an auth ticket
MsgAuthTicketResp MsgType = MsgAuthBase + 0x10001
// MsgMasterTicketReq request type for a master ticket
MsgMasterTicketReq MsgType = MsgAuthBase + 0x20000
// MsgMasterTicketResp response type for a master ticket
MsgMasterTicketResp MsgType = MsgAuthBase + 0x20001
// MsgMetaTicketReq request type for a metanode ticket
MsgMetaTicketReq MsgType = MsgAuthBase + 0x30000
// MsgMetaTicketResp response type for a metanode ticket
MsgMetaTicketResp MsgType = MsgAuthBase + 0x30001
// MsgDataTicketReq request type for a datanode ticket
MsgDataTicketReq MsgType = MsgAuthBase + 0x40000
// MsgDataTicketResp response type for a datanode ticket
MsgDataTicketResp MsgType = MsgAuthBase + 0x40001
// MsgAuthCreateKeyReq request type for authnode add key
MsgAuthCreateKeyReq MsgType = MsgAuthBase + 0x51000
// MsgAuthCreateKeyResp response type for authnode add key
MsgAuthCreateKeyResp MsgType = MsgAuthBase + 0x51001
// MsgAuthDeleteKeyReq request type for authnode delete key
MsgAuthDeleteKeyReq MsgType = MsgAuthBase + 0x52000
// MsgAuthDeleteKeyResp response type for authnode delete key
MsgAuthDeleteKeyResp MsgType = MsgAuthBase + 0x52001
// MsgAuthGetKeyReq request type for authnode get key info
MsgAuthGetKeyReq MsgType = MsgAuthBase + 0x53000
// MsgAuthGetKeyResp response type for authnode get key info
MsgAuthGetKeyResp MsgType = MsgAuthBase + 0x53001
// MsgAuthAddCapsReq request type for authnode add caps
MsgAuthAddCapsReq MsgType = MsgAuthBase + 0x54000
// MsgAuthAddCapsResp response type for authnode add caps
MsgAuthAddCapsResp MsgType = MsgAuthBase + 0x54001
// MsgAuthDeleteCapsReq request type for authnode delete caps
MsgAuthDeleteCapsReq MsgType = MsgAuthBase + 0x55000
// MsgAuthDeleteCapsResp response type for authnode delete caps
MsgAuthDeleteCapsResp MsgType = MsgAuthBase + 0x55001
// MsgAuthGetCapsReq request type for authnode get caps
MsgAuthGetCapsReq MsgType = MsgAuthBase + 0x56000
// MsgAuthGetCapsResp response type for authnode get caps
MsgAuthGetCapsResp MsgType = MsgAuthBase + 0x56001
// MsgAuthAddRaftNodeReq request type for authnode add node
MsgAuthAddRaftNodeReq MsgType = MsgAuthBase + 0x57000
// MsgAuthAddRaftNodeResp response type for authnode add node
MsgAuthAddRaftNodeResp MsgType = MsgAuthBase + 0x57001
// MsgAuthRemoveRaftNodeReq request type for authnode remove node
MsgAuthRemoveRaftNodeReq MsgType = MsgAuthBase + 0x58000
// MsgAuthRemoveRaftNodeResp response type for authnode remove node
MsgAuthRemoveRaftNodeResp MsgType = MsgAuthBase + 0x58001
// MsgAuthOSAddCapsReq request type from ObjectNode to add caps
MsgAuthOSAddCapsReq MsgType = MsgAuthBase + 0x61000
// MsgAuthOSAddCapsResp response type from ObjectNode to add caps
MsgAuthOSAddCapsResp MsgType = MsgAuthBase + 0x61001
// MsgAuthOSDeleteCapsReq request type from ObjectNode to delete caps
MsgAuthOSDeleteCapsReq MsgType = MsgAuthBase + 0x62000
// MsgAuthOSDeleteCapsResp response type from ObjectNode to delete caps
MsgAuthOSDeleteCapsResp MsgType = MsgAuthBase + 0x62001
// MsgAuthOSGetCapsReq request type from ObjectNode to get caps
MsgAuthOSGetCapsReq MsgType = MsgAuthBase + 0x63000
// MsgAuthOSGetCapsResp response type from ObjectNode to get caps
MsgAuthOSGetCapsResp MsgType = MsgAuthBase + 0x63001
// MsgMasterAPIAccessReq request type for master api access
MsgMasterAPIAccessReq MsgType = 0x60000
// MsgMasterAPIAccessResp response type for master api access
MsgMasterAPIAccessResp MsgType = 0x60001
// Master API ClientVol
MsgMasterFetchVolViewReq MsgType = MsgMasterAPIAccessReq + 0x10000
// Master API cluster management
MsgMasterClusterFreezeReq MsgType = MsgMasterAPIAccessReq + 0x20100
MsgMasterAddRaftNodeReq MsgType = MsgMasterAPIAccessReq + 0x20200
MsgMasterRemoveRaftNodeReq MsgType = MsgMasterAPIAccessReq + 0x20300
MsgMasterSetNodeInfoReq MsgType = MsgMasterAPIAccessReq + 0x20400
MsgMasterSetNodeRdOnlyReq MsgType = MsgMasterAPIAccessReq + 0x20500
MsgMasterAutoDecommissionReq MsgType = MsgMasterAPIAccessReq + 0x20600
// Master API volume management
MsgMasterCreateVolReq MsgType = MsgMasterAPIAccessReq + 0x30100
MsgMasterDeleteVolReq MsgType = MsgMasterAPIAccessReq + 0x30200
MsgMasterUpdateVolReq MsgType = MsgMasterAPIAccessReq + 0x30300
MsgMasterVolShrinkReq MsgType = MsgMasterAPIAccessReq + 0x30400
MsgMasterVolExpandReq MsgType = MsgMasterAPIAccessReq + 0x30500
// Master API meta partition management
MsgMasterLoadMetaPartitionReq MsgType = MsgMasterAPIAccessReq + 0x40100
MsgMasterDecommissionMetaPartitionReq MsgType = MsgMasterAPIAccessReq + 0x40200
MsgMasterChangeMetaPartitionLeaderReq MsgType = MsgMasterAPIAccessReq + 0x40300
MsgMasterCreateMetaPartitionReq MsgType = MsgMasterAPIAccessReq + 0x40400
MsgMasterAddMetaReplicaReq MsgType = MsgMasterAPIAccessReq + 0x40500
MsgMasterDeleteMetaReplicaReq MsgType = MsgMasterAPIAccessReq + 0x40600
MsgMasterQosUpdateReq MsgType = MsgMasterAPIAccessReq + 0x40700
MsgMasterQosUpdateZoneLimitReq MsgType = MsgMasterAPIAccessReq + 0x40800
MsgMasterQosUpdateMasterLimitReq MsgType = MsgMasterAPIAccessReq + 0x40900
MsgMasterQosUpdateClientParamReq MsgType = MsgMasterAPIAccessReq + 0x40a00
// Master API data partition management
MsgMasterCreateDataPartitionReq MsgType = MsgMasterAPIAccessReq + 0x50100
MsgMasterDataPartitionChangeLeaderReq MsgType = MsgMasterAPIAccessReq + 0x50200
MsgMasterLoadDataPartitionReq MsgType = MsgMasterAPIAccessReq + 0x50300
MsgMasterDecommissionDataPartitionReq MsgType = MsgMasterAPIAccessReq + 0x50400
MsgMasterAddDataReplicaReq MsgType = MsgMasterAPIAccessReq + 0x50500
MsgMasterDeleteDataReplicaReq MsgType = MsgMasterAPIAccessReq + 0x50600
MsgMasterSetDpRdOnlyReq MsgType = MsgMasterAPIAccessReq + 0x50700
MsgMasterReportLackDataPartitions MsgType = MsgMasterAPIAccessReq + 0x50800
// Master API meta node management
MsgMasterAddMetaNodeReq MsgType = MsgMasterAPIAccessReq + 0x60100
MsgMasterDecommissionMetaNodeReq MsgType = MsgMasterAPIAccessReq + 0x60200
MsgMasterMigrateMetaNodeReq MsgType = MsgMasterAPIAccessReq + 0x60300
MsgMasterSetMetaNodeThresholdReq MsgType = MsgMasterAPIAccessReq + 0x60400
MsgMasterUpdateMetaNodeReq MsgType = MsgMasterAPIAccessReq + 0x60500
// Master API data node management
MsgMasterAddDataNodeReq MsgType = MsgMasterAPIAccessReq + 0x70100
MsgMasterDecommissionDataNodeReq MsgType = MsgMasterAPIAccessReq + 0x70200
MsgMasterMigrateDataNodeReq MsgType = MsgMasterAPIAccessReq + 0x70300
MsgMasterCancelDecommissionDataNodeReq MsgType = MsgMasterAPIAccessReq + 0x70400
MsgMasterDecommissionDiskReq MsgType = MsgMasterAPIAccessReq + 0x70500
MsgMasterUpdateNodeSetCapcityReq MsgType = MsgMasterAPIAccessReq + 0x70600
MsgMasterUpdateNodeSetIdReq MsgType = MsgMasterAPIAccessReq + 0x70700
MsgMasterUpdateDomainDataUseRatioReq MsgType = MsgMasterAPIAccessReq + 0x70800
MsgMasterUpdateZoneExcludeRatioReq MsgType = MsgMasterAPIAccessReq + 0x70900
MsgMasterRecommissionDiskReq MsgType = MsgMasterAPIAccessReq + 0x70a00
// Master API user management
MsgMasterUserCreateReq MsgType = MsgMasterAPIAccessReq + 0x80100
MsgMasterUserDeleteReq MsgType = MsgMasterAPIAccessReq + 0x80200
MsgMasterUserUpdateReq MsgType = MsgMasterAPIAccessReq + 0x80300
MsgMasterUserUpdatePolicyReq MsgType = MsgMasterAPIAccessReq + 0x80400
MsgMasterUserRemovePolicyReq MsgType = MsgMasterAPIAccessReq + 0x80500
MsgMasterUserDeleteVolPolicyReq MsgType = MsgMasterAPIAccessReq + 0x80600
MsgMasterUserTransferVolReq MsgType = MsgMasterAPIAccessReq + 0x80700
// Master API zone management
MsgMasterUpdateZoneReq MsgType = MsgMasterAPIAccessReq + 0x90100
)
// HTTPAuthReply uniform response structure
type HTTPAuthReply = HTTPReply
// MsgType2ResourceMap defines the mapping from message type to resource
var MsgType2ResourceMap = map[MsgType]string{
MsgAuthCreateKeyReq: "auth:createkey",
MsgAuthDeleteKeyReq: "auth:deletekey",
MsgAuthGetKeyReq: "auth:getkey",
MsgAuthAddCapsReq: "auth:addcaps",
MsgAuthDeleteCapsReq: "auth:deletecaps",
MsgAuthGetCapsReq: "auth:getcaps",
MsgAuthAddRaftNodeReq: "auth:addnode",
MsgAuthRemoveRaftNodeReq: "auth:removenode",
MsgAuthOSAddCapsReq: "auth:osaddcaps",
MsgAuthOSDeleteCapsReq: "auth:osdeletecaps",
MsgAuthOSGetCapsReq: "auth:osgetcaps",
MsgMasterFetchVolViewReq: "master:getvol",
// Master API cluster management
MsgMasterClusterFreezeReq: "master:clusterfreeze",
MsgMasterAddRaftNodeReq: "master:addraftnode",
MsgMasterRemoveRaftNodeReq: "master:removeraftnode",
MsgMasterSetNodeInfoReq: "master:setnodeinfo",
MsgMasterSetNodeRdOnlyReq: "master:sernoderdonly",
MsgMasterAutoDecommissionReq: "master:autodecommission",
// Master API volume management
MsgMasterCreateVolReq: "master:createvol",
MsgMasterDeleteVolReq: "master:deletevol",
MsgMasterUpdateVolReq: "master:updatevol",
MsgMasterVolShrinkReq: "master:volshrink",
MsgMasterVolExpandReq: "master:volexpand",
// Master API meta partition management
MsgMasterLoadMetaPartitionReq: "master:loadmetapartition",
MsgMasterDecommissionMetaPartitionReq: "master:decommissionmetapartition",
MsgMasterChangeMetaPartitionLeaderReq: "master:changemetapartitionleader",
MsgMasterCreateMetaPartitionReq: "master:createmetapartition",
MsgMasterAddMetaReplicaReq: "master:addmetareplica",
MsgMasterDeleteMetaReplicaReq: "master:deletemetareplica",
MsgMasterQosUpdateReq: "master:qosupdate",
MsgMasterQosUpdateZoneLimitReq: "master:qosupdatezonelimit",
MsgMasterQosUpdateMasterLimitReq: "master:qosupdatemasterlimit",
MsgMasterQosUpdateClientParamReq: "master:qosupdateclientparam",
// Master API data partition management
MsgMasterCreateDataPartitionReq: "master:createdatapartition",
MsgMasterDataPartitionChangeLeaderReq: "master:changedatapartitionleader",
MsgMasterLoadDataPartitionReq: "master:loaddatapartition",
MsgMasterDecommissionDataPartitionReq: "master:decommissiondatapartition",
MsgMasterAddDataReplicaReq: "master:adddatareplica",
MsgMasterDeleteDataReplicaReq: "master:removedatareplica",
MsgMasterSetDpRdOnlyReq: "master:setdprdonly",
MsgMasterReportLackDataPartitions: "master:reportLackDataPartitions",
// Master API meta node management
MsgMasterAddMetaNodeReq: "master:addmetanode",
MsgMasterDecommissionMetaNodeReq: "master:decommissionmetanode",
MsgMasterMigrateMetaNodeReq: "master:migratemetanode",
MsgMasterSetMetaNodeThresholdReq: "master:setmetanodethreshold",
MsgMasterUpdateMetaNodeReq: "master:updatemetanode",
// Master API data node management
MsgMasterAddDataNodeReq: "master:adddatannode",
MsgMasterDecommissionDataNodeReq: "master:decommissiondatannode",
MsgMasterMigrateDataNodeReq: "master:migratedatannode",
MsgMasterCancelDecommissionDataNodeReq: "master:canceldecommissiondatannode",
MsgMasterDecommissionDiskReq: "master:decommissiondisk",
MsgMasterUpdateNodeSetCapcityReq: "master:updatenodesetcapcity",
MsgMasterUpdateNodeSetIdReq: "master:updatenodesetid",
MsgMasterUpdateDomainDataUseRatioReq: "master:updatedomaindatauseratio",
MsgMasterUpdateZoneExcludeRatioReq: "master:updatezoneexcluderatio",
MsgMasterRecommissionDiskReq: "master:recommissiondisk",
// Master API user management
MsgMasterUserCreateReq: "master:usercreate",
MsgMasterUserDeleteReq: "master:userdelete",
MsgMasterUserUpdateReq: "master:userupdate",
MsgMasterUserUpdatePolicyReq: "master:userupdatepolicy",
MsgMasterUserRemovePolicyReq: "master:userremotepolicy",
MsgMasterUserDeleteVolPolicyReq: "master:userdeletevolpolicy",
MsgMasterUserTransferVolReq: "master:usertransfervol",
// Master API zone management
MsgMasterUpdateZoneReq: "master:updatezone",
}
// AuthGetTicketReq defines the message from client to authnode
// use Timestamp as verifier for MITM mitigation
// verifier is also used to verify the server identity
type AuthGetTicketReq struct {
Type MsgType `json:"type"`
ClientID string `json:"client_id"`
ServiceID string `json:"service_id"`
Verifier string `json:"verifier"`
}
// AuthGetTicketResp defines the message from authnode to client
type AuthGetTicketResp struct {
Type MsgType `json:"type"`
ClientID string `json:"client_id"`
ServiceID string `json:"service_id"`
Verifier int64 `json:"verifier"`
Ticket string `json:"ticket"`
SessionKey cryptoutil.CryptoKey `json:"session_key"`
}
// APIAccessReq defines the request for access restful api
// use Timestamp as verifier for MITM mitigation
// verifier is also used to verify the server identity
type APIAccessReq struct {
Type MsgType `json:"type"`
ClientID string `json:"client_id"`
ServiceID string `json:"service_id"`
Verifier string `json:"verifier"`
Ticket string `json:"ticket"`
}
// APIAccessResp defines the response for access restful api
// use Timestamp as verifier for MITM mitigation
// verifier is also used to verify the server identity
type APIAccessResp struct {
Type MsgType `json:"type"`
ClientID string `json:"client_id"`
ServiceID string `json:"service_id"`
Verifier int64 `json:"verifier"`
}
// AuthAPIAccessReq defines Auth API request
type AuthAPIAccessReq struct {
APIReq APIAccessReq `json:"api_req"`
KeyInfo keystore.KeyInfo `json:"key_info"`
}
// AuthAPIAccessResp defines the response for creating a key in authnode
type AuthAPIAccessResp struct {
APIResp APIAccessResp `json:"api_resp"`
KeyInfo keystore.KeyInfo `json:"key_info"`
AuthIDKey string `json:"auth_id_key"`
}
// AuthRaftNodeInfo defines raft node information
type AuthRaftNodeInfo struct {
ID uint64 `json:"id"`
Addr string `json:"addr"`
}
// AuthRaftNodeReq defines Auth API request for add/remove a raft node
type AuthRaftNodeReq struct {
APIReq APIAccessReq `json:"api_req"`
RaftNodeInfo AuthRaftNodeInfo `json:"node_info"`
}
// AuthRaftNodeResp defines Auth API response for add/remove a raft node
type AuthRaftNodeResp struct {
APIResp APIAccessResp `json:"api_resp"`
Msg string `json:"msg"`
}
// AuthOSAccessKeyReq defines the Auth API request for put/delete access key caps
type AuthOSAccessKeyReq struct {
APIReq APIAccessReq `json:"api_req"`
AKCaps keystore.AccessKeyCaps `json:"access_key_caps"`
}
// AuthOSAccessKeyResp defines the Auth API response for put/delete access key caps
type AuthOSAccessKeyResp struct {
APIResp APIAccessResp `json:"api_resp"`
AKCaps keystore.AccessKeyCaps `json:"access_key_caps"`
}
// IsValidServiceID determines the validity of a serviceID
func IsValidServiceID(serviceID string) (err error) {
if serviceID != AuthServiceID && serviceID != MasterServiceID && serviceID != MetaServiceID && serviceID != DataServiceID {
err = fmt.Errorf("invalid service ID [%s]", serviceID)
return
}
return
}
// IsValidMsgReqType determines the validity of a message type
func IsValidMsgReqType(serviceID string, msgType MsgType) (err error) {
b := false
switch serviceID {
case AuthServiceID:
fallthrough
case MasterServiceID:
if msgType|MsgAuthBase != 0 {
b = true
}
default:
// do nothing
}
if !b {
err = fmt.Errorf("invalid request type [%x] and serviceID[%s]", msgType, serviceID)
return
}
return
}
// IsValidClientID determines the validity of a clientID
func IsValidClientID(id string) (err error) {
re := regexp.MustCompile("^[A-Za-z]{1,1}[A-Za-z0-9_]{0,20}$")
if !re.MatchString(id) {
err = fmt.Errorf("clientID invalid format [%s]", id)
return
}
return
}
// ParseAuthReply parses the response from the authnode
func ParseAuthReply(body []byte) (jobj HTTPAuthReply, err error) {
if err = json.Unmarshal(body, &jobj); err != nil {
return
}
if jobj.Code != 0 {
err = fmt.Errorf("%s", jobj.Msg)
return
}
return
}
// GetDataFromResp extracts data from the response
func GetDataFromResp(body []byte, key []byte) (plaintext []byte, err error) {
jobj, err := ParseAuthReply(body)
if err != nil {
return
}
data := fmt.Sprint(jobj.Data)
if plaintext, err = cryptoutil.DecodeMessage(data, key); err != nil {
return
}
return
}
// ParseAuthGetTicketResp parses and validates the auth get ticket resp
func ParseAuthGetTicketResp(body []byte, key []byte) (resp AuthGetTicketResp, err error) {
var plaintext []byte
if plaintext, err = GetDataFromResp(body, key); err != nil {
return
}
if err = json.Unmarshal(plaintext, &resp); err != nil {
return
}
return
}
// ParseAuthAPIAccessResp parses and validates the auth api access resp
func ParseAuthAPIAccessResp(body []byte, key []byte) (resp AuthAPIAccessResp, err error) {
var plaintext []byte
if plaintext, err = GetDataFromResp(body, key); err != nil {
return
}
if err = json.Unmarshal(plaintext, &resp); err != nil {
return
}
return
}
// ParseAuthRaftNodeResp parses and validates the auth raft node resp
func ParseAuthRaftNodeResp(body []byte, key []byte) (resp AuthRaftNodeResp, err error) {
var plaintext []byte
if plaintext, err = GetDataFromResp(body, key); err != nil {
return
}
if err = json.Unmarshal(plaintext, &resp); err != nil {
return
}
return
}
func ParseAuthOSAKResp(body []byte, key []byte) (resp AuthOSAccessKeyResp, err error) {
var plaintext []byte
if plaintext, err = GetDataFromResp(body, key); err != nil {
return
}
if err = json.Unmarshal(plaintext, &resp); err != nil {
return
}
return
}
func ExtractTicket(str string, key []byte) (ticket cryptoutil.Ticket, err error) {
var plaintext []byte
if plaintext, err = cryptoutil.DecodeMessage(str, key); err != nil {
return
}
if err = json.Unmarshal(plaintext, &ticket); err != nil {
return
}
return
}
func checkTicketCaps(ticket *cryptoutil.Ticket, kind string, cap string) (err error) {
c := new(caps.Caps)
if err = c.Init(ticket.Caps); err != nil {
return
}
if b := c.ContainCaps(kind, cap); !b {
err = fmt.Errorf("no permission to access %v", kind)
return
}
return
}
// ParseVerifier checks the verifier structure for replay attack mitigation
func ParseVerifier(verifier string, key []byte) (ts int64, err error) {
var plaintext []byte
if plaintext, err = cryptoutil.DecodeMessage(verifier, key); err != nil {
return
}
ts = int64(binary.LittleEndian.Uint64(plaintext))
if time.Now().Unix()-ts >= reqLiveLength { // mitigate replay attack
err = fmt.Errorf("req verifier is timeout [%d] >= [%d]", time.Now().Unix()-ts, reqLiveLength)
return
}
return
}
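// Illustrative sketch, not part of the original API: the verifier is an 8-byte
// little-endian Unix timestamp; after decryption it is accepted only while it
// is younger than reqLiveLength seconds, which is what bounds replay attacks.
func exampleVerifierIsFresh(decrypted []byte) bool {
	ts := int64(binary.LittleEndian.Uint64(decrypted))
	return time.Now().Unix()-ts < reqLiveLength
}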
// VerifyAPIAccessReqIDs verifies the request IDs
func VerifyAPIAccessReqIDs(req *APIAccessReq) (err error) {
if err = IsValidClientID(req.ClientID); err != nil {
err = fmt.Errorf("IsValidClientID failed: %s", err.Error())
return
}
if err = IsValidServiceID(req.ServiceID); err != nil {
err = fmt.Errorf("IsValidServiceID failed: %s", err.Error())
return
}
if err = IsValidMsgReqType(req.ServiceID, req.Type); err != nil {
err = fmt.Errorf("IsValidMsgReqType failed: %s", err.Error())
return
}
return
}
// ExtractAPIAccessTicket extracts the ticket and verifies its validity
func ExtractAPIAccessTicket(req *APIAccessReq, key []byte) (ticket cryptoutil.Ticket, ts int64, err error) {
if ticket, err = ExtractTicket(req.Ticket, key); err != nil {
err = fmt.Errorf("extractTicket failed: %s", err.Error())
return
}
if time.Now().Unix() >= ticket.Exp {
err = ErrExpiredTicket
return
}
if ts, err = ParseVerifier(req.Verifier, ticket.SessionKey.Key); err != nil {
err = fmt.Errorf("parseVerifier failed: %s", err.Error())
return
}
return
}
// CheckAPIAccessCaps checks capability
func CheckAPIAccessCaps(ticket *cryptoutil.Ticket, rscType string, mp MsgType, action string) (err error) {
if _, ok := MsgType2ResourceMap[mp]; !ok {
err = fmt.Errorf("MsgType2ResourceMap key not found [%d]", mp)
return
}
rule := MsgType2ResourceMap[mp] + capSeparator + action
if err = checkTicketCaps(ticket, rscType, rule); err != nil {
err = fmt.Errorf("checkTicketCaps failed: %s", err.Error())
return
}
return
}
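// Illustrative sketch, not part of the original API: the order of server-side
// checks for an authenticated API request. APIRsc and APIAccess are used here
// as an assumed resource type and action; a real service supplies its own.
func exampleVerifyAPIAccess(req *APIAccessReq, serviceKey []byte) error {
	if err := VerifyAPIAccessReqIDs(req); err != nil {
		return err
	}
	ticket, _, err := ExtractAPIAccessTicket(req, serviceKey)
	if err != nil {
		return err
	}
	return CheckAPIAccessCaps(&ticket, APIRsc, req.Type, APIAccess)
}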
func GenAuthIDKey(id string, authKey []byte) (authIDKey string, err error) {
type AuthIDKey struct {
ID string `json:"id"`
AuthKey []byte `json:"auth_key"`
}
tmpAuthIDKey := AuthIDKey{
ID: id,
AuthKey: authKey,
}
var jAuthIDKey []byte
if jAuthIDKey, err = json.Marshal(tmpAuthIDKey); err != nil {
err = fmt.Errorf("json marshal authIDKey failed %s", err.Error())
return
}
authIDKey = cryptoutil.Base64Encode(jAuthIDKey)
return
}
func ExtractIDAndAuthKey(authIDKey string) (id string, authKey []byte, err error) {
type AuthIDKey struct {
ID string `json:"id"`
AuthKey string `json:"auth_key"`
}
var jAuthIDKey []byte
jAuthIDKey, err = cryptoutil.Base64Decode(authIDKey)
if err != nil {
err = fmt.Errorf("decode authIDKey failed %s", err.Error())
return
}
tmpAuthIDKey := &AuthIDKey{}
if err = json.Unmarshal(jAuthIDKey, &tmpAuthIDKey); err != nil {
err = fmt.Errorf("json unmarshal authIDKey failed %s", err.Error())
return
}
id = tmpAuthIDKey.ID
authKey = []byte(tmpAuthIDKey.AuthKey)
return
}
func CheckVOLAccessCaps(ticket *cryptoutil.Ticket, volName string, action string, accessNode string) (err error) {
rule := accessNode + capSeparator + volName + capSeparator + action
if err = checkTicketCaps(ticket, OwnerVOLRsc, rule); err != nil {
if err = checkTicketCaps(ticket, NoneOwnerVOLRsc, rule); err != nil {
err = fmt.Errorf("checkTicketCaps failed: %s", err.Error())
return
}
}
return
}
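// Illustrative sketch, not part of the original API: the volume capability
// rule checked above is "<accessNode>:<volName>:<action>", tried first under
// the owner-volume resource and then under the non-owner one.
func exampleVolCapRule(accessNode, volName, action string) string {
	return accessNode + capSeparator + volName + capSeparator + action
}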
// VerifyAPIRespComm verifies, on the client side, the common attributes returned from the server
func VerifyAPIRespComm(apiResp *APIAccessResp, msg MsgType, clientID string, serviceID string, ts int64) (err error) {
if ts+1 != apiResp.Verifier {
err = fmt.Errorf("verifier verification failed")
return
}
if apiResp.Type != msg+1 {
err = fmt.Errorf("msg verification failed")
return
}
if apiResp.ClientID != clientID {
err = fmt.Errorf("id verification failed")
return
}
if apiResp.ServiceID != serviceID {
err = fmt.Errorf("service id verification failed")
return
}
return
}
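// Added note: the checks above rely on two conventions defined earlier in this
// package: every *Resp message type equals its *Req type plus one (for example
// MsgAuthTicketResp = MsgAuthTicketReq + 1), and the server echoes the
// client's verifier timestamp incremented by one.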
// VerifyTicketRespComm verifies the ticket response from the server
func VerifyTicketRespComm(ticketResp *AuthGetTicketResp, msg MsgType, clientID string, serviceID string, ts int64) (err error) {
if ts+1 != ticketResp.Verifier {
err = fmt.Errorf("verifier verification failed")
return
}
if ticketResp.Type != msg+1 {
err = fmt.Errorf("msg verification failed")
return
}
if ticketResp.ClientID != clientID {
err = fmt.Errorf("id verification failed")
return
}
if ticketResp.ServiceID != serviceID {
err = fmt.Errorf("service id verification failed")
return
}
return
}
// SendBytes sends raw bytes to the target over the http/https protocol
func SendBytes(client *http.Client, target string, data []byte) (res []byte, err error) {
message := base64.StdEncoding.EncodeToString(data)
resp, err := client.PostForm(target, url.Values{ClientMessage: {message}})
if err != nil {
err = fmt.Errorf("action[SendData] failed:" + err.Error())
return
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
err = fmt.Errorf("action[doRealSend] failed:" + err.Error())
return
}
res = body
return
}
// SendData sends data to target
func SendData(client *http.Client, target string, data interface{}) (res []byte, err error) {
messageJSON, err := json.Marshal(data)
if err != nil {
err = fmt.Errorf("action[doRealSend] failed:" + err.Error())
return
}
if res, err = SendBytes(client, target, messageJSON); err != nil {
return
}
return
}
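// Illustrative sketch, not part of the original API: requesting a ticket from
// an authnode with SendData. The host is a placeholder, and the key passed to
// ParseAuthGetTicketResp is assumed to be the client's secret key.
func exampleRequestTicket(client *http.Client, req *AuthGetTicketReq, key []byte) (AuthGetTicketResp, error) {
	body, err := SendData(client, "http://authnode.example:8080"+ClientGetTicket, req)
	if err != nil {
		return AuthGetTicketResp{}, err
	}
	return ParseAuthGetTicketResp(body, key)
}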
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import "github.com/cubefs/cubefs/util/errors"
// err
var (
ErrSuc = errors.New("success")
ErrInternalError = errors.New("internal error")
ErrParamError = errors.New("parameter error")
ErrInvalidCfg = errors.New("bad configuration file")
ErrPersistenceByRaft = errors.New("persistence by raft occurred error")
ErrMarshalData = errors.New("marshal data error")
ErrUnmarshalData = errors.New("unmarshal data error")
ErrVolNotExists = errors.New("vol not exists")
ErrMetaPartitionNotExists = errors.New("meta partition not exists")
ErrDataPartitionNotExists = errors.New("data partition not exists")
ErrDataNodeNotExists = errors.New("data node not exists")
ErrMetaNodeNotExists = errors.New("meta node not exists")
ErrDuplicateVol = errors.New("duplicate vol")
ErrActiveDataNodesTooLess = errors.New("no enough active data node")
ErrActiveMetaNodesTooLess = errors.New("no enough active meta node")
ErrInvalidMpStart = errors.New("invalid meta partition start value")
ErrNoAvailDataPartition = errors.New("no available data partition")
ErrReshuffleArray = errors.New("the array to be reshuffled is nil")
ErrIllegalDataReplica = errors.New("data replica is illegal")
ErrMissingReplica = errors.New("a missing data replica is found")
ErrHasOneMissingReplica = errors.New("there is a missing replica")
ErrNoDataNodeToWrite = errors.New("No data node available for creating a data partition")
ErrNoMetaNodeToWrite = errors.New("No meta node available for creating a meta partition")
ErrCannotBeOffLine = errors.New("cannot take the data replica offline")
ErrNoDataNodeToCreateDataPartition = errors.New("no enough data nodes for creating a data partition")
ErrNoZoneToCreateDataPartition = errors.New("no zone available for creating a data partition")
ErrNoZoneToCreateMetaPartition = errors.New("no zone available for creating a meta partition")
ErrNoNodeSetToCreateDataPartition = errors.New("no node set available for creating a data partition")
ErrNoNodeSetToCreateMetaPartition = errors.New("no node set available for creating a meta partition")
ErrNoMetaNodeToCreateMetaPartition = errors.New("no enough meta nodes for creating a meta partition")
ErrIllegalMetaReplica = errors.New("illegal meta replica")
ErrNoEnoughReplica = errors.New("no enough replicas")
ErrNoLeader = errors.New("no leader")
ErrVolAuthKeyNotMatch = errors.New("client and server auth key do not match")
ErrAuthKeyStoreError = errors.New("auth keystore error")
ErrAuthAPIAccessGenRespError = errors.New("auth API access response error")
ErrAuthOSCapsOpGenRespError = errors.New("auth Object Storage node API response error")
ErrKeyNotExists = errors.New("key not exists")
ErrDuplicateKey = errors.New("duplicate key")
ErrAccessKeyNotExists = errors.New("access key not exists")
ErrInvalidTicket = errors.New("invalid ticket")
ErrInvalidClientIDKey = errors.New("invalid clientIDKey")
ErrExpiredTicket = errors.New("expired ticket")
ErrMasterAPIGenRespError = errors.New("master API generate response error")
ErrDuplicateUserID = errors.New("duplicate user id")
ErrUserNotExists = errors.New("user not exists")
ErrReadBodyError = errors.New("read request body failed")
ErrVolPolicyNotExists = errors.New("vol policy not exists")
ErrDuplicateAccessKey = errors.New("duplicate access key")
ErrHaveNoPolicy = errors.New("no vol policy")
ErrZoneNotExists = errors.New("zone not exists")
ErrOwnVolExists = errors.New("own vols not empty")
ErrSuperAdminExists = errors.New("super administrator exists ")
ErrInvalidUserID = errors.New("invalid user ID")
ErrInvalidUserType = errors.New("invalid user type")
ErrNoPermission = errors.New("no permission")
ErrTokenNotFound = errors.New("token not found")
ErrInvalidAccessKey = errors.New("invalid access key")
ErrInvalidSecretKey = errors.New("invalid secret key")
ErrIsOwner = errors.New("user owns the volume")
ErrZoneNum = errors.New("zone num not qualified")
ErrNoNodeSetToUpdateDecommissionLimit = errors.New("no node set available for updating decommission limit")
ErrNoNodeSetToQueryDecommissionLimitStatus = errors.New("no node set available for query decommission limit status")
ErrNoNodeSetToDecommission = errors.New("no node set available to decommission ")
ErrVolNoAvailableSpace = errors.New("vol has no available space")
ErrVolNoCacheAndRule = errors.New("vol has no cache and rule")
ErrNoAclPermission = errors.New("acl no permission")
ErrQuotaNotExists = errors.New("quota not exists")
ErrCodeVersionOp = errors.New("version op failed")
ErrNoSuchLifecycleConfiguration = errors.New("The lifecycle configuration does not exist")
ErrNoNodeSetToUpdateDecommissionDiskFactor = errors.New("no node set available for updating decommission disk factor")
ErrNoNodeSetToQueryDecommissionDiskLimit = errors.New("no node set available for query decommission disk limit")
ErrNodeSetNotExists = errors.New("node set not exists")
ErrCompressFailed = errors.New("compress data failed")
ErrDecompressFailed = errors.New("decompress data failed")
)
// http response error code and error message definitions
const (
ErrCodeSuccess = iota
ErrCodeInternalError
ErrCodeParamError
ErrCodeInvalidCfg
ErrCodePersistenceByRaft
ErrCodeMarshalData
ErrCodeUnmarshalData
ErrCodeVolNotExists
ErrCodeMetaPartitionNotExists
ErrCodeDataPartitionNotExists
ErrCodeDataNodeNotExists
ErrCodeMetaNodeNotExists
ErrCodeDuplicateVol
ErrCodeActiveDataNodesTooLess
ErrCodeActiveMetaNodesTooLess
ErrCodeInvalidMpStart
ErrCodeNoAvailDataPartition
ErrCodeReshuffleArray
ErrCodeIllegalDataReplica
ErrCodeMissingReplica
ErrCodeHasOneMissingReplica
ErrCodeNoDataNodeToWrite
ErrCodeNoMetaNodeToWrite
ErrCodeCannotBeOffLine
ErrCodeNoDataNodeToCreateDataPartition
ErrCodeNoZoneToCreateDataPartition
ErrCodeNoNodeSetToCreateDataPartition
ErrCodeNoNodeSetToCreateMetaPartition
ErrCodeNoMetaNodeToCreateMetaPartition
ErrCodeIllegalMetaReplica
ErrCodeNoEnoughReplica
ErrCodeNoLeader
ErrCodeVolAuthKeyNotMatch
ErrCodeAuthKeyStoreError
ErrCodeAuthAPIAccessGenRespError
ErrCodeAuthRaftNodeGenRespError
ErrCodeAuthOSCapsOpGenRespError
ErrCodeAuthReqRedirectError
ErrCodeAccessKeyNotExists
ErrCodeInvalidTicket
ErrCodeInvalidClientIDKey
ErrCodeExpiredTicket
ErrCodeMasterAPIGenRespError
ErrCodeDuplicateUserID
ErrCodeUserNotExists
ErrCodeReadBodyError
ErrCodeVolPolicyNotExists
ErrCodeDuplicateAccessKey
ErrCodeHaveNoPolicy
ErrCodeNoZoneToCreateMetaPartition
ErrCodeZoneNotExists
ErrCodeOwnVolExists
ErrCodeSuperAdminExists
ErrCodeInvalidUserID
ErrCodeInvalidUserType
ErrCodeNoPermission
ErrCodeTokenNotExist
ErrCodeInvalidAccessKey
ErrCodeInvalidSecretKey
ErrCodeIsOwner
ErrCodeZoneNumError
ErrCodeVersionOpError
ErrCodeNodeSetNotExists
)
// Err2CodeMap maps an error to its response code
var Err2CodeMap = map[error]int32{
ErrSuc: ErrCodeSuccess,
ErrInternalError: ErrCodeInternalError,
ErrParamError: ErrCodeParamError,
ErrInvalidCfg: ErrCodeInvalidCfg,
ErrPersistenceByRaft: ErrCodePersistenceByRaft,
ErrMarshalData: ErrCodeMarshalData,
ErrUnmarshalData: ErrCodeUnmarshalData,
ErrVolNotExists: ErrCodeVolNotExists,
ErrMetaPartitionNotExists: ErrCodeMetaPartitionNotExists,
ErrDataPartitionNotExists: ErrCodeDataPartitionNotExists,
ErrDataNodeNotExists: ErrCodeDataNodeNotExists,
ErrMetaNodeNotExists: ErrCodeMetaNodeNotExists,
ErrDuplicateVol: ErrCodeDuplicateVol,
ErrActiveDataNodesTooLess: ErrCodeActiveDataNodesTooLess,
ErrActiveMetaNodesTooLess: ErrCodeActiveMetaNodesTooLess,
ErrInvalidMpStart: ErrCodeInvalidMpStart,
ErrNoAvailDataPartition: ErrCodeNoAvailDataPartition,
ErrReshuffleArray: ErrCodeReshuffleArray,
ErrIllegalDataReplica: ErrCodeIllegalDataReplica,
ErrMissingReplica: ErrCodeMissingReplica,
ErrHasOneMissingReplica: ErrCodeHasOneMissingReplica,
ErrNoDataNodeToWrite: ErrCodeNoDataNodeToWrite,
ErrNoMetaNodeToWrite: ErrCodeNoMetaNodeToWrite,
ErrCannotBeOffLine: ErrCodeCannotBeOffLine,
ErrNoDataNodeToCreateDataPartition: ErrCodeNoDataNodeToCreateDataPartition,
ErrNoZoneToCreateDataPartition: ErrCodeNoZoneToCreateDataPartition,
ErrNoZoneToCreateMetaPartition: ErrCodeNoZoneToCreateMetaPartition,
ErrNoNodeSetToCreateDataPartition: ErrCodeNoNodeSetToCreateDataPartition,
ErrNoNodeSetToCreateMetaPartition: ErrCodeNoNodeSetToCreateMetaPartition,
ErrNoMetaNodeToCreateMetaPartition: ErrCodeNoMetaNodeToCreateMetaPartition,
ErrIllegalMetaReplica: ErrCodeIllegalMetaReplica,
ErrNoEnoughReplica: ErrCodeNoEnoughReplica,
ErrNoLeader: ErrCodeNoLeader,
ErrVolAuthKeyNotMatch: ErrCodeVolAuthKeyNotMatch,
ErrAuthKeyStoreError: ErrCodeAuthKeyStoreError,
ErrAuthAPIAccessGenRespError: ErrCodeAuthAPIAccessGenRespError,
ErrAuthOSCapsOpGenRespError: ErrCodeAuthOSCapsOpGenRespError,
ErrAccessKeyNotExists: ErrCodeAccessKeyNotExists,
ErrInvalidTicket: ErrCodeInvalidTicket,
ErrInvalidClientIDKey: ErrCodeInvalidClientIDKey,
ErrExpiredTicket: ErrCodeExpiredTicket,
ErrMasterAPIGenRespError: ErrCodeMasterAPIGenRespError,
ErrDuplicateUserID: ErrCodeDuplicateUserID,
ErrUserNotExists: ErrCodeUserNotExists,
ErrReadBodyError: ErrCodeReadBodyError,
ErrVolPolicyNotExists: ErrCodeVolPolicyNotExists,
ErrDuplicateAccessKey: ErrCodeDuplicateAccessKey,
ErrHaveNoPolicy: ErrCodeHaveNoPolicy,
ErrZoneNotExists: ErrCodeZoneNotExists,
ErrOwnVolExists: ErrCodeOwnVolExists,
ErrSuperAdminExists: ErrCodeSuperAdminExists,
ErrInvalidUserID: ErrCodeInvalidUserID,
ErrInvalidUserType: ErrCodeInvalidUserType,
ErrNoPermission: ErrCodeNoPermission,
ErrTokenNotFound: ErrCodeTokenNotExist,
ErrInvalidAccessKey: ErrCodeInvalidAccessKey,
ErrInvalidSecretKey: ErrCodeInvalidSecretKey,
ErrIsOwner: ErrCodeIsOwner,
ErrZoneNum: ErrCodeZoneNumError,
ErrCodeVersionOp: ErrCodeVersionOpError,
ErrNodeSetNotExists: ErrCodeNodeSetNotExists,
}
func ParseErrorCode(code int32) error {
if err, exist := code2ErrMap[code]; exist {
return err
}
return ErrInternalError
}
// code2ErrMap maps a response code back to its error
var code2ErrMap = map[int32]error{
ErrCodeSuccess: ErrSuc,
ErrCodeInternalError: ErrInternalError,
ErrCodeParamError: ErrParamError,
ErrCodeInvalidCfg: ErrInvalidCfg,
ErrCodePersistenceByRaft: ErrPersistenceByRaft,
ErrCodeMarshalData: ErrMarshalData,
ErrCodeUnmarshalData: ErrUnmarshalData,
ErrCodeVolNotExists: ErrVolNotExists,
ErrCodeMetaPartitionNotExists: ErrMetaPartitionNotExists,
ErrCodeDataPartitionNotExists: ErrDataPartitionNotExists,
ErrCodeDataNodeNotExists: ErrDataNodeNotExists,
ErrCodeMetaNodeNotExists: ErrMetaNodeNotExists,
ErrCodeDuplicateVol: ErrDuplicateVol,
ErrCodeActiveDataNodesTooLess: ErrActiveDataNodesTooLess,
ErrCodeActiveMetaNodesTooLess: ErrActiveMetaNodesTooLess,
ErrCodeInvalidMpStart: ErrInvalidMpStart,
ErrCodeNoAvailDataPartition: ErrNoAvailDataPartition,
ErrCodeReshuffleArray: ErrReshuffleArray,
ErrCodeIllegalDataReplica: ErrIllegalDataReplica,
ErrCodeMissingReplica: ErrMissingReplica,
ErrCodeHasOneMissingReplica: ErrHasOneMissingReplica,
ErrCodeNoDataNodeToWrite: ErrNoDataNodeToWrite,
ErrCodeNoMetaNodeToWrite: ErrNoMetaNodeToWrite,
ErrCodeCannotBeOffLine: ErrCannotBeOffLine,
ErrCodeNoDataNodeToCreateDataPartition: ErrNoDataNodeToCreateDataPartition,
ErrCodeNoZoneToCreateDataPartition: ErrNoZoneToCreateDataPartition,
ErrCodeNoZoneToCreateMetaPartition: ErrNoZoneToCreateMetaPartition,
ErrCodeNoNodeSetToCreateDataPartition: ErrNoNodeSetToCreateDataPartition,
ErrCodeNoNodeSetToCreateMetaPartition: ErrNoNodeSetToCreateMetaPartition,
ErrCodeNoMetaNodeToCreateMetaPartition: ErrNoMetaNodeToCreateMetaPartition,
ErrCodeIllegalMetaReplica: ErrIllegalMetaReplica,
ErrCodeNoEnoughReplica: ErrNoEnoughReplica,
ErrCodeNoLeader: ErrNoLeader,
ErrCodeVolAuthKeyNotMatch: ErrVolAuthKeyNotMatch,
ErrCodeAuthKeyStoreError: ErrAuthKeyStoreError,
ErrCodeAuthAPIAccessGenRespError: ErrAuthAPIAccessGenRespError,
ErrCodeAuthOSCapsOpGenRespError: ErrAuthOSCapsOpGenRespError,
ErrCodeAccessKeyNotExists: ErrAccessKeyNotExists,
ErrCodeInvalidTicket: ErrInvalidTicket,
ErrCodeInvalidClientIDKey: ErrInvalidClientIDKey,
ErrCodeExpiredTicket: ErrExpiredTicket,
ErrCodeMasterAPIGenRespError: ErrMasterAPIGenRespError,
ErrCodeDuplicateUserID: ErrDuplicateUserID,
ErrCodeUserNotExists: ErrUserNotExists,
ErrCodeReadBodyError: ErrReadBodyError,
ErrCodeVolPolicyNotExists: ErrVolPolicyNotExists,
ErrCodeDuplicateAccessKey: ErrDuplicateAccessKey,
ErrCodeHaveNoPolicy: ErrHaveNoPolicy,
ErrCodeZoneNotExists: ErrZoneNotExists,
ErrCodeOwnVolExists: ErrOwnVolExists,
ErrCodeSuperAdminExists: ErrSuperAdminExists,
ErrCodeInvalidUserType: ErrInvalidUserType,
ErrCodeInvalidUserID: ErrInvalidUserID,
ErrCodeNoPermission: ErrNoPermission,
ErrCodeTokenNotExist: ErrTokenNotFound,
ErrCodeInvalidAccessKey: ErrInvalidAccessKey,
ErrCodeInvalidSecretKey: ErrInvalidSecretKey,
ErrCodeIsOwner: ErrIsOwner,
ErrCodeZoneNumError: ErrZoneNum,
ErrCodeVersionOpError: ErrCodeVersionOp,
ErrCodeNodeSetNotExists: ErrNodeSetNotExists,
}
type GeneralResp struct {
Message string
Code int32
}
func Success(msg string) *GeneralResp {
return &GeneralResp{Message: msg, Code: ErrCodeSuccess}
}
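// Illustrative sketch, not part of the original API: errors cross the wire as
// int32 codes; Err2CodeMap and ParseErrorCode round-trip a known error, and an
// unknown code falls back to ErrInternalError.
func exampleErrCodeRoundTrip() bool {
	code := Err2CodeMap[ErrVolNotExists] // ErrCodeVolNotExists
	return ParseErrorCode(code) == ErrVolNotExists && ParseErrorCode(-1) == ErrInternalError
}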
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"hash/crc32"
"math"
"sync"
"github.com/cubefs/cubefs/util/btree"
"github.com/cubefs/cubefs/util/log"
)
var (
ExtentKeyHeader = []byte("EKV2")
ExtentKeyHeaderV3 = []byte("EKV3")
ExtentKeyHeaderSize = len(ExtentKeyHeader)
ExtentLength = 40
ExtentKeyChecksumSize = 4
ExtentVerFieldSize = 9 // ver(8) and isSplit(1)
ExtentV2Length = ExtentKeyHeaderSize + ExtentLength + ExtentKeyChecksumSize
ExtentV3Length = ExtentKeyHeaderSize + ExtentLength + ExtentKeyChecksumSize + ExtentVerFieldSize
InvalidKey = errors.New("invalid key error")
InvalidKeyHeader = errors.New("invalid extent v2 key header error")
InvalidKeyCheckSum = errors.New("invalid extent v2 key checksum error")
)
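// Layout note (added for clarity): the fixed 40-byte extent key body is
// FileOffset(8) + PartitionId(8) + ExtentId(8) + ExtentOffset(8) + Size(4) + CRC(4).
// V2 framing adds the 4-byte magic header plus a 4-byte CRC32 checksum, and V3
// further appends VerSeq(8) and the isSplit flag(1), matching ExtentV2Length
// and ExtentV3Length above.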
type ExtSnapInfo struct {
VerSeq uint64
IsSplit bool
ModGen uint64
}
// ExtentKey defines the extent key struct.
type ExtentKey struct {
FileOffset uint64 // offset in file
PartitionId uint64
ExtentId uint64
ExtentOffset uint64 // offset within the extent; tiny extents may start at an offset larger than 0, normal extents start at 0
Size uint32 // size the inode actually uses on this extent; it may be only part of the extent's real size (e.g. for tiny extents)
CRC uint32
// snapshot
SnapInfo *ExtSnapInfo
}
func (k *ExtentKey) GetModGen() uint64 {
if k.SnapInfo == nil {
return 0
}
return k.SnapInfo.ModGen
}
func (k *ExtentKey) AddModGen() {
if k.SnapInfo == nil {
k.SnapInfo = &ExtSnapInfo{
ModGen: 1,
}
return
}
k.SnapInfo.ModGen++
}
func (k *ExtentKey) Equals(ek *ExtentKey) bool {
if k == nil && ek == nil {
return true
} else if k == nil || ek == nil {
return false
}
if k.PartitionId != ek.PartitionId ||
k.Size != ek.Size ||
k.ExtentOffset != ek.ExtentOffset ||
k.FileOffset != ek.FileOffset ||
k.ExtentId != ek.ExtentId ||
k.CRC != ek.CRC {
return false
}
if k.SnapInfo == nil && ek.SnapInfo == nil {
return true
} else if k.SnapInfo == nil || ek.SnapInfo == nil {
return false
}
return k.SnapInfo.IsSplit == ek.SnapInfo.IsSplit &&
k.SnapInfo.VerSeq == ek.SnapInfo.VerSeq
}
func (k *ExtentKey) IsSplit() bool {
if k.SnapInfo == nil {
return false
}
return k.SnapInfo.IsSplit
}
func (k *ExtentKey) GetSeq() uint64 {
if k.SnapInfo == nil {
return 0
}
return k.SnapInfo.VerSeq
}
func (k *ExtentKey) SetSeq(seq uint64) {
if seq == 0 && k.SnapInfo == nil {
return
}
if k.SnapInfo == nil {
k.SnapInfo = &ExtSnapInfo{VerSeq: seq}
return
}
k.SnapInfo.VerSeq = seq
}
func (k *ExtentKey) SetSplit(split bool) {
if !split && k.SnapInfo == nil {
return
}
if k.SnapInfo == nil {
k.SnapInfo = &ExtSnapInfo{
IsSplit: split,
}
return
}
k.SnapInfo.IsSplit = split
}
func (k *ExtentKey) GenerateId() uint64 {
if k.PartitionId > math.MaxUint32 || k.ExtentId > math.MaxUint32 {
log.LogFatalf("ext %v abnormal", k)
}
return (k.PartitionId << 32) | k.ExtentId
}
func ParseFromId(sID uint64) (dpID uint64, extID uint64) {
dpID = sID >> 32
extID = sID & math.MaxUint32
return
}
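// Illustrative sketch, not part of the original API: GenerateId packs the
// partition ID into the high 32 bits and the extent ID into the low 32 bits,
// so ParseFromId recovers both as long as each ID fits in 32 bits.
func exampleExtentIdRoundTrip(k *ExtentKey) bool {
	dpID, extID := ParseFromId(k.GenerateId())
	return dpID == k.PartitionId && extID == k.ExtentId
}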
func MergeSplitKey(inodeID uint64, ekRefMap *sync.Map, sMap *sync.Map) (err error) {
if ekRefMap == nil || sMap == nil {
log.LogErrorf("MergeSplitKey. inodeID %v should not use nil role", inodeID)
return
}
sMap.Range(func(id, value interface{}) bool {
dpID, extID := ParseFromId(id.(uint64))
nVal := uint32(0)
val, ok := ekRefMap.Load(id)
if ok {
nVal = val.(uint32)
}
log.LogDebugf("UnmarshalInodeValue inode %v get splitID %v dp id %v extentid %v, refCnt %v, add %v",
inodeID, id.(uint64), dpID, extID, value.(uint32), nVal)
ekRefMap.Store(id, nVal+value.(uint32))
return true
})
return
}
func (k *ExtentKey) IsEqual(rightKey *ExtentKey) bool {
// return false
return k.PartitionId == rightKey.PartitionId &&
k.ExtentId == rightKey.ExtentId &&
k.GetSeq() == rightKey.GetSeq() &&
k.ExtentOffset == rightKey.ExtentOffset &&
k.FileOffset == rightKey.FileOffset
}
func (k *ExtentKey) IsCoveredWithDiffSeq(rightKey *ExtentKey) bool {
return k.PartitionId == rightKey.PartitionId &&
k.ExtentId == rightKey.ExtentId &&
k.GetSeq() < rightKey.GetSeq() &&
k.ExtentOffset+uint64(k.Size) == rightKey.ExtentOffset &&
k.FileOffset+uint64(k.Size) == rightKey.FileOffset
}
func (k *ExtentKey) IsSequenceWithSameSeq(rightKey *ExtentKey) bool {
return k.PartitionId == rightKey.PartitionId &&
k.ExtentId == rightKey.ExtentId &&
k.GetSeq() == rightKey.GetSeq() &&
k.ExtentOffset+uint64(k.Size) == rightKey.ExtentOffset &&
k.FileOffset+uint64(k.Size) == rightKey.FileOffset
}
func (k *ExtentKey) IsSameExtent(rightKey *ExtentKey) bool {
return k.PartitionId == rightKey.PartitionId &&
k.ExtentId == rightKey.ExtentId
}
func (k *ExtentKey) IsSequenceWithDiffSeq(rightKey *ExtentKey) bool {
return k.PartitionId == rightKey.PartitionId &&
k.ExtentId == rightKey.ExtentId &&
!(k.GetSeq() == rightKey.GetSeq()) &&
k.ExtentOffset+uint64(k.Size) == rightKey.ExtentOffset &&
k.FileOffset+uint64(k.Size) == rightKey.FileOffset
}
func (k *ExtentKey) IsFileInSequence(rightKey *ExtentKey) bool {
return k.PartitionId == rightKey.PartitionId &&
k.ExtentId == rightKey.ExtentId &&
k.ExtentOffset+uint64(k.Size) == rightKey.ExtentOffset
}
// String returns the string format of the extentKey.
func (k ExtentKey) String() string {
return fmt.Sprintf("ExtentKey{FileOffset(%v),VerSeq(%v) Partition(%v),ExtentID(%v),ExtentOffset(%v),isSplit(%v),Size(%v),CRC(%v)}",
k.FileOffset, k.GetSeq(), k.PartitionId, k.ExtentId, k.ExtentOffset, k.IsSplit(), k.Size, k.CRC)
}
// Less defines the less comparator.
func (k *ExtentKey) Less(than btree.Item) bool {
that := than.(*ExtentKey)
return k.FileOffset < that.FileOffset
}
// Copy returns the extent key itself to implement the btree.Item interface.
func (k *ExtentKey) Copy() btree.Item {
return k
}
func (k *ExtentKey) Marshal() (m string) {
return fmt.Sprintf("%v_%v_%v_%v_%v_%v", k.FileOffset, k.PartitionId, k.ExtentId, k.ExtentOffset, k.Size, k.CRC)
}
func (k *ExtentKey) MarshalBinaryExt(data []byte) {
binary.BigEndian.PutUint64(data[0:], k.FileOffset)
binary.BigEndian.PutUint64(data[8:], k.PartitionId)
binary.BigEndian.PutUint64(data[16:], k.ExtentId)
binary.BigEndian.PutUint64(data[24:], k.ExtentOffset)
binary.BigEndian.PutUint32(data[32:], k.Size)
binary.BigEndian.PutUint32(data[36:], k.CRC)
}
// MarshalBinary marshals the binary format of the extent key.
func (k *ExtentKey) MarshalBinary(v3 bool) ([]byte, error) {
buf := bytes.NewBuffer(make([]byte, 0, ExtentLength))
if err := binary.Write(buf, binary.BigEndian, k.FileOffset); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.PartitionId); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.ExtentId); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.ExtentOffset); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.Size); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.CRC); err != nil {
return nil, err
}
if v3 {
if err := binary.Write(buf, binary.BigEndian, k.GetSeq()); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.IsSplit()); err != nil {
return nil, err
}
}
return buf.Bytes(), nil
}
// UnmarshalBinary unmarshals the binary format of the extent key.
func (k *ExtentKey) UnmarshalBinary(buf *bytes.Buffer, v3 bool) (err error) {
if err = binary.Read(buf, binary.BigEndian, &k.FileOffset); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.PartitionId); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.ExtentId); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.ExtentOffset); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.Size); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.CRC); err != nil {
return
}
if v3 {
var seq uint64
if err = binary.Read(buf, binary.BigEndian, &seq); err != nil {
return
}
k.SetSeq(seq)
var isSplit bool
if err = binary.Read(buf, binary.BigEndian, &isSplit); err != nil {
return
}
k.SetSplit(isSplit)
}
return
}
func (k *ExtentKey) CheckSum(v3 bool) uint32 {
sign := crc32.NewIEEE()
buf, err := k.MarshalBinary(v3)
if err != nil {
log.LogErrorf("[ExtentKey] extentKey %v CRC32 error: %v", k, err)
return 0
}
sign.Write(buf)
return sign.Sum32()
}
// MarshalBinaryWithCheckSum marshals the extent key with the v2/v3 magic header and a trailing CRC32 checksum.
func (k *ExtentKey) MarshalBinaryWithCheckSum(v3 bool) ([]byte, error) {
extLen := ExtentV2Length
flag := ExtentKeyHeader
if v3 {
extLen = ExtentV3Length
flag = ExtentKeyHeaderV3
}
buf := bytes.NewBuffer(make([]byte, 0, extLen))
if err := binary.Write(buf, binary.BigEndian, flag); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.FileOffset); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.PartitionId); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.ExtentId); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.ExtentOffset); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.Size); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.CRC); err != nil {
return nil, err
}
if v3 {
if err := binary.Write(buf, binary.BigEndian, k.GetSeq()); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.IsSplit()); err != nil {
return nil, err
}
}
if err := binary.Write(buf, binary.BigEndian, k.CheckSum(v3)); err != nil {
return nil, err
}
return buf.Bytes(), nil
}
// UnmarshalBinaryWithCheckSum unmarshals the extent key from the buffer and verifies its checksum.
func (k *ExtentKey) UnmarshalBinaryWithCheckSum(buf *bytes.Buffer) (err error) {
var (
checksum uint32
v3 bool
)
magic := make([]byte, ExtentKeyHeaderSize)
if err = binary.Read(buf, binary.BigEndian, magic); err != nil {
return
}
if r := bytes.Compare(magic, ExtentKeyHeader); r != 0 {
if r = bytes.Compare(magic, ExtentKeyHeaderV3); r != 0 {
log.LogErrorf("action[UnmarshalBinaryWithCheckSum] err header magic %v", string(magic))
err = InvalidKeyHeader
return
}
v3 = true
}
if err = binary.Read(buf, binary.BigEndian, &k.FileOffset); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.PartitionId); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.ExtentId); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.ExtentOffset); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.Size); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.CRC); err != nil {
return
}
if v3 {
var seq uint64
if err = binary.Read(buf, binary.BigEndian, &seq); err != nil {
return
}
k.SetSeq(seq)
var split bool
if err = binary.Read(buf, binary.BigEndian, &split); err != nil {
return
}
k.SetSplit(split)
}
if err = binary.Read(buf, binary.BigEndian, &checksum); err != nil {
return
}
if k.CheckSum(v3) != checksum {
err = InvalidKeyCheckSum
return
}
return
}
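// Illustrative sketch, not part of the original API: a V3 marshal/unmarshal
// round trip; UnmarshalBinaryWithCheckSum rejects the buffer if the trailing
// checksum does not match the recomputed one.
func exampleExtentKeyCodec(k *ExtentKey) (ExtentKey, error) {
	data, err := k.MarshalBinaryWithCheckSum(true)
	if err != nil {
		return ExtentKey{}, err
	}
	var out ExtentKey
	if err := out.UnmarshalBinaryWithCheckSum(bytes.NewBuffer(data)); err != nil {
		return ExtentKey{}, err
	}
	return out, nil
}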
// TODO remove
func (k *ExtentKey) UnMarshal(m string) (err error) {
_, err = fmt.Sscanf(m, "%v_%v_%v_%v_%v_%v", &k.FileOffset, &k.PartitionId, &k.ExtentId, &k.ExtentOffset, &k.Size, &k.CRC)
return
}
// TODO remove
func (k *ExtentKey) GetExtentKey() (m string) {
return fmt.Sprintf("%v_%v_%v_%v_%v", k.PartitionId, k.FileOffset, k.ExtentId, k.ExtentOffset, k.Size)
}
type TinyExtentDeleteRecord struct {
FileOffset uint64
PartitionId uint64
ExtentId uint64
ExtentOffset uint64
Size uint32
CRC uint32
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"fmt"
"os"
"path/filepath"
"strings"
"time"
)
const (
RootIno = uint64(1)
SummaryKey = "cbfs.dir.summary"
QuotaKey = "qa"
)
const (
FlagsSyncWrite int = 1 << iota
FlagsAppend
FlagsCache
)
const (
FlagsSnapshotDel int = 1 << iota
FlagsSnapshotDelDir
FlagsVerAll
)
// Mode returns the fileMode.
func Mode(osMode os.FileMode) uint32 {
return uint32(osMode)
}
// OsMode returns os.FileMode.
func OsMode(mode uint32) os.FileMode {
return os.FileMode(mode)
}
// OsModeType returns os.FileMode masked by os.ModeType.
func OsModeType(mode uint32) os.FileMode {
return os.FileMode(mode) & os.ModeType
}
// IsRegular checks if the mode is regular.
func IsRegular(mode uint32) bool {
return OsMode(mode).IsRegular()
}
// IsDir checks if the mode is dir.
func IsDir(mode uint32) bool {
return OsMode(mode).IsDir()
}
// IsSymlink checks if the mode is symlink.
func IsSymlink(mode uint32) bool {
return OsMode(mode)&os.ModeSymlink != 0
}
func IsAncestor(parent, child string) bool {
rel, err := filepath.Rel(parent, child)
if err != nil {
return false
}
return !strings.HasPrefix(rel, "..")
}
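// Added note: IsAncestor is a purely lexical check via filepath.Rel, e.g.
// IsAncestor("/vol/a", "/vol/a/b/c") is true, IsAncestor("/vol/a", "/vol/ab")
// is false, and a path counts as an ancestor of itself.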
// InodeInfo defines the inode struct.
type InodeInfo struct {
Inode uint64 `json:"ino"`
Mode uint32 `json:"mode"`
Nlink uint32 `json:"nlink"`
Size uint64 `json:"sz"`
Uid uint32 `json:"uid"`
Gid uint32 `json:"gid"`
Generation uint64 `json:"gen"`
ModifyTime time.Time `json:"mt"`
CreateTime time.Time `json:"ct"`
AccessTime time.Time `json:"at"`
Target []byte `json:"tgt"`
QuotaInfos map[uint32]*MetaQuotaInfo `json:"qifs"`
VerSeq uint64 `json:"seq"`
expiration int64
}
type SimpleExtInfo struct {
ID uint64
PartitionID uint32
ExtentID uint32
}
// InodeSplitInfo defines the inode split info struct.
type InodeSplitInfo struct {
Inode uint64 `json:"ino"`
SplitArr []SimpleExtInfo `json:"splitInfo"`
VerSeq uint64 `json:"seq"`
}
type SummaryInfo struct {
Files int64 `json:"files"`
Subdirs int64 `json:"subdirs"`
Fbytes int64 `json:"fbytes"`
}
type DentryInfo struct {
Name string `json:"name"`
Inode uint64 `json:"inode"`
expiration int64
}
func (info *DentryInfo) SetExpiration(e int64) {
info.expiration = e
}
func (info *DentryInfo) Expiration() int64 {
return info.expiration
}
func (info *InodeInfo) Expiration() int64 {
return info.expiration
}
func (info *InodeInfo) SetExpiration(e int64) {
info.expiration = e
}
// String returns the string format of the inode.
func (info *InodeInfo) String() string {
return fmt.Sprintf("Inode(%v) Mode(%v) OsMode(%v) Nlink(%v) Size(%v) Uid(%v) Gid(%v) Gen(%v) QuotaIds(%v)",
info.Inode, info.Mode, OsMode(info.Mode), info.Nlink, info.Size, info.Uid, info.Gid, info.Generation, info.QuotaInfos)
}
type XAttrInfo struct {
Inode uint64
XAttrs map[string]string
}
func (info XAttrInfo) Get(key string) []byte {
return []byte(info.XAttrs[key])
}
func (info XAttrInfo) VisitAll(visitor func(key string, value []byte) bool) {
for k, v := range info.XAttrs {
if visitor == nil || !visitor(k, []byte(v)) {
return
}
}
}
func (info XAttrInfo) String() string {
builder := strings.Builder{}
for k, v := range info.XAttrs {
if builder.Len() != 0 {
builder.WriteString(",")
}
builder.WriteString(fmt.Sprintf("%s:%s", k, v))
}
return fmt.Sprintf("XAttrInfo{Inode(%v), XAttrs(%v)}", info.Inode, builder.String())
}
// Dentry defines the dentry struct.
type Dentry struct {
Name string `json:"name"`
Inode uint64 `json:"ino"`
Type uint32 `json:"type"`
}
// String returns the string format of the dentry.
func (d Dentry) String() string {
return fmt.Sprintf("Dentry{Name(%v),Inode(%v),Type(%v)}", d.Name, d.Inode, d.Type)
}
type RequestExtend struct {
FullPaths []string `json:"fullPaths"`
}
// NOTE: a batch request may carry multiple full paths, while other requests carry only one.
func (r *RequestExtend) GetFullPath() string {
if len(r.FullPaths) < 1 {
return ""
}
return r.FullPaths[0]
}
// QuotaCreateInodeRequest defines the request to create an inode with quota IDs.
type QuotaCreateInodeRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Mode uint32 `json:"mode"`
Uid uint32 `json:"uid"`
Gid uint32 `json:"gid"`
Target []byte `json:"tgt"`
QuotaIds []uint32 `json:"qids"`
RequestExtend
}
type CreateInodeRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Mode uint32 `json:"mode"`
Uid uint32 `json:"uid"`
Gid uint32 `json:"gid"`
Target []byte `json:"tgt"`
RequestExtend
}
// CreateInodeResponse defines the response to the request of creating an inode.
type CreateInodeResponse struct {
Info *InodeInfo `json:"info"`
}
type TxCreateRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
*TransactionInfo `json:"tx"`
}
type TxCreateResponse struct {
TxInfo *TransactionInfo `json:"tx"`
}
type TxApplyRMRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
*TransactionInfo `json:"tx"`
}
// TxCreateInodeRequest defines the request to create an inode with transaction info.
type TxCreateInodeRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Mode uint32 `json:"mode"`
Uid uint32 `json:"uid"`
Gid uint32 `json:"gid"`
Target []byte `json:"tgt"`
QuotaIds []uint32 `json:"qids"`
TxInfo *TransactionInfo `json:"tx"`
RequestExtend
}
// TxCreateInodeResponse defines the response with transaction info to the request of creating an inode.
type TxCreateInodeResponse struct {
Info *InodeInfo `json:"info"`
TxInfo *TransactionInfo `json:"tx"`
}
const (
TxCommit int = 1 << iota
TxRollback
)
type TxApplyRequest struct {
TxID string `json:"tx"`
TmID uint64 `json:"tmid"`
TxApplyType int `json:"type"`
}
type TxSetStateRequest struct {
TxID string `json:"tx"`
State int32 `json:"state"`
}
type TxInodeApplyRequest struct {
TxID string `json:"txid"`
Inode uint64 `json:"ino"`
TxApplyType int `json:"type"`
ApplyFrom uint32 `json:"from"`
}
type TxDentryApplyRequest struct {
TxID string `json:"txid"`
// DenKey string `json:"denkey"`
Pid uint64 `json:"pid"`
Name string `json:"name"`
TxApplyType int `json:"type"`
ApplyFrom uint32 `json:"from"`
}
type TxGetInfoRequest struct {
VolName string `json:"vol"`
TxID string `json:"txid"`
Pid uint64 `json:"pid"`
}
type TxGetInfoResponse struct {
TxInfo *TransactionInfo `json:"tx"`
}
// LinkInodeRequest defines the request to link an inode.
type LinkInodeRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
UniqID uint64 `json:"uiq"`
IsRename bool `json:"rename"`
RequestExtend
}
// LinkInodeResponse defines the response to the request of linking an inode.
type LinkInodeResponse struct {
Info *InodeInfo `json:"info"`
}
type TxLinkInodeRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
TxInfo *TransactionInfo `json:"tx"`
RequestExtend
}
func (tx *TxLinkInodeRequest) GetInfo() string {
return tx.TxInfo.String()
}
type TxLinkInodeResponse struct {
Info *InodeInfo `json:"info"`
}
type ClearInodeCacheRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
}
type ClearInodeCacheResponse struct {
Info *InodeInfo `json:"info"`
}
type TxUnlinkInodeRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Evict bool `json:"evict"`
TxInfo *TransactionInfo `json:"tx"`
RequestExtend
}
func (tx *TxUnlinkInodeRequest) GetInfo() string {
return tx.TxInfo.String()
}
type TxUnlinkInodeResponse struct {
Info *InodeInfo `json:"info"`
TxInfo *TransactionInfo `json:"tx"`
}
// UnlinkInodeRequest defines the request to unlink an inode.
type UnlinkInodeRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
UniqID uint64 `json:"uid"` // for request dedup
VerSeq uint64 `json:"ver"`
DenVerSeq uint64 `json:"denVer"`
RequestExtend
}
// BatchUnlinkInodeRequest defines the request to unlink inodes in batch.
type BatchUnlinkInodeRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inodes []uint64 `json:"inos"`
FullPaths []string `json:"fullPaths"`
}
// UnlinkInodeResponse defines the response to the request of unlinking an inode.
type UnlinkInodeResponse struct {
Info *InodeInfo `json:"info"`
}
// BatchUnlinkInodeResponse defines the response to the request of unlinking inodes in batch.
type BatchUnlinkInodeResponse struct {
Items []*struct {
Info *InodeInfo `json:"info"`
Status uint8 `json:"status"`
} `json:"items"`
}
// EvictInodeRequest defines the request to evict an inode.
type EvictInodeRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
RequestExtend
}
// BatchEvictInodeRequest defines the request to evict inodes in batch.
type BatchEvictInodeRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inodes []uint64 `json:"inos"`
FullPaths []string `json:"fullPaths"`
}
// QuotaCreateDentryRequest defines the request to create a dentry with quota IDs.
type QuotaCreateDentryRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
ParentID uint64 `json:"pino"`
Inode uint64 `json:"ino"`
Name string `json:"name"`
Mode uint32 `json:"mode"`
QuotaIds []uint32 `json:"qids"`
VerSeq uint64 `json:"seq"`
RequestExtend
}
type CreateDentryRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
ParentID uint64 `json:"pino"`
Inode uint64 `json:"ino"`
Name string `json:"name"`
Mode uint32 `json:"mode"`
RequestExtend
}
type TxPack interface {
GetInfo() string
}
// TxCreateDentryRequest defines the request to create a dentry.
type TxCreateDentryRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
ParentID uint64 `json:"pino"`
Inode uint64 `json:"ino"`
Name string `json:"name"`
Mode uint32 `json:"mode"`
QuotaIds []uint32 `json:"qids"`
TxInfo *TransactionInfo `json:"tx"`
RequestExtend
}
func (tx *TxCreateDentryRequest) GetInfo() string {
return tx.TxInfo.String()
}
type TxCreateDentryResponse struct {
TxInfo *TransactionInfo `json:"tx"`
}
// UpdateDentryRequest defines the request to update a dentry.
type UpdateDentryRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
ParentID uint64 `json:"pino"`
Name string `json:"name"`
Inode uint64 `json:"ino"` // new inode number
RequestExtend
}
// UpdateDentryResponse defines the response to the request of updating a dentry.
type UpdateDentryResponse struct {
Inode uint64 `json:"ino"` // old inode number
}
type TxUpdateDentryRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
ParentID uint64 `json:"pino"`
Name string `json:"name"`
Inode uint64 `json:"ino"` // new inode number
OldIno uint64 `json:"oldIno"` // old inode number
TxInfo *TransactionInfo `json:"tx"`
RequestExtend
}
func (tx *TxUpdateDentryRequest) GetInfo() string {
return tx.TxInfo.String()
}
type TxUpdateDentryResponse struct {
Inode uint64 `json:"ino"` // old inode number
}
type TxDeleteDentryRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
ParentID uint64 `json:"pino"`
Name string `json:"name"`
Ino uint64 `json:"ino"`
TxInfo *TransactionInfo `json:"tx"`
RequestExtend
}
func (tx *TxDeleteDentryRequest) GetInfo() string {
return tx.TxInfo.String()
}
type TxDeleteDentryResponse struct {
Inode uint64 `json:"ino"`
}
// DeleteDentryRequest defines the request to delete a dentry.
type DeleteDentryRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
ParentID uint64 `json:"pino"`
Name string `json:"name"`
InodeCreateTime int64 `json:"inodeCreateTime"`
Verseq uint64 `json:"ver"`
RequestExtend
}
type BatchDeleteDentryRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
ParentID uint64 `json:"pino"`
Dens []Dentry `json:"dens"`
FullPaths []string `json:"fullPaths"`
}
// DeleteDentryResponse defines the response to the request of deleting a dentry.
type DeleteDentryResponse struct {
Inode uint64 `json:"ino"`
}
// BatchDeleteDentryResponse defines the response to the request of deleting dentries in batch.
type BatchDeleteDentryResponse struct {
ParentID uint64 `json:"pino"`
Items []*struct {
Inode uint64 `json:"ino"`
Status uint8 `json:"status"`
} `json:"items"`
}
// LookupRequest defines the request for lookup.
type LookupRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
ParentID uint64 `json:"pino"`
Name string `json:"name"`
VerSeq uint64 `json:"seq"`
VerAll bool `json:"verAll"`
}
type DetryInfo struct {
Inode uint64 `json:"ino"`
Mode uint32 `json:"mode"`
VerSeq uint64 `json:"seq"`
IsDel bool `json:"isDel"`
}
// LookupResponse defines the response for the lookup request.
type LookupResponse struct {
Inode uint64 `json:"ino"`
Mode uint32 `json:"mode"`
VerSeq uint64 `json:"seq"`
LayAll []DetryInfo `json:"layerInfo"`
}
// InodeGetRequest defines the request to get the inode.
type InodeGetRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
VerSeq uint64 `json:"seq"`
VerAll bool `json:"verAll"`
}
type LayerInfo struct {
LayerIdx uint32 `json:"layerIdx"`
Info *InodeInfo `json:"info"`
Eks []ExtentKey `json:"eks"`
}
// InodeGetResponse defines the response to the InodeGetRequest.
type InodeGetResponse struct {
Info *InodeInfo `json:"info"`
LayAll []InodeInfo `json:"layerInfo"`
}
// BatchInodeGetRequest defines the request to get the inode in batch.
type BatchInodeGetRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inodes []uint64 `json:"inos"`
VerSeq uint64 `json:"seq"`
}
// BatchInodeGetResponse defines the response to the request of getting the inode in batch.
type BatchInodeGetResponse struct {
Infos []*InodeInfo `json:"infos"`
}
// InodeGetSplitRequest defines the request to get the split extent info of an inode.
type InodeGetSplitRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
VerSeq uint64 `json:"seq"`
VerAll bool `json:"verAll"`
}
// InodeGetSplitResponse defines the response to the InodeGetSplitRequest.
type InodeGetSplitResponse struct {
Info *InodeSplitInfo `json:"info"`
LayAll []InodeSplitInfo `json:"layerInfo"`
}
// ReadDirRequest defines the request to read dir.
type ReadDirRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
ParentID uint64 `json:"pino"`
VerSeq uint64 `json:"seq"`
}
type ReadDirOnlyRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
ParentID uint64 `json:"pino"`
VerSeq uint64 `json:"seq"`
}
// ReadDirResponse defines the response to the request of reading dir.
type ReadDirResponse struct {
Children []Dentry `json:"children"`
}
type ReadDirOnlyResponse struct {
Children []Dentry `json:"children"`
}
// ReadDirLimitRequest defines the request to read dir with limited dentries.
type ReadDirLimitRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
ParentID uint64 `json:"pino"`
Marker string `json:"marker"`
Limit uint64 `json:"limit"`
VerSeq uint64 `json:"seq"`
VerOpt uint8 `json:"VerOpt"`
}
type ReadDirLimitResponse struct {
Children []Dentry `json:"children"`
}
// AppendExtentKeyRequest defines the request to append an extent key.
type AppendExtentKeyRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Extent ExtentKey `json:"ek"`
}
type AppendExtentKeyWithCheckRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Extent ExtentKey `json:"ek"`
DiscardExtents []ExtentKey `json:"dek"`
VerSeq uint64 `json:"seq"`
IsSplit bool
}
// AppendObjExtentKeysRequest defines the request to append obj extent keys.
type AppendObjExtentKeysRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Extents []ObjExtentKey `json:"ek"`
}
// GetExtentsRequest defines the request to get extents.
type GetExtentsRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
VerSeq uint64 `json:"seq"`
VerAll bool
}
// GetObjExtentsResponse defines the response to the request of getting obj extents.
type GetObjExtentsResponse struct {
Generation uint64 `json:"gen"`
Size uint64 `json:"sz"`
Extents []ExtentKey `json:"eks"`
ObjExtents []ObjExtentKey `json:"objeks"`
}
// GetExtentsResponse defines the response to the request of getting extents.
type GetExtentsResponse struct {
Generation uint64 `json:"gen"`
Size uint64 `json:"sz"`
Extents []ExtentKey `json:"eks"`
LayerInfo []LayerInfo `json:"layer"`
Status int
}
// TruncateRequest defines the request to truncate.
type TruncateRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Size uint64 `json:"sz"`
RequestExtend
}
type EmptyExtentKeyRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
}
type DelVerRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
VerSeq uint64 `json:"ver"`
}
type DelExtentKeyRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Extents []ExtentKey `json:"ek"`
}
// SetAttrRequest defines the request to set attribute.
type SetAttrRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Mode uint32 `json:"mode"`
Uid uint32 `json:"uid"`
Gid uint32 `json:"gid"`
ModifyTime int64 `json:"mt"`
AccessTime int64 `json:"at"`
Valid uint32 `json:"valid"`
VerSeq uint64 `json:"seq"`
}
const (
AttrMode uint32 = 1 << iota
AttrUid
AttrGid
AttrModifyTime
AttrAccessTime
)
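// Usage sketch (editor's illustration): the Valid field of SetAttrRequest is
// treated here as a bitmask of the Attr* flags above, marking which fields
// the meta node should apply. A chmod+chown style request might look like:
//
//	req := &SetAttrRequest{
//		Inode: 100,
//		Mode:  0o644,
//		Uid:   1000,
//		Gid:   1000,
//		Valid: AttrMode | AttrUid | AttrGid,
//	}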
// DeleteInodeRequest defines the request to delete an inode.
type DeleteInodeRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
RequestExtend
}
// DeleteInodeBatchRequest defines the request to delete inodes in batch.
type DeleteInodeBatchRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inodes []uint64 `json:"ino"`
FullPaths []string `json:"fullPaths"`
}
// AppendExtentKeysRequest defines the request to append extent keys.
type AppendExtentKeysRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Extents []ExtentKey `json:"eks"`
}
type SetXAttrRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Key string `json:"key"`
Value string `json:"val"`
}
type BatchSetXAttrRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Attrs map[string]string `json:"attrs"`
}
type GetAllXAttrRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
VerSeq uint64 `json:"seq"`
}
type GetAllXAttrResponse struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Attrs map[string]string `json:"attrs"`
}
type GetXAttrRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Key string `json:"key"`
VerSeq uint64 `json:"seq"`
}
type GetXAttrResponse struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Key string `json:"key"`
Value string `json:"val"`
}
type RemoveXAttrRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Key string `json:"key"`
VerSeq uint64 `json:"seq"`
}
type ListXAttrRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
VerSeq uint64 `json:"seq"`
}
type ListXAttrResponse struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
XAttrs []string `json:"xattrs"`
}
type BatchGetXAttrRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inodes []uint64 `json:"inos"`
Keys []string `json:"keys"`
VerSeq uint64 `json:"seq"`
}
type BatchGetXAttrResponse struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
XAttrs []*XAttrInfo
}
type UpdateXAttrRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Key string `json:"key"`
Value string `json:"val"`
}
type MultipartInfo struct {
ID string `json:"id"`
Path string `json:"path"`
InitTime time.Time `json:"itime"`
Parts []*MultipartPartInfo `json:"parts"`
Extend map[string]string `json:"extend"`
}
type MultipartPartInfo struct {
ID uint16 `json:"id"`
Inode uint64 `json:"ino"`
MD5 string `json:"md5"`
Size uint64 `json:"sz"`
UploadTime time.Time `json:"ut"`
}
type CreateMultipartRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Path string `json:"path"`
Extend map[string]string `json:"extend"`
}
type CreateMultipartResponse struct {
Info *MultipartInfo `json:"info"`
}
type GetMultipartRequest struct {
VolName string `json:"vol"`
Path string `json:"path"`
PartitionId uint64 `json:"pid"`
MultipartId string `json:"mid"`
}
type GetMultipartResponse struct {
Info *MultipartInfo `json:"info"`
}
type GetExpiredMultipartRequest struct {
VolName string `json:"vol"`
Prefix string `json:"path"`
Days int `json:"days"`
PartitionId uint64 `json:"pid"`
}
type ExpiredMultipartInfo struct {
Path string `json:"path"`
MultipartId string `json:"mid"`
Inodes []uint64 `json:"inodes"`
}
type GetExpiredMultipartResponse struct {
Infos []*ExpiredMultipartInfo `json:"infos"`
}
type AddMultipartPartRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Path string `json:"path"`
MultipartId string `json:"mid"`
Part *MultipartPartInfo `json:"part"`
}
type RemoveMultipartRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Path string `json:"path"`
MultipartId string `json:"mid"`
}
type ListMultipartRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Marker string `json:"mk"`
MultipartIdMarker string `json:"mmk"`
Max uint64 `json:"max"`
Delimiter string `json:"dm"`
Prefix string `json:"pf"`
}
type ListMultipartResponse struct {
Multiparts []*MultipartInfo `json:"mps"`
}
type UpdateSummaryInfoRequest struct {
VolName string `json:"vol"`
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
Key string `json:"key"`
FileInc int64 `json:"fileinc"`
DirInc int64 `json:"dirinc"`
ByteInc int64 `json:"byteinc"`
}
type SetMasterQuotaReuqest struct {
VolName string `json:"vol"`
PathInfos []QuotaPathInfo `json:"pinfos"`
MaxFiles uint64 `json:"mf"`
MaxBytes uint64 `json:"mbyte"`
}
type UpdateMasterQuotaReuqest struct {
VolName string `json:"vol"`
QuotaId uint32 `json:"qid"`
MaxFiles uint64 `json:"mf"`
MaxBytes uint64 `json:"mbyte"`
}
type ListMasterQuotaResponse struct {
Quotas []*QuotaInfo
}
type BatchSetMetaserverQuotaReuqest struct {
PartitionId uint64 `json:"pid"`
Inodes []uint64 `json:"ino"`
QuotaId uint32 `json:"qid"`
IsRoot bool `json:"root"`
}
type BatchSetMetaserverQuotaResponse struct {
InodeRes map[uint64]uint8 `json:"inores"`
}
type BatchDeleteMetaserverQuotaReuqest struct {
PartitionId uint64 `json:"pid"`
Inodes []uint64 `json:"ino"`
QuotaId uint32 `json:"qid"`
}
type BatchDeleteMetaserverQuotaResponse struct {
InodeRes map[uint64]uint8 `json:"inores"`
}
type GetInodeQuotaRequest struct {
PartitionId uint64 `json:"pid"`
Inode uint64 `json:"ino"`
}
type GetInodeQuotaResponse struct {
MetaQuotaInfoMap map[uint32]*MetaQuotaInfo
}
type AppendMultipartResponse struct {
Status uint8 `json:"status"`
Update bool `json:"update"`
OldInode uint64 `json:"oldinode"`
}
type GetUniqIDRequest struct {
VolName string `json:"vol"`
PartitionID uint64 `json:"pid"`
Num uint32 `json:"num"`
}
type GetUniqIDResponse struct {
Start uint64 `json:"start"`
}
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"fmt"
"sync"
"time"
"github.com/cubefs/cubefs/util/log"
)
type LcConfiguration struct {
VolName string
Rules []*Rule
}
type Rule struct {
Expire *ExpirationConfig
Filter *FilterConfig
ID string
Status string
}
type ExpirationConfig struct {
Date *time.Time
Days int
}
type FilterConfig struct {
Prefix string
}
const (
RuleEnabled string = "Enabled"
RuleDisabled string = "Disabled"
)
func (lcConf *LcConfiguration) GenEnabledRuleTasks() []*RuleTask {
tasks := make([]*RuleTask, 0)
for _, r := range lcConf.Rules {
if r.Status != RuleEnabled {
log.LogDebugf("GenEnabledRuleTasks: skip disabled rule(%v) in volume(%v)", r.ID, lcConf.VolName)
continue
}
task := &RuleTask{
Id: fmt.Sprintf("%s:%s", lcConf.VolName, r.ID),
VolName: lcConf.VolName,
Rule: r,
}
tasks = append(tasks, task)
log.LogDebugf("GenEnabledRuleTasks: RuleTask(%v) generated from rule(%v) in volume(%v)", *task, r.ID, lcConf.VolName)
}
return tasks
}
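// Usage sketch (editor's illustration): only rules whose Status is
// RuleEnabled become RuleTasks; task IDs take the form "<volume>:<ruleID>".
//
//	conf := &LcConfiguration{
//		VolName: "vol1",
//		Rules: []*Rule{
//			{ID: "expire-logs", Status: RuleEnabled, Expire: &ExpirationConfig{Days: 30}},
//			{ID: "paused-rule", Status: RuleDisabled},
//		},
//	}
//	tasks := conf.GenEnabledRuleTasks() // one task with Id "vol1:expire-logs"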
// ----------------------------------------------
// lcnode <-> master
// LcNodeRuleTask
type LcNodeRuleTaskRequest struct {
MasterAddr string
LcNodeAddr string
Task *RuleTask
}
type RuleTask struct {
Id string
VolName string
Rule *Rule
}
type LcNodeRuleTaskResponse struct {
ID string
LcNode string
StartTime *time.Time
EndTime *time.Time
UpdateTime *time.Time
Done bool
Status uint8
Result string
LcNodeRuleTaskStatistics
}
type LcNodeRuleTaskStatistics struct {
Volume string
RuleId string
TotalInodeScannedNum int64
FileScannedNum int64
DirScannedNum int64
ExpiredNum int64
ErrorSkippedNum int64
}
// ----------------------------------
// lcnode <-> meta
type ScanDentry struct {
ParentId uint64 `json:"pid"` // FileID value of the parent inode.
Inode uint64 `json:"inode"` // FileID value of the current inode.
Name string `json:"name"` // Name of the current dentry.
Path string `json:"path"` // Path of the current dentry.
Type uint32 `json:"type"` // Type of the current dentry.
}
type BatchDentries struct {
sync.RWMutex
dentries map[uint64]*ScanDentry
}
func NewBatchDentries() *BatchDentries {
return &BatchDentries{
dentries: make(map[uint64]*ScanDentry),
}
}
func (f *BatchDentries) Append(dentry *ScanDentry) {
f.Lock()
defer f.Unlock()
f.dentries[dentry.Inode] = dentry
}
func (f *BatchDentries) Len() int {
f.RLock()
defer f.RUnlock()
return len(f.dentries)
}
func (f *BatchDentries) BatchGetAndClear() (map[uint64]*ScanDentry, []uint64) {
f.Lock()
defer f.Unlock()
dentries := f.dentries
var inodes []uint64
for i := range f.dentries {
inodes = append(inodes, i)
}
f.dentries = make(map[uint64]*ScanDentry)
return dentries, inodes
}
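// Usage sketch (editor's illustration): BatchDentries accumulates scanned
// dentries keyed by inode, then hands them off in one shot, e.g. for a
// batched inode lookup.
//
//	batch := NewBatchDentries()
//	batch.Append(&ScanDentry{Inode: 10, Name: "a.log"})
//	batch.Append(&ScanDentry{Inode: 11, Name: "b.log"})
//	dentries, inodes := batch.BatchGetAndClear() // batch is empty afterwards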
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import "sync"
// CreateNameSpaceRequest defines the request to create a name space.
type CreateNameSpaceRequest struct {
Name string
}
// CreateNameSpaceResponse defines the response to the request of creating a name space.
type CreateNameSpaceResponse struct {
Status int
Result string
}
// Peer defines the peer of the node id and address.
type Peer struct {
ID uint64 `json:"id"`
Addr string `json:"addr"`
}
// CreateMetaPartitionRequest defines the request to create a meta partition.
type CreateMetaPartitionRequest struct {
MetaId string
VolName string
Start uint64
End uint64
PartitionID uint64
Members []Peer
VerSeq uint64
}
// CreateMetaPartitionResponse defines the response to the request of creating a meta partition.
type CreateMetaPartitionResponse struct {
VolName string
PartitionID uint64
Status uint8
Result string
}
type UidSpaceInfo struct {
VolName string
Uid uint32
CTime int64
Enabled bool
Limited bool
UsedSize uint64
LimitSize uint64
Rsv string
}
type UidReportSpaceInfo struct {
Uid uint32
Size uint64
Rsv string
MTime int64
}
type QuotaUsedInfo struct {
UsedFiles int64
UsedBytes int64
}
type QuotaLimitedInfo struct {
LimitedFiles bool
LimitedBytes bool
}
type QuotaReportInfo struct {
QuotaId uint32
UsedInfo QuotaUsedInfo
}
type QuotaInfo struct {
VolName string
QuotaId uint32
CTime int64
PathInfos []QuotaPathInfo
LimitedInfo QuotaLimitedInfo
UsedInfo QuotaUsedInfo
MaxFiles uint64
MaxBytes uint64
Rsv string
}
type QuotaHeartBeatInfo struct {
VolName string
QuotaId uint32
LimitedInfo QuotaLimitedInfo
Enable bool
}
type MetaQuotaInfos struct {
QuotaInfoMap map[uint32]*MetaQuotaInfo
sync.RWMutex
}
type MetaQuotaInfo struct {
RootInode bool `json:"rid"`
}
type QuotaPathInfo struct {
FullPath string
RootInode uint64
PartitionId uint64
}
func (usedInfo *QuotaUsedInfo) Add(info *QuotaUsedInfo) {
usedInfo.UsedFiles += info.UsedFiles
usedInfo.UsedBytes += info.UsedBytes
}
func (quotaInfo *QuotaInfo) IsOverQuotaFiles() (isOver bool) {
return uint64(quotaInfo.UsedInfo.UsedFiles) > quotaInfo.MaxFiles
}
func (quotaInfo *QuotaInfo) IsOverQuotaBytes() (isOver bool) {
return uint64(quotaInfo.UsedInfo.UsedBytes) > quotaInfo.MaxBytes
}
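// Usage sketch (editor's illustration): aggregate per-partition usage into a
// QuotaInfo and check both limits.
//
//	q := &QuotaInfo{MaxFiles: 1000, MaxBytes: 1 << 30}
//	q.UsedInfo.Add(&QuotaUsedInfo{UsedFiles: 400, UsedBytes: 1 << 20})
//	q.UsedInfo.Add(&QuotaUsedInfo{UsedFiles: 700, UsedBytes: 1 << 20})
//	q.IsOverQuotaFiles() // true: 1100 > 1000
//	q.IsOverQuotaBytes() // false: 2 MiB < 1 GiB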
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"fmt"
"sync"
"time"
"github.com/cubefs/cubefs/util/log"
)
const (
DefaultZoneName = "default"
)
// MetaNodeInfo defines the structure of a meta node.
type MetaNodeInfo struct {
ID uint64
Addr string
DomainAddr string
IsActive bool
IsWriteAble bool
ZoneName string `json:"Zone"`
MaxMemAvailWeight uint64 `json:"MaxMemAvailWeight"`
Total uint64 `json:"TotalWeight"`
Used uint64 `json:"UsedWeight"`
Ratio float64
SelectCount uint64
Threshold float32
ReportTime time.Time
MetaPartitionCount int
NodeSetID uint64
PersistenceMetaPartitions []uint64
RdOnly bool
CpuUtil float64 `json:"cpuUtil"`
}
// DataNodeInfo stores all the information about a data node.
type DataNodeInfo struct {
Total uint64 `json:"TotalWeight"`
Used uint64 `json:"UsedWeight"`
AvailableSpace uint64
ID uint64
ZoneName string `json:"Zone"`
Addr string
DomainAddr string
ReportTime time.Time
IsActive bool
IsWriteAble bool
UsageRatio float64 // used / total space
SelectedTimes uint64 // number of times that this datanode has been selected as the location for a data partition.
DataPartitionReports []*DataPartitionReport
DataPartitionCount uint32
NodeSetID uint64
PersistenceDataPartitions []uint64
BadDisks []string
RdOnly bool
MaxDpCntLimit uint32 `json:"maxDpCntLimit"`
CpuUtil float64 `json:"cpuUtil"`
IoUtils map[string]float64 `json:"ioUtil"`
}
// MetaPartitionInfo defines the structure of a meta partition.
type MetaPartitionInfo struct {
PartitionID uint64
Start uint64
End uint64
MaxInodeID uint64
InodeCount uint64
DentryCount uint64
VolName string
Replicas []*MetaReplicaInfo
ReplicaNum uint8
Status int8
IsRecover bool
Hosts []string
Peers []Peer
Zones []string
NodeSets []uint64
OfflinePeerID uint64
MissNodes map[string]int64
LoadResponse []*MetaPartitionLoadResponse
Forbidden bool
}
// MetaReplicaInfo defines the replica of a meta partition.
type MetaReplicaInfo struct {
Addr string
DomainAddr string
MaxInodeID uint64
ReportTime int64
Status int8 // unavailable, readOnly, readWrite
IsLeader bool
InodeCount uint64
MaxInode uint64
DentryCount uint64
}
// ClusterView provides the view of a cluster.
type ClusterView struct {
Name string
CreateTime string
LeaderAddr string
DisableAutoAlloc bool
ForbidMpDecommission bool
MetaNodeThreshold float32
Applied uint64
MaxDataPartitionID uint64
MaxMetaNodeID uint64
MaxMetaPartitionID uint64
DataNodeStatInfo *NodeStatInfo
MetaNodeStatInfo *NodeStatInfo
VolStatInfo []*VolStatInfo
BadPartitionIDs []BadPartitionView
BadMetaPartitionIDs []BadPartitionView
MasterNodes []NodeView
MetaNodes []NodeView
DataNodes []NodeView
}
// ClusterNodeInfo defines the structure of a cluster node.
type ClusterNodeInfo struct {
// BatchCount int
LoadFactor string
// MarkDeleteRate int
// AutoRepairRate int
// DeleteWorkerSleepMs int
}
type ClusterIP struct {
Cluster string
// MetaNodeDeleteBatchCount int
// MetaNodeDeleteWorkerSleepMs int
// DataNodeDeleteLimitRate int
// DataNodeAutoRepairLimitRate int
// Ip string
EbsAddr string
// ServicePath string
}
// NodeView provides the view of the data or meta node.
type NodeView struct {
Addr string
IsActive bool
DomainAddr string
ID uint64
IsWritable bool
}
type DpRepairInfo struct {
PartitionID uint64
DecommissionRepairProgress float64
}
type BadPartitionRepairView struct {
Path string
PartitionInfos []DpRepairInfo
}
type BadPartitionView struct {
Path string
PartitionIDs []uint64
}
type ClusterStatInfo struct {
DataNodeStatInfo *NodeStatInfo
MetaNodeStatInfo *NodeStatInfo
ZoneStatInfo map[string]*ZoneStat
}
type ZoneStat struct {
DataNodeStat *ZoneNodesStat
MetaNodeStat *ZoneNodesStat
}
type ZoneNodesStat struct {
Total float64 `json:"TotalGB"`
Used float64 `json:"UsedGB"`
Avail float64 `json:"AvailGB"`
UsedRatio float64
TotalNodes int
WritableNodes int
}
type NodeSetStat struct {
ID uint64
Capacity int
Zone string
MetaNodeNum int
DataNodeNum int
}
type NodeSetStatInfo struct {
ID uint64
Capacity int
Zone string
MetaNodes []*NodeStatView
DataNodes []*NodeStatView
DataNodeSelector string
MetaNodeSelector string
}
type NodeStatView struct {
Addr string
Status bool
DomainAddr string
ID uint64
IsWritable bool
Total uint64
Used uint64
Avail uint64
}
type NodeStatInfo struct {
TotalGB uint64
UsedGB uint64
IncreasedGB int64
UsedRatio string
AvailGB uint64
}
type VolStatInfo struct {
Name string
TotalSize uint64
UsedSize uint64
UsedRatio string
CacheTotalSize uint64
CacheUsedSize uint64
CacheUsedRatio string
EnableToken bool
InodeCount uint64
TxCnt uint64
TxRbInoCnt uint64
TxRbDenCnt uint64
DpReadOnlyWhenVolFull bool
}
// DataPartitionInfo represents the data partition that stores the file contents.
type DataPartitionInfo struct {
PartitionID uint64
PartitionTTL int64
PartitionType int
LastLoadedTime int64
ReplicaNum uint8
Status int8
Recover bool
Replicas []*DataReplica
Hosts []string // host addresses
Peers []Peer
Zones []string
NodeSets []uint64
MissingNodes map[string]int64 // key: address of the missing node, value: when the node is missing
VolName string
VolID uint64
OfflinePeerID uint64
FileInCoreMap map[string]*FileInCore
IsRecover bool
FilesWithMissingReplica map[string]int64 // key: file name, value: last time when a missing replica is found
SingleDecommissionStatus uint32
SingleDecommissionAddr string
RdOnly bool
IsDiscard bool
Forbidden bool
}
// FileInCore defines a file in a data partition.
type FileInCore struct {
Name string
LastModify int64
MetadataArray []*FileMetadata
}
// FileMetadata defines the file metadata on a dataNode
type FileMetadata struct {
Crc uint32
LocAddr string
Size uint32
ApplyID uint64
}
// DataReplica represents the replica of a data partition
type DataReplica struct {
Addr string
DomainAddr string
ReportTime int64
FileCount uint32
Status int8
HasLoadResponse bool // if there is any response when loading
Total uint64 `json:"TotalSize"`
Used uint64 `json:"UsedSize"`
IsLeader bool
NeedsToCompare bool
DiskPath string
DecommissionRepairProgress float64
}
// DataPartitionDiagnosis represents the inactive data nodes, corrupt data partitions, and data partitions lacking replicas.
type DataPartitionDiagnosis struct {
InactiveDataNodes []string
CorruptDataPartitionIDs []uint64
LackReplicaDataPartitionIDs []uint64
RepFileCountDifferDpIDs []uint64
RepUsedSizeDifferDpIDs []uint64
ExcessReplicaDpIDs []uint64
// BadDataPartitionIDs []BadPartitionView
BadDataPartitionInfos []BadPartitionRepairView
BadReplicaDataPartitionIDs []uint64
}
// MetaPartitionDiagnosis represents the inactive meta nodes, corrupt meta partitions, and meta partitions lacking replicas.
type MetaPartitionDiagnosis struct {
InactiveMetaNodes []string
CorruptMetaPartitionIDs []uint64
LackReplicaMetaPartitionIDs []uint64
BadMetaPartitionIDs []BadPartitionView
BadReplicaMetaPartitionIDs []uint64
ExcessReplicaMetaPartitionIDs []uint64
InodeCountNotEqualReplicaMetaPartitionIDs []uint64
MaxInodeNotEqualReplicaMetaPartitionIDs []uint64
DentryCountNotEqualReplicaMetaPartitionIDs []uint64
}
type DecommissionProgress struct {
Status uint32
Progress string
FailedDps []uint64
StatusMessage string
}
type BadDiskInfo struct {
Address string
Path string
TotalPartitionCnt int
DiskErrPartitionList []uint64
}
type BadDiskInfos struct {
BadDisks []BadDiskInfo
}
type DiscardDataPartitionInfos struct {
DiscardDps []DataPartitionInfo
}
type VolVersionInfo struct {
Ver uint64 // unixMicro of createTime used as version
DelTime int64
Status uint8 // building,normal,deleted,abnormal
}
func (vv *VolVersionInfo) String() string {
return fmt.Sprintf("Ver:%v|DelTimt:%v|status:%v", vv.Ver, vv.DelTime, vv.Status)
}
type VolVersionInfoList struct {
VerList []*VolVersionInfo
Strategy VolumeVerStrategy
TemporaryVerMap map[uint64]*VolVersionInfo
RWLock sync.RWMutex
}
func (v *VolVersionInfoList) GetNextOlderVer(ver uint64) (verSeq uint64, err error) {
v.RWLock.RLock()
defer v.RWLock.RUnlock()
log.LogDebugf("getNextOlderVer ver %v", ver)
for idx, info := range v.VerList {
log.LogDebugf("getNextOlderVer id %v ver %v info %v", idx, info.Ver, info)
if info.Ver >= ver {
if idx == 0 {
return 0, fmt.Errorf("not found")
}
return v.VerList[idx-1].Ver, nil
}
}
log.LogErrorf("getNextOlderVer ver %v not found", ver)
return 0, fmt.Errorf("version not exist")
}
func (v *VolVersionInfoList) GetNextNewerVer(ver uint64) (verSeq uint64, err error) {
log.LogDebugf("getNextOlderVer ver %v", ver)
for idx, info := range v.VerList {
log.LogDebugf("getNextOlderVer id %v ver %v info %v", idx, info.Ver, info)
if info.Ver > ver {
return info.Ver, nil
}
}
log.LogErrorf("getNextOlderVer ver %v not found", ver)
return 0, fmt.Errorf("version not exist")
}
func (v *VolVersionInfoList) GetLastVolVerInfo() *VolVersionInfo {
if len(v.VerList) == 0 {
return nil
}
return v.VerList[len(v.VerList)-1]
}
func (v *VolVersionInfoList) GetLastVer() uint64 {
if len(v.VerList) == 0 {
return 0
}
return v.VerList[len(v.VerList)-1].Ver
}
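// Usage sketch (editor's illustration): navigate the version list, which is
// assumed to be sorted ascending by Ver.
//
//	vl := &VolVersionInfoList{VerList: []*VolVersionInfo{{Ver: 10}, {Ver: 20}, {Ver: 30}}}
//	older, _ := vl.GetNextOlderVer(20) // 10
//	newer, _ := vl.GetNextNewerVer(20) // 30
//	last := vl.GetLastVer()            // 30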
type DecommissionDiskLimitDetail struct {
NodeSetId uint64
Limit int
}
type DecommissionDiskLimit struct {
Details []DecommissionDiskLimitDetail
}
type DecommissionDiskInfo struct {
SrcAddr string
DiskPath string
DecommissionStatus uint32
DecommissionRaftForce bool
DecommissionRetry uint8
DecommissionDpTotal int
DecommissionTerm uint64
DecommissionLimit int
Type uint32
DecommissionCompleteTime int64
Progress float64
}
type DecommissionDisksResponse struct {
Infos []DecommissionDiskInfo
}
package proto
import (
"flag"
"fmt"
"strconv"
"github.com/cubefs/cubefs/util/auth"
"github.com/cubefs/cubefs/util/config"
)
// For client
const (
// Mandatory
MountPoint int = iota
VolName
Owner
Master
// Optional
LogDir
WarnLogDir
LogLevel
ProfPort
IcacheTimeout
LookupValid
AttrValid
ReadRate
WriteRate
EnSyncWrite
AutoInvalData
Rdonly
WriteCache
KeepCache
FollowerRead
Authenticate
ClientKey
TicketHost
EnableHTTPS
CertFile
AccessKey
SecretKey
DisableDcache
SubDir
FsyncOnClose
MaxCPUs
EnableXattr
NearRead
EnablePosixACL
EnableSummary
EnableUnixPermission
RequestTimeout
// adls
VolType
EbsEndpoint
EbsServerPath
CacheAction
EbsBlockSize
EnableBcache
BcacheDir
BcacheFilterFiles
BcacheBatchCnt
BcacheCheckIntervalS
ReadThreads
WriteThreads
MetaSendTimeout
BuffersTotalLimit
MaxStreamerLimit
EnableAudit
LocallyProf
MinWriteAbleDataPartitionCnt
FileSystemName
// snapshot
SnapshotReadVerSeq
DisableMountSubtype
MaxMountOption
)
// For server
const (
MasterAddr = "masterAddr"
ListenPort = "listen"
ObjectNodeDomain = "objectNodeDomain"
BindIpKey = "bindIp"
)
type MountOption struct {
keyword string
description string
cmdlineValue string
value interface{}
}
func (opt MountOption) String() string {
return fmt.Sprintf("[%v] %T: %v", opt.keyword, opt.value, opt.value)
}
func NewMountOptions() []MountOption {
opts := make([]MountOption, MaxMountOption)
return opts
}
func InitMountOptions(opts []MountOption) {
opts[MountPoint] = MountOption{"mountPoint", "Mount Point", "", ""}
opts[VolName] = MountOption{"volName", "Volume Name", "", ""}
opts[Owner] = MountOption{"owner", "Owner", "", ""}
opts[Master] = MountOption{MasterAddr, "Master Address", "", ""}
opts[LogDir] = MountOption{"logDir", "Log Path", "", ""}
opts[WarnLogDir] = MountOption{"warnLogDir", "Warn Log Path", "", ""}
opts[LogLevel] = MountOption{"logLevel", "Log Level", "", ""}
opts[ProfPort] = MountOption{"profPort", "PProf Port", "", ""}
opts[LocallyProf] = MountOption{"locallyProf", "Locally PProf", "", false}
opts[IcacheTimeout] = MountOption{"icacheTimeout", "Inode Cache Expiration Time", "", int64(-1)}
opts[LookupValid] = MountOption{"lookupValid", "Lookup Valid Duration", "", int64(-1)}
opts[AttrValid] = MountOption{"attrValid", "Attr Valid Duration", "", int64(-1)}
opts[ReadRate] = MountOption{"readRate", "Read Rate Limit", "", int64(-1)}
opts[WriteRate] = MountOption{"writeRate", "Write Rate Limit", "", int64(-1)}
opts[EnSyncWrite] = MountOption{"enSyncWrite", "Enable Sync Write", "", int64(-1)}
opts[AutoInvalData] = MountOption{"autoInvalData", "Auto Invalidate Data", "", int64(-1)}
opts[Rdonly] = MountOption{"rdonly", "Mount as readonly", "", false}
opts[WriteCache] = MountOption{"writecache", "Enable FUSE writecache feature", "", false}
opts[KeepCache] = MountOption{"keepcache", "Enable FUSE keepcache feature", "", false}
opts[FollowerRead] = MountOption{"followerRead", "Enable read from follower", "", false}
opts[NearRead] = MountOption{"nearRead", "Enable read from nearest node", "", true}
opts[Authenticate] = MountOption{"authenticate", "Enable Authenticate", "", false}
opts[ClientKey] = MountOption{"clientKey", "Client Key", "", ""}
opts[TicketHost] = MountOption{"ticketHost", "Ticket Host", "", ""}
opts[EnableHTTPS] = MountOption{"enableHTTPS", "Enable HTTPS", "", false}
opts[CertFile] = MountOption{"certFile", "Cert File", "", ""}
opts[AccessKey] = MountOption{"accessKey", "Access Key", "", ""}
opts[SecretKey] = MountOption{"secretKey", "Secret Key", "", ""}
opts[DisableDcache] = MountOption{"disableDcache", "Disable Dentry Cache", "", false}
opts[SubDir] = MountOption{"subdir", "Mount sub directory", "", ""}
opts[FsyncOnClose] = MountOption{"fsyncOnClose", "Perform fsync upon file close", "", true}
opts[MaxCPUs] = MountOption{"maxcpus", "The maximum number of CPUs that can be executing simultaneously", "", int64(-1)}
opts[EnableXattr] = MountOption{"enableXattr", "Enable xattr support", "", false}
opts[EnablePosixACL] = MountOption{"enablePosixACL", "Enable posix ACL support", "", false}
opts[EnableSummary] = MountOption{"enableSummary", "Enable content summary", "", false}
opts[EnableUnixPermission] = MountOption{"enableUnixPermission", "Enable unix permission check(e.g: 777/755)", "", false}
opts[VolType] = MountOption{"volType", "volume type", "", int64(0)}
opts[EbsEndpoint] = MountOption{"ebsEndpoint", "Ebs service address", "", ""}
opts[EbsServerPath] = MountOption{"ebsServerPath", "Ebs service path", "", ""}
opts[CacheAction] = MountOption{"cacheAction", "Cold cache action", "", int64(0)}
opts[EbsBlockSize] = MountOption{"ebsBlockSize", "Ebs object size", "", ""}
// opts[EnableBcache] = MountOption{"enableBcache", "Enable block cache", "", false}
opts[BcacheDir] = MountOption{"bcacheDir", "block cache dir", "", ""}
opts[ReadThreads] = MountOption{"readThreads", "Cold volume read threads", "", int64(10)}
opts[WriteThreads] = MountOption{"writeThreads", "Cold volume write threads", "", int64(10)}
opts[MetaSendTimeout] = MountOption{"metaSendTimeout", "Meta send timeout", "", int64(600)}
opts[BuffersTotalLimit] = MountOption{"buffersTotalLimit", "Send/Receive packets memory limit", "", int64(32768)} // default 4G
opts[MaxStreamerLimit] = MountOption{"maxStreamerLimit", "The maximum number of streamers", "", int64(0)} // default 0
opts[BcacheFilterFiles] = MountOption{"bcacheFilterFiles", "The block cache filter files suffix", "", "py;pyx;sh;yaml;conf;pt;pth;log;out"}
opts[BcacheBatchCnt] = MountOption{"bcacheBatchCnt", "The block cache get meta count", "", int64(100000)}
opts[BcacheCheckIntervalS] = MountOption{"bcacheCheckIntervalS", "The block cache check interval", "", int64(300)}
opts[EnableAudit] = MountOption{"enableAudit", "enable client audit logging", "", false}
opts[RequestTimeout] = MountOption{"requestTimeout", "The Request Expiration Time", "", int64(0)}
opts[MinWriteAbleDataPartitionCnt] = MountOption{
"minWriteAbleDataPartitionCnt",
"Min writeable data partition count retained int dpSelector when update DataPartitionsView from master",
"", int64(10),
}
opts[FileSystemName] = MountOption{"fileSystemName", "The explicit name of the filesystem", "", ""}
opts[SnapshotReadVerSeq] = MountOption{"snapshotReadSeq", "Snapshot read seq", "", int64(0)} // default false
opts[DisableMountSubtype] = MountOption{"disableMountSubtype", "Disable Mount Subtype", "", false}
for i := 0; i < MaxMountOption; i++ {
flag.StringVar(&opts[i].cmdlineValue, opts[i].keyword, "", opts[i].description)
}
}
func ParseMountOptions(opts []MountOption, cfg *config.Config) {
for i := 0; i < MaxMountOption; i++ {
switch v := opts[i].value.(type) {
case string:
if opts[i].cmdlineValue != "" {
opts[i].value = opts[i].cmdlineValue
} else {
if value, present := cfg.CheckAndGetString(opts[i].keyword); present {
opts[i].value = value
} else {
opts[i].value = v
}
}
fmt.Printf("keyword[%v] value[%v] type[%T]\n", opts[i].keyword, opts[i].value, v)
case int64:
if opts[i].cmdlineValue != "" {
opts[i].value = parseInt64(opts[i].cmdlineValue)
} else {
if present := cfg.HasKey(opts[i].keyword); present {
opts[i].value = cfg.GetInt64(opts[i].keyword)
} else {
opts[i].value = v
}
}
fmt.Printf("keyword[%v] value[%v] type[%T]\n", opts[i].keyword, opts[i].value, v)
case bool:
if opts[i].cmdlineValue != "" {
opts[i].value = parseBool(opts[i].cmdlineValue)
} else {
if value, present := cfg.CheckAndGetBool(opts[i].keyword); present {
opts[i].value = value
} else {
opts[i].value = v
}
}
fmt.Printf("keyword[%v] value[%v] type[%T]\n", opts[i].keyword, opts[i].value, v)
default:
fmt.Printf("keyword[%v] unknown type[%T]\n", opts[i].keyword, v)
}
}
}
func parseInt64(s string) int64 {
var ret int64 = -1
if s != "" {
val, err := strconv.Atoi(s)
if err == nil {
ret = int64(val)
}
}
return ret
}
func parseBool(s string) bool {
return s == "true"
}
func (opt *MountOption) GetString() string {
val, ok := opt.value.(string)
if !ok {
return ""
}
return val
}
func (opt *MountOption) GetBool() bool {
val, ok := opt.value.(bool)
if !ok {
return false
}
return val
}
func (opt *MountOption) GetInt64() int64 {
val, ok := opt.value.(int64)
if !ok {
return int64(-1)
}
return val
}
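// Usage sketch (editor's illustration): command-line flags take precedence
// over the JSON config, and anything not set falls back to the defaults
// wired in InitMountOptions. The cfg variable is a hypothetical
// *config.Config loaded by the caller.
//
//	opts := NewMountOptions()
//	InitMountOptions(opts)
//	flag.Parse()
//	ParseMountOptions(opts, cfg)
//	volName := opts[VolName].GetString()
//	readRate := opts[ReadRate].GetInt64()
//	rdonly := opts[Rdonly].GetBool()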
type MountOptions struct {
Config *config.Config
MountPoint string
Volname string
Owner string
Master string
Logpath string
Loglvl string
Profport string
LocallyProf bool
IcacheTimeout int64
LookupValid int64
AttrValid int64
ReadRate int64
WriteRate int64
EnSyncWrite int64
AutoInvalData int64
UmpDatadir string
Rdonly bool
WriteCache bool
KeepCache bool
FollowerRead bool
Authenticate bool
TicketMess auth.TicketMess
TokenKey string
AccessKey string
SecretKey string
DisableDcache bool
SubDir string
FsyncOnClose bool
MaxCPUs int64
EnableXattr bool
NearRead bool
EnablePosixACL bool
EnableQuota bool
EnableTransaction string
TxTimeout int64
TxConflictRetryNum int64
TxConflictRetryInterval int64
VolType int
EbsEndpoint string
EbsServicePath string
CacheAction int
CacheThreshold int
EbsBlockSize int
EnableBcache bool
BcacheDir string
BcacheFilterFiles string
BcacheCheckIntervalS int64
BcacheBatchCnt int64
ReadThreads int64
WriteThreads int64
EnableSummary bool
EnableUnixPermission bool
NeedRestoreFuse bool
MetaSendTimeout int64
BuffersTotalLimit int64
MaxStreamerLimit int64
EnableAudit bool
RequestTimeout int64
MinWriteAbleDataPartitionCnt int
FileSystemName string
VerReadSeq uint64
// disable mount subtype
DisableMountSubtype bool
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"bytes"
"encoding/binary"
"fmt"
"github.com/cubefs/cubefs/util/btree"
)
type Blob struct {
MinBid uint64
Count uint64
Vid uint64
}
// ObjExtentKey defines the extent key struct.
type ObjExtentKey struct {
Cid uint64 // cluster id
CodeMode uint8 // EC encode and decode mode
BlobSize uint32 // block size
BlobsLen uint32 // blob array length
Size uint64 // objExtentKey size
Blobs []Blob
FileOffset uint64 // obj offset in file
Crc uint32
// snapshot
VerSeq uint64
ModGen uint64
}
// String returns the string format of the extentKey.
func (k ObjExtentKey) String() string {
return fmt.Sprintf("ObjExtentKey{FileOffset(%v),Cid(%v),CodeMode(%v),BlobSize(%v),BlobsLen(%v),Blobs(%v),Size(%v),Crc(%v)}", k.FileOffset, k.Cid, k.CodeMode, k.BlobSize, k.BlobsLen, k.Blobs, k.Size, k.Crc)
}
// Less defines the less comparator.
func (k *ObjExtentKey) Less(than btree.Item) bool {
that := than.(*ObjExtentKey)
return k.FileOffset < that.FileOffset
}
// Copy returns the item itself; no deep copy is made.
func (k *ObjExtentKey) Copy() btree.Item {
return k
}
func (k *ObjExtentKey) IsEquals(obj *ObjExtentKey) bool {
if k.FileOffset != obj.FileOffset {
return false
}
if k.Cid != obj.Cid {
return false
}
if k.CodeMode != obj.CodeMode {
return false
}
if k.BlobSize != obj.BlobSize {
return false
}
if k.BlobsLen != obj.BlobsLen {
return false
}
if k.Size != obj.Size {
return false
}
if k.Crc != obj.Crc {
return false
}
if len(k.Blobs) != len(obj.Blobs) {
return false
}
if len(k.Blobs) > 0 {
for i := len(k.Blobs) - 1; i >= 0; i-- {
if k.Blobs[i].Count != obj.Blobs[i].Count || k.Blobs[i].MinBid != obj.Blobs[i].MinBid || k.Blobs[i].Vid != obj.Blobs[i].Vid {
return false
}
}
}
return true
}
// MarshalBinary marshals the binary format of the extent key.
func (k *ObjExtentKey) MarshalBinary() ([]byte, error) {
buf := bytes.NewBuffer(make([]byte, 0))
if err := binary.Write(buf, binary.BigEndian, uint32(len(k.Blobs))); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.FileOffset); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.Size); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.Crc); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.CodeMode); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.Cid); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.BlobSize); err != nil {
return nil, err
}
if err := binary.Write(buf, binary.BigEndian, k.Blobs); err != nil {
return nil, err
}
return buf.Bytes(), nil
}
func (k *ObjExtentKey) UnmarshalBinary(buf *bytes.Buffer) (err error) {
if err = binary.Read(buf, binary.BigEndian, &k.BlobsLen); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.FileOffset); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.Size); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.Crc); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.CodeMode); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.Cid); err != nil {
return
}
if err = binary.Read(buf, binary.BigEndian, &k.BlobSize); err != nil {
return
}
blobs := make([]Blob, 0, int(k.BlobsLen))
for i := 0; i < int(k.BlobsLen); i++ {
tmpBlob := Blob{}
if err = binary.Read(buf, binary.BigEndian, &tmpBlob); err != nil {
return
}
blobs = append(blobs, tmpBlob)
}
k.Blobs = blobs
return
}
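// Usage sketch (editor's illustration): MarshalBinary and UnmarshalBinary
// are a round-trip pair; the blob count written first becomes BlobsLen on
// decode.
//
//	src := &ObjExtentKey{Cid: 1, CodeMode: 2, BlobSize: 1 << 22, Size: 4096,
//		Blobs: []Blob{{MinBid: 100, Count: 1, Vid: 7}}}
//	data, _ := src.MarshalBinary()
//	dst := &ObjExtentKey{}
//	_ = dst.UnmarshalBinary(bytes.NewBuffer(data))
//	// dst.BlobsLen == 1 and dst.Blobs mirrors src.Blobs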
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"bytes"
"encoding/binary"
"encoding/json"
"errors"
"fmt"
"io"
"net"
"strconv"
"sync/atomic"
"syscall"
"time"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/buf"
"github.com/cubefs/cubefs/util/log"
)
var (
GRequestID = int64(1)
Buffers *buf.BufferPool
)
// GenerateRequestID generates the request ID.
func GenerateRequestID() int64 {
return atomic.AddInt64(&GRequestID, 1)
}
const (
AddrSplit = "/"
)
// Operations
const (
ProtoMagic uint8 = 0xFF
OpInitResultCode uint8 = 0x00
OpCreateExtent uint8 = 0x01
OpMarkDelete uint8 = 0x02
OpWrite uint8 = 0x03
OpRead uint8 = 0x04
OpStreamRead uint8 = 0x05
OpStreamFollowerRead uint8 = 0x06
OpGetAllWatermarks uint8 = 0x07
OpNotifyReplicasToRepair uint8 = 0x08
OpExtentRepairRead uint8 = 0x09
OpBroadcastMinAppliedID uint8 = 0x0A
OpRandomWrite uint8 = 0x0F
OpGetAppliedId uint8 = 0x10
OpGetPartitionSize uint8 = 0x11
OpSyncRandomWrite uint8 = 0x12
OpSyncWrite uint8 = 0x13
OpReadTinyDeleteRecord uint8 = 0x14
OpTinyExtentRepairRead uint8 = 0x15
OpGetMaxExtentIDAndPartitionSize uint8 = 0x16
// Operations: Client -> MetaNode.
OpMetaCreateInode uint8 = 0x20
OpMetaUnlinkInode uint8 = 0x21
OpMetaCreateDentry uint8 = 0x22
OpMetaDeleteDentry uint8 = 0x23
OpMetaOpen uint8 = 0x24
OpMetaLookup uint8 = 0x25
OpMetaReadDir uint8 = 0x26
OpMetaInodeGet uint8 = 0x27
OpMetaBatchInodeGet uint8 = 0x28
OpMetaExtentsAdd uint8 = 0x29
OpMetaExtentsDel uint8 = 0x2A
OpMetaExtentsList uint8 = 0x2B
OpMetaUpdateDentry uint8 = 0x2C
OpMetaTruncate uint8 = 0x2D
OpMetaLinkInode uint8 = 0x2E
OpMetaEvictInode uint8 = 0x2F
OpMetaSetattr uint8 = 0x30
OpMetaReleaseOpen uint8 = 0x31
// Operations: MetaNode Leader -> MetaNode Follower
OpMetaFreeInodesOnRaftFollower uint8 = 0x32
OpMetaDeleteInode uint8 = 0x33 // delete specified inode immediately and do not remove data.
OpMetaBatchExtentsAdd uint8 = 0x34 // for extents batch attachment
OpMetaSetXAttr uint8 = 0x35
OpMetaGetXAttr uint8 = 0x36
OpMetaRemoveXAttr uint8 = 0x37
OpMetaListXAttr uint8 = 0x38
OpMetaBatchGetXAttr uint8 = 0x39
OpMetaExtentAddWithCheck uint8 = 0x3A // Append extent key with discard extents check
OpMetaReadDirLimit uint8 = 0x3D
// Operations: Master -> MetaNode
OpCreateMetaPartition uint8 = 0x40
OpMetaNodeHeartbeat uint8 = 0x41
OpDeleteMetaPartition uint8 = 0x42
OpUpdateMetaPartition uint8 = 0x43
OpLoadMetaPartition uint8 = 0x44
OpDecommissionMetaPartition uint8 = 0x45
OpAddMetaPartitionRaftMember uint8 = 0x46
OpRemoveMetaPartitionRaftMember uint8 = 0x47
OpMetaPartitionTryToLeader uint8 = 0x48
// Quota
OpMetaBatchSetInodeQuota uint8 = 0x50
OpMetaBatchDeleteInodeQuota uint8 = 0x51
OpMetaGetInodeQuota uint8 = 0x52
OpQuotaCreateInode uint8 = 0x53
OpQuotaCreateDentry uint8 = 0x54
// Operations: Master -> LcNode
OpLcNodeHeartbeat uint8 = 0x55
OpLcNodeScan uint8 = 0x56
OpLcNodeSnapshotVerDel uint8 = 0x57
// Operations: Master -> DataNode
OpCreateDataPartition uint8 = 0x60
OpDeleteDataPartition uint8 = 0x61
OpLoadDataPartition uint8 = 0x62
OpDataNodeHeartbeat uint8 = 0x63
OpReplicateFile uint8 = 0x64
OpDeleteFile uint8 = 0x65
OpDecommissionDataPartition uint8 = 0x66
OpAddDataPartitionRaftMember uint8 = 0x67
OpRemoveDataPartitionRaftMember uint8 = 0x68
OpDataPartitionTryToLeader uint8 = 0x69
OpQos uint8 = 0x6A
OpStopDataPartitionRepair uint8 = 0x6B
// Operations: MultipartInfo
OpCreateMultipart uint8 = 0x70
OpGetMultipart uint8 = 0x71
OpAddMultipartPart uint8 = 0x72
OpRemoveMultipart uint8 = 0x73
OpListMultiparts uint8 = 0x74
OpBatchDeleteExtent uint8 = 0x75 // SDK to MetaNode
OpGetExpiredMultipart uint8 = 0x76
// Operations: MetaNode Leader -> MetaNode Follower
OpMetaBatchDeleteInode uint8 = 0x90
OpMetaBatchDeleteDentry uint8 = 0x91
OpMetaBatchUnlinkInode uint8 = 0x92
OpMetaBatchEvictInode uint8 = 0x93
// Transaction Operations: Client -> MetaNode.
OpMetaTxCreate uint8 = 0xA0
OpMetaTxCreateInode uint8 = 0xA1
OpMetaTxUnlinkInode uint8 = 0xA2
OpMetaTxCreateDentry uint8 = 0xA3
OpTxCommit uint8 = 0xA4
OpTxRollback uint8 = 0xA5
OpTxCommitRM uint8 = 0xA6
OpTxRollbackRM uint8 = 0xA7
OpMetaTxDeleteDentry uint8 = 0xA8
OpMetaTxUpdateDentry uint8 = 0xA9
OpMetaTxLinkInode uint8 = 0xAA
OpMetaTxGet uint8 = 0xAB
// Operations: Client -> MetaNode.
OpMetaGetUniqID uint8 = 0xAC
// Multi version snapshot
OpRandomWriteAppend uint8 = 0xB1
OpSyncRandomWriteAppend uint8 = 0xB2
OpRandomWriteVer uint8 = 0xB3
OpSyncRandomWriteVer uint8 = 0xB4
OpSyncRandomWriteVerRsp uint8 = 0xB5
OpTryWriteAppend uint8 = 0xB6
OpSyncTryWriteAppend uint8 = 0xB7
OpVersionOp uint8 = 0xB8
// Commons
OpNoSpaceErr uint8 = 0xEE
OpDirQuota uint8 = 0xF1
// Commons
OpConflictExtentsErr uint8 = 0xF2
OpIntraGroupNetErr uint8 = 0xF3
OpArgMismatchErr uint8 = 0xF4
OpNotExistErr uint8 = 0xF5
OpDiskNoSpaceErr uint8 = 0xF6
OpDiskErr uint8 = 0xF7
OpErr uint8 = 0xF8
OpAgain uint8 = 0xF9
OpExistErr uint8 = 0xFA
OpInodeFullErr uint8 = 0xFB
OpTryOtherAddr uint8 = 0xFC
OpNotPerm uint8 = 0xFD
OpNotEmpty uint8 = 0xFE
OpOk uint8 = 0xF0
OpAgainVerionList uint8 = 0xEF
OpPing uint8 = 0xFF
OpMetaUpdateXAttr uint8 = 0x3B
OpMetaReadDirOnly uint8 = 0x3C
OpUploadPartConflictErr uint8 = 0x3D
// ebs obj meta
OpMetaObjExtentAdd uint8 = 0xDD
OpMetaObjExtentsList uint8 = 0xDE
OpMetaExtentsEmpty uint8 = 0xDF
OpMetaBatchObjExtentsAdd uint8 = 0xD0
OpMetaClearInodeCache uint8 = 0xD1
OpMetaBatchSetXAttr uint8 = 0xD2
OpMetaGetAllXAttr uint8 = 0xD3
// transaction error
OpTxInodeInfoNotExistErr uint8 = 0xE0
OpTxConflictErr uint8 = 0xE1
OpTxDentryInfoNotExistErr uint8 = 0xE2
OpTxRbInodeNotExistErr uint8 = 0xE3
OpTxRbDentryNotExistErr uint8 = 0xE4
OpTxInfoNotExistErr uint8 = 0xE5
OpTxInternalErr uint8 = 0xE6
OpTxCommitItemErr uint8 = 0xE7
OpTxRollbackItemErr uint8 = 0xE8
OpTxRollbackUnknownRbType uint8 = 0xE9
OpTxTimeoutErr uint8 = 0xEA
OpTxSetStateErr uint8 = 0xEB
OpTxCommitErr uint8 = 0xEC
OpTxRollbackErr uint8 = 0xED
OpTxUnknownOp uint8 = 0xEE
// multiVersion to dp/mp
OpVersionOperation uint8 = 0xD5
OpSplitMarkDelete uint8 = 0xD6
OpTryOtherExtent uint8 = 0xD7
)
const (
WriteDeadlineTime = 5
ReadDeadlineTime = 5
SyncSendTaskDeadlineTime = 30
NoReadDeadlineTime = -1
BatchDeleteExtentReadDeadLineTime = 120
GetAllWatermarksDeadLineTime = 60
DefaultClusterLoadFactor float64 = 10
MultiVersionFlag = 0x80
VersionListFlag = 0x40
)
// multi version operation
const (
CreateVersion = 1
DeleteVersion = 2
CreateVersionPrepare = 3
CreateVersionCommit = 4
SyncBatchVersionList = 5
)
// stage of version building
const (
VersionInit = 0
VersionWorking = 1
VersionWorkingTimeOut = 2
VersionWorkingAbnormal = 3
VersionWorkingFinished = 4
)
// status of version
const (
VersionNormal = 1
VersionDeleted = 2
VersionDeleting = 3
VersionDeleteAbnormal = 4
VersionPrepare = 5
)
const (
TinyExtentType = 0
NormalExtentType = 1
)
const (
NormalCreateDataPartition = 0
DecommissionedCreateDataPartition = 1
)
// Packet defines the packet structure.
type Packet struct {
Magic uint8
ExtentType uint8 // the highest bit is set in the response to the client when the version is inconsistent; VerSeq is valid in that case
Opcode uint8
ResultCode uint8
RemainingFollowers uint8
CRC uint32
Size uint32
ArgLen uint32
KernelOffset uint64
PartitionID uint64
ExtentID uint64
ExtentOffset int64
ReqID int64
Arg []byte // for create or append ops, the data contains the address
Data []byte
StartT int64
mesg string
HasPrepare bool
VerSeq uint64 // only used in mod request to datanode
VerList []*VolVersionInfo
}
func IsTinyExtentType(extentType uint8) bool {
return extentType&NormalExtentType != NormalExtentType
}
func IsNormalExtentType(extentType uint8) bool {
return extentType&NormalExtentType == NormalExtentType
}
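// Usage sketch (editor's illustration): the extent type is a bit field; the
// NormalExtentType bit decides normal vs. tiny, and higher bits such as
// MultiVersionFlag can be set on top without changing that answer.
//
//	IsNormalExtentType(NormalExtentType)                    // true
//	IsTinyExtentType(TinyExtentType)                        // true
//	IsNormalExtentType(NormalExtentType | MultiVersionFlag) // true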
// NewPacket returns a new packet.
func NewPacket() *Packet {
p := new(Packet)
p.Magic = ProtoMagic
p.StartT = time.Now().UnixNano()
return p
}
// NewPacketReqID returns a new packet with ReqID assigned.
func NewPacketReqID() *Packet {
p := NewPacket()
p.ReqID = GenerateRequestID()
return p
}
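// Usage sketch (editor's illustration): build a request packet with a fresh
// request ID, then fill in the operation-specific fields before sending.
//
//	p := NewPacketReqID()
//	p.Opcode = OpMetaInodeGet
//	p.PartitionID = 1
//	// marshal the request body into p.Data and set p.Size accordingly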
func (p *Packet) GetCopy() *Packet {
newPacket := NewPacket()
newPacket.ReqID = p.ReqID
newPacket.Opcode = p.Opcode
newPacket.PartitionID = p.PartitionID
newPacket.Data = make([]byte, p.Size)
copy(newPacket.Data[:p.Size], p.Data)
newPacket.Size = p.Size
return newPacket
}
func (p *Packet) String() string {
return fmt.Sprintf("ReqID(%v)Op(%v)PartitionID(%v)ResultCode(%v)ExID(%v)ExtOffset(%v)KernelOff(%v)Type(%v)Seq(%v)Size(%v)",
p.ReqID, p.GetOpMsg(), p.PartitionID, p.GetResultMsg(), p.ExtentID, p.ExtentOffset, p.KernelOffset, p.ExtentType, p.VerSeq, p.Size)
}
// GetStoreType returns the store type.
func (p *Packet) GetStoreType() (m string) {
if IsNormalExtentType(p.ExtentType) {
return "NormalExtent"
} else if IsTinyExtentType(p.ExtentType) {
return "TinyExtent"
} else {
return "Unknown"
}
}
func (p *Packet) GetOpMsgWithReqAndResult() (m string) {
return fmt.Sprintf("Req(%v)_(%v)_Result(%v)", p.ReqID, p.GetOpMsg(), p.GetResultMsg())
}
// GetOpMsg returns the operation type.
func (p *Packet) GetOpMsg() (m string) {
switch p.Opcode {
case OpCreateExtent:
m = "OpCreateExtent"
case OpMarkDelete:
m = "OpMarkDelete"
case OpSplitMarkDelete:
m = "OpMarkDelete"
case OpWrite:
m = "OpWrite"
case OpTryWriteAppend:
m = "OpTryWriteAppend"
case OpRandomWrite:
m = "OpRandomWrite"
case OpRandomWriteAppend:
m = "OpRandomWriteAppend"
case OpRandomWriteVer:
m = "OpRandomWriteVer"
case OpRead:
m = "Read"
case OpStreamRead:
m = "OpStreamRead"
case OpStreamFollowerRead:
m = "OpStreamFollowerRead"
case OpGetAllWatermarks:
m = "OpGetAllWatermarks"
case OpNotifyReplicasToRepair:
m = "OpNotifyReplicasToRepair"
case OpExtentRepairRead:
m = "OpExtentRepairRead"
case OpConflictExtentsErr:
m = "ConflictExtentsErr"
case OpIntraGroupNetErr:
m = "IntraGroupNetErr"
case OpMetaCreateInode:
m = "OpMetaCreateInode"
case OpQuotaCreateInode:
m = "OpQuotaCreateInode"
case OpMetaUnlinkInode:
m = "OpMetaUnlinkInode"
case OpMetaBatchUnlinkInode:
m = "OpMetaBatchUnlinkInode"
case OpMetaCreateDentry:
m = "OpMetaCreateDentry"
case OpQuotaCreateDentry:
m = "OpQuotaCreateDentry"
case OpMetaDeleteDentry:
m = "OpMetaDeleteDentry"
case OpMetaBatchDeleteDentry:
m = "OpMetaBatchDeleteDentry"
case OpMetaOpen:
m = "OpMetaOpen"
case OpMetaReleaseOpen:
m = "OpMetaReleaseOpen"
case OpMetaLookup:
m = "OpMetaLookup"
case OpMetaReadDir:
m = "OpMetaReadDir"
case OpMetaReadDirLimit:
m = "OpMetaReadDirLimit"
case OpMetaInodeGet:
m = "OpMetaInodeGet"
case OpMetaBatchInodeGet:
m = "OpMetaBatchInodeGet"
case OpMetaExtentsAdd:
m = "OpMetaExtentsAdd"
case OpMetaExtentAddWithCheck:
m = "OpMetaExtentAddWithCheck"
case OpMetaObjExtentAdd:
m = "OpMetaObjExtentAdd"
case OpMetaExtentsDel:
m = "OpMetaExtentsDel"
case OpMetaExtentsList:
m = "OpMetaExtentsList"
case OpMetaObjExtentsList:
m = "OpMetaObjExtentsList"
case OpMetaUpdateDentry:
m = "OpMetaUpdateDentry"
case OpMetaTruncate:
m = "OpMetaTruncate"
case OpMetaLinkInode:
m = "OpMetaLinkInode"
case OpMetaEvictInode:
m = "OpMetaEvictInode"
case OpMetaBatchEvictInode:
m = "OpMetaBatchEvictInode"
case OpMetaSetattr:
m = "OpMetaSetattr"
case OpCreateMetaPartition:
m = "OpCreateMetaPartition"
case OpMetaNodeHeartbeat:
m = "OpMetaNodeHeartbeat"
case OpDeleteMetaPartition:
m = "OpDeleteMetaPartition"
case OpUpdateMetaPartition:
m = "OpUpdateMetaPartition"
case OpLoadMetaPartition:
m = "OpLoadMetaPartition"
case OpDecommissionMetaPartition:
m = "OpDecommissionMetaPartition"
case OpCreateDataPartition:
m = "OpCreateDataPartition"
case OpDeleteDataPartition:
m = "OpDeleteDataPartition"
case OpLoadDataPartition:
m = "OpLoadDataPartition"
case OpDecommissionDataPartition:
m = "OpDecommissionDataPartition"
case OpDataNodeHeartbeat:
m = "OpDataNodeHeartbeat"
case OpReplicateFile:
m = "OpReplicateFile"
case OpDeleteFile:
m = "OpDeleteFile"
case OpGetAppliedId:
m = "OpGetAppliedId"
case OpGetPartitionSize:
m = "OpGetPartitionSize"
case OpSyncWrite:
m = "OpSyncWrite"
case OpSyncTryWriteAppend:
m = "OpSyncTryWriteAppend"
case OpSyncRandomWrite:
m = "OpSyncRandomWrite"
case OpSyncRandomWriteVer:
m = "OpSyncRandomWriteVer"
case OpSyncRandomWriteAppend:
m = "OpSyncRandomWriteAppend"
case OpReadTinyDeleteRecord:
m = "OpReadTinyDeleteRecord"
case OpPing:
m = "OpPing"
case OpTinyExtentRepairRead:
m = "OpTinyExtentRepairRead"
case OpGetMaxExtentIDAndPartitionSize:
m = "OpGetMaxExtentIDAndPartitionSize"
case OpBroadcastMinAppliedID:
m = "OpBroadcastMinAppliedID"
case OpRemoveDataPartitionRaftMember:
m = "OpRemoveDataPartitionRaftMember"
case OpAddDataPartitionRaftMember:
m = "OpAddDataPartitionRaftMember"
case OpAddMetaPartitionRaftMember:
m = "OpAddMetaPartitionRaftMember"
case OpRemoveMetaPartitionRaftMember:
m = "OpRemoveMetaPartitionRaftMember"
case OpMetaPartitionTryToLeader:
m = "OpMetaPartitionTryToLeader"
case OpDataPartitionTryToLeader:
m = "OpDataPartitionTryToLeader"
case OpMetaDeleteInode:
m = "OpMetaDeleteInode"
case OpMetaBatchDeleteInode:
m = "OpMetaBatchDeleteInode"
case OpMetaBatchExtentsAdd:
m = "OpMetaBatchExtentsAdd"
case OpMetaBatchObjExtentsAdd:
m = "OpMetaBatchObjExtentsAdd"
case OpMetaSetXAttr:
m = "OpMetaSetXAttr"
case OpMetaGetXAttr:
m = "OpMetaGetXAttr"
case OpMetaRemoveXAttr:
m = "OpMetaRemoveXAttr"
case OpMetaListXAttr:
m = "OpMetaListXAttr"
case OpMetaBatchGetXAttr:
m = "OpMetaBatchGetXAttr"
case OpMetaUpdateXAttr:
m = "OpMetaUpdateXAttr"
case OpCreateMultipart:
m = "OpCreateMultipart"
case OpGetMultipart:
m = "OpGetMultipart"
case OpAddMultipartPart:
m = "OpAddMultipartPart"
case OpRemoveMultipart:
m = "OpRemoveMultipart"
case OpListMultiparts:
m = "OpListMultiparts"
case OpBatchDeleteExtent:
m = "OpBatchDeleteExtent"
case OpMetaClearInodeCache:
m = "OpMetaClearInodeCache"
case OpMetaTxCreateInode:
m = "OpMetaTxCreateInode"
case OpMetaTxCreateDentry:
m = "OpMetaTxCreateDentry"
case OpTxCommit:
m = "OpTxCommit"
case OpMetaTxCreate:
m = "OpMetaTxCreate"
case OpTxRollback:
m = "OpTxRollback"
case OpTxCommitRM:
m = "OpTxCommitRM"
case OpTxRollbackRM:
m = "OpTxRollbackRM"
case OpMetaTxDeleteDentry:
m = "OpMetaTxDeleteDentry"
case OpMetaTxUnlinkInode:
m = "OpMetaTxUnlinkInode"
case OpMetaTxUpdateDentry:
m = "OpMetaTxUpdateDentry"
case OpMetaTxLinkInode:
m = "OpMetaTxLinkInode"
case OpMetaTxGet:
m = "OpMetaTxGet"
case OpMetaBatchSetInodeQuota:
m = "OpMetaBatchSetInodeQuota"
case OpMetaBatchDeleteInodeQuota:
m = "OpMetaBatchDeleteInodeQuota"
case OpMetaGetInodeQuota:
m = "OpMetaGetInodeQuota"
case OpStopDataPartitionRepair:
m = "OpStopDataPartitionRepair"
case OpLcNodeHeartbeat:
m = "OpLcNodeHeartbeat"
case OpLcNodeScan:
m = "OpLcNodeScan"
case OpLcNodeSnapshotVerDel:
m = "OpLcNodeSnapshotVerDel"
case OpMetaReadDirOnly:
m = "OpMetaReadDirOnly"
default:
m = fmt.Sprintf("op:%v not found", p.Opcode)
}
return
}
func GetStatusStr(status uint8) string {
pkt := &Packet{}
pkt.ResultCode = status
return pkt.GetResultMsg()
}
// GetResultMsg returns the result message.
func (p *Packet) GetResultMsg() (m string) {
if p == nil {
return ""
}
switch p.ResultCode {
case OpConflictExtentsErr:
m = "ConflictExtentsErr"
case OpIntraGroupNetErr:
m = "IntraGroupNetErr"
case OpDiskNoSpaceErr:
m = "DiskNoSpaceErr"
case OpDiskErr:
m = "DiskErr"
case OpErr:
m = "Err: " + string(p.Data)
case OpAgain:
m = "Again: " + string(p.Data)
case OpOk:
m = "Ok"
case OpExistErr:
m = "ExistErr"
case OpInodeFullErr:
m = "InodeFullErr"
case OpArgMismatchErr:
m = "ArgUnmatchErr"
case OpNotExistErr:
m = "NotExistErr"
case OpTryOtherAddr:
m = "TryOtherAddr"
case OpNotPerm:
m = "NotPerm"
case OpNotEmpty:
m = "DirNotEmpty"
case OpDirQuota:
m = "OpDirQuota"
case OpNoSpaceErr:
m = "NoSpaceErr"
case OpTxInodeInfoNotExistErr:
m = "OpTxInodeInfoNotExistErr"
case OpTxConflictErr:
m = "TransactionConflict"
case OpTxDentryInfoNotExistErr:
m = "OpTxDentryInfoNotExistErr"
case OpTxRbInodeNotExistErr:
m = "OpTxRbInodeNotExistEr"
case OpTxRbDentryNotExistErr:
m = "OpTxRbDentryNotExistEr"
case OpTxInfoNotExistErr:
m = "OpTxInfoNotExistErr"
case OpTxInternalErr:
m = "OpTxInternalErr"
case OpTxCommitItemErr:
m = "OpTxCommitItemErr"
case OpTxRollbackItemErr:
m = "OpTxRollbackItemErr"
case OpTxRollbackUnknownRbType:
m = "OpTxRollbackUnknownRbType"
case OpTxTimeoutErr:
m = "OpTxTimeoutErr"
case OpTxSetStateErr:
m = "OpTxSetStateErr"
case OpTxCommitErr:
m = "OpTxCommitErr"
case OpTxRollbackErr:
m = "OpTxRollbackErr"
case OpUploadPartConflictErr:
m = "OpUploadPartConflictErr"
default:
return fmt.Sprintf("Unknown ResultCode(%v)", p.ResultCode)
}
return
}
func (p *Packet) GetReqID() int64 {
return p.ReqID
}
// MarshalHeader marshals the packet header.
func (p *Packet) MarshalHeader(out []byte) {
out[0] = p.Magic
out[1] = p.ExtentType
out[2] = p.Opcode
out[3] = p.ResultCode
out[4] = p.RemainingFollowers
binary.BigEndian.PutUint32(out[5:9], p.CRC)
binary.BigEndian.PutUint32(out[9:13], p.Size)
binary.BigEndian.PutUint32(out[13:17], p.ArgLen)
binary.BigEndian.PutUint64(out[17:25], p.PartitionID)
binary.BigEndian.PutUint64(out[25:33], p.ExtentID)
binary.BigEndian.PutUint64(out[33:41], uint64(p.ExtentOffset))
binary.BigEndian.PutUint64(out[41:49], uint64(p.ReqID))
binary.BigEndian.PutUint64(out[49:util.PacketHeaderSize], p.KernelOffset)
if p.Opcode == OpRandomWriteVer || p.ExtentType&MultiVersionFlag > 0 {
binary.BigEndian.PutUint64(out[util.PacketHeaderSize:util.PacketHeaderSize+8], p.VerSeq)
}
}
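// Illustrative round trip (a sketch added for clarity; it assumes the packet was built with
// NewPacket, does not use OpRandomWriteVer, and does not carry the multi-version flag, so the
// fixed-size header buffer is large enough):
//
//	out := make([]byte, util.PacketHeaderSize)
//	p.MarshalHeader(out)
//	q := NewPacket()
//	_ = q.UnmarshalHeader(out) // q now holds p's fixed header fields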
func (p *Packet) IsVersionList() bool {
return p.ExtentType&VersionListFlag == VersionListFlag
}
// UnmarshalHeader unmarshals the packet header.
func (p *Packet) UnmarshalHeader(in []byte) error {
p.Magic = in[0]
if p.Magic != ProtoMagic {
return errors.New("Bad Magic " + strconv.Itoa(int(p.Magic)))
}
p.ExtentType = in[1]
p.Opcode = in[2]
p.ResultCode = in[3]
p.RemainingFollowers = in[4]
p.CRC = binary.BigEndian.Uint32(in[5:9])
p.Size = binary.BigEndian.Uint32(in[9:13])
p.ArgLen = binary.BigEndian.Uint32(in[13:17])
p.PartitionID = binary.BigEndian.Uint64(in[17:25])
p.ExtentID = binary.BigEndian.Uint64(in[25:33])
p.ExtentOffset = int64(binary.BigEndian.Uint64(in[33:41]))
p.ReqID = int64(binary.BigEndian.Uint64(in[41:49]))
p.KernelOffset = binary.BigEndian.Uint64(in[49:util.PacketHeaderSize])
// The version field for OpRandomWriteVer is not unmarshaled here because the header size is fixed;
// the caller reads the version sequence separately at a higher level.
return nil
}
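// verInfoCnt is the wire size of one version entry as written by MarshalVersionSlice,
// presumably Ver (8 bytes) + DelTime (8 bytes) + Status (1 byte).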
const verInfoCnt = 17
func (p *Packet) MarshalVersionSlice() (data []byte, err error) {
items := p.VerList
cnt := len(items)
buff := bytes.NewBuffer(make([]byte, 0, 2*cnt*verInfoCnt))
if err := binary.Write(buff, binary.BigEndian, uint16(cnt)); err != nil {
return nil, err
}
for _, v := range items {
if err := binary.Write(buff, binary.BigEndian, v.Ver); err != nil {
return nil, err
}
if err := binary.Write(buff, binary.BigEndian, v.DelTime); err != nil {
return nil, err
}
if err := binary.Write(buff, binary.BigEndian, v.Status); err != nil {
return nil, err
}
}
return buff.Bytes(), nil
}
func (p *Packet) UnmarshalVersionSlice(cnt int, d []byte) error {
items := make([]*VolVersionInfo, 0)
buf := bytes.NewBuffer(d)
var err error
for idx := 0; idx < cnt; idx++ {
e := &VolVersionInfo{}
err = binary.Read(buf, binary.BigEndian, &e.Ver)
if err != nil {
return err
}
err = binary.Read(buf, binary.BigEndian, &e.DelTime)
if err != nil {
return err
}
err = binary.Read(buf, binary.BigEndian, &e.Status)
if err != nil {
return err
}
items = append(items, e)
}
p.VerList = items
return nil
}
// MarshalData marshals the packet data.
func (p *Packet) MarshalData(v interface{}) error {
data, err := json.Marshal(v)
if err == nil {
p.Data = data
p.Size = uint32(len(p.Data))
}
return err
}
// UnmarshalData unmarshals the packet data.
func (p *Packet) UnmarshalData(v interface{}) error {
return json.Unmarshal(p.Data, v)
}
// WriteToNoDeadLineConn writes through the connection without deadline.
func (p *Packet) WriteToNoDeadLineConn(c net.Conn) (err error) {
header, err := Buffers.Get(util.PacketHeaderSize)
if err != nil {
header = make([]byte, util.PacketHeaderSize)
}
defer Buffers.Put(header)
p.MarshalHeader(header)
if _, err = c.Write(header); err == nil {
if _, err = c.Write(p.Arg[:int(p.ArgLen)]); err == nil {
if p.Data != nil {
_, err = c.Write(p.Data[:p.Size])
}
}
}
return
}
// WriteToConn writes through the given connection.
func (p *Packet) WriteToConn(c net.Conn) (err error) {
headSize := util.PacketHeaderSize
if p.Opcode == OpRandomWriteVer || p.ExtentType&MultiVersionFlag > 0 {
headSize = util.PacketHeaderVerSize
}
// log.LogDebugf("packet opcode %v header size %v extentype %v conn %v", p.Opcode, headSize, p.ExtentType, c)
header, err := Buffers.Get(headSize)
if err != nil {
header = make([]byte, headSize)
}
// log.LogErrorf("action[WriteToConn] buffer get nil,opcode %v head len [%v]", p.Opcode, len(header))
defer Buffers.Put(header)
c.SetWriteDeadline(time.Now().Add(WriteDeadlineTime * time.Second))
p.MarshalHeader(header)
if _, err = c.Write(header); err == nil {
// write dir version info.
if p.IsVersionList() {
d, err1 := p.MarshalVersionSlice()
if err1 != nil {
log.LogErrorf("MarshalVersionSlice: marshal version ifo failed, err %s", err1.Error())
return err1
}
_, err = c.Write(d)
if err != nil {
return err
}
}
if _, err = c.Write(p.Arg[:int(p.ArgLen)]); err == nil {
if p.Data != nil && p.Size != 0 {
_, err = c.Write(p.Data[:p.Size])
}
}
}
return
}
// ReadFull is a wrapper function of io.ReadFull.
func ReadFull(c net.Conn, buf *[]byte, readSize int) (err error) {
*buf = make([]byte, readSize)
_, err = io.ReadFull(c, (*buf)[:readSize])
return
}
func (p *Packet) IsWriteOperation() bool {
return p.Opcode == OpWrite || p.Opcode == OpSyncWrite
}
func (p *Packet) IsReadOperation() bool {
return p.Opcode == OpStreamRead || p.Opcode == OpRead ||
p.Opcode == OpExtentRepairRead || p.Opcode == OpReadTinyDeleteRecord ||
p.Opcode == OpTinyExtentRepairRead || p.Opcode == OpStreamFollowerRead
}
// ReadFromConnWithVer reads the data from the given connection.
// It recognizes the version bit and parses out the version. To avoid echoing the version field
// back in the response, the datanode's reply to a random write replaces OpRandomWriteVer with OpRandomWriteVerRsp.
func (p *Packet) ReadFromConnWithVer(c net.Conn, timeoutSec int) (err error) {
if timeoutSec != NoReadDeadlineTime {
c.SetReadDeadline(time.Now().Add(time.Second * time.Duration(timeoutSec)))
} else {
c.SetReadDeadline(time.Time{})
}
header, err := Buffers.Get(util.PacketHeaderSize)
if err != nil {
header = make([]byte, util.PacketHeaderSize)
}
defer Buffers.Put(header)
var n int
if n, err = io.ReadFull(c, header); err != nil {
return
}
if n != util.PacketHeaderSize {
return syscall.EBADMSG
}
if err = p.UnmarshalHeader(header); err != nil {
return
}
if p.ExtentType&MultiVersionFlag > 0 {
ver := make([]byte, 8)
if _, err = io.ReadFull(c, ver); err != nil {
return
}
p.VerSeq = binary.BigEndian.Uint64(ver)
}
if p.IsVersionList() {
cntByte := make([]byte, 2)
if _, err = io.ReadFull(c, cntByte); err != nil {
return err
}
cnt := binary.BigEndian.Uint16(cntByte)
log.LogDebugf("action[ReadFromConnWithVer] op %s verseq %v, extType %d, cnt %d",
p.GetOpMsg(), p.VerSeq, p.ExtentType, cnt)
verData := make([]byte, cnt*verInfoCnt)
if _, err = io.ReadFull(c, verData); err != nil {
log.LogWarnf("ReadFromConnWithVer: read ver slice from conn failed, err %s", err.Error())
return err
}
err = p.UnmarshalVersionSlice(int(cnt), verData)
if err != nil {
log.LogWarnf("ReadFromConnWithVer: unmarshal ver slice failed, err %s", err.Error())
return err
}
}
if p.ArgLen > 0 {
p.Arg = make([]byte, int(p.ArgLen))
if _, err = io.ReadFull(c, p.Arg[:int(p.ArgLen)]); err != nil {
return err
}
}
size := p.Size
if p.IsReadOperation() && p.ResultCode == OpInitResultCode {
size = 0
}
if p.IsWriteOperation() && size == util.BlockSize {
p.Data, _ = Buffers.Get(int(size))
} else {
p.Data = make([]byte, size)
}
if n, err = io.ReadFull(c, p.Data[:size]); err != nil {
return err
}
if n != int(size) {
return syscall.EBADMSG
}
return nil
}
// ReadFromConn reads the data from the given connection.
func (p *Packet) ReadFromConn(c net.Conn, timeoutSec int) (err error) {
if timeoutSec != NoReadDeadlineTime {
c.SetReadDeadline(time.Now().Add(time.Second * time.Duration(timeoutSec)))
} else {
c.SetReadDeadline(time.Time{})
}
header, err := Buffers.Get(util.PacketHeaderSize)
if err != nil {
header = make([]byte, util.PacketHeaderSize)
}
defer Buffers.Put(header)
var n int
if n, err = io.ReadFull(c, header); err != nil {
return
}
if n != util.PacketHeaderSize {
return syscall.EBADMSG
}
if err = p.UnmarshalHeader(header); err != nil {
return
}
if p.ArgLen > 0 {
p.Arg = make([]byte, int(p.ArgLen))
if _, err = io.ReadFull(c, p.Arg[:int(p.ArgLen)]); err != nil {
return err
}
}
size := p.Size
if (p.Opcode == OpRead || p.Opcode == OpStreamRead || p.Opcode == OpExtentRepairRead || p.Opcode == OpStreamFollowerRead) && p.ResultCode == OpInitResultCode {
size = 0
}
p.Data = make([]byte, size)
if n, err = io.ReadFull(c, p.Data[:size]); err != nil {
return err
}
if n != int(size) {
return syscall.EBADMSG
}
return nil
}
// PacketOkReply sets the result code as OpOk, and sets the body as empty.
func (p *Packet) PacketOkReply() {
p.ResultCode = OpOk
p.Size = 0
p.Data = nil
p.ArgLen = 0
}
// PacketOkWithBody sets the result code as OpOk, and sets the body with the given data.
func (p *Packet) PacketOkWithBody(reply []byte) {
p.Size = uint32(len(reply))
p.Data = make([]byte, p.Size)
copy(p.Data[:p.Size], reply)
p.ResultCode = OpOk
p.ArgLen = 0
}
// PacketOkWithByte sets the result code as OpOk and reuses the given byte slice as the body without copying.
// Note: intended for temporary byte slices, e.g. JSON-marshaled data.
func (p *Packet) PacketOkWithByte(reply []byte) {
p.Size = uint32(len(reply))
p.Data = reply
p.ResultCode = OpOk
p.ArgLen = 0
}
// PacketErrorWithBody sets the packet with error code whose body is filled with the given data.
func (p *Packet) PacketErrorWithBody(code uint8, reply []byte) {
p.Size = uint32(len(reply))
p.Data = make([]byte, p.Size)
copy(p.Data[:p.Size], reply)
p.ResultCode = code
p.ArgLen = 0
}
func (p *Packet) SetPacketHasPrepare() {
p.setPacketPrefix()
p.HasPrepare = true
}
func (p *Packet) SetPacketRePrepare() {
p.HasPrepare = false
}
func (p *Packet) AddMesgLog(m string) {
p.mesg += m
}
// GetUniqueLogId returns the unique log ID.
func (p *Packet) GetUniqueLogId() (m string) {
defer func() {
m = m + fmt.Sprintf("_ResultMesg(%v)", p.GetResultMsg())
}()
if p.HasPrepare {
m = p.mesg
return
}
m = fmt.Sprintf("Req(%v)_Partition(%v)_", p.ReqID, p.PartitionID)
if (p.Opcode == OpSplitMarkDelete || (IsTinyExtentType(p.ExtentType) && p.Opcode == OpMarkDelete)) && len(p.Data) > 0 {
ext := new(TinyExtentDeleteRecord)
err := json.Unmarshal(p.Data, ext)
if err == nil {
m += fmt.Sprintf("Extent(%v)_ExtentOffset(%v)_Size(%v)_Opcode(%v)",
ext.ExtentId, ext.ExtentOffset, ext.Size, p.GetOpMsg())
return m
}
} else if p.Opcode == OpReadTinyDeleteRecord || p.Opcode == OpNotifyReplicasToRepair || p.Opcode == OpDataNodeHeartbeat ||
p.Opcode == OpLoadDataPartition || p.Opcode == OpBatchDeleteExtent {
m += fmt.Sprintf("Opcode(%v)", p.GetOpMsg())
return
} else if p.Opcode == OpBroadcastMinAppliedID || p.Opcode == OpGetAppliedId {
if p.Size > 0 {
applyID := binary.BigEndian.Uint64(p.Data)
m += fmt.Sprintf("Opcode(%v)_AppliedID(%v)", p.GetOpMsg(), applyID)
} else {
m += fmt.Sprintf("Opcode(%v)", p.GetOpMsg())
}
return m
}
m = fmt.Sprintf("Req(%v)_Partition(%v)_Extent(%v)_ExtentOffset(%v)_KernelOffset(%v)_"+
"Size(%v)_Opcode(%v)_CRC(%v)",
p.ReqID, p.PartitionID, p.ExtentID, p.ExtentOffset,
p.KernelOffset, p.Size, p.GetOpMsg(), p.CRC)
return
}
func (p *Packet) setPacketPrefix() {
p.mesg = fmt.Sprintf("Req(%v)_Partition(%v)_", p.ReqID, p.PartitionID)
if (p.Opcode == OpSplitMarkDelete || (IsTinyExtentType(p.ExtentType) && p.Opcode == OpMarkDelete)) && len(p.Data) > 0 {
ext := new(TinyExtentDeleteRecord)
err := json.Unmarshal(p.Data, ext)
if err == nil {
p.mesg += fmt.Sprintf("Extent(%v)_ExtentOffset(%v)_Size(%v)_Opcode(%v)",
ext.ExtentId, ext.ExtentOffset, ext.Size, p.GetOpMsg())
return
}
} else if p.Opcode == OpReadTinyDeleteRecord || p.Opcode == OpNotifyReplicasToRepair || p.Opcode == OpDataNodeHeartbeat ||
p.Opcode == OpLoadDataPartition || p.Opcode == OpBatchDeleteExtent {
p.mesg += fmt.Sprintf("Opcode(%v)", p.GetOpMsg())
return
} else if p.Opcode == OpBroadcastMinAppliedID || p.Opcode == OpGetAppliedId {
if p.Size > 0 {
applyID := binary.BigEndian.Uint64(p.Data)
p.mesg += fmt.Sprintf("Opcode(%v)_AppliedID(%v)", p.GetOpMsg(), applyID)
} else {
p.mesg += fmt.Sprintf("Opcode(%v)", p.GetOpMsg())
}
return
}
p.mesg = fmt.Sprintf("Req(%v)_Partition(%v)_Extent(%v)_ExtentOffset(%v)_KernelOffset(%v)_"+
"Size(%v)_Opcode(%v)_CRC(%v)",
p.ReqID, p.PartitionID, p.ExtentID, p.ExtentOffset,
p.KernelOffset, p.Size, p.GetOpMsg(), p.CRC)
}
// IsForwardPkt returns if the packet is the forward packet (a packet that will be forwarded to the followers).
func (p *Packet) IsForwardPkt() bool {
return p.RemainingFollowers > 0
}
// LogMessage logs the given message.
func (p *Packet) LogMessage(action, remote string, start int64, err error) (m string) {
if err == nil {
m = fmt.Sprintf("id[%v] isPrimaryBackReplLeader[%v] remote[%v] "+
" cost[%v] ", p.GetUniqueLogId(), p.IsForwardPkt(), remote, (time.Now().UnixNano()-start)/1e6)
} else {
m = fmt.Sprintf("id[%v] isPrimaryBackReplLeader[%v] remote[%v]"+
", err[%v]", p.GetUniqueLogId(), p.IsForwardPkt(), remote, err.Error())
}
return
}
// ShouldRetryWithVersionList returns whether the packet should be retried with the version list.
func (p *Packet) ShouldRetryWithVersionList() bool {
return p.ResultCode == OpAgainVerionList
}
// ShouldRetry returns whether the packet should be retried.
func (p *Packet) ShouldRetry() bool {
return p.ResultCode == OpAgain || p.ResultCode == OpErr
}
func (p *Packet) IsBatchDeleteExtents() bool {
return p.Opcode == OpBatchDeleteExtent
}
func InitBufferPool(bufLimit int64) {
buf.NormalBuffersTotalLimit = bufLimit
buf.HeadBuffersTotalLimit = bufLimit
buf.HeadVerBuffersTotalLimit = bufLimit
Buffers = buf.NewBufferPool()
}
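// Illustrative sketch (the limit below is an arbitrary example, not a recommendation): the buffer
// pool must be initialized once, before packets are read from or written to connections, e.g.
// during process start-up:
//
//	InitBufferPool(32768) // caps the head/normal/head-ver buffer pools (semantics defined by the buf package)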
// Copyright 2020 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"path"
"regexp"
"strings"
)
var (
actionRegexp = regexp.MustCompile(`^action:((oss:(\w+))|(posix:(\w)+))$`)
actionPrefixRegexp = regexp.MustCompile(`^action:((oss)|(posix)):`)
)
type Action string
func (a Action) String() string {
return string(a)
}
func (a Action) IsNone() bool {
return len(a) == 0 || a == NoneAction
}
func (a Action) Name() string {
loc := actionPrefixRegexp.FindStringIndex(a.String())
if len(loc) != 2 {
return "Unknown"
}
return a.String()[loc[1]:]
}
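// For example (illustrative, based on the regexps above):
//
//	Action("action:oss:GetObject").Name() // "GetObject"
//	Action("oss:GetObject").Name()        // "Unknown" (missing the "action:" prefix)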
const (
ActionPrefix = "action:"
OSSActionPrefix = ActionPrefix + "oss:"
POSIXActionPrefix = ActionPrefix + "posix:"
// Object actions
OSSGetObjectAction Action = OSSActionPrefix + "GetObject"
OSSPutObjectAction Action = OSSActionPrefix + "PutObject"
OSSPostObjectAction Action = OSSActionPrefix + "PostObject"
OSSCopyObjectAction Action = OSSActionPrefix + "CopyObject"
OSSListObjectsAction Action = OSSActionPrefix + "ListObjects"
OSSDeleteObjectAction Action = OSSActionPrefix + "DeleteObject"
OSSDeleteObjectsAction Action = OSSActionPrefix + "DeleteObjects"
OSSHeadObjectAction Action = OSSActionPrefix + "HeadObject"
// Bucket actions
OSSCreateBucketAction Action = OSSActionPrefix + "CreateBucket"
OSSDeleteBucketAction Action = OSSActionPrefix + "DeleteBucket"
OSSHeadBucketAction Action = OSSActionPrefix + "HeadBucket"
OSSListBucketsAction Action = OSSActionPrefix + "ListBuckets"
// Bucket policy actions
OSSGetBucketPolicyAction Action = OSSActionPrefix + "GetBucketPolicy"
OSSPutBucketPolicyAction Action = OSSActionPrefix + "PutBucketPolicy"
OSSDeleteBucketPolicyAction Action = OSSActionPrefix + "DeleteBucketPolicy"
OSSGetBucketPolicyStatusAction Action = OSSActionPrefix + "GetBucketPolicyStatus" // unsupported
// Bucket ACL actions
OSSGetBucketAclAction Action = OSSActionPrefix + "GetBucketAcl"
OSSPutBucketAclAction Action = OSSActionPrefix + "PutBucketAcl"
// Bucket CORS actions
OSSGetBucketCorsAction Action = OSSActionPrefix + "GetBucketCors"
OSSPutBucketCorsAction Action = OSSActionPrefix + "PutBucketCors"
OSSDeleteBucketCorsAction Action = OSSActionPrefix + "DeleteBucketCors"
OSSOptionsObjectAction Action = OSSActionPrefix + "OptionsObject"
// Object torrent actions
OSSGetObjectTorrentAction Action = OSSActionPrefix + "GetObjectTorrent" // unsupported
// Object ACL actions
OSSGetObjectAclAction Action = OSSActionPrefix + "GetObjectAcl"
OSSPutObjectAclAction Action = OSSActionPrefix + "PutObjectAcl"
// Multipart actions
OSSCreateMultipartUploadAction Action = OSSActionPrefix + "CreateMultipartUpload"
OSSListMultipartUploadsAction Action = OSSActionPrefix + "ListMultipartUploads"
OSSUploadPartAction Action = OSSActionPrefix + "UploadPart"
OSSUploadPartCopyAction Action = OSSActionPrefix + "UploadPartCopy" // unsupported
OSSListPartsAction Action = OSSActionPrefix + "ListParts"
OSSCompleteMultipartUploadAction Action = OSSActionPrefix + "CompleteMultipartUpload"
OSSAbortMultipartUploadAction Action = OSSActionPrefix + "AbortMultipartUpload"
// Bucket location
OSSGetBucketLocationAction Action = OSSActionPrefix + "GetBucketLocation"
// Object extend attributes (xattr)
OSSGetObjectXAttrAction Action = OSSActionPrefix + "GetObjectXAttr"
OSSPutObjectXAttrAction Action = OSSActionPrefix + "PutObjectXAttr"
OSSListObjectXAttrsAction Action = OSSActionPrefix + "ListObjectXAttrs"
OSSDeleteObjectXAttrAction Action = OSSActionPrefix + "DeleteObjectXAttr"
// Object tagging actions
OSSGetObjectTaggingAction Action = OSSActionPrefix + "GetObjectTagging"
OSSPutObjectTaggingAction Action = OSSActionPrefix + "PutObjectTagging"
OSSDeleteObjectTaggingAction Action = OSSActionPrefix + "DeleteObjectTagging"
// Bucket tagging actions
OSSGetBucketTaggingAction Action = OSSActionPrefix + "GetBucketTagging"
OSSPutBucketTaggingAction Action = OSSActionPrefix + "PutBucketTagging"
OSSDeleteBucketTaggingAction Action = OSSActionPrefix + "DeleteBucketTagging"
// Bucket lifecycle actions
OSSGetBucketLifecycleAction Action = OSSActionPrefix + "GetBucketLifecycle" // unsupported
OSSPutBucketLifecycleAction Action = OSSActionPrefix + "PutBucketLifecycle" // unsupported
OSSDeleteBucketLifecycleAction Action = OSSActionPrefix + "DeleteBucketLifecycle" // unsupported
OSSGetBucketLifecycleConfigurationAction Action = OSSActionPrefix + "GetBucketLifecycleConfiguration"
OSSPutBucketLifecycleConfigurationAction Action = OSSActionPrefix + "PutBucketLifecycleConfiguration"
OSSDeleteBucketLifecycleConfigurationAction Action = OSSActionPrefix + "DeleteBucketLifecycleConfiguration"
// Object storage version actions
OSSGetBucketVersioningAction Action = OSSActionPrefix + "GetBucketVersioning" // unsupported
OSSPutBucketVersioningAction Action = OSSActionPrefix + "PutBucketVersioning" // unsupported
OSSListObjectVersionsAction Action = OSSActionPrefix + "ListObjectVersions" // unsupported
// Object legal hold actions
OSSGetObjectLegalHoldAction Action = OSSActionPrefix + "GetObjectLegalHold" // unsupported
OSSPutObjectLegalHoldAction Action = OSSActionPrefix + "PutObjectLegalHold" // unsupported
// Object retention actions
OSSGetObjectRetentionAction Action = OSSActionPrefix + "GetObjectRetention" // unsupported
OSSPutObjectRetentionAction Action = OSSActionPrefix + "PutObjectRetention" // unsupported
// Bucket encryption actions
OSSGetBucketEncryptionAction Action = OSSActionPrefix + "GetBucketEncryption" // unsupported
OSSPutBucketEncryptionAction Action = OSSActionPrefix + "PutBucketEncryption" // unsupported
OSSDeleteBucketEncryptionAction Action = OSSActionPrefix + "DeleteBucketEncryption" // unsupported
// Bucket website actions
OSSGetBucketWebsiteAction Action = OSSActionPrefix + "GetBucketWebsite" // unsupported
OSSPutBucketWebsiteAction Action = OSSActionPrefix + "PutBucketWebsite" // unsupported
OSSDeleteBucketWebsiteAction Action = OSSActionPrefix + "DeleteBucketWebsite" // unsupported
// Object restore actions
OSSRestoreObjectAction Action = OSSActionPrefix + "RestoreObject" // unsupported
// Public access block actions
OSSGetPublicAccessBlockAction Action = OSSActionPrefix + "GetPublicAccessBlock" // unsupported
OSSPutPublicAccessBlockAction Action = OSSActionPrefix + "PutPublicAccessBlock" // unsupported
OSSDeletePublicAccessBlockAction Action = OSSActionPrefix + "DeletePulicAccessBlock" // unsupported
// Bucket request payment actions
OSSGetBucketRequestPaymentAction Action = OSSActionPrefix + "GetBucketRequestPayment" // unsupported
OSSPutBucketRequestPaymentAction Action = OSSActionPrefix + "PutBucketRequestPayment" // unsupported
// Bucket replication actions
OSSGetBucketReplicationAction Action = OSSActionPrefix + "GetBucketReplicationAction" // unsupported
OSSPutBucketReplicationAction Action = OSSActionPrefix + "PutBucketReplicationAction" // unsupported
OSSDeleteBucketReplicationAction Action = OSSActionPrefix + "DeleteBucketReplicationAction" // unsupported
// STS actions
OSSGetFederationTokenAction Action = OSSActionPrefix + "GetFederationToken"
// constants for POSIX file system interface
POSIXReadAction Action = POSIXActionPrefix + "Read"
POSIXWriteAction Action = POSIXActionPrefix + "Write"
// Object Lock actions
OSSPutObjectLockConfigurationAction Action = OSSActionPrefix + "PutObjectLockConfiguration"
OSSGetObjectLockConfigurationAction Action = OSSActionPrefix + "GetObjectLockConfiguration"
NoneAction Action = ""
)
var AllActions = []Action{
// Object storage interface actions
OSSGetObjectAction,
OSSPutObjectAction,
OSSPostObjectAction,
OSSCopyObjectAction,
OSSListObjectsAction,
OSSDeleteObjectAction,
OSSDeleteObjectsAction,
OSSHeadObjectAction,
OSSCreateBucketAction,
OSSDeleteBucketAction,
OSSHeadBucketAction,
OSSListBucketsAction,
OSSGetBucketPolicyAction,
OSSPutBucketPolicyAction,
OSSDeleteBucketPolicyAction,
OSSGetBucketPolicyStatusAction,
OSSGetBucketAclAction,
OSSPutBucketAclAction,
OSSGetObjectTorrentAction,
OSSGetObjectAclAction,
OSSPutObjectAclAction,
OSSCreateMultipartUploadAction,
OSSListMultipartUploadsAction,
OSSUploadPartAction,
OSSUploadPartCopyAction,
OSSListPartsAction,
OSSCompleteMultipartUploadAction,
OSSAbortMultipartUploadAction,
OSSGetBucketLocationAction,
OSSGetObjectXAttrAction,
OSSPutObjectXAttrAction,
OSSListObjectXAttrsAction,
OSSDeleteObjectXAttrAction,
OSSGetObjectTaggingAction,
OSSPutObjectTaggingAction,
OSSDeleteObjectTaggingAction,
OSSGetBucketTaggingAction,
OSSPutBucketTaggingAction,
OSSDeleteBucketTaggingAction,
OSSGetBucketLifecycleAction,
OSSPutBucketLifecycleAction,
OSSDeleteBucketLifecycleAction,
OSSGetBucketLifecycleConfigurationAction,
OSSPutBucketLifecycleConfigurationAction,
OSSDeleteBucketLifecycleConfigurationAction,
OSSGetBucketVersioningAction,
OSSPutBucketVersioningAction,
OSSListObjectVersionsAction,
OSSGetObjectLegalHoldAction,
OSSPutObjectLegalHoldAction,
OSSGetObjectRetentionAction,
OSSPutObjectRetentionAction,
OSSGetBucketEncryptionAction,
OSSPutBucketEncryptionAction,
OSSDeleteBucketEncryptionAction,
OSSGetBucketCorsAction,
OSSPutBucketCorsAction,
OSSDeleteBucketCorsAction,
OSSGetBucketWebsiteAction,
OSSPutBucketWebsiteAction,
OSSDeleteBucketWebsiteAction,
OSSRestoreObjectAction,
OSSGetPublicAccessBlockAction,
OSSPutPublicAccessBlockAction,
OSSDeletePublicAccessBlockAction,
OSSGetBucketRequestPaymentAction,
OSSPutBucketRequestPaymentAction,
OSSGetBucketReplicationAction,
OSSPutBucketReplicationAction,
OSSDeleteBucketReplicationAction,
OSSOptionsObjectAction,
OSSGetFederationTokenAction,
// POSIX file system interface actions
POSIXReadAction,
POSIXWriteAction,
OSSPutObjectLockConfigurationAction,
OSSGetObjectLockConfigurationAction,
}
func ParseAction(str string) Action {
if len(str) == 0 || !actionRegexp.MatchString(str) {
return NoneAction
}
for _, act := range AllActions {
if act.String() == str {
return act
}
}
return NoneAction
}
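// For example (illustrative): only fully qualified, known action strings parse.
//
//	ParseAction("action:oss:PutObject") // OSSPutObjectAction
//	ParseAction("PutObject")            // NoneAction (the "action:oss:"/"action:posix:" prefix is required)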
type Actions []Action
func (actions Actions) Contains(action Action) bool {
if len(actions) == 0 {
return false
}
for _, a := range actions {
if a == action {
return true
}
}
return false
}
func (actions Actions) Len() int {
return len(actions)
}
type Permission string
func (p Permission) String() string {
return string(p)
}
func (p Permission) ReadableString() string {
if p.Valid() {
if p.IsBuiltin() {
return p.String()[len(BuiltinPermissionPrefix.String()):] + "(builtin)"
}
if p.IsCustom() {
return p.String()[len(CustomPermissionPrefix.String()):] + "(custom)"
}
return p.String()
}
return "None"
}
func (p Permission) IsBuiltin() bool {
return builtinPermRegexp.MatchString(string(p))
}
func (p Permission) MatchSubdir(subdir string) bool {
if !strings.HasPrefix(string(p), string(BuiltinPermissionPrefix)) {
return false
}
s := strings.TrimPrefix(string(p), string(BuiltinPermissionPrefix))
if !subdirRegexp.MatchString(s) {
return true
}
pars := strings.Split(s, ":")
pars = pars[:len(pars)-1] // trim (Writable|ReadOnly) at the end
for _, toCmp := range pars {
if toCmp == "/" || toCmp == "" {
return true
}
subdir = path.Clean("/" + subdir)
toCmp = path.Clean("/" + toCmp)
if strings.HasPrefix(subdir, toCmp) {
tail := strings.TrimPrefix(subdir, toCmp)
// match case 1:
// subdir = "/a/b/c"
// toCmp = "/a/b/c"
// tail = ""
// match case 2:
// subdir = "/a/b/c"
// toCmp = "/a/b"
// tail = "/c"
if tail == "" || strings.HasPrefix(tail, "/") {
return true
}
}
}
return false
}
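// For example (illustrative): a permission scoped to /a/b matches /a/b itself and anything
// below it, but not the sibling /a/bc.
//
//	Permission("perm:builtin:/a/b:ReadOnly").MatchSubdir("a/b/c") // true
//	Permission("perm:builtin:/a/b:ReadOnly").MatchSubdir("a/bc")  // false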
func (p Permission) IsCustom() bool {
return customPermRegexp.MatchString(string(p))
}
func (p Permission) Valid() bool {
return permRegexp.MatchString(string(p))
}
func (p Permission) IsNone() bool {
return p == NonePermission
}
const (
// prefixes for value organization
PermissionPrefix Permission = "perm:"
BuiltinPermissionPrefix Permission = PermissionPrefix + "builtin:"
CustomPermissionPrefix Permission = PermissionPrefix + "custom:"
// constants for builtin permissions
BuiltinPermissionReadOnly Permission = BuiltinPermissionPrefix + "ReadOnly"
BuiltinPermissionWritable Permission = BuiltinPermissionPrefix + "Writable"
// constants for unknown permission
NonePermission Permission = ""
)
var (
permRegexp = regexp.MustCompile(`^perm:((builtin:((.*/*)([^/]*):*)(Writable|ReadOnly))|(custom:(\w)+))$`)
builtinPermRegexp = regexp.MustCompile(`^perm:builtin:((.*/*)([^/]*):*)(Writable|ReadOnly)$`)
builtinWritablePermRegexp = regexp.MustCompile(`^perm:builtin:((.*/*)([^/]*):*)Writable$`)
builtinReadOnlyPermRegexp = regexp.MustCompile(`^perm:builtin:((.*/*)([^/]*):*)ReadOnly$`)
customPermRegexp = regexp.MustCompile(`^perm:custom:(\w)+$`)
subdirRegexp = regexp.MustCompile(`((.*/*)([^/]*)):(Writable|ReadOnly)$`)
)
func ParsePermission(value string) Permission {
if permRegexp.MatchString(value) {
return Permission(value)
}
return NonePermission
}
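// For example (illustrative):
//
//	ParsePermission("perm:builtin:ReadOnly") // BuiltinPermissionReadOnly
//	ParsePermission("builtin:ReadOnly")      // NonePermission (the "perm:" prefix is required)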
func NewCustomPermission(name string) Permission {
return Permission(CustomPermissionPrefix + Permission(name))
}
var builtinPermissionActionsMap = map[Permission]Actions{
BuiltinPermissionReadOnly: {
// Object storage interface actions
OSSGetObjectAction,
OSSListObjectsAction,
OSSHeadObjectAction,
OSSHeadBucketAction,
OSSGetObjectTorrentAction,
OSSGetObjectAclAction,
OSSListPartsAction,
OSSGetBucketLocationAction,
OSSGetObjectTaggingAction,
OSSListObjectVersionsAction,
OSSGetObjectLegalHoldAction,
OSSGetObjectRetentionAction,
OSSGetBucketEncryptionAction,
// file system interface
POSIXReadAction,
},
BuiltinPermissionWritable: {
// Object storage interface actions
OSSGetObjectAction,
OSSPutObjectAction,
OSSCopyObjectAction,
OSSListObjectsAction,
OSSDeleteObjectAction,
OSSDeleteObjectsAction,
OSSHeadObjectAction,
OSSHeadBucketAction,
OSSGetObjectTorrentAction,
OSSGetObjectAclAction,
OSSPutObjectAclAction,
OSSCreateMultipartUploadAction,
OSSListMultipartUploadsAction,
OSSUploadPartAction,
OSSUploadPartCopyAction,
OSSListPartsAction,
OSSCompleteMultipartUploadAction,
OSSAbortMultipartUploadAction,
OSSGetBucketLocationAction,
OSSGetObjectTaggingAction,
OSSPutObjectTaggingAction,
OSSDeleteObjectTaggingAction,
OSSListObjectVersionsAction,
OSSGetObjectLegalHoldAction,
OSSPutObjectLegalHoldAction,
OSSGetObjectRetentionAction,
OSSPutObjectRetentionAction,
OSSGetBucketEncryptionAction,
// POSIX file system interface actions
POSIXReadAction,
POSIXWriteAction,
},
}
func BuiltinPermissionActions(perm Permission) Actions {
var p Permission
if builtinWritablePermRegexp.MatchString(string(perm)) {
p = BuiltinPermissionWritable
} else if builtinReadOnlyPermRegexp.MatchString(string(perm)) {
p = BuiltinPermissionReadOnly
}
if actions, exists := builtinPermissionActionsMap[p]; exists {
return actions
}
return nil
}
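// For example (illustrative): a builtin permission resolves to the action set of its ReadOnly
// or Writable base, regardless of any embedded subdir.
//
//	BuiltinPermissionActions("perm:builtin:/a/b:ReadOnly").Contains(POSIXReadAction)  // true
//	BuiltinPermissionActions("perm:builtin:/a/b:ReadOnly").Contains(POSIXWriteAction) // false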
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import "strings"
const (
FlowLimit = "f"
QPSLimit = "q"
ConcurrentLimit = "c"
S3Nodes = "s3nodes"
DefaultUid = "default"
)
type UserLimitConf struct {
BandWidthQuota map[string]uint64 `json:"band_width_quota"` // uid --> BytesPS
QPSQuota map[string]uint64 `json:"qps_quota"` // uid --> QPS
ConcurrentQuota map[string]uint64 `json:"concurrent_quota"` // uid --> concurrency
}
type S3QosRequest struct {
Uid string `json:"uid"`
Api string `json:"api"`
Type string `json:"type"`
Quota uint64 `json:"quota"`
Nodes uint64 `json:"nodes"`
}
type S3QoSResponse struct {
ApiLimitConf map[string]*UserLimitConf `json:"user_limit_conf"` // api --> userLimitConf
Nodes uint64 `json:"nodes"`
}
func IsS3PutApi(api string) bool {
switch strings.ToLower(api) {
case "putobject", "copyobject", "uploadpart", "uploadpartcopy", "postobject":
return true
default:
return false
}
}
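// For example (illustrative): the check is case-insensitive and only covers write-style APIs.
//
//	IsS3PutApi("PutObject") // true
//	IsS3PutApi("GetObject") // false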
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"bytes"
"encoding/binary"
"encoding/json"
"errors"
"io"
"strconv"
"strings"
"time"
"github.com/cubefs/cubefs/util/btree"
"github.com/cubefs/cubefs/util/log"
)
const (
DefaultTransactionTimeout = 1 // minutes
MaxTransactionTimeout = 60 // minutes
DefaultTxConflictRetryNum = 10
MaxTxConflictRetryNum = 100
DefaultTxConflictRetryInterval = 20 // ms
MaxTxConflictRetryInterval = 1000 // ms
MinTxConflictRetryInterval = 10 // ms
DefaultTxDeleteTime = 120
ClearOrphanTxTime = 3600
)
type TxOpMask uint8
const (
TxOpMaskOff TxOpMask = 0x00
TxOpMaskAll TxOpMask = 0x7F
TxPause TxOpMask = 0xFF
)
const (
TxOpMaskCreate TxOpMask = 0x01 << iota
TxOpMaskMkdir
TxOpMaskRemove
TxOpMaskRename
TxOpMaskMknod
TxOpMaskSymlink
TxOpMaskLink
)
var GTxMaskMap = map[string]TxOpMask{
"off": TxOpMaskOff,
"create": TxOpMaskCreate,
"mkdir": TxOpMaskMkdir,
"remove": TxOpMaskRemove,
"rename": TxOpMaskRename,
"mknod": TxOpMaskMknod,
"symlink": TxOpMaskSymlink,
"link": TxOpMaskLink,
"all": TxOpMaskAll,
}
func GetMaskString(mask TxOpMask) (maskStr string) {
if mask == TxPause {
return "pause"
}
if mask&TxOpMaskAll == TxOpMaskAll {
return "all"
}
for k, v := range GTxMaskMap {
if k == "all" {
continue
}
if mask&v > 0 {
if maskStr == "" {
maskStr = k
} else {
maskStr = maskStr + "|" + k
}
}
}
if maskStr == "" {
maskStr = "off"
}
return
}
func txInvalidMask() (err error) {
return errors.New("transaction mask key value pair should be: enableTxMaskKey=[create|mkdir|remove|rename|mknod|symlink|link]\n enableTxMaskKey=off \n enableTxMaskKey=all")
}
func MaskContains(mask TxOpMask, subMask TxOpMask) bool {
if mask != TxOpMaskOff && subMask == TxOpMaskOff {
return false
}
if (mask | subMask) != mask {
return false
}
return true
}
func GetMaskFromString(maskStr string) (mask TxOpMask, err error) {
if maskStr == "" {
err = txInvalidMask()
return
}
if maskStr == "pause" {
mask = TxPause
return
}
arr := strings.Split(maskStr, "|")
optNum := len(arr)
for _, v := range arr {
if m, ok := GTxMaskMap[v]; ok {
if optNum >= 2 && (m == TxOpMaskOff || m == TxOpMaskAll) {
mask = TxOpMaskOff
err = txInvalidMask()
return
} else {
mask = mask | m
}
} else {
mask = TxOpMaskOff
err = txInvalidMask()
return
}
}
return mask, nil
}
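// For example (illustrative): individual masks combine with "|" and each bit can be tested
// with MaskContains.
//
//	m, _ := GetMaskFromString("create|rename")
//	MaskContains(m, TxOpMaskCreate) // true
//	MaskContains(m, TxOpMaskRemove) // false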
type TxInodeInfo struct {
Ino uint64
MpID uint64
CreateTime int64 // time.Now().Unix()
Timeout int64
TxID string
MpMembers string
}
func NewTxInodeInfo(members string, ino uint64, mpID uint64) *TxInodeInfo {
return &TxInodeInfo{
Ino: ino,
MpID: mpID,
MpMembers: members,
}
}
func (info *TxInodeInfo) String() string {
data, err := json.Marshal(info)
if err != nil {
return ""
}
return string(data)
}
func (info *TxInodeInfo) Marshal() (result []byte, err error) {
buff := bytes.NewBuffer(make([]byte, 0, 128))
if err = binary.Write(buff, binary.BigEndian, &info.Ino); err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, &info.MpID); err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, &info.CreateTime); err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, &info.Timeout); err != nil {
return nil, err
}
id := []byte(info.TxID)
idSize := uint32(len(id))
if err = binary.Write(buff, binary.BigEndian, &idSize); err != nil {
return nil, err
}
if _, err = buff.Write(id); err != nil {
return nil, err
}
addr := []byte(info.MpMembers)
addrSize := uint32(len(addr))
if err = binary.Write(buff, binary.BigEndian, &addrSize); err != nil {
return nil, err
}
if _, err = buff.Write(addr); err != nil {
return nil, err
}
result = buff.Bytes()
return
}
func (info *TxInodeInfo) Unmarshal(raw []byte) (err error) {
buff := bytes.NewBuffer(raw)
if err = binary.Read(buff, binary.BigEndian, &info.Ino); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &info.MpID); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &info.CreateTime); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &info.Timeout); err != nil {
return
}
idSize := uint32(0)
if err = binary.Read(buff, binary.BigEndian, &idSize); err != nil {
return
}
if idSize > 0 {
id := make([]byte, idSize)
if _, err = io.ReadFull(buff, id); err != nil {
return
}
info.TxID = string(id)
}
addrSize := uint32(0)
if err = binary.Read(buff, binary.BigEndian, &addrSize); err != nil {
return
}
if addrSize > 0 {
addr := make([]byte, addrSize)
if _, err = io.ReadFull(buff, addr); err != nil {
return
}
info.MpMembers = string(addr)
}
return
}
func (info *TxInodeInfo) GetKey() uint64 {
return info.Ino
}
func (info *TxInodeInfo) SetTxId(txID string) {
info.TxID = txID
}
func (info *TxInodeInfo) SetTimeout(timeout int64) {
info.Timeout = timeout
}
func (info *TxInodeInfo) SetCreateTime(createTime int64) {
info.CreateTime = createTime
}
type TxDentryInfo struct {
ParentId uint64 // FileID value of the parent inode.
Name string // Name of the current dentry.
MpMembers string
TxID string
MpID uint64
CreateTime int64 // time.Now().Unix()
Timeout int64
}
func NewTxDentryInfo(members string, parentId uint64, name string, mpID uint64) *TxDentryInfo {
return &TxDentryInfo{
ParentId: parentId,
Name: name,
MpMembers: members,
MpID: mpID,
}
}
func (info *TxDentryInfo) String() string {
data, err := json.Marshal(info)
if err != nil {
return ""
}
return string(data)
}
func (info *TxDentryInfo) Marshal() (result []byte, err error) {
buff := bytes.NewBuffer(make([]byte, 0, 128))
if err = binary.Write(buff, binary.BigEndian, &info.ParentId); err != nil {
panic(err)
}
name := []byte(info.Name)
nameSize := uint32(len(name))
if err = binary.Write(buff, binary.BigEndian, &nameSize); err != nil {
panic(err)
}
if _, err = buff.Write(name); err != nil {
panic(err)
}
addr := []byte(info.MpMembers)
addrSize := uint32(len(addr))
if err = binary.Write(buff, binary.BigEndian, &addrSize); err != nil {
panic(err)
}
if _, err = buff.Write(addr); err != nil {
panic(err)
}
id := []byte(info.TxID)
idSize := uint32(len(id))
if err = binary.Write(buff, binary.BigEndian, &idSize); err != nil {
panic(err)
}
if _, err = buff.Write(id); err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, &info.MpID); err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, &info.CreateTime); err != nil {
panic(err)
}
if err = binary.Write(buff, binary.BigEndian, &info.Timeout); err != nil {
panic(err)
}
result = buff.Bytes()
return
}
func (info *TxDentryInfo) Unmarshal(raw []byte) (err error) {
buff := bytes.NewBuffer(raw)
if err = binary.Read(buff, binary.BigEndian, &info.ParentId); err != nil {
return
}
nameSize := uint32(0)
if err = binary.Read(buff, binary.BigEndian, &nameSize); err != nil {
return
}
if nameSize > 0 {
name := make([]byte, nameSize)
if _, err = io.ReadFull(buff, name); err != nil {
return
}
info.Name = string(name)
}
addrSize := uint32(0)
if err = binary.Read(buff, binary.BigEndian, &addrSize); err != nil {
return
}
if addrSize > 0 {
addr := make([]byte, addrSize)
if _, err = io.ReadFull(buff, addr); err != nil {
return
}
info.MpMembers = string(addr)
}
idSize := uint32(0)
if err = binary.Read(buff, binary.BigEndian, &idSize); err != nil {
return
}
if idSize > 0 {
id := make([]byte, idSize)
if _, err = io.ReadFull(buff, id); err != nil {
return
}
info.TxID = string(id)
}
if err = binary.Read(buff, binary.BigEndian, &info.MpID); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &info.CreateTime); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &info.Timeout); err != nil {
return
}
return
}
func (info *TxDentryInfo) GetKey() string {
return strconv.FormatUint(info.ParentId, 10) + "_" + info.Name
}
func (info *TxDentryInfo) GetTxId() (string, error) {
if info.TxID == "" {
return "", errors.New("txID is not set")
}
return info.TxID, nil
}
func (info *TxDentryInfo) SetTxId(txID string) {
info.TxID = txID
}
func (info *TxDentryInfo) SetTimeout(timeout int64) {
info.Timeout = timeout
}
func (info *TxDentryInfo) SetCreateTime(createTime int64) {
info.CreateTime = createTime
}
const (
TxTypeUndefined uint32 = iota
TxTypeCreate
TxTypeMkdir
TxTypeRemove
TxTypeRename
TxTypeMknod
TxTypeSymlink
TxTypeLink
)
func TxMaskToType(mask TxOpMask) (txType uint32) {
switch mask {
case TxOpMaskOff:
txType = TxTypeUndefined
case TxOpMaskCreate:
txType = TxTypeCreate
case TxOpMaskMkdir:
txType = TxTypeMkdir
case TxOpMaskRemove:
txType = TxTypeRemove
case TxOpMaskRename:
txType = TxTypeRename
case TxOpMaskMknod:
txType = TxTypeMknod
case TxOpMaskSymlink:
txType = TxTypeSymlink
case TxOpMaskLink:
txType = TxTypeLink
default:
txType = TxTypeUndefined
}
return txType
}
const (
TxStateInit int32 = iota
TxStatePreCommit
TxStateCommit
TxStateRollback
TxStateCommitDone
TxStateRollbackDone
TxStateFailed
)
type TransactionInfo struct {
TxID string // "metapartitionId_atomicId", if empty, mp should be TM, otherwise it will be RM
TxType uint32
TmID int64
CreateTime int64 // time.Now()
Timeout int64 // minutes
State int32
DoneTime int64 // time.Now().Unix()
RMFinish bool // used to check whether the tx succeeded on the target RM.
// once inserted into the txTree, the inode & dentry info must not change
TxInodeInfos map[uint64]*TxInodeInfo
TxDentryInfos map[string]*TxDentryInfo
LastCheckTime int64
}
type TxMpInfo struct {
MpId uint64
Members string
TxInodeInfos map[uint64]*TxInodeInfo
TxDentryInfos map[string]*TxDentryInfo
}
const InitInode = 0
func (tx *TransactionInfo) SetCreateInodeId(ino uint64) {
inoIfo := tx.TxInodeInfos[InitInode]
inoIfo.Ino = ino
delete(tx.TxInodeInfos, InitInode)
tx.TxInodeInfos[ino] = inoIfo
}
func (tx *TransactionInfo) GroupByMp() map[uint64]*TxMpInfo {
txMap := make(map[uint64]*TxMpInfo)
for k, ifo := range tx.TxInodeInfos {
mpIfo, ok := txMap[ifo.MpID]
if !ok {
mpIfo = &TxMpInfo{
MpId: ifo.MpID,
Members: ifo.MpMembers,
TxInodeInfos: make(map[uint64]*TxInodeInfo),
TxDentryInfos: make(map[string]*TxDentryInfo),
}
txMap[ifo.MpID] = mpIfo
}
mpIfo.TxInodeInfos[k] = ifo
}
for k, ifo := range tx.TxDentryInfos {
mpIfo, ok := txMap[ifo.MpID]
if !ok {
mpIfo = &TxMpInfo{
MpId: ifo.MpID,
Members: ifo.MpMembers,
TxInodeInfos: make(map[uint64]*TxInodeInfo),
TxDentryInfos: make(map[string]*TxDentryInfo),
}
txMap[ifo.MpID] = mpIfo
}
mpIfo.TxDentryInfos[k] = ifo
}
return txMap
}
func (tx *TransactionInfo) IsDone() bool {
return tx.State == TxStateCommitDone || tx.State == TxStateRollbackDone
}
func (tx *TransactionInfo) CanDelete() bool {
if !tx.Finish() {
return false
}
if tx.DoneTime+DefaultTxDeleteTime < time.Now().Unix() {
return true
}
return false
}
func (tx *TransactionInfo) NeedClearOrphan() bool {
if tx.Finish() {
return false
}
now := time.Now().Unix()
if tx.CreateTime+ClearOrphanTxTime > now {
return false
}
// check at most once per minute to avoid sending too many requests
if now-tx.LastCheckTime < 60 {
return false
}
tx.LastCheckTime = now
return true
}
func (tx *TransactionInfo) Finish() bool {
return tx.RMFinish
}
func (tx *TransactionInfo) SetFinish() {
tx.RMFinish = true
tx.DoneTime = time.Now().Unix()
}
func (txInfo *TransactionInfo) GetInfo() string {
return txInfo.String()
}
func (txInfo *TransactionInfo) IsExpired() (expired bool) {
now := time.Now().Unix()
expired = txInfo.Timeout*60+txInfo.CreateTime < now
if expired {
log.LogWarnf("IsExpired: transaction [%v] is expired, now[%v], CreateTime[%v]", txInfo, now, txInfo.CreateTime)
}
return expired
}
// Less tests whether the current TransactionInfo item is less than the given one.
// This method is necessary for the B-Tree item implementation.
func (txInfo *TransactionInfo) Less(than btree.Item) bool {
ti, ok := than.(*TransactionInfo)
return ok && txInfo.TxID < ti.TxID
}
// Copy returns a copy of the transaction info.
func (txInfo *TransactionInfo) Copy() btree.Item {
return txInfo.GetCopy()
}
func NewTxInfoBItem(txId string) *TransactionInfo {
return &TransactionInfo{
TxID: txId,
}
}
const initTmId = -1
func NewTransactionInfo(timeout int64, txType uint32) *TransactionInfo {
return &TransactionInfo{
Timeout: timeout,
TxInodeInfos: make(map[uint64]*TxInodeInfo),
TxDentryInfos: make(map[string]*TxDentryInfo),
TmID: initTmId,
TxType: txType,
State: TxStateInit,
}
}
func (txInfo *TransactionInfo) IsInitialized() bool {
return txInfo.TxID != ""
}
func (txInfo *TransactionInfo) String() string {
data, err := json.Marshal(txInfo)
if err != nil {
return ""
}
return string(data)
}
func (txInfo *TransactionInfo) GetCopy() *TransactionInfo {
newInfo := *txInfo
return &newInfo
}
func (txInfo *TransactionInfo) Marshal() (result []byte, err error) {
buff := bytes.NewBuffer(make([]byte, 0, 256))
id := []byte(txInfo.TxID)
idSize := uint32(len(id))
if err = binary.Write(buff, binary.BigEndian, &idSize); err != nil {
return nil, err
}
if _, err = buff.Write(id); err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, &txInfo.TxType); err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, &txInfo.TmID); err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, &txInfo.CreateTime); err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, &txInfo.Timeout); err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, &txInfo.State); err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, &txInfo.DoneTime); err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, &txInfo.RMFinish); err != nil {
return nil, err
}
inodeNum := uint32(len(txInfo.TxInodeInfos))
if err = binary.Write(buff, binary.BigEndian, &inodeNum); err != nil {
return nil, err
}
for _, txInodeInfo := range txInfo.TxInodeInfos {
bs, err := txInodeInfo.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
}
dentryNum := uint32(len(txInfo.TxDentryInfos))
if err = binary.Write(buff, binary.BigEndian, &dentryNum); err != nil {
return nil, err
}
for _, txDentryInfo := range txInfo.TxDentryInfos {
bs, err := txDentryInfo.Marshal()
if err != nil {
return nil, err
}
if err = binary.Write(buff, binary.BigEndian, uint32(len(bs))); err != nil {
return nil, err
}
if _, err := buff.Write(bs); err != nil {
return nil, err
}
}
return buff.Bytes(), nil
}
func (txInfo *TransactionInfo) Unmarshal(raw []byte) (err error) {
buff := bytes.NewBuffer(raw)
idSize := uint32(0)
if err = binary.Read(buff, binary.BigEndian, &idSize); err != nil {
return
}
if idSize > 0 {
id := make([]byte, idSize)
if _, err = io.ReadFull(buff, id); err != nil {
return
}
txInfo.TxID = string(id)
}
if err = binary.Read(buff, binary.BigEndian, &txInfo.TxType); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &txInfo.TmID); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &txInfo.CreateTime); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &txInfo.Timeout); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &txInfo.State); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &txInfo.DoneTime); err != nil {
return
}
if err = binary.Read(buff, binary.BigEndian, &txInfo.RMFinish); err != nil {
return
}
var inodeNum uint32
if err = binary.Read(buff, binary.BigEndian, &inodeNum); err != nil {
return
}
var dataLen uint32
txInfo.TxInodeInfos = map[uint64]*TxInodeInfo{}
for i := uint32(0); i < inodeNum; i++ {
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
data := make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
txInodeInfo := NewTxInodeInfo("", 0, 0)
if err = txInodeInfo.Unmarshal(data); err != nil {
return
}
txInfo.TxInodeInfos[txInodeInfo.GetKey()] = txInodeInfo
}
var dentryNum uint32
txInfo.TxDentryInfos = map[string]*TxDentryInfo{}
if err = binary.Read(buff, binary.BigEndian, &dentryNum); err != nil {
return
}
for i := uint32(0); i < dentryNum; i++ {
if err = binary.Read(buff, binary.BigEndian, &dataLen); err != nil {
return
}
data := make([]byte, int(dataLen))
if _, err = buff.Read(data); err != nil {
return
}
txDentryInfo := NewTxDentryInfo("", 0, "", 0)
if err = txDentryInfo.Unmarshal(data); err != nil {
return
}
txInfo.TxDentryInfos[txDentryInfo.GetKey()] = txDentryInfo
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package proto
import (
"fmt"
"regexp"
"sync"
)
var (
AKRegexp = regexp.MustCompile("^[a-zA-Z0-9]{16}$")
SKRegexp = regexp.MustCompile("^[a-zA-Z0-9]{32}$")
WriteS3Api = []string{
"PostObject", "PutObject", "CopyObject", "CreateMultipartUpload", "UploadPart", "UploadPartCopy",
"CompleteMultipartUpload", "AbortMultipartUpload", "DeleteObjects", "DeleteObject",
}
)
type UserType uint8
const (
UserTypeInvalid UserType = 0x0
UserTypeRoot UserType = 0x1
UserTypeAdmin UserType = 0x2
UserTypeNormal UserType = 0x3
)
func (u UserType) Valid() bool {
switch u {
case UserTypeRoot,
UserTypeAdmin,
UserTypeNormal:
return true
default:
}
return false
}
func (u UserType) String() string {
switch u {
case UserTypeRoot:
return "root"
case UserTypeAdmin:
return "admin"
case UserTypeNormal:
return "normal"
default:
}
return "invalid"
}
func UserTypeFromString(name string) UserType {
switch name {
case "root":
return UserTypeRoot
case "admin":
return UserTypeAdmin
case "normal":
return UserTypeNormal
default:
}
return UserTypeInvalid
}
func IsValidAK(ak string) bool {
if AKRegexp.MatchString(ak) {
return true
} else {
return false
}
}
func IsValidSK(sk string) bool {
if SKRegexp.MatchString(sk) {
return true
} else {
return false
}
}
type AKUser struct {
AccessKey string `json:"access_key" graphql:"access_key"`
UserID string `json:"user_id" graphql:"user_id"`
Password string `json:"password" graphql:"password"`
}
type UserInfo struct {
UserID string `json:"user_id" graphql:"user_id"`
AccessKey string `json:"access_key" graphql:"access_key"`
SecretKey string `json:"secret_key" graphql:"secret_key"`
Policy *UserPolicy `json:"policy" graphql:"policy"`
UserType UserType `json:"user_type" graphql:"user_type"`
CreateTime string `json:"create_time" graphql:"create_time"`
Description string `json:"description" graphql:"description"`
Mu sync.RWMutex `json:"-" graphql:"-"`
	EMPTY bool // TODO: is this field still required by graphql?
}
func (i *UserInfo) String() string {
if i == nil {
return "nil"
}
return fmt.Sprintf("%v_%v_%v_%v",
i.UserID, i.AccessKey, i.SecretKey, i.UserType)
}
func NewUserInfo() *UserInfo {
return &UserInfo{Policy: NewUserPolicy()}
}
type VolUser struct {
Vol string `json:"vol"`
UserIDs []string `json:"user_id"`
Mu sync.RWMutex `json:"-" graphql:"-"`
}
type UserPolicy struct {
OwnVols []string `json:"own_vols" graphql:"own_vols"`
AuthorizedVols map[string][]string `json:"authorized_vols" graphql:"-"` // mapping: volume -> actions
mu sync.RWMutex
}
func NewUserPolicy() *UserPolicy {
return &UserPolicy{
OwnVols: make([]string, 0),
AuthorizedVols: make(map[string][]string),
}
}
func (policy *UserPolicy) IsOwn(volume string) bool {
policy.mu.RLock()
defer policy.mu.RUnlock()
for _, vol := range policy.OwnVols {
if vol == volume {
return true
}
}
return false
}
func (policy *UserPolicy) IsAuthorized(volume, subdir string, action Action) bool {
policy.mu.RLock()
defer policy.mu.RUnlock()
if len(policy.OwnVols) > 0 {
for _, v := range policy.OwnVols {
if v == volume {
return true
}
}
}
values, exist := policy.AuthorizedVols[volume]
if !exist {
return false
}
for _, value := range values {
if perm := ParsePermission(value); !perm.IsNone() && perm.IsBuiltin() && perm.MatchSubdir(subdir) && BuiltinPermissionActions(perm).Contains(action) {
return true
}
if act := ParseAction(value); act == action {
return true
}
}
return false
}
func (policy *UserPolicy) IsAuthorizedS3(volume, api string) bool {
policy.mu.RLock()
defer policy.mu.RUnlock()
perms := policy.AuthorizedVols[volume]
for _, perm := range perms {
if builtinWritablePermRegexp.MatchString(perm) {
return true
}
if builtinReadOnlyPermRegexp.MatchString(perm) && !contain(api, WriteS3Api) {
return true
}
}
return false
}
func contain(str string, strs []string) bool {
for _, v := range strs {
if v == str {
return true
}
}
return false
}
func (policy *UserPolicy) AddOwnVol(volume string) {
policy.mu.Lock()
defer policy.mu.Unlock()
for _, ownVol := range policy.OwnVols {
if ownVol == volume {
return
}
}
policy.OwnVols = append(policy.OwnVols, volume)
}
func (policy *UserPolicy) RemoveOwnVol(volume string) {
policy.mu.Lock()
defer policy.mu.Unlock()
for i, ownVol := range policy.OwnVols {
if ownVol == volume {
if i == len(policy.OwnVols)-1 {
policy.OwnVols = policy.OwnVols[:i]
return
}
policy.OwnVols = append(policy.OwnVols[:i], policy.OwnVols[i+1:]...)
return
}
}
}
func (policy *UserPolicy) AddAuthorizedVol(volume string, policies []string) { // todo check duplicate
policy.mu.Lock()
defer policy.mu.Unlock()
newPolicies := make([]string, 0)
	for _, p := range policies { // use a distinct name to avoid shadowing the receiver
		if perm := ParsePermission(p); !perm.IsNone() {
			newPolicies = append(newPolicies, perm.String())
		}
		if act := ParseAction(p); !act.IsNone() {
			newPolicies = append(newPolicies, act.String())
		}
}
policy.AuthorizedVols[volume] = newPolicies
}
func (policy *UserPolicy) RemoveAuthorizedVol(volume string) {
policy.mu.Lock()
defer policy.mu.Unlock()
delete(policy.AuthorizedVols, volume)
}
func (policy *UserPolicy) SetPerm(volume string, perm Permission) {
policy.mu.Lock()
defer policy.mu.Unlock()
policy.AuthorizedVols[volume] = []string{perm.String()}
}
func (policy *UserPolicy) SetActions(volume string, actions Actions) {
policy.mu.Lock()
defer policy.mu.Unlock()
values := make([]string, actions.Len())
for i, action := range actions {
values[i] = action.String()
}
policy.AuthorizedVols[volume] = values
}
func (policy *UserPolicy) Add(addPolicy *UserPolicy) {
policy.mu.Lock()
defer policy.mu.Unlock()
policy.OwnVols = append(policy.OwnVols, addPolicy.OwnVols...)
for k, v := range addPolicy.AuthorizedVols {
if apis, ok := policy.AuthorizedVols[k]; ok {
policy.AuthorizedVols[k] = append(apis, addPolicy.AuthorizedVols[k]...)
} else {
policy.AuthorizedVols[k] = v
}
}
}
func (policy *UserPolicy) Delete(deletePolicy *UserPolicy) {
policy.mu.Lock()
defer policy.mu.Unlock()
policy.OwnVols = removeSlice(policy.OwnVols, deletePolicy.OwnVols)
for k, v := range deletePolicy.AuthorizedVols {
if apis, ok := policy.AuthorizedVols[k]; ok {
policy.AuthorizedVols[k] = removeSlice(apis, v)
}
}
}
func removeSlice(s []string, removeSlice []string) []string {
if len(s) == 0 {
return s
}
for _, elem := range removeSlice {
for i, v := range s {
if v == elem {
s = append(s[:i], s[i+1:]...)
break
}
}
}
return s
}
func CleanPolicy(policy *UserPolicy) (newUserPolicy *UserPolicy) {
m := make(map[string]bool)
newUserPolicy = NewUserPolicy()
policy.mu.Lock()
defer policy.mu.Unlock()
for _, vol := range policy.OwnVols {
if _, exist := m[vol]; !exist {
m[vol] = true
newUserPolicy.OwnVols = append(newUserPolicy.OwnVols, vol)
}
}
for vol, apis := range policy.AuthorizedVols {
checkMap := make(map[string]bool)
newAPI := make([]string, 0)
for _, api := range apis {
if _, exist := checkMap[api]; !exist {
checkMap[api] = true
newAPI = append(newAPI, api)
}
}
newUserPolicy.AuthorizedVols[vol] = newAPI
}
return
}
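// Hypothetical usage sketch (not part of the original source): composing a
// UserPolicy, deduplicating it with CleanPolicy and checking volume ownership.
// Volume names are illustrative.
func exampleUserPolicyUsage() {
	policy := NewUserPolicy()
	policy.AddOwnVol("vol-a")
	policy.AddOwnVol("vol-a") // duplicate adds are ignored by AddOwnVol
	policy.AddOwnVol("vol-b")
	policy.RemoveOwnVol("vol-b")
	cleaned := CleanPolicy(policy)
	fmt.Println(cleaned.IsOwn("vol-a")) // true
	fmt.Println(cleaned.IsOwn("vol-b")) // false
}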
type UserCreateParam struct {
ID string `json:"id"`
Password string `json:"pwd"`
AccessKey string `json:"ak"`
SecretKey string `json:"sk"`
Type UserType `json:"type"`
Description string `json:"description"`
}
type UserPermUpdateParam struct {
UserID string `json:"user_id"`
Volume string `json:"volume"`
Subdir string `json:"subdir"`
Policy []string `json:"policy"`
}
func NewUserPermUpdateParam(userID, volume string) *UserPermUpdateParam {
	return &UserPermUpdateParam{UserID: userID, Volume: volume, Policy: make([]string, 0)}
}
func (param *UserPermUpdateParam) SetPolicy(policy string) {
param.Policy = append(param.Policy, policy)
}
type UserPermRemoveParam struct {
UserID string `json:"user_id"`
Volume string `json:"volume"`
}
func NewUserPermRemoveParam(userID, volume string) *UserPermRemoveParam {
	return &UserPermRemoveParam{UserID: userID, Volume: volume}
}
type UserTransferVolParam struct {
Volume string `json:"volume"`
UserSrc string `json:"user_src"`
UserDst string `json:"user_dst"`
Force bool `json:"force"`
}
type UserUpdateParam struct {
UserID string `json:"user_id"`
AccessKey string `json:"access_key"`
SecretKey string `json:"secret_key"`
Type UserType `json:"type"`
Password string `json:"password"`
Description string `json:"description"`
}
package proto
import (
"fmt"
"runtime"
)
//TODO: remove this later.
//go:generate golangci-lint run --issues-exit-code=1 -D errcheck -E bodyclose .
var (
Version string
CommitID string
BranchName string
BuildTime string
)
func DumpVersion(role string) string {
return fmt.Sprintf("CubeFS %s\n"+
"Version : %s\n"+
"Branch : %s\n"+
"Commit : %s\n"+
"Build : %s %s %s %s\n",
role,
Version,
BranchName,
CommitID,
runtime.Version(), runtime.GOOS, runtime.GOARCH, BuildTime)
}
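// Hypothetical build sketch (not part of the original source): Version, CommitID,
// BranchName and BuildTime are typically injected at build time with -ldflags, e.g.
//
//	go build -ldflags "-X github.com/cubefs/cubefs/proto.Version=v1.0.0 \
//	    -X github.com/cubefs/cubefs/proto.CommitID=$(git rev-parse HEAD)"
//
// The exact package path and flags used by the project's build scripts may differ.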
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package raftstore
import (
"fmt"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
)
// Constants for network port definition.
const (
DefaultHeartbeatPort = 5901
DefaultReplicaPort = 5902
DefaultNumOfLogsToRetain = 20000
DefaultTickInterval = 300
DefaultElectionTick = 3
)
// Config defines the configuration properties for the raft store.
type Config struct {
NodeID uint64 // Identity of raft server instance.
RaftPath string // Path of raft logs
IPAddr string // IP address
HeartbeatPort int
ReplicaPort int
NumOfLogsToRetain uint64 // number of logs to be kept after truncation. The default value is 20000.
	// TickInterval is the interval of the timer that checks heartbeat and election timeouts.
	// The default value is 300, in milliseconds.
TickInterval int
// RecvBufSize is the size of raft receive buffer channel.
// The default value is 2048.
RecvBufSize int
	// ElectionTick is the election timeout in ticks. If a follower does not receive any message
	// from the leader of the current term within ElectionTick ticks, it becomes a candidate and starts an election.
	// ElectionTick must be greater than HeartbeatTick.
	// We suggest using ElectionTick = 10 * HeartbeatTick to avoid unnecessary leader switching.
	// The default value is 3 ticks (about 1 second with the default tick interval).
ElectionTick int
}
// PeerAddress defines the set of addresses that will be used by the peers.
type PeerAddress struct {
proto.Peer
Address string
HeartbeatPort int
ReplicaPort int
}
// PartitionConfig defines the configuration properties for the partitions.
type PartitionConfig struct {
ID uint64
Applied uint64
Leader uint64
Term uint64
Peers []PeerAddress
SM PartitionFsm
WalPath string
}
func (p PeerAddress) String() string {
return fmt.Sprintf(`"nodeID":"%v","peerID":"%v","priority":"%v","type":"%v","heartbeatPort":"%v","ReplicaPort":"%v"`,
p.ID, p.PeerID, p.Priority, p.Type.String(), p.HeartbeatPort, p.ReplicaPort)
}
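// Hypothetical configuration sketch (not part of the original source): building a
// raft store Config from the package defaults. All field values are illustrative.
func exampleRaftStoreConfig() *Config {
	return &Config{
		NodeID:            1,
		RaftPath:          "/var/lib/cubefs/raft",
		IPAddr:            "192.168.0.11",
		HeartbeatPort:     DefaultHeartbeatPort,
		ReplicaPort:       DefaultReplicaPort,
		NumOfLogsToRetain: DefaultNumOfLogsToRetain,
		TickInterval:      DefaultTickInterval,
		ElectionTick:      DefaultElectionTick,
	}
}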
package raftstore
import (
"fmt"
"sync"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
const (
defaultReportDuration = time.Minute * 3
defaultZombieThreshold = time.Minute * 3
defaultNoLeaderThreshold = time.Second * 30
)
const (
cfgZombieThresholdSec = "raftMonZombieThrSec"
cfgZombieTooLongThresholdSec = "raftMonZombieTooLongThrSec"
cfgNoLeaderThresholdSec = "raftMonNoLeaderThrSec"
cfgNoLeaderTooLongThresholdSec = "raftMonNoLeaderTooLongThrSec"
)
type monitorConf struct {
ZombieThreshold time.Duration
ZombieTooLongThreshold time.Duration
NoLeaderThreshold time.Duration
NoLeaderTooLongThreshold time.Duration
}
var gMonConf = monitorConf{
ZombieThreshold: defaultZombieThreshold,
ZombieTooLongThreshold: defaultReportDuration,
NoLeaderThreshold: defaultNoLeaderThreshold,
NoLeaderTooLongThreshold: defaultReportDuration,
}
func setMonitorConf(cfg *config.Config) {
if cfg == nil {
return
}
cfgZomThr := cfg.GetInt64(cfgZombieThresholdSec)
if cfgZomThr > 0 {
gMonConf.ZombieThreshold = time.Second * time.Duration(cfgZomThr)
}
cfgZomTooLongThr := cfg.GetInt64(cfgZombieTooLongThresholdSec)
if cfgZomTooLongThr > 0 {
gMonConf.ZombieTooLongThreshold = time.Second * time.Duration(cfgZomTooLongThr)
}
cfgNoLeaderThr := cfg.GetInt64(cfgNoLeaderThresholdSec)
if cfgNoLeaderThr > 0 {
gMonConf.NoLeaderThreshold = time.Second * time.Duration(cfgNoLeaderThr)
}
cfgNoLeaderTooLongThr := cfg.GetInt64(cfgNoLeaderTooLongThresholdSec)
if cfgNoLeaderTooLongThr > 0 {
gMonConf.NoLeaderTooLongThreshold = time.Second * time.Duration(cfgNoLeaderTooLongThr)
}
log.LogInfof("set raft monitor cfg: zombieThreshold:[%v], zombieTooLongThreshold:[%v],"+
" noLeaderThreshold:[%v], noLeaderTooLongThreshold:[%v]",
gMonConf.ZombieThreshold, gMonConf.ZombieTooLongThreshold,
gMonConf.NoLeaderThreshold, gMonConf.NoLeaderTooLongThreshold)
}
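// Hypothetical configuration sketch (not part of the original source): the monitor
// thresholds above map to these JSON keys, each expressed in seconds. The values
// shown mirror the package defaults.
//
//	{
//	    "raftMonZombieThrSec": 180,
//	    "raftMonZombieTooLongThrSec": 180,
//	    "raftMonNoLeaderThrSec": 30,
//	    "raftMonNoLeaderTooLongThrSec": 180
//	}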
type zombiePeer struct {
partitionID uint64
peer proto.Peer
}
type monitor struct {
zombieDurations map[zombiePeer]time.Duration
zombieDurationMutex sync.RWMutex
noLeaderDurations map[uint64]time.Duration
noLeaderDurationsMutex sync.RWMutex
}
func newMonitor() *monitor {
	m := &monitor{}
m.zombieDurations = make(map[zombiePeer]time.Duration)
m.noLeaderDurations = make(map[uint64]time.Duration)
return m
}
func (d *monitor) MonitorZombie(id uint64, peer proto.Peer, replicasMsg string, du time.Duration) {
if du < gMonConf.ZombieThreshold {
return
}
needReport := true
var errMsg string
zombiePeer := zombiePeer{
partitionID: id,
peer: peer,
}
d.zombieDurationMutex.RLock()
oldDu := d.zombieDurations[zombiePeer]
d.zombieDurationMutex.RUnlock()
if oldDu == 0 || du < oldDu {
// peer became zombie recently
errMsg = fmt.Sprintf("[MonitorZombie] raft peer zombie, "+
"partitionID[%d] replicaID[%v] replicasMsg[%s] zombiePeer[%v] zombieDuration[%v]",
id, peer.PeerID, replicasMsg, peer, du)
} else if du-oldDu > gMonConf.ZombieTooLongThreshold {
// peer keeping zombie for too long
errMsg = fmt.Sprintf("[MonitorZombieTooLong] raft peer zombie too long, "+
"partitionID[%d] replicaID[%v] replicasMsg[%s] zombiePeer[%v] zombieDuration[%v]",
id, peer.PeerID, replicasMsg, peer, du)
} else {
// peer keeping zombie, but it's not time for another too-long-report yet
needReport = false
}
if !needReport {
return
}
d.zombieDurationMutex.Lock()
d.zombieDurations[zombiePeer] = du
d.zombieDurationMutex.Unlock()
log.LogError(errMsg)
exporter.Warning(errMsg)
}
func (d *monitor) MonitorElection(id uint64, replicaMsg string, du time.Duration) {
if du < gMonConf.NoLeaderThreshold {
return
}
needReport := true
var errMsg string
d.noLeaderDurationsMutex.RLock()
oldDu := d.noLeaderDurations[id]
d.noLeaderDurationsMutex.RUnlock()
if oldDu == 0 || du < oldDu {
// became no leader recently
errMsg = fmt.Sprintf("[RaftNoLeader] raft no leader partitionID[%d]_replicas[%v]_Duration[%v]",
id, replicaMsg, du)
} else if du-oldDu > gMonConf.NoLeaderTooLongThreshold {
// keeping no leader for too long
errMsg = fmt.Sprintf("[RaftNoLeaderTooLong] raft no leader too long, "+
"partitionID[%d]_replicas[%v]_Duration[%v]",
id, replicaMsg, du)
} else {
// keeping not health, but it's not time for another too-long-report yet
needReport = false
}
if !needReport {
return
}
d.noLeaderDurationsMutex.Lock()
d.noLeaderDurations[id] = du
d.noLeaderDurationsMutex.Unlock()
log.LogError(errMsg)
exporter.Warning(errMsg)
}
func (d *monitor) RemovePeer(id uint64, p proto.Peer) {
zp := zombiePeer{
partitionID: id,
peer: p,
}
d.zombieDurationMutex.Lock()
_, present := d.zombieDurations[zp]
if present {
delete(d.zombieDurations, zp)
log.LogInfof("remove peer from raft monitor, partitionID: %v, peer: %v", id, p)
}
d.zombieDurationMutex.Unlock()
}
func (d *monitor) RemovePartition(id uint64, peers []proto.Peer) {
d.noLeaderDurationsMutex.Lock()
_, present := d.noLeaderDurations[id]
if present {
delete(d.noLeaderDurations, id)
log.LogInfof("remove partition from raft monitor, partitionID: %v, peers: %v", id, peers)
}
d.noLeaderDurationsMutex.Unlock()
for _, p := range peers {
d.RemovePeer(id, p)
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package raftstore
import (
"os"
"github.com/cubefs/cubefs/depends/tiglabs/raft"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
)
// PartitionStatus is a type alias of raft.Status
type PartitionStatus = raft.Status
// PartitionFsm wraps necessary methods include both FSM implementation
// and data storage operation for raft store partition.
// It extends from raft StateMachine and Store.
type PartitionFsm = raft.StateMachine
// Partition wraps necessary methods for raft store partition operation.
// Partition is a shard for multi-raft in RaftStore. RaftStore is based on multi-raft, which
// manages multiple raft replication groups at the same time through a single
// raft server instance and its system resources.
type Partition interface {
// Submit submits command data to raft log.
Submit(cmd []byte) (resp interface{}, err error)
// ChangeMember submits member change event and information to raft log.
ChangeMember(changeType proto.ConfChangeType, peer proto.Peer, context []byte) (resp interface{}, err error)
// Stop removes the raft partition from raft server and shuts down this partition.
Stop() error
// Delete stops and deletes the partition.
Delete() error
// Status returns the current raft status.
Status() (status *PartitionStatus)
	// IsRestoring reports whether the partition is restoring a snapshot; much faster than Status().RestoringSnapshot.
IsRestoring() bool
// LeaderTerm returns the current term of leader in the raft group. TODO what is term?
LeaderTerm() (leaderID, term uint64)
// IsRaftLeader returns true if this node is the leader of the raft group it belongs to.
IsRaftLeader() bool
// AppliedIndex returns the current index of the applied raft log in the raft store partition.
AppliedIndex() uint64
	// CommittedIndex returns the current index of the committed raft log in the raft store partition.
CommittedIndex() uint64
// Truncate raft log
Truncate(index uint64)
TryToLeader(nodeID uint64) error
IsOfflinePeer() bool
}
// Default implementation of the Partition interface.
type partition struct {
id uint64
raft *raft.RaftServer
walPath string
config *PartitionConfig
}
// ChangeMember submits member change event and information to raft log.
func (p *partition) ChangeMember(changeType proto.ConfChangeType, peer proto.Peer, context []byte) (
resp interface{}, err error) {
if !p.IsRaftLeader() {
err = raft.ErrNotLeader
return
}
future := p.raft.ChangeMember(p.id, changeType, peer, context)
resp, err = future.Response()
return
}
// Stop removes the raft partition from raft server and shuts down this partition.
func (p *partition) Stop() (err error) {
err = p.raft.RemoveRaft(p.id)
return
}
func (p *partition) TryToLeader(nodeID uint64) (err error) {
future := p.raft.TryToLeader(nodeID)
_, err = future.Response()
return
}
// Delete stops and deletes the partition.
func (p *partition) Delete() (err error) {
if err = p.Stop(); err != nil {
return
}
err = os.RemoveAll(p.walPath)
return
}
func (p *partition) IsRestoring() bool {
return p.raft.IsRestoring(p.id)
}
// Status returns the current raft status.
func (p *partition) Status() (status *PartitionStatus) {
status = p.raft.Status(p.id)
return
}
// LeaderTerm returns the current term of leader in the raft group.
func (p *partition) LeaderTerm() (leaderID, term uint64) {
if p.raft == nil {
return
}
leaderID, term = p.raft.LeaderTerm(p.id)
return
}
func (p *partition) IsOfflinePeer() bool {
status := p.Status()
active := 0
sumPeers := 0
for _, peer := range status.Replicas {
if peer.Active {
active++
}
sumPeers++
}
	return active >= (sumPeers/2 + 1)
}
// IsRaftLeader returns true if this node is the leader of the raft group it belongs to.
func (p *partition) IsRaftLeader() (isLeader bool) {
isLeader = p.raft != nil && p.raft.IsLeader(p.id)
return
}
// AppliedIndex returns the current index of the applied raft log in the raft store partition.
func (p *partition) AppliedIndex() (applied uint64) {
applied = p.raft.AppliedIndex(p.id)
return
}
// CommittedIndex returns the current index of the committed raft log in the raft store partition.
func (p *partition) CommittedIndex() (committed uint64) {
	committed = p.raft.CommittedIndex(p.id)
return
}
// Submit submits command data to raft log.
func (p *partition) Submit(cmd []byte) (resp interface{}, err error) {
if !p.IsRaftLeader() {
err = raft.ErrNotLeader
return
}
future := p.raft.Submit(p.id, cmd)
resp, err = future.Response()
return
}
// Truncate truncates the raft log
func (p *partition) Truncate(index uint64) {
if p.raft != nil {
p.raft.Truncate(p.id, index)
}
}
func newPartition(cfg *PartitionConfig, raft *raft.RaftServer, walPath string) Partition {
return &partition{
id: cfg.ID,
raft: raft,
walPath: walPath,
config: cfg,
}
}
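// Hypothetical usage sketch (not part of the original source): submitting a command
// through a Partition. The payload is illustrative; callers that receive
// raft.ErrNotLeader are expected to redirect the request to the current leader.
func exampleSubmit(p Partition) error {
	resp, err := p.Submit([]byte("example-command"))
	if err == raft.ErrNotLeader {
		// not the leader: look up the current leader (e.g. via LeaderTerm) and retry there
		return err
	}
	if err != nil {
		return err
	}
	_ = resp // the response type is defined by the partition's state machine
	return nil
}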
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package raftstore
import (
"fmt"
syslog "log"
"os"
"path"
"strconv"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft"
"github.com/cubefs/cubefs/depends/tiglabs/raft/logger"
"github.com/cubefs/cubefs/depends/tiglabs/raft/proto"
"github.com/cubefs/cubefs/depends/tiglabs/raft/storage/wal"
raftlog "github.com/cubefs/cubefs/depends/tiglabs/raft/util/log"
utilConfig "github.com/cubefs/cubefs/util/config"
)
// RaftStore defines the interface for the raft store.
type RaftStore interface {
CreatePartition(cfg *PartitionConfig) (Partition, error)
Stop()
RaftConfig() *raft.Config
RaftStatus(raftID uint64) (raftStatus *raft.Status)
NodeManager
RaftServer() *raft.RaftServer
}
type raftStore struct {
nodeID uint64
resolver NodeResolver
raftConfig *raft.Config
raftServer *raft.RaftServer
raftPath string
}
// RaftConfig returns the raft configuration.
func (s *raftStore) RaftConfig() *raft.Config {
return s.raftConfig
}
func (s *raftStore) RaftStatus(raftID uint64) (raftStatus *raft.Status) {
return s.raftServer.Status(raftID)
}
// AddNodeWithPort add a new node with the given port.
func (s *raftStore) AddNodeWithPort(nodeID uint64, addr string, heartbeat int, replicate int) {
s.resolver.AddNodeWithPort(nodeID, addr, heartbeat, replicate)
}
// DeleteNode deletes the node with the given ID in the raft store.
func (s *raftStore) DeleteNode(nodeID uint64) {
s.resolver.DeleteNode(nodeID)
}
// Stop stops the raft store server.
func (s *raftStore) Stop() {
if s.raftServer != nil {
s.raftServer.Stop()
}
}
func newRaftLogger(dir string) {
raftLogPath := path.Join(dir, "logs")
_, err := os.Stat(raftLogPath)
if err != nil {
if pathErr, ok := err.(*os.PathError); ok {
if os.IsNotExist(pathErr) {
os.MkdirAll(raftLogPath, 0o755)
}
}
}
raftLog, err := raftlog.NewLog(raftLogPath, "raft", "debug")
if err != nil {
syslog.Println("Fatal: failed to start the baud storage daemon - ", err)
return
}
	logger.SetLogger(raftLog)
}
// NewRaftStore returns a new raft store instance.
func NewRaftStore(cfg *Config, extendCfg *utilConfig.Config) (mr RaftStore, err error) {
resolver := NewNodeResolver()
newRaftLogger(cfg.RaftPath)
setMonitorConf(extendCfg)
rc := raft.DefaultConfig()
rc.NodeID = cfg.NodeID
rc.LeaseCheck = true
rc.PreVote = true
if cfg.HeartbeatPort <= 0 {
cfg.HeartbeatPort = DefaultHeartbeatPort
}
if cfg.ReplicaPort <= 0 {
cfg.ReplicaPort = DefaultReplicaPort
}
if cfg.NumOfLogsToRetain == 0 {
cfg.NumOfLogsToRetain = DefaultNumOfLogsToRetain
}
if cfg.ElectionTick < DefaultElectionTick {
cfg.ElectionTick = DefaultElectionTick
}
if cfg.TickInterval < DefaultTickInterval {
cfg.TickInterval = DefaultTickInterval
}
	// If cfg's RecvBufSize is bigger than the default 2048,
	// use the bigger value.
if cfg.RecvBufSize > rc.ReqBufferSize {
rc.ReqBufferSize = cfg.RecvBufSize
}
rc.HeartbeatAddr = fmt.Sprintf("%s:%d", cfg.IPAddr, cfg.HeartbeatPort)
rc.ReplicateAddr = fmt.Sprintf("%s:%d", cfg.IPAddr, cfg.ReplicaPort)
rc.Resolver = resolver
rc.RetainLogs = cfg.NumOfLogsToRetain
rc.TickInterval = time.Duration(cfg.TickInterval) * time.Millisecond
rc.ElectionTick = cfg.ElectionTick
rs, err := raft.NewRaftServer(rc)
if err != nil {
return
}
mr = &raftStore{
nodeID: cfg.NodeID,
resolver: resolver,
raftConfig: rc,
raftServer: rs,
raftPath: cfg.RaftPath,
}
return
}
func (s *raftStore) RaftServer() *raft.RaftServer {
return s.raftServer
}
// CreatePartition creates a new partition in the raft store.
func (s *raftStore) CreatePartition(cfg *PartitionConfig) (p Partition, err error) {
	// Init WAL storage for this partition.
	// Variables:
	//   wc: WAL configuration.
	//   walPath: WAL path.
	//   ws: WAL storage.
var walPath string
if cfg.WalPath == "" {
walPath = path.Join(s.raftPath, strconv.FormatUint(cfg.ID, 10))
} else {
walPath = path.Join(cfg.WalPath, "wal_"+strconv.FormatUint(cfg.ID, 10))
}
wc := &wal.Config{}
ws, err := wal.NewStorage(walPath, wc)
if err != nil {
return
}
peers := make([]proto.Peer, 0)
for _, peerAddress := range cfg.Peers {
peers = append(peers, peerAddress.Peer)
s.AddNodeWithPort(
peerAddress.ID,
peerAddress.Address,
peerAddress.HeartbeatPort,
peerAddress.ReplicaPort,
)
}
logger.Info("action[raftstore:CreatePartition] raft config applied [%v] id:%d", cfg.Applied, cfg.ID)
rc := &raft.RaftConfig{
ID: cfg.ID,
Peers: peers,
Leader: cfg.Leader,
Term: cfg.Term,
Storage: ws,
StateMachine: cfg.SM,
Applied: cfg.Applied,
Monitor: newMonitor(),
}
if err = s.raftServer.CreateRaft(rc); err != nil {
return
}
p = newPartition(cfg, s.raftServer, walPath)
return
}
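// Hypothetical usage sketch (not part of the original source): bootstrapping a raft
// store and creating a single partition. `fsm` stands for any implementation of
// PartitionFsm (raft.StateMachine); node IDs, paths and addresses are illustrative,
// and passing nil as the extended config keeps the default monitor thresholds.
func exampleCreatePartition(fsm PartitionFsm) (Partition, error) {
	store, err := NewRaftStore(&Config{
		NodeID:   1,
		RaftPath: "/var/lib/cubefs/raft",
		IPAddr:   "192.168.0.11",
	}, nil)
	if err != nil {
		return nil, err
	}
	return store.CreatePartition(&PartitionConfig{
		ID: 100,
		SM: fsm,
		Peers: []PeerAddress{
			{Peer: proto.Peer{ID: 1}, Address: "192.168.0.11"},
		},
	})
}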
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package raftstore_db
import (
"fmt"
"os"
"strings"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/fileutil"
"github.com/cubefs/cubefs/util/log"
"github.com/tecbot/gorocksdb"
)
// RocksDBStore is a wrapper around gorocksdb.DB.
type RocksDBStore struct {
dir string
lruCacheSize int
writeBufferSize int
db *gorocksdb.DB
}
func (rs *RocksDBStore) GetLruCacheSize() int {
return rs.lruCacheSize
}
func (rs *RocksDBStore) GetWriteBufferSize() int {
return rs.writeBufferSize
}
func (rs *RocksDBStore) GetDir() string {
return rs.dir
}
// NewRocksDBStore returns a new RocksDB instance.
func NewRocksDBStore(dir string, lruCacheSize, writeBufferSize int) (store *RocksDBStore, err error) {
if err = os.MkdirAll(dir, os.ModePerm); err != nil {
return
}
store = &RocksDBStore{
dir: dir,
lruCacheSize: lruCacheSize,
writeBufferSize: writeBufferSize,
}
if err = store.Open(); err != nil {
return
}
return
}
func GetRocksDBStoreRecoveryDir(dir string) string {
dir = strings.TrimSuffix(dir, "/")
return fmt.Sprintf("%v_temp", dir)
}
// NewRocksDBStoreAndRecovery returns a new RocksDB instance after executing recovery.
func NewRocksDBStoreAndRecovery(dir string, lruCacheSize, writeBufferSize int) (store *RocksDBStore, err error) {
// start recovery
recoverDir := GetRocksDBStoreRecoveryDir(dir)
	// if the rocksdb dir does not exist but the temp dir does
	if !fileutil.ExistDir(dir) && fileutil.ExistDir(recoverDir) {
		// move the temp dir to the rocksdb dir to commit the pending transaction
if err = os.Rename(recoverDir, dir); err != nil {
log.LogErrorf("action[NewRocksDBStoreAndRecovery]failed to rename rocksdb recovery dir %v", err.Error())
return
}
log.LogDebug("action[NewRocksDBStoreAndRecovery]recovery rocksdb success")
} else if err = os.MkdirAll(dir, os.ModePerm); err != nil {
return
}
store = &RocksDBStore{
dir: dir,
lruCacheSize: lruCacheSize,
writeBufferSize: writeBufferSize,
}
if err = store.Open(); err != nil {
return
}
return
}
// Open opens the RocksDB instance.
func (rs *RocksDBStore) Open() error {
basedTableOptions := gorocksdb.NewDefaultBlockBasedTableOptions()
basedTableOptions.SetBlockCache(gorocksdb.NewLRUCache(uint64(rs.lruCacheSize)))
opts := gorocksdb.NewDefaultOptions()
opts.SetBlockBasedTableFactory(basedTableOptions)
opts.SetCreateIfMissing(true)
opts.SetWriteBufferSize(rs.writeBufferSize)
opts.SetMaxWriteBufferNumber(2)
opts.SetCompression(gorocksdb.NoCompression)
db, err := gorocksdb.OpenDb(opts, rs.dir)
if err != nil {
err = fmt.Errorf("action[openRocksDB],err:%v", err)
return err
}
rs.db = db
return nil
}
func (rs *RocksDBStore) Close() {
rs.db.Close()
}
// Del deletes a key-value pair.
func (rs *RocksDBStore) Del(key interface{}, isSync bool) (result interface{}, err error) {
ro := gorocksdb.NewDefaultReadOptions()
wo := gorocksdb.NewDefaultWriteOptions()
wb := gorocksdb.NewWriteBatch()
wo.SetSync(isSync)
defer func() {
wo.Destroy()
ro.Destroy()
wb.Destroy()
}()
slice, err := rs.db.Get(ro, []byte(key.(string)))
if err != nil {
return
}
result = slice.Data()
err = rs.db.Delete(wo, []byte(key.(string)))
return
}
// Put adds a new key-value pair to the RocksDB.
func (rs *RocksDBStore) Put(key, value interface{}, isSync bool) (result interface{}, err error) {
wo := gorocksdb.NewDefaultWriteOptions()
wb := gorocksdb.NewWriteBatch()
wo.SetSync(isSync)
defer func() {
wo.Destroy()
wb.Destroy()
}()
wb.Put([]byte(key.(string)), value.([]byte))
if err := rs.db.Write(wo, wb); err != nil {
return nil, err
}
result = value
return result, nil
}
func (rs *RocksDBStore) Flush() (err error) {
fo := gorocksdb.NewDefaultFlushOptions()
return rs.db.Flush(fo)
}
// Get returns the value based on the given key.
func (rs *RocksDBStore) Get(key interface{}) (result interface{}, err error) {
ro := gorocksdb.NewDefaultReadOptions()
ro.SetFillCache(false)
defer ro.Destroy()
return rs.db.GetBytes(ro, []byte(key.(string)))
}
// DeleteKeyAndPutIndex deletes the key-value pair based on the given key and put other keys in the cmdMap to RocksDB.
// TODO explain
func (rs *RocksDBStore) DeleteKeyAndPutIndex(key string, cmdMap map[string][]byte, isSync bool) error {
wo := gorocksdb.NewDefaultWriteOptions()
wo.SetSync(isSync)
wb := gorocksdb.NewWriteBatch()
defer func() {
wo.Destroy()
wb.Destroy()
}()
wb.Delete([]byte(key))
for otherKey, value := range cmdMap {
if otherKey == key {
continue
}
wb.Put([]byte(otherKey), value)
}
if err := rs.db.Write(wo, wb); err != nil {
err = fmt.Errorf("action[deleteFromRocksDB],err:%v", err)
return err
}
return nil
}
// Replace deletes any existing value of the key and puts the new value in a single write batch.
func (rs *RocksDBStore) Replace(key string, value interface{}, isSync bool) (result interface{}, err error) {
wo := gorocksdb.NewDefaultWriteOptions()
wb := gorocksdb.NewWriteBatch()
wo.SetSync(isSync)
defer func() {
wo.Destroy()
wb.Destroy()
}()
wb.Delete([]byte(key))
wb.Put([]byte(key), value.([]byte))
if err := rs.db.Write(wo, wb); err != nil {
return nil, err
}
result = value
return result, nil
}
// BatchDeleteAndPut deletes the keys in deleteSet and puts the key-value pairs from cmdMap in a single write batch.
func (rs *RocksDBStore) BatchDeleteAndPut(deleteSet map[string]util.Null, cmdMap map[string][]byte, isSync bool) error {
wo := gorocksdb.NewDefaultWriteOptions()
wo.SetSync(isSync)
wb := gorocksdb.NewWriteBatch()
defer func() {
wo.Destroy()
wb.Destroy()
}()
for key := range deleteSet {
wb.Delete([]byte(key))
}
for key, value := range cmdMap {
// NOTE: skip if the key in delete set
if deleteSet != nil {
_, ok := deleteSet[key]
if ok {
continue
}
}
wb.Put([]byte(key), value)
}
if err := rs.db.Write(wo, wb); err != nil {
err = fmt.Errorf("action[batchPutToRocksDB],err:%v", err)
return err
}
return nil
}
// BatchPut puts the key-value pairs in batch.
func (rs *RocksDBStore) BatchPut(cmdMap map[string][]byte, isSync bool) error {
return rs.BatchDeleteAndPut(nil, cmdMap, isSync)
}
// SeekForPrefix seeks for the place where the prefix is located in the snapshots.
func (rs *RocksDBStore) SeekForPrefix(prefix []byte) (result map[string][]byte, err error) {
result = make(map[string][]byte)
snapshot := rs.RocksDBSnapshot()
it := rs.Iterator(snapshot)
defer func() {
it.Close()
rs.ReleaseSnapshot(snapshot)
}()
it.Seek(prefix)
for ; it.ValidForPrefix(prefix); it.Next() {
key := it.Key().Data()
value := it.Value().Data()
valueByte := make([]byte, len(value))
copy(valueByte, value)
result[string(key)] = valueByte
it.Key().Free()
it.Value().Free()
}
if err := it.Err(); err != nil {
return nil, err
}
return result, nil
}
// RocksDBSnapshot returns the RocksDB snapshot.
func (rs *RocksDBStore) RocksDBSnapshot() *gorocksdb.Snapshot {
return rs.db.NewSnapshot()
}
// ReleaseSnapshot releases the snapshot and its resources.
func (rs *RocksDBStore) ReleaseSnapshot(snapshot *gorocksdb.Snapshot) {
rs.db.ReleaseSnapshot(snapshot)
}
// Iterator returns the iterator of the snapshot.
func (rs *RocksDBStore) Iterator(snapshot *gorocksdb.Snapshot) *gorocksdb.Iterator {
ro := gorocksdb.NewDefaultReadOptions()
ro.SetFillCache(false)
ro.SetSnapshot(snapshot)
return rs.db.NewIterator(ro)
}
func (rs *RocksDBStore) Clear() (err error) {
wo := gorocksdb.NewDefaultWriteOptions()
wo.SetSync(true)
wb := gorocksdb.NewWriteBatch()
defer func() {
wo.Destroy()
wb.Destroy()
}()
	// NOTE: DeleteRange removes keys in the half-open range [0x00, 0xff), which is expected to cover all keys here
wb.DeleteRange([]byte{0}, []byte{255})
err = rs.db.Write(wo, wb)
return
}
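// Hypothetical usage sketch (not part of the original source): opening a store,
// writing and reading a key, then scanning a prefix. The path is illustrative and
// the cache/write-buffer sizes are in bytes.
func exampleRocksDBStoreUsage() error {
	store, err := NewRocksDBStore("/var/lib/cubefs/rocksdb", 256<<20, 4<<20)
	if err != nil {
		return err
	}
	defer store.Close()
	if _, err = store.Put("vol#1", []byte("metadata"), true); err != nil {
		return err
	}
	value, err := store.Get("vol#1")
	if err != nil {
		return err
	}
	_ = value.([]byte) // Get returns the raw bytes stored under the key
	kvs, err := store.SeekForPrefix([]byte("vol#"))
	if err != nil {
		return err
	}
	_ = kvs // map of every key starting with "vol#" to its value
	return nil
}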
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package raftstore
import (
"fmt"
"strings"
"sync"
"github.com/cubefs/cubefs/depends/tiglabs/raft"
"github.com/cubefs/cubefs/util/errors"
)
// Error definitions.
var (
ErrNoSuchNode = errors.New("no such node")
ErrIllegalAddress = errors.New("illegal address")
ErrUnknownSocketType = errors.New("unknown socket type")
)
// This private struct defines the necessary properties for node address info.
type nodeAddress struct {
Heartbeat string
Replicate string
}
// NodeManager defines the necessary methods for node address management.
type NodeManager interface {
// add node address with specified port.
AddNodeWithPort(nodeID uint64, addr string, heartbeat int, replicate int)
// delete node address information
DeleteNode(nodeID uint64)
}
// NodeResolver defines the methods for node address resolving and management.
// It is extended from SocketResolver and NodeManager.
type NodeResolver interface {
raft.SocketResolver
NodeManager
}
// Default thread-safe implementation of the NodeResolver interface.
type nodeResolver struct {
nodeMap sync.Map
}
// NodeAddress resolves NodeID as net.Addr.
// This method is necessary for SocketResolver interface implementation.
func (r *nodeResolver) NodeAddress(nodeID uint64, stype raft.SocketType) (addr string, err error) {
val, ok := r.nodeMap.Load(nodeID)
if !ok {
err = ErrNoSuchNode
return
}
address, ok := val.(*nodeAddress)
if !ok {
err = ErrIllegalAddress
return
}
switch stype {
case raft.HeartBeat:
addr = address.Heartbeat
case raft.Replicate:
addr = address.Replicate
default:
err = ErrUnknownSocketType
}
return
}
// AddNode adds node address information.
func (r *nodeResolver) AddNode(nodeID uint64, addr string) {
r.AddNodeWithPort(nodeID, addr, 0, 0)
}
// AddNodeWithPort adds node address with specified port.
func (r *nodeResolver) AddNodeWithPort(nodeID uint64, addr string, heartbeat int, replicate int) {
if heartbeat == 0 {
heartbeat = DefaultHeartbeatPort
}
if replicate == 0 {
replicate = DefaultReplicaPort
}
if len(strings.TrimSpace(addr)) != 0 {
r.nodeMap.Store(nodeID, &nodeAddress{
Heartbeat: fmt.Sprintf("%s:%d", addr, heartbeat),
Replicate: fmt.Sprintf("%s:%d", addr, replicate),
})
}
}
// DeleteNode deletes the node address information of the specified node ID from the NodeManager if possible.
func (r *nodeResolver) DeleteNode(nodeID uint64) {
r.nodeMap.Delete(nodeID)
}
// NewNodeResolver returns a new NodeResolver instance for node address management and resolving.
func NewNodeResolver() NodeResolver {
return &nodeResolver{}
}
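// Hypothetical usage sketch (not part of the original source): registering a peer
// and resolving its heartbeat endpoint. The address is illustrative; passing 0 for
// the ports falls back to DefaultHeartbeatPort and DefaultReplicaPort.
func exampleNodeResolverUsage() (string, error) {
	resolver := NewNodeResolver()
	resolver.AddNodeWithPort(1, "192.168.0.11", 0, 0)
	// resolves to "192.168.0.11:5901" with the default heartbeat port
	return resolver.NodeAddress(1, raft.HeartBeat)
}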
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package repl
import (
"fmt"
"io"
"net"
"strings"
"time"
"github.com/cubefs/cubefs/depends/tiglabs/raft"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
var (
ErrBadNodes = errors.New("BadNodesErr")
ErrArgLenMismatch = errors.New("ArgLenMismatchErr")
)
type Packet struct {
proto.Packet
followersAddrs []string
followerPackets []*FollowerPacket
IsReleased int32 // TODO what is released?
Object interface{}
TpObject *exporter.TimePointCount
NeedReply bool
OrgBuffer []byte
// used locally
shallDegrade bool
AfterPre bool
}
type FollowerPacket struct {
proto.Packet
respCh chan error
}
func NewFollowerPacket() (fp *FollowerPacket) {
fp = new(FollowerPacket)
fp.respCh = make(chan error, 1)
fp.StartT = time.Now().UnixNano()
return fp
}
func (p *FollowerPacket) PackErrorBody(action, msg string) {
p.identificationErrorResultCode(action, msg)
p.Size = uint32(len([]byte(action + "_" + msg)))
p.Data = make([]byte, p.Size)
copy(p.Data[:int(p.Size)], []byte(action+"_"+msg))
}
func (p *FollowerPacket) IsErrPacket() bool {
return p.ResultCode != proto.OpOk && p.ResultCode != proto.OpInitResultCode
}
func (p *FollowerPacket) identificationErrorResultCode(errLog string, errMsg string) {
if strings.Contains(errLog, ActionReceiveFromFollower) || strings.Contains(errLog, ActionSendToFollowers) ||
strings.Contains(errLog, ConnIsNullErr) {
p.ResultCode = proto.OpIntraGroupNetErr
log.LogErrorf("action[identificationErrorResultCode] error %v, errmsg %v", errLog, errMsg)
} else if strings.Contains(errMsg, storage.ParameterMismatchError.Error()) ||
strings.Contains(errMsg, ErrorUnknownOp.Error()) {
p.ResultCode = proto.OpArgMismatchErr
} else if strings.Contains(errMsg, proto.ErrDataPartitionNotExists.Error()) {
p.ResultCode = proto.OpTryOtherAddr
} else if strings.Contains(errMsg, storage.ExtentNotFoundError.Error()) ||
strings.Contains(errMsg, storage.ExtentHasBeenDeletedError.Error()) {
p.ResultCode = proto.OpNotExistErr
} else if strings.Contains(errMsg, storage.NoSpaceError.Error()) {
p.ResultCode = proto.OpDiskNoSpaceErr
} else if strings.Contains(errMsg, storage.TryAgainError.Error()) {
p.ResultCode = proto.OpAgain
} else if strings.Contains(errMsg, raft.ErrNotLeader.Error()) {
p.ResultCode = proto.OpTryOtherAddr
} else if strings.Contains(errMsg, raft.ErrStopped.Error()) {
p.ResultCode = proto.OpTryOtherAddr
} else {
log.LogErrorf("action[identificationErrorResultCode] error %v, errmsg %v", errLog, errMsg)
p.ResultCode = proto.OpIntraGroupNetErr
}
}
func (p *Packet) AfterTp() (ok bool) {
if p.TpObject != nil {
p.TpObject.Set(nil)
}
return
}
func (p *Packet) clean() {
if p.Data == nil && p.OrgBuffer == nil {
return
}
p.Object = nil
p.TpObject = nil
p.Data = nil
p.Arg = nil
if p.OrgBuffer != nil && len(p.OrgBuffer) == util.BlockSize && p.IsNormalWriteOperation() {
proto.Buffers.Put(p.OrgBuffer)
p.OrgBuffer = nil
}
}
func copyPacket(src *Packet, dst *FollowerPacket) {
dst.Magic = src.Magic
dst.ExtentType = src.ExtentType
dst.Opcode = src.Opcode
dst.ResultCode = src.ResultCode
dst.CRC = src.CRC
dst.Size = src.Size
dst.KernelOffset = src.KernelOffset
dst.PartitionID = src.PartitionID
dst.ExtentID = src.ExtentID
dst.ExtentOffset = src.ExtentOffset
dst.ReqID = src.ReqID
dst.Data = src.OrgBuffer
}
func (p *Packet) BeforeTp(clusterID string) (ok bool) {
if p.IsForwardPkt() && !p.IsRandomWrite() {
p.TpObject = exporter.NewTPCnt(fmt.Sprintf("PrimaryBackUp_%v", p.GetOpMsg()))
} else if p.IsRandomWrite() {
p.TpObject = exporter.NewTPCnt(fmt.Sprintf("Raft_%v", p.GetOpMsg()))
}
return
}
func (p *Packet) resolveFollowersAddr() (err error) {
defer func() {
if err != nil {
p.PackErrorBody(ActionPreparePkt, err.Error())
}
}()
if len(p.Arg) < int(p.ArgLen) {
err = ErrArgLenMismatch
return
}
str := string(p.Arg[:int(p.ArgLen)])
followerAddrs := strings.SplitN(str, proto.AddrSplit, -1)
followerNum := uint8(len(followerAddrs) - 1)
p.followersAddrs = make([]string, followerNum)
p.followerPackets = make([]*FollowerPacket, followerNum)
p.OrgBuffer = p.Data
if followerNum > 0 {
p.followersAddrs = followerAddrs[:int(followerNum)]
log.LogInfof("action[resolveFollowersAddr] %v", p.followersAddrs)
}
if p.RemainingFollowers < 0 {
err = ErrBadNodes
return
}
return
}
func NewPacket() (p *Packet) {
p = new(Packet)
p.Magic = proto.ProtoMagic
p.StartT = time.Now().UnixNano()
p.NeedReply = true
return
}
func NewPacketToGetAllWatermarks(partitionID uint64, extentType uint8) (p *Packet) {
p = new(Packet)
p.Opcode = proto.OpGetAllWatermarks
p.PartitionID = partitionID
p.Magic = proto.ProtoMagic
p.ReqID = proto.GenerateRequestID()
p.ExtentType = extentType
return
}
func NewPacketToReadTinyDeleteRecord(partitionID uint64, offset int64) (p *Packet) {
p = new(Packet)
p.Opcode = proto.OpReadTinyDeleteRecord
p.PartitionID = partitionID
p.Magic = proto.ProtoMagic
p.ReqID = proto.GenerateRequestID()
p.ExtentOffset = offset
return
}
func NewReadTinyDeleteRecordResponsePacket(requestID int64, partitionID uint64) (p *Packet) {
p = new(Packet)
p.PartitionID = partitionID
p.Magic = proto.ProtoMagic
p.Opcode = proto.OpOk
p.ReqID = requestID
p.ExtentType = proto.NormalExtentType
return
}
func NewExtentRepairReadPacket(partitionID uint64, extentID uint64, offset, size int) (p *Packet) {
p = new(Packet)
p.ExtentID = extentID
p.PartitionID = partitionID
p.Magic = proto.ProtoMagic
p.ExtentOffset = int64(offset)
p.Size = uint32(size)
p.Opcode = proto.OpExtentRepairRead
p.ExtentType = proto.NormalExtentType
p.ReqID = proto.GenerateRequestID()
return
}
func NewTinyExtentRepairReadPacket(partitionID uint64, extentID uint64, offset, size int) (p *Packet) {
p = new(Packet)
p.ExtentID = extentID
p.PartitionID = partitionID
p.Magic = proto.ProtoMagic
p.ExtentOffset = int64(offset)
p.Size = uint32(size)
p.Opcode = proto.OpTinyExtentRepairRead
p.ExtentType = proto.TinyExtentType
p.ReqID = proto.GenerateRequestID()
return
}
func NewTinyExtentStreamReadResponsePacket(requestID int64, partitionID uint64, extentID uint64) (p *Packet) {
p = new(Packet)
p.ExtentID = extentID
p.PartitionID = partitionID
p.Magic = proto.ProtoMagic
p.Opcode = proto.OpTinyExtentRepairRead
p.ReqID = requestID
p.ExtentType = proto.TinyExtentType
p.StartT = time.Now().UnixNano()
return
}
func NewStreamReadResponsePacket(requestID int64, partitionID uint64, extentID uint64) (p *Packet) {
p = new(Packet)
p.ExtentID = extentID
p.PartitionID = partitionID
p.Magic = proto.ProtoMagic
p.Opcode = proto.OpOk
p.ReqID = requestID
p.ExtentType = proto.NormalExtentType
return
}
func NewPacketToNotifyExtentRepair(partitionID uint64) (p *Packet) {
p = new(Packet)
p.Opcode = proto.OpNotifyReplicasToRepair
p.PartitionID = partitionID
p.Magic = proto.ProtoMagic
p.ExtentType = proto.NormalExtentType
p.ReqID = proto.GenerateRequestID()
return
}
func (p *Packet) IsErrPacket() bool {
return p.ResultCode != proto.OpOk && p.ResultCode != proto.OpInitResultCode
}
func (p *Packet) getErrMessage() (m string) {
return fmt.Sprintf("req(%v) err(%v)", p.GetUniqueLogId(), string(p.Data[:p.Size]))
}
var ErrorUnknownOp = errors.New("unknown opcode")
func (p *Packet) identificationErrorResultCode(errLog string, errMsg string) {
log.LogDebugf("action[identificationErrorResultCode] error %v, errmsg %v", errLog, errMsg)
if strings.Contains(errLog, ActionReceiveFromFollower) || strings.Contains(errLog, ActionSendToFollowers) ||
strings.Contains(errLog, ConnIsNullErr) {
p.ResultCode = proto.OpIntraGroupNetErr
} else if strings.Contains(errMsg, storage.ParameterMismatchError.Error()) ||
strings.Contains(errMsg, ErrorUnknownOp.Error()) {
p.ResultCode = proto.OpArgMismatchErr
} else if strings.Contains(errMsg, proto.ErrDataPartitionNotExists.Error()) {
p.ResultCode = proto.OpTryOtherAddr
} else if strings.Contains(errMsg, storage.ExtentNotFoundError.Error()) ||
strings.Contains(errMsg, storage.ExtentHasBeenDeletedError.Error()) {
p.ResultCode = proto.OpNotExistErr
} else if strings.Contains(errMsg, storage.NoSpaceError.Error()) {
p.ResultCode = proto.OpDiskNoSpaceErr
} else if strings.Contains(errMsg, storage.BrokenDiskError.Error()) {
p.ResultCode = proto.OpDiskErr
} else if strings.Contains(errMsg, storage.TryAgainError.Error()) {
p.ResultCode = proto.OpAgain
} else if strings.Contains(errMsg, raft.ErrNotLeader.Error()) {
p.ResultCode = proto.OpTryOtherAddr
} else if strings.Contains(errMsg, raft.ErrStopped.Error()) {
p.ResultCode = proto.OpTryOtherAddr
} else if strings.Contains(errMsg, storage.VerNotConsistentError.Error()) {
p.ResultCode = proto.ErrCodeVersionOpError
// log.LogDebugf("action[identificationErrorResultCode] not change ver erro code, (%v)", string(debug.Stack()))
} else {
log.LogErrorf("action[identificationErrorResultCode] error %v, errmsg %v", errLog, errMsg)
p.ResultCode = proto.OpIntraGroupNetErr
}
}
func (p *Packet) PackErrorBody(action, msg string) {
p.identificationErrorResultCode(action, msg)
p.Size = uint32(len([]byte(action + "_" + msg)))
p.Data = make([]byte, p.Size)
copy(p.Data[:int(p.Size)], []byte(action+"_"+msg))
}
func (p *Packet) ReadFull(c net.Conn, opcode uint8, readSize int) (err error) {
if p.IsNormalWriteOperation() && readSize == util.BlockSize {
p.Data, _ = proto.Buffers.Get(readSize)
} else {
p.Data = make([]byte, readSize)
}
_, err = io.ReadFull(c, p.Data[:readSize])
return
}
func (p *Packet) IsMasterCommand() bool {
switch p.Opcode {
case
proto.OpDataNodeHeartbeat,
proto.OpVersionOperation,
proto.OpLoadDataPartition,
proto.OpCreateDataPartition,
proto.OpDeleteDataPartition,
proto.OpDecommissionDataPartition,
proto.OpAddDataPartitionRaftMember,
proto.OpRemoveDataPartitionRaftMember,
proto.OpDataPartitionTryToLeader:
return true
default:
return false
}
}
func (p *Packet) IsForwardPacket() bool {
r := p.RemainingFollowers > 0 && !p.isSpecialReplicaCntPacket()
return r
}
func (p *Packet) isSpecialReplicaCntPacket() bool {
r := p.RemainingFollowers == 127
return r
}
// A leader packet is the packet sent to the leader and does not require packet forwarding.
func (p *Packet) IsLeaderPacket() (ok bool) {
if (p.IsForwardPkt() || p.isSpecialReplicaCntPacket()) &&
(p.IsNormalWriteOperation() || p.IsCreateExtentOperation() || p.IsMarkDeleteExtentOperation()) {
ok = true
}
return
}
func (p *Packet) IsTinyExtentType() bool {
return p.ExtentType == proto.TinyExtentType
}
func (p *Packet) IsNormalWriteOperation() bool {
return p.Opcode == proto.OpWrite || p.Opcode == proto.OpSyncWrite
}
func (p *Packet) IsSnapshotModWriteAppendOperation() bool {
return p.Opcode == proto.OpRandomWriteAppend || p.Opcode == proto.OpSyncRandomWriteAppend
}
func (p *Packet) IsCreateExtentOperation() bool {
return p.Opcode == proto.OpCreateExtent
}
func (p *Packet) IsMarkDeleteExtentOperation() bool {
return p.Opcode == proto.OpMarkDelete || p.Opcode == proto.OpSplitMarkDelete
}
func (p *Packet) IsMarkSplitExtentOperation() bool {
return p.Opcode == proto.OpSplitMarkDelete
}
func (p *Packet) IsBatchDeleteExtents() bool {
return p.Opcode == proto.OpBatchDeleteExtent
}
func (p *Packet) IsBroadcastMinAppliedID() bool {
return p.Opcode == proto.OpBroadcastMinAppliedID
}
func (p *Packet) IsRandomWrite() bool {
return p.Opcode == proto.OpRandomWrite || p.Opcode == proto.OpSyncRandomWrite ||
p.Opcode == proto.OpRandomWriteVer || p.Opcode == proto.OpSyncRandomWriteVer
}
func (p *Packet) IsSyncWrite() bool {
return p.Opcode == proto.OpSyncWrite || p.Opcode == proto.OpSyncRandomWrite
}
func (p *Packet) SetDegrade() {
p.shallDegrade = true
}
func (p *Packet) UnsetDegrade() {
p.shallDegrade = false
}
func (p *Packet) ShallDegrade() bool {
return p.shallDegrade
}
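// Hypothetical usage sketch (not part of the original source): building an extent
// repair read request for the first 128 KiB of an extent. IDs and sizes are
// illustrative.
func exampleRepairReadPacket() *Packet {
	// read 128 KiB from offset 0 of extent 1025 on data partition 10
	return NewExtentRepairReadPacket(10, 1025, 0, 128*1024)
}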
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package repl
import (
"container/list"
"fmt"
"net"
"os"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
var gConnPool = util.NewConnectPool()
// ReplProtocol defines the struct of the replication protocol.
// 1. ServerConn reads a packet from the client socket, and analyzes the addresses of the followers.
// 2. After the preparation, the packet is sent to toBeProcessedCh. If a failure happens, it is sent to the response channel.
// 3. OperatorAndForwardPktGoRoutine fetches a packet from toBeProcessedCh and determines whether it needs to be forwarded to the followers.
// 4. receiveResponse fetches a reply from responseCh, executes postFunc, and writes a response to the client if necessary.
type ReplProtocol struct {
packetListLock sync.RWMutex
packetList *list.List // stores all the received packets from the client
ackCh chan struct{} // if sending to all the replicas succeeds, then a signal to this channel
toBeProcessedCh chan *Packet // the goroutine receives an available packet and then sends it to this channel
responseCh chan *Packet // this chan is used to write response to the client
sourceConn net.Conn
exitC chan bool
exited int32
exitedMu sync.RWMutex
followerConnects map[string]*FollowerTransport
lock sync.RWMutex
prepareFunc func(p *Packet) error // prepare packet
operatorFunc func(p *Packet, c net.Conn) error // operator
postFunc func(p *Packet) error // post-processing packet
getSmuxConn func(addr string) (c net.Conn, err error)
putSmuxConn func(conn net.Conn, force bool)
isError int32
replId int64
}
type FollowerTransport struct {
addr string
conn net.Conn
sendCh chan *FollowerPacket
recvCh chan *FollowerPacket
exitCh chan struct{}
exitedMu sync.RWMutex
isclosed int32
}
func NewFollowersTransport(addr string, c net.Conn) (ft *FollowerTransport, err error) {
ft = new(FollowerTransport)
ft.addr = addr
ft.conn = c
ft.sendCh = make(chan *FollowerPacket, 200)
ft.recvCh = make(chan *FollowerPacket, 200)
ft.exitCh = make(chan struct{})
go ft.serverWriteToFollower()
go ft.serverReadFromFollower()
return
}
func (ft *FollowerTransport) serverWriteToFollower() {
for {
select {
case p := <-ft.sendCh:
if err := p.WriteToConn(ft.conn); err != nil {
p.PackErrorBody(ActionSendToFollowers, err.Error())
p.respCh <- fmt.Errorf(string(p.Data[:p.Size]))
log.LogErrorf("serverWriteToFollower ft.addr(%v), err (%v)", ft.addr, err.Error())
ft.conn.Close()
continue
}
ft.recvCh <- p
case <-ft.exitCh:
ft.exitedMu.Lock()
if atomic.AddInt32(&ft.isclosed, -1) == FollowerTransportExited {
ft.conn.Close()
atomic.StoreInt32(&ft.isclosed, FollowerTransportExited)
}
ft.exitedMu.Unlock()
return
}
}
}
func (ft *FollowerTransport) serverReadFromFollower() {
for {
select {
case p := <-ft.recvCh:
ft.readFollowerResult(p)
case <-ft.exitCh:
ft.exitedMu.Lock()
if atomic.AddInt32(&ft.isclosed, -1) == FollowerTransportExited {
ft.conn.Close()
atomic.StoreInt32(&ft.isclosed, FollowerTransportExited)
}
ft.exitedMu.Unlock()
return
}
}
}
// Read the response from the follower
func (ft *FollowerTransport) readFollowerResult(request *FollowerPacket) (err error) {
reply := NewPacket()
defer func() {
reply.clean()
request.respCh <- err
if err != nil {
ft.conn.Close()
}
}()
if request.IsErrPacket() {
err = fmt.Errorf(string(request.Data[:request.Size]))
return
}
timeOut := proto.ReadDeadlineTime
if request.IsBatchDeleteExtents() {
timeOut = proto.BatchDeleteExtentReadDeadLineTime
}
if err = reply.ReadFromConnWithVer(ft.conn, timeOut); err != nil {
log.LogErrorf("readFollowerResult ft.addr(%v), err(%v)", ft.addr, err.Error())
return
}
if reply.ReqID != request.ReqID || reply.PartitionID != request.PartitionID ||
reply.ExtentOffset != request.ExtentOffset || reply.CRC != request.CRC || reply.ExtentID != request.ExtentID {
err = fmt.Errorf(ActionCheckReply+" request(%v), reply(%v) ", request.GetUniqueLogId(),
reply.GetUniqueLogId())
return
}
if reply.IsErrPacket() {
err = fmt.Errorf(string(reply.Data[:reply.Size]))
return
}
log.LogDebugf("action[ActionReceiveFromFollower] %v.", reply.LogMessage(ActionReceiveFromFollower,
ft.addr, request.StartT, err))
return
}
func (ft *FollowerTransport) Destory() {
ft.exitedMu.Lock()
atomic.StoreInt32(&ft.isclosed, FollowerTransportExiting)
close(ft.exitCh)
ft.exitedMu.Unlock()
for {
if atomic.LoadInt32(&ft.isclosed) == FollowerTransportExited {
break
}
time.Sleep(time.Millisecond)
}
close(ft.sendCh)
close(ft.recvCh)
}
func (ft *FollowerTransport) Write(p *FollowerPacket) {
ft.sendCh <- p
}
func NewReplProtocol(inConn net.Conn, prepareFunc func(p *Packet) error,
operatorFunc func(p *Packet, c net.Conn) error, postFunc func(p *Packet) error) *ReplProtocol {
rp := new(ReplProtocol)
rp.packetList = list.New()
rp.ackCh = make(chan struct{}, RequestChanSize)
rp.toBeProcessedCh = make(chan *Packet, RequestChanSize)
rp.responseCh = make(chan *Packet, RequestChanSize)
rp.exitC = make(chan bool, 1)
rp.sourceConn = inConn
rp.followerConnects = make(map[string]*FollowerTransport)
rp.prepareFunc = prepareFunc
rp.operatorFunc = operatorFunc
rp.postFunc = postFunc
rp.exited = ReplRuning
rp.replId = proto.GenerateRequestID()
go rp.OperatorAndForwardPktGoRoutine()
go rp.ReceiveResponseFromFollowersGoRoutine()
	go rp.writeResponseToClientGoRoutine()
return rp
}
func (rp *ReplProtocol) SetSmux(f func(addr string) (net.Conn, error), putSmux func(conn net.Conn, force bool)) {
rp.getSmuxConn = f
rp.putSmuxConn = putSmux
}
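// Hypothetical usage sketch (not part of the original source): serving one client
// connection with a ReplProtocol. The three callbacks are no-op placeholders; real
// implementations prepare the packet, apply it to local storage and post-process it.
func exampleServeConn(conn net.Conn) {
	rp := NewReplProtocol(conn,
		func(p *Packet) error { return nil },             // prepare
		func(p *Packet, c net.Conn) error { return nil }, // operate
		func(p *Packet) error { return nil },             // post-process
	)
	rp.ServerConn() // blocks, reading packets until the connection closes or errors
}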
// ServerConn keeps reading data from the socket to analyze the follower address, execute the prepare function,
// and throw the packets to the to-be-processed channel.
func (rp *ReplProtocol) ServerConn() {
var err error
defer func() {
rp.Stop()
rp.exitedMu.Lock()
if atomic.AddInt32(&rp.exited, -1) == ReplHasExited {
rp.sourceConn.Close()
rp.cleanResource()
}
rp.exitedMu.Unlock()
}()
for {
select {
case <-rp.exitC:
return
default:
if err = rp.readPkgAndPrepare(); err != nil {
return
}
}
}
}
// Receive response from all followers.
func (rp *ReplProtocol) ReceiveResponseFromFollowersGoRoutine() {
for {
select {
case <-rp.ackCh:
			rp.checkLocalResultAndReceiveAllFollowerResponse()
case <-rp.exitC:
rp.exitedMu.Lock()
if atomic.AddInt32(&rp.exited, -1) == ReplHasExited {
rp.sourceConn.Close()
rp.cleanResource()
}
rp.exitedMu.Unlock()
return
}
}
}
func (rp *ReplProtocol) setReplProtocolError(request *Packet, index int) {
atomic.StoreInt32(&rp.isError, ReplProtocolError)
}
func (rp *ReplProtocol) hasError() bool {
return atomic.LoadInt32(&rp.isError) == ReplProtocolError
}
func (rp *ReplProtocol) readPkgAndPrepare() (err error) {
request := NewPacket()
if err = request.ReadFromConnWithVer(rp.sourceConn, proto.NoReadDeadlineTime); err != nil {
return
}
// log.LogDebugf("action[readPkgAndPrepare] packet(%v) op %v from remote(%v) conn(%v) ",
// request.GetUniqueLogId(), request.Opcode, rp.sourceConn.RemoteAddr().String(), rp.sourceConn)
if err = request.resolveFollowersAddr(); err != nil {
err = rp.putResponse(request)
return
}
if err = rp.prepareFunc(request); err != nil {
err = rp.putResponse(request)
return
}
err = rp.putToBeProcess(request)
return
}
func (rp *ReplProtocol) sendRequestToAllFollowers(request *Packet) (index int, err error) {
for index = 0; index < len(request.followersAddrs); index++ {
var transport *FollowerTransport
if transport, err = rp.allocateFollowersConns(request, index); err != nil {
request.PackErrorBody(ActionSendToFollowers, err.Error())
return
}
followerRequest := NewFollowerPacket()
copyPacket(request, followerRequest)
followerRequest.RemainingFollowers = 0
request.followerPackets[index] = followerRequest
transport.Write(followerRequest)
}
return
}
// OperatorAndForwardPktGoRoutine reads packets from the to-be-processed channel and writes responses to the client.
// 1. Read a packet from toBeProcessCh, and determine if it needs to be forwarded or not. If the answer is no, then
// process the packet locally and put it into responseCh.
// 2. If the packet needs to be forwarded, first send it to the followers, then execute the operator function.
// Then notify receiveResponse to read the followers' responses.
// 3. Read a reply from responseCh, and write to the client.
func (rp *ReplProtocol) OperatorAndForwardPktGoRoutine() {
for {
select {
case request := <-rp.toBeProcessedCh:
if !request.IsForwardPacket() {
rp.operatorFunc(request, rp.sourceConn)
rp.putResponse(request)
} else {
index, err := rp.sendRequestToAllFollowers(request)
if err != nil {
rp.setReplProtocolError(request, index)
rp.putResponse(request)
} else {
rp.pushPacketToList(request)
rp.operatorFunc(request, rp.sourceConn)
rp.putAck()
}
}
case <-rp.exitC:
rp.exitedMu.Lock()
if atomic.AddInt32(&rp.exited, -1) == ReplHasExited {
rp.sourceConn.Close()
rp.cleanResource()
}
rp.exitedMu.Unlock()
return
}
}
}
func (rp *ReplProtocol) writeResponseToClientGoRroutine() {
for {
select {
case request := <-rp.responseCh:
rp.writeResponse(request)
case <-rp.exitC:
rp.exitedMu.Lock()
if atomic.AddInt32(&rp.exited, -1) == ReplHasExited {
rp.sourceConn.Close()
rp.cleanResource()
}
rp.exitedMu.Unlock()
return
}
}
}
// func (rp *ReplProtocol) operatorFuncWithWaitGroup(wg *sync.WaitGroup, request *Packet) {
// defer wg.Done()
// rp.operatorFunc(request, rp.sourceConn)
// }
// Read a packet from the list, scan all the follower connections of this packet, and read the responses.
// If reading a response fails, mark the packet as failed and delete it from the list.
// If all the reads succeed, mark the packet as successful.
func (rp *ReplProtocol) checkLocalResultAndReciveAllFollowerResponse() {
var e *list.Element
if e = rp.getNextPacket(); e == nil {
return
}
response := e.Value.(*Packet)
defer func() {
rp.deletePacket(response, e)
}()
if response.IsErrPacket() {
return
}
// NOTE: wait for all followers
for index := 0; index < len(response.followersAddrs); index++ {
followerPacket := response.followerPackets[index]
err := <-followerPacket.respCh
if err != nil {
// NOTE: on a timeout error, mark the request as timed out
// and stop waiting for the remaining followers
if err == os.ErrDeadlineExceeded {
response.PackErrorBody(ActionReceiveFromFollower, err.Error())
return
}
// NOTE: for other errors, record the failure and keep receiving responses from the remaining followers
response.PackErrorBody(ActionReceiveFromFollower, err.Error())
continue
}
}
}
// Write a reply to the client.
func (rp *ReplProtocol) writeResponse(reply *Packet) {
var err error
defer func() {
reply.clean()
}()
log.LogDebugf("writeResponse.opcode %v reply %v conn(%v)", reply.Opcode, reply.GetUniqueLogId(), rp.sourceConn.RemoteAddr().String())
if reply.IsErrPacket() {
err = fmt.Errorf(reply.LogMessage(ActionWriteToClient, rp.sourceConn.RemoteAddr().String(),
reply.StartT, fmt.Errorf(string(reply.Data[:reply.Size]))))
if reply.ResultCode == proto.OpNotExistErr || reply.ResultCode == proto.ErrCodeVersionOpError {
log.LogInfof(err.Error())
} else {
log.LogErrorf(err.Error())
}
rp.Stop()
}
log.LogDebugf("try rsp opcode %v %v %v", rp.replId, reply.Opcode, rp.sourceConn.RemoteAddr().String())
// execute the post-processing function
rp.postFunc(reply)
if !reply.NeedReply {
if reply.Opcode == proto.OpTryWriteAppend || reply.Opcode == proto.OpSyncTryWriteAppend {
log.LogDebugf("try rsp opcode %v", reply.Opcode)
}
return
}
if err = reply.WriteToConn(rp.sourceConn); err != nil {
err = fmt.Errorf(reply.LogMessage(ActionWriteToClient, fmt.Sprintf("local(%v)->remote(%v)", rp.sourceConn.LocalAddr().String(),
rp.sourceConn.RemoteAddr().String()), reply.StartT, err))
log.LogErrorf(err.Error())
rp.Stop()
}
log.LogDebugf(reply.LogMessage(ActionWriteToClient,
rp.sourceConn.RemoteAddr().String(), reply.StartT, err))
}
// Stop stops the replication protocol.
func (rp *ReplProtocol) Stop() {
rp.exitedMu.Lock()
defer rp.exitedMu.Unlock()
if atomic.LoadInt32(&rp.exited) == ReplRuning {
if rp.exitC != nil {
close(rp.exitC)
}
atomic.StoreInt32(&rp.exited, ReplExiting)
}
}
type SmuxConn struct {
once sync.Once
net.Conn
put func(conn net.Conn, force bool)
}
func (d *SmuxConn) Close() error {
d.once.Do(func() {
d.put(d.Conn, true)
})
return nil
}
// Allocate the connections to the followers, keyed by the follower address.
// Note that the order of packets sent to each datanode must remain consistent here.
func (rp *ReplProtocol) allocateFollowersConns(p *Packet, index int) (transport *FollowerTransport, err error) {
rp.lock.RLock()
transport = rp.followerConnects[p.followersAddrs[index]]
rp.lock.RUnlock()
if transport == nil {
addr := p.followersAddrs[index]
var conn net.Conn
if (p.IsMarkDeleteExtentOperation() || p.IsBatchDeleteExtents()) && rp.getSmuxConn != nil {
var smuxCon net.Conn
smuxCon, err = rp.getSmuxConn(addr)
if err != nil {
return
}
conn = &SmuxConn{
Conn: smuxCon,
put: rp.putSmuxConn,
}
} else {
conn, err = gConnPool.GetConnect(addr)
if err != nil {
return
}
}
transport, err = NewFollowersTransport(addr, conn)
if err != nil {
return
}
rp.lock.Lock()
rp.followerConnects[p.followersAddrs[index]] = transport
rp.lock.Unlock()
}
return
}
func (rp *ReplProtocol) getNextPacket() (e *list.Element) {
rp.packetListLock.RLock()
e = rp.packetList.Front()
rp.packetListLock.RUnlock()
return
}
func (rp *ReplProtocol) pushPacketToList(e *Packet) {
rp.packetListLock.Lock()
rp.packetList.PushBack(e)
rp.packetListLock.Unlock()
}
func (rp *ReplProtocol) cleanToBeProcessCh() {
request := len(rp.toBeProcessedCh)
for i := 0; i < request; i++ {
select {
case p := <-rp.toBeProcessedCh:
rp.postFunc(p)
p.clean()
default:
return
}
}
}
func (rp *ReplProtocol) cleanResponseCh() {
replys := len(rp.responseCh)
for i := 0; i < replys; i++ {
select {
case p := <-rp.responseCh:
rp.postFunc(p)
p.clean()
default:
return
}
}
}
// If the replication protocol exits, then clear all the packet resources.
func (rp *ReplProtocol) cleanResource() {
rp.packetListLock.Lock()
for e := rp.packetList.Front(); e != nil; e = e.Next() {
request := e.Value.(*Packet)
rp.postFunc(request)
request.clean()
}
rp.cleanToBeProcessCh()
rp.cleanResponseCh()
rp.packetList = list.New()
rp.lock.RLock()
for _, transport := range rp.followerConnects {
transport.Destory()
}
rp.lock.RUnlock()
close(rp.responseCh)
close(rp.toBeProcessedCh)
close(rp.ackCh)
rp.packetList = nil
rp.followerConnects = nil
rp.packetListLock.Unlock()
}
func (rp *ReplProtocol) deletePacket(reply *Packet, e *list.Element) (success bool) {
rp.packetListLock.Lock()
defer rp.packetListLock.Unlock()
rp.packetList.Remove(e)
success = true
rp.putResponse(reply)
return
}
func (rp *ReplProtocol) putResponse(reply *Packet) (err error) {
select {
case rp.responseCh <- reply:
return
default:
return fmt.Errorf("response Chan has full (%v)", len(rp.responseCh))
}
}
func (rp *ReplProtocol) putToBeProcess(request *Packet) (err error) {
select {
case rp.toBeProcessedCh <- request:
return
default:
return fmt.Errorf("toBeProcessedCh Chan has full (%v)", len(rp.toBeProcessedCh))
}
}
func (rp *ReplProtocol) putAck() (err error) {
select {
case rp.ackCh <- struct{}{}:
return
default:
return fmt.Errorf("ack Chan has full (%v)", len(rp.ackCh))
}
}
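// The put* helpers above all share the same non-blocking send pattern: a select with a default
// branch, so a full channel returns an error instead of stalling the replication pipeline.
// A minimal generic sketch of the pattern (illustrative only, requires Go 1.18+):
//
//	func tryPut[T any](ch chan<- T, v T) error {
//		select {
//		case ch <- v:
//			return nil
//		default:
//			return fmt.Errorf("channel is full (%v)", len(ch))
//		}
//	}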
package auth
import (
"encoding/json"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/auth"
"github.com/cubefs/cubefs/util/cryptoutil"
)
func (api *API) GetTicket(clientId string, clientKey string, serviceID string) (ticket *auth.Ticket, err error) {
var (
key []byte
ts int64
msgResp proto.AuthGetTicketResp
respData []byte
)
message := proto.AuthGetTicketReq{
Type: proto.MsgAuthTicketReq,
ClientID: clientId,
ServiceID: serviceID,
}
if key, err = cryptoutil.Base64Decode(clientKey); err != nil {
return
}
if message.Verifier, ts, err = cryptoutil.GenVerifier(key); err != nil {
return
}
if respData, err = api.ac.request(clientId, clientKey, key, message, proto.ClientGetTicket, serviceID); err != nil {
return
}
if err = json.Unmarshal(respData, &msgResp); err != nil {
return
}
if err = proto.VerifyTicketRespComm(&msgResp, proto.MsgAuthTicketReq, clientId, serviceID, ts); err != nil {
return
}
ticket = &auth.Ticket{
ID: clientId,
SessionKey: cryptoutil.Base64Encode(msgResp.SessionKey.Key),
ServiceID: cryptoutil.Base64Encode(msgResp.SessionKey.Key),
Ticket: msgResp.Ticket,
}
return
}
package auth
import (
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/keystore"
)
func (api *API) AdminCreateKey(clientID, clientKey, userID, role string, caps []byte) (res *keystore.KeyInfo, err error) {
if api.ac.ticket == nil {
if api.ac.ticket, err = api.GetTicket(clientID, clientKey, proto.AuthServiceID); err != nil {
return
}
}
keyInfo := &keystore.KeyInfo{
ID: userID,
Role: role,
Caps: caps,
}
return api.ac.serveAdminRequest(clientID, clientKey, api.ac.ticket, keyInfo, proto.MsgAuthCreateKeyReq, proto.AdminCreateKey)
}
func (api *API) AdminDeleteKey(clientID, clientKey, userID string) (res *keystore.KeyInfo, err error) {
if api.ac.ticket == nil {
if api.ac.ticket, err = api.GetTicket(clientID, clientKey, proto.AuthServiceID); err != nil {
return
}
}
keyInfo := &keystore.KeyInfo{
ID: userID,
}
return api.ac.serveAdminRequest(clientID, clientKey, api.ac.ticket, keyInfo, proto.MsgAuthDeleteKeyReq, proto.AdminDeleteKey)
}
func (api *API) AdminGetKey(clientID, clientKey, userID string) (res *keystore.KeyInfo, err error) {
if api.ac.ticket == nil {
if api.ac.ticket, err = api.GetTicket(clientID, clientKey, proto.AuthServiceID); err != nil {
return
}
}
keyInfo := &keystore.KeyInfo{
ID: userID,
}
return api.ac.serveAdminRequest(clientID, clientKey, api.ac.ticket, keyInfo, proto.MsgAuthGetKeyReq, proto.AdminGetKey)
}
func (api *API) AdminAddCaps(clientID, clientKey, userID string, caps []byte) (res *keystore.KeyInfo, err error) {
if api.ac.ticket == nil {
if api.ac.ticket, err = api.GetTicket(clientID, clientKey, proto.AuthServiceID); err != nil {
return
}
}
keyInfo := &keystore.KeyInfo{
ID: userID,
Caps: caps,
}
return api.ac.serveAdminRequest(clientID, clientKey, api.ac.ticket, keyInfo, proto.MsgAuthAddCapsReq, proto.AdminAddCaps)
}
func (api *API) AdminDeleteCaps(clientID, clientKey, userID string, caps []byte) (res *keystore.KeyInfo, err error) {
if api.ac.ticket == nil {
if api.ac.ticket, err = api.GetTicket(clientID, clientKey, proto.AuthServiceID); err != nil {
return
}
}
keyInfo := &keystore.KeyInfo{
ID: userID,
Caps: caps,
}
return api.ac.serveAdminRequest(clientID, clientKey, api.ac.ticket, keyInfo, proto.MsgAuthDeleteCapsReq, proto.AdminDeleteCaps)
}
func (api *API) AdminGetCaps(clientID, clientKey, userID string) (res *keystore.KeyInfo, err error) {
if api.ac.ticket == nil {
if api.ac.ticket, err = api.GetTicket(clientID, clientKey, proto.AuthServiceID); err != nil {
return
}
}
keyInfo := &keystore.KeyInfo{
ID: userID,
}
return api.ac.serveAdminRequest(clientID, clientKey, api.ac.ticket, keyInfo, proto.MsgAuthGetCapsReq, proto.AdminGetCaps)
}
package auth
import (
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/keystore"
)
type API struct {
ac *AuthClient
}
func (api *API) OSSAddCaps(clientID, clientKey, accessKey string, caps []byte) (newAKCaps *keystore.AccessKeyCaps, err error) {
if api.ac.ticket == nil {
if api.ac.ticket, err = api.GetTicket(clientID, clientKey, proto.AuthServiceID); err != nil {
return
}
}
akCaps := &keystore.AccessKeyCaps{
AccessKey: accessKey,
Caps: caps,
}
return api.ac.serveOSSRequest(clientID, clientKey, api.ac.ticket, akCaps, proto.MsgAuthOSAddCapsReq, proto.OSAddCaps)
}
func (api *API) OSSDeleteCaps(clientID, clientKey, accessKey string, caps []byte) (newAKCaps *keystore.AccessKeyCaps, err error) {
if api.ac.ticket == nil {
if api.ac.ticket, err = api.GetTicket(clientID, clientKey, proto.AuthServiceID); err != nil {
return
}
}
akCaps := &keystore.AccessKeyCaps{
AccessKey: accessKey,
Caps: caps,
}
return api.ac.serveOSSRequest(clientID, clientKey, api.ac.ticket, akCaps, proto.MsgAuthOSDeleteCapsReq, proto.OSDeleteCaps)
}
func (api *API) OSSGetCaps(clientID, clientKey, accessKey string) (caps *keystore.AccessKeyCaps, err error) {
if api.ac.ticket == nil {
if api.ac.ticket, err = api.GetTicket(clientID, clientKey, proto.AuthServiceID); err != nil {
return
}
}
akCaps := &keystore.AccessKeyCaps{
AccessKey: accessKey,
}
return api.ac.serveOSSRequest(clientID, clientKey, api.ac.ticket, akCaps, proto.MsgAuthOSGetCapsReq, proto.OSGetCaps)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package auth
import (
"encoding/json"
"fmt"
"net/http"
"os"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/auth"
"github.com/cubefs/cubefs/util/cryptoutil"
"github.com/cubefs/cubefs/util/keystore"
"github.com/cubefs/cubefs/util/log"
)
const (
requestTimeout = 30 * time.Second
RequestMaxRetry = 5
RequestSleepInterval = 100 * time.Millisecond
)
type AuthClient struct {
sync.RWMutex
authnodes []string
enableHTTPS bool
certFile string
ticket *auth.Ticket
leaderAddr string
}
func (c *AuthClient) API() *API {
return &API{
ac: c,
}
}
func NewAuthClient(authNodes []string, enableHTTPS bool, certFile string) *AuthClient {
return &AuthClient{authnodes: authNodes, enableHTTPS: enableHTTPS, certFile: certFile}
}
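// Illustrative sketch (the node address and credentials below are assumptions): build an
// AuthClient, take its API wrapper, and fetch a ticket before issuing admin or OSS requests.
// Subsequent Admin*/OSS* calls reuse the ticket cached on the client.
//
//	ac := NewAuthClient([]string{"192.168.0.1:8080"}, false, "")
//	api := ac.API()
//	ticket, err := api.GetTicket("client-1", "<base64-encoded-client-key>", proto.AuthServiceID)
//	if err != nil {
//		// handle error
//	}
//	_ = ticket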
func (c *AuthClient) request(clientID, clientKey string, key []byte, data interface{}, path, serviceID string) (respData []byte, err error) {
var (
body []byte
urlProto string
url string
client *http.Client
certFile []byte
)
if c.enableHTTPS {
urlProto = "https://"
if certFile, err = loadCertfile(c.certFile); err != nil {
err = fmt.Errorf("load cert file failed: %v, certFile[%v]", err, c.certFile)
log.LogWarnf("%v", err)
return
}
client, err = cryptoutil.CreateClientX(&certFile)
if err != nil {
return
}
} else {
urlProto = "http://"
client = &http.Client{}
}
// TODO don't retry if the param is wrong
for i := 0; i < RequestMaxRetry; i++ {
for _, ip := range c.authnodes {
url = urlProto + ip + path
body, err = proto.SendData(client, url, data)
if err != nil {
continue
}
var jobj *proto.HTTPAuthReply
if err = json.Unmarshal(body, &jobj); err != nil {
return nil, fmt.Errorf("unmarshal response body err:%v", err)
}
if jobj.Code != 0 {
if jobj.Code == proto.ErrCodeExpiredTicket {
c.ticket, err = c.API().GetTicket(clientID, clientKey, serviceID)
if err == nil {
c.request(clientID, clientKey, key, data, path, serviceID)
}
}
err = fmt.Errorf(jobj.Msg)
return nil, fmt.Errorf("request error, code[%d], msg[%s]", jobj.Code, err)
}
data := fmt.Sprint(jobj.Data)
if respData, err = cryptoutil.DecodeMessage(data, key); err != nil {
return nil, fmt.Errorf("decode message error: %v", err)
}
return
}
log.LogWarnf("Request authnode: getReply error and will RETRY, url(%v) err(%v)", url, err)
time.Sleep(RequestSleepInterval)
}
log.LogWarnf("Request authnode exit: send to addr(%v) err(%v)", url, err)
return nil, fmt.Errorf("Request authnode: getReply error, url(%v) err(%v)", url, err)
}
func (c *AuthClient) serveOSSRequest(id, key string, ticket *auth.Ticket, akCaps *keystore.AccessKeyCaps, reqType proto.MsgType, reqPath string) (caps *keystore.AccessKeyCaps, err error) {
var (
sessionKey []byte
ts int64
resp proto.AuthOSAccessKeyResp
respData []byte
)
apiReq := &proto.APIAccessReq{
Type: reqType,
ClientID: id,
ServiceID: proto.AuthServiceID,
Ticket: ticket.Ticket,
}
if sessionKey, err = cryptoutil.Base64Decode(ticket.SessionKey); err != nil {
return nil, err
}
if apiReq.Verifier, ts, err = cryptoutil.GenVerifier(sessionKey); err != nil {
return nil, err
}
message := &proto.AuthOSAccessKeyReq{
APIReq: *apiReq,
AKCaps: *akCaps,
}
if respData, err = c.request(id, key, sessionKey, message, reqPath, proto.AuthServiceID); err != nil {
return
}
if err = json.Unmarshal(respData, &resp); err != nil {
return
}
if err = proto.VerifyAPIRespComm(&resp.APIResp, reqType, id, proto.AuthServiceID, ts); err != nil {
return
}
return &resp.AKCaps, err
}
func (c *AuthClient) serveAdminRequest(id, key string, ticket *auth.Ticket, keyInfo *keystore.KeyInfo, reqType proto.MsgType, reqPath string) (res *keystore.KeyInfo, err error) {
var (
sessionKey []byte
ts int64
resp proto.AuthAPIAccessResp
respData []byte
)
apiReq := &proto.APIAccessReq{
Type: reqType,
ClientID: id,
ServiceID: proto.AuthServiceID,
Ticket: ticket.Ticket,
}
if sessionKey, err = cryptoutil.Base64Decode(ticket.SessionKey); err != nil {
return nil, err
}
if apiReq.Verifier, ts, err = cryptoutil.GenVerifier(sessionKey); err != nil {
return nil, err
}
message := &proto.AuthAPIAccessReq{
APIReq: *apiReq,
KeyInfo: *keyInfo,
}
if respData, err = c.request(id, key, sessionKey, message, reqPath, proto.AuthServiceID); err != nil {
return
}
if err = json.Unmarshal(respData, &resp); err != nil {
return
}
if err = proto.VerifyAPIRespComm(&resp.APIResp, reqType, id, proto.AuthServiceID, ts); err != nil {
return
}
return &resp.KeyInfo, err
}
func loadCertfile(path string) (caCert []byte, err error) {
caCert, err = os.ReadFile(path)
if err != nil {
return
}
return
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package blobstore
import (
"bytes"
"context"
"io"
"time"
"github.com/cubefs/cubefs/blobstore/api/access"
"github.com/cubefs/cubefs/blobstore/common/codemode"
ebsproto "github.com/cubefs/cubefs/blobstore/common/proto"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
"github.com/google/uuid"
)
const (
MaxRetryTimes = 3
RetrySleepInterval = 100 * time.Millisecond
)
type BlobStoreClient struct {
client access.API
}
func NewEbsClient(cfg access.Config) (*BlobStoreClient, error) {
cli, err := access.New(cfg)
return &BlobStoreClient{
client: cli,
}, err
}
func (ebs *BlobStoreClient) Read(ctx context.Context, volName string, buf []byte, offset uint64, size uint64, oek proto.ObjExtentKey) (readN int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("ebs-read", err, bgTime, 1)
}()
requestId := uuid.New().String()
log.LogDebugf("TRACE Ebs Read Enter requestId(%v), oek(%v)", requestId, oek)
ctx = access.WithRequestID(ctx, requestId)
start := time.Now()
metric := exporter.NewTPCnt(createOPMetric(buf, "ebsread"))
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: volName})
}()
blobs := oek.Blobs
sliceInfos := make([]access.SliceInfo, 0)
for _, b := range blobs {
sliceInfo := access.SliceInfo{
MinBid: ebsproto.BlobID(b.MinBid),
Vid: ebsproto.Vid(b.Vid),
Count: uint32(b.Count),
}
sliceInfos = append(sliceInfos, sliceInfo)
}
loc := access.Location{
ClusterID: ebsproto.ClusterID(oek.Cid),
Size: oek.Size,
Crc: oek.Crc,
CodeMode: codemode.CodeMode(oek.CodeMode),
BlobSize: oek.BlobSize,
Blobs: sliceInfos,
}
// the access client Get has its own retry; the loop below retries it again up to MaxRetryTimes
log.LogDebugf("TRACE Ebs Read,oek(%v) loc(%v)", oek, loc)
var body io.ReadCloser
defer func() {
if body != nil {
body.Close()
}
}()
for i := 0; i < MaxRetryTimes; i++ {
body, err = ebs.client.Get(ctx, &access.GetArgs{Location: loc, Offset: offset, ReadSize: size})
if err == nil {
break
}
log.LogWarnf("TRACE Ebs Read,oek(%v), err(%v), requestId(%v),retryTimes(%v)", oek, err, requestId, i)
time.Sleep(RetrySleepInterval)
}
if err != nil {
log.LogErrorf("TRACE Ebs Read,oek(%v), err(%v), requestId(%v)", oek, err, requestId)
return 0, err
}
readN, err = io.ReadFull(body, buf)
if err != nil {
log.LogErrorf("TRACE Ebs Read,oek(%v), err(%v), requestId(%v)", oek, err, requestId)
return 0, err
}
elapsed := time.Since(start)
log.LogDebugf("TRACE Ebs Read Exit,oek(%v) readN(%v),bufLen(%v),consume(%v)ns", oek, readN, len(buf), elapsed.Nanoseconds())
return readN, nil
}
func (ebs *BlobStoreClient) Write(ctx context.Context, volName string, data []byte, size uint32) (location access.Location, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("ebs-write", err, bgTime, 1)
}()
requestId := uuid.New().String()
log.LogDebugf("TRACE Ebs Write Enter,requestId(%v) len(%v)", requestId, size)
start := time.Now()
ctx = access.WithRequestID(ctx, requestId)
metric := exporter.NewTPCnt(createOPMetric(data, "ebswrite"))
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: volName})
}()
for i := 0; i < MaxRetryTimes; i++ {
location, _, err = ebs.client.Put(ctx, &access.PutArgs{
Size: int64(size),
Body: bytes.NewReader(data),
})
if err == nil {
break
}
log.LogWarnf("TRACE Ebs write, err(%v), requestId(%v),retryTimes(%v)", err, requestId, i)
time.Sleep(RetrySleepInterval)
}
if err != nil {
log.LogErrorf("TRACE Ebs write,err(%v),requestId(%v)", err.Error(), requestId)
return location, err
}
elapsed := time.Since(start)
log.LogDebugf("TRACE Ebs Write Exit,requestId(%v) len(%v) consume(%v)ns", requestId, len(data), elapsed.Nanoseconds())
return location, nil
}
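// Illustrative sketch (volume name and payload are assumptions): Write returns an access.Location
// describing where the blobs were stored. The caller is expected to convert that location into a
// proto.ObjExtentKey and persist it in the inode's object extent keys (see writeSlice in the
// writer) before the data can be read back through Read.
//
//	loc, err := ebs.Write(context.Background(), "vol-example", data, uint32(len(data)))
//	if err != nil {
//		// handle error
//	}
//	_ = loc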
func (ebs *BlobStoreClient) Delete(oeks []proto.ObjExtentKey) (err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("ebs-delete", err, bgTime, 1)
}()
ctx, cancel := context.WithTimeout(context.TODO(), time.Second*3)
defer cancel()
locs := make([]access.Location, 0)
for _, oek := range oeks {
sliceInfos := make([]access.SliceInfo, 0)
for _, b := range oek.Blobs {
sliceInfo := access.SliceInfo{
MinBid: ebsproto.BlobID(b.MinBid),
Vid: ebsproto.Vid(b.Vid),
Count: uint32(b.Count),
}
sliceInfos = append(sliceInfos, sliceInfo)
}
loc := access.Location{
ClusterID: ebsproto.ClusterID(oek.Cid),
Size: oek.Size,
Crc: oek.Crc,
CodeMode: codemode.CodeMode(oek.CodeMode),
BlobSize: oek.BlobSize,
Blobs: sliceInfos,
}
locs = append(locs, loc)
}
requestId := uuid.New().String()
log.LogDebugf("start Ebs delete Enter,requestId(%v) len(%v)", requestId, len(oeks))
start := time.Now()
ctx = access.WithRequestID(ctx, requestId)
metric := exporter.NewTPCnt("ebsdel")
defer func() {
metric.SetWithLabels(err, map[string]string{})
}()
_, err = ebs.client.Delete(ctx, &access.DeleteArgs{Locations: locs})
// elapsed is measured after the delete call so the logs report the real latency
elapsed := time.Since(start)
if err != nil {
log.LogErrorf("[EbsDelete] Ebs delete error, id(%v), consume(%v)ns, err(%v)", requestId, elapsed.Nanoseconds(), err.Error())
return err
}
log.LogDebugf("Ebs delete Exit,requestId(%v) len(%v) consume(%v)ns", requestId, len(oeks), elapsed.Nanoseconds())
return err
}
func createOPMetric(buf []byte, tag string) string {
if len(buf) < 4*util.KB {
return tag + "0K_4K"
} else if len(buf) < 128*util.KB {
return tag + "4K_128K"
} else if len(buf) < 1*util.MB {
return tag + "128K_1M"
} else if len(buf) < 4*util.MB {
return tag + "1M_4M"
}
return tag + "4M_8M"
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package blobstore
import (
"context"
"fmt"
"io"
"os"
"sync"
"syscall"
"time"
"github.com/cubefs/cubefs/blockcache/bcache"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/data/manager"
"github.com/cubefs/cubefs/sdk/data/stream"
"github.com/cubefs/cubefs/sdk/meta"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
)
type rwSlice struct {
index int
fileOffset uint64
size uint32
rOffset uint64
rSize uint32
read int
Data []byte
extentKey proto.ExtentKey
objExtentKey proto.ObjExtentKey
}
func (s rwSlice) String() string {
return fmt.Sprintf("rwSlice{fileOffset(%v),size(%v),rOffset(%v),rSize(%v),read(%v),extentKey(%v),objExtentKey(%v)}", s.fileOffset, s.size, s.rOffset, s.rSize, s.read, s.extentKey, s.objExtentKey)
}
func (reader *Reader) String() string {
return fmt.Sprintf("Reader{address(%v),volName(%v),volType(%v),ino(%v),fileSize(%v),enableBcache(%v),cacheAction(%v),fileCache(%v),cacheThreshold(%v)},readConcurrency(%v)",
&reader, reader.volName, reader.volType, reader.ino, reader.fileLength, reader.enableBcache, reader.cacheAction, reader.fileCache, reader.cacheThreshold, reader.readConcurrency)
}
type Reader struct {
volName string
volType int
ino uint64
offset uint64
data []byte
err chan error
bc *bcache.BcacheClient
mw *meta.MetaWrapper
ec *stream.ExtentClient
ebs *BlobStoreClient
readConcurrency int
cacheTimeout time.Duration
wg sync.WaitGroup
once sync.Once
sync.Mutex
close bool
extentKeys []proto.ExtentKey
missExtentKeys []proto.ExtentKey
objExtentKeys []proto.ObjExtentKey
enableBcache bool
cacheAction int
fileCache bool
cacheThreshold int
fileLength uint64
valid bool
inflightL2cache sync.Map
limitManager *manager.LimitManager
}
type ClientConfig struct {
VolName string
VolType int
BlockSize int
Ino uint64
Bc *bcache.BcacheClient
Mw *meta.MetaWrapper
Ec *stream.ExtentClient
Ebsc *BlobStoreClient
EnableBcache bool
WConcurrency int
ReadConcurrency int
CacheAction int
FileCache bool
FileSize uint64
CacheThreshold int
}
func NewReader(config ClientConfig) (reader *Reader) {
reader = new(Reader)
reader.volName = config.VolName
reader.volType = config.VolType
reader.ino = config.Ino
reader.bc = config.Bc
reader.ebs = config.Ebsc
reader.mw = config.Mw
reader.ec = config.Ec
reader.enableBcache = config.EnableBcache
reader.readConcurrency = config.ReadConcurrency
reader.cacheAction = config.CacheAction
reader.fileCache = config.FileCache
reader.cacheThreshold = config.CacheThreshold
if proto.IsCold(reader.volType) {
reader.ec.UpdateDataPartitionForColdVolume()
}
reader.limitManager = reader.ec.LimitManager
return
}
func (reader *Reader) Read(ctx context.Context, buf []byte, offset int, size int) (int, error) {
if reader == nil {
return 0, fmt.Errorf("reader is not opened yet")
}
log.LogDebugf("TRACE reader Read Enter. ino(%v) offset(%v) len(%v)", reader.ino, offset, size)
var (
read = 0
err error
)
if reader.close {
return 0, os.ErrInvalid
}
reader.Lock()
defer reader.Unlock()
// cold volume: read by slices
var rSlices []*rwSlice
if size != len(buf) {
size = len(buf)
}
rSlices, err = reader.prepareEbsSlice(offset, uint32(size))
log.LogDebugf("TRACE reader Read. ino(%v) rSlices-length(%v) ", reader.ino, len(rSlices))
if err != nil {
return 0, err
}
sliceSize := len(rSlices)
if sliceSize > 0 {
reader.wg.Add(sliceSize)
pool := New(reader.readConcurrency, sliceSize)
defer pool.Close()
reader.err = make(chan error, sliceSize)
for _, rs := range rSlices {
pool.Execute(rs, func(param *rwSlice) {
reader.readSliceRange(ctx, param)
})
}
reader.wg.Wait()
for i := 0; i < sliceSize; i++ {
if err, ok := <-reader.err; !ok || err != nil {
return 0, err
}
}
close(reader.err)
}
for i := 0; i < sliceSize; i++ {
read += copy(buf[read:], rSlices[i].Data)
}
log.LogDebugf("TRACE reader Read Exit. ino(%v) readN(%v) buf-len(%v)", reader.ino, read, len(buf))
return read, nil
}
func (reader *Reader) Close(ctx context.Context) {
reader.Lock()
reader.close = true
reader.Unlock()
}
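// Illustrative sketch (the mw/ec/ebsc handles and the cold volume type value are assumed to be
// provided by the caller): a Reader is built from a ClientConfig and then serves ranged reads
// over the inode's object extents.
//
//	reader := NewReader(ClientConfig{
//		VolName:         "vol-example",
//		VolType:         volType, // cold volume type
//		Ino:             ino,
//		Mw:              mw,
//		Ec:              ec,
//		Ebsc:            ebsc,
//		ReadConcurrency: 4,
//	})
//	n, err := reader.Read(context.Background(), buf, offset, len(buf))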
func (reader *Reader) prepareEbsSlice(offset int, size uint32) ([]*rwSlice, error) {
if offset < 0 {
return nil, syscall.EIO
}
chunks := make([]*rwSlice, 0)
endflag := false
selected := false
reader.once.Do(func() {
reader.refreshEbsExtents()
})
fileSize, valid := reader.fileSize()
reader.fileLength = fileSize
log.LogDebugf("TRACE blobStore prepareEbsSlice Enter. ino(%v) fileSize(%v) ", reader.ino, fileSize)
if !valid {
log.LogErrorf("Reader: invoke fileSize fail. ino(%v) offset(%v) size(%v)", reader.ino, offset, size)
return nil, syscall.EIO
}
log.LogDebugf("TRACE blobStore prepareEbsSlice. ino(%v) offset(%v) size(%v)", reader.ino, offset, size)
if uint64(offset) >= fileSize {
return nil, io.EOF
}
start := uint64(offset)
if uint64(offset)+uint64(size) > fileSize {
size = uint32(fileSize - uint64(offset))
}
end := uint64(offset + int(size))
for index, oek := range reader.objExtentKeys {
rs := &rwSlice{}
if oek.FileOffset <= start && start < oek.FileOffset+(oek.Size) {
rs.index = index
rs.fileOffset = oek.FileOffset
rs.size = uint32(oek.Size)
rs.rOffset = start - oek.FileOffset
rs.rSize = uint32(oek.FileOffset + oek.Size - start)
selected = true
}
if end <= oek.FileOffset+oek.Size {
rs.rSize = uint32(end - start)
selected = true
endflag = true
}
if selected {
rs.objExtentKey = oek
reader.buildExtentKey(rs)
rs.Data = make([]byte, rs.rSize)
start = oek.FileOffset + oek.Size
chunks = append(chunks, rs)
log.LogDebugf("TRACE blobStore prepareEbsSlice. ino(%v) offset(%v) size(%v) rwSlice(%v)", reader.ino, offset, size, rs)
}
if endflag {
break
}
}
log.LogDebugf("TRACE blobStore prepareEbsSlice Exit. ino(%v) offset(%v) size(%v) rwSlices(%v)", reader.ino, offset, size, chunks)
return chunks, nil
}
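// buildExtentKey binary-searches reader.extentKeys (sorted by FileOffset) for the extent key whose
// FileOffset matches the slice's object extent key; if no match is found, the slice gets an empty
// ExtentKey so the read falls back to the blobstore path.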
func (reader *Reader) buildExtentKey(rs *rwSlice) {
if len(reader.extentKeys) <= 0 {
rs.extentKey = proto.ExtentKey{}
} else {
low := 0
high := len(reader.extentKeys) - 1
for low <= high {
mid := (high + low) / 2
target := reader.extentKeys[mid]
if target.FileOffset == rs.objExtentKey.FileOffset {
rs.extentKey = target
return
} else if target.FileOffset > rs.objExtentKey.FileOffset {
high = mid - 1
} else {
low = mid + 1
}
}
rs.extentKey = proto.ExtentKey{}
}
}
func (reader *Reader) readSliceRange(ctx context.Context, rs *rwSlice) (err error) {
defer reader.wg.Done()
log.LogDebugf("TRACE blobStore readSliceRange Enter. ino(%v) rs.fileOffset(%v),rs.rOffset(%v),rs.rSize(%v) ", reader.ino, rs.fileOffset, rs.rOffset, rs.rSize)
cacheKey := util.GenerateKey(reader.volName, reader.ino, rs.fileOffset)
log.LogDebugf("TRACE blobStore readSliceRange. ino(%v) cacheKey(%v) ", reader.ino, cacheKey)
buf := make([]byte, rs.rSize)
var readN int
bgTime := stat.BeginStat()
stat.EndStat("CacheGet", nil, bgTime, 1)
// metric covering all requests for each block.
metric := exporter.NewTPCnt("CacheGet")
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: reader.volName})
}()
// read local cache
if reader.enableBcache {
readN, err = reader.bc.Get(cacheKey, buf, rs.rOffset, rs.rSize)
if err == nil {
reader.ec.BcacheHealth = true
if readN == int(rs.rSize) {
// L1 cache hit.
metric := exporter.NewTPCnt("L1CacheGetHit")
stat.EndStat("CacheHit-L1", nil, bgTime, 1)
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: reader.volName})
}()
copy(rs.Data, buf)
reader.err <- nil
return
}
}
}
readLimitOn := false
// read cfs and cache to bcache
if rs.extentKey != (proto.ExtentKey{}) {
// check whether the data partition exists in the preload scenario
err = reader.ec.CheckDataPartitionExsit(rs.extentKey.PartitionId)
if err == nil || ctx.Value("objectnode") != nil {
readN, err, readLimitOn = reader.ec.ReadExtent(reader.ino, &rs.extentKey, buf, int(rs.rOffset), int(rs.rSize))
if err == nil && readN == int(rs.rSize) {
// L2 cache hit.
metric := exporter.NewTPCnt("L2CacheGetHit")
stat.EndStat("CacheHit-L2", nil, bgTime, 1)
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: reader.volName})
}()
copy(rs.Data, buf)
reader.err <- nil
return
}
} else {
log.LogDebugf("checkDataPartitionExsit failed (%v)", err)
}
log.LogDebugf("TRACE blobStore readSliceRange. cfs block miss.extentKey=%v,err=%v", rs.extentKey, err)
}
if !readLimitOn {
reader.limitManager.ReadAlloc(ctx, int(rs.rSize))
}
readN, err = reader.ebs.Read(ctx, reader.volName, buf, rs.rOffset, uint64(rs.rSize), rs.objExtentKey)
if err != nil {
reader.err <- err
return
}
read := copy(rs.Data, buf)
reader.err <- nil
// cache full block
if !reader.needCacheL1() && !reader.needCacheL2() || reader.ec.IsPreloadMode() {
log.LogDebugf("TRACE blobStore readSliceRange exit without cache. read counter=%v", read)
return nil
}
asyncCtx := context.Background()
go reader.asyncCache(asyncCtx, cacheKey, rs.objExtentKey)
log.LogDebugf("TRACE blobStore readSliceRange exit with cache. read counter=%v", read)
return nil
}
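// asyncCache re-reads a whole object extent from the blobstore in the background and populates the
// caches: the block is written either to the CubeFS L2 cache through the extent client or to the
// local L1 block cache. inflightL2cache deduplicates concurrent loads of the same cacheKey.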
func (reader *Reader) asyncCache(ctx context.Context, cacheKey string, objExtentKey proto.ObjExtentKey) {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("read-async-cache", err, bgTime, 1)
}()
log.LogDebugf("TRACE blobStore asyncCache Enter. cacheKey=%v", cacheKey)
// skip if the block is already being loaded by another goroutine.
if _, ok := reader.inflightL2cache.Load(cacheKey); ok {
return
}
reader.inflightL2cache.Store(cacheKey, true)
defer reader.inflightL2cache.Delete(cacheKey)
buf := make([]byte, objExtentKey.Size)
read, err := reader.ebs.Read(ctx, reader.volName, buf, 0, uint64(len(buf)), objExtentKey)
if err != nil || read != len(buf) {
log.LogErrorf("ERROR blobStore asyncCache fail, size no match. cacheKey=%v, objExtentKey.size=%v, read=%v",
cacheKey, len(buf), read)
return
}
if reader.needCacheL2() {
reader.ec.Write(reader.ino, int(objExtentKey.FileOffset), buf, proto.FlagsCache, nil)
log.LogDebugf("TRACE blobStore asyncCache(L2) Exit. cacheKey=%v", cacheKey)
return
}
if reader.needCacheL1() {
reader.bc.Put(cacheKey, buf)
}
log.LogDebugf("TRACE blobStore asyncCache(L1) Exit. cacheKey=%v", cacheKey)
}
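// needCacheL2 reports whether a block should be cached back into CubeFS (L2): either the volume's
// cache action allows it and the file is below the cache threshold, or file-level caching is enabled.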
func (reader *Reader) needCacheL2() bool {
if (reader.cacheAction > proto.NoCache && reader.fileLength < uint64(reader.cacheThreshold)) || reader.fileCache {
return true
}
return false
}
func (reader *Reader) needCacheL1() bool {
return reader.enableBcache
}
func (reader *Reader) refreshEbsExtents() {
_, _, eks, oeks, err := reader.mw.GetObjExtents(reader.ino)
if err != nil {
reader.valid = false
log.LogErrorf("TRACE blobStore refreshEbsExtents error. ino(%v) err(%v) ", reader.ino, err)
return
}
reader.valid = true
reader.extentKeys = eks
reader.objExtentKeys = oeks
log.LogDebugf("TRACE blobStore refreshEbsExtents ok. extentKeys(%v) objExtentKeys(%v) ", reader.extentKeys, reader.objExtentKeys)
}
func (reader *Reader) fileSize() (uint64, bool) {
objKeys := reader.objExtentKeys
if !reader.valid {
return 0, false
}
if len(objKeys) > 0 {
lastIndex := len(objKeys) - 1
return objKeys[lastIndex].FileOffset + objKeys[lastIndex].Size, true
}
return 0, true
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package blobstore
type Instance struct {
mq chan task
}
type task struct {
op *rwSlice
fn func(op *rwSlice)
}
func New(worker int, size int) Instance {
mq := make(chan task, size)
for i := 0; i < worker; i++ {
go func() {
for {
task, ok := <-mq
if !ok {
break
}
task.fn(task.op)
}
}()
}
return Instance{mq}
}
func (r Instance) Execute(op *rwSlice, fn func(op *rwSlice)) {
r.mq <- task{
op: op,
fn: fn,
}
}
func (r Instance) Close() {
close(r.mq)
}
type Executor struct {
tokens chan int
}
func NewExecutor(maxConcurrency int) *Executor {
exec := &Executor{
tokens: make(chan int, maxConcurrency),
}
for i := 0; i < maxConcurrency; i++ {
exec.tokens <- i
}
return exec
}
func (exec *Executor) Run(fn func()) {
i := <-exec.tokens
go func() {
fn()
exec.tokens <- i
}()
}
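// Illustrative sketch (worker counts and the process/doWork helpers are placeholders): the slice
// pool above fans work out to a fixed number of workers over a buffered channel, while Executor
// caps the number of in-flight goroutines with a token channel.
//
//	pool := New(4, len(slices))
//	defer pool.Close()
//	for _, rs := range slices {
//		pool.Execute(rs, func(op *rwSlice) { process(op) })
//	}
//
//	exec := NewExecutor(4)
//	exec.Run(func() { doWork() })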
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package blobstore
import (
"context"
"fmt"
"hash"
"io"
"sort"
"sync"
"sync/atomic"
"syscall"
"github.com/cubefs/cubefs/blockcache/bcache"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/data/manager"
"github.com/cubefs/cubefs/sdk/data/stream"
"github.com/cubefs/cubefs/sdk/meta"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/buf"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
)
const (
MaxBufferSize = 512 * util.MB
)
type wSliceErr struct {
err error
fileOffset uint64
size uint32
}
type Writer struct {
volType int
volName string
blockSize int
ino uint64
err chan *wSliceErr
bc *bcache.BcacheClient
mw *meta.MetaWrapper
ec *stream.ExtentClient
ebsc *BlobStoreClient
wConcurrency int
wg sync.WaitGroup
once sync.Once
sync.RWMutex
enableBcache bool
cacheAction int
buf []byte
fileOffset int
fileCache bool
fileSize uint64
cacheThreshold int
dirty bool
blockPosition int
limitManager *manager.LimitManager
}
func NewWriter(config ClientConfig) (writer *Writer) {
writer = new(Writer)
writer.volName = config.VolName
writer.volType = config.VolType
writer.blockSize = config.BlockSize
writer.ino = config.Ino
writer.err = nil
writer.bc = config.Bc
writer.mw = config.Mw
writer.ec = config.Ec
writer.ebsc = config.Ebsc
writer.wConcurrency = config.WConcurrency
writer.wg = sync.WaitGroup{}
writer.once = sync.Once{}
writer.RWMutex = sync.RWMutex{}
writer.enableBcache = config.EnableBcache
writer.cacheAction = config.CacheAction
writer.fileCache = config.FileCache
writer.fileSize = config.FileSize
writer.cacheThreshold = config.CacheThreshold
writer.dirty = false
writer.allocateCache()
writer.limitManager = writer.ec.LimitManager
return
}
func (writer *Writer) String() string {
return fmt.Sprintf("Writer{address(%v),volName(%v),volType(%v),ino(%v),blockSize(%v),fileSize(%v),enableBcache(%v),cacheAction(%v),fileCache(%v),cacheThreshold(%v)},wConcurrency(%v)",
&writer, writer.volName, writer.volType, writer.ino, writer.blockSize, writer.fileSize, writer.enableBcache, writer.cacheAction, writer.fileCache, writer.cacheThreshold, writer.wConcurrency)
}
func (writer *Writer) WriteWithoutPool(ctx context.Context, offset int, data []byte) (size int, err error) {
// atomic.StoreInt32(&writer.idle, 0)
if writer == nil {
return 0, fmt.Errorf("writer is not opened yet")
}
log.LogDebugf("TRACE blobStore WriteWithoutPool Enter: ino(%v) offset(%v) len(%v) fileSize(%v)",
writer.ino, offset, len(data), writer.CacheFileSize())
if len(data) > MaxBufferSize || offset != writer.CacheFileSize() {
log.LogErrorf("TRACE blobStore WriteWithoutPool error,may be len(%v)>512MB,offset(%v)!=fileSize(%v)",
len(data), offset, writer.CacheFileSize())
err = syscall.EOPNOTSUPP
return
}
// write buffer
log.LogDebugf("TRACE blobStore WriteWithoutPool: ino(%v) offset(%v) len(%v)",
writer.ino, offset, len(data))
size, err = writer.doBufferWriteWithoutPool(ctx, data, offset)
return
}
func (writer *Writer) Write(ctx context.Context, offset int, data []byte, flags int) (size int, err error) {
// atomic.StoreInt32(&writer.idle, 0)
if writer == nil {
return 0, fmt.Errorf("writer is not opened yet")
}
log.LogDebugf("TRACE blobStore Write Enter: ino(%v) offset(%v) len(%v) flags&proto.FlagsAppend(%v) fileSize(%v)", writer.ino, offset, len(data), flags&proto.FlagsAppend, writer.CacheFileSize())
if len(data) > MaxBufferSize || flags&proto.FlagsAppend == 0 || offset != writer.CacheFileSize() {
log.LogErrorf("TRACE blobStore Write error,may be len(%v)>512MB,flags(%v)!=flagAppend,offset(%v)!=fileSize(%v)", len(data), flags&proto.FlagsAppend, offset, writer.CacheFileSize())
err = syscall.EOPNOTSUPP
return
}
// write buffer
log.LogDebugf("TRACE blobStore Write: ino(%v) offset(%v) len(%v) flags&proto.FlagsSyncWrite(%v)", writer.ino, offset, len(data), flags&proto.FlagsSyncWrite)
if flags&proto.FlagsSyncWrite == 0 {
size, err = writer.doBufferWrite(ctx, data, offset)
return
}
// parallel IO: write directly to ebs
size, err = writer.doParallelWrite(ctx, data, offset)
return
}
func (writer *Writer) doParallelWrite(ctx context.Context, data []byte, offset int) (size int, err error) {
log.LogDebugf("TRACE blobStore doDirectWrite: ino(%v) offset(%v) len(%v)", writer.ino, offset, len(data))
writer.Lock()
defer writer.Unlock()
wSlices := writer.prepareWriteSlice(offset, data)
log.LogDebugf("TRACE blobStore prepareWriteSlice: wSlices(%v)", wSlices)
sliceSize := len(wSlices)
writer.wg.Add(sliceSize)
writer.err = make(chan *wSliceErr, sliceSize)
pool := New(writer.wConcurrency, sliceSize)
defer pool.Close()
for _, wSlice := range wSlices {
pool.Execute(wSlice, func(param *rwSlice) {
writer.writeSlice(ctx, param, true)
})
}
writer.wg.Wait()
for i := 0; i < sliceSize; i++ {
if wErr := <-writer.err; wErr != nil {
log.LogErrorf("slice write error,ino(%v) fileoffset(%v) sliceSize(%v) err(%v)",
writer.ino, wErr.fileOffset, wErr.size, wErr.err)
return 0, wErr.err
}
}
close(writer.err)
// update meta
oeks := make([]proto.ObjExtentKey, 0)
for _, wSlice := range wSlices {
size += int(wSlice.size)
oeks = append(oeks, wSlice.objExtentKey)
}
log.LogDebugf("TRACE blobStore appendObjExtentKeys: oeks(%v)", oeks)
if err = writer.mw.AppendObjExtentKeys(writer.ino, oeks); err != nil {
log.LogErrorf("slice write error,meta append ebsc extent keys fail,ino(%v) fileOffset(%v) len(%v) err(%v)", writer.ino, offset, len(data), err)
return
}
atomic.AddUint64(&writer.fileSize, uint64(size))
for _, wSlice := range wSlices {
writer.cacheLevel2(wSlice)
}
return
}
func (writer *Writer) cacheLevel2(wSlice *rwSlice) {
if (writer.cacheAction == proto.RWCache && (wSlice.fileOffset+uint64(wSlice.size)) < uint64(writer.cacheThreshold)) || writer.fileCache {
buf := make([]byte, wSlice.size)
offSet := int(wSlice.fileOffset)
copy(buf, wSlice.Data)
go writer.asyncCache(writer.ino, offSet, buf)
}
}
func (writer *Writer) WriteFromReader(ctx context.Context, reader io.Reader, h hash.Hash) (size uint64, err error) {
var (
tmp = buf.ReadBufPool.Get().([]byte)
exec = NewExecutor(writer.wConcurrency)
leftToWrite int
)
defer buf.ReadBufPool.Put(tmp)
writer.fileOffset = 0
writer.err = make(chan *wSliceErr)
var oeksLock sync.RWMutex
oeks := make([]proto.ObjExtentKey, 0)
writeBuff := func() {
bufSize := len(writer.buf)
log.LogDebugf("writeBuff: bufSize(%v), leftToWrite(%v), err(%v)", bufSize, leftToWrite, err)
if bufSize == writer.blockSize || (leftToWrite == 0 && err == io.EOF) {
wSlice := &rwSlice{
fileOffset: uint64(writer.fileOffset - bufSize),
size: uint32(bufSize),
}
wSlice.Data = make([]byte, bufSize)
copy(wSlice.Data, writer.buf)
writer.buf = writer.buf[:0]
if (err == nil || err == io.EOF) && h != nil {
h.Write(wSlice.Data)
log.LogDebugf("writeBuff: bufSize(%v), md5", bufSize)
}
writer.wg.Add(1)
write := func() {
defer writer.wg.Done()
err := writer.writeSlice(ctx, wSlice, false)
if err != nil {
writer.Lock()
if len(writer.err) > 0 {
writer.Unlock()
return
}
wErr := &wSliceErr{
err: err,
fileOffset: wSlice.fileOffset,
size: wSlice.size,
}
writer.err <- wErr
writer.Unlock()
return
}
oeksLock.Lock()
oeks = append(oeks, wSlice.objExtentKey)
oeksLock.Unlock()
writer.cacheLevel2(wSlice)
}
exec.Run(write)
}
}
LOOP:
for {
position := 0
leftToWrite, err = reader.Read(tmp)
if err != nil && err != io.EOF {
return
}
for leftToWrite > 0 {
log.LogDebugf("WriteFromReader: leftToWrite(%v), err(%v)", leftToWrite, err)
writer.RLock()
errNum := len(writer.err)
writer.RUnlock()
if errNum > 0 {
break LOOP
}
freeSize := writer.blockSize - len(writer.buf)
writeSize := util.Min(leftToWrite, freeSize)
writer.buf = append(writer.buf, tmp[position:position+writeSize]...)
position += writeSize
leftToWrite -= writeSize
writer.fileOffset += writeSize
writer.dirty = true
writeBuff()
}
if err == io.EOF {
log.LogDebugf("WriteFromReader: EOF")
if len(writer.buf) > 0 {
writeBuff()
}
err = nil
writer.wg.Wait()
var wErr *wSliceErr
select {
case wErr = <-writer.err:
err = wErr.err
default:
}
if err != nil {
log.LogErrorf("slice write error,ino(%v) fileoffset(%v) sliceSize(%v) err(%v)", writer.ino, wErr.fileOffset, wErr.size, err)
}
break
}
}
log.LogDebugf("WriteFromReader before sort: %v", oeks)
sort.Slice(oeks, func(i, j int) bool {
return oeks[i].FileOffset < oeks[j].FileOffset
})
log.LogDebugf("WriteFromReader after sort: %v", oeks)
if err = writer.mw.AppendObjExtentKeys(writer.ino, oeks); err != nil {
log.LogErrorf("WriteFromReader error,meta append ebsc extent keys fail,ino(%v), err(%v)", writer.ino, err)
return
}
size = uint64(writer.fileOffset)
atomic.AddUint64(&writer.fileSize, size)
return
}
func (writer *Writer) doBufferWriteWithoutPool(ctx context.Context, data []byte, offset int) (size int, err error) {
log.LogDebugf("TRACE blobStore doBufferWriteWithoutPool Enter: ino(%v) offset(%v) len(%v)", writer.ino, offset, len(data))
writer.fileOffset = offset
dataSize := len(data)
position := 0
log.LogDebugf("TRACE blobStore doBufferWriteWithoutPool: ino(%v) writer.buf.len(%v) writer.blocksize(%v)", writer.ino, len(writer.buf), writer.blockSize)
writer.Lock()
defer writer.Unlock()
for dataSize > 0 {
freeSize := writer.blockSize - len(writer.buf)
if dataSize < freeSize {
freeSize = dataSize
}
log.LogDebugf("TRACE blobStore doBufferWriteWithoutPool: ino(%v) writer.fileSize(%v) writer.fileOffset(%v) position(%v) freeSize(%v)", writer.ino, writer.fileSize, writer.fileOffset, position, freeSize)
writer.buf = append(writer.buf, data[position:position+freeSize]...)
log.LogDebugf("TRACE blobStore doBufferWriteWithoutPool:ino(%v) writer.buf.len(%v)", writer.ino, len(writer.buf))
position += freeSize
dataSize -= freeSize
writer.fileOffset += freeSize
writer.dirty = true
if len(writer.buf) == writer.blockSize {
log.LogDebugf("TRACE blobStore doBufferWriteWithoutPool: ino(%v) writer.buf.len(%v) writer.blocksize(%v)", writer.ino, len(writer.buf), writer.blockSize)
writer.Unlock()
err = writer.flushWithoutPool(writer.ino, ctx, false)
writer.Lock()
if err != nil {
writer.buf = writer.buf[:len(writer.buf)-len(data)]
writer.fileOffset -= len(data)
return
}
}
}
size = len(data)
atomic.AddUint64(&writer.fileSize, uint64(size))
log.LogDebugf("TRACE blobStore doBufferWriteWithoutPool Exit: ino(%v) writer.fileSize(%v) writer.fileOffset(%v)", writer.ino, writer.fileSize, writer.fileOffset)
return size, nil
}
func (writer *Writer) doBufferWrite(ctx context.Context, data []byte, offset int) (size int, err error) {
log.LogDebugf("TRACE blobStore doBufferWrite Enter: ino(%v) offset(%v) len(%v)", writer.ino, offset, len(data))
writer.fileOffset = offset
dataSize := len(data)
position := 0
log.LogDebugf("TRACE blobStore doBufferWrite: ino(%v) writer.buf.len(%v) writer.blocksize(%v)", writer.ino, len(writer.buf), writer.blockSize)
writer.Lock()
defer writer.Unlock()
for dataSize > 0 {
freeSize := writer.blockSize - writer.blockPosition
if dataSize < freeSize {
freeSize = dataSize
}
log.LogDebugf("TRACE blobStore doBufferWrite: ino(%v) writer.fileSize(%v) writer.fileOffset(%v) writer.blockPosition(%v) position(%v) freeSize(%v)", writer.ino, writer.fileSize, writer.fileOffset, writer.blockPosition, position, freeSize)
copy(writer.buf[writer.blockPosition:], data[position:position+freeSize])
log.LogDebugf("TRACE blobStore doBufferWrite:ino(%v) writer.buf.len(%v)", writer.ino, len(writer.buf))
position += freeSize
writer.blockPosition += freeSize
dataSize -= freeSize
writer.fileOffset += freeSize
writer.dirty = true
if writer.blockPosition == writer.blockSize {
log.LogDebugf("TRACE blobStore doBufferWrite: ino(%v) writer.buf.len(%v) writer.blocksize(%v)", writer.ino, len(writer.buf), writer.blockSize)
writer.Unlock()
err = writer.flush(writer.ino, ctx, false)
writer.Lock()
if err != nil {
writer.buf = writer.buf[:writer.blockPosition-freeSize]
writer.fileOffset -= freeSize
writer.blockPosition -= freeSize
return
}
}
}
size = len(data)
atomic.AddUint64(&writer.fileSize, uint64(size))
log.LogDebugf("TRACE blobStore doBufferWrite Exit: ino(%v) writer.fileSize(%v) writer.fileOffset(%v)", writer.ino, writer.fileSize, writer.fileOffset)
return size, nil
}
func (writer *Writer) FlushWithoutPool(ino uint64, ctx context.Context) (err error) {
if writer == nil {
return
}
return writer.flushWithoutPool(ino, ctx, true)
}
func (writer *Writer) Flush(ino uint64, ctx context.Context) (err error) {
if writer == nil {
return
}
return writer.flush(ino, ctx, true)
}
func (writer *Writer) shouldCacheCfs() bool {
return writer.cacheAction == proto.RWCache
}
func (writer *Writer) prepareWriteSlice(offset int, data []byte) []*rwSlice {
size := len(data)
wSlices := make([]*rwSlice, 0)
wSliceCount := size / writer.blockSize
remainSize := size % writer.blockSize
for index := 0; index < wSliceCount; index++ {
offset := offset + index*writer.blockSize
wSlice := &rwSlice{
index: index,
fileOffset: uint64(offset),
size: uint32(writer.blockSize),
Data: data[index*writer.blockSize : (index+1)*writer.blockSize],
}
wSlices = append(wSlices, wSlice)
}
offset = offset + wSliceCount*writer.blockSize
if remainSize > 0 {
wSlice := &rwSlice{
index: wSliceCount,
fileOffset: uint64(offset),
size: uint32(remainSize),
Data: data[wSliceCount*writer.blockSize:],
}
wSlices = append(wSlices, wSlice)
}
return wSlices
}
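// Worked example (assuming blockSize = 8MB): a 20MB parallel write at offset 0 is split by
// prepareWriteSlice into three slices: two full 8MB slices at fileOffsets 0 and 8MB, plus a 4MB
// remainder slice at fileOffset 16MB.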
func (writer *Writer) writeSlice(ctx context.Context, wSlice *rwSlice, wg bool) (err error) {
if wg {
defer writer.wg.Done()
}
writer.limitManager.WriteAlloc(ctx, int(wSlice.size))
log.LogDebugf("TRACE blobStore,writeSlice to ebs. ino(%v) fileOffset(%v) len(%v)", writer.ino, wSlice.fileOffset, wSlice.size)
location, err := writer.ebsc.Write(ctx, writer.volName, wSlice.Data, wSlice.size)
if err != nil {
if wg {
writer.err <- &wSliceErr{err: err, fileOffset: wSlice.fileOffset, size: wSlice.size}
}
return err
}
log.LogDebugf("TRACE blobStore,location(%v)", location)
blobs := make([]proto.Blob, 0)
for _, info := range location.Blobs {
blob := proto.Blob{
MinBid: uint64(info.MinBid),
Count: uint64(info.Count),
Vid: uint64(info.Vid),
}
blobs = append(blobs, blob)
}
wSlice.objExtentKey = proto.ObjExtentKey{
Cid: uint64(location.ClusterID),
CodeMode: uint8(location.CodeMode),
Size: location.Size,
BlobSize: location.BlobSize,
Blobs: blobs,
BlobsLen: uint32(len(blobs)),
FileOffset: wSlice.fileOffset,
Crc: location.Crc,
}
log.LogDebugf("TRACE blobStore,objExtentKey(%v)", wSlice.objExtentKey)
if wg {
writer.err <- nil
}
return
}
func (writer *Writer) asyncCache(ino uint64, offset int, data []byte) {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("write-async-cache", err, bgTime, 1)
}()
log.LogDebugf("TRACE asyncCache Enter,fileOffset(%v) len(%v)", offset, len(data))
write, err := writer.ec.Write(ino, offset, data, proto.FlagsCache, nil)
log.LogDebugf("TRACE asyncCache Exit,write(%v) err(%v)", write, err)
}
func (writer *Writer) resetBufferWithoutPool() {
writer.buf = writer.buf[:0]
}
func (writer *Writer) resetBuffer() {
// writer.buf = writer.buf[:0]
writer.blockPosition = 0
}
func (writer *Writer) flushWithoutPool(inode uint64, ctx context.Context, flushFlag bool) (err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("blobstore-flush", err, bgTime, 1)
}()
log.LogDebugf("TRACE blobStore flushWithoutPool: ino(%v) buf-len(%v) flushFlag(%v)", inode, len(writer.buf), flushFlag)
writer.Lock()
defer func() {
writer.dirty = false
writer.Unlock()
}()
if len(writer.buf) == 0 || !writer.dirty {
return
}
bufferSize := len(writer.buf)
wSlice := &rwSlice{
fileOffset: uint64(writer.fileOffset - bufferSize),
size: uint32(bufferSize),
Data: writer.buf,
}
err = writer.writeSlice(ctx, wSlice, false)
if err != nil {
if flushFlag {
atomic.AddUint64(&writer.fileSize, -uint64(bufferSize))
}
return
}
oeks := make([]proto.ObjExtentKey, 0)
// update meta
oeks = append(oeks, wSlice.objExtentKey)
if err = writer.mw.AppendObjExtentKeys(writer.ino, oeks); err != nil {
log.LogErrorf("slice write error,meta append ebsc extent keys fail,ino(%v) fileOffset(%v) len(%v) err(%v)", inode, wSlice.fileOffset, wSlice.size, err)
return
}
writer.resetBufferWithoutPool()
writer.cacheLevel2(wSlice)
return
}
func (writer *Writer) flush(inode uint64, ctx context.Context, flushFlag bool) (err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("blobstore-flush", err, bgTime, 1)
}()
log.LogDebugf("TRACE blobStore flush: ino(%v) buf-len(%v) flushFlag(%v)", inode, len(writer.buf), flushFlag)
writer.Lock()
defer func() {
writer.dirty = false
writer.Unlock()
}()
if len(writer.buf) == 0 || !writer.dirty {
return
}
bufferSize := writer.blockPosition
wSlice := &rwSlice{
fileOffset: uint64(writer.fileOffset - bufferSize),
size: uint32(bufferSize),
Data: writer.buf,
}
err = writer.writeSlice(ctx, wSlice, false)
if err != nil {
if flushFlag {
atomic.AddUint64(&writer.fileSize, -uint64(bufferSize))
}
return
}
oeks := make([]proto.ObjExtentKey, 0)
// update meta
oeks = append(oeks, wSlice.objExtentKey)
if err = writer.mw.AppendObjExtentKeys(writer.ino, oeks); err != nil {
log.LogErrorf("slice write error,meta append ebsc extent keys fail,ino(%v) fileOffset(%v) len(%v) err(%v)", inode, wSlice.fileOffset, wSlice.size, err)
return
}
writer.resetBuffer()
writer.cacheLevel2(wSlice)
return
}
func (writer *Writer) CacheFileSize() int {
return int(atomic.LoadUint64(&writer.fileSize))
}
func (writer *Writer) FreeCache() {
if writer == nil {
return
}
if buf.CachePool == nil {
return
}
writer.once.Do(func() {
tmpBuf := writer.buf
writer.buf = nil
if tmpBuf != nil {
buf.CachePool.Put(tmpBuf)
}
})
}
func (writer *Writer) allocateCache() {
if buf.CachePool == nil {
return
}
writer.buf = buf.CachePool.Get()
}
package manager
import (
"container/list"
"context"
"math"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/data/wrapper"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
const (
runNow = 1
runLater = 2
gridHitLimitCnt = 1
girdCntOneSecond = 3
gridWindowTimeScope = 10
qosExpireTime = 20
qosReportMinGap = uint32(time.Second) / 2
defaultMagnifyFactor = 100
)
type UploadFlowInfoFunc func(clientInfo wrapper.SimpleClientInfo) error
type GridElement struct {
time time.Time
used uint64
limit uint64
buffer uint64
hitLimit bool
ID uint64
sync.RWMutex
}
type AllocElement struct {
used uint32
magnify uint32
future *util.Future
}
type LimitFactor struct {
factorType uint32
gridList *list.List
waitList *list.List
gidHitLimitCnt uint8
mgr *LimitManager
gridId uint64
magnify uint32
winBuffer uint64
lock sync.RWMutex
valAllocApply uint64
valAllocCommit uint64
valAllocLastApply uint64
valAllocLastCommit uint64
isSetLimitZero bool
}
func (factor *LimitFactor) getNeedByMagnify(allocCnt uint32, magnify uint32) uint64 {
if magnify == 0 {
return 0
}
if allocCnt > 1000 {
log.QosWriteDebugf("action[getNeedByMagnify] allocCnt %v", allocCnt)
magnify = defaultMagnifyFactor
}
need := uint64(allocCnt * magnify)
if factor.factorType == proto.FlowWriteType || factor.factorType == proto.FlowReadType {
if need > util.GB/8 {
need = util.GB / 8
}
}
return need
}
func (factor *LimitFactor) alloc(allocCnt uint32) (ret uint8, future *util.Future) {
log.QosWriteDebugf("action[alloc] type [%v] alloc [%v], tmp factor waitlist [%v] hitlimtcnt [%v] len [%v]", proto.QosTypeString(factor.factorType),
allocCnt, factor.waitList.Len(), factor.gidHitLimitCnt, factor.gridList.Len())
atomic.AddUint64(&factor.valAllocApply, uint64(allocCnt))
if !factor.mgr.enable {
// the used value does not need to be accurate here; the purpose is only to report usage to the master,
// and skipping the lock gives better performance as long as the used value stays above 0
gridEnd := factor.gridList.Back()
if gridEnd != nil {
grid := gridEnd.Value.(*GridElement)
// grid.used = grid.used+uint64(allocCnt)
atomic.AddUint64(&grid.used, uint64(allocCnt))
// atomic.CompareAndSwapUint64(&factor.valAllocApply, factor.valAllocApply, factor.valAllocApply+uint64(allocCnt))
}
return runNow, nil
}
type activeSt struct {
activeUpdate bool
needWait bool
}
activeState := &activeSt{}
defer func(active *activeSt) {
if !active.needWait {
factor.lock.RUnlock()
} else if !active.activeUpdate {
factor.lock.Unlock()
}
}(activeState)
factor.lock.RLock()
grid := factor.gridList.Back().Value.(*GridElement)
if factor.mgr.enable && (factor.waitList.Len() > 0 || atomic.LoadUint64(&grid.used)+uint64(allocCnt) > grid.limit+grid.buffer) {
factor.lock.RUnlock()
factor.lock.Lock()
activeState.needWait = true
future = util.NewFuture()
factor.waitList.PushBack(&AllocElement{
used: allocCnt,
future: future,
magnify: factor.magnify,
})
if !grid.hitLimit {
factor.gidHitLimitCnt++
// One second spans several grids. gidHitLimitCnt counts how many grids hit their limit in the latest second;
// if it exceeds the trigger threshold, ask the master to enlarge the factor limit.
// GetSimpleVolView will call back the simpleClient function to collect factor info and send it to the master.
if factor.gidHitLimitCnt >= factor.mgr.HitTriggerCnt {
tmpTime := time.Now()
if factor.mgr.lastReqTime.Add(time.Duration(factor.mgr.ReqPeriod) * time.Second).Before(tmpTime) {
factor.mgr.lastReqTime = tmpTime
log.QosWriteDebugf("CheckGrid factor [%v] unlock before active update simple vol view,gird id[%v] limit[%v] buffer [%v] used [%v]",
proto.QosTypeString(factor.factorType), grid.ID, grid.limit, grid.buffer, grid.used)
// Unlock must be called here; UpdateSimpleVolView will take the lock again.
grid.hitLimit = true
factor.lock.Unlock()
activeState.activeUpdate = true
go factor.mgr.WrapperUpdate(factor.mgr.simpleClient)
}
}
}
grid.hitLimit = true
return runLater, future
}
atomic.AddUint64(&grid.used, uint64(allocCnt))
// atomic.CompareAndSwapUint64(&grid.used, grid.used, grid.used+uint64(allocCnt))
return runNow, future
}
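// SetLimit applies the per-second limit and buffer pushed by the master to the newest grid,
// dividing them across the grids of one second and enforcing minimum values when the limit is zero.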
func (factor *LimitFactor) SetLimit(limitVal uint64, bufferVal uint64) {
log.QosWriteDebugf("action[SetLimit] factor type [%v] limitVal [%v] bufferVal [%v]", proto.QosTypeString(factor.factorType), limitVal, bufferVal)
var grid *GridElement
factor.mgr.lastTimeOfSetLimit = time.Now()
factor.lock.Lock()
defer func() {
factor.TryReleaseWaitList()
factor.lock.Unlock()
}()
if factor.gridList.Len() == 0 {
grid = &GridElement{
time: time.Now(),
limit: limitVal / girdCntOneSecond,
buffer: bufferVal / girdCntOneSecond,
ID: factor.gridId,
}
factor.gridId++
factor.gridList.PushBack(grid)
} else {
grid = factor.gridList.Back().Value.(*GridElement)
grid.buffer = bufferVal / girdCntOneSecond
grid.limit = limitVal / girdCntOneSecond
}
if grid.limit == 0 {
factor.isSetLimitZero = true
switch factor.factorType {
case proto.IopsReadType, proto.IopsWriteType:
grid.limit = proto.MinIopsLimit / girdCntOneSecond
if grid.limit == 0 {
grid.limit = 1
}
case proto.FlowReadType, proto.FlowWriteType:
grid.limit = proto.MinFLowLimit / girdCntOneSecond
if grid.limit == 0 {
grid.limit = 10 * util.KB
}
default:
// do nothing
}
} else {
factor.isSetLimitZero = false
}
grid = factor.gridList.Back().Value.(*GridElement)
log.QosWriteDebugf("action[SetLimit] factor type [%v] gird id %v limit %v buffer %v",
proto.QosTypeString(factor.factorType), grid.ID, grid.limit, grid.buffer)
}
// TryReleaseWaitList drains the wait list after the limit has been enlarged by the master.
// No extra locking is needed for concurrency: the caller owns the lock and will release it.
func (factor *LimitFactor) TryReleaseWaitList() {
gridIter := factor.gridList.Back()
tGrid := gridIter.Value.(*GridElement)
cnt := 0
for factor.waitList.Len() > 0 {
value := factor.waitList.Front()
ele := value.Value.(*AllocElement)
// log.LogDebugf("action[TryReleaseWaitList] type [%v] ele used [%v]", proto.QosTypeString(factor.factorType), ele.used)
for atomic.LoadUint64(&tGrid.used)+uint64(ele.used) > tGrid.limit+tGrid.buffer {
log.LogWarnf("action[TryReleaseWaitList] type [%v] new gird be used up.alloc in waitlist left cnt [%v],"+
"grid be allocated [%v] grid limit [%v] and buffer[%v], gird id:[%v], use pregrid size[%v]",
proto.QosTypeString(factor.factorType), factor.waitList.Len(), tGrid.used, tGrid.limit, tGrid.buffer,
tGrid.ID, uint32(tGrid.limit+tGrid.buffer-tGrid.used))
tUsed := atomic.LoadUint64(&tGrid.used)
val := tGrid.limit + tGrid.buffer - tUsed // unsigned subtraction may underflow; guarded by the check below
if tGrid.limit+tGrid.buffer > tUsed && ele.used >= uint32(val) { // not atomically protected; grid used may exceed limit plus buffer
ele.used -= uint32(val)
log.QosWriteDebugf("action[TryReleaseWaitList] type [%v] ele used reduce [%v] and left [%v]", proto.QosTypeString(factor.factorType), val, ele.used)
// atomic.AddUint64(&curGrid.used, tGrid.limit+ tGrid.buffer)
atomic.AddUint64(&tGrid.used, val)
}
cnt++
if gridIter.Prev() == nil || cnt >= girdCntOneSecond {
return
}
gridIter = gridIter.Prev()
tGrid = gridIter.Value.(*GridElement)
}
atomic.AddUint64(&tGrid.used, uint64(ele.used))
log.QosWriteDebugf("action[TryReleaseWaitList] type [%v] ele used [%v] consumed!", proto.QosTypeString(factor.factorType), ele.used)
ele.future.Respond(true, nil)
value = value.Next()
factor.waitList.Remove(factor.waitList.Front())
}
}
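// CheckGrid appends a fresh grid to the sliding window, drops grids that fall outside the
// window, and then tries to release waiters against the new capacity.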
func (factor *LimitFactor) CheckGrid() {
defer func() {
factor.lock.Unlock()
}()
factor.lock.Lock()
grid := factor.gridList.Back().Value.(*GridElement)
newGrid := &GridElement{
time: time.Now(),
limit: grid.limit,
used: 0,
buffer: grid.buffer,
ID: factor.gridId,
}
factor.gridId++
if factor.mgr.enable && factor.mgr.lastTimeOfSetLimit.Add(time.Second*qosExpireTime).Before(newGrid.time) {
log.LogWarnf("action[CheckGrid]. qos recv no command from master in long time, last time %v, grid time %v",
factor.mgr.lastTimeOfSetLimit, newGrid.time)
}
if factor.mgr.enable {
log.QosWriteDebugf("action[CheckGrid] factor type:[%v] gridlistLen:[%v] waitlistLen:[%v] hitlimitcnt:[%v] "+
"add new grid info girdid[%v] used:[%v] limit:[%v] buffer:[%v] time:[%v]",
proto.QosTypeString(factor.factorType), factor.gridList.Len(), factor.waitList.Len(), factor.gidHitLimitCnt,
newGrid.ID, newGrid.used, newGrid.limit, newGrid.buffer, newGrid.time)
}
factor.gridList.PushBack(newGrid)
for factor.gridList.Len() > gridWindowTimeScope*girdCntOneSecond {
firstGrid := factor.gridList.Front().Value.(*GridElement)
if firstGrid.hitLimit {
factor.gidHitLimitCnt--
if factor.mgr.enable {
log.QosWriteDebugf("action[CheckGrid] factor [%v] after minus gidHitLimitCnt:[%v]",
proto.QosTypeString(factor.factorType), factor.gidHitLimitCnt)
}
}
if factor.mgr.enable {
log.QosWriteDebugf("action[CheckGrid] type:[%v] remove oldest grid id[%v] info buffer:[%v] limit:[%v] used[%v] from gridlist",
proto.QosTypeString(factor.factorType), firstGrid.ID, firstGrid.buffer, firstGrid.limit, firstGrid.used)
}
factor.gridList.Remove(factor.gridList.Front())
}
factor.TryReleaseWaitList()
}
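// newLimitFactor creates a LimitFactor of the given type with an empty grid window and wait list.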
func newLimitFactor(mgr *LimitManager, factorType uint32) *LimitFactor {
limit := &LimitFactor{
mgr: mgr,
factorType: factorType,
waitList: list.New(),
gridList: list.New(),
magnify: defaultMagnifyFactor,
}
limit.SetLimit(0, 0)
return limit
}
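// LimitManager holds the per-type limit factors of a client and coordinates QoS reporting with the master.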
type LimitManager struct {
ID uint64
limitMap map[uint32]*LimitFactor
enable bool
simpleClient wrapper.SimpleClientInfo
exitCh chan struct{}
WrapperUpdate UploadFlowInfoFunc
ReqPeriod uint32
HitTriggerCnt uint8
lastReqTime time.Time
lastTimeOfSetLimit time.Time
isLastReqValid bool
once sync.Once
}
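// NewLimitManager builds a LimitManager with one LimitFactor per IOPS/flow type and starts the periodic grid check.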
func NewLimitManager(client wrapper.SimpleClientInfo) *LimitManager {
mgr := &LimitManager{
limitMap: make(map[uint32]*LimitFactor, 0),
enable: false, // assign from master
simpleClient: client,
HitTriggerCnt: gridHitLimitCnt,
ReqPeriod: 1,
}
mgr.limitMap[proto.IopsReadType] = newLimitFactor(mgr, proto.IopsReadType)
mgr.limitMap[proto.IopsWriteType] = newLimitFactor(mgr, proto.IopsWriteType)
mgr.limitMap[proto.FlowWriteType] = newLimitFactor(mgr, proto.FlowWriteType)
mgr.limitMap[proto.FlowReadType] = newLimitFactor(mgr, proto.FlowReadType)
mgr.ScheduleCheckGrid()
return mgr
}
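// GetWaitTotalSize sums the sizes of all allocations currently queued in the wait list.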
func (factor *LimitFactor) GetWaitTotalSize() (waitSize uint64) {
value := factor.waitList.Front()
for {
if value == nil {
break
}
ele := value.Value.(*AllocElement)
waitSize += uint64(ele.used)
value = value.Next()
}
return
}
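// CalcNeedByPow estimates how much quota to request from the master when requests are waiting.
// For flow types the wait-list size is added to used and need = 300MB * (used/300MB)^0.8, with a
// 128KB floor; for IOPS types the same power curve with a base of 300 is used.
// Rough illustration: used = 75MB gives need ≈ 300MB * 0.25^0.8 ≈ 99MB.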
func (limitManager *LimitManager) CalcNeedByPow(limitFactor *LimitFactor, used uint64) (need uint64) {
if limitFactor.waitList.Len() == 0 {
return 0
}
if limitFactor.factorType == proto.FlowReadType || limitFactor.factorType == proto.FlowWriteType {
used += limitFactor.GetWaitTotalSize()
if used < 128*util.KB {
need = 128 * util.KB
return
}
need = uint64(300 * util.MB * math.Pow(float64(used)/float64(300*util.MB), 0.8))
} else {
if used == 0 {
used = uint64(limitFactor.waitList.Len())
}
need = uint64(300 * math.Pow(float64(used)/float64(300), 0.8))
}
return
}
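// GetFlowInfo builds the per-factor usage report for the master; it returns false when neither
// this nor the previous round had activity worth reporting.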
func (limitManager *LimitManager) GetFlowInfo() (*proto.ClientReportLimitInfo, bool) {
info := &proto.ClientReportLimitInfo{
FactorMap: make(map[uint32]*proto.ClientLimitInfo, 0),
}
var (
validCliInfo bool
griCnt int
limit uint64
buffer uint64
)
for factorType, limitFactor := range limitManager.limitMap {
limitFactor.lock.RLock()
var reqUsed uint64
griCnt = 0
grid := limitFactor.gridList.Back()
grid = grid.Prev()
// reqUsed := limitFactor.valAllocLastCommit
for griCnt < limitFactor.gridList.Len()-1 {
reqUsed += atomic.LoadUint64(&grid.Value.(*GridElement).used)
limit += grid.Value.(*GridElement).limit
buffer += grid.Value.(*GridElement).buffer
griCnt++
// log.LogDebugf("action[GetFlowInfo] type [%v] grid id[%v] used %v limit %v buffer %v time %v sum_used %v sum_limit %v,len %v",
// proto.QosTypeString(factorType),
// grid.Value.(*GridElement).ID,
// grid.Value.(*GridElement).used,
// grid.Value.(*GridElement).limit,
// grid.Value.(*GridElement).buffer,
// grid.Value.(*GridElement).time,
// reqUsed,
// limit, limitFactor.gridList.Len())
if grid.Prev() == nil || griCnt >= girdCntOneSecond {
log.QosWriteDebugf("action[[GetFlowInfo] type [%v] grid count %v reqused %v list len %v",
proto.QosTypeString(factorType), griCnt, reqUsed, limitFactor.gridList.Len())
break
}
grid = grid.Prev()
}
if griCnt > 0 {
timeElapse := uint64(time.Second) * uint64(griCnt) / girdCntOneSecond
if timeElapse < uint64(qosReportMinGap) {
log.LogWarnf("action[GetFlowInfo] type [%v] timeElapse [%v] since last report",
proto.QosTypeString(limitFactor.factorType), timeElapse)
timeElapse = uint64(qosReportMinGap) // interval at which the vol view is fetched from the master; TODO: make this configurable
}
reqUsed = uint64(float64(reqUsed) / (float64(timeElapse) / float64(time.Second)))
}
factor := &proto.ClientLimitInfo{
Used: reqUsed,
Need: limitManager.CalcNeedByPow(limitFactor, reqUsed),
UsedLimit: limitFactor.gridList.Back().Value.(*GridElement).limit * girdCntOneSecond,
UsedBuffer: limitFactor.gridList.Back().Value.(*GridElement).buffer * girdCntOneSecond,
}
limitFactor.lock.RUnlock()
info.FactorMap[factorType] = factor
info.Host = wrapper.LocalIP
info.Status = proto.QosStateNormal
info.ID = limitManager.ID
if limitFactor.waitList.Len() > 0 ||
!limitFactor.isSetLimitZero ||
factor.Used|factor.Need > 0 {
log.QosWriteDebugf("action[GetFlowInfo] type [%v] len [%v] isSetLimitZero [%v] used [%v] need [%v]", proto.QosTypeString(limitFactor.factorType),
limitFactor.waitList.Len(), limitFactor.isSetLimitZero, factor.Used, factor.Need)
validCliInfo = true
}
if griCnt > 0 {
log.QosWriteDebugf("action[GetFlowInfo] type [%v] last commit[%v] report to master "+
"with simpleClient limit info [%v,%v,%v,%v],host [%v], "+
"status [%v] grid [%v, %v, %v]",
proto.QosTypeString(limitFactor.factorType), limitFactor.valAllocLastCommit,
factor.Used, factor.Need, factor.UsedBuffer, factor.UsedLimit, info.Host,
info.Status, grid.Value.(*GridElement).ID, grid.Value.(*GridElement).limit, grid.Value.(*GridElement).buffer)
}
}
lastValid := limitManager.isLastReqValid
limitManager.isLastReqValid = validCliInfo
limitManager.once.Do(func() {
validCliInfo = true
})
// If the client had no user requests in this and the previous round, do not report to the master.
if !lastValid && !validCliInfo {
return info, false
}
return info, true
}
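// ScheduleCheckGrid starts a background goroutine that advances the grid window for every
// factor several times per second and rolls the apply/commit counters roughly once per second.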
func (limitManager *LimitManager) ScheduleCheckGrid() {
go func() {
ticker := time.NewTicker(1000 / girdCntOneSecond * time.Millisecond)
defer func() {
ticker.Stop()
}()
var cnt uint64
for {
select {
case <-limitManager.exitCh:
return
case <-ticker.C:
cnt++
for factorType, limitFactor := range limitManager.limitMap {
limitFactor.CheckGrid()
if cnt%girdCntOneSecond == 0 {
log.QosWriteDebugf("action[ScheduleCheckGrid] type [%v] factor apply val:[%v] commit val:[%v]",
proto.QosTypeString(factorType), atomic.LoadUint64(&limitFactor.valAllocApply), atomic.LoadUint64(&limitFactor.valAllocCommit))
limitFactor.valAllocLastApply = atomic.LoadUint64(&limitFactor.valAllocApply)
limitFactor.valAllocLastCommit = atomic.LoadUint64(&limitFactor.valAllocCommit)
atomic.StoreUint64(&limitFactor.valAllocApply, 0)
atomic.StoreUint64(&limitFactor.valAllocCommit, 0)
}
}
}
}
}()
}
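// SetClientLimit applies the QoS settings pushed by the master: enable flag, hit trigger count,
// report period, per-factor limits and magnify factors.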
func (limitManager *LimitManager) SetClientLimit(limit *proto.LimitRsp2Client) {
if limit == nil {
log.LogErrorf("action[SetClientLimit] limit info is nil")
return
}
if limitManager.enable != limit.Enable {
log.LogWarnf("action[SetClientLimit] enable [%v]", limit.Enable)
}
limitManager.enable = limit.Enable
if limit.HitTriggerCnt > 0 {
log.LogWarnf("action[SetClientLimit] update to HitTriggerCnt [%v] from [%v]", limitManager.HitTriggerCnt, limit.HitTriggerCnt)
limitManager.HitTriggerCnt = limit.HitTriggerCnt
}
if limit.ReqPeriod > 0 {
log.LogWarnf("action[SetClientLimit] update to ReqPeriod [%v] from [%v]", limitManager.ReqPeriod, limit.ReqPeriod)
limitManager.ReqPeriod = limit.ReqPeriod
}
for factorType, clientLimitInfo := range limit.FactorMap {
limitManager.limitMap[factorType].SetLimit(clientLimitInfo.UsedLimit, clientLimitInfo.UsedBuffer)
}
for factorType, magnify := range limit.Magnify {
if magnify > 0 && magnify != limitManager.limitMap[factorType].magnify {
log.QosWriteDebugf("action[SetClientLimit] type [%v] update magnify [%v] to [%v]",
proto.QosTypeString(factorType), limitManager.limitMap[factorType].magnify, magnify)
limitManager.limitMap[factorType].magnify = magnify
}
}
}
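// ReadAlloc consumes one read IOPS token and size bytes of read flow, blocking when the QoS limit is hit.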
func (limitManager *LimitManager) ReadAlloc(ctx context.Context, size int) {
limitManager.WaitN(ctx, limitManager.limitMap[proto.IopsReadType], 1)
limitManager.WaitN(ctx, limitManager.limitMap[proto.FlowReadType], size)
}
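// WriteAlloc consumes one write IOPS token and size bytes of write flow, blocking when the QoS limit is hit.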
func (limitManager *LimitManager) WriteAlloc(ctx context.Context, size int) {
limitManager.WaitN(ctx, limitManager.limitMap[proto.IopsWriteType], 1)
limitManager.WaitN(ctx, limitManager.limitMap[proto.FlowWriteType], size)
}
// WaitN blocks until the allocation succeeds, an error is returned, or the context is done.
func (limitManager *LimitManager) WaitN(ctx context.Context, lim *LimitFactor, n int) (err error) {
var fut *util.Future
var ret uint8
if ret, fut = lim.alloc(uint32(n)); ret == runNow {
atomic.AddUint64(&lim.valAllocCommit, uint64(n))
log.QosWriteDebugf("action[WaitN] type [%v] return now waitlistlen [%v]", proto.QosTypeString(lim.factorType), lim.waitList.Len())
return nil
}
respCh, errCh := fut.AsyncResponse()
select {
case <-ctx.Done():
log.LogWarnf("action[WaitN] type [%v] ctx done return waitlistlen [%v]", proto.QosTypeString(lim.factorType), lim.waitList.Len())
return ctx.Err()
case err = <-errCh:
log.LogWarnf("action[WaitN] type [%v] err return waitlistlen [%v]", proto.QosTypeString(lim.factorType), lim.waitList.Len())
return
case <-respCh:
atomic.AddUint64(&lim.valAllocCommit, uint64(n))
log.QosWriteDebugf("action[WaitN] type [%v] return waitlistlen [%v]", proto.QosTypeString(lim.factorType), lim.waitList.Len())
return nil
// default:
}
}
func (limitManager *LimitManager) UpdateFlowInfo(limit *proto.LimitRsp2Client) {
limitManager.SetClientLimit(limit)
}
func (limitManager *LimitManager) SetClientID(id uint64) (err error) {
limitManager.ID = id
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package stream
import (
"fmt"
"sync"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/btree"
"github.com/cubefs/cubefs/util/log"
)
// ExtentRequest defines the struct for a request to read or write an extent.
type ExtentRequest struct {
FileOffset int
Size int
Data []byte
ExtentKey *proto.ExtentKey
}
// String returns the string format of the extent request.
func (er *ExtentRequest) String() string {
return fmt.Sprintf("FileOffset(%v) Size(%v) ExtentKey(%v)", er.FileOffset, er.Size, er.ExtentKey)
}
// NewExtentRequest returns a new extent request.
func NewExtentRequest(offset, size int, data []byte, ek *proto.ExtentKey) *ExtentRequest {
return &ExtentRequest{
FileOffset: offset,
Size: size,
Data: data,
ExtentKey: ek,
}
}
// ExtentCache defines the struct of the extent cache.
type ExtentCache struct {
sync.RWMutex
inode uint64
gen uint64 // generation number
size uint64 // size of the cache
root *btree.BTree
discard *btree.BTree
verSeq uint64
}
// NewExtentCache returns a new extent cache.
func NewExtentCache(inode uint64) *ExtentCache {
return &ExtentCache{
inode: inode,
root: btree.NewWithSize(8, 4),
discard: btree.NewWithSize(8, 4),
}
}
func (cache *ExtentCache) LogOutPut() {
cache.root.Ascend(func(bi btree.Item) bool {
ek := bi.(*proto.ExtentKey)
log.LogDebugf("ExtentCache update: local ino(%v) ek(%v)", cache.inode, ek)
return true
})
}
func (cache *ExtentCache) RefreshForce(inode uint64, getExtents GetExtentsFunc) error {
gen, size, extents, err := getExtents(inode)
if err != nil {
return err
}
// log.LogDebugf("Local ExtentCache before update: ino(%v) gen(%v) size(%v) extents(%v)", inode, cache.gen, cache.size, cache.List())
cache.update(gen, size, true, extents)
log.LogDebugf("Local ExtentCache after update: ino(%v) gen(%v) size(%v) extents(%v)", inode, cache.gen, cache.size, cache.List())
return nil
}
// Refresh refreshes the extent cache.
func (cache *ExtentCache) Refresh(inode uint64, getExtents GetExtentsFunc) error {
if cache.root.Len() > 0 {
return nil
}
gen, size, extents, err := getExtents(inode)
if err != nil {
return err
}
// log.LogDebugf("Local ExtentCache before update: ino(%v) gen(%v) size(%v) extents(%v)", inode, cache.gen, cache.size, cache.List())
cache.update(gen, size, false, extents)
log.LogDebugf("Local ExtentCache after update: ino(%v) gen(%v) size(%v)", inode, cache.gen, cache.size)
return nil
}
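// update replaces the cached extents with the given list when the generation advances, or unconditionally when forced.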
func (cache *ExtentCache) update(gen, size uint64, force bool, eks []proto.ExtentKey) {
cache.Lock()
defer cache.Unlock()
log.LogDebugf("ExtentCache update: ino(%v) cache.gen(%v) cache.size(%v) gen(%v) size(%v)", cache.inode, cache.gen, cache.size, gen, size)
if !force && cache.gen != 0 && cache.gen >= gen {
log.LogDebugf("ExtentCache update: no need to update, ino(%v) gen(%v) size(%v)", cache.inode, gen, size)
return
}
cache.gen = gen
cache.size = size
cache.root.Clear(false)
for _, ek := range eks {
extent := ek
log.LogDebugf("action[update] update cache replace or insert ek [%v]", ek.String())
cache.root.ReplaceOrInsert(&extent)
}
}
// SplitExtentKey splits the cached extent key that covers ekPivot, merging with sequential neighbors when possible.
func (cache *ExtentCache) SplitExtentKey(inodeID uint64, ekPivot *proto.ExtentKey) (err error) {
cache.Lock()
defer cache.Unlock()
// log.LogDebugf("before cache output")
// cache.LogOutPut()
// When doing the append, we do not care about the data after the file offset.
// Those data will be overwritten by the current extent anyway.
var ekFind *proto.ExtentKey
var ekLeft *proto.ExtentKey
var ekRight *proto.ExtentKey
cache.root.DescendLessOrEqual(ekPivot, func(i btree.Item) bool {
if ekFind == nil {
ekFind = i.(*proto.ExtentKey)
log.LogDebugf("action[ExtentCache.SplitExtentKey] inode %v ek [%v]", inodeID, ekFind)
return true
}
ekLeft = i.(*proto.ExtentKey)
log.LogDebugf("action[ExtentCache.SplitExtentKey] inode %v ekLeft [%v]", inodeID, ekLeft)
return false
})
cache.root.AscendGreaterThan(ekPivot, func(i btree.Item) bool {
ekRight = i.(*proto.ExtentKey)
log.LogDebugf("action[ExtentCache.SplitExtentKey] inode %v ekRight [%v]", inodeID, ekRight)
return false
})
if ekFind == nil {
err = fmt.Errorf("inode %v not found ek fileOff[%v] seq[%v]", inodeID, ekPivot.FileOffset, ekPivot.GetSeq())
return
}
ek := &proto.ExtentKey{}
*ek = *ekFind
cache.root.Delete(ekFind)
if nil != cache.root.Get(ekFind) {
log.LogDebugf("ExtentCache Delete: ino(%v) ek(%v) ", cache.inode, ekFind)
panic(nil)
}
log.LogDebugf("ExtentCache Delete: ino(%v) ek(%v) ", cache.inode, ekFind)
ek.AddModGen()
log.LogDebugf("action[SplitExtentKey] inode %v ek [%v] ekPivot [%v] ekLeft [%v]", inodeID, ek, ekPivot, ekLeft)
// begin
if ek.FileOffset == ekPivot.FileOffset {
ek.Size = ek.Size - ekPivot.Size
ek.FileOffset = ek.FileOffset + uint64(ekPivot.Size)
ek.ExtentOffset = ek.ExtentOffset + uint64(ekPivot.Size)
if ekLeft != nil && ekLeft.IsSequenceWithSameSeq(ekPivot) {
log.LogDebugf("SplitExtentKey.merge.begin. ekLeft %v and %v", ekLeft, ekPivot)
ekLeft.Size += ekPivot.Size
log.LogDebugf("action[SplitExtentKey] inode %v ek [%v], ekPivot[%v] ekLeft[%v]", inodeID, ek, ekPivot, ekLeft)
cache.root.ReplaceOrInsert(ekLeft)
cache.root.ReplaceOrInsert(ek)
cache.gen++
return
}
log.LogDebugf("action[SplitExtentKey] inode %v ek [%v]", inodeID, ek)
} else if ek.FileOffset+uint64(ek.Size) == ekPivot.FileOffset+uint64(ekPivot.Size) { // end
ek.Size = ek.Size - ekPivot.Size
log.LogDebugf("action[SplitExtentKey] inode %v ek [%v]", inodeID, ek)
if ekRight != nil && ekPivot.IsSequenceWithSameSeq(ekRight) {
cache.root.Delete(ekRight)
ekRight.FileOffset = ekPivot.FileOffset
ekRight.ExtentOffset = ekPivot.ExtentOffset
ekRight.Size += ekPivot.Size
cache.root.ReplaceOrInsert(ekRight)
cache.root.ReplaceOrInsert(ek)
log.LogDebugf("SplitExtentKey.merge.end. ek %v and %v", ekPivot, ekRight)
cache.gen++
return
}
} else {
newSize := uint32(ekPivot.FileOffset - ek.FileOffset) // middle
ekEnd := &proto.ExtentKey{
FileOffset: ekPivot.FileOffset + uint64(ekPivot.Size),
PartitionId: ek.PartitionId,
ExtentId: ek.ExtentId,
ExtentOffset: ek.ExtentOffset + uint64(newSize+ekPivot.Size),
Size: ek.Size - newSize - ekPivot.Size,
SnapInfo: &proto.ExtSnapInfo{
VerSeq: ek.GetSeq(),
ModGen: ek.GetModGen(),
},
}
log.LogDebugf("action[SplitExtentKey] inode %v add ekEnd [%v] after split size(%v,%v,%v)", inodeID, ekEnd, newSize, ekPivot.Size, ekEnd.Size)
cache.root.ReplaceOrInsert(ekEnd)
log.LogDebugf("ExtentCache ReplaceOrInsert: ino(%v) ek(%v) ", cache.inode, ekEnd)
ek.Size = newSize
}
cache.root.ReplaceOrInsert(ek)
cache.root.ReplaceOrInsert(ekPivot)
log.LogDebugf("action[SplitExtentKey] inode %v ek [%v], ekPivot[%v]", inodeID, ek, ekPivot)
cache.gen++
// log.LogDebugf("before cache output")
// cache.LogOutPut()
return
}
// Append appends an extent key.
func (cache *ExtentCache) Append(ek *proto.ExtentKey, sync bool) (discardExtents []proto.ExtentKey) {
log.LogDebugf("action[ExtentCache.Append] ek %v", ek)
ekEnd := ek.FileOffset + uint64(ek.Size)
lower := &proto.ExtentKey{FileOffset: ek.FileOffset}
upper := &proto.ExtentKey{FileOffset: ekEnd}
discard := make([]*proto.ExtentKey, 0)
cache.Lock()
defer cache.Unlock()
//cache.root.Descend(func(i btree.Item) bool {
// ek := i.(*proto.ExtentKey)
// // skip if the start offset matches with the given offset
// log.LogDebugf("action[Append.LoopPrint.Enter] inode %v ek [%v]", cache.inode, ek.String())
// return true
//})
// When doing the append, we do not care about the data after the file offset.
// Those data will be overwritten by the current extent anyway.
cache.root.AscendRange(lower, upper, func(i btree.Item) bool {
found := i.(*proto.ExtentKey)
discard = append(discard, found)
return true
})
// After deleting the data between lower and upper, we will do the append
for _, key := range discard {
cache.root.Delete(key)
log.LogDebugf("ExtentCache del: ino(%v) ek(%v) ", cache.inode, key)
if key.PartitionId != 0 && key.ExtentId != 0 && (key.PartitionId != ek.PartitionId || key.ExtentId != ek.ExtentId || ek.ExtentOffset != key.ExtentOffset) {
if sync || (ek.PartitionId == 0 && ek.ExtentId == 0) {
cache.discard.ReplaceOrInsert(key)
// log.LogDebugf("ExtentCache Append add to discard: ino(%v) ek(%v) discard(%v)", cache.inode, ek, key)
}
}
}
cache.root.ReplaceOrInsert(ek)
if sync {
cache.gen++
discardExtents = make([]proto.ExtentKey, 0, cache.discard.Len())
cache.discard.AscendRange(lower, upper, func(i btree.Item) bool {
found := i.(*proto.ExtentKey)
if found.PartitionId != ek.PartitionId || found.ExtentId != ek.ExtentId || found.ExtentOffset != ek.ExtentOffset {
discardExtents = append(discardExtents, *found)
}
return true
})
}
if ekEnd > cache.size {
cache.size = ekEnd
}
log.LogDebugf("ExtentCache Append: ino(%v) sync(%v) ek(%v) local discard(%v) discardExtents(%v), seq(%v)",
cache.inode, sync, ek, discard, discardExtents, ek.GetSeq())
return
}
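// RemoveDiscard deletes the given extent keys from the discard tree.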
func (cache *ExtentCache) RemoveDiscard(discardExtents []proto.ExtentKey) {
cache.Lock()
defer cache.Unlock()
for _, ek := range discardExtents {
cache.discard.Delete(&ek)
// log.LogDebugf("ExtentCache ClearDiscard: ino(%v) discard(%v)", cache.inode, ek)
}
}
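// TruncDiscard drops discard extents at or beyond the given size after a truncate.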
func (cache *ExtentCache) TruncDiscard(size uint64) {
cache.Lock()
defer cache.Unlock()
if size >= cache.size {
return
}
pivot := &proto.ExtentKey{FileOffset: size}
discardExtents := make([]proto.ExtentKey, 0, cache.discard.Len())
cache.discard.AscendGreaterOrEqual(pivot, func(i btree.Item) bool {
found := i.(*proto.ExtentKey)
discardExtents = append(discardExtents, *found)
return true
})
for _, key := range discardExtents {
cache.discard.Delete(&key)
}
log.LogDebugf("truncate ExtentCache discard: ino(%v) size(%v) discard(%v)", cache.inode, size, discardExtents)
}
// Max returns the max extent key in the cache.
func (cache *ExtentCache) Max() *proto.ExtentKey {
cache.RLock()
defer cache.RUnlock()
ek := cache.root.Max().(*proto.ExtentKey)
return ek
}
// Size returns the size of the cache.
func (cache *ExtentCache) Size() (size int, gen uint64) {
cache.RLock()
defer cache.RUnlock()
return int(cache.size), cache.gen
}
// SetSize sets the size of the cache.
func (cache *ExtentCache) SetSize(size uint64, sync bool) {
cache.Lock()
defer cache.Unlock()
cache.size = size
if sync {
cache.gen++
}
}
// List returns a list of the extents in the cache.
func (cache *ExtentCache) List() []*proto.ExtentKey {
cache.RLock()
root := cache.root.Clone()
cache.RUnlock()
extents := make([]*proto.ExtentKey, 0, root.Len())
root.Ascend(func(i btree.Item) bool {
ek := i.(*proto.ExtentKey)
extents = append(extents, ek)
return true
})
return extents
}
// Get returns the extent key based on the given offset.
func (cache *ExtentCache) Get(offset uint64) (ret *proto.ExtentKey) {
pivot := &proto.ExtentKey{FileOffset: offset}
cache.RLock()
defer cache.RUnlock()
cache.root.DescendLessOrEqual(pivot, func(i btree.Item) bool {
ek := i.(*proto.ExtentKey)
// log.LogDebugf("ExtentCache GetConnect: ino(%v) ek(%v) offset(%v)", cache.inode, ek, offset)
if offset >= ek.FileOffset && offset < ek.FileOffset+uint64(ek.Size) {
ret = ek
}
return false
})
return ret
}
// GetEndForAppendWrite returns the extent key whose end offset equals the given offset.
func (cache *ExtentCache) GetEndForAppendWrite(offset uint64, verSeq uint64, needCheck bool) (ret *proto.ExtentKey) {
pivot := &proto.ExtentKey{FileOffset: offset}
cache.RLock()
defer cache.RUnlock()
var lastExistEk *proto.ExtentKey
var lastExistEkTest *proto.ExtentKey
cache.root.DescendLessOrEqual(pivot, func(i btree.Item) bool {
ek := i.(*proto.ExtentKey)
// skip if the start offset matches with the given offset
if offset == ek.FileOffset {
lastExistEk = ek
return true
}
if offset == ek.FileOffset+uint64(ek.Size) {
if !needCheck || ek.GetSeq() == verSeq {
if int(ek.ExtentOffset)+int(ek.Size) >= util.ExtentSize {
log.LogDebugf("action[ExtentCache.GetEndForAppendWrite] inode %v req offset %v verseq %v not found, exist ek [%v]",
cache.inode, offset, verSeq, ek.String())
ret = nil
return false
}
// There should not be a file-sequential neighbor extent right after this one.
if lastExistEk != nil && ek.IsFileInSequence(lastExistEk) {
log.LogErrorf("action[ExtentCache.GetEndForAppendWrite] ek %v is InSequence exist sequence extent %v", ek, lastExistEk)
ret = nil
return false
}
log.LogDebugf("action[ExtentCache.GetEndForAppendWrite] inode %v offset %v verseq %v found,ek [%v] lastExistEk[%v], lastExistEkTest[%v]",
cache.inode, offset, verSeq, ek.String(), lastExistEk, lastExistEkTest)
ret = ek
} else {
log.LogDebugf("action[ExtentCache.GetEndForAppendWrite] inode %v req offset %v verseq %v not found, exist ek [%v]", cache.inode, offset, verSeq, ek.String())
}
return false
}
lastExistEkTest = ek
return true
})
return ret
}
// PrepareReadRequests classifies the incoming request.
func (cache *ExtentCache) PrepareReadRequests(offset, size int, data []byte) []*ExtentRequest {
requests := make([]*ExtentRequest, 0)
pivot := &proto.ExtentKey{FileOffset: uint64(offset)}
upper := &proto.ExtentKey{FileOffset: uint64(offset + size)}
start := offset
end := offset + size
cache.RLock()
defer cache.RUnlock()
lower := &proto.ExtentKey{}
cache.root.DescendLessOrEqual(pivot, func(i btree.Item) bool {
ek := i.(*proto.ExtentKey)
lower.FileOffset = ek.FileOffset
return false
})
cache.root.AscendRange(lower, upper, func(i btree.Item) bool {
ek := i.(*proto.ExtentKey)
ekStart := int(ek.FileOffset)
ekEnd := int(ek.FileOffset) + int(ek.Size)
log.LogDebugf("PrepareReadRequests: req[ino(%v) start(%v) end(%v)] ek[extentID(%v),FileOffset(Start(%v) End(%v))]",
cache.inode, start, end, ek.ExtentId, ekStart, ekEnd)
if start < ekStart {
if end <= ekStart {
return false
} else if end < ekEnd {
// add hole (start, ekStart)
req := NewExtentRequest(start, ekStart-start, data[start-offset:ekStart-offset], nil)
requests = append(requests, req)
// add non-hole (ekStart, end)
req = NewExtentRequest(ekStart, end-ekStart, data[ekStart-offset:end-offset], ek)
requests = append(requests, req)
start = end
return false
} else {
// add hole (start, ekStart)
req := NewExtentRequest(start, ekStart-start, data[start-offset:ekStart-offset], nil)
requests = append(requests, req)
// add non-hole (ekStart, ekEnd)
req = NewExtentRequest(ekStart, ekEnd-ekStart, data[ekStart-offset:ekEnd-offset], ek)
requests = append(requests, req)
start = ekEnd
return true
}
} else if start < ekEnd {
if end <= ekEnd {
// add non-hole (start, end)
req := NewExtentRequest(start, end-start, data[start-offset:end-offset], ek)
requests = append(requests, req)
start = end
return false
} else {
// add non-hole (start, ekEnd), start = ekEnd
req := NewExtentRequest(start, ekEnd-start, data[start-offset:ekEnd-offset], ek)
requests = append(requests, req)
start = ekEnd
return true
}
} else {
return true
}
})
log.LogDebugf("PrepareReadRequests: ino(%v) start(%v) end(%v)", cache.inode, start, end)
if start < end {
// add hole (start, end)
req := NewExtentRequest(start, end-start, data[start-offset:end-offset], nil)
requests = append(requests, req)
}
return requests
}
// PrepareWriteRequests classifies the incoming write request against the cached extents,
// splitting it into hole and non-hole extent requests.
func (cache *ExtentCache) PrepareWriteRequests(offset, size int, data []byte) []*ExtentRequest {
requests := make([]*ExtentRequest, 0)
pivot := &proto.ExtentKey{FileOffset: uint64(offset)}
upper := &proto.ExtentKey{FileOffset: uint64(offset + size)}
start := offset
end := offset + size
cache.RLock()
defer cache.RUnlock()
lower := &proto.ExtentKey{}
cache.root.DescendLessOrEqual(pivot, func(i btree.Item) bool {
ek := i.(*proto.ExtentKey)
lower.FileOffset = ek.FileOffset
log.LogDebugf("action[ExtentCache.PrepareWriteRequests] ek [%v], pivot[%v]", ek, pivot)
return false
})
cache.root.AscendRange(lower, upper, func(i btree.Item) bool {
ek := i.(*proto.ExtentKey)
ekStart := int(ek.FileOffset)
ekEnd := int(ek.FileOffset) + int(ek.Size)
log.LogDebugf("action[ExtentCache.PrepareWriteRequests]: ino(%v) start(%v) end(%v) ekStart(%v) ekEnd(%v)", cache.inode, start, end, ekStart, ekEnd)
if start <= ekStart {
if end <= ekStart {
return false
} else if end < ekEnd {
var req *ExtentRequest
if start < ekStart {
// add hole (start, ekStart)
req = NewExtentRequest(start, ekStart-start, data[start-offset:ekStart-offset], nil)
requests = append(requests, req)
}
// add non-hole (ekStart, end)
req = NewExtentRequest(ekStart, end-ekStart, data[ekStart-offset:end-offset], ek)
requests = append(requests, req)
start = end
return false
} else {
return true
}
} else if start < ekEnd {
if end <= ekEnd {
// add non-hole (start, end)
req := NewExtentRequest(start, end-start, data[start-offset:end-offset], ek)
requests = append(requests, req)
start = end
return false
} else {
// add non-hole (start, ekEnd), start = ekEnd
req := NewExtentRequest(start, ekEnd-start, data[start-offset:ekEnd-offset], ek)
requests = append(requests, req)
start = ekEnd
return true
}
} else {
return true
}
})
log.LogDebugf("PrepareWriteRequests: ino(%v) start(%v) end(%v)", cache.inode, start, end)
if start < end {
// add hole (start, end)
req := NewExtentRequest(start, end-start, data[start-offset:end-offset], nil)
requests = append(requests, req)
}
return requests
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package stream
import (
"container/list"
"context"
"fmt"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/data/manager"
"github.com/cubefs/cubefs/sdk/data/wrapper"
"github.com/cubefs/cubefs/sdk/meta"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
"golang.org/x/time/rate"
)
type (
SplitExtentKeyFunc func(parentInode, inode uint64, key proto.ExtentKey) error
AppendExtentKeyFunc func(parentInode, inode uint64, key proto.ExtentKey, discard []proto.ExtentKey) (int, error)
GetExtentsFunc func(inode uint64) (uint64, uint64, []proto.ExtentKey, error)
TruncateFunc func(inode, size uint64, fullPath string) error
EvictIcacheFunc func(inode uint64)
LoadBcacheFunc func(key string, buf []byte, offset uint64, size uint32) (int, error)
CacheBcacheFunc func(key string, buf []byte) error
EvictBacheFunc func(key string) error
)
const (
MaxMountRetryLimit = 6
MountRetryInterval = time.Second * 5
defaultReadLimitRate = rate.Inf
defaultReadLimitBurst = 128
defaultWriteLimitRate = rate.Inf
defaultWriteLimitBurst = 128
defaultStreamerLimit = 100000
defMaxStreamerLimit = 10000000
kHighWatermarkPct = 1.01
slowStreamerEvictNum = 10
fastStreamerEvictNum = 10000
)
var (
// global object pools for memory optimization
openRequestPool *sync.Pool
writeRequestPool *sync.Pool
flushRequestPool *sync.Pool
releaseRequestPool *sync.Pool
truncRequestPool *sync.Pool
evictRequestPool *sync.Pool
)
func init() {
// init object pools
openRequestPool = &sync.Pool{New: func() interface{} {
return &OpenRequest{}
}}
writeRequestPool = &sync.Pool{New: func() interface{} {
return &WriteRequest{}
}}
flushRequestPool = &sync.Pool{New: func() interface{} {
return &FlushRequest{}
}}
releaseRequestPool = &sync.Pool{New: func() interface{} {
return &ReleaseRequest{}
}}
truncRequestPool = &sync.Pool{New: func() interface{} {
return &TruncRequest{}
}}
evictRequestPool = &sync.Pool{New: func() interface{} {
return &EvictRequest{}
}}
}
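// ExtentConfig defines the options used to create an ExtentClient.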
type ExtentConfig struct {
Volume string
VolumeType int
Masters []string
FollowerRead bool
NearRead bool
Preload bool
ReadRate int64
WriteRate int64
BcacheEnable bool
BcacheDir string
MaxStreamerLimit int64
VerReadSeq uint64
OnAppendExtentKey AppendExtentKeyFunc
OnSplitExtentKey SplitExtentKeyFunc
OnGetExtents GetExtentsFunc
OnTruncate TruncateFunc
OnEvictIcache EvictIcacheFunc
OnLoadBcache LoadBcacheFunc
OnCacheBcache CacheBcacheFunc
OnEvictBcache EvictBacheFunc
DisableMetaCache bool
MinWriteAbleDataPartitionCnt int
}
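// MultiVerMgr tracks the volume's version list and the sequence numbers used for snapshot reads and writes.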
type MultiVerMgr struct {
verReadSeq uint64 // verSeq in config used as snapshot read
latestVerSeq uint64 // newest verSeq from master for datanode write to check
verList *proto.VolVersionInfoList
sync.RWMutex
}
// ExtentClient defines the struct of the extent client.
type ExtentClient struct {
streamers map[uint64]*Streamer
streamerList *list.List
streamerLock sync.Mutex
maxStreamerLimit int
readLimiter *rate.Limiter
writeLimiter *rate.Limiter
disableMetaCache bool
volumeType int
volumeName string
bcacheEnable bool
bcacheDir string
BcacheHealth bool
preload bool
LimitManager *manager.LimitManager
dataWrapper *wrapper.Wrapper
appendExtentKey AppendExtentKeyFunc
splitExtentKey SplitExtentKeyFunc
getExtents GetExtentsFunc
truncate TruncateFunc
evictIcache EvictIcacheFunc // May be null, must check before using
loadBcache LoadBcacheFunc
cacheBcache CacheBcacheFunc
evictBcache EvictBacheFunc
inflightL1cache sync.Map
inflightL1BigBlock int32
multiVerMgr *MultiVerMgr
}
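// UidIsLimited reports whether the given uid is currently rate-limited according to the data wrapper.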
func (client *ExtentClient) UidIsLimited(uid uint32) bool {
client.dataWrapper.UidLock.RLock()
defer client.dataWrapper.UidLock.RUnlock()
if uInfo, ok := client.dataWrapper.Uids[uid]; ok {
if uInfo.Limited {
log.LogDebugf("uid %v is limited", uid)
return true
}
}
log.LogDebugf("uid %v is not limited", uid)
return false
}
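// evictStreamer removes the streamer at the back of the eviction list; streamers that are still
// open are pushed back to the front instead of being dropped. It returns false only when the
// list is empty. The caller must hold streamerLock.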
func (client *ExtentClient) evictStreamer() bool {
// remove from list
item := client.streamerList.Back()
if item == nil {
return false
}
client.streamerList.Remove(item)
ino := item.Value.(uint64)
s, ok := client.streamers[ino]
if !ok {
return true
}
if s.isOpen {
client.streamerList.PushFront(ino)
return true
}
delete(s.client.streamers, s.inode)
return true
}
func (client *ExtentClient) batchEvictStramer(batchCnt int) {
client.streamerLock.Lock()
defer client.streamerLock.Unlock()
for cnt := 0; cnt < batchCnt; cnt++ {
ok := client.evictStreamer()
if !ok {
break
}
}
}
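// backgroundEvictStream periodically evicts idle streamers, in large batches above the high
// watermark and in small batches otherwise, until the count drops below maxStreamerLimit.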
func (client *ExtentClient) backgroundEvictStream() {
t := time.NewTicker(2 * time.Second)
for range t.C {
start := time.Now()
streamerSize := client.streamerList.Len()
highWatermark := int(float32(client.maxStreamerLimit) * kHighWatermarkPct)
for streamerSize > client.maxStreamerLimit {
// fast evict
if streamerSize > highWatermark {
client.batchEvictStramer(fastStreamerEvictNum)
} else {
client.batchEvictStramer(slowStreamerEvictNum)
}
streamerSize = client.streamerList.Len()
log.LogInfof("batch evict cnt(%d), cost(%d), now(%d)", 1, time.Since(start).Microseconds(), streamerSize)
}
log.LogInfof("streamer total cnt(%d), cost(%d) ns", streamerSize, time.Since(start).Nanoseconds())
}
}
// NewExtentClient returns a new extent client.
func NewExtentClient(config *ExtentConfig) (client *ExtentClient, err error) {
client = new(ExtentClient)
client.LimitManager = manager.NewLimitManager(client)
client.LimitManager.WrapperUpdate = client.UploadFlowInfo
limit := 0
retry:
client.dataWrapper, err = wrapper.NewDataPartitionWrapper(client, config.Volume, config.Masters, config.Preload, config.MinWriteAbleDataPartitionCnt, config.VerReadSeq)
if err != nil {
log.LogErrorf("NewExtentClient: new data partition wrapper failed: volume(%v) mayRetry(%v) err(%v)",
config.Volume, limit, err)
if strings.Contains(err.Error(), proto.ErrVolNotExists.Error()) {
return nil, proto.ErrVolNotExists
}
if limit >= MaxMountRetryLimit {
return nil, errors.Trace(err, "Init data wrapper failed!")
} else {
limit++
time.Sleep(MountRetryInterval * time.Duration(limit))
goto retry
}
}
client.streamers = make(map[uint64]*Streamer)
client.multiVerMgr = &MultiVerMgr{verList: &proto.VolVersionInfoList{}}
client.appendExtentKey = config.OnAppendExtentKey
client.splitExtentKey = config.OnSplitExtentKey
client.getExtents = config.OnGetExtents
client.truncate = config.OnTruncate
client.evictIcache = config.OnEvictIcache
client.dataWrapper.InitFollowerRead(config.FollowerRead)
client.dataWrapper.SetNearRead(config.NearRead)
client.loadBcache = config.OnLoadBcache
client.cacheBcache = config.OnCacheBcache
client.evictBcache = config.OnEvictBcache
client.volumeType = config.VolumeType
client.volumeName = config.Volume
client.bcacheEnable = config.BcacheEnable
client.bcacheDir = config.BcacheDir
client.multiVerMgr.verReadSeq = client.dataWrapper.GetReadVerSeq()
client.BcacheHealth = true
client.preload = config.Preload
client.disableMetaCache = config.DisableMetaCache
var readLimit, writeLimit rate.Limit
if config.ReadRate <= 0 {
readLimit = defaultReadLimitRate
} else {
readLimit = rate.Limit(config.ReadRate)
}
if config.WriteRate <= 0 {
writeLimit = defaultWriteLimitRate
} else {
writeLimit = rate.Limit(config.WriteRate)
}
client.readLimiter = rate.NewLimiter(readLimit, defaultReadLimitBurst)
client.writeLimiter = rate.NewLimiter(writeLimit, defaultWriteLimitBurst)
if config.MaxStreamerLimit <= 0 {
client.disableMetaCache = true
return
}
if config.MaxStreamerLimit <= defaultStreamerLimit {
client.maxStreamerLimit = defaultStreamerLimit
} else if config.MaxStreamerLimit > defMaxStreamerLimit {
client.maxStreamerLimit = defMaxStreamerLimit
} else {
client.maxStreamerLimit = int(config.MaxStreamerLimit)
}
client.maxStreamerLimit += fastStreamerEvictNum
log.LogInfof("max streamer limit %d", client.maxStreamerLimit)
client.streamerList = list.New()
go client.backgroundEvictStream()
return
}
func (client *ExtentClient) GetEnablePosixAcl() bool {
return client.dataWrapper.EnablePosixAcl
}
func (client *ExtentClient) GetFlowInfo() (*proto.ClientReportLimitInfo, bool) {
log.LogInfof("action[ExtentClient.GetFlowInfo]")
return client.LimitManager.GetFlowInfo()
}
func (client *ExtentClient) UpdateFlowInfo(limit *proto.LimitRsp2Client) {
log.LogInfof("action[UpdateFlowInfo.UpdateFlowInfo]")
client.LimitManager.SetClientLimit(limit)
return
}
func (client *ExtentClient) SetClientID(id uint64) (err error) {
client.LimitManager.ID = id
return
}
func (client *ExtentClient) GetVolumeName() string {
return client.volumeName
}
func (client *ExtentClient) GetLatestVer() uint64 {
return atomic.LoadUint64(&client.multiVerMgr.latestVerSeq)
}
func (client *ExtentClient) GetReadVer() uint64 {
return atomic.LoadUint64(&client.multiVerMgr.verReadSeq)
}
func (client *ExtentClient) GetVerMgr() *proto.VolVersionInfoList {
return client.multiVerMgr.verList
}
func (client *ExtentClient) UpdateLatestVer(verList *proto.VolVersionInfoList) (err error) {
verSeq := verList.GetLastVer()
log.LogDebugf("action[UpdateLatestVer] verSeq %v verList[%v] mgr seq %v", verSeq, verList, client.multiVerMgr.latestVerSeq)
if verSeq == 0 || verSeq <= atomic.LoadUint64(&client.multiVerMgr.latestVerSeq) {
return
}
client.multiVerMgr.Lock()
defer client.multiVerMgr.Unlock()
if verSeq <= atomic.LoadUint64(&client.multiVerMgr.latestVerSeq) {
return
}
log.LogDebugf("action[UpdateLatestVer] update verSeq [%v] to [%v]", client.multiVerMgr.latestVerSeq, verSeq)
atomic.StoreUint64(&client.multiVerMgr.latestVerSeq, verSeq)
client.multiVerMgr.verList = verList
client.streamerLock.Lock()
defer client.streamerLock.Unlock()
for _, streamer := range client.streamers {
if streamer.verSeq != verSeq {
log.LogDebugf("action[ExtentClient.UpdateLatestVer] stream inode %v ver %v try update to %v", streamer.inode, streamer.verSeq, verSeq)
oldVer := streamer.verSeq
streamer.verSeq = verSeq
streamer.extents.verSeq = verSeq
if err = streamer.GetExtentsForce(); err != nil {
log.LogErrorf("action[UpdateLatestVer] inode %v streamer %v", streamer.inode, streamer.verSeq)
streamer.verSeq = oldVer
streamer.extents.verSeq = oldVer
return err
}
atomic.StoreInt32(&streamer.needUpdateVer, 1)
log.LogDebugf("action[ExtentClient.UpdateLatestVer] finhsed stream inode %v ver update to %v", streamer.inode, verSeq)
}
}
return nil
}
// Open request shall grab the lock until request is sent to the request channel
func (client *ExtentClient) OpenStream(inode uint64) error {
client.streamerLock.Lock()
s, ok := client.streamers[inode]
if !ok {
s = NewStreamer(client, inode)
client.streamers[inode] = s
}
return s.IssueOpenRequest()
}
// Open request shall grab the lock until request is sent to the request channel
func (client *ExtentClient) OpenStreamWithCache(inode uint64, needBCache bool) error {
client.streamerLock.Lock()
s, ok := client.streamers[inode]
if !ok {
s = NewStreamer(client, inode)
client.streamers[inode] = s
if !client.disableMetaCache && needBCache {
client.streamerList.PushFront(inode)
}
}
s.needBCache = needBCache
if !s.isOpen && !client.disableMetaCache {
s.isOpen = true
log.LogDebugf("open stream again, ino(%v)", s.inode)
s.request = make(chan interface{}, 64)
s.pendingCache = make(chan bcacheKey, 1)
go s.server()
go s.asyncBlockCache()
}
return s.IssueOpenRequest()
}
// Release request shall grab the lock until request is sent to the request channel
func (client *ExtentClient) CloseStream(inode uint64) error {
client.streamerLock.Lock()
s, ok := client.streamers[inode]
if !ok {
client.streamerLock.Unlock()
return nil
}
return s.IssueReleaseRequest()
}
// Evict request shall grab the lock until request is sent to the request channel
func (client *ExtentClient) EvictStream(inode uint64) error {
client.streamerLock.Lock()
s, ok := client.streamers[inode]
if !ok {
client.streamerLock.Unlock()
return nil
}
if s.isOpen {
s.isOpen = false
err := s.IssueEvictRequest()
if err != nil {
return err
}
s.done <- struct{}{}
} else {
delete(s.client.streamers, s.inode)
s.client.streamerLock.Unlock()
}
return nil
}
// RefreshExtentsCache refreshes the extent cache.
func (client *ExtentClient) RefreshExtentsCache(inode uint64) error {
s := client.GetStreamer(inode)
if s == nil {
return nil
}
return s.GetExtents()
}
func (client *ExtentClient) ForceRefreshExtentsCache(inode uint64) error {
s := client.GetStreamer(inode)
if s == nil {
return nil
}
return s.GetExtentsForce()
}
// GetExtentCacheGen return extent generation
func (client *ExtentClient) GetExtentCacheGen(inode uint64) uint64 {
s := client.GetStreamer(inode)
if s == nil {
return 0
}
return s.extents.gen
}
func (client *ExtentClient) GetExtents(inode uint64) []*proto.ExtentKey {
s := client.GetStreamer(inode)
if s == nil {
return nil
}
return s.extents.List()
}
// FileSize returns the file size.
func (client *ExtentClient) FileSize(inode uint64) (size int, gen uint64, valid bool) {
s := client.GetStreamer(inode)
if s == nil {
return
}
valid = true
size, gen = s.extents.Size()
return
}
// SetFileSize set the file size.
func (client *ExtentClient) SetFileSize(inode uint64, size int) {
s := client.GetStreamer(inode)
if s != nil {
log.LogDebugf("SetFileSize: ino(%v) size(%v)", inode, size)
s.extents.SetSize(uint64(size), true)
}
}
// Write writes the data.
func (client *ExtentClient) Write(inode uint64, offset int, data []byte, flags int, checkFunc func() error) (write int, err error) {
prefix := fmt.Sprintf("Write{ino(%v)offset(%v)size(%v)}", inode, offset, len(data))
s := client.GetStreamer(inode)
if s == nil {
log.LogErrorf("Prefix(%v): stream is not opened yet", prefix)
return 0, syscall.EBADF
}
s.once.Do(func() {
// TODO unhandled error
s.GetExtents()
})
write, err = s.IssueWriteRequest(offset, data, flags, checkFunc)
if err != nil {
log.LogError(errors.Stack(err))
exporter.Warning(err.Error())
}
return
}
func (client *ExtentClient) Truncate(mw *meta.MetaWrapper, parentIno uint64, inode uint64, size int, fullPath string) error {
prefix := fmt.Sprintf("Truncate{ino(%v)size(%v)}", inode, size)
s := client.GetStreamer(inode)
if s == nil {
log.LogErrorf("Prefix(%v): stream is not opened yet", prefix)
return syscall.EBADF
}
var info *proto.InodeInfo
var err error
var oldSize uint64
if mw.EnableSummary {
info, err = mw.InodeGet_ll(inode)
if err == nil && info != nil {
oldSize = info.Size
}
}
err = s.IssueTruncRequest(size, fullPath)
if err != nil {
err = errors.Trace(err, prefix)
log.LogError(errors.Stack(err))
}
if mw.EnableSummary {
go mw.UpdateSummary_ll(parentIno, 0, 0, int64(size)-int64(oldSize))
}
return err
}
func (client *ExtentClient) Flush(inode uint64) error {
s := client.GetStreamer(inode)
if s == nil {
log.LogErrorf("Flush: stream is not opened yet, ino(%v)", inode)
return syscall.EBADF
}
return s.IssueFlushRequest()
}
func (client *ExtentClient) Read(inode uint64, data []byte, offset int, size int) (read int, err error) {
// log.LogErrorf("======> ExtentClient Read Enter, inode(%v), len(data)=(%v), offset(%v), size(%v).", inode, len(data), offset, size)
// t1 := time.Now()
if size == 0 {
return
}
s := client.GetStreamer(inode)
if s == nil {
log.LogErrorf("Read: stream is not opened yet, ino(%v) offset(%v) size(%v)", inode, offset, size)
return 0, syscall.EBADF
}
s.once.Do(func() {
s.GetExtents()
})
err = s.IssueFlushRequest()
if err != nil {
return
}
read, err = s.read(data, offset, size)
// log.LogErrorf("======> ExtentClient Read Exit, inode(%v), time[%v us].", inode, time.Since(t1).Microseconds())
return
}
func (client *ExtentClient) ReadExtent(inode uint64, ek *proto.ExtentKey, data []byte, offset int, size int) (read int, err error, isStream bool) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("read-extent", err, bgTime, 1)
}()
var reader *ExtentReader
var req *ExtentRequest
if size == 0 {
return
}
s := client.GetStreamer(inode)
if s == nil {
err = fmt.Errorf("Read: stream is not opened yet, ino(%v) ek(%v)", inode, ek)
return
}
err = s.IssueFlushRequest()
if err != nil {
return
}
reader, err = s.GetExtentReader(ek)
if err != nil {
return
}
needCache := false
cacheKey := util.GenerateKey(s.client.volumeName, s.inode, ek.FileOffset)
if _, ok := client.inflightL1cache.Load(cacheKey); !ok && client.shouldBcache() {
client.inflightL1cache.Store(cacheKey, true)
needCache = true
}
defer client.inflightL1cache.Delete(cacheKey)
// do cache.
if needCache {
// read full extent
buf := make([]byte, ek.Size)
req = NewExtentRequest(int(ek.FileOffset), int(ek.Size), buf, ek)
read, err = reader.Read(req)
if err != nil {
return
}
read = copy(data, req.Data[offset:offset+size])
if client.cacheBcache != nil {
buf := make([]byte, len(req.Data))
copy(buf, req.Data)
go func() {
log.LogDebugf("ReadExtent L2->L1 Enter cacheKey(%v),client.shouldBcache(%v),needCache(%v)", cacheKey, client.shouldBcache(), needCache)
if err := client.cacheBcache(cacheKey, buf); err != nil {
client.BcacheHealth = false
log.LogDebugf("ReadExtent L2->L1 failed, err(%v), set BcacheHealth to false.", err)
}
log.LogDebugf("ReadExtent L2->L1 Exit cacheKey(%v),client.BcacheHealth(%v),needCache(%v)", cacheKey, client.BcacheHealth, needCache)
}()
}
return
} else {
// read data by offset:size
req = NewExtentRequest(int(ek.FileOffset)+offset, size, data, ek)
ctx := context.Background()
s.client.readLimiter.Wait(ctx)
s.client.LimitManager.ReadAlloc(ctx, size)
isStream = true
read, err = reader.Read(req)
if err != nil {
return
}
read = copy(data, req.Data)
return
}
}
// GetStreamer returns the streamer.
func (client *ExtentClient) GetStreamer(inode uint64) *Streamer {
client.streamerLock.Lock()
defer client.streamerLock.Unlock()
s, ok := client.streamers[inode]
if !ok {
return nil
}
if !s.isOpen {
s.isOpen = true
s.request = make(chan interface{}, 64)
s.pendingCache = make(chan bcacheKey, 1)
go s.server()
go s.asyncBlockCache()
}
return s
}
func (client *ExtentClient) GetRate() string {
return fmt.Sprintf("read: %v\nwrite: %v\n", getRate(client.readLimiter), getRate(client.writeLimiter))
}
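// shouldBcache reports whether data may be written to the local block cache (enabled and healthy).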
func (client *ExtentClient) shouldBcache() bool {
return client.bcacheEnable && client.BcacheHealth
}
func getRate(lim *rate.Limiter) string {
val := int(lim.Limit())
if val > 0 {
return fmt.Sprintf("%v", val)
}
return "unlimited"
}
func (client *ExtentClient) SetReadRate(val int) string {
return setRate(client.readLimiter, val)
}
func (client *ExtentClient) SetWriteRate(val int) string {
return setRate(client.writeLimiter, val)
}
func setRate(lim *rate.Limiter, val int) string {
if val > 0 {
lim.SetLimit(rate.Limit(val))
return fmt.Sprintf("%v", val)
}
lim.SetLimit(rate.Inf)
return "unlimited"
}
func (client *ExtentClient) Close() error {
// release streamers
var inodes []uint64
client.streamerLock.Lock()
inodes = make([]uint64, 0, len(client.streamers))
for inode := range client.streamers {
inodes = append(inodes, inode)
}
client.streamerLock.Unlock()
for _, inode := range inodes {
_ = client.EvictStream(inode)
}
client.dataWrapper.Stop()
return nil
}
func (client *ExtentClient) AllocatePreLoadDataPartition(volName string, count int, capacity, ttl uint64, zones string) (err error) {
return client.dataWrapper.AllocatePreLoadDataPartition(volName, count, capacity, ttl, zones)
}
func (client *ExtentClient) CheckDataPartitionExsit(partitionID uint64) error {
_, err := client.dataWrapper.GetDataPartition(partitionID)
return err
}
func (client *ExtentClient) GetDataPartitionForWrite() error {
exclude := make(map[string]struct{})
_, err := client.dataWrapper.GetDataPartitionForWrite(exclude)
return err
}
func (client *ExtentClient) UpdateDataPartitionForColdVolume() error {
return client.dataWrapper.UpdateDataPartition()
}
func (client *ExtentClient) IsPreloadMode() bool {
return client.preload
}
func (client *ExtentClient) UploadFlowInfo(clientInfo wrapper.SimpleClientInfo) error {
return client.dataWrapper.UploadFlowInfo(clientInfo, false)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package stream
import (
"container/list"
"sync"
)
// DirtyExtentList defines the struct of the dirty extent list.
type DirtyExtentList struct {
sync.RWMutex
list *list.List
}
// NewDirtyExtentList returns a new DirtyExtentList instance.
func NewDirtyExtentList() *DirtyExtentList {
return &DirtyExtentList{
list: list.New(),
}
}
// Put puts a new extent handler into the dirty extent list.
func (dl *DirtyExtentList) Put(eh *ExtentHandler) {
dl.Lock()
defer dl.Unlock()
dl.list.PushBack(eh)
}
// Get gets the next element in the dirty extent list.
func (dl *DirtyExtentList) Get() *list.Element {
dl.RLock()
defer dl.RUnlock()
return dl.list.Front()
}
// Remove removes the element from the dirty extent list.
func (dl *DirtyExtentList) Remove(e *list.Element) {
dl.Lock()
defer dl.Unlock()
dl.list.Remove(e)
}
// Len returns the size of the dirty extent list.
func (dl *DirtyExtentList) Len() int {
dl.RLock()
defer dl.RUnlock()
return dl.list.Len()
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package stream
import (
"fmt"
"net"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/data/wrapper"
"github.com/cubefs/cubefs/sdk/meta"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
)
// State machines
const (
ExtentStatusOpen int32 = iota
ExtentStatusClosed
ExtentStatusRecovery
ExtentStatusError
)
var gExtentHandlerID = uint64(0)
// GetExtentHandlerID returns the extent handler ID.
func GetExtentHandlerID() uint64 {
return atomic.AddUint64(&gExtentHandlerID, 1)
}
// ExtentHandler defines the struct of the extent handler.
type ExtentHandler struct {
// Fields that are set at creation time and will not be changed afterwards.
stream *Streamer
id uint64 // extent handler id
inode uint64
fileOffset int
storeMode int
// Either open/closed/recovery/error.
// Can transit from one state to the next adjacent state ONLY.
status int32
// Created, filled and sent in Write.
packet *Packet
// Updated in *write* method ONLY.
size int
// Pending packets in sender and receiver.
// Does not involve the packet in open handler.
inflight int32
// For ExtentStore, the extent ID is assigned in the sender.
// For TinyStore, the extent ID is assigned in the receiver.
// Will not be changed once assigned.
extID int
// Allocated in the sender, and released in the receiver.
// Will not be changed.
conn *net.TCPConn
dp *wrapper.DataPartition
// Issue a signal to this channel when *inflight* hits zero.
// To wake up *waitForFlush*.
empty chan struct{}
// Created and updated in *receiver* ONLY.
// Not protected by a lock, therefore it can be used ONLY when there are no
// pending or new packets.
key *proto.ExtentKey
dirty bool // indicate if open handler is dirty.
// Created in receiver ONLY in recovery status.
// Will not be changed once assigned.
recoverHandler *ExtentHandler
// The stream writer gets the write requests and constructs the packets
// to be sent to the request channel.
// The *sender* gets the packets from the *request* channel, sends them to the
// corresponding data node, and then passes them on to the *reply* channel.
// The *receiver* gets the packets from the *reply* channel, waits for the
// reply from the data node, and then handles it (see the usage sketch after NewExtentHandler).
request chan *Packet
reply chan *Packet
// Signaled in stream writer ONLY to exit *receiver*.
doneReceiver chan struct{}
// Signaled in receiver ONLY to exit *sender*.
doneSender chan struct{}
// A version update requires allocating a new extent.
verUpdate chan uint64
appendLK sync.Mutex
lastKey proto.ExtentKey
}
// NewExtentHandler returns a new extent handler.
func NewExtentHandler(stream *Streamer, offset int, storeMode int, size int) *ExtentHandler {
// log.LogDebugf("NewExtentHandler stack(%v)", string(debug.Stack()))
eh := &ExtentHandler{
stream: stream,
id: GetExtentHandlerID(),
inode: stream.inode,
fileOffset: offset,
size: size,
storeMode: storeMode,
empty: make(chan struct{}, 1024),
request: make(chan *Packet, 1024),
reply: make(chan *Packet, 1024),
doneSender: make(chan struct{}),
doneReceiver: make(chan struct{}),
}
go eh.receiver()
go eh.sender()
return eh
}
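// exampleHandlerWriteFlush is a hedged usage sketch (illustrative only, not invoked by
// the SDK): it drives the sender/receiver pipeline described in the ExtentHandler
// comment above by buffering data through write(), flushing the pending packets, and
// finally tearing the handler down. It assumes offset continues exactly where the
// handler's buffered data ends, since hot volumes reject non-contiguous writes.
func exampleHandlerWriteFlush(eh *ExtentHandler, data []byte, offset int) error {
	if _, err := eh.write(data, offset, len(data), false); err != nil {
		return err
	}
	if err := eh.flush(); err != nil {
		return err
	}
	return eh.cleanup()
}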
// String returns the string format of the extent handler.
func (eh *ExtentHandler) String() string {
return fmt.Sprintf("ExtentHandler{ID(%v)Inode(%v)FileOffset(%v)Size(%v)StoreMode(%v)Status(%v)Dp(%v)Ver(%v)key(%v)lastKey(%v)}",
eh.id, eh.inode, eh.fileOffset, eh.size, eh.storeMode, eh.status, eh.dp, eh.stream.verSeq, eh.key, eh.lastKey)
}
func (eh *ExtentHandler) write(data []byte, offset, size int, direct bool) (ek *proto.ExtentKey, err error) {
var total, write int
status := eh.getStatus()
if status >= ExtentStatusClosed {
err = errors.NewErrorf("ExtentHandler Write: Full or Recover eh(%v) key(%v)", eh, eh.key)
return
}
var blksize int
if eh.storeMode == proto.TinyExtentType {
blksize = eh.stream.tinySizeLimit()
} else {
blksize = util.BlockSize
}
// If this write request is not contiguous and cannot be merged
// into the extent handler, just close it and return an error.
// In this case, the caller should try to create a new extent handler.
if proto.IsHot(eh.stream.client.volumeType) {
if eh.fileOffset+eh.size != offset || eh.size+size > util.ExtentSize ||
(eh.storeMode == proto.TinyExtentType && eh.size+size > blksize) {
err = errors.New("ExtentHandler: full or incontinuous")
return
}
}
for total < size {
if eh.packet == nil {
eh.packet = NewWritePacket(eh.inode, offset+total, eh.storeMode)
if direct {
eh.packet.Opcode = proto.OpSyncWrite
}
// log.LogDebugf("ExtentHandler Write: NewPacket, eh(%v) packet(%v)", eh, eh.packet)
}
packsize := int(eh.packet.Size)
write = util.Min(size-total, blksize-packsize)
if write > 0 {
copy(eh.packet.Data[packsize:packsize+write], data[total:total+write])
eh.packet.Size += uint32(write)
total += write
}
if int(eh.packet.Size) >= blksize {
eh.flushPacket()
}
}
eh.size += total
// This is just a local cache used to prepare write requests.
// The partition and extent have not been allocated yet.
ek = &proto.ExtentKey{
FileOffset: uint64(eh.fileOffset),
Size: uint32(eh.size),
}
return ek, nil
}
func (eh *ExtentHandler) sender() {
var err error
for {
select {
case packet := <-eh.request:
log.LogDebugf("ExtentHandler sender begin: eh(%v) packet(%v)", eh, packet)
if eh.getStatus() >= ExtentStatusRecovery {
log.LogWarnf("sender in recovery: eh(%v) packet(%v)", eh, packet)
eh.reply <- packet
continue
}
// Initialize dp, conn, and extID
if eh.dp == nil {
if err = eh.allocateExtent(); err != nil {
eh.setClosed()
eh.setRecovery()
// if dp is not specified and yet we failed, then error out.
// otherwise, just try to recover.
if eh.key == nil {
eh.setError()
log.LogErrorf("sender: eh(%v) err(%v)", eh, err)
} else {
log.LogWarnf("sender: eh(%v) err(%v)", eh, err)
}
eh.reply <- packet
continue
}
}
// For ExtentStore, calculate the extent offset.
// For TinyStore, the extent offset is always 0 in the request packet,
// and the reply packet tells the real extent offset.
extOffset := int(packet.KernelOffset) - eh.fileOffset
if eh.key != nil {
extOffset += int(eh.key.ExtentOffset)
}
// fill the packet according to the extent
packet.PartitionID = eh.dp.PartitionID
packet.ExtentType = uint8(eh.storeMode)
packet.ExtentID = uint64(eh.extID)
packet.ExtentOffset = int64(extOffset)
packet.Arg = ([]byte)(eh.dp.GetAllAddrs())
packet.ArgLen = uint32(len(packet.Arg))
packet.RemainingFollowers = uint8(len(eh.dp.Hosts) - 1)
if len(eh.dp.Hosts) == 1 {
packet.RemainingFollowers = 127
}
packet.StartT = time.Now().UnixNano()
log.LogDebugf("ExtentHandler sender: extent allocated, eh(%v) dp(%v) extID(%v) packet(%v)", eh, eh.dp, eh.extID, packet.GetUniqueLogId())
if err = packet.writeToConn(eh.conn); err != nil {
log.LogWarnf("sender writeTo: failed, eh(%v) err(%v) packet(%v)", eh, err, packet)
eh.setClosed()
eh.setRecovery()
}
eh.reply <- packet
case <-eh.doneSender:
eh.setClosed()
log.LogDebugf("sender: done, eh(%v) size(%v) ek(%v)", eh, eh.size, eh.key)
return
}
}
}
func (eh *ExtentHandler) receiver() {
for {
select {
case packet := <-eh.reply:
eh.processReply(packet)
case <-eh.doneReceiver:
log.LogDebugf("receiver done: eh(%v) size(%v) ek(%v)", eh, eh.size, eh.key)
return
}
}
}
func (eh *ExtentHandler) processReply(packet *Packet) {
defer func() {
if atomic.AddInt32(&eh.inflight, -1) <= 0 {
eh.empty <- struct{}{}
}
}()
status := eh.getStatus()
if status >= ExtentStatusError {
eh.discardPacket(packet)
log.LogErrorf("processReply discard packet: handler is in error status, inflight(%v) eh(%v) packet(%v)", atomic.LoadInt32(&eh.inflight), eh, packet)
return
} else if status >= ExtentStatusRecovery {
if err := eh.recoverPacket(packet); err != nil {
eh.discardPacket(packet)
log.LogErrorf("processReply discard packet: handler is in recovery status, inflight(%v) eh(%v) packet(%v) err(%v)", atomic.LoadInt32(&eh.inflight), eh, packet, err)
}
log.LogDebugf("processReply recover packet: handler is in recovery status, inflight(%v) from eh(%v) to recoverHandler(%v) packet(%v)", atomic.LoadInt32(&eh.inflight), eh, eh.recoverHandler, packet)
return
}
var verUpdate bool
reply := NewReply(packet.ReqID, packet.PartitionID, packet.ExtentID)
err := reply.ReadFromConnWithVer(eh.conn, proto.ReadDeadlineTime)
if err != nil {
eh.processReplyError(packet, err.Error())
return
}
if reply.VerSeq > atomic.LoadUint64(&eh.stream.verSeq) || (eh.key != nil && reply.VerSeq > eh.key.GetSeq()) {
log.LogDebugf("processReply.UpdateLatestVer update verseq according to data rsp from version %v to %v", eh.stream.verSeq, reply.VerSeq)
if err = eh.stream.client.UpdateLatestVer(&proto.VolVersionInfoList{VerList: reply.VerList}); err != nil {
eh.processReplyError(packet, err.Error())
return
}
if err = eh.appendExtentKey(); err != nil {
eh.processReplyError(packet, err.Error())
return
}
eh.key = nil
verUpdate = true
}
if reply.ResultCode != proto.OpOk {
if reply.ResultCode != proto.ErrCodeVersionOpError {
errmsg := fmt.Sprintf("reply NOK: reply(%v)", reply)
log.LogDebugf("processReply packet (%v) errmsg (%v)", packet, errmsg)
eh.processReplyError(packet, errmsg)
return
}
// todo(leonchang) need check safety
log.LogWarnf("processReply: get reply, eh(%v) packet(%v) reply(%v)", eh, packet, reply)
eh.stream.GetExtentsForce()
}
if !packet.isValidWriteReply(reply) {
errmsg := fmt.Sprintf("request and reply does not match: reply(%v)", reply)
eh.processReplyError(packet, errmsg)
return
}
if reply.CRC != packet.CRC {
errmsg := fmt.Sprintf("inconsistent CRC: reqCRC(%v) replyCRC(%v) reply(%v) ", packet.CRC, reply.CRC, reply)
eh.processReplyError(packet, errmsg)
return
}
eh.dp.RecordWrite(packet.StartT)
var extID, extOffset uint64
if eh.storeMode == proto.TinyExtentType {
extID = reply.ExtentID
extOffset = uint64(reply.ExtentOffset)
} else {
extID = packet.ExtentID
extOffset = packet.KernelOffset - uint64(eh.fileOffset)
}
fileOffset := uint64(eh.fileOffset)
if verUpdate {
fileOffset = reply.KernelOffset
}
if eh.key == nil || verUpdate {
eh.key = &proto.ExtentKey{
FileOffset: fileOffset,
PartitionId: packet.PartitionID,
ExtentId: extID,
ExtentOffset: extOffset,
Size: packet.Size,
SnapInfo: &proto.ExtSnapInfo{
VerSeq: reply.VerSeq,
},
}
} else {
eh.key.Size += packet.Size
}
proto.Buffers.Put(packet.Data)
packet.Data = nil
eh.dirty = true
return
}
func (eh *ExtentHandler) processReplyError(packet *Packet, errmsg string) {
eh.setClosed()
eh.setRecovery()
if err := eh.recoverPacket(packet); err != nil {
eh.discardPacket(packet)
log.LogErrorf("processReplyError discard packet: eh(%v) packet(%v) err(%v) errmsg(%v)", eh, packet, err, errmsg)
}
}
func (eh *ExtentHandler) flush() (err error) {
eh.flushPacket()
eh.waitForFlush()
err = eh.appendExtentKey()
if err != nil {
return
}
if eh.storeMode == proto.TinyExtentType {
eh.setClosed()
}
status := eh.getStatus()
if status >= ExtentStatusError {
err = errors.New(fmt.Sprintf("StreamWriter flush: extent handler in error status, eh(%v) size(%v)", eh, eh.size))
}
return
}
func (eh *ExtentHandler) cleanup() (err error) {
eh.doneSender <- struct{}{}
eh.doneReceiver <- struct{}{}
if eh.conn != nil {
conn := eh.conn
eh.conn = nil
// TODO unhandled error
if status := eh.getStatus(); status >= ExtentStatusRecovery {
StreamConnPool.PutConnect(conn, true)
} else {
StreamConnPool.PutConnect(conn, false)
}
}
return
}
// Can ONLY be called when the handler is no longer open.
func (eh *ExtentHandler) appendExtentKey() (err error) {
eh.appendLK.Lock()
defer eh.appendLK.Unlock()
if eh.key != nil {
if eh.dirty {
if proto.IsCold(eh.stream.client.volumeType) && eh.status == ExtentStatusError {
return
}
var (
discard []proto.ExtentKey
status int
)
ekey := *eh.key
doAppend := func() (err error) {
discard = eh.stream.extents.Append(&ekey, true)
status, err = eh.stream.client.appendExtentKey(eh.stream.parentInode, eh.inode, ekey, discard)
if atomic.LoadInt32(&eh.stream.needUpdateVer) > 0 {
if errUpdateExtents := eh.stream.GetExtentsForce(); errUpdateExtents != nil {
log.LogErrorf("action[appendExtentKey] inode %v GetExtents err %v errUpdateExtents %v", eh.stream.inode, err, errUpdateExtents)
return
}
}
if err == nil && len(discard) > 0 {
eh.stream.extents.RemoveDiscard(discard)
}
return
}
if err = doAppend(); err == nil {
eh.dirty = false
eh.lastKey = *eh.key
log.LogDebugf("action[appendExtentKey] status %v, needUpdateVer %v, eh{%v}", status, eh.stream.needUpdateVer, eh)
return
}
// Version numbers are synchronized asynchronously: the client updates its extent cache version first,
// before the key is written to the meta. The client version can therefore lag behind the meta version,
// which leads to partially inconsistent decisions.
// For example, if the version is unchanged on the client, an append write reuses the extent key and only
// changes its length. But if the meta has already moved to a new version, a new extent key has to be
// constructed to retry the operation (see the sketch after this function).
log.LogWarnf("action[appendExtentKey] status %v, handler %v, err %v", status, eh, err)
if status == meta.StatusConflictExtents &&
(atomic.LoadInt32(&eh.stream.needUpdateVer) > 0 || eh.stream.verSeq > 0) &&
eh.lastKey.PartitionId != 0 {
log.LogDebugf("action[appendExtentKey] do append again err %v, key %v", err, ekey)
if eh.lastKey.IsSameExtent(&ekey) &&
eh.lastKey.FileOffset == ekey.FileOffset &&
eh.lastKey.ExtentOffset == ekey.ExtentOffset &&
eh.lastKey.Size < ekey.Size {
ekey.FileOffset += uint64(eh.lastKey.Size)
ekey.ExtentOffset += uint64(eh.lastKey.Size)
ekey.Size -= eh.lastKey.Size
ekey.SetSeq(eh.stream.verSeq)
eh.lastKey = ekey
if err = doAppend(); err != nil {
eh.key = nil
eh.lastKey.PartitionId = 0
} else {
*eh.key = ekey
}
log.LogDebugf("action[appendExtentKey] do append again err %v, key %v", err, ekey)
}
}
} else {
/*
* Update the extent cache using the ek stored in the eh. This is
* indispensable because the ek in the extent cache might be a
* temporary one with dpid 0, especially when the current eh failed
* and a new eh was created to do the recovery.
*/
_ = eh.stream.extents.Append(eh.key, false)
}
}
if err == nil {
eh.dirty = false
} else {
log.LogErrorf("action[appendExtentKey] %v do append again err %v", eh, err)
eh.lastKey.PartitionId = 0
}
return
}
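// exampleAdjustConflictKey is a minimal sketch (illustrative only) of the conflict-retry
// adjustment described in appendExtentKey above: when the meta reports conflicting
// extents, only the tail that was appended after lastKey is retried under the current
// version sequence. It returns the adjusted key and whether such a retry is applicable.
func exampleAdjustConflictKey(lastKey, ekey proto.ExtentKey, verSeq uint64) (proto.ExtentKey, bool) {
	if !(lastKey.IsSameExtent(&ekey) &&
		lastKey.FileOffset == ekey.FileOffset &&
		lastKey.ExtentOffset == ekey.ExtentOffset &&
		lastKey.Size < ekey.Size) {
		return ekey, false
	}
	ekey.FileOffset += uint64(lastKey.Size)
	ekey.ExtentOffset += uint64(lastKey.Size)
	ekey.Size -= lastKey.Size
	ekey.SetSeq(verSeq)
	return ekey, true
}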
// This function is only meaningful when called from the stream writer's flush method,
// because at that point there are no new write requests.
func (eh *ExtentHandler) waitForFlush() {
if atomic.LoadInt32(&eh.inflight) <= 0 {
return
}
// t := time.NewTicker(10 * time.Second)
// defer t.Stop()
for {
select {
case <-eh.empty:
if atomic.LoadInt32(&eh.inflight) <= 0 {
return
}
// case <-t.C:
// if atomic.LoadInt32(&eh.inflight) <= 0 {
// return
// }
}
}
}
func (eh *ExtentHandler) recoverPacket(packet *Packet) error {
packet.errCount++
if packet.errCount >= MaxPacketErrorCount || proto.IsCold(eh.stream.client.volumeType) {
return errors.New(fmt.Sprintf("recoverPacket failed: reach max error limit, eh(%v) packet(%v)", eh, packet))
}
handler := eh.recoverHandler
if handler == nil {
// Always use the normal extent store mode for recovery,
// because tiny extent files are limited and tiny store
// failures might be due to a lack of available tiny extent files.
handler = NewExtentHandler(eh.stream, int(packet.KernelOffset), proto.NormalExtentType, 0)
handler.setClosed()
}
handler.pushToRequest(packet)
if eh.recoverHandler == nil {
eh.recoverHandler = handler
// Note: put it into the dirty list after the packet is sent, so that
// this handler is not skipped during flush.
eh.stream.dirtylist.Put(handler)
}
return nil
}
func (eh *ExtentHandler) discardPacket(packet *Packet) {
proto.Buffers.Put(packet.Data)
packet.Data = nil
eh.setError()
}
func (eh *ExtentHandler) allocateExtent() (err error) {
var (
dp *wrapper.DataPartition
conn *net.TCPConn
extID int
)
log.LogDebugf("ExtentHandler allocateExtent enter: eh(%v)", eh)
exclude := make(map[string]struct{})
for i := 0; i < MaxSelectDataPartitionForWrite; i++ {
if eh.key == nil {
if dp, err = eh.stream.client.dataWrapper.GetDataPartitionForWrite(exclude); err != nil {
log.LogWarnf("allocateExtent: failed to get write data partition, eh(%v) exclude(%v), clear exclude and try again!", eh, exclude)
exclude = make(map[string]struct{})
continue
}
extID = 0
if eh.storeMode == proto.NormalExtentType {
extID, err = eh.createExtent(dp)
}
if err != nil {
log.LogWarnf("allocateExtent: exclude dp[%v] for write caused by create extent failed, eh(%v) err(%v) exclude(%v)",
dp, eh, err, exclude)
eh.stream.client.dataWrapper.RemoveDataPartitionForWrite(dp.PartitionID)
dp.CheckAllHostsIsAvail(exclude)
continue
}
} else {
if dp, err = eh.stream.client.dataWrapper.GetDataPartition(eh.key.PartitionId); err != nil {
log.LogWarnf("allocateExtent: failed to get write data partition, eh(%v)", eh)
break
}
extID = int(eh.key.ExtentId)
}
if conn, err = StreamConnPool.GetConnect(dp.Hosts[0]); err != nil {
log.LogWarnf("allocateExtent: failed to create connection, eh(%v) err(%v) dp(%v) exclude(%v)",
eh, err, dp, exclude)
// If storeMode is TinyExtentType and the connection cannot be created, we also check the host status.
dp.CheckAllHostsIsAvail(exclude)
if eh.key != nil {
break
}
continue
}
// success
eh.dp = dp
eh.conn = conn
eh.extID = extID
// log.LogDebugf("ExtentHandler allocateExtent exit: eh(%v) dp(%v) extID(%v)", eh, dp, extID)
return nil
}
errmsg := "allocateExtent failed: hit max retry limit"
if err != nil {
err = errors.Trace(err, errmsg)
} else {
err = errors.New(errmsg)
}
return err
}
func (eh *ExtentHandler) createConnection(dp *wrapper.DataPartition) (*net.TCPConn, error) {
conn, err := net.DialTimeout("tcp", dp.Hosts[0], time.Second)
if err != nil {
return nil, err
}
connect := conn.(*net.TCPConn)
// TODO unhandled error
connect.SetKeepAlive(true)
connect.SetNoDelay(true)
return connect, nil
}
func (eh *ExtentHandler) createExtent(dp *wrapper.DataPartition) (extID int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("createExtent", err, bgTime, 1)
}()
conn, err := StreamConnPool.GetConnect(dp.Hosts[0])
if err != nil {
return extID, errors.Trace(err, "createExtent: failed to create connection, eh(%v) datapartionHosts(%v)", eh, dp.Hosts[0])
}
defer func() {
if err != nil {
StreamConnPool.PutConnect(conn, true)
} else {
StreamConnPool.PutConnect(conn, false)
}
}()
p := NewCreateExtentPacket(dp, eh.inode)
if err = p.WriteToConn(conn); err != nil {
return extID, errors.Trace(err, "createExtent: failed to WriteToConn, packet(%v) datapartionHosts(%v)", p, dp.Hosts[0])
}
if err = p.ReadFromConnWithVer(conn, proto.ReadDeadlineTime*2); err != nil {
return extID, errors.Trace(err, "createExtent: failed to ReadFromConn, packet(%v) datapartionHosts(%v)", p, dp.Hosts[0])
}
if p.ResultCode != proto.OpOk {
return extID, errors.New(fmt.Sprintf("createExtent: ResultCode NOK, packet(%v) datapartionHosts(%v) ResultCode(%v)", p, dp.Hosts[0], p.GetResultMsg()))
}
extID = int(p.ExtentID)
if extID <= 0 {
return extID, errors.New(fmt.Sprintf("createExtent: illegal extID(%v) from (%v)", extID, dp.Hosts[0]))
}
return extID, nil
}
// Handler lock is held by the caller.
func (eh *ExtentHandler) flushPacket() {
if eh.packet == nil {
return
}
eh.pushToRequest(eh.packet)
eh.packet = nil
}
func (eh *ExtentHandler) pushToRequest(packet *Packet) {
// Increase before sending the packet, because inflight is used
// to determine if the handler has finished.
atomic.AddInt32(&eh.inflight, 1)
eh.request <- packet
}
func (eh *ExtentHandler) getStatus() int32 {
return atomic.LoadInt32(&eh.status)
}
func (eh *ExtentHandler) setClosed() bool {
// log.LogDebugf("action[ExtentHandler.setClosed] stack (%v)", string(debug.Stack()))
return atomic.CompareAndSwapInt32(&eh.status, ExtentStatusOpen, ExtentStatusClosed)
}
func (eh *ExtentHandler) setRecovery() bool {
// log.LogDebugf("action[ExtentHandler.setRecovery] stack (%v)", string(debug.Stack()))
return atomic.CompareAndSwapInt32(&eh.status, ExtentStatusClosed, ExtentStatusRecovery)
}
func (eh *ExtentHandler) setError() bool {
// log.LogDebugf("action[ExtentHandler.setError] stack (%v)", string(debug.Stack()))
if proto.IsHot(eh.stream.client.volumeType) {
atomic.StoreInt32(&eh.stream.status, StreamerError)
}
return atomic.CompareAndSwapInt32(&eh.status, ExtentStatusRecovery, ExtentStatusError)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package stream
import (
"fmt"
"hash/crc32"
"net"
"strings"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/data/wrapper"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
// ExtentReader defines the struct of the extent reader.
type ExtentReader struct {
inode uint64
key *proto.ExtentKey
dp *wrapper.DataPartition
followerRead bool
retryRead bool
}
// NewExtentReader returns a new extent reader.
func NewExtentReader(inode uint64, key *proto.ExtentKey, dp *wrapper.DataPartition, followerRead bool, retryRead bool) *ExtentReader {
return &ExtentReader{
inode: inode,
key: key,
dp: dp,
followerRead: followerRead,
retryRead: retryRead,
}
}
// String returns the string format of the extent reader.
func (reader *ExtentReader) String() (m string) {
return fmt.Sprintf("inode (%v) extentKey(%v)", reader.inode,
reader.key.Marshal())
}
// Read reads the extent request.
func (reader *ExtentReader) Read(req *ExtentRequest) (readBytes int, err error) {
offset := req.FileOffset - int(reader.key.FileOffset) + int(reader.key.ExtentOffset)
size := req.Size
reqPacket := NewReadPacket(reader.key, offset, size, reader.inode, req.FileOffset, reader.followerRead)
sc := NewStreamConn(reader.dp, reader.followerRead)
log.LogDebugf("ExtentReader Read enter: size(%v) req(%v) reqPacket(%v)", size, req, reqPacket)
err = sc.Send(&reader.retryRead, reqPacket, func(conn *net.TCPConn) (error, bool) {
readBytes = 0
for readBytes < size {
replyPacket := NewReply(reqPacket.ReqID, reader.dp.PartitionID, reqPacket.ExtentID)
bufSize := util.Min(util.ReadBlockSize, size-readBytes)
replyPacket.Data = req.Data[readBytes : readBytes+bufSize]
e := replyPacket.readFromConn(conn, proto.ReadDeadlineTime)
if e != nil {
log.LogWarnf("Extent Reader Read: failed to read from connect, ino(%v) req(%v) readBytes(%v) err(%v)", reader.inode, reqPacket, readBytes, e)
// Upon receiving TryOtherAddrError, other hosts will be retried.
return TryOtherAddrError, false
}
if replyPacket.ResultCode == proto.OpAgain {
return nil, true
}
e = reader.checkStreamReply(reqPacket, replyPacket)
if e != nil {
log.LogWarnf("checkStreamReply failed:(%v) reply msg:(%v)", e, replyPacket.GetResultMsg())
// Don't change the error message, since the caller will
// check whether it is NotLeaderErr.
return e, false
}
readBytes += int(replyPacket.Size)
}
return nil, false
})
if err != nil {
// if it is a cold volume and the cache is invalid
if !reader.retryRead && (err == TryOtherAddrError || strings.Contains(err.Error(), "ExistErr")) {
log.LogWarnf("Extent Reader Read: err(%v) req(%v) reqPacket(%v)", err, req, reqPacket)
} else {
log.LogErrorf("Extent Reader Read: err(%v) req(%v) reqPacket(%v)", err, req, reqPacket)
}
}
log.LogDebugf("ExtentReader Read exit: req(%v) reqPacket(%v) readBytes(%v) err(%v)", req, reqPacket, readBytes, err)
return
}
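// exampleReadWholeExtent is a minimal sketch (assumption: the reader was obtained from
// Streamer.GetExtentReader for the same extent key): it reads one whole extent into a
// freshly allocated buffer and returns only the bytes actually read.
func exampleReadWholeExtent(reader *ExtentReader, ek *proto.ExtentKey) ([]byte, error) {
	data := make([]byte, ek.Size)
	req := NewExtentRequest(int(ek.FileOffset), int(ek.Size), data, ek)
	readBytes, err := reader.Read(req)
	if err != nil {
		return nil, err
	}
	return data[:readBytes], nil
}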
func (reader *ExtentReader) checkStreamReply(request *Packet, reply *Packet) (err error) {
if reply.ResultCode == proto.OpTryOtherAddr {
return TryOtherAddrError
}
if reply.ResultCode != proto.OpOk {
if request.Opcode == proto.OpStreamFollowerRead {
log.LogWarnf("checkStreamReply: ResultCode(%v) NOK, OpStreamFollowerRead return TryOtherAddrError, "+
"req(%v) reply(%v)", reply.GetResultMsg(), request, reply)
return TryOtherAddrError
}
err = errors.New(fmt.Sprintf("checkStreamReply: ResultCode(%v) NOK", reply.GetResultMsg()))
return
}
if !request.isValidReadReply(reply) {
err = errors.New(fmt.Sprintf("checkStreamReply: inconsistent req and reply, req(%v) reply(%v)", request, reply))
return
}
expectCrc := crc32.ChecksumIEEE(reply.Data[:reply.Size])
if reply.CRC != expectCrc {
err = errors.New(fmt.Sprintf("checkStreamReply: inconsistent CRC, expectCRC(%v) replyCRC(%v)", expectCrc, reply.CRC))
return
}
return nil
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package stream
import (
"encoding/binary"
"fmt"
"hash/crc32"
"io"
"net"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/data/wrapper"
"github.com/cubefs/cubefs/util"
)
// Packet defines a wrapper of the packet in proto.
type Packet struct {
proto.Packet
inode uint64
errCount int
}
// String returns the string format of the packet.
func (p *Packet) String() string {
return fmt.Sprintf("ReqID(%v)Op(%v)Inode(%v)FileOffset(%v)Size(%v)PartitionID(%v)ExtentID(%v)ExtentOffset(%v)CRC(%v)ResultCode(%v:%v)Seq(%v)",
p.ReqID, p.GetOpMsg(), p.inode, p.KernelOffset, p.Size, p.PartitionID, p.ExtentID, p.ExtentOffset, p.CRC, p.ResultCode, p.GetResultMsg(), p.VerSeq)
}
func NewWriteTinyDirectly(inode uint64, dpID uint64, offset int, dp *wrapper.DataPartition) *Packet {
reqPacket := NewWritePacket(inode, offset, proto.TinyExtentType)
reqPacket.PartitionID = dpID
reqPacket.RemainingFollowers = uint8(len(dp.Hosts) - 1)
reqPacket.Arg = ([]byte)(dp.GetAllAddrs())
reqPacket.ArgLen = uint32(len(reqPacket.Arg))
if len(dp.Hosts) == 1 {
reqPacket.RemainingFollowers = 127
}
return reqPacket
}
// NewWritePacket returns a new write packet.
func NewWritePacket(inode uint64, fileOffset, storeMode int) *Packet {
p := new(Packet)
p.ReqID = proto.GenerateRequestID()
p.Magic = proto.ProtoMagic
p.Opcode = proto.OpWrite
p.inode = inode
p.KernelOffset = uint64(fileOffset)
if storeMode == proto.TinyExtentType {
p.Data, _ = proto.Buffers.Get(util.DefaultTinySizeLimit)
} else {
p.Data, _ = proto.Buffers.Get(util.BlockSize)
}
return p
}
// NewOverwriteByAppendPacket returns a new overwrite-by-append packet.
func NewOverwriteByAppendPacket(dp *wrapper.DataPartition, extentID uint64, extentOffset int,
inode uint64, fileOffset int, direct bool, op uint8) *Packet {
p := new(Packet)
p.PartitionID = dp.PartitionID
p.Magic = proto.ProtoMagic
p.ExtentType = proto.NormalExtentType
p.ExtentID = extentID
p.ExtentOffset = int64(extentOffset)
p.ReqID = proto.GenerateRequestID()
p.Arg = nil
p.ArgLen = 0
p.RemainingFollowers = 0
p.Opcode = op
if direct {
if op == proto.OpRandomWriteAppend {
p.Opcode = proto.OpSyncRandomWriteAppend
} else if op == proto.OpTryWriteAppend {
p.Opcode = proto.OpSyncTryWriteAppend
}
}
p.inode = inode
p.KernelOffset = uint64(fileOffset)
p.Data, _ = proto.Buffers.Get(util.BlockSize)
return p
}
// NewOverwritePacket returns a new overwrite packet.
func NewOverwritePacket(dp *wrapper.DataPartition, extentID uint64, extentOffset int, inode uint64, fileOffset int) *Packet {
p := new(Packet)
p.PartitionID = dp.PartitionID
p.Magic = proto.ProtoMagic
p.ExtentType = proto.NormalExtentType
p.ExtentID = extentID
p.ExtentOffset = int64(extentOffset)
p.ReqID = proto.GenerateRequestID()
p.Arg = nil
p.ArgLen = 0
p.RemainingFollowers = 0
p.Opcode = proto.OpRandomWriteVer // proto.OpRandomWrite
p.inode = inode
p.KernelOffset = uint64(fileOffset)
p.Data, _ = proto.Buffers.Get(util.BlockSize)
return p
}
// NewReadPacket returns a new read packet.
func NewReadPacket(key *proto.ExtentKey, extentOffset, size int, inode uint64, fileOffset int, followerRead bool) *Packet {
p := new(Packet)
p.ExtentID = key.ExtentId
p.PartitionID = key.PartitionId
p.Magic = proto.ProtoMagic
p.ExtentOffset = int64(extentOffset)
p.Size = uint32(size)
if followerRead {
p.Opcode = proto.OpStreamFollowerRead
} else {
p.Opcode = proto.OpStreamRead
}
p.ExtentType = proto.NormalExtentType
p.ReqID = proto.GenerateRequestID()
p.RemainingFollowers = 0
p.inode = inode
p.KernelOffset = uint64(fileOffset)
return p
}
// NewCreateExtentPacket returns a new packet to create extent.
func NewCreateExtentPacket(dp *wrapper.DataPartition, inode uint64) *Packet {
p := new(Packet)
p.PartitionID = dp.PartitionID
p.Magic = proto.ProtoMagic
p.ExtentType = proto.NormalExtentType
p.Arg = ([]byte)(dp.GetAllAddrs())
p.ArgLen = uint32(len(p.Arg))
p.RemainingFollowers = uint8(len(dp.Hosts) - 1)
if len(dp.Hosts) == 1 {
p.RemainingFollowers = 127
}
p.ReqID = proto.GenerateRequestID()
p.Opcode = proto.OpCreateExtent
p.Data = make([]byte, 8)
binary.BigEndian.PutUint64(p.Data, inode)
p.Size = uint32(len(p.Data))
return p
}
// NewReply returns a new reply packet. TODO rename to NewReplyPacket?
func NewReply(reqID int64, partitionID uint64, extentID uint64) *Packet {
p := new(Packet)
p.ReqID = reqID
p.PartitionID = partitionID
p.ExtentID = extentID
p.Magic = proto.ProtoMagic
p.ExtentType = proto.NormalExtentType
return p
}
func (p *Packet) isValidWriteReply(q *Packet) bool {
	return p.ReqID == q.ReqID && p.PartitionID == q.PartitionID
}
func (p *Packet) isValidReadReply(q *Packet) bool {
	return p.ReqID == q.ReqID && p.PartitionID == q.PartitionID && p.ExtentID == q.ExtentID
}
func (p *Packet) writeToConn(conn net.Conn) error {
p.CRC = crc32.ChecksumIEEE(p.Data[:p.Size])
return p.WriteToConn(conn)
}
func (p *Packet) readFromConn(c net.Conn, deadlineTime time.Duration) (err error) {
if deadlineTime != proto.NoReadDeadlineTime {
c.SetReadDeadline(time.Now().Add(deadlineTime * time.Second))
}
header, _ := proto.Buffers.Get(util.PacketHeaderSize)
defer proto.Buffers.Put(header)
if _, err = io.ReadFull(c, header); err != nil {
return
}
if err = p.UnmarshalHeader(header); err != nil {
return
}
if p.ArgLen > 0 {
if err = readToBuffer(c, &p.Arg, int(p.ArgLen)); err != nil {
return
}
}
if p.Size == 0 {
return
}
size := int(p.Size)
if size > len(p.Data) {
size = len(p.Data)
}
_, err = io.ReadFull(c, p.Data[:size])
return
}
func readToBuffer(c net.Conn, buf *[]byte, readSize int) (err error) {
if *buf == nil || readSize != util.BlockSize {
*buf = make([]byte, readSize)
}
_, err = io.ReadFull(c, (*buf)[:readSize])
return
}
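// exampleWriteRoundTrip is a hedged sketch (assumption: conn is an already-connected
// *net.TCPConn to the leader of the packet's partition) showing how the helpers above
// fit together: send a write packet, read its reply, and validate the result code,
// identity, and CRC before trusting it.
func exampleWriteRoundTrip(conn *net.TCPConn, req *Packet) error {
	if err := req.writeToConn(conn); err != nil {
		return err
	}
	reply := NewReply(req.ReqID, req.PartitionID, req.ExtentID)
	if err := reply.ReadFromConnWithVer(conn, proto.ReadDeadlineTime); err != nil {
		return err
	}
	if reply.ResultCode != proto.OpOk {
		return fmt.Errorf("reply NOK: %v", reply.GetResultMsg())
	}
	if !req.isValidWriteReply(reply) || req.CRC != reply.CRC {
		return fmt.Errorf("request and reply do not match: req(%v) reply(%v)", req, reply)
	}
	return nil
}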
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package stream
import (
"fmt"
"net"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/data/wrapper"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
var (
TryOtherAddrError = errors.New("TryOtherAddrError")
DpDiscardError = errors.New("DpDiscardError")
)
const (
StreamSendMaxRetry = 200
StreamSendSleepInterval = 100 * time.Millisecond
)
type GetReplyFunc func(conn *net.TCPConn) (err error, again bool)
// StreamConn defines the struct of the stream connection.
type StreamConn struct {
dp *wrapper.DataPartition
currAddr string
}
var StreamConnPool = util.NewConnectPool()
// NewStreamConn returns a new stream connection.
func NewStreamConn(dp *wrapper.DataPartition, follower bool) (sc *StreamConn) {
if !follower {
sc = &StreamConn{
dp: dp,
currAddr: dp.LeaderAddr,
}
return
}
defer func() {
if sc.currAddr == "" {
/*
* If followerRead is enabled, and there is no preferred choice,
* currAddr can be arbitrarily selected from the hosts.
*/
for _, h := range dp.Hosts {
if h != "" {
sc.currAddr = h
break
}
}
}
}()
if dp.ClientWrapper.NearRead() {
sc = &StreamConn{
dp: dp,
currAddr: getNearestHost(dp),
}
return
}
epoch := atomic.AddUint64(&dp.Epoch, 1)
hosts := sortByStatus(dp, false)
choice := len(hosts)
currAddr := dp.LeaderAddr
if choice > 0 {
index := int(epoch) % choice
currAddr = hosts[index]
}
sc = &StreamConn{
dp: dp,
currAddr: currAddr,
}
return
}
// String returns the string format of the stream connection.
func (sc *StreamConn) String() string {
return fmt.Sprintf("Partition(%v) CurrentAddr(%v) Hosts(%v)", sc.dp.PartitionID, sc.currAddr, sc.dp.Hosts)
}
// Send sends the given packet over the network through the stream connection until it succeeds
// or the maximum number of retries is reached.
func (sc *StreamConn) Send(retry *bool, req *Packet, getReply GetReplyFunc) (err error) {
for i := 0; i < StreamSendMaxRetry; i++ {
err = sc.sendToDataPartition(req, retry, getReply)
if err == nil || err == proto.ErrCodeVersionOp || !*retry || err == TryOtherAddrError {
return
}
log.LogWarnf("StreamConn Send: err(%v)", err)
time.Sleep(StreamSendSleepInterval)
}
return errors.New(fmt.Sprintf("StreamConn Send: retried %v times and still failed, sc(%v) reqPacket(%v)", StreamSendMaxRetry, sc, req))
}
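// exampleSendWithReply is a hedged sketch of the GetReplyFunc contract (illustrative
// only, not used by the SDK): returning (nil, true) asks sendToConn to retry on the same
// connection, returning TryOtherAddrError (while retry is enabled) lets
// sendToDataPartition fail over to the other hosts, and other errors are retried by Send
// itself until StreamSendMaxRetry is reached.
func exampleSendWithReply(sc *StreamConn, req *Packet) error {
	retry := true
	return sc.Send(&retry, req, func(conn *net.TCPConn) (error, bool) {
		reply := NewReply(req.ReqID, req.PartitionID, req.ExtentID)
		if err := reply.ReadFromConnWithVer(conn, proto.ReadDeadlineTime); err != nil {
			return TryOtherAddrError, false
		}
		if reply.ResultCode == proto.OpAgain {
			return nil, true
		}
		if reply.ResultCode != proto.OpOk {
			return errors.New(fmt.Sprintf("reply NOK: %v", reply.GetResultMsg())), false
		}
		return nil, false
	})
}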
func (sc *StreamConn) sendToDataPartition(req *Packet, retry *bool, getReply GetReplyFunc) (err error) {
conn, err := StreamConnPool.GetConnect(sc.currAddr)
if err == nil {
log.LogDebugf("req opcode %v, conn %v", req.Opcode, conn)
err = sc.sendToConn(conn, req, getReply)
if err == nil {
StreamConnPool.PutConnect(conn, false)
return
}
log.LogWarnf("sendToDataPartition: send to curr addr failed, addr(%v) reqPacket(%v) err(%v)", sc.currAddr, req, err)
StreamConnPool.PutConnect(conn, true)
if err != TryOtherAddrError || !*retry {
return
}
} else {
log.LogWarnf("sendToDataPartition: get connection to curr addr failed, addr(%v) reqPacket(%v) err(%v)", sc.currAddr, req, err)
}
hosts := sortByStatus(sc.dp, true)
for _, addr := range hosts {
log.LogWarnf("sendToDataPartition: try addr(%v) reqPacket(%v)", addr, req)
conn, err = StreamConnPool.GetConnect(addr)
if err != nil {
log.LogWarnf("sendToDataPartition: failed to get connection to addr(%v) reqPacket(%v) err(%v)", addr, req, err)
continue
}
sc.currAddr = addr
sc.dp.LeaderAddr = addr
err = sc.sendToConn(conn, req, getReply)
if err == nil {
StreamConnPool.PutConnect(conn, false)
return
}
StreamConnPool.PutConnect(conn, true)
if err != TryOtherAddrError {
return
}
log.LogWarnf("sendToDataPartition: try addr(%v) failed! reqPacket(%v) err(%v)", addr, req, err)
}
return errors.New(fmt.Sprintf("sendToPatition Failed: sc(%v) reqPacket(%v)", sc, req))
}
func (sc *StreamConn) sendToConn(conn *net.TCPConn, req *Packet, getReply GetReplyFunc) (err error) {
for i := 0; i < StreamSendMaxRetry; i++ {
log.LogDebugf("sendToConn: send to addr(%v), reqPacket(%v)", sc.currAddr, req)
err = req.WriteToConn(conn)
if err != nil {
msg := fmt.Sprintf("sendToConn: failed to write to addr(%v) err(%v)", sc.currAddr, err)
log.LogWarn(msg)
break
}
var again bool
err, again = getReply(conn)
if !again {
if err != nil {
log.LogWarnf("sendToConn: getReply error and RETURN, addr(%v) reqPacket(%v) err(%v)", sc.currAddr, req, err)
}
break
}
log.LogWarnf("sendToConn: getReply error and will RETRY, sc(%v) err(%v)", sc, err)
time.Sleep(StreamSendSleepInterval)
}
log.LogDebugf("sendToConn exit: send to addr(%v) reqPacket(%v) err(%v)", sc.currAddr, req, err)
return
}
// sortByStatus returns the DataPartition's host list sorted by host status.
// If selectAll is true, hosts with status(true) come first and hosts with status(false) follow.
// If selectAll is false, only hosts with status(true) are returned.
func sortByStatus(dp *wrapper.DataPartition, selectAll bool) (hosts []string) {
var failedHosts []string
hostsStatus := dp.ClientWrapper.HostsStatus
var dpHosts []string
if dp.ClientWrapper.FollowerRead() && dp.ClientWrapper.NearRead() {
dpHosts = dp.NearHosts
if len(dpHosts) == 0 {
dpHosts = dp.Hosts
}
} else {
dpHosts = dp.Hosts
}
for _, addr := range dpHosts {
status, ok := hostsStatus[addr]
if ok {
if status {
hosts = append(hosts, addr)
} else {
failedHosts = append(failedHosts, addr)
}
} else {
failedHosts = append(failedHosts, addr)
log.LogWarnf("sortByStatus: can not find host[%v] in HostsStatus, dp[%d]", addr, dp.PartitionID)
}
}
if selectAll {
hosts = append(hosts, failedHosts...)
}
return
}
func getNearestHost(dp *wrapper.DataPartition) string {
hostsStatus := dp.ClientWrapper.HostsStatus
for _, addr := range dp.NearHosts {
status, ok := hostsStatus[addr]
if ok {
if !status {
continue
}
}
return addr
}
return dp.LeaderAddr
}
func NewStreamConnByHost(host string) *StreamConn {
return &StreamConn{
currAddr: host,
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package stream
import (
"context"
"fmt"
"io"
"sync"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/blockcache/bcache"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/buf"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
)
// One inode corresponds to one streamer. All the requests to the same inode will be queued
// (see the usage sketch after NewStreamer).
// TODO rename: streamer is not a good name here, as it also handles overwrites, not just stream writes.
type Streamer struct {
client *ExtentClient
inode uint64
parentInode uint64
status int32
refcnt int
idle int // how long there is no new request
traversed int // how many times the streamer is traversed
extents *ExtentCache
once sync.Once
handler *ExtentHandler // current open handler
dirtylist *DirtyExtentList // dirty handlers
dirty bool // whether current open handler is in the dirty list
isOpen bool
needBCache bool
request chan interface{} // request channel, write/flush/close
done chan struct{} // stream writer is being closed
writeLock sync.Mutex
inflightEvictL1cache sync.Map
pendingCache chan bcacheKey
verSeq uint64
needUpdateVer int32
}
type bcacheKey struct {
cacheKey string
extentKey *proto.ExtentKey
}
// NewStreamer returns a new streamer.
func NewStreamer(client *ExtentClient, inode uint64) *Streamer {
s := new(Streamer)
s.client = client
s.inode = inode
s.parentInode = 0
s.extents = NewExtentCache(inode)
s.request = make(chan interface{}, 64)
s.done = make(chan struct{})
s.dirtylist = NewDirtyExtentList()
s.isOpen = true
s.pendingCache = make(chan bcacheKey, 1)
s.verSeq = client.multiVerMgr.latestVerSeq
s.extents.verSeq = client.multiVerMgr.latestVerSeq
go s.server()
go s.asyncBlockCache()
return s
}
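// exampleStreamerWriteFlush is a hedged usage sketch (illustrative only): requests are
// funneled through the streamer's request channel, so a write followed by a flush is
// serialized with any other operations queued on the same inode.
func exampleStreamerWriteFlush(s *Streamer, data []byte, offset int) (int, error) {
	written, err := s.IssueWriteRequest(offset, data, 0, nil)
	if err != nil {
		return written, err
	}
	return written, s.IssueFlushRequest()
}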
func (s *Streamer) SetParentInode(inode uint64) {
s.parentInode = inode
}
// String returns the string format of the streamer.
func (s *Streamer) String() string {
return fmt.Sprintf("Streamer{ino(%v)}", s.inode)
}
// TODO should we call it RefreshExtents instead?
func (s *Streamer) GetExtents() error {
if s.client.disableMetaCache || !s.needBCache {
return s.extents.RefreshForce(s.inode, s.client.getExtents)
}
return s.extents.Refresh(s.inode, s.client.getExtents)
}
func (s *Streamer) GetExtentsForce() error {
return s.extents.RefreshForce(s.inode, s.client.getExtents)
}
// GetExtentReader returns the extent reader.
// TODO: use memory pool
func (s *Streamer) GetExtentReader(ek *proto.ExtentKey) (*ExtentReader, error) {
partition, err := s.client.dataWrapper.GetDataPartition(ek.PartitionId)
if err != nil {
return nil, err
}
if partition.IsDiscard {
log.LogWarnf("GetExtentReader: datapartition %v is discard", partition.PartitionID)
return nil, DpDiscardError
}
retryRead := true
if proto.IsCold(s.client.volumeType) {
retryRead = false
}
reader := NewExtentReader(s.inode, ek, partition, s.client.dataWrapper.FollowerRead(), retryRead)
return reader, nil
}
func (s *Streamer) read(data []byte, offset int, size int) (total int, err error) {
var (
readBytes int
reader *ExtentReader
requests []*ExtentRequest
revisedRequests []*ExtentRequest
)
log.LogDebugf("action[streamer.read] offset %v size %v", offset, size)
ctx := context.Background()
s.client.readLimiter.Wait(ctx)
s.client.LimitManager.ReadAlloc(ctx, size)
requests = s.extents.PrepareReadRequests(offset, size, data)
for _, req := range requests {
if req.ExtentKey == nil {
continue
}
if req.ExtentKey.PartitionId == 0 || req.ExtentKey.ExtentId == 0 {
s.writeLock.Lock()
if err = s.IssueFlushRequest(); err != nil {
s.writeLock.Unlock()
return 0, err
}
revisedRequests = s.extents.PrepareReadRequests(offset, size, data)
s.writeLock.Unlock()
break
}
}
if revisedRequests != nil {
requests = revisedRequests
}
filesize, _ := s.extents.Size()
log.LogDebugf("read: ino(%v) requests(%v) filesize(%v)", s.inode, requests, filesize)
for _, req := range requests {
log.LogDebugf("action[streamer.read] req %v", req)
if req.ExtentKey == nil {
zeros := make([]byte, len(req.Data))
copy(req.Data, zeros)
if req.FileOffset+req.Size > filesize {
if req.FileOffset > filesize {
return
}
req.Size = filesize - req.FileOffset
total += req.Size
err = io.EOF
return
}
// Reading a hole, just fill zero
total += req.Size
log.LogDebugf("Stream read hole: ino(%v) req(%v) total(%v)", s.inode, req, total)
} else {
log.LogDebugf("Stream read: ino(%v) req(%v) s.needBCache(%v) s.client.bcacheEnable(%v)", s.inode, req, s.needBCache, s.client.bcacheEnable)
if s.needBCache {
bcacheMetric := exporter.NewCounter("fileReadL1Cache")
bcacheMetric.AddWithLabels(1, map[string]string{exporter.Vol: s.client.volumeName})
}
// not a hole: ek is not nil, so read the block cache first
log.LogDebugf("Stream read: ino(%v) req(%v) s.client.bcacheEnable(%v) s.needBCache(%v)", s.inode, req, s.client.bcacheEnable, s.needBCache)
cacheKey := util.GenerateRepVolKey(s.client.volumeName, s.inode, req.ExtentKey.PartitionId, req.ExtentKey.ExtentId, req.ExtentKey.FileOffset)
if s.client.bcacheEnable && s.needBCache && filesize <= bcache.MaxFileSize {
offset := req.FileOffset - int(req.ExtentKey.FileOffset)
if s.client.loadBcache != nil {
readBytes, err = s.client.loadBcache(cacheKey, req.Data, uint64(offset), uint32(req.Size))
if err == nil && readBytes == req.Size {
total += req.Size
bcacheMetric := exporter.NewCounter("fileReadL1CacheHit")
bcacheMetric.AddWithLabels(1, map[string]string{exporter.Vol: s.client.volumeName})
log.LogDebugf("TRACE Stream read. hit blockCache: ino(%v) cacheKey(%v) readBytes(%v) err(%v)", s.inode, cacheKey, readBytes, err)
continue
}
}
log.LogDebugf("TRACE Stream read. miss blockCache cacheKey(%v) loadBcache(%v)", cacheKey, s.client.loadBcache)
}
if s.needBCache {
bcacheMetric := exporter.NewCounter("fileReadL1CacheMiss")
bcacheMetric.AddWithLabels(1, map[string]string{exporter.Vol: s.client.volumeName})
}
// read extent
reader, err = s.GetExtentReader(req.ExtentKey)
if err != nil {
log.LogErrorf("action[streamer.read] req %v err %v", req, err)
break
}
if s.client.bcacheEnable && s.needBCache && filesize <= bcache.MaxFileSize {
// limit big block cache
if s.exceedBlockSize(req.ExtentKey.Size) && atomic.LoadInt32(&s.client.inflightL1BigBlock) > 10 {
// do nothing
} else {
select {
case s.pendingCache <- bcacheKey{cacheKey: cacheKey, extentKey: req.ExtentKey}:
if s.exceedBlockSize(req.ExtentKey.Size) {
atomic.AddInt32(&s.client.inflightL1BigBlock, 1)
}
default:
}
}
}
readBytes, err = reader.Read(req)
log.LogDebugf("TRACE Stream read: ino(%v) req(%v) readBytes(%v) err(%v)", s.inode, req, readBytes, err)
total += readBytes
if err != nil || readBytes < req.Size {
if total == 0 {
log.LogErrorf("Stream read: ino(%v) req(%v) readBytes(%v) err(%v)", s.inode, req, readBytes, err)
}
break
}
}
}
log.LogDebugf("action[streamer.read] offset %v size %v exit", offset, size)
return
}
func (s *Streamer) asyncBlockCache() {
if !s.needBCache || !s.isOpen {
return
}
t := time.NewTicker(3 * time.Second)
defer t.Stop()
for {
select {
case pending := <-s.pendingCache:
ek := pending.extentKey
cacheKey := pending.cacheKey
log.LogDebugf("asyncBlockCache: cacheKey=(%v) ek=(%v)", cacheKey, ek)
// read full extent
var data []byte
if ek.Size == bcache.MaxBlockSize {
data = buf.BCachePool.Get()
} else {
data = make([]byte, ek.Size)
}
reader, err := s.GetExtentReader(ek)
fullReq := NewExtentRequest(int(ek.FileOffset), int(ek.Size), data, ek)
readBytes, err := reader.Read(fullReq)
if err != nil || readBytes != len(data) {
log.LogWarnf("asyncBlockCache: Stream read full extent error. fullReq(%v) readBytes(%v) err(%v)", fullReq, readBytes, err)
if ek.Size == bcache.MaxBlockSize {
buf.BCachePool.Put(data)
}
if s.exceedBlockSize(ek.Size) {
atomic.AddInt32(&s.client.inflightL1BigBlock, -1)
}
return
}
if s.client.cacheBcache != nil {
log.LogDebugf("TRACE read. write blockCache cacheKey(%v) len_buf(%v),", cacheKey, len(data))
s.client.cacheBcache(cacheKey, data)
}
if ek.Size == bcache.MaxBlockSize {
buf.BCachePool.Put(data)
}
if s.exceedBlockSize(ek.Size) {
atomic.AddInt32(&s.client.inflightL1BigBlock, -1)
}
case <-t.C:
if s.refcnt <= 0 {
s.isOpen = false
return
}
}
}
}
func (s *Streamer) exceedBlockSize(size uint32) bool {
	return size > bcache.BigExtentSize
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package stream
import (
"context"
"fmt"
"hash/crc32"
"net"
"sync/atomic"
"syscall"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/data/wrapper"
"github.com/cubefs/cubefs/storage"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
const (
MaxSelectDataPartitionForWrite = 32
MaxNewHandlerRetry = 3
MaxPacketErrorCount = 128
MaxDirtyListLen = 0
)
const (
StreamerNormal int32 = iota
StreamerError
LastEKVersionNotEqual
)
const (
streamWriterFlushPeriod = 3
streamWriterIdleTimeoutPeriod = 10
)
// VerUpdateRequest defines a verseq update request.
type VerUpdateRequest struct {
err error
verSeq uint64
done chan struct{}
}
// OpenRequest defines an open request.
type OpenRequest struct {
done chan struct{}
}
// WriteRequest defines a write request.
type WriteRequest struct {
fileOffset int
size int
data []byte
flags int
writeBytes int
err error
done chan struct{}
checkFunc func() error
}
// FlushRequest defines a flush request.
type FlushRequest struct {
err error
done chan struct{}
}
// ReleaseRequest defines a release request.
type ReleaseRequest struct {
err error
done chan struct{}
}
// TruncRequest defines a truncate request.
type TruncRequest struct {
size int
err error
fullPath string
done chan struct{}
}
// EvictRequest defines an evict request.
type EvictRequest struct {
err error
done chan struct{}
}
// The open request shall hold the streamer lock until the request has been sent to the request
// channel (see the usage sketch after this function).
func (s *Streamer) IssueOpenRequest() error {
request := openRequestPool.Get().(*OpenRequest)
request.done = make(chan struct{}, 1)
s.request <- request
s.client.streamerLock.Unlock()
<-request.done
openRequestPool.Put(request)
return nil
}
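// exampleOpenStreamer is a hedged sketch of the caller-side convention described above
// (assumption: callers outside this file, such as the extent client's open path, follow
// this pattern): the streamer lock is taken before issuing the open request and is
// released inside IssueOpenRequest once the request has been queued.
func exampleOpenStreamer(s *Streamer) error {
	s.client.streamerLock.Lock()
	s.refcnt++
	return s.IssueOpenRequest() // unlocks s.client.streamerLock before blocking on done
}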
func (s *Streamer) IssueWriteRequest(offset int, data []byte, flags int, checkFunc func() error) (write int, err error) {
if atomic.LoadInt32(&s.status) >= StreamerError {
return 0, errors.New(fmt.Sprintf("IssueWriteRequest: stream writer in error status, ino(%v)", s.inode))
}
s.writeLock.Lock()
request := writeRequestPool.Get().(*WriteRequest)
request.data = data
request.fileOffset = offset
request.size = len(data)
request.flags = flags
request.done = make(chan struct{}, 1)
request.checkFunc = checkFunc
s.request <- request
s.writeLock.Unlock()
<-request.done
err = request.err
write = request.writeBytes
writeRequestPool.Put(request)
return
}
func (s *Streamer) IssueFlushRequest() error {
request := flushRequestPool.Get().(*FlushRequest)
request.done = make(chan struct{}, 1)
s.request <- request
<-request.done
err := request.err
flushRequestPool.Put(request)
return err
}
func (s *Streamer) IssueReleaseRequest() error {
request := releaseRequestPool.Get().(*ReleaseRequest)
request.done = make(chan struct{}, 1)
s.request <- request
s.client.streamerLock.Unlock()
<-request.done
err := request.err
releaseRequestPool.Put(request)
return err
}
func (s *Streamer) IssueTruncRequest(size int, fullPath string) error {
request := truncRequestPool.Get().(*TruncRequest)
request.size = size
request.fullPath = fullPath
request.done = make(chan struct{}, 1)
s.request <- request
<-request.done
err := request.err
truncRequestPool.Put(request)
return err
}
func (s *Streamer) IssueEvictRequest() error {
request := evictRequestPool.Get().(*EvictRequest)
request.done = make(chan struct{}, 1)
s.request <- request
s.client.streamerLock.Unlock()
<-request.done
err := request.err
evictRequestPool.Put(request)
return err
}
func (s *Streamer) GetStoreMod(offset int, size int) (storeMode int) {
// Small files are usually written in a single write, so use tiny extent
// store only for the first write operation.
if offset > 0 || offset+size > s.tinySizeLimit() {
storeMode = proto.NormalExtentType
} else {
storeMode = proto.TinyExtentType
}
return
}
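// exampleChooseHandler is a minimal sketch (illustrative only) of how the store mode
// returned by GetStoreMod feeds into handler creation: only the very first write of a
// small file lands on the tiny extent store, while everything else uses normal extents.
func exampleChooseHandler(s *Streamer, offset, size int) *ExtentHandler {
	storeMode := s.GetStoreMod(offset, size)
	return NewExtentHandler(s, offset, storeMode, size)
}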
func (s *Streamer) server() {
t := time.NewTicker(2 * time.Second)
defer t.Stop()
for {
select {
case request := <-s.request:
s.handleRequest(request)
s.idle = 0
s.traversed = 0
case <-s.done:
s.abort()
log.LogDebugf("done server: evict, ino(%v)", s.inode)
return
case <-t.C:
s.traverse()
if s.refcnt <= 0 {
s.client.streamerLock.Lock()
if s.idle >= streamWriterIdleTimeoutPeriod && len(s.request) == 0 {
if s.client.disableMetaCache || !s.needBCache {
delete(s.client.streamers, s.inode)
if s.client.evictIcache != nil {
s.client.evictIcache(s.inode)
}
}
s.isOpen = false
// fail the remaining requests in such case
s.clearRequests()
s.client.streamerLock.Unlock()
log.LogDebugf("done server: no requests for a long time, ino(%v)", s.inode)
return
}
s.client.streamerLock.Unlock()
s.idle++
}
}
}
}
func (s *Streamer) clearRequests() {
for {
select {
case request := <-s.request:
s.abortRequest(request)
default:
return
}
}
}
func (s *Streamer) abortRequest(request interface{}) {
switch request := request.(type) {
case *OpenRequest:
request.done <- struct{}{}
case *WriteRequest:
request.err = syscall.EAGAIN
request.done <- struct{}{}
case *TruncRequest:
request.err = syscall.EAGAIN
request.done <- struct{}{}
case *FlushRequest:
request.err = syscall.EAGAIN
request.done <- struct{}{}
case *ReleaseRequest:
request.err = syscall.EAGAIN
request.done <- struct{}{}
case *EvictRequest:
request.err = syscall.EAGAIN
request.done <- struct{}{}
default:
}
}
func (s *Streamer) handleRequest(request interface{}) {
if atomic.LoadInt32(&s.needUpdateVer) == 1 {
s.closeOpenHandler()
atomic.StoreInt32(&s.needUpdateVer, 0)
}
switch request := request.(type) {
case *OpenRequest:
s.open()
request.done <- struct{}{}
case *WriteRequest:
request.writeBytes, request.err = s.write(request.data, request.fileOffset, request.size, request.flags, request.checkFunc)
request.done <- struct{}{}
case *TruncRequest:
request.err = s.truncate(request.size, request.fullPath)
request.done <- struct{}{}
case *FlushRequest:
request.err = s.flush()
request.done <- struct{}{}
case *ReleaseRequest:
request.err = s.release()
request.done <- struct{}{}
case *EvictRequest:
request.err = s.evict()
request.done <- struct{}{}
case *VerUpdateRequest:
request.err = s.updateVer(request.verSeq)
request.done <- struct{}{}
default:
}
}
func (s *Streamer) write(data []byte, offset, size, flags int, checkFunc func() error) (total int, err error) {
var (
direct bool
retryTimes int8
)
if flags&proto.FlagsSyncWrite != 0 {
direct = true
}
begin:
if flags&proto.FlagsAppend != 0 {
filesize, _ := s.extents.Size()
offset = filesize
}
log.LogDebugf("Streamer write enter: ino(%v) offset(%v) size(%v) flags(%v)", s.inode, offset, size, flags)
ctx := context.Background()
s.client.writeLimiter.Wait(ctx)
requests := s.extents.PrepareWriteRequests(offset, size, data)
log.LogDebugf("Streamer write: ino(%v) prepared requests(%v)", s.inode, requests)
isChecked := false
// Must flush before doing overwrite
for _, req := range requests {
if req.ExtentKey == nil {
continue
}
err = s.flush()
if err != nil {
return
}
// An extent key with partition id 0 in the requests means an append operation is still in flight.
// Flush first to obtain the final key, which is then used for the modification.
requests = s.extents.PrepareWriteRequests(offset, size, data)
log.LogDebugf("Streamer write: ino(%v) prepared requests after flush(%v)", s.inode, requests)
break
}
for _, req := range requests {
var writeSize int
if req.ExtentKey != nil {
if s.client.bcacheEnable {
cacheKey := util.GenerateRepVolKey(s.client.volumeName, s.inode, req.ExtentKey.PartitionId, req.ExtentKey.ExtentId, uint64(req.FileOffset))
if _, ok := s.inflightEvictL1cache.Load(cacheKey); !ok {
go func(cacheKey string) {
s.inflightEvictL1cache.Store(cacheKey, true)
s.client.evictBcache(cacheKey)
s.inflightEvictL1cache.Delete(cacheKey)
}(cacheKey)
}
}
log.LogDebugf("action[streamer.write] inode [%v] latest seq [%v] extentkey seq [%v] info [%v] before compare seq",
s.inode, s.verSeq, req.ExtentKey.GetSeq(), req.ExtentKey)
if req.ExtentKey.GetSeq() == s.verSeq {
writeSize, err = s.doOverwrite(req, direct)
if err == proto.ErrCodeVersionOp {
log.LogDebugf("action[streamer.write] write need version update")
if err = s.GetExtentsForce(); err != nil {
log.LogErrorf("action[streamer.write] err %v", err)
return
}
if retryTimes > 3 {
err = proto.ErrCodeVersionOp
log.LogWarnf("action[streamer.write] err %v", err)
return
}
time.Sleep(time.Millisecond * 100)
retryTimes++
log.LogDebugf("action[streamer.write] err %v retryTimes %v", err, retryTimes)
goto begin
}
log.LogDebugf("action[streamer.write] err %v retryTimes %v", err, retryTimes)
} else {
log.LogDebugf("action[streamer.write] ino %v do OverWriteByAppend extent key (%v) because seq not equal", s.inode, req.ExtentKey)
writeSize, _, err, _ = s.doOverWriteByAppend(req, direct)
}
if s.client.bcacheEnable {
cacheKey := util.GenerateKey(s.client.volumeName, s.inode, uint64(req.FileOffset))
go s.client.evictBcache(cacheKey)
}
} else {
if !isChecked && checkFunc != nil {
isChecked = true
if err = checkFunc(); err != nil {
return
}
}
writeSize, err = s.doWriteAppend(req, direct)
}
if err != nil {
log.LogErrorf("Streamer write: ino(%v) err(%v)", s.inode, err)
break
}
total += writeSize
}
if filesize, _ := s.extents.Size(); offset+total > filesize {
s.extents.SetSize(uint64(offset+total), false)
log.LogDebugf("Streamer write: ino(%v) filesize changed to (%v)", s.inode, offset+total)
}
log.LogDebugf("Streamer write exit: ino(%v) offset(%v) size(%v) done total(%v) err(%v)", s.inode, offset, size, total, err)
return
}
func (s *Streamer) doOverWriteByAppend(req *ExtentRequest, direct bool) (total int, extKey *proto.ExtentKey, err error, status int32) {
// The extent key needs to be updated because, when preparing the requests,
// the obtained extent key could be a local key that is inconsistent with the remote key.
// OpTryWriteAppend is a special case and is ignored here.
req.ExtentKey = s.extents.Get(uint64(req.FileOffset))
return s.doDirectWriteByAppend(req, direct, proto.OpRandomWriteAppend)
}
func (s *Streamer) tryDirectAppendWrite(req *ExtentRequest, direct bool) (total int, extKey *proto.ExtentKey, err error, status int32) {
req.ExtentKey = s.handler.key
return s.doDirectWriteByAppend(req, direct, proto.OpTryWriteAppend)
}
func (s *Streamer) doDirectWriteByAppend(req *ExtentRequest, direct bool, op uint8) (total int, extKey *proto.ExtentKey, err error, status int32) {
var (
dp *wrapper.DataPartition
reqPacket *Packet
)
log.LogDebugf("action[doDirectWriteByAppend] inode %v enter in req %v", s.inode, req)
err = s.flush()
if err != nil {
return
}
if req.ExtentKey == nil {
err = errors.New(fmt.Sprintf("doOverwrite: extent key not exist, ino(%v) ekFileOffset(%v) ek(%v)", s.inode, req.FileOffset, req.ExtentKey))
return
}
if dp, err = s.client.dataWrapper.GetDataPartition(req.ExtentKey.PartitionId); err != nil {
// TODO unhandled error
errors.Trace(err, "doDirectWriteByAppend: ino(%v) failed to get datapartition, ek(%v)", s.inode, req.ExtentKey)
return
}
retry := true
if proto.IsCold(s.client.volumeType) {
retry = false
}
log.LogDebugf("action[doDirectWriteByAppend] inode %v data process", s.inode)
addr := dp.LeaderAddr
if storage.IsTinyExtent(req.ExtentKey.ExtentId) {
addr = dp.Hosts[0]
reqPacket = NewWriteTinyDirectly(s.inode, req.ExtentKey.PartitionId, req.FileOffset, dp)
} else {
reqPacket = NewOverwriteByAppendPacket(dp, req.ExtentKey.ExtentId, int(req.ExtentKey.ExtentOffset)+int(req.ExtentKey.Size),
s.inode, req.FileOffset, direct, op)
}
sc := &StreamConn{
dp: dp,
currAddr: addr,
}
replyPacket := new(Packet)
if req.Size > util.BlockSize {
log.LogErrorf("action[doDirectWriteByAppend] inode %v size too large %v", s.inode, req.Size)
panic(nil)
}
for total < req.Size { // normally runs only once, since a key that already exists in the system must be smaller than BlockSize
// The correct position in the extent is offset-ek4FileOffset+total+ekExtOffset;
// ekExtOffset will be set by the reply packet in addExtentInfo (datanode).
if direct {
reqPacket.Opcode = op
}
if req.ExtentKey.ExtentId <= storage.TinyExtentCount {
reqPacket.ExtentType = proto.TinyExtentType
}
packSize := util.Min(req.Size-total, util.BlockSize)
copy(reqPacket.Data[:packSize], req.Data[total:total+packSize])
reqPacket.Size = uint32(packSize)
reqPacket.CRC = crc32.ChecksumIEEE(reqPacket.Data[:packSize])
err = sc.Send(&retry, reqPacket, func(conn *net.TCPConn) (error, bool) {
e := replyPacket.ReadFromConnWithVer(conn, proto.ReadDeadlineTime)
if e != nil {
log.LogWarnf("doDirectWriteByAppend.Stream Writer doOverwrite: ino(%v) failed to read from connect, req(%v) err(%v)", s.inode, reqPacket, e)
// Upon receiving TryOtherAddrError, other hosts will be retried.
return TryOtherAddrError, false
}
log.LogDebugf("action[doDirectWriteByAppend] .UpdateLatestVer ino(%v) get replyPacket %v", s.inode, replyPacket)
if replyPacket.VerSeq > sc.dp.ClientWrapper.SimpleClient.GetLatestVer() {
err = sc.dp.ClientWrapper.SimpleClient.UpdateLatestVer(&proto.VolVersionInfoList{VerList: replyPacket.VerList})
if err != nil {
return err, false
}
}
log.LogDebugf("action[doDirectWriteByAppend] ino(%v) get replyPacket opcode %v resultCode %v", s.inode, replyPacket.Opcode, replyPacket.ResultCode)
if replyPacket.ResultCode == proto.OpAgain {
return nil, true
}
if replyPacket.ResultCode == proto.OpTryOtherExtent {
status = int32(proto.OpTryOtherExtent)
return nil, false
}
if replyPacket.ResultCode == proto.OpTryOtherAddr {
e = TryOtherAddrError
log.LogDebugf("action[doDirectWriteByAppend] data process err %v", e)
}
return e, false
})
proto.Buffers.Put(reqPacket.Data)
reqPacket.Data = nil
log.LogDebugf("doDirectWriteByAppend: ino(%v) req(%v) reqPacket(%v) err(%v) replyPacket(%v)", s.inode, req, reqPacket, err, replyPacket)
if err != nil || replyPacket.ResultCode != proto.OpOk {
status = int32(replyPacket.ResultCode)
err = errors.New(fmt.Sprintf("doOverwrite: failed or reply NOK: err(%v) ino(%v) req(%v) replyPacket(%v)", err, s.inode, req, replyPacket))
log.LogErrorf("action[doDirectWriteByAppend] data process err %v", err)
s.handler.key = nil // the direct-write key can't be reused in the flush process
break
}
if !reqPacket.isValidWriteReply(replyPacket) || reqPacket.CRC != replyPacket.CRC {
err = errors.New(fmt.Sprintf("doOverwrite: is not the corresponding reply, ino(%v) req(%v) replyPacket(%v)", s.inode, req, replyPacket))
log.LogErrorf("action[doDirectWriteByAppend] data process err %v", err)
break
}
total += packSize
break
}
if err != nil {
log.LogErrorf("action[doDirectWriteByAppend] data process err %v", err)
return
}
if replyPacket.VerSeq > s.verSeq {
s.client.UpdateLatestVer(&proto.VolVersionInfoList{VerList: replyPacket.VerList})
}
extKey = &proto.ExtentKey{
FileOffset: uint64(req.FileOffset),
PartitionId: req.ExtentKey.PartitionId,
ExtentId: replyPacket.ExtentID,
ExtentOffset: uint64(replyPacket.ExtentOffset),
Size: uint32(total),
SnapInfo: &proto.ExtSnapInfo{
VerSeq: s.verSeq,
},
}
if op == proto.OpRandomWriteAppend || op == proto.OpSyncRandomWriteAppend {
log.LogDebugf("action[doDirectWriteByAppend] inode %v local cache process start extKey %v", s.inode, extKey)
if err = s.extents.SplitExtentKey(s.inode, extKey); err != nil {
log.LogErrorf("action[doDirectWriteByAppend] inode %v llocal cache process err %v", s.inode, err)
return
}
log.LogDebugf("action[doDirectWriteByAppend] inode %v meta extent split with ek (%v)", s.inode, extKey)
if err = s.client.splitExtentKey(s.parentInode, s.inode, *extKey); err != nil {
log.LogErrorf("action[doDirectWriteByAppend] inode %v meta extent split process err %v", s.inode, err)
return
}
} else {
discards := s.extents.Append(extKey, true)
var st int
if st, err = s.client.appendExtentKey(s.parentInode, s.inode, *extKey, discards); err != nil {
status = int32(st)
log.LogErrorf("action[doDirectWriteByAppend] inode %v meta extent split process err %v", s.inode, err)
return
}
log.LogDebugf("action[doDirectWriteByAppend] handler fileoffset %v size %v key %v", s.handler.fileOffset, s.handler.size, s.handler.key)
// adjust the handler key to last direct write one
s.handler.fileOffset = int(extKey.FileOffset)
s.handler.size = int(extKey.Size)
s.handler.key = extKey
}
if atomic.LoadInt32(&s.needUpdateVer) > 0 {
if err = s.GetExtentsForce(); err != nil {
log.LogErrorf("action[doDirectWriteByAppend] inode %v GetExtents err %v", s.inode, err)
return
}
}
log.LogDebugf("action[doDirectWriteByAppend] inode %v process over!", s.inode)
return
}
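// doOverwrite writes data in place into an existing extent, block by block, after
// flushing any pending appended data. A direct write uses the synchronous random-write
// opcode.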
func (s *Streamer) doOverwrite(req *ExtentRequest, direct bool) (total int, err error) {
var dp *wrapper.DataPartition
err = s.flush()
if err != nil {
return
}
offset := req.FileOffset
size := req.Size
// the extent key needs to be updated because when preparing the requests,
// the obtained extent key could be a local key which can be inconsistent with the remote key.
req.ExtentKey = s.extents.Get(uint64(offset))
if req.ExtentKey == nil {
err = errors.New(fmt.Sprintf("doOverwrite: extent key not exist, ino(%v) fileOffset(%v) ek(%v)", s.inode, offset, req.ExtentKey))
return
}
// read the extent key fields only after the nil check above
ekFileOffset := int(req.ExtentKey.FileOffset)
ekExtOffset := int(req.ExtentKey.ExtentOffset)
if dp, err = s.client.dataWrapper.GetDataPartition(req.ExtentKey.PartitionId); err != nil {
// TODO unhandled error
errors.Trace(err, "doOverwrite: ino(%v) failed to get datapartition, ek(%v)", s.inode, req.ExtentKey)
return
}
retry := true
if proto.IsCold(s.client.volumeType) {
retry = false
}
sc := NewStreamConn(dp, false)
for total < size {
reqPacket := NewOverwritePacket(dp, req.ExtentKey.ExtentId, offset-ekFileOffset+total+ekExtOffset, s.inode, offset)
reqPacket.VerSeq = s.client.multiVerMgr.latestVerSeq
reqPacket.VerList = make([]*proto.VolVersionInfo, len(s.client.multiVerMgr.verList.VerList))
copy(reqPacket.VerList, s.client.multiVerMgr.verList.VerList)
reqPacket.ExtentType |= proto.MultiVersionFlag
reqPacket.ExtentType |= proto.VersionListFlag
log.LogDebugf("action[doOverwrite] inode %v extentid %v,extentOffset %v(%v,%v,%v,%v) offset %v, streamer seq %v", s.inode, req.ExtentKey.ExtentId, reqPacket.ExtentOffset,
offset, ekFileOffset, total, ekExtOffset, offset, s.verSeq)
if direct {
reqPacket.Opcode = proto.OpSyncRandomWrite
}
packSize := util.Min(size-total, util.BlockSize)
copy(reqPacket.Data[:packSize], req.Data[total:total+packSize])
reqPacket.Size = uint32(packSize)
reqPacket.CRC = crc32.ChecksumIEEE(reqPacket.Data[:packSize])
reqPacket.VerSeq = s.verSeq
replyPacket := new(Packet)
err = sc.Send(&retry, reqPacket, func(conn *net.TCPConn) (error, bool) {
e := replyPacket.ReadFromConnWithVer(conn, proto.ReadDeadlineTime)
if e != nil {
log.LogWarnf("Stream Writer doOverwrite: ino(%v) failed to read from connect, req(%v) err(%v)", s.inode, reqPacket, e)
// Upon receiving TryOtherAddrError, other hosts will be retried.
return TryOtherAddrError, false
}
log.LogDebugf("action[doOverwrite] streamer verseq (%v) datanode rsp seq (%v) code(%v)", s.verSeq, replyPacket.VerSeq, replyPacket.ResultCode)
if replyPacket.ResultCode == proto.OpAgain {
return nil, true
}
if replyPacket.ResultCode == proto.OpTryOtherAddr {
e = TryOtherAddrError
}
if replyPacket.ResultCode == proto.ErrCodeVersionOpError {
e = proto.ErrCodeVersionOp
log.LogDebugf("action[doOverwrite] .UpdateLatestVer verseq (%v) be updated by datanode rsp (%v) ", s.verSeq, replyPacket)
s.verSeq = replyPacket.VerSeq
s.extents.verSeq = s.verSeq
s.client.UpdateLatestVer(&proto.VolVersionInfoList{VerList: replyPacket.VerList})
return e, false
}
return e, false
})
proto.Buffers.Put(reqPacket.Data)
reqPacket.Data = nil
log.LogDebugf("doOverwrite: ino(%v) req(%v) reqPacket(%v) err(%v) replyPacket(%v)", s.inode, req, reqPacket, err, replyPacket)
if err != nil || replyPacket.ResultCode != proto.OpOk {
if replyPacket.ResultCode == proto.ErrCodeVersionOpError {
err = proto.ErrCodeVersionOp
log.LogWarnf("doOverwrite: need retry.ino(%v) req(%v) reqPacket(%v) err(%v) replyPacket(%v)", s.inode, req, reqPacket, err, replyPacket)
return
}
err = errors.New(fmt.Sprintf("doOverwrite: failed or reply NOK: err(%v) ino(%v) req(%v) replyPacket(%v)", err, s.inode, req, replyPacket))
break
}
if !reqPacket.isValidWriteReply(replyPacket) || reqPacket.CRC != replyPacket.CRC {
err = errors.New(fmt.Sprintf("doOverwrite: is not the corresponding reply, ino(%v) req(%v) replyPacket(%v)", s.inode, req, replyPacket))
break
}
total += packSize
}
return
}
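// tryInitExtentHandlerByLastEk tries to reuse the extent key that ends at the write
// offset so the write can continue as a sequential append. It returns true when that
// extent key carries a version seq different from the streamer's, meaning the extent
// cannot simply be extended under the current version.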
func (s *Streamer) tryInitExtentHandlerByLastEk(offset, size int) (isLastEkVerNotEqual bool) {
storeMode := s.GetStoreMod(offset, size)
getEndEkFunc := func() *proto.ExtentKey {
if ek := s.extents.GetEndForAppendWrite(uint64(offset), s.verSeq, false); ek != nil && !storage.IsTinyExtent(ek.ExtentId) {
return ek
}
return nil
}
checkVerFunc := func(currentEK *proto.ExtentKey) {
if currentEK.GetSeq() != s.verSeq {
log.LogDebugf("tryInitExtentHandlerByLastEk. exist ek seq %v vs request seq %v", currentEK.GetSeq(), s.verSeq)
if int(currentEK.ExtentOffset)+int(currentEK.Size)+size > util.ExtentSize {
s.closeOpenHandler()
return
}
isLastEkVerNotEqual = true
}
}
initExtentHandlerFunc := func(currentEK *proto.ExtentKey) {
checkVerFunc(currentEK)
log.LogDebugf("tryInitExtentHandlerByLastEk: found ek in ExtentCache, extent_id(%v) req_offset(%v) req_size(%v), currentEK [%v] streamer seq %v",
currentEK.ExtentId, offset, size, currentEK, s.verSeq)
_, pidErr := s.client.dataWrapper.GetDataPartition(currentEK.PartitionId)
if pidErr == nil {
seq := currentEK.GetSeq()
if isLastEkVerNotEqual {
seq = s.verSeq
}
log.LogDebugf("tryInitExtentHandlerByLastEk NewExtentHandler")
handler := NewExtentHandler(s, int(currentEK.FileOffset), storeMode, int(currentEK.Size))
handler.key = &proto.ExtentKey{
FileOffset: currentEK.FileOffset,
PartitionId: currentEK.PartitionId,
ExtentId: currentEK.ExtentId,
ExtentOffset: currentEK.ExtentOffset,
Size: currentEK.Size,
SnapInfo: &proto.ExtSnapInfo{
VerSeq: seq,
},
}
handler.lastKey = *currentEK
if s.handler != nil {
log.LogDebugf("tryInitExtentHandlerByLastEk: close old handler, currentEK.PartitionId(%v)",
currentEK.PartitionId)
s.closeOpenHandler()
}
s.handler = handler
s.dirty = false
log.LogDebugf("tryInitExtentHandlerByLastEk: currentEK.PartitionId(%v) found", currentEK.PartitionId)
} else {
log.LogDebugf("tryInitExtentHandlerByLastEk: currentEK.PartitionId(%v) not found", currentEK.PartitionId)
}
}
if storeMode == proto.NormalExtentType {
if s.handler == nil {
log.LogDebugf("tryInitExtentHandlerByLastEk: handler nil")
if ek := getEndEkFunc(); ek != nil {
initExtentHandlerFunc(ek)
}
} else {
if s.handler.fileOffset+s.handler.size == offset {
if s.handler.key != nil {
checkVerFunc(s.handler.key)
}
return
} else {
if ek := getEndEkFunc(); ek != nil {
log.LogDebugf("tryInitExtentHandlerByLastEk: getEndEkFunc get ek %v", ek)
initExtentHandlerFunc(ek)
} else {
log.LogDebugf("tryInitExtentHandlerByLastEk: not found ek")
}
}
}
}
return
}
// First, attempt sequential writes using neighboring extent keys. If the last extent has a different version,
// it indicates that the extent may have been fully utilized by the previous version.
// Next, try writing and directly checking the extent at the datanode. If the extent cannot be reused, create a new extent for writing.
func (s *Streamer) doWriteAppend(req *ExtentRequest, direct bool) (writeSize int, err error) {
var status int32
// try append write, get response
log.LogDebugf("action[streamer.write] doWriteAppend req: ExtentKey(%v) FileOffset(%v) size(%v)",
req.ExtentKey, req.FileOffset, req.Size)
if writeSize, err, status = s.doWriteAppendEx(req.Data, req.FileOffset, req.Size, direct, true); status == LastEKVersionNotEqual {
log.LogDebugf("action[streamer.write] tryDirectAppendWrite req %v FileOffset %v size %v", req.ExtentKey, req.FileOffset, req.Size)
if writeSize, _, err, status = s.tryDirectAppendWrite(req, direct); status == int32(proto.OpTryOtherExtent) {
log.LogDebugf("action[streamer.write] doWriteAppend again req %v FileOffset %v size %v", req.ExtentKey, req.FileOffset, req.Size)
writeSize, err, _ = s.doWriteAppendEx(req.Data, req.FileOffset, req.Size, direct, false)
}
}
log.LogDebugf("action[streamer.write] doWriteAppend status %v err %v", status, err)
return
}
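// doWriteAppendEx performs the actual append write. For hot volumes it reuses the current
// extent handler (or the last extent key when reUseEk is true) and retries with a fresh
// handler up to MaxNewHandlerRetry times; for cold volumes a new handler is created and
// closed for every write. The resulting extent key is cached locally for later requests.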
func (s *Streamer) doWriteAppendEx(data []byte, offset, size int, direct bool, reUseEk bool) (total int, err error, status int32) {
var (
ek *proto.ExtentKey
storeMode int
)
// Small files are usually written in a single write, so use tiny extent
// store only for the first write operation.
storeMode = s.GetStoreMod(offset, size)
log.LogDebugf("doWriteAppendEx enter: ino(%v) offset(%v) size(%v) storeMode(%v)", s.inode, offset, size, storeMode)
if proto.IsHot(s.client.volumeType) {
if reUseEk {
if isLastEkVerNotEqual := s.tryInitExtentHandlerByLastEk(offset, size); isLastEkVerNotEqual {
log.LogDebugf("doWriteAppendEx enter: ino(%v) tryInitExtentHandlerByLastEk worked but seq not equal", s.inode)
status = LastEKVersionNotEqual
return
}
} else if s.handler != nil {
s.closeOpenHandler()
}
for i := 0; i < MaxNewHandlerRetry; i++ {
if s.handler == nil {
s.handler = NewExtentHandler(s, offset, storeMode, 0)
s.dirty = false
} else if s.handler.storeMode != storeMode {
// store mode changed, so close open handler and start a new one
s.closeOpenHandler()
continue
}
ek, err = s.handler.write(data, offset, size, direct)
if err == nil && ek != nil {
ek.SetSeq(s.verSeq)
if !s.dirty {
s.dirtylist.Put(s.handler)
s.dirty = true
}
break
}
s.closeOpenHandler()
}
} else {
s.handler = NewExtentHandler(s, offset, storeMode, 0)
s.dirty = false
ek, err = s.handler.write(data, offset, size, direct)
if err == nil && ek != nil {
if !s.dirty {
s.dirtylist.Put(s.handler)
s.dirty = true
}
}
err = s.closeOpenHandler()
}
if err != nil || ek == nil {
log.LogErrorf("doWriteAppendEx error: ino(%v) offset(%v) size(%v) err(%v) ek(%v)", s.inode, offset, size, err, ek)
return
}
// This ek is just a local cache for PrepareWriteRequest, so ignore discard eks here.
_ = s.extents.Append(ek, false)
total = size
return
}
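// flush flushes every extent handler on the dirty list and removes it from the list;
// handlers that are no longer open are cleaned up afterwards.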
func (s *Streamer) flush() (err error) {
for {
element := s.dirtylist.Get()
if element == nil {
break
}
eh := element.Value.(*ExtentHandler)
log.LogDebugf("Streamer flush begin: eh(%v)", eh)
err = eh.flush()
if err != nil {
log.LogErrorf("Streamer flush failed: eh(%v)", eh)
return
}
eh.stream.dirtylist.Remove(element)
if eh.getStatus() == ExtentStatusOpen {
s.dirty = false
log.LogDebugf("Streamer flush handler open: eh(%v)", eh)
} else {
// TODO unhandled error
eh.cleanup()
log.LogDebugf("Streamer flush handler cleaned up: eh(%v)", eh)
}
log.LogDebugf("Streamer flush end: eh(%v)", eh)
}
return
}
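// traverse walks the dirty list in the background: closed handlers without inflight
// packets get their extent keys committed to the meta partition and are cleaned up,
// while still-open handlers are flushed once enough traversal rounds have passed.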
func (s *Streamer) traverse() (err error) {
s.traversed++
length := s.dirtylist.Len()
for i := 0; i < length; i++ {
element := s.dirtylist.Get()
if element == nil {
break
}
eh := element.Value.(*ExtentHandler)
log.LogDebugf("Streamer traverse begin: eh(%v)", eh)
if eh.getStatus() >= ExtentStatusClosed {
// the handler can be in a closed, recovery, or error state, so there may still be
// packets that have not been flushed yet.
eh.flushPacket()
if atomic.LoadInt32(&eh.inflight) > 0 {
log.LogDebugf("Streamer traverse skipped: non-zero inflight, eh(%v)", eh)
continue
}
err = eh.appendExtentKey()
if err != nil {
log.LogWarnf("Streamer traverse abort: appendExtentKey failed, eh(%v) err(%v)", eh, err)
// set the streamer to error status to avoid further writes
if err == syscall.EIO {
atomic.StoreInt32(&eh.stream.status, StreamerError)
}
return
}
s.dirtylist.Remove(element)
eh.cleanup()
} else {
if s.traversed < streamWriterFlushPeriod {
log.LogDebugf("Streamer traverse skipped: traversed(%v) eh(%v)", s.traversed, eh)
continue
}
if err = eh.flush(); err != nil {
log.LogWarnf("Streamer traverse flush: eh(%v) err(%v)", eh, err)
}
}
log.LogDebugf("Streamer traverse end: eh(%v)", eh)
}
return
}
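// closeOpenHandler closes the currently open extent handler and any recover handlers
// chained to it, flushing or forwarding their pending packets, and detaches the handler
// from the streamer. A handler that is not on the dirty list is cleaned up immediately.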
func (s *Streamer) closeOpenHandler() (err error) {
// just in case to avoid infinite loop
var cnt int = 2 * MaxPacketErrorCount
handler := s.handler
for handler != nil && cnt >= 0 {
handler.setClosed()
if s.dirtylist.Len() < MaxDirtyListLen {
handler.flushPacket()
} else {
// TODO unhandled error
err = s.handler.flush()
}
handler = handler.recoverHandler
cnt--
}
if s.handler != nil {
if !s.dirty {
// in case the current handler is not on the dirty list and will not get cleaned up
// TODO unhandled error
log.LogDebugf("action[Streamer.closeOpenHandler]")
s.handler.cleanup()
}
s.handler = nil
}
return err
}
func (s *Streamer) open() {
s.refcnt++
log.LogDebugf("open: streamer(%v) refcnt(%v)", s, s.refcnt)
}
func (s *Streamer) release() error {
s.refcnt--
s.closeOpenHandler()
err := s.flush()
if err != nil {
s.abort()
}
log.LogDebugf("release: streamer(%v) refcnt(%v)", s, s.refcnt)
return err
}
func (s *Streamer) evict() error {
s.client.streamerLock.Lock()
if s.refcnt > 0 || len(s.request) != 0 {
s.client.streamerLock.Unlock()
return errors.New(fmt.Sprintf("evict: streamer(%v) refcnt(%v)", s, s.refcnt))
}
if s.client.disableMetaCache || !s.needBCache {
delete(s.client.streamers, s.inode)
}
s.client.streamerLock.Unlock()
return nil
}
func (s *Streamer) abort() {
for {
element := s.dirtylist.Get()
if element == nil {
break
}
eh := element.Value.(*ExtentHandler)
s.dirtylist.Remove(element)
// TODO unhandled error
eh.cleanup()
}
}
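// truncate flushes outstanding writes, truncates the inode on the meta partition, and
// then adjusts the local extent cache: growing only updates the cached size, while
// shrinking discards truncated extents and re-fetches the extent list.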
func (s *Streamer) truncate(size int, fullPath string) error {
s.closeOpenHandler()
err := s.flush()
if err != nil {
return err
}
err = s.client.truncate(s.inode, uint64(size), fullPath)
if err != nil {
return err
}
oldsize, _ := s.extents.Size()
if oldsize <= size {
s.extents.SetSize(uint64(size), true)
return nil
}
s.extents.TruncDiscard(uint64(size))
return s.GetExtentsForce()
}
func (s *Streamer) updateVer(verSeq uint64) (err error) {
log.LogInfof("action[stream.updateVer] ver %v update to %v", s.verSeq, verSeq)
if s.verSeq != verSeq {
log.LogInfof("action[stream.updateVer] ver %v update to %v", s.verSeq, verSeq)
s.verSeq = verSeq
s.extents.verSeq = verSeq
}
return
}
func (s *Streamer) tinySizeLimit() int {
return util.DefaultTinySizeLimit
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package wrapper
import (
"fmt"
"net"
"strings"
"sync"
"syscall"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
// DataPartition defines the wrapper of the data partition.
type DataPartition struct {
// Will not be changed
proto.DataPartitionResponse
RandomWrite bool
NearHosts []string
ClientWrapper *Wrapper
Metrics *DataPartitionMetrics
}
// DataPartitionMetrics defines the wrapper of the metrics related to the data partition.
type DataPartitionMetrics struct {
sync.RWMutex
AvgReadLatencyNano int64
AvgWriteLatencyNano int64
SumReadLatencyNano int64
SumWriteLatencyNano int64
ReadOpNum int64
WriteOpNum int64
}
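// RecordWrite accumulates the latency of a single write operation into the partition
// metrics; startT is the operation start time in nanoseconds.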
func (dp *DataPartition) RecordWrite(startT int64) {
if startT == 0 {
log.LogWarnf("RecordWrite: invalid start time")
return
}
cost := time.Now().UnixNano() - startT
dp.Metrics.Lock()
defer dp.Metrics.Unlock()
dp.Metrics.WriteOpNum++
dp.Metrics.SumWriteLatencyNano += cost
return
}
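// MetricsRefresh recomputes the average read/write latencies from the accumulated sums
// and resets the counters for the next sampling period.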
func (dp *DataPartition) MetricsRefresh() {
dp.Metrics.Lock()
defer dp.Metrics.Unlock()
if dp.Metrics.ReadOpNum != 0 {
dp.Metrics.AvgReadLatencyNano = dp.Metrics.SumReadLatencyNano / dp.Metrics.ReadOpNum
} else {
dp.Metrics.AvgReadLatencyNano = 0
}
if dp.Metrics.WriteOpNum != 0 {
dp.Metrics.AvgWriteLatencyNano = dp.Metrics.SumWriteLatencyNano / dp.Metrics.WriteOpNum
} else {
dp.Metrics.AvgWriteLatencyNano = 0
}
dp.Metrics.SumReadLatencyNano = 0
dp.Metrics.SumWriteLatencyNano = 0
dp.Metrics.ReadOpNum = 0
dp.Metrics.WriteOpNum = 0
}
func (dp *DataPartition) GetAvgRead() int64 {
dp.Metrics.RLock()
defer dp.Metrics.RUnlock()
return dp.Metrics.AvgReadLatencyNano
}
func (dp *DataPartition) GetAvgWrite() int64 {
dp.Metrics.RLock()
defer dp.Metrics.RUnlock()
return dp.Metrics.AvgWriteLatencyNano
}
type DataPartitionSorter []*DataPartition
func (ds DataPartitionSorter) Len() int {
return len(ds)
}
func (ds DataPartitionSorter) Swap(i, j int) {
ds[i], ds[j] = ds[j], ds[i]
}
func (ds DataPartitionSorter) Less(i, j int) bool {
return ds[i].Metrics.AvgWriteLatencyNano < ds[j].Metrics.AvgWriteLatencyNano
}
// NewDataPartitionMetrics returns a new DataPartitionMetrics instance.
func NewDataPartitionMetrics() *DataPartitionMetrics {
metrics := new(DataPartitionMetrics)
return metrics
}
// String returns the string format of the data partition.
func (dp *DataPartition) String() string {
return fmt.Sprintf("PartitionID(%v) Type(%v), Status(%v) ReplicaNum(%v) Hosts(%v) NearHosts(%v)",
dp.PartitionID, dp.PartitionType, dp.Status, dp.ReplicaNum, dp.Hosts, dp.NearHosts)
}
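// CheckAllHostsIsAvail dials every replica host of the partition and records the hosts
// that refuse the connection into the exclude map.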
func (dp *DataPartition) CheckAllHostsIsAvail(exclude map[string]struct{}) {
var (
conn net.Conn
err error
)
for i := 0; i < len(dp.Hosts); i++ {
host := dp.Hosts[i]
if conn, err = util.DailTimeOut(host, proto.ReadDeadlineTime*time.Second); err != nil {
log.LogWarnf("CheckAllHostsIsAvail: dial host (%v) err(%v)", host, err)
if strings.Contains(err.Error(), syscall.ECONNREFUSED.Error()) {
exclude[host] = struct{}{}
}
continue
}
conn.Close()
}
}
// GetAllAddrs returns the addresses of all the replicas of the data partition.
func (dp *DataPartition) GetAllAddrs() string {
return strings.Join(dp.Hosts[1:], proto.AddrSplit) + proto.AddrSplit
}
func isExcluded(dp *DataPartition, exclude map[string]struct{}) bool {
for _, host := range dp.Hosts {
if _, exist := exclude[host]; exist {
return true
}
}
return false
}
// Copyright 2020 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package wrapper
import (
"errors"
"strings"
"github.com/cubefs/cubefs/util/log"
)
// This type defines the constructor used to create and initialize the selector.
type DataPartitionSelectorConstructor = func(param string) (DataPartitionSelector, error)
// DataPartitionSelector is the interface that defines the methods necessary to
// implement a data partition selector.
type DataPartitionSelector interface {
// Name returns the name of the current selector instance.
Name() string
// Refresh refreshes the selector with the specified data partitions.
Refresh(partitions []*DataPartition) error
// Select returns a data partition picked by the selector.
Select(excludes map[string]struct{}) (*DataPartition, error)
// RemoveDP removes the specified data partition.
RemoveDP(partitionID uint64)
// Count returns the number of data partitions held by the selector.
Count() int
}
var (
dataPartitionSelectorConstructors = make(map[string]DataPartitionSelectorConstructor)
ErrDuplicatedDataPartitionSelectorConstructor = errors.New("duplicated data partition selector constructor")
ErrDataPartitionSelectorConstructorNotExist = errors.New("data partition selector constructor not exist")
)
// RegisterDataPartitionSelector registers a selector constructor.
// Users can register their own defined selector through this method.
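// For example (illustrative sketch; mySelector is a hypothetical type that
// implements DataPartitionSelector):
//
//	func init() {
//		_ = RegisterDataPartitionSelector("myselector", func(param string) (DataPartitionSelector, error) {
//			return &mySelector{}, nil
//		})
//	}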
func RegisterDataPartitionSelector(name string, constructor DataPartitionSelectorConstructor) error {
clearName := strings.TrimSpace(strings.ToLower(name))
if _, exist := dataPartitionSelectorConstructors[clearName]; exist {
return ErrDuplicatedDataPartitionSelectorConstructor
}
dataPartitionSelectorConstructors[clearName] = constructor
return nil
}
func newDataPartitionSelector(name string, param string) (newDpSelector DataPartitionSelector, err error) {
clearName := strings.TrimSpace(strings.ToLower(name))
constructor, exist := dataPartitionSelectorConstructors[clearName]
if !exist {
return nil, ErrDataPartitionSelectorConstructorNotExist
}
return constructor(param)
}
func (w *Wrapper) initDpSelector() (err error) {
w.dpSelectorChanged = false
selectorName := w.dpSelectorName
if strings.TrimSpace(selectorName) == "" {
log.LogInfof("initDpSelector: can not find dp selector[%v], use default selector", w.dpSelectorName)
selectorName = DefaultRandomSelectorName
}
var selector DataPartitionSelector
if selector, err = newDataPartitionSelector(selectorName, w.dpSelectorParm); err != nil {
log.LogErrorf("initDpSelector: dpSelector[%v] init failed caused by [%v], use default selector", w.dpSelectorName,
err)
return
}
w.dpSelector = selector
return
}
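// refreshDpSelector rebuilds the selector when its name or parameter has changed on the
// master, then refreshes it with the latest read-write partitions.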
func (w *Wrapper) refreshDpSelector(partitions []*DataPartition) {
w.Lock.RLock()
dpSelector := w.dpSelector
dpSelectorChanged := w.dpSelectorChanged
w.Lock.RUnlock()
if dpSelectorChanged {
selectorName := w.dpSelectorName
if strings.TrimSpace(selectorName) == "" {
log.LogWarnf("refreshDpSelector: can not find dp selector[%v], use default selector", w.dpSelectorName)
selectorName = DefaultRandomSelectorName
}
newDpSelector, err := newDataPartitionSelector(selectorName, w.dpSelectorParm)
if err != nil {
log.LogErrorf("refreshDpSelector: change dpSelector to [%v %v] failed caused by [%v],"+
" use last valid selector. Please change dpSelector config through master.",
w.dpSelectorName, w.dpSelectorParm, err)
} else {
w.Lock.Lock()
log.LogInfof("refreshDpSelector: change dpSelector to [%v %v]", w.dpSelectorName, w.dpSelectorParm)
w.dpSelector = newDpSelector
w.dpSelectorChanged = false
dpSelector = newDpSelector
w.Lock.Unlock()
}
}
_ = dpSelector.Refresh(partitions)
}
// getDataPartitionForWrite returns an available data partition for write.
func (w *Wrapper) GetDataPartitionForWrite(exclude map[string]struct{}) (*DataPartition, error) {
w.Lock.RLock()
dpSelector := w.dpSelector
w.Lock.RUnlock()
return dpSelector.Select(exclude)
}
func (w *Wrapper) RemoveDataPartitionForWrite(partitionID uint64) {
w.Lock.RLock()
dpSelector := w.dpSelector
w.Lock.RUnlock()
if dpSelector.Count() <= 1 {
return
}
dpSelector.RemoveDP(partitionID)
}
// Copyright 2020 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package wrapper
import (
"fmt"
"math/rand"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/util/log"
)
const (
DefaultRandomSelectorName = "default"
)
func init() {
_ = RegisterDataPartitionSelector(DefaultRandomSelectorName, newDefaultRandomSelector)
}
func newDefaultRandomSelector(_ string) (selector DataPartitionSelector, e error) {
selector = &DefaultRandomSelector{
localLeaderPartitions: make([]*DataPartition, 0),
partitions: make([]*DataPartition, 0),
}
return
}
type DefaultRandomSelector struct {
sync.RWMutex
localLeaderPartitions []*DataPartition
partitions []*DataPartition
}
func (s *DefaultRandomSelector) Name() string {
return DefaultRandomSelectorName
}
func (s *DefaultRandomSelector) Refresh(partitions []*DataPartition) (err error) {
var localLeaderPartitions []*DataPartition
for i := 0; i < len(partitions); i++ {
if strings.Split(partitions[i].Hosts[0], ":")[0] == LocalIP {
localLeaderPartitions = append(localLeaderPartitions, partitions[i])
}
}
s.Lock()
defer s.Unlock()
s.localLeaderPartitions = localLeaderPartitions
s.partitions = partitions
return
}
func (s *DefaultRandomSelector) Select(exclude map[string]struct{}) (dp *DataPartition, err error) {
dp = s.getLocalLeaderDataPartition(exclude)
if dp != nil {
return dp, nil
}
s.RLock()
partitions := s.partitions
s.RUnlock()
dp = s.getRandomDataPartition(partitions, exclude)
if dp != nil {
return dp, nil
}
log.LogErrorf("DefaultRandomSelector: no writable data partition with %v partitions and exclude(%v)",
len(partitions), exclude)
return nil, fmt.Errorf("no writable data partition")
}
func (s *DefaultRandomSelector) RemoveDP(partitionID uint64) {
s.RLock()
rwPartitionGroups := s.partitions
localLeaderPartitions := s.localLeaderPartitions
s.RUnlock()
var i int
for i = 0; i < len(rwPartitionGroups); i++ {
if rwPartitionGroups[i].PartitionID == partitionID {
break
}
}
if i >= len(rwPartitionGroups) {
return
}
newRwPartition := make([]*DataPartition, 0)
newRwPartition = append(newRwPartition, rwPartitionGroups[:i]...)
newRwPartition = append(newRwPartition, rwPartitionGroups[i+1:]...)
defer func() {
s.Lock()
s.partitions = newRwPartition
s.Unlock()
}()
for i = 0; i < len(localLeaderPartitions); i++ {
if localLeaderPartitions[i].PartitionID == partitionID {
break
}
}
if i >= len(localLeaderPartitions) {
return
}
newLocalLeaderPartitions := make([]*DataPartition, 0)
newLocalLeaderPartitions = append(newLocalLeaderPartitions, localLeaderPartitions[:i]...)
newLocalLeaderPartitions = append(newLocalLeaderPartitions, localLeaderPartitions[i+1:]...)
s.Lock()
defer s.Unlock()
s.localLeaderPartitions = newLocalLeaderPartitions
return
}
func (s *DefaultRandomSelector) Count() int {
s.RLock()
defer s.RUnlock()
return len(s.partitions)
}
func (s *DefaultRandomSelector) getLocalLeaderDataPartition(exclude map[string]struct{}) *DataPartition {
s.RLock()
localLeaderPartitions := s.localLeaderPartitions
s.RUnlock()
return s.getRandomDataPartition(localLeaderPartitions, exclude)
}
func (s *DefaultRandomSelector) getRandomDataPartition(partitions []*DataPartition, exclude map[string]struct{}) (
dp *DataPartition) {
length := len(partitions)
if length == 0 {
return nil
}
rand.Seed(time.Now().UnixNano())
index := rand.Intn(length)
dp = partitions[index]
if !isExcluded(dp, exclude) {
log.LogDebugf("DefaultRandomSelector: select dp[%v] address[%p], index %v", dp, dp, index)
return dp
}
log.LogWarnf("DefaultRandomSelector: first random partition was excluded, get partition from others")
var currIndex int
for i := 0; i < length; i++ {
currIndex = (index + i) % length
if !isExcluded(partitions[currIndex], exclude) {
log.LogDebugf("DefaultRandomSelector: select dp[%v], index %v", partitions[currIndex], currIndex)
return partitions[currIndex]
}
}
return nil
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package wrapper
import (
"fmt"
"math/rand"
"strconv"
"sync"
"time"
"github.com/cubefs/cubefs/util/log"
)
const (
KFasterRandomSelectorName = "kfaster"
)
func init() {
_ = RegisterDataPartitionSelector(KFasterRandomSelectorName, newKFasterRandomSelector)
}
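// newKFasterRandomSelector builds a selector that prefers the k fastest partitions.
// selectorParam is the percentage (1-99) of partitions, ranked by average write latency,
// that are treated as the "faster" candidate set.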
func newKFasterRandomSelector(selectorParam string) (selector DataPartitionSelector, e error) {
param, err := strconv.Atoi(selectorParam)
if err != nil {
return nil, fmt.Errorf("KFasterRandomSelector: get param failed[%v]", err)
}
if (param <= 0) || (param >= 100) {
return nil, fmt.Errorf("KFasterRandomSelector: invalid param[%v]", param)
}
selector = &KFasterRandomSelector{
kValueHundred: param,
partitions: make([]*DataPartition, 0),
}
log.LogInfof("KFasterRandomSelector: init selector success, kValueHundred is %v", param)
return
}
type KFasterRandomSelector struct {
sync.RWMutex
kValueHundred int
kValue int
partitions []*DataPartition
}
func (s *KFasterRandomSelector) Name() string {
return KFasterRandomSelectorName
}
func (s *KFasterRandomSelector) Refresh(partitions []*DataPartition) (err error) {
kValue := (len(partitions)-1)*s.kValueHundred/100 + 1
selectKminDataPartition(partitions, kValue)
s.Lock()
defer s.Unlock()
s.kValue = kValue
s.partitions = partitions
return
}
func (s *KFasterRandomSelector) Select(exclude map[string]struct{}) (dp *DataPartition, err error) {
s.RLock()
partitions := s.partitions
kValue := s.kValue
s.RUnlock()
if len(partitions) == 0 {
log.LogError("KFasterRandomSelector: no writable data partition with empty partitions")
return nil, fmt.Errorf("no writable data partition")
}
// select random dataPartition from fasterRwPartitions
rand.Seed(time.Now().UnixNano())
index := rand.Intn(kValue)
dp = partitions[index]
if !isExcluded(dp, exclude) {
log.LogDebugf("KFasterRandomSelector: select faster dp[%v], index %v, kValue(%v/%v)",
dp, index, kValue, len(partitions))
return dp, nil
}
log.LogWarnf("KFasterRandomSelector: first random fasterRwPartition was excluded, get partition from other faster")
// if partitions[index] is excluded, select next in fasterRwPartitions
for i := 1; i < kValue; i++ {
dp = partitions[(index+i)%kValue]
if !isExcluded(dp, exclude) {
log.LogDebugf("KFasterRandomSelector: select faster dp[%v], index %v, kValue(%v/%v)",
dp, (index+i)%kValue, kValue, len(partitions))
return dp, nil
}
}
log.LogWarnf("KFasterRandomSelector: all fasterRwPartitions were excluded, get partition from slower")
// if all fasterRwPartitions are excluded, select random dataPartition in slowerRwPartitions
slowerRwPartitionsNum := len(partitions) - kValue
for i := 0; i < slowerRwPartitionsNum; i++ {
dp = partitions[(index+i)%slowerRwPartitionsNum+kValue]
if !isExcluded(dp, exclude) {
log.LogDebugf("KFasterRandomSelector: select slower dp[%v], index %v, kValue(%v/%v)",
dp, (index+i)%slowerRwPartitionsNum+kValue, kValue, len(partitions))
return dp, nil
}
}
log.LogErrorf("KFasterRandomSelector: no writable data partition with %v partitions and exclude(%v)",
len(partitions), exclude)
return nil, fmt.Errorf("no writable data partition")
}
func (s *KFasterRandomSelector) RemoveDP(partitionID uint64) {
s.RLock()
partitions := s.partitions
s.RUnlock()
var i int
for i = 0; i < len(partitions); i++ {
if partitions[i].PartitionID == partitionID {
break
}
}
if i >= len(partitions) {
return
}
newRwPartition := make([]*DataPartition, 0)
newRwPartition = append(newRwPartition, partitions[:i]...)
newRwPartition = append(newRwPartition, partitions[i+1:]...)
s.Refresh(newRwPartition)
return
}
func (s *KFasterRandomSelector) Count() int {
s.RLock()
defer s.RUnlock()
return len(s.partitions)
}
func swap(s []*DataPartition, i int, j int) {
s[i], s[j] = s[j], s[i]
}
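// partByPrivot partitions partitions[low..high] around the element at low, ordered by
// average write latency, and returns the pivot's final index.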
func partByPrivot(partitions []*DataPartition, low, high int) int {
var i, j int
for {
for i = low + 1; i < high; i++ {
if partitions[i].GetAvgWrite() > partitions[low].GetAvgWrite() {
break
}
}
for j = high; j > low; j-- {
if partitions[j].GetAvgWrite() <= partitions[low].GetAvgWrite() {
break
}
}
if i >= j {
break
}
swap(partitions, i, j)
}
if low != j {
swap(partitions, low, j)
}
return j
}
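// selectKminDataPartition partially sorts partitions with a quickselect so that the k
// partitions with the lowest average write latency end up at the front of the slice.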
func selectKminDataPartition(partitions []*DataPartition, k int) int {
if len(partitions) <= 1 {
return k
}
low, high := 0, len(partitions)-1
for {
privot := partByPrivot(partitions, low, high)
if privot < k {
low = privot + 1
} else if privot > k {
high = privot - 1
} else {
return k
}
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package wrapper
import (
"fmt"
syslog "log"
"math"
"net"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
masterSDK "github.com/cubefs/cubefs/sdk/master"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/iputil"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/ump"
)
var (
LocalIP string
DefaultMinWriteAbleDataPartitionCnt = 10
)
type DataPartitionView struct {
DataPartitions []*DataPartition
}
type SimpleClientInfo interface {
GetFlowInfo() (*proto.ClientReportLimitInfo, bool)
UpdateFlowInfo(limit *proto.LimitRsp2Client)
SetClientID(id uint64) error
UpdateLatestVer(verList *proto.VolVersionInfoList) error
GetReadVer() uint64
GetLatestVer() uint64
GetVerMgr() *proto.VolVersionInfoList
}
// Wrapper TODO rename. This name does not reflect what it is doing.
type Wrapper struct {
Lock sync.RWMutex
clusterName string
volName string
volType int
EnablePosixAcl bool
masters []string
partitions map[uint64]*DataPartition
followerRead bool
followerReadClientCfg bool
nearRead bool
dpSelectorChanged bool
dpSelectorName string
dpSelectorParm string
mc *masterSDK.MasterClient
stopOnce sync.Once
stopC chan struct{}
dpSelector DataPartitionSelector
HostsStatus map[string]bool
Uids map[uint32]*proto.UidSimpleInfo
UidLock sync.RWMutex
preload bool
LocalIp string
minWriteAbleDataPartitionCnt int
verConfReadSeq uint64
verReadSeq uint64
SimpleClient SimpleClientInfo
}
func (w *Wrapper) GetMasterClient() *masterSDK.MasterClient {
return w.mc
}
// NewDataPartitionWrapper returns a new data partition wrapper.
func NewDataPartitionWrapper(client SimpleClientInfo, volName string, masters []string, preload bool, minWriteAbleDataPartitionCnt int, verReadSeq uint64) (w *Wrapper, err error) {
log.LogInfof("action[NewDataPartitionWrapper] verReadSeq %v", verReadSeq)
w = new(Wrapper)
w.stopC = make(chan struct{})
w.masters = masters
w.mc = masterSDK.NewMasterClient(masters, false)
w.volName = volName
w.partitions = make(map[uint64]*DataPartition)
w.HostsStatus = make(map[string]bool)
w.preload = preload
w.minWriteAbleDataPartitionCnt = minWriteAbleDataPartitionCnt
if w.minWriteAbleDataPartitionCnt < 0 {
w.minWriteAbleDataPartitionCnt = DefaultMinWriteAbleDataPartitionCnt
}
if w.LocalIp, err = ump.GetLocalIpAddr(); err != nil {
err = errors.Trace(err, "NewDataPartitionWrapper:")
return
}
if err = w.updateClusterInfo(); err != nil {
err = errors.Trace(err, "NewDataPartitionWrapper:")
return
}
if err = w.GetSimpleVolView(); err != nil {
err = errors.Trace(err, "NewDataPartitionWrapper:")
return
}
w.UploadFlowInfo(client, true)
if err = w.initDpSelector(); err != nil {
log.LogErrorf("NewDataPartitionWrapper: init initDpSelector failed, [%v]", err)
}
if err = w.updateDataPartition(true); err != nil {
err = errors.Trace(err, "NewDataPartitionWrapper:")
return
}
if err = w.updateDataNodeStatus(); err != nil {
log.LogErrorf("NewDataPartitionWrapper: init DataNodeStatus failed, [%v]", err)
}
w.verConfReadSeq = verReadSeq
if verReadSeq > 0 {
var verList *proto.VolVersionInfoList
if verList, err = w.mc.AdminAPI().GetVerList(volName); err != nil {
return
}
if verReadSeq, err = w.CheckReadVerSeq(volName, verReadSeq, verList); err != nil {
log.LogErrorf("NewDataPartitionWrapper: init Read with ver [%v] error [%v]", verReadSeq, err)
return
}
}
w.verReadSeq = verReadSeq
w.SimpleClient = client
go w.uploadFlowInfoByTick(client)
go w.update(client)
return
}
func (w *Wrapper) Stop() {
w.stopOnce.Do(func() {
close(w.stopC)
})
}
func (w *Wrapper) InitFollowerRead(clientConfig bool) {
w.followerReadClientCfg = clientConfig
w.followerRead = w.followerReadClientCfg || w.followerRead
}
func (w *Wrapper) FollowerRead() bool {
return w.followerRead
}
func (w *Wrapper) tryGetPartition(index uint64) (partition *DataPartition, ok bool) {
w.Lock.RLock()
defer w.Lock.RUnlock()
partition, ok = w.partitions[index]
return
}
func (w *Wrapper) updateClusterInfo() (err error) {
var info *proto.ClusterInfo
if info, err = w.mc.AdminAPI().GetClusterInfo(); err != nil {
log.LogWarnf("UpdateClusterInfo: get cluster info fail: err(%v)", err)
return
}
log.LogInfof("UpdateClusterInfo: get cluster info: cluster(%v) localIP(%v)", info.Cluster, info.Ip)
w.clusterName = info.Cluster
LocalIP = info.Ip
return
}
func (w *Wrapper) UpdateUidsView(view *proto.SimpleVolView) {
w.UidLock.Lock()
defer w.UidLock.Unlock()
w.Uids = make(map[uint32]*proto.UidSimpleInfo)
for _, uid := range view.Uids {
if !uid.Limited {
continue
}
w.Uids[uid.UID] = &uid
}
log.LogDebugf("uid info be updated to %v", view.Uids)
}
func (w *Wrapper) GetSimpleVolView() (err error) {
var view *proto.SimpleVolView
if view, err = w.mc.AdminAPI().GetVolumeSimpleInfo(w.volName); err != nil {
log.LogWarnf("GetSimpleVolView: get volume simple info fail: volume(%v) err(%v)", w.volName, err)
return
}
if view.Status == 1 {
log.LogWarnf("GetSimpleVolView: volume has been marked for deletion: volume(%v) status(%v - 0:normal/1:markDelete)",
w.volName, view.Status)
return proto.ErrVolNotExists
}
w.followerRead = view.FollowerRead
w.dpSelectorName = view.DpSelectorName
w.dpSelectorParm = view.DpSelectorParm
w.volType = view.VolType
w.EnablePosixAcl = view.EnablePosixAcl
w.UpdateUidsView(view)
log.LogDebugf("GetSimpleVolView: get volume simple info: ID(%v) name(%v) owner(%v) status(%v) capacity(%v) "+
"metaReplicas(%v) dataReplicas(%v) mpCnt(%v) dpCnt(%v) followerRead(%v) createTime(%v) dpSelectorName(%v) "+
"dpSelectorParm(%v) uids(%v)",
view.ID, view.Name, view.Owner, view.Status, view.Capacity, view.MpReplicaNum, view.DpReplicaNum, view.MpCnt,
view.DpCnt, view.FollowerRead, view.CreateTime, view.DpSelectorName, view.DpSelectorParm, view.Uids)
return
}
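// uploadFlowInfoByTick reports client flow statistics to the master every 5 seconds
// until the wrapper is stopped.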
func (w *Wrapper) uploadFlowInfoByTick(clientInfo SimpleClientInfo) {
ticker := time.NewTicker(5 * time.Second)
for {
select {
case <-ticker.C:
w.UploadFlowInfo(clientInfo, false)
case <-w.stopC:
return
}
}
}
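// update refreshes the volume view, data partitions, data node status, permissions and
// version list once per minute until the wrapper is stopped.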
func (w *Wrapper) update(clientInfo SimpleClientInfo) {
ticker := time.NewTicker(time.Minute)
taskFunc := func() {
w.updateSimpleVolView()
w.updateDataPartition(false)
w.updateDataNodeStatus()
w.CheckPermission()
w.updateVerlist(clientInfo)
}
taskFunc()
for {
select {
case <-ticker.C:
taskFunc()
case <-w.stopC:
return
}
}
}
func (w *Wrapper) UploadFlowInfo(clientInfo SimpleClientInfo, init bool) (err error) {
var limitRsp *proto.LimitRsp2Client
flowInfo, isNeedReport := clientInfo.GetFlowInfo()
if !isNeedReport {
log.LogDebugf("action[UploadFlowInfo] no need report!")
return nil
}
if limitRsp, err = w.mc.AdminAPI().UploadFlowInfo(w.volName, flowInfo); err != nil {
log.LogWarnf("UpdateSimpleVolView: get volume simple info fail: volume(%v) err(%v)", w.volName, err)
return
}
if init {
if limitRsp.ID == 0 {
err = fmt.Errorf("init client get id 0")
log.LogInfof("action[UploadFlowInfo] err %v", err.Error())
return
}
log.LogInfof("action[UploadFlowInfo] get id %v", limitRsp.ID)
clientInfo.SetClientID(limitRsp.ID)
}
clientInfo.UpdateFlowInfo(limitRsp)
return
}
func (w *Wrapper) CheckPermission() {
if info, err := w.mc.UserAPI().AclOperation(w.volName, w.LocalIp, util.AclCheckIP); err != nil {
syslog.Println(err)
} else if !info.OK {
syslog.Println("Client Addr not allowed to access CubeFS Cluster!")
log.LogFatal("Client Addr not allowed to access CubeFS Cluster!")
}
}
func (w *Wrapper) updateVerlist(client SimpleClientInfo) (err error) {
verList, err := w.mc.AdminAPI().GetVerList(w.volName)
if err != nil {
log.LogErrorf("CheckReadVerSeq: get cluster fail: err(%v)", err)
return err
}
if verList == nil {
msg := fmt.Sprintf("get verList nil, vol [%v] reqd seq [%v]", w.volName, w.verReadSeq)
log.LogErrorf("action[CheckReadVerSeq] %v", msg)
return fmt.Errorf("%v", msg)
}
if w.verReadSeq > 0 {
if _, err = w.CheckReadVerSeq(w.volName, w.verConfReadSeq, verList); err != nil {
log.LogFatalf("updateSimpleVolView: readSeq abnormal %v", err)
}
return
}
log.LogDebugf("updateSimpleVolView.UpdateLatestVer.try update to verlist[%v]", verList)
if err = client.UpdateLatestVer(verList); err != nil {
log.LogWarnf("updateSimpleVolView: UpdateLatestVer ver %v faile err %v", verList.GetLastVer(), err)
return
}
return
}
func (w *Wrapper) updateSimpleVolView() (err error) {
var view *proto.SimpleVolView
if view, err = w.mc.AdminAPI().GetVolumeSimpleInfo(w.volName); err != nil {
log.LogWarnf("updateSimpleVolView: get volume simple info fail: volume(%v) err(%v)", w.volName, err)
return
}
w.UpdateUidsView(view)
if w.followerRead != view.FollowerRead && !w.followerReadClientCfg {
log.LogDebugf("UpdateSimpleVolView: update followerRead from old(%v) to new(%v)",
w.followerRead, view.FollowerRead)
w.followerRead = view.FollowerRead
}
if w.dpSelectorName != view.DpSelectorName || w.dpSelectorParm != view.DpSelectorParm {
log.LogDebugf("UpdateSimpleVolView: update dpSelector from old(%v %v) to new(%v %v)",
w.dpSelectorName, w.dpSelectorParm, view.DpSelectorName, view.DpSelectorParm)
w.Lock.Lock()
w.dpSelectorName = view.DpSelectorName
w.dpSelectorParm = view.DpSelectorParm
w.dpSelectorChanged = true
w.Lock.Unlock()
}
return nil
}
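// updateDataPartitionByRsp merges the partitions returned by the master into the local
// cache and refreshes the selector with the read-write ones. Preload partitions of cold
// volumes are cached but never handed to the selector, and the selector is only refreshed
// when enough writable partitions are available (or on the initial mount).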
func (w *Wrapper) updateDataPartitionByRsp(isInit bool, DataPartitions []*proto.DataPartitionResponse) (err error) {
convert := func(response *proto.DataPartitionResponse) *DataPartition {
return &DataPartition{
DataPartitionResponse: *response,
ClientWrapper: w,
}
}
if proto.IsCold(w.volType) {
w.clearPartitions()
}
rwPartitionGroups := make([]*DataPartition, 0)
for index, partition := range DataPartitions {
if partition == nil {
log.LogErrorf("action[updateDataPartitionByRsp] index [%v] is nil", index)
continue
}
dp := convert(partition)
if w.followerRead && w.nearRead {
dp.NearHosts = w.sortHostsByDistance(dp.Hosts)
}
log.LogInfof("updateDataPartition: dp(%v)", dp)
w.replaceOrInsertPartition(dp)
// do not insert preload dp in cold vol
if proto.IsCold(w.volType) && proto.IsPreLoadDp(dp.PartitionType) {
continue
}
if dp.Status == proto.ReadWrite {
dp.MetricsRefresh()
rwPartitionGroups = append(rwPartitionGroups, dp)
log.LogInfof("updateDataPartition: dp(%v) address(%p) insert to rwPartitionGroups", dp.PartitionID, dp)
}
}
// isInit indicates whether this call is triggered by the mount action
if isInit || len(rwPartitionGroups) >= w.minWriteAbleDataPartitionCnt || (proto.IsCold(w.volType) && (len(rwPartitionGroups) >= 1)) {
log.LogInfof("updateDataPartition: refresh dpSelector of volume(%v) with %v rw partitions(%v all), isInit(%v), minWriteAbleDataPartitionCnt(%v)",
w.volName, len(rwPartitionGroups), len(DataPartitions), isInit, w.minWriteAbleDataPartitionCnt)
w.refreshDpSelector(rwPartitionGroups)
} else {
err = errors.New("updateDataPartition: no writable data partition")
log.LogWarnf("updateDataPartition: no enough writable data partitions, volume(%v) with %v rw partitions(%v all), isInit(%v), minWriteAbleDataPartitionCnt(%v)",
w.volName, len(rwPartitionGroups), len(DataPartitions), isInit, w.minWriteAbleDataPartitionCnt)
}
log.LogInfof("updateDataPartition: finish")
return err
}
func (w *Wrapper) updateDataPartition(isInit bool) (err error) {
if w.preload {
return
}
var dpv *proto.DataPartitionsView
if dpv, err = w.mc.ClientAPI().EncodingGzip().GetDataPartitions(w.volName); err != nil {
log.LogErrorf("updateDataPartition: get data partitions fail: volume(%v) err(%v)", w.volName, err)
return
}
log.LogInfof("updateDataPartition: get data partitions: volume(%v) partitions(%v)", w.volName, len(dpv.DataPartitions))
return w.updateDataPartitionByRsp(isInit, dpv.DataPartitions)
}
func (w *Wrapper) UpdateDataPartition() (err error) {
return w.updateDataPartition(false)
}
// getDataPartitionFromMaster asks the master for a data partition that is not yet in the
// local cache. The cached view is refreshed by updateDataPartition, which may be stale
// when a proxy such as nginx is placed in front of the master to reduce its load.
func (w *Wrapper) getDataPartitionFromMaster(isInit bool, dpId uint64) (err error) {
var dpInfo *proto.DataPartitionInfo
if dpInfo, err = w.mc.AdminAPI().GetDataPartition(w.volName, dpId); err != nil {
log.LogErrorf("getDataPartitionFromMaster: get data partitions fail: volume(%v) dpId(%v) err(%v)",
w.volName, dpId, err)
return
}
log.LogInfof("getDataPartitionFromMaster: get data partitions: volume(%v), dpId(%v)", w.volName, dpId)
var leaderAddr string
for _, replica := range dpInfo.Replicas {
if replica.IsLeader {
leaderAddr = replica.Addr
}
}
dpr := new(proto.DataPartitionResponse)
dpr.PartitionID = dpId
dpr.Status = dpInfo.Status
dpr.ReplicaNum = dpInfo.ReplicaNum
dpr.Hosts = make([]string, len(dpInfo.Hosts))
copy(dpr.Hosts, dpInfo.Hosts)
dpr.LeaderAddr = leaderAddr
dpr.IsRecover = dpInfo.IsRecover
dpr.IsDiscard = dpInfo.IsDiscard
DataPartitions := make([]*proto.DataPartitionResponse, 0, 1)
DataPartitions = append(DataPartitions, dpr)
return w.updateDataPartitionByRsp(isInit, DataPartitions)
}
func (w *Wrapper) clearPartitions() {
w.Lock.Lock()
defer w.Lock.Unlock()
w.partitions = make(map[uint64]*DataPartition)
}
func (w *Wrapper) AllocatePreLoadDataPartition(volName string, count int, capacity, ttl uint64, zones string) (err error) {
var dpv *proto.DataPartitionsView
if dpv, err = w.mc.AdminAPI().CreatePreLoadDataPartition(volName, count, capacity, ttl, zones); err != nil {
log.LogWarnf("CreatePreLoadDataPartition fail: err(%v)", err)
return
}
convert := func(response *proto.DataPartitionResponse) *DataPartition {
return &DataPartition{
DataPartitionResponse: *response,
ClientWrapper: w,
}
}
rwPartitionGroups := make([]*DataPartition, 0)
for _, partition := range dpv.DataPartitions {
dp := convert(partition)
if proto.IsCold(w.volType) && !proto.IsPreLoadDp(dp.PartitionType) {
continue
}
log.LogInfof("updateDataPartition: dp(%v)", dp)
w.replaceOrInsertPartition(dp)
dp.MetricsRefresh()
rwPartitionGroups = append(rwPartitionGroups, dp)
}
w.refreshDpSelector(rwPartitionGroups)
return nil
}
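// replaceOrInsertPartition updates the cached entry of the partition in place (keeping
// its metrics) or inserts a new entry with fresh metrics.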
func (w *Wrapper) replaceOrInsertPartition(dp *DataPartition) {
var oldstatus int8
w.Lock.Lock()
old, ok := w.partitions[dp.PartitionID]
if ok {
oldstatus = old.Status
old.Status = dp.Status
old.ReplicaNum = dp.ReplicaNum
old.Hosts = dp.Hosts
old.IsDiscard = dp.IsDiscard
old.NearHosts = dp.Hosts
dp.Metrics = old.Metrics
} else {
dp.Metrics = NewDataPartitionMetrics()
w.partitions[dp.PartitionID] = dp
}
w.Lock.Unlock()
if ok && oldstatus != dp.Status {
log.LogInfof("partition:dp[%v] address %p status change (%v) -> (%v)", dp.PartitionID, &old, oldstatus, dp.Status)
}
}
// GetDataPartition returns the data partition based on the given partition ID.
func (w *Wrapper) GetDataPartition(partitionID uint64) (*DataPartition, error) {
dp, ok := w.tryGetPartition(partitionID)
if !ok && !proto.IsCold(w.volType) { // cache miss && hot volume
err := w.getDataPartitionFromMaster(false, partitionID)
if err == nil {
dp, ok = w.tryGetPartition(partitionID)
if !ok {
return nil, fmt.Errorf("partition[%v] not exsit", partitionID)
}
return dp, nil
}
return nil, fmt.Errorf("partition[%v] not exsit", partitionID)
}
if !ok {
return nil, fmt.Errorf("partition[%v] not exsit", partitionID)
}
return dp, nil
}
func (w *Wrapper) GetReadVerSeq() uint64 {
return w.verReadSeq
}
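// CheckReadVerSeq validates the requested read version against the volume's version list
// and returns the effective read version, which also covers uncommitted versions up to
// (but not including) the next normal version.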
func (w *Wrapper) CheckReadVerSeq(volName string, verReadSeq uint64, verList *proto.VolVersionInfoList) (readReadVer uint64, err error) {
w.Lock.RLock()
defer w.Lock.RUnlock()
log.LogInfof("action[CheckReadVerSeq] vol [%v] req seq [%v]", volName, verReadSeq)
readReadVer = verReadSeq
// Whether it is version 0 or any other version, there may be uncommitted versions between the requested version
// and the next official version. In this case, the data needs to be read.
if verReadSeq == math.MaxUint64 {
verReadSeq = 0
}
var (
id int
ver *proto.VolVersionInfo
verLen = len(verList.VerList)
)
for id, ver = range verList.VerList {
if id == verLen-1 {
err = fmt.Errorf("action[CheckReadVerSeq] readReadVer %v not found", readReadVer)
break
}
log.LogInfof("action[CheckReadVerSeq] ver %v,%v", ver.Ver, ver.Status)
if ver.Ver == verReadSeq {
if ver.Status != proto.VersionNormal {
err = fmt.Errorf("action[CheckReadVerSeq] status %v not right", ver.Status)
return
}
readReadVer = verList.VerList[id+1].Ver - 1
log.LogInfof("action[CheckReadVerSeq] get read ver %v", readReadVer)
return
}
}
err = fmt.Errorf("not found read ver %v", verReadSeq)
return
}
// WarningMsg returns the warning message that contains the cluster name.
func (w *Wrapper) WarningMsg() string {
return fmt.Sprintf("%s_client_warning", w.clusterName)
}
func (w *Wrapper) updateDataNodeStatus() (err error) {
var cv *proto.ClusterView
cv, err = w.mc.AdminAPI().GetCluster()
if err != nil {
log.LogErrorf("updateDataNodeStatus: get cluster fail: err(%v)", err)
return
}
newHostsStatus := make(map[string]bool)
for _, node := range cv.DataNodes {
newHostsStatus[node.Addr] = node.IsActive
}
log.LogInfof("updateDataNodeStatus: update %d hosts status", len(newHostsStatus))
w.HostsStatus = newHostsStatus
return
}
func (w *Wrapper) SetNearRead(nearRead bool) {
w.nearRead = nearRead
log.LogInfof("SetNearRead: set nearRead to %v", w.nearRead)
}
func (w *Wrapper) NearRead() bool {
return w.nearRead
}
// Sort hosts by distance from the local node
func (w *Wrapper) sortHostsByDistance(srcHosts []string) []string {
hosts := make([]string, len(srcHosts))
copy(hosts, srcHosts)
for i := 0; i < len(hosts); i++ {
for j := i + 1; j < len(hosts); j++ {
if distanceFromLocal(hosts[i]) > distanceFromLocal(hosts[j]) {
hosts[i], hosts[j] = hosts[j], hosts[i]
}
}
}
return hosts
}
func distanceFromLocal(b string) int {
remote := strings.Split(b, ":")[0]
return iputil.GetDistance(net.ParseIP(LocalIP), net.ParseIP(remote))
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"fmt"
"strconv"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
type AdminAPI struct {
mc *MasterClient
h map[string]string // extra headers
}
func (api *AdminAPI) WithHeader(key, val string) *AdminAPI {
return &AdminAPI{mc: api.mc, h: mergeHeader(api.h, key, val)}
}
func (api *AdminAPI) EncodingWith(encoding string) *AdminAPI {
return api.WithHeader(headerAcceptEncoding, encoding)
}
func (api *AdminAPI) EncodingGzip() *AdminAPI {
return api.EncodingWith(encodingGzip)
}
func (api *AdminAPI) GetCluster() (cv *proto.ClusterView, err error) {
cv = &proto.ClusterView{}
err = api.mc.requestWith(cv, newRequest(get, proto.AdminGetCluster).Header(api.h))
return
}
func (api *AdminAPI) GetClusterNodeInfo() (cn *proto.ClusterNodeInfo, err error) {
cn = &proto.ClusterNodeInfo{}
err = api.mc.requestWith(cn, newRequest(get, proto.AdminGetNodeInfo).Header(api.h))
return
}
func (api *AdminAPI) GetClusterIP() (cp *proto.ClusterIP, err error) {
cp = &proto.ClusterIP{}
err = api.mc.requestWith(cp, newRequest(get, proto.AdminGetIP).Header(api.h))
return
}
func (api *AdminAPI) GetClusterStat() (cs *proto.ClusterStatInfo, err error) {
cs = &proto.ClusterStatInfo{}
err = api.mc.requestWith(cs, newRequest(get, proto.AdminClusterStat).Header(api.h).NoTimeout())
return
}
func (api *AdminAPI) ListZones() (zoneViews []*proto.ZoneView, err error) {
zoneViews = make([]*proto.ZoneView, 0)
err = api.mc.requestWith(&zoneViews, newRequest(get, proto.GetAllZones).Header(api.h))
return
}
func (api *AdminAPI) ListNodeSets(zoneName string) (nodeSetStats []*proto.NodeSetStat, err error) {
params := make([]anyParam, 0)
if zoneName != "" {
params = append(params, anyParam{"zoneName", zoneName})
}
nodeSetStats = make([]*proto.NodeSetStat, 0)
err = api.mc.requestWith(&nodeSetStats, newRequest(get, proto.GetAllNodeSets).Header(api.h).Param(params...))
return
}
func (api *AdminAPI) GetNodeSet(nodeSetId string) (nodeSetStatInfo *proto.NodeSetStatInfo, err error) {
nodeSetStatInfo = &proto.NodeSetStatInfo{}
err = api.mc.requestWith(nodeSetStatInfo, newRequest(get, proto.GetNodeSet).
Header(api.h).addParam("nodesetId", nodeSetId))
return
}
func (api *AdminAPI) UpdateNodeSet(nodeSetId string, dataNodeSelector string, metaNodeSelector string) (err error) {
return api.mc.request(newRequest(get, proto.UpdateNodeSet).Header(api.h).Param(
anyParam{"nodesetId", nodeSetId},
anyParam{"dataNodeSelector", dataNodeSelector},
anyParam{"metaNodeSelector", metaNodeSelector},
))
}
func (api *AdminAPI) UpdateZone(name string, enable bool, dataNodesetSelector string, metaNodesetSelector string, dataNodeSelector string, metaNodeSelector string) (err error) {
return api.mc.request(newRequest(post, proto.UpdateZone).Header(api.h).Param(
anyParam{"name", name},
anyParam{"enable", enable},
anyParam{"dataNodesetSelector", dataNodesetSelector},
anyParam{"metaNodesetSelector", metaNodesetSelector},
anyParam{"dataNodeSelector", dataNodeSelector},
anyParam{"metaNodeSelector", metaNodeSelector},
))
}
func (api *AdminAPI) Topo() (topo *proto.TopologyView, err error) {
topo = &proto.TopologyView{}
err = api.mc.requestWith(topo, newRequest(get, proto.GetTopologyView).Header(api.h))
return
}
func (api *AdminAPI) GetDataPartition(volName string, partitionID uint64) (partition *proto.DataPartitionInfo, err error) {
partition = &proto.DataPartitionInfo{}
err = api.mc.requestWith(partition, newRequest(get, proto.AdminGetDataPartition).
Header(api.h).Param(anyParam{"id", partitionID}, anyParam{"name", volName}))
return
}
func (api *AdminAPI) GetDataPartitionById(partitionID uint64) (partition *proto.DataPartitionInfo, err error) {
partition = &proto.DataPartitionInfo{}
err = api.mc.requestWith(partition, newRequest(get, proto.AdminGetDataPartition).
Header(api.h).addParamAny("id", partitionID))
return
}
func (api *AdminAPI) DiagnoseDataPartition(ignoreDiscardDp bool) (diagnosis *proto.DataPartitionDiagnosis, err error) {
diagnosis = &proto.DataPartitionDiagnosis{}
err = api.mc.requestWith(diagnosis, newRequest(get, proto.AdminDiagnoseDataPartition).
Header(api.h).addParamAny("ignoreDiscard", ignoreDiscardDp))
return
}
func (api *AdminAPI) DiagnoseMetaPartition() (diagnosis *proto.MetaPartitionDiagnosis, err error) {
diagnosis = &proto.MetaPartitionDiagnosis{}
err = api.mc.requestWith(diagnosis, newRequest(get, proto.AdminDiagnoseMetaPartition).Header(api.h))
return
}
func (api *AdminAPI) LoadDataPartition(volName string, partitionID uint64, clientIDKey string) (err error) {
return api.mc.request(newRequest(get, proto.AdminLoadDataPartition).Header(api.h).Param(
anyParam{"id", partitionID},
anyParam{"name", volName},
anyParam{"clientIDKey", clientIDKey},
))
}
func (api *AdminAPI) CreateDataPartition(volName string, count int, clientIDKey string) (err error) {
return api.mc.request(newRequest(get, proto.AdminCreateDataPartition).Header(api.h).Param(
anyParam{"name", volName},
anyParam{"count", count},
anyParam{"clientIDKey", clientIDKey},
))
}
func (api *AdminAPI) DecommissionDataPartition(dataPartitionID uint64, nodeAddr string, raftForce bool, clientIDKey string) (err error) {
request := newRequest(get, proto.AdminDecommissionDataPartition).Header(api.h)
request.addParam("id", strconv.FormatUint(dataPartitionID, 10))
request.addParam("addr", nodeAddr)
request.addParam("raftForceDel", strconv.FormatBool(raftForce))
request.addParam("clientIDKey", clientIDKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) DecommissionMetaPartition(metaPartitionID uint64, nodeAddr, clientIDKey string) (err error) {
request := newRequest(get, proto.AdminDecommissionMetaPartition).Header(api.h)
request.addParam("id", strconv.FormatUint(metaPartitionID, 10))
request.addParam("addr", nodeAddr)
request.addParam("clientIDKey", clientIDKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) DeleteDataReplica(dataPartitionID uint64, nodeAddr, clientIDKey string) (err error) {
request := newRequest(get, proto.AdminDeleteDataReplica).Header(api.h)
request.addParam("id", strconv.FormatUint(dataPartitionID, 10))
request.addParam("addr", nodeAddr)
request.addParam("clientIDKey", clientIDKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) AddDataReplica(dataPartitionID uint64, nodeAddr, clientIDKey string) (err error) {
request := newRequest(get, proto.AdminAddDataReplica).Header(api.h)
request.addParam("id", strconv.FormatUint(dataPartitionID, 10))
request.addParam("addr", nodeAddr)
request.addParam("clientIDKey", clientIDKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) DeleteMetaReplica(metaPartitionID uint64, nodeAddr string, clientIDKey string) (err error) {
request := newRequest(get, proto.AdminDeleteMetaReplica).Header(api.h)
request.addParam("id", strconv.FormatUint(metaPartitionID, 10))
request.addParam("addr", nodeAddr)
request.addParam("clientIDKey", clientIDKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) AddMetaReplica(metaPartitionID uint64, nodeAddr string, clientIDKey string) (err error) {
request := newRequest(get, proto.AdminAddMetaReplica).Header(api.h)
request.addParam("id", strconv.FormatUint(metaPartitionID, 10))
request.addParam("addr", nodeAddr)
request.addParam("clientIDKey", clientIDKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) DeleteVolume(volName, authKey string) (err error) {
request := newRequest(get, proto.AdminDeleteVol).Header(api.h)
request.addParam("name", volName)
request.addParam("authKey", authKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) DeleteVolumeWithAuthNode(volName, authKey, clientIDKey string) (err error) {
request := newRequest(get, proto.AdminDeleteVol).Header(api.h)
request.addParam("name", volName)
request.addParam("authKey", authKey)
request.addParam("clientIDKey", clientIDKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) UpdateVolume(
vv *proto.SimpleVolView,
txTimeout int64,
txMask string,
txForceReset bool,
txConflictRetryNum int64,
txConflictRetryInterval int64,
txOpLimit int,
clientIDKey string,
) (err error) {
request := newRequest(get, proto.AdminUpdateVol).Header(api.h)
request.addParam("name", vv.Name)
request.addParam("description", vv.Description)
request.addParam("authKey", util.CalcAuthKey(vv.Owner))
request.addParam("zoneName", vv.ZoneName)
request.addParam("capacity", strconv.FormatUint(vv.Capacity, 10))
request.addParam("followerRead", strconv.FormatBool(vv.FollowerRead))
request.addParam("ebsBlkSize", strconv.Itoa(vv.ObjBlockSize))
request.addParam("cacheCap", strconv.FormatUint(vv.CacheCapacity, 10))
request.addParam("cacheAction", strconv.Itoa(vv.CacheAction))
request.addParam("cacheThreshold", strconv.Itoa(vv.CacheThreshold))
request.addParam("cacheTTL", strconv.Itoa(vv.CacheTtl))
request.addParam("cacheHighWater", strconv.Itoa(vv.CacheHighWater))
request.addParam("cacheLowWater", strconv.Itoa(vv.CacheLowWater))
request.addParam("cacheLRUInterval", strconv.Itoa(vv.CacheLruInterval))
request.addParam("cacheRuleKey", vv.CacheRule)
request.addParam("dpReadOnlyWhenVolFull", strconv.FormatBool(vv.DpReadOnlyWhenVolFull))
request.addParam("replicaNum", strconv.FormatUint(uint64(vv.DpReplicaNum), 10))
request.addParam("enableQuota", strconv.FormatBool(vv.EnableQuota))
request.addParam("deleteLockTime", strconv.FormatInt(vv.DeleteLockTime, 10))
request.addParam("clientIDKey", clientIDKey)
if txMask != "" {
request.addParam("enableTxMask", txMask)
request.addParam("txForceReset", strconv.FormatBool(txForceReset))
}
if txTimeout > 0 {
request.addParam("txTimeout", strconv.FormatInt(txTimeout, 10))
}
if txConflictRetryNum > 0 {
request.addParam("txConflictRetryNum", strconv.FormatInt(txConflictRetryNum, 10))
}
if txOpLimit > 0 {
request.addParam("txOpLimit", strconv.Itoa(txOpLimit))
}
if txConflictRetryInterval > 0 {
request.addParam("txConflictRetryInterval", strconv.FormatInt(txConflictRetryInterval, 10))
}
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) PutDataPartitions(volName string, dpsView []byte) (err error) {
return api.mc.request(newRequest(post, proto.AdminPutDataPartitions).
Header(api.h).addParam("name", volName).Body(dpsView))
}
func (api *AdminAPI) VolShrink(volName string, capacity uint64, authKey, clientIDKey string) (err error) {
request := newRequest(get, proto.AdminVolShrink).Header(api.h)
request.addParam("name", volName)
request.addParam("authKey", authKey)
request.addParam("capacity", strconv.FormatUint(capacity, 10))
request.addParam("clientIDKey", clientIDKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) VolExpand(volName string, capacity uint64, authKey, clientIDKey string) (err error) {
request := newRequest(get, proto.AdminVolExpand).Header(api.h)
request.addParam("name", volName)
request.addParam("authKey", authKey)
request.addParam("capacity", strconv.FormatUint(capacity, 10))
request.addParam("clientIDKey", clientIDKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) CreateVolName(volName, owner string, capacity uint64, deleteLockTime int64, crossZone, normalZonesFirst bool, business string,
mpCount, dpCount, replicaNum, dpSize, volType int, followerRead bool, zoneName, cacheRuleKey string, ebsBlkSize,
cacheCapacity, cacheAction, cacheThreshold, cacheTTL, cacheHighWater, cacheLowWater, cacheLRUInterval int,
dpReadOnlyWhenVolFull bool, txMask string, txTimeout uint32, txConflictRetryNum int64, txConflictRetryInterval int64, optEnableQuota string,
clientIDKey string,
) (err error) {
request := newRequest(get, proto.AdminCreateVol).Header(api.h)
request.addParam("name", volName)
request.addParam("owner", owner)
request.addParam("capacity", strconv.FormatUint(capacity, 10))
request.addParam("deleteLockTime", strconv.FormatInt(deleteLockTime, 10))
request.addParam("crossZone", strconv.FormatBool(crossZone))
request.addParam("normalZonesFirst", strconv.FormatBool(normalZonesFirst))
request.addParam("description", business)
request.addParam("mpCount", strconv.Itoa(mpCount))
request.addParam("dpCount", strconv.Itoa(dpCount))
request.addParam("replicaNum", strconv.Itoa(replicaNum))
request.addParam("dpSize", strconv.Itoa(dpSize))
request.addParam("volType", strconv.Itoa(volType))
request.addParam("followerRead", strconv.FormatBool(followerRead))
request.addParam("zoneName", zoneName)
request.addParam("cacheRuleKey", cacheRuleKey)
request.addParam("ebsBlkSize", strconv.Itoa(ebsBlkSize))
request.addParam("cacheCap", strconv.Itoa(cacheCapacity))
request.addParam("cacheAction", strconv.Itoa(cacheAction))
request.addParam("cacheThreshold", strconv.Itoa(cacheThreshold))
request.addParam("cacheTTL", strconv.Itoa(cacheTTL))
request.addParam("cacheHighWater", strconv.Itoa(cacheHighWater))
request.addParam("cacheLowWater", strconv.Itoa(cacheLowWater))
request.addParam("cacheLRUInterval", strconv.Itoa(cacheLRUInterval))
request.addParam("dpReadOnlyWhenVolFull", strconv.FormatBool(dpReadOnlyWhenVolFull))
request.addParam("enableQuota", optEnableQuota)
request.addParam("clientIDKey", clientIDKey)
if txMask != "" {
request.addParam("enableTxMask", txMask)
}
if txTimeout > 0 {
request.addParam("txTimeout", strconv.FormatUint(uint64(txTimeout), 10))
}
if txConflictRetryNum > 0 {
request.addParam("txConflictRetryNum", strconv.FormatInt(txConflictRetryNum, 10))
}
if txConflictRetryInterval > 0 {
request.addParam("txConflictRetryInterval", strconv.FormatInt(txConflictRetryInterval, 10))
}
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) CreateDefaultVolume(volName, owner string) (err error) {
request := newRequest(get, proto.AdminCreateVol).Header(api.h)
request.addParam("name", volName)
request.addParam("owner", owner)
request.addParam("capacity", "10")
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) GetVolumeSimpleInfo(volName string) (vv *proto.SimpleVolView, err error) {
vv = &proto.SimpleVolView{}
err = api.mc.requestWith(vv, newRequest(get, proto.AdminGetVol).Header(api.h).addParam("name", volName))
return
}
func (api *AdminAPI) SetVolumeForbidden(volName string, forbidden bool) (err error) {
request := newRequest(post, proto.AdminVolForbidden).Header(api.h)
request.addParam("name", volName)
request.addParam("forbidden", strconv.FormatBool(forbidden))
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) SetVolumeAuditLog(volName string, enable bool) (err error) {
request := newRequest(post, proto.AdminVolEnableAuditLog).Header(api.h)
request.addParam("name", volName)
request.addParam("enable", strconv.FormatBool(enable))
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) GetMonitorPushAddr() (addr string, err error) {
err = api.mc.requestWith(&addr, newRequest(get, proto.AdminGetMonitorPushAddr).Header(api.h))
return
}
func (api *AdminAPI) UploadFlowInfo(volName string, flowInfo *proto.ClientReportLimitInfo) (vv *proto.LimitRsp2Client, err error) {
if flowInfo == nil {
return nil, fmt.Errorf("flowinfo is nil")
}
vv = &proto.LimitRsp2Client{}
err = api.mc.requestWith(vv, newRequest(get, proto.QosUpload).Header(api.h).Body(flowInfo).
Param(anyParam{"name", volName}, anyParam{"qosEnable", "true"}))
log.LogInfof("action[UploadFlowInfo] enable %v", vv.Enable)
return
}
func (api *AdminAPI) GetVolumeSimpleInfoWithFlowInfo(volName string) (vv *proto.SimpleVolView, err error) {
vv = &proto.SimpleVolView{}
err = api.mc.requestWith(vv, newRequest(get, proto.AdminGetVol).
Header(api.h).Param(anyParam{"name", volName}, anyParam{"init", "true"}))
return
}
// access control list
func (api *AdminAPI) CheckACL() (ci *proto.ClusterInfo, err error) {
ci = &proto.ClusterInfo{}
err = api.mc.requestWith(ci, newRequest(get, proto.AdminACL).Header(api.h))
return
}
func (api *AdminAPI) GetClusterInfo() (ci *proto.ClusterInfo, err error) {
ci = &proto.ClusterInfo{}
err = api.mc.requestWith(ci, newRequest(get, proto.AdminGetIP).Header(api.h))
return
}
func (api *AdminAPI) GetVerInfo(volName string) (ci *proto.VolumeVerInfo, err error) {
ci = &proto.VolumeVerInfo{}
err = api.mc.requestWith(ci, newRequest(get, proto.AdminGetVolVer).
Header(api.h).addParam("name", volName))
return
}
func (api *AdminAPI) CreateMetaPartition(volName string, count int, clientIDKey string) (err error) {
request := newRequest(get, proto.AdminCreateMetaPartition).Header(api.h)
request.addParam("name", volName)
request.addParam("count", strconv.Itoa(count))
request.addParam("clientIDKey", clientIDKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) ListVols(keywords string) (volsInfo []*proto.VolInfo, err error) {
volsInfo = make([]*proto.VolInfo, 0)
err = api.mc.requestWith(&volsInfo, newRequest(get, proto.AdminListVols).
Header(api.h).addParam("keywords", keywords))
return
}
func (api *AdminAPI) IsFreezeCluster(isFreeze bool, clientIDKey string) (err error) {
request := newRequest(get, proto.AdminClusterFreeze).Header(api.h)
request.addParam("enable", strconv.FormatBool(isFreeze))
request.addParam("clientIDKey", clientIDKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) SetForbidMpDecommission(disable bool) (err error) {
request := newRequest(get, proto.AdminClusterForbidMpDecommission).Header(api.h)
request.addParam("enable", strconv.FormatBool(disable))
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) SetMetaNodeThreshold(threshold float64, clientIDKey string) (err error) {
request := newRequest(get, proto.AdminSetMetaNodeThreshold).Header(api.h)
request.addParam("threshold", strconv.FormatFloat(threshold, 'f', 6, 64))
request.addParam("clientIDKey", clientIDKey)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) SetClusterParas(batchCount, markDeleteRate, deleteWorkerSleepMs, autoRepairRate, loadFactor, maxDpCntLimit, clientIDKey string,
dataNodesetSelector, metaNodesetSelector, dataNodeSelector, metaNodeSelector string,
) (err error) {
request := newRequest(get, proto.AdminSetNodeInfo).Header(api.h)
request.addParam("batchCount", batchCount)
request.addParam("markDeleteRate", markDeleteRate)
request.addParam("deleteWorkerSleepMs", deleteWorkerSleepMs)
request.addParam("autoRepairRate", autoRepairRate)
request.addParam("loadFactor", loadFactor)
request.addParam("maxDpCntLimit", maxDpCntLimit)
request.addParam("clientIDKey", clientIDKey)
request.addParam("dataNodesetSelector", dataNodesetSelector)
request.addParam("metaNodesetSelector", metaNodesetSelector)
request.addParam("dataNodeSelector", dataNodeSelector)
request.addParam("metaNodeSelector", metaNodeSelector)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) GetClusterParas() (delParas map[string]string, err error) {
request := newRequest(get, proto.AdminGetNodeInfo).Header(api.h)
if _, err = api.mc.serveRequest(request); err != nil {
return
}
delParas = make(map[string]string)
err = api.mc.requestWith(&delParas, newRequest(get, proto.AdminGetNodeInfo).Header(api.h))
return
}
func (api *AdminAPI) CreatePreLoadDataPartition(volName string, count int, capacity, ttl uint64, zones string) (view *proto.DataPartitionsView, err error) {
view = &proto.DataPartitionsView{}
err = api.mc.requestWith(view, newRequest(get, proto.AdminCreatePreLoadDataPartition).Header(api.h).Param(
anyParam{"name", volName},
anyParam{"replicaNum", count},
anyParam{"capacity", capacity},
anyParam{"cacheTTL", ttl},
anyParam{"zoneName", zongs},
))
return
}
func (api *AdminAPI) ListQuota(volName string) (quotaInfo []*proto.QuotaInfo, err error) {
resp := &proto.ListMasterQuotaResponse{}
if err = api.mc.requestWith(resp, newRequest(get, proto.QuotaList).
Header(api.h).addParam("name", volName)); err != nil {
log.LogErrorf("action[ListQuota] fail. %v", err)
return
}
quotaInfo = resp.Quotas
log.LogInfof("action[ListQuota] success.")
return quotaInfo, err
}
func (api *AdminAPI) CreateQuota(volName string, quotaPathInfos []proto.QuotaPathInfo, maxFiles uint64, maxBytes uint64) (quotaId uint32, err error) {
if err = api.mc.requestWith("aId, newRequest(get, proto.QuotaCreate).
Header(api.h).Body("aPathInfos).Param(
anyParam{"name", volName},
anyParam{"maxFiles", maxFiles},
anyParam{"maxBytes", maxBytes})); err != nil {
log.LogErrorf("action[CreateQuota] fail. %v", err)
return
}
log.LogInfof("action[CreateQuota] success.")
return
}
func (api *AdminAPI) UpdateQuota(volName string, quotaId string, maxFiles uint64, maxBytes uint64) (err error) {
request := newRequest(get, proto.QuotaUpdate).Header(api.h)
request.addParam("name", volName)
request.addParam("quotaId", quotaId)
request.addParam("maxFiles", strconv.FormatUint(maxFiles, 10))
request.addParam("maxBytes", strconv.FormatUint(maxBytes, 10))
if _, err = api.mc.serveRequest(request); err != nil {
log.LogErrorf("action[UpdateQuota] fail. %v", err)
return
}
log.LogInfof("action[UpdateQuota] success.")
return nil
}
func (api *AdminAPI) DeleteQuota(volName string, quotaId string) (err error) {
request := newRequest(get, proto.QuotaDelete).Header(api.h)
request.addParam("name", volName)
request.addParam("quotaId", quotaId)
if _, err = api.mc.serveRequest(request); err != nil {
log.LogErrorf("action[DeleteQuota] fail. %v", err)
return
}
log.LogInfo("action[DeleteQuota] success.")
return nil
}
func (api *AdminAPI) GetQuota(volName string, quotaId string) (quotaInfo *proto.QuotaInfo, err error) {
info := &proto.QuotaInfo{}
if err = api.mc.requestWith(info, newRequest(get, proto.QuotaGet).Header(api.h).
Param(anyParam{"name", volName}, anyParam{"quotaId", quotaId})); err != nil {
log.LogErrorf("action[GetQuota] fail. %v", err)
return
}
quotaInfo = info
log.LogInfof("action[GetQuota] %v success.", *quotaInfo)
return quotaInfo, err
}
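// exampleQuotaWorkflow is an illustrative sketch (not part of this SDK) of the
// quota life cycle exposed above: create a quota for a set of paths, tighten
// its limits, then delete it. The helper name and the limit values are
// placeholders chosen for the example.
func exampleQuotaWorkflow(api *AdminAPI, volName string, paths []proto.QuotaPathInfo) error {
	quotaId, err := api.CreateQuota(volName, paths, 100000, 1<<30)
	if err != nil {
		return err
	}
	// UpdateQuota and DeleteQuota take the quota ID as a decimal string.
	idStr := strconv.FormatUint(uint64(quotaId), 10)
	if err = api.UpdateQuota(volName, idStr, 50000, 1<<29); err != nil {
		return err
	}
	return api.DeleteQuota(volName, idStr)
}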
func (api *AdminAPI) QueryBadDisks() (badDisks *proto.BadDiskInfos, err error) {
badDisks = &proto.BadDiskInfos{}
err = api.mc.requestWith(badDisks, newRequest(get, proto.QueryBadDisks).Header(api.h))
return
}
func (api *AdminAPI) DecommissionDisk(addr string, disk string) (err error) {
return api.mc.request(newRequest(post, proto.DecommissionDisk).Header(api.h).
addParam("addr", addr).addParam("disk", disk))
}
func (api *AdminAPI) RecommissionDisk(addr string, disk string) (err error) {
return api.mc.request(newRequest(post, proto.RecommissionDisk).Header(api.h).
addParam("addr", addr).addParam("disk", disk))
}
func (api *AdminAPI) QueryDecommissionDiskProgress(addr string, disk string) (progress *proto.DecommissionProgress, err error) {
progress = &proto.DecommissionProgress{}
err = api.mc.requestWith(progress, newRequest(post, proto.QueryDiskDecoProgress).
Header(api.h).Param(anyParam{"addr", addr}, anyParam{"disk", disk}))
return
}
func (api *AdminAPI) ListQuotaAll() (volsInfo []*proto.VolInfo, err error) {
volsInfo = make([]*proto.VolInfo, 0)
err = api.mc.requestWith(&volsInfo, newRequest(get, proto.QuotaListAll).Header(api.h))
return
}
func (api *AdminAPI) GetDiscardDataPartition() (discardDpInfos *proto.DiscardDataPartitionInfos, err error) {
discardDpInfos = &proto.DiscardDataPartitionInfos{}
err = api.mc.requestWith(&discardDpInfos, newRequest(get, proto.AdminGetDiscardDp).Header(api.h))
return
}
func (api *AdminAPI) SetDataPartitionDiscard(partitionId uint64, discard bool) (err error) {
request := newRequest(post, proto.AdminSetDpDiscard).
Header(api.h).
addParam("id", strconv.FormatUint(partitionId, 10)).
addParam("dpDiscard", strconv.FormatBool(discard))
if err = api.mc.request(request); err != nil {
return
}
return
}
func (api *AdminAPI) DeleteVersion(volName string, verSeq string) (err error) {
request := newRequest(get, proto.AdminDelVersion).Header(api.h)
request.addParam("name", volName)
request.addParam("verSeq", verSeq)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) SetStrategy(volName string, periodic string, count string, enable string, force string) (err error) {
request := newRequest(get, proto.AdminSetVerStrategy).Header(api.h)
request.addParam("name", volName)
request.addParam("periodic", periodic)
request.addParam("count", count)
request.addParam("enable", enable)
request.addParam("force", force)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) CreateVersion(volName string) (ver *proto.VolVersionInfo, err error) {
ver = &proto.VolVersionInfo{}
err = api.mc.requestWith(ver, newRequest(get, proto.AdminCreateVersion).
Header(api.h).addParam("name", volName))
return
}
func (api *AdminAPI) GetLatestVer(volName string) (ver *proto.VolVersionInfo, err error) {
ver = &proto.VolVersionInfo{}
err = api.mc.requestWith(ver, newRequest(get, proto.AdminGetVersionInfo).
Header(api.h).addParam("name", volName))
return
}
func (api *AdminAPI) GetVerList(volName string) (verList *proto.VolVersionInfoList, err error) {
verList = &proto.VolVersionInfoList{}
err = api.mc.requestWith(verList, newRequest(get, proto.AdminGetAllVersionInfo).
Header(api.h).addParam("name", volName))
log.LogDebugf("GetVerList. vol %v verList %v", volName, verList)
for _, info := range verList.VerList {
log.LogDebugf("GetVerList. vol %v verList %v", volName, info)
}
return
}
func (api *AdminAPI) SetBucketLifecycle(req *proto.LcConfiguration) (err error) {
return api.mc.request(newRequest(post, proto.SetBucketLifecycle).Header(api.h).Body(req))
}
func (api *AdminAPI) GetBucketLifecycle(volume string) (lcConf *proto.LcConfiguration, err error) {
lcConf = &proto.LcConfiguration{}
err = api.mc.requestWith(lcConf, newRequest(get, proto.GetBucketLifecycle).
Header(api.h).addParam("name", volume))
return
}
func (api *AdminAPI) DelBucketLifecycle(volume string) (err error) {
request := newRequest(get, proto.DeleteBucketLifecycle).Header(api.h)
request.addParam("name", volume)
_, err = api.mc.serveRequest(request)
return
}
func (api *AdminAPI) GetS3QoSInfo() (data []byte, err error) {
return api.mc.serveRequest(newRequest(get, proto.S3QoSGet).Header(api.h))
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"encoding/json"
"fmt"
"math/rand"
"github.com/cubefs/cubefs/proto"
)
type Decoder func([]byte) ([]byte, error)
func (d Decoder) Decode(raw []byte) ([]byte, error) {
return d(raw)
}
type ClientAPI struct {
mc *MasterClient
h map[string]string // extra headers
}
func (api *ClientAPI) WithHeader(key, val string) *ClientAPI {
return &ClientAPI{mc: api.mc, h: mergeHeader(api.h, key, val)}
}
func (api *ClientAPI) EncodingWith(encoding string) *ClientAPI {
return api.WithHeader(headerAcceptEncoding, encoding)
}
func (api *ClientAPI) EncodingGzip() *ClientAPI {
return api.EncodingWith(encodingGzip)
}
func (api *ClientAPI) GetVolume(volName string, authKey string) (vv *proto.VolView, err error) {
vv = &proto.VolView{}
err = api.mc.requestWith(vv, newRequest(post, proto.ClientVol).
Header(api.h).Param(anyParam{"name", volName}, anyParam{"authKey", authKey}))
return
}
func (api *ClientAPI) GetVolumeWithoutAuthKey(volName string) (vv *proto.VolView, err error) {
vv = &proto.VolView{}
err = api.mc.requestWith(vv, newRequest(post, proto.ClientVol).
Header(api.h, proto.SkipOwnerValidation, "true").addParam("name", volName))
return
}
func (api *ClientAPI) GetVolumeWithAuthnode(volName string, authKey string, token string, decoder Decoder) (vv *proto.VolView, err error) {
var body []byte
request := newRequest(post, proto.ClientVol).Header(api.h)
request.addParam("name", volName)
request.addParam("authKey", authKey)
request.addParam(proto.ClientMessage, token)
if body, err = api.mc.serveRequest(request); err != nil {
return
}
if decoder != nil {
if body, err = decoder.Decode(body); err != nil {
return
}
}
vv = &proto.VolView{}
if err = json.Unmarshal(body, vv); err != nil {
return
}
return
}
func (api *ClientAPI) GetVolumeStat(volName string) (info *proto.VolStatInfo, err error) {
info = &proto.VolStatInfo{}
err = api.mc.requestWith(info, newRequest(get, proto.ClientVolStat).
Header(api.h).Param(anyParam{"name", volName}, anyParam{"version", proto.LFClient}))
return
}
func (api *ClientAPI) GetMetaPartition(partitionID uint64) (partition *proto.MetaPartitionInfo, err error) {
partition = &proto.MetaPartitionInfo{}
err = api.mc.requestWith(partition, newRequest(get, proto.ClientMetaPartition).
Header(api.h).addParamAny("id", partitionID))
return
}
func (api *ClientAPI) GetMetaPartitions(volName string) (views []*proto.MetaPartitionView, err error) {
views = make([]*proto.MetaPartitionView, 0)
err = api.mc.requestWith(&views, newRequest(get, proto.ClientMetaPartitions).
Header(api.h).addParam("name", volName))
return
}
func (api *ClientAPI) GetDataPartitions(volName string) (view *proto.DataPartitionsView, err error) {
request := newRequest(get, proto.ClientDataPartitions).Header(api.h).addParam("name", volName)
lastLeader := api.mc.leaderAddr
defer api.mc.SetLeader(lastLeader)
randIndex := rand.Intn(len(api.mc.masters))
if randIndex >= len(api.mc.masters) {
err = fmt.Errorf("master len %v less or equal request index %v", len(api.mc.masters), randIndex)
return
}
api.mc.SetLeader(api.mc.masters[randIndex])
var data []byte
if data, err = api.mc.serveRequest(request); err != nil {
return
}
view = &proto.DataPartitionsView{}
if err = json.Unmarshal(data, view); err != nil {
return
}
return
}
func (api *ClientAPI) GetPreLoadDataPartitions(volName string) (view *proto.DataPartitionsView, err error) {
view = &proto.DataPartitionsView{}
err = api.mc.requestWith(view, newRequest(get, proto.ClientDataPartitions).
Header(api.h).addParam("name", volName))
return
}
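// exampleFetchVolume is an illustrative sketch (not part of this SDK) of how a
// client typically combines the calls above: fetch the volume view with the
// owner-derived authKey, then load the data partition view used for I/O. The
// helper name is hypothetical.
func exampleFetchVolume(api *ClientAPI, volName, authKey string) (*proto.VolView, *proto.DataPartitionsView, error) {
	vv, err := api.GetVolume(volName, authKey)
	if err != nil {
		return nil, nil, err
	}
	dpView, err := api.GetDataPartitions(volName)
	if err != nil {
		return nil, nil, err
	}
	return vv, dpView, nil
}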
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"strconv"
"github.com/cubefs/cubefs/proto"
)
type NodeAPI struct {
mc *MasterClient
h map[string]string // extra headers
}
func (api *NodeAPI) WithHeader(key, val string) *NodeAPI {
return &NodeAPI{mc: api.mc, h: mergeHeader(api.h, key, val)}
}
func (api *NodeAPI) EncodingWith(encoding string) *NodeAPI {
return api.WithHeader(headerAcceptEncoding, encoding)
}
func (api *NodeAPI) EncodingGzip() *NodeAPI {
return api.EncodingWith(encodingGzip)
}
func (api *NodeAPI) AddDataNode(serverAddr, zoneName string) (id uint64, err error) {
request := newRequest(get, proto.AddDataNode).Header(api.h)
request.addParam("addr", serverAddr)
request.addParam("zoneName", zoneName)
var data []byte
if data, err = api.mc.serveRequest(request); err != nil {
return
}
id, err = strconv.ParseUint(string(data), 10, 64)
return
}
func (api *NodeAPI) AddDataNodeWithAuthNode(serverAddr, zoneName, clientIDKey string) (id uint64, err error) {
request := newRequest(get, proto.AddDataNode).Header(api.h)
request.addParam("addr", serverAddr)
request.addParam("zoneName", zoneName)
request.addParam("clientIDKey", clientIDKey)
var data []byte
if data, err = api.mc.serveRequest(request); err != nil {
return
}
id, err = strconv.ParseUint(string(data), 10, 64)
return
}
func (api *NodeAPI) AddMetaNode(serverAddr, zoneName string) (id uint64, err error) {
request := newRequest(get, proto.AddMetaNode).Header(api.h)
request.addParam("addr", serverAddr)
request.addParam("zoneName", zoneName)
var data []byte
if data, err = api.mc.serveRequest(request); err != nil {
return
}
id, err = strconv.ParseUint(string(data), 10, 64)
return
}
func (api *NodeAPI) AddMetaNodeWithAuthNode(serverAddr, zoneName, clientIDKey string) (id uint64, err error) {
request := newRequest(get, proto.AddMetaNode).Header(api.h)
request.addParam("addr", serverAddr)
request.addParam("zoneName", zoneName)
request.addParam("clientIDKey", clientIDKey)
var data []byte
if data, err = api.mc.serveRequest(request); err != nil {
return
}
id, err = strconv.ParseUint(string(data), 10, 64)
return
}
func (api *NodeAPI) GetDataNode(serverHost string) (node *proto.DataNodeInfo, err error) {
node = &proto.DataNodeInfo{}
err = api.mc.requestWith(node, newRequest(get, proto.GetDataNode).Header(api.h).addParam("addr", serverHost))
return
}
func (api *NodeAPI) GetMetaNode(serverHost string) (node *proto.MetaNodeInfo, err error) {
node = &proto.MetaNodeInfo{}
err = api.mc.requestWith(node, newRequest(get, proto.GetMetaNode).Header(api.h).addParam("addr", serverHost))
return
}
func (api *NodeAPI) ResponseMetaNodeTask(task *proto.AdminTask) (err error) {
return api.mc.request(newRequest(post, proto.GetMetaNodeTaskResponse).Header(api.h).Body(task))
}
func (api *NodeAPI) ResponseDataNodeTask(task *proto.AdminTask) (err error) {
return api.mc.request(newRequest(post, proto.GetDataNodeTaskResponse).Header(api.h).Body(task))
}
func (api *NodeAPI) DataNodeDecommission(nodeAddr string, count int, clientIDKey string) (err error) {
request := newRequest(get, proto.DecommissionDataNode).Header(api.h).NoTimeout()
request.addParam("addr", nodeAddr)
request.addParam("count", strconv.Itoa(count))
request.addParam("clientIDKey", clientIDKey)
if _, err = api.mc.serveRequest(request); err != nil {
return
}
return
}
func (api *NodeAPI) MetaNodeDecommission(nodeAddr string, count int, clientIDKey string) (err error) {
request := newRequest(get, proto.DecommissionMetaNode).Header(api.h).NoTimeout()
request.addParam("addr", nodeAddr)
request.addParam("count", strconv.Itoa(count))
request.addParam("clientIDKey", clientIDKey)
if _, err = api.mc.serveRequest(request); err != nil {
return
}
return
}
func (api *NodeAPI) MetaNodeMigrate(srcAddr, targetAddr string, count int, clientIDKey string) (err error) {
request := newRequest(get, proto.MigrateMetaNode).Header(api.h).NoTimeout()
request.addParam("srcAddr", srcAddr)
request.addParam("targetAddr", targetAddr)
request.addParam("count", strconv.Itoa(count))
request.addParam("clientIDKey", clientIDKey)
if _, err = api.mc.serveRequest(request); err != nil {
return
}
return
}
func (api *NodeAPI) DataNodeMigrate(srcAddr, targetAddr string, count int, clientIDKey string) (err error) {
request := newRequest(get, proto.MigrateDataNode).Header(api.h).NoTimeout()
request.addParam("srcAddr", srcAddr)
request.addParam("targetAddr", targetAddr)
request.addParam("count", strconv.Itoa(count))
request.addParam("clientIDKey", clientIDKey)
if _, err = api.mc.serveRequest(request); err != nil {
return
}
return
}
func (api *NodeAPI) AddLcNode(serverAddr string) (id uint64, err error) {
request := newRequest(get, proto.AddLcNode).Header(api.h).addParam("addr", serverAddr)
var data []byte
if data, err = api.mc.serveRequest(request); err != nil {
return
}
id, err = strconv.ParseUint(string(data), 10, 64)
return
}
func (api *NodeAPI) ResponseLcNodeTask(task *proto.AdminTask) (err error) {
return api.mc.request(newRequest(post, proto.GetLcNodeTaskResponse).Header(api.h).Body(task))
}
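// exampleNodeLifecycle is an illustrative sketch (not part of this SDK) of the
// node management flow exposed above: register a data node in a zone, keep the
// master-assigned ID, and later decommission the node. The address, zone name,
// and empty clientIDKey are placeholders.
func exampleNodeLifecycle(api *NodeAPI) error {
	id, err := api.AddDataNode("10.0.0.5:17310", "default")
	if err != nil {
		return err
	}
	_ = id // master-assigned node ID
	return api.DataNodeDecommission("10.0.0.5:17310", 0, "")
}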
package master
import (
"fmt"
"os"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/ump"
)
type UserAPI struct {
mc *MasterClient
h map[string]string // extra headers
}
func (api *UserAPI) WithHeader(key, val string) *UserAPI {
return &UserAPI{mc: api.mc, h: mergeHeader(api.h, key, val)}
}
func (api *UserAPI) EncodingWith(encoding string) *UserAPI {
return api.WithHeader(headerAcceptEncoding, encoding)
}
func (api *UserAPI) EncodingGzip() *UserAPI {
return api.EncodingWith(encodingGzip)
}
func (api *UserAPI) CreateUser(param *proto.UserCreateParam, clientIDKey string) (userInfo *proto.UserInfo, err error) {
userInfo = &proto.UserInfo{}
err = api.mc.requestWith(userInfo, newRequest(post, proto.UserCreate).
Header(api.h).Body(param).addParam("clientIDKey", clientIDKey))
return
}
func (api *UserAPI) DeleteUser(userID string, clientIDKey string) (err error) {
request := newRequest(post, proto.UserDelete).Header(api.h)
request.addParam("user", userID)
request.addParam("clientIDKey", clientIDKey)
if _, err = api.mc.serveRequest(request); err != nil {
return
}
return
}
func (api *UserAPI) UpdateUser(param *proto.UserUpdateParam, clientIDKey string) (userInfo *proto.UserInfo, err error) {
userInfo = &proto.UserInfo{}
err = api.mc.requestWith(userInfo, newRequest(post, proto.UserUpdate).
Header(api.h).Body(param).addParam("clientIDKey", clientIDKey))
return
}
func (api *UserAPI) GetAKInfo(accesskey string) (userInfo *proto.UserInfo, err error) {
localIP, _ := ump.GetLocalIpAddr()
userInfo = &proto.UserInfo{}
err = api.mc.requestWith(userInfo, newRequest(get, proto.UserGetAKInfo).
Header(api.h).Param(anyParam{"ak", accesskey}, anyParam{"ip", localIP}))
return
}
func (api *UserAPI) AclOperation(volName string, localIP string, op uint32) (aclInfo *proto.AclRsp, err error) {
aclInfo = &proto.AclRsp{}
if err = api.mc.requestWith(aclInfo, newRequest(get, proto.AdminACL).Header(api.h).Param(
anyParam{"name", volName},
anyParam{"ip", localIP},
anyParam{"op", op},
)); err != nil {
fmt.Fprintf(os.Stdout, "AclOperation err %v\n", err)
return
}
return
}
func (api *UserAPI) UidOperation(volName string, uid string, op uint32, val string) (uidInfo *proto.UidSpaceRsp, err error) {
uidInfo = &proto.UidSpaceRsp{}
if err = api.mc.requestWith(uidInfo, newRequest(get, proto.AdminUid).Header(api.h).Param(
anyParam{"name", volName},
anyParam{"uid", uid},
anyParam{"op", op},
anyParam{"capacity", val},
)); err != nil {
fmt.Fprintf(os.Stdout, "UidOperation err %v\n", err)
return
}
return
}
func (api *UserAPI) GetUserInfo(userID string) (userInfo *proto.UserInfo, err error) {
userInfo = &proto.UserInfo{}
err = api.mc.requestWith(userInfo, newRequest(get, proto.UserGetInfo).Header(api.h).addParam("user", userID))
return
}
func (api *UserAPI) UpdatePolicy(param *proto.UserPermUpdateParam, clientIDKey string) (userInfo *proto.UserInfo, err error) {
userInfo = &proto.UserInfo{}
err = api.mc.requestWith(userInfo, newRequest(post, proto.UserUpdatePolicy).
Header(api.h).Body(param).addParam("clientIDKey", clientIDKey))
return
}
func (api *UserAPI) RemovePolicy(param *proto.UserPermRemoveParam, clientIDKey string) (userInfo *proto.UserInfo, err error) {
userInfo = &proto.UserInfo{}
err = api.mc.requestWith(userInfo, newRequest(post, proto.UserRemovePolicy).
Header(api.h).Body(param).addParam("clientIDKey", clientIDKey))
return
}
func (api *UserAPI) DeleteVolPolicy(vol, clientIDKey string) (err error) {
return api.mc.request(newRequest(post, proto.UserDeleteVolPolicy).Header(api.h).
addParam("name", vol).addParam("clientIDKey", clientIDKey))
}
func (api *UserAPI) TransferVol(param *proto.UserTransferVolParam, clientIDKey string) (userInfo *proto.UserInfo, err error) {
userInfo = &proto.UserInfo{}
err = api.mc.requestWith(userInfo, newRequest(post, proto.UserTransferVol).
Header(api.h).Body(param).addParam("clientIDKey", clientIDKey))
return
}
func (api *UserAPI) ListUsers(keywords string) (users []*proto.UserInfo, err error) {
users = make([]*proto.UserInfo, 0)
err = api.mc.requestWith(&users, newRequest(get, proto.UserList).Header(api.h).addParam("keywords", keywords))
return
}
func (api *UserAPI) ListUsersOfVol(vol string) (users []string, err error) {
users = make([]string, 0)
err = api.mc.requestWith(&users, newRequest(get, proto.UsersOfVol).Header(api.h).addParam("name", vol))
return
}
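// exampleFindUser is an illustrative sketch (not part of this SDK): list users
// whose IDs match a keyword and return the first hit, mirroring how a CLI
// might resolve a partial user name. The helper name is hypothetical.
func exampleFindUser(api *UserAPI, keyword string) (*proto.UserInfo, error) {
	users, err := api.ListUsers(keyword)
	if err != nil {
		return nil, err
	}
	if len(users) == 0 {
		return nil, fmt.Errorf("no user matches %q", keyword)
	}
	return users[0], nil
}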
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/compressor"
"github.com/cubefs/cubefs/util/log"
)
// TODO: re-use response body.
const (
requestTimeout = 30 * time.Second
encodingGzip = compressor.EncodingGzip
headerAcceptEncoding = proto.HeaderAcceptEncoding
headerContentEncoding = proto.HeaderContentEncoding
get = http.MethodGet
post = http.MethodPost
)
var ErrNoValidMaster = errors.New("no valid master")
type MasterCLientWithResolver struct {
MasterClient
resolver *NameResolver
updateInverval int
stopC chan struct{}
}
type MasterClient struct {
sync.RWMutex
masters []string
useSSL bool
leaderAddr string
timeout time.Duration
clientIDKey string
adminAPI *AdminAPI
clientAPI *ClientAPI
nodeAPI *NodeAPI
userAPI *UserAPI
}
func (c *MasterClient) ReplaceMasterAddresses(addrs []string) {
c.Lock()
defer c.Unlock()
c.masters = addrs
c.leaderAddr = ""
}
// AddNode adds the given address to the master address list.
func (c *MasterClient) AddNode(address string) {
c.Lock()
c.updateMaster(address)
c.Unlock()
}
// Leader returns the current leader address.
func (c *MasterClient) Leader() (addr string) {
c.RLock()
addr = c.leaderAddr
c.RUnlock()
return
}
func (c *MasterClient) ClientIDKey() string {
return c.clientIDKey
}
func (c *MasterClient) AdminAPI() *AdminAPI {
return c.adminAPI
}
func (c *MasterClient) ClientAPI() *ClientAPI {
return c.clientAPI
}
func (c *MasterClient) NodeAPI() *NodeAPI {
return c.nodeAPI
}
func (c *MasterClient) UserAPI() *UserAPI {
return c.userAPI
}
// SetLeader changes the current leader address.
func (c *MasterClient) SetLeader(addr string) {
c.Lock()
c.leaderAddr = addr
c.Unlock()
}
// SetTimeout changes the request timeout in seconds.
func (c *MasterClient) SetTimeout(timeout uint16) {
c.Lock()
c.timeout = time.Duration(timeout) * time.Second
c.Unlock()
}
func (c *MasterClient) SetClientIDKey(clientIDKey string) {
c.Lock()
c.clientIDKey = clientIDKey
c.Unlock()
}
func (c *MasterClient) serveRequest(r *request) (repsData []byte, err error) {
leaderAddr, nodes := c.prepareRequest()
host := leaderAddr
for i := -1; i < len(nodes); i++ {
if i == -1 {
if host == "" {
continue
}
} else {
host = nodes[i]
}
var resp *http.Response
schema := "http"
if c.useSSL {
schema = "https"
}
url := fmt.Sprintf("%s://%s%s", schema, host, r.path)
resp, err = c.httpRequest(r.method, url, r)
if err != nil {
log.LogErrorf("serveRequest: send http request fail: method(%v) url(%v) err(%v)", r.method, url, err)
continue
}
stateCode := resp.StatusCode
repsData, err = io.ReadAll(resp.Body)
_ = resp.Body.Close()
if err != nil {
log.LogErrorf("serveRequest: read http response body fail: err(%v)", err)
continue
}
switch stateCode {
case http.StatusForbidden:
curMasterAddr := strings.TrimSpace(string(repsData))
curMasterAddr = strings.Replace(curMasterAddr, "\n", "", -1)
if len(curMasterAddr) == 0 {
log.LogWarnf("serveRequest: server response status 403: request(%s) status"+
"(403), body is empty", host)
err = ErrNoValidMaster
return
}
// switch to the master address returned in the 403 body, then retry
c.SetLeader(curMasterAddr)
repsData, err = c.serveRequest(r)
return
case http.StatusOK:
if leaderAddr != host {
log.LogDebugf("server Request resp new master[%v] old [%v]", host, leaderAddr)
c.SetLeader(host)
}
repsData, err = compressor.New(resp.Header.Get(headerContentEncoding)).Decompress(repsData)
if err != nil {
log.LogErrorf("serveRequest: decompress response body fail: err(%v)", err)
return nil, fmt.Errorf("decompress response body err:%v", err)
}
body := new(proto.HTTPReplyRaw)
if err := body.Unmarshal(repsData); err != nil {
log.LogErrorf("unmarshal response body err:%v", err)
return nil, fmt.Errorf("unmarshal response body err:%v", err)
}
if body.Code != proto.ErrCodeSuccess {
log.LogWarnf("serveRequest: code[%v], msg[%v], data[%v] ", body.Code, body.Msg, body.Data)
if body.Code == proto.ErrCodeInternalError && len(body.Msg) > 0 {
return nil, errors.New(body.Msg)
}
return nil, proto.ParseErrorCode(body.Code)
}
return body.Bytes(), nil
default:
msg := fmt.Sprintf("serveRequest: unknown status: host(%v) uri(%v) status(%v) body(%s).",
resp.Request.URL.String(), host, stateCode, strings.Replace(string(repsData), "\n", "", -1))
err = errors.New(msg)
log.LogErrorf(msg)
continue
}
}
return
}
func (c *MasterClient) requestWith(rst interface{}, r *request) error {
if r.err != nil {
return r.err
}
buf, err := c.serveRequest(r)
if err != nil {
return err
}
if rst == nil {
return nil
}
return json.Unmarshal(buf, rst)
}
// request sends the request and discards the response payload (result target is nil).
func (c *MasterClient) request(r *request) error {
return c.requestWith(nil, r)
}
// Nodes returns all master addresses.
func (c *MasterClient) Nodes() (nodes []string) {
c.RLock()
nodes = c.masters
c.RUnlock()
return
}
// prepareRequest returns the leader address and all master addresses.
func (c *MasterClient) prepareRequest() (addr string, nodes []string) {
c.RLock()
addr = c.leaderAddr
nodes = c.masters
c.RUnlock()
return
}
func (c *MasterClient) httpRequest(method, url string, r *request) (resp *http.Response, err error) {
// Use a dedicated client so that setting the per-request timeout does not
// mutate the shared http.DefaultClient; a nil Transport still reuses
// http.DefaultTransport and its connection pool.
client := &http.Client{}
if !r.noTimeout {
client.Timeout = c.timeout
}
reader := bytes.NewReader(r.body)
var req *http.Request
fullUrl := c.mergeRequestUrl(url, r.params)
log.LogDebugf("httpRequest: method(%v) url(%v) bodyLength[%v].", method, fullUrl, len(r.body))
if req, err = http.NewRequest(method, fullUrl, reader); err != nil {
return
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Connection", "close")
for k, v := range r.header {
req.Header.Set(k, v)
}
resp, err = client.Do(req)
return
}
func (c *MasterClient) updateMaster(address string) {
contains := false
for _, master := range c.masters {
if master == address {
contains = true
break
}
}
if !contains {
c.masters = append(c.masters, address)
}
c.leaderAddr = address
}
func (c *MasterClient) mergeRequestUrl(url string, params map[string]string) string {
if len(params) > 0 {
buff := bytes.NewBuffer([]byte(url))
isFirstParam := true
for k, v := range params {
if isFirstParam {
buff.WriteString("?")
isFirstParam = false
} else {
buff.WriteString("&")
}
buff.WriteString(k)
buff.WriteString("=")
buff.WriteString(v)
}
return buff.String()
}
return url
}
func NewMasterCLientWithResolver(masters []string, useSSL bool, updateInverval int) *MasterCLientWithResolver {
mc := &MasterCLientWithResolver{
MasterClient: MasterClient{masters: masters, useSSL: useSSL, timeout: requestTimeout},
updateInverval: updateInverval,
stopC: make(chan struct{}),
}
mc.adminAPI = &AdminAPI{mc: &mc.MasterClient}
mc.clientAPI = &ClientAPI{mc: &mc.MasterClient}
mc.nodeAPI = &NodeAPI{mc: &mc.MasterClient}
mc.userAPI = &UserAPI{mc: &mc.MasterClient}
resolver, err := NewNameResolver(masters)
if err != nil {
return nil
}
mc.resolver = resolver
return mc
}
func (mc *MasterCLientWithResolver) Start() (err error) {
failed := true
for i := 0; i < 3; i++ {
var changed bool
changed, err = mc.resolver.Resolve()
if changed && err == nil {
var addrs []string
addrs, err = mc.resolver.GetAllAddresses()
if err == nil {
mc.ReplaceMasterAddresses(addrs)
failed = false
break
} else {
log.LogWarnf("MasterCLientWithResolver: Resolve failed: %v, retry %v", err, i)
}
}
}
if failed {
err = errors.New("MasterCLientWithResolver: Resolve failed")
log.LogErrorf("MasterCLientWithResolver: Resolve failed")
return
}
if len(mc.resolver.domains) == 0 {
log.LogDebugf("MasterCLientWithResolver: No domains found, skipping resolving timely")
return
}
go func() {
ticker := time.NewTicker(time.Duration(mc.updateInverval) * time.Minute)
// timer := time.NewTimer(0)
defer ticker.Stop()
for {
select {
case <-mc.stopC:
log.LogInfo("MasterCLientWithResolver goroutine stopped")
return
case <-ticker.C:
changed, err := mc.resolver.Resolve()
if changed && err == nil {
addrs, err := mc.resolver.GetAllAddresses()
if err == nil {
mc.ReplaceMasterAddresses(addrs)
}
}
// timer.Reset(time.Duration(mc.updateInverval) * time.Minute)
}
}
}()
return nil
}
func (mc *MasterCLientWithResolver) Stop() {
select {
case mc.stopC <- struct{}{}:
log.LogDebugf("stop resolver, notified!")
default:
log.LogDebugf("stop resolver, skipping notify!")
}
}
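// exampleResolverClient is an illustrative sketch (not part of this SDK): a
// resolver-backed client refreshes the master address list from DNS every
// updateInverval minutes until Stop is called. The domain and interval are
// placeholders.
func exampleResolverClient() (*MasterCLientWithResolver, error) {
	mc := NewMasterCLientWithResolver([]string{"master.example.com:17010"}, false, 5)
	if mc == nil {
		return nil, fmt.Errorf("name resolver initialization failed")
	}
	if err := mc.Start(); err != nil {
		return nil, err
	}
	return mc, nil
}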
// NewMasterClient returns a new MasterClient instance.
func NewMasterClient(masters []string, useSSL bool) *MasterClient {
mc := &MasterClient{masters: masters, useSSL: useSSL, timeout: requestTimeout}
mc.adminAPI = &AdminAPI{mc: mc}
mc.clientAPI = &ClientAPI{mc: mc}
mc.nodeAPI = &NodeAPI{mc: mc}
mc.userAPI = &UserAPI{mc: mc}
return mc
}
// NewMasterClientFromString parses a raw master address configuration
// string and returns a new MasterClient instance.
// Note that a valid raw string must match: "{HOST}:{PORT},{HOST}:{PORT}"
func NewMasterClientFromString(masterAddr string, useSSL bool) *MasterClient {
masters := make([]string, 0)
for _, master := range strings.Split(masterAddr, ",") {
master = strings.TrimSpace(master)
if master != "" {
masters = append(masters, master)
}
}
return NewMasterClient(masters, useSSL)
}
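// exampleClusterView is an illustrative sketch (not part of this SDK): build a
// client from a comma-separated master address string, set a request timeout,
// and fetch the cluster view through the AdminAPI. The addresses are
// placeholders.
func exampleClusterView() (*proto.ClusterView, error) {
	mc := NewMasterClientFromString("10.0.0.1:17010,10.0.0.2:17010", false)
	mc.SetTimeout(60) // request timeout in seconds
	return mc.AdminAPI().GetCluster()
}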
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"errors"
"fmt"
"math/rand"
"net"
"regexp"
"strconv"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/util/log"
)
var domainRegexp = regexp.MustCompile(`^(?i)[a-z0-9-]+(\.[a-z0-9-]+)+\.?$`)
func IsValidDomain(domain string) bool {
return domainRegexp.MatchString(domain)
}
type IpCache struct {
sync.RWMutex
Ts int64 // time.Now().Unix()
Ips []string
}
func (ic *IpCache) SetIps(ips []string) {
ic.Lock()
defer ic.Unlock()
ic.Ips = ips
ic.Ts = time.Now().Unix()
}
func (ic *IpCache) UpdateTs() {
ic.Lock()
defer ic.Unlock()
ic.Ts = time.Now().Unix()
}
func (ic *IpCache) GetRandomIp() (ip string, err error) {
ic.RLock()
defer ic.RUnlock()
if len(ic.Ips) == 0 {
return "", fmt.Errorf("ip cache is empty")
}
randIndex := rand.Intn(len(ic.Ips))
return ic.Ips[randIndex], nil
}
func (ic *IpCache) GetAllIps() (ips []string, err error) {
ic.RLock()
defer ic.RUnlock()
if len(ic.Ips) == 0 {
return nil, fmt.Errorf("ip cache is empty")
}
return ic.Ips, nil
}
type NameResolver struct {
domains []string
ips []string
port uint64
ic *IpCache
}
// NewNameResolver parses a raw master address configuration
// string and returns a new NameResolver instance.
// Note that each member of addrPorts must match "IP:PORT" or "DOMAIN:PORT",
// and every PORT must be the same.
func NewNameResolver(addrPorts []string) (ns *NameResolver, err error) {
if len(addrPorts) == 0 {
log.LogErrorf("NameResolver: empty addresses for name resolver")
return nil, fmt.Errorf("empty addresses for name resolver")
}
var domains []string
var ips []string
port := uint64(0)
for _, ap := range addrPorts {
if ap == "" {
continue
}
arr := strings.Split(ap, ":")
/*if len(arr) != 2 {
return nil, fmt.Errorf("wrong addr format [%v]", ap)
}*/
arrNum := len(arr)
p := uint64(0)
if arrNum == 2 {
p, err = strconv.ParseUint(arr[1], 10, 64)
if err != nil {
log.LogErrorf("NameResolver: wrong addr format [%v]", ap)
return nil, fmt.Errorf("wrong addr format [%v]", ap)
}
} else if arrNum == 1 {
p = 80
} else {
log.LogErrorf("NameResolver: wrong addr format [%v]", ap)
return nil, fmt.Errorf("wrong addr format [%v]", ap)
}
if port == 0 {
port = p
} else if port != p {
log.LogErrorf("NameResolver: ports are not the same")
return nil, fmt.Errorf("ports are not the same")
}
addr := net.ParseIP(arr[0])
if addr == nil {
if IsValidDomain(arr[0]) {
domains = append(domains, arr[0])
} else {
log.LogErrorf("NameResolver: wrong addr format [%v]", ap)
return nil, fmt.Errorf("wrong addr format [%v]", ap)
}
} else {
ips = append(ips, addr.String())
}
}
ic := &IpCache{}
ns = &NameResolver{
domains: domains,
ips: ips,
port: port,
ic: ic,
}
log.LogDebugf("NameResolver: add ip[%v], domain[%v], port[%v]", ips, domains, port)
return ns, nil
}
func (ns *NameResolver) GetRandomIp() (ip string, err error) {
return ns.ic.GetRandomIp()
}
func (ns *NameResolver) GetAllIps() (ips []string, err error) {
return ns.ic.GetAllIps()
}
func (ns *NameResolver) GetAllAddresses() (addrs []string, err error) {
ips, err := ns.ic.GetAllIps()
if err != nil {
return nil, err
}
for _, ip := range ips {
addr := fmt.Sprintf("%s:%d", ip, ns.port)
addrs = append(addrs, addr)
}
return addrs, nil
}
func (ns *NameResolver) isChanged(ipSet map[string]struct{}) (changed bool) {
for _, ip := range ns.ic.Ips {
if _, ok := ipSet[ip]; !ok {
changed = true
}
}
if !changed {
if len(ipSet) != len(ns.ic.Ips) {
changed = true
}
}
return
}
func (ns *NameResolver) Resolve() (changed bool, err error) {
if len(ns.ips) == 0 && len(ns.domains) == 0 {
return false, fmt.Errorf("name or ip empty")
}
ipSet := make(map[string]struct{}, 0)
if len(ns.domains) > 0 {
var addrs []net.IP
for _, domain := range ns.domains {
addrs, err = net.LookupIP(domain)
if err != nil {
log.LogWarnf("domain [%v] resolved failed", domain)
continue
} else {
for _, ip := range addrs {
ipSet[ip.String()] = struct{}{}
}
}
}
}
for _, ip := range ns.ips {
ipSet[ip] = struct{}{}
}
if len(ipSet) == 0 {
return false, errors.New("resolve: resolving result is empty")
}
var ips []string
for ip := range ipSet {
ips = append(ips, ip)
}
changed = ns.isChanged(ipSet)
if changed {
log.LogInfof("Resolve: resolving result is changed from %v to %v", ns.ic.Ips, ips)
ns.ic.SetIps(ips)
} else {
log.LogDebugf("Resolve: resolving result is not changed %v", ns.ic.Ips)
}
ns.ic.UpdateTs()
return changed, nil
}
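// exampleResolveOnce is an illustrative sketch (not part of this SDK): resolve
// a mixed list of IPs and domains once and turn the cached result into
// "IP:PORT" master addresses. The domain and addresses are placeholders.
func exampleResolveOnce() ([]string, error) {
	ns, err := NewNameResolver([]string{"master.example.com:17010", "10.0.0.3:17010"})
	if err != nil {
		return nil, err
	}
	if _, err = ns.Resolve(); err != nil {
		return nil, err
	}
	return ns.GetAllAddresses()
}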
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package master
import (
"encoding/json"
"fmt"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
)
type request struct {
method string
path string
params map[string]string
header map[string]string
body []byte
err error
noTimeout bool
}
type anyParam struct {
key string
val interface{}
}
var ReqHeaderUA = fmt.Sprintf("cubefs-sdk/%v (commit %v)", proto.Version, proto.CommitID)
func (r *request) addParamAny(key string, value interface{}) *request {
r.params[key] = util.Any2String(value)
return r
}
func (r *request) addParam(key, value string) *request {
r.params[key] = value
return r
}
func (r *request) addHeader(key, value string) *request {
r.header[key] = value
return r
}
func (r *request) setBody(body []byte) *request {
r.body = body
return r
}
func (r *request) Param(params ...anyParam) *request {
for _, param := range params {
r.addParamAny(param.key, param.val)
}
return r
}
func (r *request) Header(headers map[string]string, added ...string) *request {
if len(added)%2 == 1 {
added = added[:len(added)-1]
}
for k, v := range headers {
r.header[k] = v
}
for idx := 0; idx < len(added); idx += 2 {
r.header[added[idx]] = added[idx+1]
}
return r
}
func (r *request) Body(body interface{}) *request {
reqBody, ok := body.([]byte)
if !ok {
var err error
if reqBody, err = json.Marshal(body); err != nil {
r.err = fmt.Errorf("body json marshal %s", err.Error())
return r
}
}
r.body = reqBody
return r
}
func (r *request) NoTimeout() *request {
r.noTimeout = true
return r
}
func newRequest(method string, path string) *request {
req := &request{
method: method,
path: path,
params: make(map[string]string),
header: make(map[string]string),
}
req.header["User-Agent"] = ReqHeaderUA
return req
}
func mergeHeader(headers map[string]string, added ...string) map[string]string {
if len(added)%2 == 1 {
added = added[:len(added)-1]
}
copied := make(map[string]string, len(headers)+len(added)/2)
for k, v := range headers {
copied[k] = v
}
for idx := 0; idx < len(added); idx += 2 {
copied[added[idx]] = added[idx+1]
}
return copied
}
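// exampleVolStatRequest is an illustrative sketch (not part of this SDK)
// showing how the fluent request builder above composes a GET request: the
// path comes from the proto package, and parameters and headers are chained
// before the API wrappers hand the request to serveRequest. The extra header
// key is a placeholder.
func exampleVolStatRequest(volName string) *request {
	return newRequest(get, proto.ClientVolStat).
		Header(map[string]string{"X-Example-Trace": "1"}).
		addParam("name", volName).
		addParamAny("version", proto.LFClient)
}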
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package meta
import (
"errors"
"fmt"
syslog "log"
"math"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
// Low-level API, i.e. operations that work directly with inodes.
const (
OpenRetryInterval = 5 * time.Millisecond
OpenRetryLimit = 1000
maxUniqID = 5000
)
const (
BatchIgetRespBuf = 1000
MaxSummaryGoroutineNum = 120
BatchGetBufLen = 500
UpdateSummaryRetry = 3
SummaryKey = "DirStat"
ChannelLen = 100
BatchSize = 200
MaxGoroutineNum = 5
InodeFullMaxRetryTime = 2
ForceUpdateRWMP = "ForceUpdateRWMP"
)
func mapHaveSameKeys(m1, m2 map[uint32]*proto.MetaQuotaInfo) bool {
if len(m1) != len(m2) {
return false
}
for k := range m1 {
if _, ok := m2[k]; !ok {
return false
}
}
return true
}
func (mw *MetaWrapper) GetRootIno(subdir string) (uint64, error) {
rootIno, err := mw.LookupPath(subdir)
if err != nil {
return 0, fmt.Errorf("GetRootIno: Lookup failed, subdir(%v) err(%v)", subdir, err)
}
info, err := mw.InodeGet_ll(rootIno)
if err != nil {
return 0, fmt.Errorf("GetRootIno: InodeGet failed, subdir(%v) err(%v)", subdir, err)
}
if !proto.IsDir(info.Mode) {
return 0, fmt.Errorf("GetRootIno: not directory, subdir(%v) mode(%v) err(%v)", subdir, info.Mode, err)
}
syslog.Printf("GetRootIno: %v\n", rootIno)
return rootIno, nil
}
// LookupPath looks up the given absolute path and returns its inode number
func (mw *MetaWrapper) LookupPath(subdir string) (uint64, error) {
ino := proto.RootIno
if subdir == "" || subdir == "/" {
return ino, nil
}
dirs := strings.Split(subdir, "/")
for _, dir := range dirs {
if dir == "/" || dir == "" {
continue
}
child, _, err := mw.Lookup_ll(ino, dir)
if err != nil {
return 0, err
}
ino = child
}
return ino, nil
}
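// Illustrative sketch (not part of the original source): resolving a subdir
// path to its inode and then loading its attributes; "/a/b/c" is a
// hypothetical placeholder.
func exampleLookupSubdir(mw *MetaWrapper) (*proto.InodeInfo, error) {
	ino, err := mw.LookupPath("/a/b/c") // walks "a", "b", "c" starting from proto.RootIno
	if err != nil {
		return nil, err
	}
	return mw.InodeGet_ll(ino)
}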
func (mw *MetaWrapper) Statfs() (total, used, inodeCount uint64) {
total = atomic.LoadUint64(&mw.totalSize)
used = atomic.LoadUint64(&mw.usedSize)
inodeCount = atomic.LoadUint64(&mw.inodeCount)
return
}
func (mw *MetaWrapper) Create_ll(parentID uint64, name string, mode, uid, gid uint32, target []byte, fullPath string) (*proto.InodeInfo, error) {
// if mw.EnableTransaction {
txMask := proto.TxOpMaskOff
if proto.IsRegular(mode) {
txMask = proto.TxOpMaskCreate
} else if proto.IsDir(mode) {
txMask = proto.TxOpMaskMkdir
} else if proto.IsSymlink(mode) {
txMask = proto.TxOpMaskSymlink
} else {
txMask = proto.TxOpMaskMknod
}
txType := proto.TxMaskToType(txMask)
if mw.enableTx(txMask) && txType != proto.TxTypeUndefined {
return mw.txCreate_ll(parentID, name, mode, uid, gid, target, txType, fullPath)
} else {
return mw.create_ll(parentID, name, mode, uid, gid, target, fullPath)
}
}
func (mw *MetaWrapper) txCreate_ll(parentID uint64, name string, mode, uid, gid uint32, target []byte, txType uint32, fullPath string) (info *proto.InodeInfo, err error) {
var (
status int
// err error
// info *proto.InodeInfo
mp *MetaPartition
rwPartitions []*MetaPartition
)
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
log.LogErrorf("txCreate_ll: No parent partition, parentID(%v)", parentID)
return nil, syscall.ENOENT
}
var quotaIds []uint32
if mw.EnableQuota {
quotaInfos, err := mw.getInodeQuota(parentMP, parentID)
if err != nil {
log.LogErrorf("Create_ll: get parent quota fail, parentID(%v) err(%v)", parentID, err)
return nil, syscall.ENOENT
}
for quotaId := range quotaInfos {
quotaIds = append(quotaIds, quotaId)
}
}
rwPartitions = mw.getRWPartitions()
length := len(rwPartitions)
var tx *Transaction
defer func() {
if tx != nil {
err = tx.OnDone(err, mw)
}
}()
epoch := atomic.AddUint64(&mw.epoch, 1)
for i := 0; i < length; i++ {
index := (int(epoch) + i) % length
mp = rwPartitions[index]
tx, err = NewCreateTransaction(parentMP, mp, parentID, name, mw.TxTimeout, txType)
if err != nil {
return nil, syscall.EAGAIN
}
status, info, err = mw.txIcreate(tx, mp, mode, uid, gid, target, quotaIds, fullPath)
if err == nil && status == statusOK {
goto create_dentry
} else if status == statusNoSpace {
log.LogErrorf("Create_ll status %v", status)
return nil, statusToErrno(status)
} else {
// sync cancel previous transaction before retry
tx.Rollback(mw)
}
}
return nil, syscall.ENOMEM
create_dentry:
if log.EnableDebug() {
log.LogDebugf("txCreate_ll: tx.txInfo(%v)", tx.txInfo)
}
status, err = mw.txDcreate(tx, parentMP, parentID, name, info.Inode, mode, quotaIds, fullPath)
if err != nil || status != statusOK {
return nil, statusErrToErrno(status, err)
}
if log.EnableDebug() {
log.LogDebugf("txCreate_ll: tx.txInfo(%v)", tx.txInfo)
}
if mw.EnableSummary {
var filesInc, dirsInc int64
if proto.IsDir(mode) {
dirsInc = 1
} else {
filesInc = 1
}
// go mw.UpdateSummary_ll(parentID, filesInc, dirsInc, 0)
job := func() {
mw.UpdateSummary_ll(parentID, filesInc, dirsInc, 0)
}
tx.SetOnCommit(job)
}
return info, nil
}
func (mw *MetaWrapper) create_ll(parentID uint64, name string, mode, uid, gid uint32, target []byte, fullPath string) (*proto.InodeInfo, error) {
var (
status int
err error
info *proto.InodeInfo
mp *MetaPartition
rwPartitions []*MetaPartition
)
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
log.LogErrorf("Create_ll: No parent partition, parentID(%v)", parentID)
return nil, syscall.ENOENT
}
status, info, err = mw.iget(parentMP, parentID, mw.LastVerSeq)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
quota := atomic.LoadUint32(&mw.DirChildrenNumLimit)
if info.Nlink >= quota {
log.LogErrorf("Create_ll: parent inode's nlink quota reached, parentID(%v)", parentID)
return nil, syscall.EDQUOT
}
get_rwmp:
rwPartitions = mw.getRWPartitions()
length := len(rwPartitions)
epoch := atomic.AddUint64(&mw.epoch, 1)
retryTime := 0
var quotaIds []uint32
if mw.EnableQuota {
quotaInfos, err := mw.getInodeQuota(parentMP, parentID)
if err != nil {
log.LogErrorf("Create_ll: get parent quota fail, parentID(%v) err(%v)", parentID, err)
return nil, syscall.ENOENT
}
for quotaId := range quotaInfos {
quotaIds = append(quotaIds, quotaId)
}
for i := 0; i < length; i++ {
index := (int(epoch) + i) % length
mp = rwPartitions[index]
status, info, err = mw.quotaIcreate(mp, mode, uid, gid, target, quotaIds, fullPath)
if err == nil && status == statusOK {
goto create_dentry
} else if status == statusFull {
if retryTime >= InodeFullMaxRetryTime {
break
}
retryTime++
log.LogWarnf("Mp(%v) inode is full, trigger rwmp get and retry(%v)", mp, retryTime)
mw.singleflight.Do(ForceUpdateRWMP, func() (interface{}, error) {
mw.triggerAndWaitForceUpdate()
return nil, nil
})
goto get_rwmp
} else if status == statusNoSpace {
log.LogErrorf("Create_ll status %v", status)
return nil, statusToErrno(status)
}
}
} else {
for i := 0; i < length; i++ {
index := (int(epoch) + i) % length
mp = rwPartitions[index]
status, info, err = mw.icreate(mp, mode, uid, gid, target, fullPath)
if err == nil && status == statusOK {
goto create_dentry
} else if status == statusFull {
if retryTime >= InodeFullMaxRetryTime {
break
}
retryTime++
log.LogWarnf("Mp(%v) inode is full, trigger rwmp get and retry(%v)", mp, retryTime)
mw.singleflight.Do(ForceUpdateRWMP, func() (interface{}, error) {
mw.triggerAndWaitForceUpdate()
return nil, nil
})
goto get_rwmp
} else if status == statusNoSpace {
log.LogErrorf("Create_ll status %v", status)
return nil, statusToErrno(status)
}
}
}
return nil, syscall.ENOMEM
create_dentry:
if mw.EnableQuota {
status, err = mw.quotaDcreate(parentMP, parentID, name, info.Inode, mode, quotaIds, fullPath)
} else {
status, err = mw.dcreate(parentMP, parentID, name, info.Inode, mode, fullPath)
}
if err != nil {
if status == statusOpDirQuota || status == statusNoSpace {
mw.iunlink(mp, info.Inode, mw.Client.GetLatestVer(), 0, fullPath)
mw.ievict(mp, info.Inode, fullPath)
}
return nil, statusToErrno(status)
} else if status != statusOK {
if status != statusExist {
mw.iunlink(mp, info.Inode, mw.Client.GetLatestVer(), 0, fullPath)
mw.ievict(mp, info.Inode, fullPath)
}
return nil, statusToErrno(status)
}
if mw.EnableSummary {
var filesInc, dirsInc int64
if proto.IsDir(mode) {
dirsInc = 1
} else {
filesInc = 1
}
go mw.UpdateSummary_ll(parentID, filesInc, dirsInc, 0)
}
return info, nil
}
func (mw *MetaWrapper) Lookup_ll(parentID uint64, name string) (inode uint64, mode uint32, err error) {
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
log.LogErrorf("Lookup_ll: No parent partition, parentID(%v) name(%v)", parentID, name)
return 0, 0, syscall.ENOENT
}
status, inode, mode, err := mw.lookup(parentMP, parentID, name, mw.VerReadSeq)
if err != nil || status != statusOK {
return 0, 0, statusToErrno(status)
}
return inode, mode, nil
}
func (mw *MetaWrapper) BatchGetExpiredMultipart(prefix string, days int) (expiredIds []*proto.ExpiredMultipartInfo, err error) {
partitions := mw.partitions
var mp *MetaPartition
wg := new(sync.WaitGroup)
var resultMu sync.Mutex
log.LogDebugf("BatchGetExpiredMultipart: mp num(%v) prefix(%v) days(%v)", len(partitions), prefix, days)
for _, mp = range partitions {
wg.Add(1)
go func(mp *MetaPartition) {
defer wg.Done()
status, infos, err := mw.getExpiredMultipart(prefix, days, mp)
if err == nil && status == statusOK {
resultMu.Lock()
expiredIds = append(expiredIds, infos...)
resultMu.Unlock()
}
if err != nil && err != syscall.ENOENT {
log.LogErrorf("batchGetExpiredMultipart: get expired multipart fail: partitionId(%v)",
mp.PartitionID)
}
}(mp)
}
wg.Wait()
resultMu.Lock()
defer resultMu.Unlock()
if len(expiredIds) == 0 {
err = syscall.ENOENT
return
}
return
}
func (mw *MetaWrapper) InodeGet_ll(inode uint64) (*proto.InodeInfo, error) {
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("InodeGet_ll: No such partition, ino(%v)", inode)
return nil, syscall.ENOENT
}
status, info, err := mw.iget(mp, inode, mw.VerReadSeq)
if err != nil || status != statusOK {
if status == statusNoent {
// For NOENT error, pull the latest mp and give it another try,
// in case the mp view is outdated.
mw.triggerAndWaitForceUpdate()
return mw.doInodeGet(inode)
}
return nil, statusToErrno(status)
}
if mw.EnableQuota {
if len(info.QuotaInfos) != 0 && proto.IsDir(info.Mode) {
var qinfo QuotaCacheInfo
qinfo.quotaInfos = make(map[uint32]*proto.MetaQuotaInfo)
qinfo.quotaInfos = info.QuotaInfos
qinfo.inode = inode
mw.qc.Put(inode, &qinfo)
}
}
log.LogDebugf("InodeGet_ll: info(%v)", info)
return info, nil
}
// Just like InodeGet but without retry
func (mw *MetaWrapper) doInodeGet(inode uint64) (*proto.InodeInfo, error) {
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("InodeGet_ll: No such partition, ino(%v)", inode)
return nil, syscall.ENOENT
}
status, info, err := mw.iget(mp, inode, mw.VerReadSeq)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
log.LogDebugf("doInodeGet: info(%v)", info)
return info, nil
}
func (mw *MetaWrapper) BatchInodeGet(inodes []uint64) []*proto.InodeInfo {
var wg sync.WaitGroup
batchInfos := make([]*proto.InodeInfo, 0)
resp := make(chan []*proto.InodeInfo, BatchIgetRespBuf)
candidates := make(map[uint64][]uint64)
// Target partition does not have to be very accurate.
for _, ino := range inodes {
mp := mw.getPartitionByInode(ino)
if mp == nil {
continue
}
if _, ok := candidates[mp.PartitionID]; !ok {
candidates[mp.PartitionID] = make([]uint64, 0, 256)
}
candidates[mp.PartitionID] = append(candidates[mp.PartitionID], ino)
}
for id, inos := range candidates {
mp := mw.getPartitionByID(id)
if mp == nil {
continue
}
wg.Add(1)
go mw.batchIget(&wg, mp, inos, resp)
}
go func() {
wg.Wait()
close(resp)
}()
for infos := range resp {
batchInfos = append(batchInfos, infos...)
}
log.LogDebugf("BatchInodeGet: inodesCnt(%d)", len(inodes))
return batchInfos
}
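// Illustrative sketch (not part of the original source): stat-ing every child
// of a directory with one batched call. BatchInodeGet groups the inodes by
// meta partition internally, so callers just pass the flat list.
func exampleStatChildren(mw *MetaWrapper, parentID uint64) ([]*proto.InodeInfo, error) {
	dentries, err := mw.ReadDir_ll(parentID)
	if err != nil {
		return nil, err
	}
	inodes := make([]uint64, 0, len(dentries))
	for _, den := range dentries {
		inodes = append(inodes, den.Inode)
	}
	// Inodes whose partition cannot be located are silently skipped by
	// BatchInodeGet, so the result may be shorter than the input.
	return mw.BatchInodeGet(inodes), nil
}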
// InodeDelete_ll is a low-level api that removes the specified inode immediately
// and does not affect the extent data managed by this inode.
func (mw *MetaWrapper) InodeDelete_ll(inode uint64, fullPath string) error {
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("InodeDelete: No such partition, ino(%v)", inode)
return syscall.ENOENT
}
status, err := mw.idelete(mp, inode, fullPath)
if err != nil || status != statusOK {
return statusToErrno(status)
}
log.LogDebugf("InodeDelete_ll: inode(%v)", inode)
return nil
}
func (mw *MetaWrapper) BatchGetXAttr(inodes []uint64, keys []string) ([]*proto.XAttrInfo, error) {
// Collect meta partitions
var (
mps = make(map[uint64]*MetaPartition) // Mapping: partition ID -> partition
mpInodes = make(map[uint64][]uint64) // Mapping: partition ID -> inodes
)
for _, ino := range inodes {
mp := mw.getPartitionByInode(ino)
if mp != nil {
mps[mp.PartitionID] = mp
mpInodes[mp.PartitionID] = append(mpInodes[mp.PartitionID], ino)
}
}
var (
xattrsCh = make(chan *proto.XAttrInfo, len(inodes))
errorsCh = make(chan error, len(inodes))
)
var wg sync.WaitGroup
for pID := range mps {
wg.Add(1)
go func(mp *MetaPartition, inodes []uint64, keys []string) {
defer wg.Done()
xattrs, err := mw.batchGetXAttr(mp, inodes, keys)
if err != nil {
errorsCh <- err
log.LogErrorf("BatchGetXAttr: get xattr fail: volume(%v) partitionID(%v) inodes(%v) keys(%v) err(%s)",
mw.volname, mp.PartitionID, inodes, keys, err)
return
}
for _, info := range xattrs {
xattrsCh <- info
}
}(mps[pID], mpInodes[pID], keys)
}
wg.Wait()
close(xattrsCh)
close(errorsCh)
if len(errorsCh) > 0 {
return nil, <-errorsCh
}
xattrs := make([]*proto.XAttrInfo, 0, len(inodes))
for {
info := <-xattrsCh
if info == nil {
break
}
xattrs = append(xattrs, info)
}
return xattrs, nil
}
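// Illustrative sketch (not part of the original source): reading a single
// xattr key for a set of inodes; "user.example" is a hypothetical key name.
func exampleBatchReadXAttr(mw *MetaWrapper, inodes []uint64) (map[uint64]string, error) {
	infos, err := mw.BatchGetXAttr(inodes, []string{"user.example"})
	if err != nil {
		return nil, err
	}
	values := make(map[uint64]string, len(infos))
	for _, info := range infos {
		values[info.Inode] = info.XAttrs["user.example"]
	}
	return values, nil
}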
func (mw *MetaWrapper) Delete_ll(parentID uint64, name string, isDir bool, fullPath string) (*proto.InodeInfo, error) {
if mw.enableTx(proto.TxOpMaskRemove) {
return mw.txDelete_ll(parentID, name, isDir, fullPath)
} else {
return mw.Delete_ll_EX(parentID, name, isDir, 0, fullPath)
}
}
func (mw *MetaWrapper) Delete_Ver_ll(parentID uint64, name string, isDir bool, verSeq uint64, fullPath string) (*proto.InodeInfo, error) {
if verSeq == 0 {
verSeq = math.MaxUint64
}
log.LogDebugf("Delete_Ver_ll.parentId %v name %v isDir %v verSeq %v", parentID, name, isDir, verSeq)
return mw.Delete_ll_EX(parentID, name, isDir, verSeq, fullPath)
}
func (mw *MetaWrapper) DeleteWithCond_ll(parentID, cond uint64, name string, isDir bool, fullPath string) (*proto.InodeInfo, error) {
return mw.deletewithcond_ll(parentID, cond, name, isDir, fullPath)
}
func (mw *MetaWrapper) txDelete_ll(parentID uint64, name string, isDir bool, fullPath string) (info *proto.InodeInfo, err error) {
var (
status int
inode uint64
mode uint32
mp *MetaPartition
)
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
log.LogErrorf("txDelete_ll: No parent partition, parentID(%v) name(%v)", parentID, name)
return nil, syscall.ENOENT
}
var tx *Transaction
defer func() {
if tx != nil {
err = tx.OnDone(err, mw)
}
}()
status, inode, mode, err = mw.lookup(parentMP, parentID, name, mw.LastVerSeq)
if err != nil || status != statusOK {
return nil, statusErrToErrno(status, err)
}
mp = mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("txDelete_ll: No inode partition, parentID(%v) name(%v) ino(%v)", parentID, name, inode)
return nil, syscall.EINVAL
}
if isDir && !proto.IsDir(mode) {
return nil, syscall.EINVAL
}
if isDir && mw.EnableQuota {
quotaInfos, err := mw.GetInodeQuota_ll(inode)
if err != nil {
log.LogErrorf("get inode [%v] quota failed [%v]", inode, err)
return nil, syscall.ENOENT
}
for _, info := range quotaInfos {
if info.RootInode {
log.LogErrorf("can not remove quota Root inode equal inode [%v]", inode)
return nil, syscall.EACCES
}
}
}
tx, err = NewDeleteTransaction(parentMP, parentID, name, mp, inode, mw.TxTimeout)
if err != nil {
return nil, syscall.EAGAIN
}
status, err = mw.txCreateTX(tx, parentMP)
if status != statusOK || err != nil {
return nil, statusErrToErrno(status, err)
}
funcs := make([]func() (int, error), 0)
funcs = append(funcs, func() (int, error) {
var newSt int
var newErr error
newSt, _, newErr = mw.txDdelete(tx, parentMP, parentID, inode, name, fullPath)
return newSt, newErr
})
funcs = append(funcs, func() (int, error) {
var newSt int
var newErr error
newSt, info, newErr = mw.txIunlink(tx, mp, inode, fullPath)
return newSt, newErr
})
// 2. prepare transaction
var preErr error
wg := sync.WaitGroup{}
for _, fc := range funcs {
wg.Add(1)
go func(f func() (int, error)) {
defer wg.Done()
tStatus, tErr := f()
if tStatus != statusOK || tErr != nil {
preErr = statusErrToErrno(tStatus, tErr)
}
}(fc)
}
wg.Wait()
if preErr != nil {
return info, preErr
}
if mw.EnableSummary {
var job func()
// go func() {
if proto.IsDir(mode) {
job = func() {
mw.UpdateSummary_ll(parentID, 0, -1, 0)
}
} else {
job = func() {
mw.UpdateSummary_ll(parentID, -1, 0, -int64(info.Size))
}
}
tx.SetOnCommit(job)
}
return info, preErr
}
/*
 * Note that the returned InodeInfo might be nil even when there is no error,
 * so the caller should make sure the InodeInfo is valid before using it.
 */
func (mw *MetaWrapper) Delete_ll_EX(parentID uint64, name string, isDir bool, verSeq uint64, fullPath string) (*proto.InodeInfo, error) {
var (
status int
inode uint64
mode uint32
err error
info *proto.InodeInfo
mp *MetaPartition
inodeCreateTime int64
denVer uint64
)
log.LogDebugf("action[Delete_ll_EX] name %v verSeq %v parentID %v isDir %v", name, verSeq, parentID, isDir)
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
log.LogErrorf("delete_ll: No parent partition, parentID(%v) name(%v)", parentID, name)
return nil, syscall.ENOENT
}
if isDir {
status, inode, mode, err = mw.lookup(parentMP, parentID, name, verSeq)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
if !proto.IsDir(mode) {
return nil, syscall.EINVAL
}
if verSeq == 0 {
mp = mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("Delete_ll: No inode partition, parentID(%v) name(%v) ino(%v)", parentID, name, inode)
return nil, syscall.EAGAIN
}
status, info, err = mw.iget(mp, inode, verSeq)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
if info == nil || info.Nlink > 2 {
return nil, syscall.ENOTEMPTY
}
}
if mw.EnableQuota {
quotaInfos, err := mw.GetInodeQuota_ll(inode)
if err != nil {
log.LogErrorf("get inode [%v] quota failed [%v]", inode, err)
return nil, syscall.ENOENT
}
for _, info := range quotaInfos {
if info.RootInode {
log.LogErrorf("can not remove quota Root inode equal inode [%v]", inode)
return nil, syscall.EACCES
}
}
mw.qc.Delete(inode)
}
if mw.volDeleteLockTime > 0 {
inodeCreateTime = info.CreateTime.Unix()
if ok, err := mw.canDeleteInode(mp, info, inode); !ok {
return nil, err
}
}
} else {
if mw.volDeleteLockTime > 0 {
status, inode, _, err = mw.lookup(parentMP, parentID, name, verSeq)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
mp = mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("delete_ll: No inode partition, parentID(%v) name(%v) ino(%v)", parentID, name, inode)
return nil, syscall.EAGAIN
}
status, info, err = mw.iget(mp, inode, verSeq)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
inodeCreateTime = info.CreateTime.Unix()
if ok, err := mw.canDeleteInode(mp, info, inode); !ok {
return nil, err
}
}
}
log.LogDebugf("action[Delete_ll] parentID %v name %v verSeq %v", parentID, name, verSeq)
status, inode, _, err = mw.ddelete(parentMP, parentID, name, inodeCreateTime, verSeq, fullPath)
if err != nil || status != statusOK {
if status == statusNoent {
log.LogDebugf("action[Delete_ll] parentID %v name %v verSeq %v", parentID, name, verSeq)
return nil, nil
}
log.LogDebugf("action[Delete_ll] parentID %v name %v verSeq %v", parentID, name, verSeq)
return nil, statusToErrno(status)
}
log.LogDebugf("action[Delete_ll] parentID %v name %v verSeq %v", parentID, name, verSeq)
// The dentry was deleted successfully but the inode was not; still return success.
mp = mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("delete_ll: No inode partition, parentID(%v) name(%v) ino(%v)", parentID, name, inode)
return nil, nil
}
log.LogDebugf("action[Delete_ll] parentID %v name %v verSeq %v", parentID, name, verSeq)
status, info, err = mw.iunlink(mp, inode, verSeq, denVer, fullPath)
if err != nil || status != statusOK {
log.LogDebugf("action[Delete_ll] parentID %v inode %v name %v verSeq %v err %v", parentID, inode, name, verSeq, err)
return nil, nil
}
if verSeq == 0 && mw.EnableSummary {
go func() {
if proto.IsDir(mode) {
mw.UpdateSummary_ll(parentID, 0, -1, 0)
} else {
mw.UpdateSummary_ll(parentID, -1, 0, -int64(info.Size))
}
}()
}
return info, nil
}
func isObjectLocked(mw *MetaWrapper, inode uint64, name string) error {
xattrInfo, err := mw.XAttrGet_ll(inode, "oss:lock")
if err != nil {
log.LogErrorf("isObjectLocked: check ObjectLock err(%v) name(%v)", err, name)
return err
}
retainUntilDate := xattrInfo.Get("oss:lock")
if len(retainUntilDate) > 0 {
retainUntilDateInt64, err := strconv.ParseInt(string(retainUntilDate), 10, 64)
if err != nil {
return err
}
if retainUntilDateInt64 > time.Now().UnixNano() {
log.LogWarnf("isObjectLocked: object is locked, retainUntilDate(%v) name(%v)", retainUntilDateInt64, name)
return errors.New("Access Denied")
}
}
return nil
}
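// Illustrative sketch (not part of the original source): isObjectLocked above
// reads "oss:lock" as a retain-until timestamp in nanoseconds, so a writer
// would store something like this before relying on the check.
func exampleSetObjectLock(mw *MetaWrapper, inode uint64, retainFor time.Duration) error {
	retainUntil := time.Now().Add(retainFor).UnixNano()
	return mw.XAttrSet_ll(inode, []byte("oss:lock"), []byte(strconv.FormatInt(retainUntil, 10)))
}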
func (mw *MetaWrapper) deletewithcond_ll(parentID, cond uint64, name string, isDir bool, fullPath string) (*proto.InodeInfo, error) {
err := isObjectLocked(mw, cond, name)
if err != nil {
return nil, err
}
var (
status int
inode uint64
mode uint32
info *proto.InodeInfo
mp *MetaPartition
)
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
log.LogErrorf("delete_ll: No parent partition, parentID(%v) name(%v)", parentID, name)
return nil, syscall.ENOENT
}
if isDir {
status, inode, mode, err = mw.lookup(parentMP, parentID, name, mw.LastVerSeq)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
if !proto.IsDir(mode) {
return nil, syscall.EINVAL
}
mp = mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("delete_ll: No inode partition, parentID(%v) name(%v) ino(%v)", parentID, name, inode)
return nil, syscall.EAGAIN
}
status, info, err = mw.iget(mp, inode, mw.VerReadSeq)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
if info == nil || info.Nlink > 2 {
return nil, syscall.ENOTEMPTY
}
quotaInfos, err := mw.GetInodeQuota_ll(inode)
if err != nil {
log.LogErrorf("get inode [%v] quota failed [%v]", inode, err)
return nil, syscall.ENOENT
}
for _, info := range quotaInfos {
if info.RootInode {
log.LogErrorf("can not remove quota Root inode equal inode [%v]", inode)
return nil, syscall.EACCES
}
}
}
dentry := []proto.Dentry{
{
Name: name,
Inode: cond,
Type: mode,
},
}
status, resp, err := mw.ddeletes(parentMP, parentID, dentry, []string{fullPath})
if err != nil || status != statusOK {
if status == statusNoent {
return nil, nil
}
return nil, statusToErrno(status)
}
status = parseStatus(resp.Items[0].Status)
if status != statusOK {
if status == statusNoent {
return nil, nil
}
return nil, statusToErrno(status)
}
mp = mw.getPartitionByInode(resp.Items[0].Inode)
if mp == nil {
log.LogErrorf("delete_ll: No inode partition, parentID(%v) name(%v) ino(%v)", parentID, name, inode)
return nil, nil
}
status, info, err = mw.iunlink(mp, resp.Items[0].Inode, 0, 0, fullPath)
if err != nil || status != statusOK {
return nil, nil
}
if mw.EnableSummary {
go func() {
if proto.IsDir(mode) {
mw.UpdateSummary_ll(parentID, 0, -1, 0)
} else {
mw.UpdateSummary_ll(parentID, -1, 0, -int64(info.Size))
}
}()
}
return info, nil
}
func (mw *MetaWrapper) Rename_ll(srcParentID uint64, srcName string, dstParentID uint64, dstName string, srcFullPath string, dstFullPath string, overwritten bool) (err error) {
if mw.enableTx(proto.TxOpMaskRename) {
return mw.txRename_ll(srcParentID, srcName, dstParentID, dstName, srcFullPath, dstFullPath, overwritten)
} else {
return mw.rename_ll(srcParentID, srcName, dstParentID, dstName, srcFullPath, dstFullPath, overwritten)
}
}
func (mw *MetaWrapper) txRename_ll(srcParentID uint64, srcName string, dstParentID uint64, dstName string, srcFullPath string, dstFullPath string, overwritten bool) (err error) {
var tx *Transaction
defer func() {
if tx != nil {
err = tx.OnDone(err, mw)
}
}()
srcParentMP := mw.getPartitionByInode(srcParentID)
if srcParentMP == nil {
return syscall.ENOENT
}
dstParentMP := mw.getPartitionByInode(dstParentID)
if dstParentMP == nil {
return syscall.ENOENT
}
// look up for the src ino
status, srcInode, srcMode, err := mw.lookup(srcParentMP, srcParentID, srcName, mw.LastVerSeq)
if err != nil || status != statusOK {
return statusToErrno(status)
}
tx, err = NewRenameTransaction(srcParentMP, srcParentID, srcName, dstParentMP, dstParentID, dstName, mw.TxTimeout)
if err != nil {
return syscall.EAGAIN
}
funcs := make([]func() (int, error), 0)
status, dstInode, dstMode, err := mw.lookup(dstParentMP, dstParentID, dstName, mw.LastVerSeq)
if err == nil && status == statusOK {
// Note that only regular files are allowed to be overwritten.
if !proto.IsRegular(dstMode) || !overwritten || !proto.IsRegular(srcMode) {
return syscall.EEXIST
}
oldInodeMP := mw.getPartitionByInode(dstInode)
if oldInodeMP == nil {
return syscall.EAGAIN
}
err = RenameTxReplaceInode(tx, oldInodeMP, dstInode)
if err != nil {
return syscall.EAGAIN
}
funcs = append(funcs, func() (int, error) {
var newSt int
var newErr error
newSt, _, newErr = mw.txDupdate(tx, dstParentMP, dstParentID, dstName, srcInode, dstInode, dstFullPath)
return newSt, newErr
})
funcs = append(funcs, func() (int, error) {
var newSt int
var newErr error
newSt, _, newErr = mw.txIunlink(tx, oldInodeMP, dstInode, dstFullPath)
if newSt == statusNoent {
return statusOK, nil
}
return newSt, newErr
})
if log.EnableDebug() {
log.LogDebugf("txRename_ll: tx(%v), pid:%v, name:%v, old(ino:%v) is replaced by src(new ino:%v)",
tx.txInfo, dstParentID, dstName, dstInode, srcInode)
}
} else if status == statusNoent {
funcs = append(funcs, func() (int, error) {
var newSt int
var newErr error
newSt, newErr = mw.txDcreate(tx, dstParentMP, dstParentID, dstName, srcInode, srcMode, []uint32{}, dstFullPath)
return newSt, newErr
})
} else {
return statusToErrno(status)
}
// var inode uint64
funcs = append(funcs, func() (int, error) {
var newSt int
var newErr error
newSt, _, newErr = mw.txDdelete(tx, srcParentMP, srcParentID, srcInode, srcName, srcFullPath)
return newSt, newErr
})
if log.EnableDebug() {
log.LogDebugf("txRename_ll: tx(%v), pid:%v, name:%v, old(ino:%v) is replaced by src(new ino:%v)",
tx.txInfo, dstParentID, dstName, dstInode, srcInode)
}
// 1. create transaction
status, err = mw.txCreateTX(tx, dstParentMP)
if status != statusOK || err != nil {
return statusErrToErrno(status, err)
}
// 2. prepare transaction
var preErr error
wg := sync.WaitGroup{}
for _, fc := range funcs {
wg.Add(1)
go func(f func() (int, error)) {
defer wg.Done()
tStatus, tErr := f()
if tStatus != statusOK || tErr != nil {
preErr = statusErrToErrno(tStatus, tErr)
}
}(fc)
}
wg.Wait()
if preErr != nil {
return preErr
}
// update summary
var job func()
if mw.EnableSummary {
var srcInodeInfo *proto.InodeInfo
var dstInodeInfo *proto.InodeInfo
srcInodeInfo, _ = mw.InodeGet_ll(srcInode)
if dstInode != 0 {
dstInodeInfo, _ = mw.InodeGet_ll(dstInode)
sizeInc := srcInodeInfo.Size - dstInodeInfo.Size
job = func() {
mw.UpdateSummary_ll(srcParentID, -1, 0, -int64(srcInodeInfo.Size))
mw.UpdateSummary_ll(dstParentID, 0, 0, int64(sizeInc))
}
tx.SetOnCommit(job)
return
} else {
sizeInc := int64(srcInodeInfo.Size)
if proto.IsRegular(srcMode) {
log.LogDebugf("txRename_ll: update summary when file dentry is replaced")
job = func() {
mw.UpdateSummary_ll(srcParentID, -1, 0, -sizeInc)
mw.UpdateSummary_ll(dstParentID, 1, 0, sizeInc)
}
} else {
log.LogDebugf("txRename_ll: update summary when dir dentry is replaced")
job = func() {
mw.UpdateSummary_ll(srcParentID, 0, -1, 0)
mw.UpdateSummary_ll(dstParentID, 0, 1, 0)
}
}
tx.SetOnCommit(job)
}
}
// TODO
// job = func() {
// var inodes []uint64
// inodes = append(inodes, srcInode)
// srcQuotaInfos, err := mw.GetInodeQuota_ll(srcParentID)
// if err != nil {
// log.LogErrorf("rename_ll get src parent inode [%v] quota fail [%v]", srcParentID, err)
// }
// destQuotaInfos, err := mw.getInodeQuota(dstParentMP, dstParentID)
// if err != nil {
// log.LogErrorf("rename_ll: get dst partent inode [%v] quota fail [%v]", dstParentID, err)
// }
// if mapHaveSameKeys(srcQuotaInfos, destQuotaInfos) {
// return
// }
// for quotaId := range srcQuotaInfos {
// mw.BatchDeleteInodeQuota_ll(inodes, quotaId)
// }
// for quotaId, info := range destQuotaInfos {
// log.LogDebugf("BatchSetInodeQuota_ll inodes [%v] quotaId [%v] rootInode [%v]", inodes, quotaId, info.RootInode)
// mw.BatchSetInodeQuota_ll(inodes, quotaId, false)
// }
// }
// tx.SetOnCommit(job)
return nil
}
func (mw *MetaWrapper) rename_ll(srcParentID uint64, srcName string, dstParentID uint64, dstName string, srcFullPath string, dstFullPath string, overwritten bool) (err error) {
var (
oldInode uint64
lastVerSeq uint64
)
srcParentMP := mw.getPartitionByInode(srcParentID)
if srcParentMP == nil {
return syscall.ENOENT
}
dstParentMP := mw.getPartitionByInode(dstParentID)
if dstParentMP == nil {
return syscall.ENOENT
}
status, info, err := mw.iget(dstParentMP, dstParentID, mw.VerReadSeq)
if err != nil || status != statusOK {
return statusToErrno(status)
}
quota := atomic.LoadUint32(&mw.DirChildrenNumLimit)
if info.Nlink >= quota {
log.LogErrorf("rename_ll: dst parent inode's nlink quota reached, parentID(%v)", dstParentID)
return syscall.EDQUOT
}
// look up for the src ino
status, inode, mode, err := mw.lookup(srcParentMP, srcParentID, srcName, mw.VerReadSeq)
if err != nil || status != statusOK {
return statusToErrno(status)
}
srcMP := mw.getPartitionByInode(inode)
if srcMP == nil {
return syscall.ENOENT
}
status, _, err = mw.ilink(srcMP, inode, srcFullPath)
if err != nil || status != statusOK {
return statusToErrno(status)
}
// create dentry in dst parent
status, err = mw.dcreate(dstParentMP, dstParentID, dstName, inode, mode, dstFullPath)
if err != nil {
if status == statusOpDirQuota {
return statusToErrno(status)
}
return syscall.EAGAIN
}
var srcInodeInfo *proto.InodeInfo
var dstInodeInfo *proto.InodeInfo
if mw.EnableSummary {
srcInodeInfo, _ = mw.InodeGet_ll(inode)
}
// Note that only regular files are allowed to be overwritten.
if status == statusExist && (proto.IsSymlink(mode) || proto.IsRegular(mode)) {
if !overwritten {
return syscall.EEXIST
}
status, oldInode, err = mw.dupdate(dstParentMP, dstParentID, dstName, inode, dstFullPath)
if err != nil {
return syscall.EAGAIN
}
if mw.EnableSummary {
dstInodeInfo, _ = mw.InodeGet_ll(oldInode)
}
}
if status != statusOK {
mw.iunlink(srcMP, inode, lastVerSeq, 0, srcFullPath)
return statusToErrno(status)
}
var denVer uint64
// delete dentry from src parent
status, _, denVer, err = mw.ddelete(srcParentMP, srcParentID, srcName, 0, lastVerSeq, srcFullPath)
if err != nil {
log.LogErrorf("mw.ddelete(srcParentMP, srcParentID, %s) failed.", srcName)
return statusToErrno(status)
} else if status != statusOK {
var (
sts int
e error
)
if oldInode == 0 {
sts, inode, denVer, e = mw.ddelete(dstParentMP, dstParentID, dstName, 0, lastVerSeq, dstFullPath)
} else {
sts, denVer, e = mw.dupdate(dstParentMP, dstParentID, dstName, oldInode, dstFullPath)
}
if e == nil && sts == statusOK {
mw.iunlink(srcMP, inode, lastVerSeq, denVer, srcFullPath)
}
return statusToErrno(status)
}
mw.iunlink(srcMP, inode, lastVerSeq, denVer, srcFullPath)
if oldInode != 0 {
// overwritten
inodeMP := mw.getPartitionByInode(oldInode)
if inodeMP != nil {
mw.iunlink(inodeMP, oldInode, lastVerSeq, 0, dstFullPath)
// evict oldInode to avoid it becoming an orphan inode
mw.ievict(inodeMP, oldInode, dstFullPath)
}
if mw.EnableSummary {
sizeInc := srcInodeInfo.Size - dstInodeInfo.Size
go func() {
mw.UpdateSummary_ll(srcParentID, -1, 0, -int64(srcInodeInfo.Size))
mw.UpdateSummary_ll(dstParentID, 0, 0, int64(sizeInc))
}()
}
} else {
if mw.EnableSummary {
sizeInc := int64(srcInodeInfo.Size)
if proto.IsRegular(mode) {
// file
go func() {
mw.UpdateSummary_ll(srcParentID, -1, 0, -sizeInc)
mw.UpdateSummary_ll(dstParentID, 1, 0, sizeInc)
}()
} else {
// dir
go func() {
mw.UpdateSummary_ll(srcParentID, 0, -1, 0)
mw.UpdateSummary_ll(dstParentID, 0, 1, 0)
}()
}
}
}
// TODO
// var inodes []uint64
// inodes = append(inodes, inode)
// srcQuotaInfos, err := mw.GetInodeQuota_ll(srcParentID)
// if err != nil {
// log.LogErrorf("rename_ll get src parent inode [%v] quota fail [%v]", srcParentID, err)
// }
// destQuotaInfos, err := mw.getInodeQuota(dstParentMP, dstParentID)
// if err != nil {
// log.LogErrorf("rename_ll: get dst partent inode [%v] quota fail [%v]", dstParentID, err)
// }
// if mapHaveSameKeys(srcQuotaInfos, destQuotaInfos) {
// return nil
// }
// for quotaId := range srcQuotaInfos {
// mw.BatchDeleteInodeQuota_ll(inodes, quotaId)
// }
// for quotaId, info := range destQuotaInfos {
// log.LogDebugf("BatchSetInodeQuota_ll inodes [%v] quotaId [%v] rootInode [%v]", inodes, quotaId, info.RootInode)
// mw.BatchSetInodeQuota_ll(inodes, quotaId, false)
// }
return nil
}
// Read all dentries with parentID
func (mw *MetaWrapper) ReadDir_ll(parentID uint64) ([]proto.Dentry, error) {
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
return nil, syscall.ENOENT
}
status, children, err := mw.readDir(parentMP, parentID)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
return children, nil
}
// Read at most limit dentries under parentID (snapshot-clean variant), starting from the dentry named by from
func (mw *MetaWrapper) ReadDirLimitForSnapShotClean(parentID uint64, from string, limit uint64, verSeq uint64, idDir bool) ([]proto.Dentry, error) {
if verSeq == 0 {
verSeq = math.MaxUint64
}
log.LogDebugf("action[ReadDirLimit_ll] parentID %v from %v limit %v verSeq %v", parentID, from, limit, verSeq)
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
return nil, syscall.ENOENT
}
var opt uint8
opt |= uint8(proto.FlagsSnapshotDel)
if idDir {
opt |= uint8(proto.FlagsSnapshotDelDir)
}
status, children, err := mw.readDirLimit(parentMP, parentID, from, limit, verSeq, opt)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
for _, den := range children {
log.LogDebugf("ReadDirLimitForSnapShotClean. get dentry %v", den)
}
return children, nil
}
// Read at most limit dentries under parentID, starting from the dentry named by from
func (mw *MetaWrapper) ReadDirLimit_ll(parentID uint64, from string, limit uint64) ([]proto.Dentry, error) {
log.LogDebugf("action[ReadDirLimit_ll] parentID %v from %v limit %v", parentID, from, limit)
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
return nil, syscall.ENOENT
}
status, children, err := mw.readDirLimit(parentMP, parentID, from, limit, mw.VerReadSeq, 0)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
return children, nil
}
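// Illustrative sketch (not part of the original source): paging through a
// large directory with ReadDirLimit_ll. Whether the server repeats the
// boundary entry named by from at the start of the next page is an assumption
// here; the sketch tolerates either behavior.
func exampleReadDirPaged(mw *MetaWrapper, parentID uint64, pageSize uint64) ([]proto.Dentry, error) {
	var all []proto.Dentry
	from := ""
	for {
		page, err := mw.ReadDirLimit_ll(parentID, from, pageSize)
		if err != nil {
			return nil, err
		}
		for _, den := range page {
			if den.Name == from {
				continue // skip the duplicated boundary entry, if any
			}
			all = append(all, den)
		}
		if uint64(len(page)) < pageSize {
			return all, nil
		}
		from = page[len(page)-1].Name
	}
}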
func (mw *MetaWrapper) DentryCreate_ll(parentID uint64, name string, inode uint64, mode uint32, fullPath string) error {
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
return syscall.ENOENT
}
var err error
var status int
if status, err = mw.dcreate(parentMP, parentID, name, inode, mode, fullPath); err != nil || status != statusOK {
return statusToErrno(status)
}
return nil
}
func (mw *MetaWrapper) DentryUpdate_ll(parentID uint64, name string, inode uint64, fullPath string) (oldInode uint64, err error) {
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
err = syscall.ENOENT
return
}
var status int
status, oldInode, err = mw.dupdate(parentMP, parentID, name, inode, fullPath)
if err != nil || status != statusOK {
err = statusToErrno(status)
return
}
return
}
func (mw *MetaWrapper) SplitExtentKey(parentInode, inode uint64, ek proto.ExtentKey) error {
mp := mw.getPartitionByInode(inode)
if mp == nil {
return syscall.ENOENT
}
var oldInfo *proto.InodeInfo
if mw.EnableSummary {
oldInfo, _ = mw.InodeGet_ll(inode)
}
status, err := mw.appendExtentKey(mp, inode, ek, nil, true)
if err != nil || status != statusOK {
log.LogErrorf("SplitExtentKey: inode(%v) ek(%v) err(%v) status(%v)", inode, ek, err, status)
return statusToErrno(status)
}
log.LogDebugf("SplitExtentKey: ino(%v) ek(%v)", inode, ek)
if mw.EnableSummary {
go func() {
newInfo, _ := mw.InodeGet_ll(inode)
if oldInfo != nil && newInfo != nil {
if int64(oldInfo.Size) < int64(newInfo.Size) {
mw.UpdateSummary_ll(parentInode, 0, 0, int64(newInfo.Size)-int64(oldInfo.Size))
}
}
}()
}
return nil
}
// Used as a callback by stream sdk
func (mw *MetaWrapper) AppendExtentKey(parentInode, inode uint64, ek proto.ExtentKey, discard []proto.ExtentKey) (int, error) {
mp := mw.getPartitionByInode(inode)
if mp == nil {
return statusError, syscall.ENOENT
}
var oldInfo *proto.InodeInfo
if mw.EnableSummary {
oldInfo, _ = mw.InodeGet_ll(inode)
}
status, err := mw.appendExtentKey(mp, inode, ek, discard, false)
if err != nil || status != statusOK {
log.LogErrorf("MetaWrapper AppendExtentKey: inode(%v) ek(%v) local discard(%v) err(%v) status(%v)", inode, ek, discard, err, status)
return status, statusToErrno(status)
}
log.LogDebugf("MetaWrapper AppendExtentKey: ino(%v) ek(%v) discard(%v)", inode, ek, discard)
if mw.EnableSummary {
go func() {
newInfo, _ := mw.InodeGet_ll(inode)
if oldInfo != nil && newInfo != nil {
if int64(oldInfo.Size) < int64(newInfo.Size) {
mw.UpdateSummary_ll(parentInode, 0, 0, int64(newInfo.Size)-int64(oldInfo.Size))
}
}
}()
}
return statusOK, nil
}
// AppendExtentKeys appends multiple extent keys to the specified inode in a single request.
func (mw *MetaWrapper) AppendExtentKeys(inode uint64, eks []proto.ExtentKey) error {
mp := mw.getPartitionByInode(inode)
if mp == nil {
return syscall.ENOENT
}
status, err := mw.appendExtentKeys(mp, inode, eks)
if err != nil || status != statusOK {
log.LogErrorf("AppendExtentKeys: inode(%v) extentKeys(%v) err(%v) status(%v)", inode, eks, err, status)
return statusToErrno(status)
}
log.LogDebugf("AppendExtentKeys: ino(%v) extentKeys(%v)", inode, eks)
return nil
}
// AppendObjExtentKeys appends multiple object extent keys to the specified inode in a single request.
func (mw *MetaWrapper) AppendObjExtentKeys(inode uint64, eks []proto.ObjExtentKey) error {
mp := mw.getPartitionByInode(inode)
if mp == nil {
return syscall.ENOENT
}
status, err := mw.appendObjExtentKeys(mp, inode, eks)
if err != nil || status != statusOK {
log.LogErrorf("AppendObjExtentKeys: inode(%v) objextentKeys(%v) err(%v) status(%v)", inode, eks, err, status)
return statusToErrno(status)
}
log.LogDebugf("AppendObjExtentKeys: ino(%v) objextentKeys(%v)", inode, eks)
return nil
}
func (mw *MetaWrapper) GetExtents(inode uint64) (gen uint64, size uint64, extents []proto.ExtentKey, err error) {
mp := mw.getPartitionByInode(inode)
if mp == nil {
return 0, 0, nil, syscall.ENOENT
}
resp, err := mw.getExtents(mp, inode)
if err != nil {
if resp != nil {
err = statusToErrno(resp.Status)
}
log.LogErrorf("GetExtents: ino(%v) err(%v)", inode, err)
return 0, 0, nil, err
}
extents = resp.Extents
gen = resp.Generation
size = resp.Size
// log.LogDebugf("GetObjExtents stack[%v]", string(debug.Stack()))
log.LogDebugf("GetExtents: ino(%v) gen(%v) size(%v) extents len (%v)", inode, gen, size, len(extents))
return gen, size, extents, nil
}
func (mw *MetaWrapper) GetObjExtents(inode uint64) (gen uint64, size uint64, extents []proto.ExtentKey, objExtents []proto.ObjExtentKey, err error) {
mp := mw.getPartitionByInode(inode)
if mp == nil {
return 0, 0, nil, nil, syscall.ENOENT
}
status, gen, size, extents, objExtents, err := mw.getObjExtents(mp, inode)
if err != nil || status != statusOK {
log.LogErrorf("GetObjExtents: ino(%v) err(%v) status(%v)", inode, err, status)
return 0, 0, nil, nil, statusToErrno(status)
}
log.LogDebugf("GetObjExtents: ino(%v) gen(%v) size(%v) extents(%v) objextents(%v)", inode, gen, size, extents, objExtents)
return gen, size, extents, objExtents, nil
}
func (mw *MetaWrapper) Truncate(inode, size uint64, fullPath string) error {
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("Truncate: No inode partition, ino(%v)", inode)
return syscall.ENOENT
}
status, err := mw.truncate(mp, inode, size, fullPath)
if err != nil || status != statusOK {
return statusToErrno(status)
}
return nil
}
func (mw *MetaWrapper) Link(parentID uint64, name string, ino uint64, fullPath string) (*proto.InodeInfo, error) {
// if mw.EnableTransaction {
if mw.EnableTransaction&proto.TxOpMaskLink > 0 {
return mw.txLink(parentID, name, ino, fullPath)
} else {
return mw.link(parentID, name, ino, fullPath)
}
}
func (mw *MetaWrapper) txLink(parentID uint64, name string, ino uint64, fullPath string) (info *proto.InodeInfo, err error) {
// var err error
var status int
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
log.LogErrorf("txLink: No parent partition, parentID(%v)", parentID)
return nil, syscall.ENOENT
}
mp := mw.getPartitionByInode(ino)
if mp == nil {
log.LogErrorf("txLink: No target inode partition, ino(%v)", ino)
return nil, syscall.ENOENT
}
var tx *Transaction
defer func() {
if tx != nil {
err = tx.OnDone(err, mw)
}
}()
tx, err = NewLinkTransaction(parentMP, parentID, name, mp, ino, mw.TxTimeout)
if err != nil {
return nil, syscall.EAGAIN
}
status, err = mw.txCreateTX(tx, parentMP)
if status != statusOK || err != nil {
return nil, statusErrToErrno(status, err)
}
funcs := make([]func() (int, error), 0)
funcs = append(funcs, func() (int, error) {
var newSt int
var newErr error
newSt, info, newErr = mw.txIlink(tx, mp, ino, fullPath)
return newSt, newErr
})
funcs = append(funcs, func() (int, error) {
var newSt int
var newErr error
var quotaIds []uint32
var ifo *proto.InodeInfo
if mw.EnableQuota {
quotaInfos, newErr := mw.getInodeQuota(parentMP, parentID)
if newErr != nil {
log.LogErrorf("link: get parent quota fail, parentID(%v) err(%v)", parentID, newErr)
return statusError, syscall.ENOENT
}
for quotaId := range quotaInfos {
quotaIds = append(quotaIds, quotaId)
}
}
newSt, ifo, newErr = mw.iget(mp, ino, 0)
if newErr != nil || newSt != statusOK {
return newSt, newErr
}
newSt, newErr = mw.txDcreate(tx, parentMP, parentID, name, ino, ifo.Mode, quotaIds, fullPath)
return newSt, newErr
})
// 2. prepare transaction
var preErr error
wg := sync.WaitGroup{}
for _, fc := range funcs {
wg.Add(1)
go func(f func() (int, error)) {
defer wg.Done()
tStatus, tErr := f()
if tStatus != statusOK || tErr != nil {
preErr = statusErrToErrno(tStatus, tErr)
}
}(fc)
}
wg.Wait()
if preErr != nil {
return nil, preErr
}
return info, nil
}
func (mw *MetaWrapper) link(parentID uint64, name string, ino uint64, fullPath string) (*proto.InodeInfo, error) {
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
log.LogErrorf("Link: No parent partition, parentID(%v)", parentID)
return nil, syscall.ENOENT
}
status, info, err := mw.iget(parentMP, parentID, mw.VerReadSeq)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
quota := atomic.LoadUint32(&mw.DirChildrenNumLimit)
if info.Nlink >= quota {
log.LogErrorf("link: parent inode's nlink quota reached, parentID(%v)", parentID)
return nil, syscall.EDQUOT
}
mp := mw.getPartitionByInode(ino)
if mp == nil {
log.LogErrorf("Link: No target inode partition, ino(%v)", ino)
return nil, syscall.ENOENT
}
// increase inode nlink
status, info, err = mw.ilink(mp, ino, fullPath)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
if mw.EnableQuota {
quotaInfos, err := mw.getInodeQuota(parentMP, parentID)
if err != nil {
log.LogErrorf("link: get parent quota fail, parentID(%v) err(%v)", parentID, err)
return nil, syscall.ENOENT
}
var quotaIds []uint32
for quotaId := range quotaInfos {
quotaIds = append(quotaIds, quotaId)
}
// create new dentry and refer to the inode
status, err = mw.quotaDcreate(parentMP, parentID, name, ino, info.Mode, quotaIds, fullPath)
} else {
status, err = mw.dcreate(parentMP, parentID, name, ino, info.Mode, fullPath)
}
if err != nil {
return nil, statusToErrno(status)
} else if status != statusOK {
if status != statusExist {
mw.iunlink(mp, ino, mw.Client.GetLatestVer(), 0, fullPath)
}
return nil, statusToErrno(status)
}
return info, nil
}
func (mw *MetaWrapper) Evict(inode uint64, fullPath string) error {
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogWarnf("Evict: No such partition, ino(%v)", inode)
return syscall.EINVAL
}
status, err := mw.ievict(mp, inode, fullPath)
if err != nil || status != statusOK {
log.LogWarnf("Evict: ino(%v) err(%v) status(%v)", inode, err, status)
return statusToErrno(status)
}
return nil
}
func (mw *MetaWrapper) Setattr(inode uint64, valid, mode, uid, gid uint32, atime, mtime int64) error {
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("Setattr: No such partition, ino(%v)", inode)
return syscall.EINVAL
}
status, err := mw.setattr(mp, inode, valid, mode, uid, gid, atime, mtime)
if err != nil || status != statusOK {
log.LogErrorf("Setattr: ino(%v) err(%v) status(%v)", inode, err, status)
return statusToErrno(status)
}
return nil
}
func (mw *MetaWrapper) InodeCreate_ll(parentID uint64, mode, uid, gid uint32, target []byte, quotaIds []uint64, fullPath string) (*proto.InodeInfo, error) {
var (
status int
err error
info *proto.InodeInfo
mp *MetaPartition
rwPartitions []*MetaPartition
)
get_rwmp:
rwPartitions = mw.getRWPartitions()
length := len(rwPartitions)
epoch := atomic.AddUint64(&mw.epoch, 1)
retryTime := 0
if mw.EnableQuota && parentID != 0 {
var quotaIds []uint32
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
log.LogErrorf("InodeCreate_ll: No parent partition, parentID(%v)", parentID)
return nil, syscall.ENOENT
}
quotaInfos, err := mw.getInodeQuota(parentMP, parentID)
if err != nil {
log.LogErrorf("InodeCreate_ll: get parent quota fail, parentID(%v) err(%v)", parentID, err)
return nil, syscall.ENOENT
}
for quotaId := range quotaInfos {
quotaIds = append(quotaIds, quotaId)
}
for i := 0; i < length; i++ {
index := (int(epoch) + i) % length
mp = rwPartitions[index]
status, info, err = mw.quotaIcreate(mp, mode, uid, gid, target, quotaIds, fullPath)
if err == nil && status == statusOK {
return info, nil
} else if status == statusFull {
if retryTime >= InodeFullMaxRetryTime {
break
}
retryTime++
log.LogWarnf("Mp(%v) inode is full, trigger rwmp get and retry(%v)", mp, retryTime)
mw.singleflight.Do(ForceUpdateRWMP, func() (interface{}, error) {
mw.triggerAndWaitForceUpdate()
return nil, nil
})
goto get_rwmp
} else if status == statusNoSpace {
log.LogErrorf("InodeCreate_ll status %v", status)
return nil, statusToErrno(status)
}
}
} else {
for i := 0; i < length; i++ {
index := (int(epoch) + i) % length
mp = rwPartitions[index]
status, info, err = mw.icreate(mp, mode, uid, gid, target, fullPath)
if err == nil && status == statusOK {
return info, nil
} else if status == statusFull {
if retryTime >= InodeFullMaxRetryTime {
break
}
retryTime++
log.LogWarnf("Mp(%v) inode is full, trigger rwmp get and retry(%v)", mp, retryTime)
mw.singleflight.Do(ForceUpdateRWMP, func() (interface{}, error) {
mw.triggerAndWaitForceUpdate()
return nil, nil
})
goto get_rwmp
} else if status == statusNoSpace {
log.LogErrorf("InodeCreate_ll status %v", status)
return nil, statusToErrno(status)
}
}
}
return nil, syscall.ENOMEM
}
// InodeLink_ll is a low-level api that increases the specified inode's link count by 1.
func (mw *MetaWrapper) InodeLink_ll(inode uint64, fullPath string) (*proto.InodeInfo, error) {
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("InodeLink_ll: No such partition, ino(%v)", inode)
return nil, syscall.EINVAL
}
status, info, err := mw.ilink(mp, inode, fullPath)
if err != nil || status != statusOK {
log.LogErrorf("InodeLink_ll: ino(%v) err(%v) status(%v)", inode, err, status)
return nil, statusToErrno(status)
}
return info, nil
}
// InodeUnlink_ll is a low-level api that decreases the specified inode's link count by 1.
func (mw *MetaWrapper) InodeUnlink_ll(inode uint64, fullPath string) (*proto.InodeInfo, error) {
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("InodeUnlink_ll: No such partition, ino(%v)", inode)
return nil, syscall.EINVAL
}
var ver uint64
if mw.Client != nil {
ver = mw.Client.GetLatestVer()
}
status, info, err := mw.iunlink(mp, inode, ver, 0, fullPath)
if err != nil || status != statusOK {
log.LogErrorf("InodeUnlink_ll: ino(%v) err(%v) status(%v)", inode, err, status)
return nil, statusToErrno(status)
}
return info, nil
}
func (mw *MetaWrapper) InodeClearPreloadCache_ll(inode uint64) error {
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("InodeClearPreloadCache_ll: No such partition, ino(%v)", inode)
return syscall.EINVAL
}
status, err := mw.iclearCache(mp, inode)
if err != nil || status != statusOK {
log.LogErrorf("InodeClearPreloadCache_ll: ino(%v) err(%v) status(%v)", inode, err, status)
return statusToErrno(status)
}
return nil
}
func (mw *MetaWrapper) InitMultipart_ll(path string, extend map[string]string) (multipartId string, err error) {
var (
status int
mp *MetaPartition
rwPartitions = mw.getRWPartitions()
length = len(rwPartitions)
)
if length <= 0 {
log.LogErrorf("InitMultipart: no writable partitions, path(%v)", path)
return "", syscall.ENOENT
}
epoch := atomic.AddUint64(&mw.epoch, 1)
for i := 0; i < length; i++ {
index := (int(epoch) + i) % length
mp = rwPartitions[index]
log.LogDebugf("InitMultipart_ll: mp(%v), index(%v)", mp, index)
var sessionId string
status, sessionId, err = mw.createMultipart(mp, path, extend)
if err == nil && status == statusOK && len(sessionId) > 0 {
return sessionId, nil
} else {
log.LogErrorf("InitMultipart: create multipart id fail, path(%v), mp(%v), status(%v), err(%v)",
path, mp, status, err)
}
}
log.LogErrorf("InitMultipart: create multipart id fail, path(%v), status(%v), err(%v)", path, status, err)
if err != nil {
return "", err
} else {
return "", statusToErrno(status)
}
}
func (mw *MetaWrapper) GetMultipart_ll(path, multipartId string) (info *proto.MultipartInfo, err error) {
var (
mpId uint64
found bool
)
mpId, found = util.MultipartIDFromString(multipartId).PartitionID()
if !found {
log.LogDebugf("AddMultipartPart_ll: meta partition not found by multipart id, multipartId(%v), err(%v)", multipartId, err)
// If meta partition not found by multipart id, broadcast to all meta partitions to find it
info, _, err = mw.broadcastGetMultipart(path, multipartId)
return
}
mp := mw.getPartitionByID(mpId)
if mp == nil {
err = syscall.ENOENT
return
}
status, multipartInfo, err := mw.getMultipart(mp, path, multipartId)
if err != nil || status != statusOK {
log.LogErrorf("GetMultipartRequest: err(%v) status(%v)", err, status)
return nil, statusToErrno(status)
}
return multipartInfo, nil
}
func (mw *MetaWrapper) AddMultipartPart_ll(path, multipartId string, partId uint16, size uint64, md5 string,
inodeInfo *proto.InodeInfo) (oldInode uint64, updated bool, err error) {
var (
mpId uint64
found bool
)
mpId, found = util.MultipartIDFromString(multipartId).PartitionID()
if !found {
log.LogDebugf("AddMultipartPart_ll: meta partition not found by multipart id, multipartId(%v), err(%v)", multipartId, err)
// If meta partition not found by multipart id, broadcast to all meta partitions to find it
if _, mpId, err = mw.broadcastGetMultipart(path, multipartId); err != nil {
log.LogErrorf("AddMultipartPart_ll: broadcast get multipart fail: multipartId(%v) err(%v)", multipartId, err)
return
}
}
mp := mw.getPartitionByID(mpId)
if mp == nil {
log.LogWarnf("AddMultipartPart_ll: has no meta partition: multipartId(%v) mpId(%v)", multipartId, mpId)
err = syscall.ENOENT
return
}
status, oldInode, updated, err := mw.addMultipartPart(mp, path, multipartId, partId, size, md5, inodeInfo)
if err != nil || status != statusOK {
log.LogErrorf("AddMultipartPart_ll: err(%v) status(%v)", err, status)
return 0, false, statusToErrno(status)
}
return
}
func (mw *MetaWrapper) RemoveMultipart_ll(path, multipartID string) (err error) {
var (
mpId uint64
found bool
)
mpId, found = util.MultipartIDFromString(multipartID).PartitionID()
if !found {
log.LogDebugf("AddMultipartPart_ll: meta partition not found by multipart id, multipartId(%v), err(%v)", multipartID, err)
// If meta partition not found by multipart id, broadcast to all meta partitions to find it
if _, mpId, err = mw.broadcastGetMultipart(path, multipartID); err != nil {
return
}
}
mp := mw.getPartitionByID(mpId)
if mp == nil {
err = syscall.ENOENT
return
}
status, err := mw.removeMultipart(mp, path, multipartID)
if err != nil || status != statusOK {
log.LogErrorf(" RemoveMultipart_ll: partition remove multipart fail: "+
"volume(%v) partitionID(%v) multipartID(%v) err(%v) status(%v)",
mw.volname, mp.PartitionID, multipartID, err, status)
return statusToErrno(status)
}
return
}
func (mw *MetaWrapper) broadcastGetMultipart(path, multipartId string) (info *proto.MultipartInfo, mpID uint64, err error) {
log.LogInfof("broadcastGetMultipart: find meta partition broadcast multipartId(%v)", multipartId)
partitions := mw.partitions
var mp *MetaPartition
wg := new(sync.WaitGroup)
var resultMu sync.Mutex
for _, mp = range partitions {
wg.Add(1)
go func(mp *MetaPartition) {
defer wg.Done()
status, multipartInfo, err := mw.getMultipart(mp, path, multipartId)
if err == nil && status == statusOK && multipartInfo != nil && multipartInfo.ID == multipartId {
resultMu.Lock()
mpID = mp.PartitionID
info = multipartInfo
resultMu.Unlock()
}
if err != nil && err != syscall.ENOENT {
log.LogErrorf("broadcastGetMultipart: get multipart fail: partitionId(%v) multipartId(%v)",
mp.PartitionID, multipartId)
}
}(mp)
}
wg.Wait()
resultMu.Lock()
defer resultMu.Unlock()
if info == nil {
err = syscall.ENOENT
return
}
return
}
func (mw *MetaWrapper) ListMultipart_ll(prefix, delimiter, keyMarker string, multipartIdMarker string, maxUploads uint64) (sessionResponse []*proto.MultipartInfo, err error) {
partitions := mw.partitions
wg := sync.WaitGroup{}
wl := sync.Mutex{}
sessions := make([]*proto.MultipartInfo, 0)
for _, mp := range partitions {
wg.Add(1)
go func(mp *MetaPartition) {
defer wg.Done()
status, response, err := mw.listMultiparts(mp, prefix, delimiter, keyMarker, multipartIdMarker, maxUploads+1)
if err != nil || status != statusOK {
log.LogErrorf("ListMultipart: partition list multipart fail, partitionID(%v) err(%v) status(%v)",
mp.PartitionID, err, status)
err = statusToErrno(status)
return
}
wl.Lock()
defer wl.Unlock()
sessions = append(sessions, response.Multiparts...)
}(mp)
}
// combine sessions from each partition
wg.Wait()
// reorder sessions by path
sort.SliceStable(sessions, func(i, j int) bool {
return (sessions[i].Path < sessions[j].Path) || ((sessions[i].Path == sessions[j].Path) && (sessions[i].ID < sessions[j].ID))
})
return sessions, nil
}
func (mw *MetaWrapper) XAttrSet_ll(inode uint64, name, value []byte) error {
var err error
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("XAttrSet_ll: no such partition, inode(%v)", inode)
return syscall.ENOENT
}
var status int
status, err = mw.setXAttr(mp, inode, name, value)
if err != nil || status != statusOK {
return statusToErrno(status)
}
log.LogDebugf("XAttrSet_ll: set xattr: volume(%v) inode(%v) name(%v) value(%v) status(%v)",
mw.volname, inode, name, value, status)
return nil
}
func (mw *MetaWrapper) BatchSetXAttr_ll(inode uint64, attrs map[string]string) error {
var err error
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("XAttrSet_ll: no such partition, inode(%v)", inode)
return syscall.ENOENT
}
var status int
status, err = mw.batchSetXAttr(mp, inode, attrs)
if err != nil || status != statusOK {
return statusToErrno(status)
}
log.LogDebugf("BatchSetXAttr_ll: set xattr: volume(%v) inode(%v) attrs(%v) status(%v)",
mw.volname, inode, attrs, status)
return nil
}
func (mw *MetaWrapper) XAttrGetAll_ll(inode uint64) (*proto.XAttrInfo, error) {
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("XAttrGetAll_ll: no such partition, ino(%v)", inode)
return nil, syscall.ENOENT
}
attrs, status, err := mw.getAllXAttr(mp, inode)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
xAttr := &proto.XAttrInfo{
Inode: inode,
XAttrs: attrs,
}
log.LogDebugf("XAttrGetAll_ll: volume(%v) inode(%v) attrs(%v)",
mw.volname, inode, attrs)
return xAttr, nil
}
func (mw *MetaWrapper) XAttrGet_ll(inode uint64, name string) (*proto.XAttrInfo, error) {
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("InodeGet_ll: no such partition, ino(%v)", inode)
return nil, syscall.ENOENT
}
value, status, err := mw.getXAttr(mp, inode, name)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
xAttrValues := make(map[string]string)
xAttrValues[name] = value
xAttr := &proto.XAttrInfo{
Inode: inode,
XAttrs: xAttrValues,
}
log.LogDebugf("XAttrGet_ll: get xattr: volume(%v) inode(%v) name(%v) value(%v)",
mw.volname, inode, name, value)
return xAttr, nil
}
// XAttrDel_ll is a low-level meta api that deletes specified xattr.
func (mw *MetaWrapper) XAttrDel_ll(inode uint64, name string) error {
var err error
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("XAttrDel_ll: no such partition, inode(%v)", inode)
return syscall.ENOENT
}
var status int
status, err = mw.removeXAttr(mp, inode, name)
if err != nil || status != statusOK {
return statusToErrno(status)
}
log.LogDebugf("XAttrDel_ll: remove xattr, inode(%v) name(%v) status(%v)", inode, name, status)
return nil
}
func (mw *MetaWrapper) XAttrsList_ll(inode uint64) ([]string, error) {
var err error
mp := mw.getPartitionByInode(inode)
if mp == nil {
log.LogErrorf("XAttrsList_ll: no such partition, inode(%v)", inode)
return nil, syscall.ENOENT
}
keys, status, err := mw.listXAttr(mp, inode)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
return keys, nil
}
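// Illustrative sketch (not part of the original source): a set/get/list/delete
// round trip over the xattr helpers above; "user.example" is a hypothetical key.
func exampleXAttrRoundTrip(mw *MetaWrapper, inode uint64) error {
	if err := mw.XAttrSet_ll(inode, []byte("user.example"), []byte("value")); err != nil {
		return err
	}
	info, err := mw.XAttrGet_ll(inode, "user.example")
	if err != nil {
		return err
	}
	log.LogDebugf("xattr round trip: inode(%v) value(%v)", inode, info.XAttrs["user.example"])
	keys, err := mw.XAttrsList_ll(inode)
	if err != nil {
		return err
	}
	log.LogDebugf("xattr round trip: inode(%v) keys(%v)", inode, keys)
	return mw.XAttrDel_ll(inode, "user.example")
}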
func (mw *MetaWrapper) UpdateSummary_ll(parentIno uint64, filesInc int64, dirsInc int64, bytesInc int64) {
if filesInc == 0 && dirsInc == 0 && bytesInc == 0 {
return
}
mp := mw.getPartitionByInode(parentIno)
if mp == nil {
log.LogErrorf("UpdateSummary_ll: no such partition, inode(%v)", parentIno)
return
}
for cnt := 0; cnt < UpdateSummaryRetry; cnt++ {
err := mw.updateXAttrs(mp, parentIno, filesInc, dirsInc, bytesInc)
if err == nil {
return
}
}
return
}
func (mw *MetaWrapper) ReadDirOnly_ll(parentID uint64) ([]proto.Dentry, error) {
parentMP := mw.getPartitionByInode(parentID)
if parentMP == nil {
return nil, syscall.ENOENT
}
status, children, err := mw.readdironly(parentMP, parentID)
if err != nil || status != statusOK {
return nil, statusToErrno(status)
}
return children, nil
}
type SummaryInfo struct {
Files int64
Subdirs int64
Fbytes int64
}
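// GetSummary_ll returns the content summary (file count, subdir count and bytes) of the
// directory tree rooted at parentIno, using at most goroutineNum concurrent workers
// (clamped to [1, MaxSummaryGoroutineNum]). When EnableSummary is set it aggregates the
// pre-computed summary xattrs; otherwise it walks the tree and stats every file inode.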
func (mw *MetaWrapper) GetSummary_ll(parentIno uint64, goroutineNum int32) (SummaryInfo, error) {
if goroutineNum > MaxSummaryGoroutineNum {
goroutineNum = MaxSummaryGoroutineNum
}
if goroutineNum <= 0 {
goroutineNum = 1
}
var summaryInfo SummaryInfo
errCh := make(chan error)
var wg sync.WaitGroup
var currentGoroutineNum int32 = 0
if mw.EnableSummary {
inodeCh := make(chan uint64, ChannelLen)
wg.Add(1)
atomic.AddInt32(&currentGoroutineNum, 1)
inodeCh <- parentIno
go mw.getDentry(parentIno, inodeCh, errCh, &wg, &currentGoroutineNum, true, goroutineNum)
go func() {
wg.Wait()
close(inodeCh)
}()
go mw.getDirSummary(&summaryInfo, inodeCh, errCh)
for err := range errCh {
return SummaryInfo{0, 0, 0}, err
}
return summaryInfo, nil
} else {
summaryCh := make(chan SummaryInfo, ChannelLen)
wg.Add(1)
atomic.AddInt32(&currentGoroutineNum, 1)
go mw.getSummaryOrigin(parentIno, summaryCh, errCh, &wg, &currentGoroutineNum, true, goroutineNum)
go func() {
wg.Wait()
close(summaryCh)
}()
go func(summaryInfo *SummaryInfo) {
for summary := range summaryCh {
summaryInfo.Files = summaryInfo.Files + summary.Files
summaryInfo.Subdirs = summaryInfo.Subdirs + summary.Subdirs
summaryInfo.Fbytes = summaryInfo.Fbytes + summary.Fbytes
}
close(errCh)
}(&summaryInfo)
for err := range errCh {
return SummaryInfo{0, 0, 0}, err
}
return summaryInfo, nil
}
}
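// getDentry walks the subdirectories below parentIno and feeds every discovered dentry
// inode into inodeCh, spawning a new goroutine per subdirectory while fewer than
// goroutineNum workers are running and recursing synchronously otherwise.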
func (mw *MetaWrapper) getDentry(parentIno uint64, inodeCh chan<- uint64, errCh chan<- error, wg *sync.WaitGroup, currentGoroutineNum *int32, newGoroutine bool, goroutineNum int32) {
defer func() {
if newGoroutine {
atomic.AddInt32(currentGoroutineNum, -1)
wg.Done()
}
}()
entries, err := mw.ReadDirOnly_ll(parentIno)
if err != nil {
errCh <- err
return
}
for _, entry := range entries {
inodeCh <- entry.Inode
if atomic.LoadInt32(currentGoroutineNum) < goroutineNum {
wg.Add(1)
atomic.AddInt32(currentGoroutineNum, 1)
go mw.getDentry(entry.Inode, inodeCh, errCh, wg, currentGoroutineNum, true, goroutineNum)
} else {
mw.getDentry(entry.Inode, inodeCh, errCh, wg, currentGoroutineNum, false, goroutineNum)
}
}
}
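// getDirSummary drains inodeCh, fetching the summary xattr of the inodes in batches of
// BatchSize and accumulating the parsed files/subdirs/bytes values into summaryInfo;
// errch is closed once the channel is exhausted.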
func (mw *MetaWrapper) getDirSummary(summaryInfo *SummaryInfo, inodeCh <-chan uint64, errch chan<- error) {
var inodes []uint64
var keys []string
for inode := range inodeCh {
inodes = append(inodes, inode)
keys = append(keys, SummaryKey)
if len(inodes) < BatchSize {
continue
}
xattrInfos, err := mw.BatchGetXAttr(inodes, keys)
if err != nil {
errch <- err
return
}
inodes = inodes[0:0]
keys = keys[0:0]
for _, xattrInfo := range xattrInfos {
if xattrInfo.XAttrs[SummaryKey] != "" {
summaryList := strings.Split(xattrInfo.XAttrs[SummaryKey], ",")
files, _ := strconv.ParseInt(summaryList[0], 10, 64)
subdirs, _ := strconv.ParseInt(summaryList[1], 10, 64)
fbytes, _ := strconv.ParseInt(summaryList[2], 10, 64)
summaryInfo.Files += files
summaryInfo.Subdirs += subdirs
summaryInfo.Fbytes += fbytes
}
}
}
xattrInfos, err := mw.BatchGetXAttr(inodes, keys)
if err != nil {
errch <- err
return
}
for _, xattrInfo := range xattrInfos {
if xattrInfo.XAttrs[SummaryKey] != "" {
summaryList := strings.Split(xattrInfo.XAttrs[SummaryKey], ",")
files, _ := strconv.ParseInt(summaryList[0], 10, 64)
subdirs, _ := strconv.ParseInt(summaryList[1], 10, 64)
fbytes, _ := strconv.ParseInt(summaryList[2], 10, 64)
summaryInfo.Files += files
summaryInfo.Subdirs += subdirs
summaryInfo.Fbytes += fbytes
}
}
close(errch)
return
}
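// getSummaryOrigin computes the summary of parentIno without the summary xattr: it reads
// the directory, stats every file inode for its size, sends the per-directory result on
// summaryCh, and then recurses into each subdirectory, optionally in a new goroutine.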
func (mw *MetaWrapper) getSummaryOrigin(parentIno uint64, summaryCh chan<- SummaryInfo, errCh chan<- error, wg *sync.WaitGroup, currentGoroutineNum *int32, newGoroutine bool, goroutineNum int32) {
defer func() {
if newGoroutine {
atomic.AddInt32(currentGoroutineNum, -1)
wg.Done()
}
}()
var subdirsList []uint64
retSummaryInfo := SummaryInfo{
Files: 0,
Subdirs: 0,
Fbytes: 0,
}
children, err := mw.ReadDir_ll(parentIno)
if err != nil {
errCh <- err
return
}
for _, dentry := range children {
if proto.IsDir(dentry.Type) {
retSummaryInfo.Subdirs += 1
subdirsList = append(subdirsList, dentry.Inode)
} else {
fileInfo, err := mw.InodeGet_ll(dentry.Inode)
if err != nil {
errCh <- err
return
}
retSummaryInfo.Files += 1
retSummaryInfo.Fbytes += int64(fileInfo.Size)
}
}
summaryCh <- retSummaryInfo
for _, subdirIno := range subdirsList {
if atomic.LoadInt32(currentGoroutineNum) < goroutineNum {
wg.Add(1)
atomic.AddInt32(currentGoroutineNum, 1)
go mw.getSummaryOrigin(subdirIno, summaryCh, errCh, wg, currentGoroutineNum, true, goroutineNum)
} else {
mw.getSummaryOrigin(subdirIno, summaryCh, errCh, wg, currentGoroutineNum, false, goroutineNum)
}
}
}
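// RefreshSummary_ll recomputes the summary xattrs of the directory tree rooted at
// parentIno with at most goroutineNum concurrent workers (clamped to [1, MaxSummaryGoroutineNum]).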
func (mw *MetaWrapper) RefreshSummary_ll(parentIno uint64, goroutineNum int32) error {
if goroutineNum > MaxSummaryGoroutineNum {
goroutineNum = MaxSummaryGoroutineNum
}
if goroutineNum <= 0 {
goroutineNum = 1
}
var wg sync.WaitGroup
var currentGoroutineNum int32 = 0
errch := make(chan error)
wg.Add(1)
atomic.AddInt32(&currentGoroutineNum, 1)
go mw.refreshSummary(parentIno, errch, &wg, &currentGoroutineNum, true, goroutineNum)
go func() {
wg.Wait()
close(errch)
}()
for err := range errch {
return err
}
return nil
}
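// refreshSummary compares the stored summary xattr of parentIno with a freshly computed
// one, asynchronously applies the difference via UpdateSummary_ll, and then recurses into
// every subdirectory.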
func (mw *MetaWrapper) refreshSummary(parentIno uint64, errCh chan<- error, wg *sync.WaitGroup, currentGoroutineNum *int32, newGoroutine bool, goroutineNum int32) {
defer func() {
if newGoroutine {
atomic.AddInt32(currentGoroutineNum, -1)
wg.Done()
}
}()
summaryXAttrInfo, err := mw.XAttrGet_ll(parentIno, SummaryKey)
if err != nil {
errCh <- err
return
}
oldSummaryInfo := SummaryInfo{0, 0, 0}
if summaryXAttrInfo.XAttrs[SummaryKey] != "" {
summaryList := strings.Split(summaryXAttrInfo.XAttrs[SummaryKey], ",")
files, _ := strconv.ParseInt(summaryList[0], 10, 64)
subdirs, _ := strconv.ParseInt(summaryList[1], 10, 64)
fbytes, _ := strconv.ParseInt(summaryList[2], 10, 64)
oldSummaryInfo = SummaryInfo{
Files: files,
Subdirs: subdirs,
Fbytes: fbytes,
}
} else {
oldSummaryInfo = SummaryInfo{0, 0, 0}
}
newSummaryInfo := SummaryInfo{0, 0, 0}
var subdirsList []uint64
children, err := mw.ReadDir_ll(parentIno)
if err != nil {
errCh <- err
return
}
for _, dentry := range children {
if proto.IsDir(dentry.Type) {
newSummaryInfo.Subdirs += 1
subdirsList = append(subdirsList, dentry.Inode)
} else {
fileInfo, err := mw.InodeGet_ll(dentry.Inode)
if err != nil {
errCh <- err
return
}
newSummaryInfo.Files += 1
newSummaryInfo.Fbytes += int64(fileInfo.Size)
}
}
go mw.UpdateSummary_ll(
parentIno,
newSummaryInfo.Files-oldSummaryInfo.Files,
newSummaryInfo.Subdirs-oldSummaryInfo.Subdirs,
newSummaryInfo.Fbytes-oldSummaryInfo.Fbytes,
)
for _, subdirIno := range subdirsList {
if atomic.LoadInt32(currentGoroutineNum) < goroutineNum {
wg.Add(1)
atomic.AddInt32(currentGoroutineNum, 1)
go mw.refreshSummary(subdirIno, errCh, wg, currentGoroutineNum, true, goroutineNum)
} else {
mw.refreshSummary(subdirIno, errCh, wg, currentGoroutineNum, false, goroutineNum)
}
}
}
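// BatchSetInodeQuota_ll groups the given inodes by meta partition and sets quotaId on them
// batch by batch; the returned map records the per-inode result code.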
func (mw *MetaWrapper) BatchSetInodeQuota_ll(inodes []uint64, quotaId uint32, IsRoot bool) (ret map[uint64]uint8, err error) {
batchInodeMap := make(map[uint64][]uint64)
ret = make(map[uint64]uint8, 0)
for _, ino := range inodes {
mp := mw.getPartitionByInode(ino)
if mp == nil {
continue
}
if _, isFind := batchInodeMap[mp.PartitionID]; !isFind {
batchInodeMap[mp.PartitionID] = make([]uint64, 0, 128)
}
batchInodeMap[mp.PartitionID] = append(batchInodeMap[mp.PartitionID], ino)
}
for id, inos := range batchInodeMap {
mp := mw.getPartitionByID(id)
resp, err := mw.batchSetInodeQuota(mp, inos, quotaId, IsRoot)
if err != nil {
log.LogErrorf("batchSetInodeQuota quota [%v] inodes [%v] err [%v]", quotaId, inos, err)
return ret, err
}
for k, v := range resp.InodeRes {
ret[k] = v
}
}
log.LogInfof("set subInode quota [%v] inodes [%v] ret [%v] success.", quotaId, inodes, ret)
return
}
func (mw *MetaWrapper) GetPartitionByInodeId_ll(inodeId uint64) (mp *MetaPartition) {
return mw.getPartitionByInode(inodeId)
}
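// BatchDeleteInodeQuota_ll groups the given inodes by meta partition and removes quotaId
// from them batch by batch; the returned map records the per-inode result code.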
func (mw *MetaWrapper) BatchDeleteInodeQuota_ll(inodes []uint64, quotaId uint32) (ret map[uint64]uint8, err error) {
batchInodeMap := make(map[uint64][]uint64)
ret = make(map[uint64]uint8, 0)
for _, ino := range inodes {
mp := mw.getPartitionByInode(ino)
if mp == nil {
continue
}
if _, isFind := batchInodeMap[mp.PartitionID]; !isFind {
batchInodeMap[mp.PartitionID] = make([]uint64, 0, 128)
}
batchInodeMap[mp.PartitionID] = append(batchInodeMap[mp.PartitionID], ino)
}
for id, inos := range batchInodeMap {
mp := mw.getPartitionByID(id)
resp, err := mw.batchDeleteInodeQuota(mp, inos, quotaId)
if err != nil {
log.LogErrorf("batchDeleteInodeQuota quota [%v] inodes [%v] err [%v]", quotaId, inos, err)
return ret, err
}
for k, v := range resp.InodeRes {
ret[k] = v
}
}
log.LogInfof("delete subInode inodes [%v] quota [%v] ret [%v] success.", inodes, quotaId, ret)
return
}
func (mw *MetaWrapper) GetInodeQuota_ll(inode uint64) (quotaInfos map[uint32]*proto.MetaQuotaInfo, err error) {
mp := mw.getPartitionByInode(inode)
if mp == nil {
err = fmt.Errorf("get partition by inode [%v] failed", inode)
return nil, err
}
quotaInfos, err = mw.getInodeQuota(mp, inode)
if err != nil {
log.LogErrorf("GetInodeQuota_ll get inode [%v] quota failed [%v]", inode, err)
return
}
return
}
func (mw *MetaWrapper) ApplyQuota_ll(parentIno uint64, quotaId uint32, maxConcurrencyInode uint64) (numInodes uint64, err error) {
inodes := make([]uint64, 0, maxConcurrencyInode)
var curInodeCount uint64
err = mw.applyQuota(parentIno, quotaId, &numInodes, &curInodeCount, &inodes, maxConcurrencyInode, true)
return
}
func (mw *MetaWrapper) RevokeQuota_ll(parentIno uint64, quotaId uint32, maxConcurrencyInode uint64) (numInodes uint64, err error) {
inodes := make([]uint64, 0, maxConcurrencyInode)
var curInodeCount uint64
err = mw.revokeQuota(parentIno, quotaId, &numInodes, &curInodeCount, &inodes, maxConcurrencyInode, true)
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package meta
import (
"fmt"
"net"
"syscall"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
const (
SendRetryLimit = 200 // times
SendRetryInterval = 100 // ms
)
type MetaConn struct {
conn *net.TCPConn
id uint64 // PartitionID
addr string // MetaNode addr
}
// Connection management
//
func (mc *MetaConn) String() string {
return fmt.Sprintf("partitionID(%v) addr(%v)", mc.id, mc.addr)
}
func (mw *MetaWrapper) getConn(partitionID uint64, addr string) (*MetaConn, error) {
conn, err := mw.conns.GetConnect(addr)
if err != nil {
return nil, err
}
mc := &MetaConn{conn: conn, id: partitionID, addr: addr}
return mc, nil
}
func (mw *MetaWrapper) putConn(mc *MetaConn, err error) {
mw.conns.PutConnect(mc.conn, err != nil)
}
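// sendToMetaPartition sends req to the partition leader first and, on failure, retries the
// request against every member of the partition with a growing interval until it succeeds
// or the send time limit derived from metaSendTimeout is exceeded.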
func (mw *MetaWrapper) sendToMetaPartition(mp *MetaPartition, req *proto.Packet) (*proto.Packet, error) {
var (
resp *proto.Packet
err error
addr string
mc *MetaConn
start time.Time
lastSeq uint64
)
var sendTimeLimit int
if mw.metaSendTimeout < 20 {
sendTimeLimit = 20 * 1000 // ms
} else {
sendTimeLimit = int(mw.metaSendTimeout) * 1000 // ms
}
delta := (sendTimeLimit*2/SendRetryLimit - SendRetryInterval*2) / SendRetryLimit // ms
log.LogDebugf("mw.metaSendTimeout: %v s, sendTimeLimit: %v ms, delta: %v ms, req %v", mw.metaSendTimeout, sendTimeLimit, delta, req)
req.ExtentType |= proto.MultiVersionFlag
errs := make(map[int]error, len(mp.Members))
var j int
addr = mp.LeaderAddr
if addr == "" {
err = errors.New(fmt.Sprintf("sendToMetaPartition: failed due to empty leader addr and goto retry, req(%v) mp(%v)", req, mp))
goto retry
}
mc, err = mw.getConn(mp.PartitionID, addr)
if err != nil {
log.LogWarnf("sendToMetaPartition: getConn failed and goto retry, req(%v) mp(%v) addr(%v) err(%v)", req, mp, addr, err)
goto retry
}
if mw.Client != nil { // compatible with lcNode, which does not initialize Client
lastSeq = mw.Client.GetLatestVer()
}
sendWithList:
resp, err = mc.send(req, lastSeq)
if err == nil && !resp.ShouldRetry() && !resp.ShouldRetryWithVersionList() {
mw.putConn(mc, err)
goto out
}
if resp != nil && resp.ShouldRetryWithVersionList() {
// already sent with the version list, so an issue must have occurred
if req.ExtentType&proto.VersionListFlag == proto.VersionListFlag {
mw.putConn(mc, err)
goto out
}
req.ExtentType |= proto.VersionListFlag
req.VerList = make([]*proto.VolVersionInfo, len(mw.Client.GetVerMgr().VerList))
copy(req.VerList, mw.Client.GetVerMgr().VerList)
log.LogWarnf("sendToMetaPartition: leader failed and goto retry, req(%v) mp(%v) mc(%v) err(%v) resp(%v)", req, mp, mc, err, resp)
goto sendWithList
}
mw.putConn(mc, err)
retry:
start = time.Now()
for i := 0; i <= SendRetryLimit; i++ {
for j, addr = range mp.Members {
mc, err = mw.getConn(mp.PartitionID, addr)
errs[j] = err
if err != nil {
log.LogWarnf("sendToMetaPartition: getConn failed and continue to retry, req(%v) mp(%v) addr(%v) err(%v)", req, mp, addr, err)
continue
}
resp, err = mc.send(req, lastSeq)
mw.putConn(mc, err)
if err == nil && !resp.ShouldRetry() {
goto out
}
if err == nil {
errs[j] = errors.New(fmt.Sprintf("request should retry[%v]", resp.GetResultMsg()))
} else {
errs[j] = err
}
log.LogWarnf("sendToMetaPartition: retry failed req(%v) mp(%v) mc(%v) errs(%v) resp(%v)", req, mp, mc, errs, resp)
}
if time.Since(start) > time.Duration(sendTimeLimit)*time.Millisecond {
log.LogWarnf("sendToMetaPartition: retry timeout req(%v) mp(%v) time(%v)", req, mp, time.Since(start))
break
}
sendRetryInterval := time.Duration(SendRetryInterval+i*delta) * time.Millisecond
log.LogWarnf("sendToMetaPartition: req(%v) mp(%v) retry in (%v), retry_iteration (%v), retry_totalTime (%v)", req, mp,
sendRetryInterval, i+1, time.Since(start))
time.Sleep(sendRetryInterval)
}
out:
log.LogDebugf("sendToMetaPartition: succeed! req(%v) mc(%v) resp(%v)", req, mc, resp)
if mw.Client != nil && resp != nil { // for compatibility with LcNode, check that the client is not nil
mw.checkVerFromMeta(resp)
}
if err != nil || resp == nil {
return nil, errors.New(fmt.Sprintf("sendToMetaPartition failed: req(%v) mp(%v) errs(%v) resp(%v)", req, mp, errs, resp))
}
return resp, nil
}
func (mc *MetaConn) send(req *proto.Packet, verSeq uint64) (resp *proto.Packet, err error) {
req.ExtentType |= proto.MultiVersionFlag
req.VerSeq = verSeq
err = req.WriteToConn(mc.conn)
if err != nil {
return nil, errors.Trace(err, "Failed to write to conn, req(%v)", req)
}
resp = proto.NewPacket()
err = resp.ReadFromConnWithVer(mc.conn, proto.ReadDeadlineTime)
if err != nil {
return nil, errors.Trace(err, "Failed to read from conn, req(%v)", req)
}
// Check if the ID and OpCode of the response are consistent with the request.
if resp.ReqID != req.ReqID || resp.Opcode != req.Opcode {
log.LogErrorf("send: the response packet mismatch with request: conn(%v to %v) req(%v) resp(%v)",
mc.conn.LocalAddr(), mc.conn.RemoteAddr(), req, resp)
return nil, syscall.EBADMSG
}
return resp, nil
}
//go:build gofuzz
// +build gofuzz
// Copyright 2023 ADA Logics Ltd
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package meta
import (
fuzz "github.com/AdaLogics/go-fuzz-headers"
)
func FuzzNewMeta(data []byte) int {
f := fuzz.NewConsumer(data)
config := MetaConfig{}
err := f.GenerateStruct(&config)
if err != nil {
return 0
}
_, err = NewMetaWrapper(&config)
if err != nil {
return 0
}
return 1
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package meta
import (
gerrors "errors"
"fmt"
"sync"
"syscall"
"time"
"golang.org/x/sync/singleflight"
"golang.org/x/time/rate"
"github.com/cubefs/cubefs/proto"
authSDK "github.com/cubefs/cubefs/sdk/auth"
"github.com/cubefs/cubefs/sdk/data/wrapper"
masterSDK "github.com/cubefs/cubefs/sdk/master"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/auth"
"github.com/cubefs/cubefs/util/btree"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
const (
HostsSeparator = ","
RefreshMetaPartitionsInterval = time.Minute * 5
)
const (
statusUnknown int = iota
statusOK
statusExist
statusNoent
statusFull
statusAgain
statusError
statusInval
statusNotPerm
StatusConflictExtents
statusOpDirQuota
statusNoSpace
statusTxInodeInfoNotExist
statusTxConflict
statusTxTimeout
statusUploadPartConflict
statusNotEmpty
)
const (
MaxMountRetryLimit = 6
MountRetryInterval = time.Second * 5
/*
* Minimum interval of forceUpdateMetaPartitions in seconds,
* i.e. only one force update request is allowed every 5 sec.
*/
MinForceUpdateMetaPartitionsInterval = 5
DefaultQuotaExpiration = 120 * time.Second
MaxQuotaCache = 10000
)
type AsyncTaskErrorFunc func(err error)
func (f AsyncTaskErrorFunc) OnError(err error) {
if f != nil {
f(err)
}
}
type MetaConfig struct {
Volume string
Owner string
Masters []string
Authenticate bool
TicketMess auth.TicketMess
ValidateOwner bool
OnAsyncTaskError AsyncTaskErrorFunc
EnableSummary bool
MetaSendTimeout int64
// EnableTransaction uint8
// EnableTransaction bool
VerReadSeq uint64
}
type MetaWrapper struct {
sync.RWMutex
cluster string
localIP string
volname string
ossSecure *OSSSecure
volCreateTime int64
volDeleteLockTime int64
owner string
ownerValidation bool
mc *masterSDK.MasterClient
ac *authSDK.AuthClient
conns *util.ConnectPool
// Callback for handling asynchronous task errors.
onAsyncTaskError AsyncTaskErrorFunc
// Partitions and ranges should be modified together. So do not
// use partitions and ranges directly. Use the helper functions instead.
// Partition map indexed by ID
partitions map[uint64]*MetaPartition
// Partition tree indexed by Start, in order to find a partition in which
// a specific inode is located.
ranges *btree.BTree
rwPartitions []*MetaPartition
epoch uint64
totalSize uint64
usedSize uint64
inodeCount uint64
authenticate bool
Ticket auth.Ticket
accessToken proto.APIAccessReq
sessionKey string
ticketMess auth.TicketMess
closeCh chan struct{}
closeOnce sync.Once
// Allocated to signal the goroutines that are waiting for a partition view update
partMutex sync.Mutex
partCond *sync.Cond
// Allocated to trigger and throttle instant partition updates
forceUpdate chan struct{}
forceUpdateLimit *rate.Limiter
singleflight singleflight.Group
EnableSummary bool
metaSendTimeout int64
DirChildrenNumLimit uint32
EnableTransaction proto.TxOpMask
TxTimeout int64
TxConflictRetryNum int64
TxConflictRetryInterval int64
EnableQuota bool
QuotaInfoMap map[uint32]*proto.QuotaInfo
QuotaLock sync.RWMutex
// uniqidRange for request dedup
uniqidRangeMap map[uint64]*uniqidRange
uniqidRangeMutex sync.Mutex
qc *QuotaCache
VerReadSeq uint64
LastVerSeq uint64
Client wrapper.SimpleClientInfo
}
type uniqidRange struct {
cur uint64
end uint64
}
// Ticket is the ticket issued by the authnode.
type Ticket struct {
ID string `json:"client_id"`
SessionKey string `json:"session_key"`
ServiceID string `json:"service_id"`
Ticket string `json:"ticket"`
}
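// NewMetaWrapper creates a MetaWrapper for the volume described by config: it optionally
// authenticates against the authnode, pulls the cluster/volume/partition views from the
// master (retrying up to MaxMountRetryLimit times unless the volume does not exist or the
// auth key mismatches), and starts the background refresh goroutines.
//
// A minimal usage sketch; the volume name, owner and master address below are placeholder
// values, not real cluster settings:
//
//	mw, err := meta.NewMetaWrapper(&meta.MetaConfig{
//		Volume:  "example-vol",
//		Owner:   "example-owner",
//		Masters: []string{"10.0.0.1:17010"},
//	})
//	if err != nil {
//		// handle error
//	}
//	defer mw.Close()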
func NewMetaWrapper(config *MetaConfig) (*MetaWrapper, error) {
var err error
mw := new(MetaWrapper)
mw.closeCh = make(chan struct{}, 1)
if config.Authenticate {
ticketMess := config.TicketMess
mw.ac = authSDK.NewAuthClient(ticketMess.TicketHosts, ticketMess.EnableHTTPS, ticketMess.CertFile)
ticket, err := mw.ac.API().GetTicket(config.Owner, ticketMess.ClientKey, proto.MasterServiceID)
if err != nil {
return nil, errors.Trace(err, "Get ticket from authnode failed!")
}
mw.authenticate = config.Authenticate
mw.accessToken.Ticket = ticket.Ticket
mw.accessToken.ClientID = config.Owner
mw.accessToken.ServiceID = proto.MasterServiceID
mw.sessionKey = ticket.SessionKey
mw.ticketMess = ticketMess
}
mw.volname = config.Volume
mw.owner = config.Owner
mw.ownerValidation = config.ValidateOwner
mw.mc = masterSDK.NewMasterClient(config.Masters, false)
mw.onAsyncTaskError = config.OnAsyncTaskError
mw.metaSendTimeout = config.MetaSendTimeout
mw.conns = util.NewConnectPool()
mw.partitions = make(map[uint64]*MetaPartition)
mw.ranges = btree.New(32)
mw.rwPartitions = make([]*MetaPartition, 0)
mw.partCond = sync.NewCond(&mw.partMutex)
mw.forceUpdate = make(chan struct{}, 1)
mw.forceUpdateLimit = rate.NewLimiter(1, MinForceUpdateMetaPartitionsInterval)
mw.EnableSummary = config.EnableSummary
mw.DirChildrenNumLimit = proto.DefaultDirChildrenNumLimit
mw.uniqidRangeMap = make(map[uint64]*uniqidRange, 0)
mw.qc = NewQuotaCache(DefaultQuotaExpiration, MaxQuotaCache)
mw.VerReadSeq = config.VerReadSeq
limit := 0
for limit < MaxMountRetryLimit {
// When initializing the volume, if the master explicitly responds that the specified
// volume does not exist, it will not retry.
if err = mw.initMetaWrapper(); err != nil {
log.LogErrorf("NewMetaWrapper: init meta wrapper failed: volume(%v) err(%v)", mw.volname, err)
if gerrors.Is(err, proto.ErrVolAuthKeyNotMatch) || gerrors.Is(err, proto.ErrVolNotExists) {
break
}
limit++
time.Sleep(MountRetryInterval * time.Duration(limit))
continue
}
break
}
if err != nil {
return nil, err
}
go mw.updateQuotaInfoTick()
go mw.refresh()
return mw, nil
}
func (mw *MetaWrapper) initMetaWrapper() (err error) {
if err = mw.updateClusterInfo(); err != nil {
return err
}
if err = mw.updateVolStatInfo(); err != nil {
return err
}
if err = mw.updateMetaPartitions(); err != nil {
return err
}
if err = mw.updateDirChildrenNumLimit(); err != nil {
return err
}
return nil
}
func (mw *MetaWrapper) Owner() string {
return mw.owner
}
func (mw *MetaWrapper) enableTx(mask proto.TxOpMask) bool {
return mw.EnableTransaction != proto.TxPause && mw.EnableTransaction&mask > 0
}
func (mw *MetaWrapper) OSSSecure() (accessKey, secretKey string) {
return mw.ossSecure.AccessKey, mw.ossSecure.SecretKey
}
func (mw *MetaWrapper) VolCreateTime() int64 {
return mw.volCreateTime
}
func (mw *MetaWrapper) Close() error {
mw.closeOnce.Do(func() {
close(mw.closeCh)
mw.conns.Close()
})
return nil
}
func (mw *MetaWrapper) Cluster() string {
return mw.cluster
}
func (mw *MetaWrapper) LocalIP() string {
return mw.localIP
}
func (mw *MetaWrapper) exporterKey(act string) string {
return fmt.Sprintf("%s_sdk_meta_%s", mw.cluster, act)
}
// Proto ResultCode to status
func parseStatus(result uint8) (status int) {
switch result {
case proto.OpOk:
status = statusOK
case proto.OpExistErr:
status = statusExist
case proto.OpNotExistErr:
status = statusNoent
case proto.OpInodeFullErr:
status = statusFull
case proto.OpAgain:
status = statusAgain
case proto.OpArgMismatchErr:
status = statusInval
case proto.OpNotPerm:
status = statusNotPerm
case proto.OpConflictExtentsErr:
status = StatusConflictExtents
case proto.OpDirQuota:
status = statusOpDirQuota
case proto.OpNotEmpty:
status = statusNotEmpty
case proto.OpNoSpaceErr:
status = statusNoSpace
case proto.OpTxInodeInfoNotExistErr:
status = statusTxInodeInfoNotExist
case proto.OpTxConflictErr:
status = statusTxConflict
case proto.OpTxTimeoutErr:
status = statusTxTimeout
case proto.OpUploadPartConflictErr:
status = statusUploadPartConflict
default:
status = statusError
}
return
}
func statusErrToErrno(status int, err error) error {
if status == statusOK && err != nil {
return syscall.EAGAIN
}
return statusToErrno(status)
}
func statusToErrno(status int) error {
switch status {
case statusOK:
// status may be OK while the caller's err is set; return a retryable error anyway
return syscall.EAGAIN
case statusExist:
return syscall.EEXIST
case statusNotEmpty:
return syscall.ENOTEMPTY
case statusNoent:
return syscall.ENOENT
case statusFull:
return syscall.ENOMEM
case statusAgain:
return syscall.EAGAIN
case statusInval:
return syscall.EINVAL
case statusNotPerm:
return syscall.EPERM
case statusError:
return syscall.EAGAIN
case StatusConflictExtents:
return syscall.ENOTSUP
case statusOpDirQuota:
return syscall.EDQUOT
case statusNoSpace:
return syscall.ENOSPC
case statusTxInodeInfoNotExist:
return syscall.EAGAIN
case statusTxConflict:
return syscall.EAGAIN
case statusTxTimeout:
return syscall.EAGAIN
case statusUploadPartConflict:
return syscall.EEXIST
default:
}
return syscall.EIO
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package meta
import (
"fmt"
"strconv"
"sync"
"syscall"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
)
// API implementations
//
// txIcreate creates the inode and the transaction together
func (mw *MetaWrapper) txIcreate(tx *Transaction, mp *MetaPartition, mode, uid, gid uint32,
target []byte, quotaIds []uint32, fullPath string) (status int, info *proto.InodeInfo, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("txIcreate", err, bgTime, 1)
}()
tx.SetTmID(mp.PartitionID)
req := &proto.TxCreateInodeRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Mode: mode,
Uid: uid,
Gid: gid,
Target: target,
QuotaIds: quotaIds,
TxInfo: tx.txInfo,
}
req.FullPaths = []string{fullPath}
resp := new(proto.TxCreateInodeResponse)
defer func() {
tx.OnExecuted(status, resp.TxInfo)
}()
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaTxCreateInode
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("txIcreate: err(%v)", err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("txIcreate: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
// set tx error msg
err = errors.New(packet.GetResultMsg())
log.LogErrorf("txIcreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("txIcreate: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
if resp.Info == nil || resp.TxInfo == nil {
err = errors.New(fmt.Sprintf("txIcreate: info is nil, packet(%v) mp(%v) req(%v) PacketData(%v)", packet, mp, *req, string(packet.Data)))
log.LogWarn(err)
return
}
tx.Started = true
tx.txInfo = resp.TxInfo
log.LogDebugf("txIcreate: packet(%v) mp(%v) req(%v) info(%v) tx(%v)", packet, mp, *req, resp.Info, resp.TxInfo)
return status, resp.Info, nil
}
func (mw *MetaWrapper) quotaIcreate(mp *MetaPartition, mode, uid, gid uint32, target []byte, quotaIds []uint32, fullPath string) (status int,
info *proto.InodeInfo, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("icreate", err, bgTime, 1)
}()
req := &proto.QuotaCreateInodeRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Mode: mode,
Uid: uid,
Gid: gid,
Target: target,
QuotaIds: quotaIds,
}
req.FullPaths = []string{fullPath}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpQuotaCreateInode
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("quotaIcreate: err(%v)", err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("quotaIcreate: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("quotaIcreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.CreateInodeResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("quotaIcreate: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
if resp.Info == nil {
err = errors.New(fmt.Sprintf("quotaIcreate: info is nil, packet(%v) mp(%v) req(%v) PacketData(%v)", packet, mp, *req, string(packet.Data)))
log.LogWarn(err)
return
}
log.LogDebugf("quotaIcreate: packet(%v) mp(%v) req(%v) info(%v)", packet, mp, *req, resp.Info)
return statusOK, resp.Info, nil
}
func (mw *MetaWrapper) icreate(mp *MetaPartition, mode, uid, gid uint32, target []byte, fullPath string) (status int,
info *proto.InodeInfo, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("icreate", err, bgTime, 1)
}()
req := &proto.CreateInodeRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Mode: mode,
Uid: uid,
Gid: gid,
Target: target,
}
req.FullPaths = []string{fullPath}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaCreateInode
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("icreate: err(%v)", err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("icreate: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("icreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.CreateInodeResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("icreate: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
if resp.Info == nil {
err = errors.New(fmt.Sprintf("icreate: info is nil, packet(%v) mp(%v) req(%v) PacketData(%v)", packet, mp, *req, string(packet.Data)))
log.LogWarn(err)
return
}
log.LogDebugf("icreate: packet(%v) mp(%v) req(%v) info(%v)", packet, mp, *req, resp.Info)
return statusOK, resp.Info, nil
}
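// sendToMetaPartitionWithTx wraps sendToMetaPartition and, while the meta node reports a
// transaction conflict, retries the request up to TxConflictRetryNum times, sleeping
// TxConflictRetryInterval milliseconds between attempts.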
func (mw *MetaWrapper) sendToMetaPartitionWithTx(mp *MetaPartition, req *proto.Packet) (packet *proto.Packet, err error) {
retryNum := int64(0)
for {
packet, err = mw.sendToMetaPartition(mp, req)
if err != nil {
log.LogErrorf("sendToMetaPartitionWithTx: packet(%v) mp(%v) reqType(%v) err(%v)",
string(req.Data), mp, req.GetOpMsg(), err)
return
}
if packet.ResultCode != proto.OpTxConflictErr {
break
}
log.LogWarnf("sendToMetaPartitionWithTx: packet(%v) mp(%v) reqType(%v) result(%v), tx conflict retry: %v req(%v)",
packet, mp, packet.GetOpMsg(), packet.GetResultMsg(), retryNum, string(req.Data))
retryNum++
if retryNum > mw.TxConflictRetryNum {
log.LogErrorf("sendToMetaPartitionWithTx: packet(%v) mp(%v) reqType(%v) result(%v), tx conflict retry: %v req(%v)",
packet, mp, packet.GetOpMsg(), packet.GetResultMsg(), retryNum, string(req.Data))
break
}
time.Sleep(time.Duration(mw.TxConflictRetryInterval) * time.Millisecond)
}
return
}
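// SendTxPack marshals a transaction request into a packet, sends it through
// sendToMetaPartitionWithTx, validates the result (via checkStatusFunc when provided,
// otherwise requiring statusOK), and unmarshals the response into resp when resp is non-nil.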
func (mw *MetaWrapper) SendTxPack(req proto.TxPack, resp interface{}, Opcode uint8, mp *MetaPartition,
checkStatusFunc func(int, *proto.Packet) error) (status int, err error, packet *proto.Packet) {
packet = proto.NewPacketReqID()
packet.Opcode = Opcode
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("SendTxPack reqType(%v) txInfo(%v) : err(%v)", packet.GetOpMsg(), req.GetInfo(), err)
return
}
packet, err = mw.sendToMetaPartitionWithTx(mp, packet)
if err != nil {
log.LogErrorf("SendTxPack: packet(%v) mp(%v) txInfo(%v) err(%v)",
packet, mp, req.GetInfo(), err)
return
}
status = parseStatus(packet.ResultCode)
if checkStatusFunc != nil {
if err = checkStatusFunc(status, packet); err != nil {
log.LogErrorf("SendTxPack: packet(%v) mp(%v) req(%v) txInfo(%v) result(%v) err(%v)",
packet, mp, packet.GetOpMsg(), req.GetInfo(), packet.GetResultMsg(), err)
return
}
} else if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("SendTxPack: packet(%v) mp(%v) req(%v) txInfo(%v) result(%v)",
packet, mp, packet.GetOpMsg(), req.GetInfo(), packet.GetResultMsg())
return
}
if resp == nil {
return
}
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("SendTxPack: packet(%v) mp(%v) txInfo(%v) err(%v) PacketData(%v)",
packet, mp, req.GetInfo(), err, string(packet.Data))
return
}
return
}
func (mw *MetaWrapper) txIunlink(tx *Transaction, mp *MetaPartition, inode uint64, fullPath string) (status int, info *proto.InodeInfo, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("txIunlink", err, bgTime, 1)
}()
req := &proto.TxUnlinkInodeRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
TxInfo: tx.txInfo,
}
req.FullPaths = []string{fullPath}
resp := new(proto.TxUnlinkInodeResponse)
metric := exporter.NewTPCnt("OpMetaTxUnlinkInode")
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
var packet *proto.Packet
if status, err, packet = mw.SendTxPack(req, resp, proto.OpMetaTxUnlinkInode, mp, nil); err != nil {
log.LogErrorf("txIunlink: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("txIunlink: packet(%v) mp(%v) req(%v)", packet, mp, *req)
return statusOK, resp.Info, nil
}
func (mw *MetaWrapper) iunlink(mp *MetaPartition, inode uint64, verSeq uint64, denVerSeq uint64, fullPath string) (status int, info *proto.InodeInfo, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("iunlink", err, bgTime, 1)
}()
// use a unique id to dedupe the request
status, uniqID, err := mw.consumeUniqID(mp)
if err != nil || status != statusOK {
err = statusToErrno(status)
return
}
req := &proto.UnlinkInodeRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
UniqID: uniqID,
VerSeq: verSeq,
DenVerSeq: denVerSeq,
}
req.FullPaths = []string{fullPath}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaUnlinkInode
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("iunlink: ino(%v) err(%v)", inode, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartitionWithTx(mp, packet)
if err != nil {
log.LogErrorf("iunlink: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("iunlink: packet(%v) mp(%v) req(%v) result(%v) status(%v)", packet, mp, *req, packet.GetResultMsg(), status)
return
}
resp := new(proto.UnlinkInodeResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("iunlink: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
log.LogDebugf("iunlink: packet(%v) mp(%v) req(%v)", packet, mp, *req)
return statusOK, resp.Info, nil
}
func (mw *MetaWrapper) iclearCache(mp *MetaPartition, inode uint64) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("iclearCache", err, bgTime, 1)
}()
req := &proto.ClearInodeCacheRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaClearInodeCache
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("iclearCache: ino(%v) err(%v)", inode, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("iclearCache: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("iclearCache: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("iclearCache: packet(%v) mp(%v) req(%v)", packet, mp, *req)
return status, nil
}
func (mw *MetaWrapper) ievict(mp *MetaPartition, inode uint64, fullPath string) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("ievict", err, bgTime, 1)
}()
req := &proto.EvictInodeRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
}
req.FullPaths = []string{fullPath}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaEvictInode
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogWarnf("ievict: ino(%v) err(%v)", inode, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogWarnf("ievict: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogWarnf("ievict: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("ievict exit: packet(%v) mp(%v) req(%v)", packet, mp, *req)
return statusOK, nil
}
func (mw *MetaWrapper) txDcreate(tx *Transaction, mp *MetaPartition, parentID uint64, name string, inode uint64, mode uint32, quotaIds []uint32, fullPath string) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("txDcreate", err, bgTime, 1)
}()
if parentID == inode {
return statusExist, nil
}
req := &proto.TxCreateDentryRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
ParentID: parentID,
Inode: inode,
Name: name,
Mode: mode,
QuotaIds: quotaIds,
TxInfo: tx.txInfo,
}
req.FullPaths = []string{fullPath}
metric := exporter.NewTPCnt("OpMetaTxCreateDentry")
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
//statusCheckFunc := func(status int, packet *proto.Packet) (err error) {
// if (status != statusOK) && (status != statusExist) {
// err = errors.New(packet.GetResultMsg())
// log.LogErrorf("txDcreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
// return
// } else if status == statusExist {
// log.LogWarnf("txDcreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
// }
// return
//}
var packet *proto.Packet
if status, err, packet = mw.SendTxPack(req, nil, proto.OpMetaTxCreateDentry, mp, nil); err != nil {
log.LogErrorf("txDcreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("txDcreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
func (mw *MetaWrapper) quotaDcreate(mp *MetaPartition, parentID uint64, name string, inode uint64, mode uint32,
quotaIds []uint32, fullPath string) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("dcreate", err, bgTime, 1)
}()
if parentID == inode {
return statusExist, nil
}
req := &proto.QuotaCreateDentryRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
ParentID: parentID,
Inode: inode,
Name: name,
Mode: mode,
QuotaIds: quotaIds,
}
req.FullPaths = []string{fullPath}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpQuotaCreateDentry
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("quotaDcreate: req(%v) err(%v)", *req, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("quotaDcreate: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if (status != statusOK) && (status != statusExist) {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("quotaDcreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
} else if status == statusExist {
log.LogWarnf("quotaDcreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
}
log.LogDebugf("quotaDcreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
func (mw *MetaWrapper) dcreate(mp *MetaPartition, parentID uint64, name string, inode uint64, mode uint32, fullPath string) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("dcreate", err, bgTime, 1)
}()
if parentID == inode {
return statusExist, nil
}
req := &proto.CreateDentryRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
ParentID: parentID,
Inode: inode,
Name: name,
Mode: mode,
}
req.FullPaths = []string{fullPath}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaCreateDentry
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("dcreate: req(%v) err(%v)", *req, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartitionWithTx(mp, packet)
if err != nil {
log.LogErrorf("dcreate: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if (status != statusOK) && (status != statusExist) {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("dcreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
} else if status == statusExist {
log.LogWarnf("dcreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
}
log.LogDebugf("dcreate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
func (mw *MetaWrapper) txDupdate(tx *Transaction, mp *MetaPartition, parentID uint64, name string, newInode, oldIno uint64, fullPath string) (status int, oldInode uint64, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("txDupdate", err, bgTime, 1)
}()
if parentID == newInode {
return statusExist, 0, nil
}
req := &proto.TxUpdateDentryRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
ParentID: parentID,
Name: name,
Inode: newInode,
OldIno: oldIno,
TxInfo: tx.txInfo,
}
req.FullPaths = []string{fullPath}
resp := new(proto.TxUpdateDentryResponse)
metric := exporter.NewTPCnt("OpMetaTxUpdateDentry")
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
var packet *proto.Packet
if status, err, packet = mw.SendTxPack(req, resp, proto.OpMetaTxUpdateDentry, mp, nil); err != nil {
log.LogErrorf("txDupdate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("txDupdate: packet(%v) mp(%v) req(%v) oldIno(%v)", packet, mp, *req, resp.Inode)
return statusOK, resp.Inode, nil
}
func (mw *MetaWrapper) dupdate(mp *MetaPartition, parentID uint64, name string, newInode uint64, fullPath string) (status int, oldInode uint64, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("dupdate", err, bgTime, 1)
}()
if parentID == newInode {
return statusExist, 0, nil
}
req := &proto.UpdateDentryRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
ParentID: parentID,
Name: name,
Inode: newInode,
}
req.FullPaths = []string{fullPath}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaUpdateDentry
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("dupdate: req(%v) err(%v)", *req, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartitionWithTx(mp, packet)
if err != nil {
log.LogErrorf("dupdate: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("dupdate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.UpdateDentryResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("dupdate: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
log.LogDebugf("dupdate: packet(%v) mp(%v) req(%v) oldIno(%v)", packet, mp, *req, resp.Inode)
return statusOK, resp.Inode, nil
}
func (mw *MetaWrapper) txCreateTX(tx *Transaction, mp *MetaPartition) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("txCreateTX", err, bgTime, 1)
}()
tx.SetTmID(mp.PartitionID)
req := &proto.TxCreateRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
TransactionInfo: tx.txInfo,
}
resp := new(proto.TxCreateResponse)
metric := exporter.NewTPCnt("OpMetaTxCreate")
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
var packet *proto.Packet
if status, err, packet = mw.SendTxPack(req, resp, proto.OpMetaTxCreate, mp, nil); err != nil {
log.LogErrorf("txCreateTX: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
if resp.TxInfo == nil {
err = fmt.Errorf("txCreateTX: create tx resp nil")
log.LogError(err)
return statusError, err
}
if log.EnableDebug() {
log.LogDebugf("txCreateTX: packet(%v) mp(%v) req(%v)", packet, mp, *req)
}
tx.txInfo = resp.TxInfo
tx.Started = true
return statusOK, nil
}
//func (mw *MetaWrapper) txPreCommit(tx *Transaction, mp *MetaPartition) (status int, err error) {
// bgTime := stat.BeginStat()
// defer func() {
// stat.EndStat("txPreCommit", err, bgTime, 1)
// }()
//
// tx.txInfo.TmID = int64(mp.PartitionID)
// req := &proto.TxPreCommitRequest{
// VolName: mw.volname,
// PartitionID: mp.PartitionID,
// TransactionInfo: tx.txInfo,
// }
//
// metric := exporter.NewTPCnt("OpTxPreCommit")
// defer func() {
// metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
// }()
//
// var packet *proto.Packet
// if status, err, packet = mw.SendTxPack(req, nil, proto.OpTxPreCommit, mp, nil); err != nil {
// log.LogErrorf("txPreCommit: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
// return
// }
//
// if log.EnableDebug() {
// log.LogDebugf("txPreCommit: packet(%v) mp(%v) req(%v)", packet, mp, *req)
// }
//
// return statusOK, nil
//}
func (mw *MetaWrapper) txDdelete(tx *Transaction, mp *MetaPartition, parentID, ino uint64, name string, fullPath string) (status int, inode uint64, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("txDdelete", err, bgTime, 1)
}()
req := &proto.TxDeleteDentryRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
ParentID: parentID,
Name: name,
Ino: ino,
TxInfo: tx.txInfo,
}
req.FullPaths = []string{fullPath}
resp := new(proto.TxDeleteDentryResponse)
metric := exporter.NewTPCnt("OpMetaTxDeleteDentry")
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
var packet *proto.Packet
if status, err, packet = mw.SendTxPack(req, resp, proto.OpMetaTxDeleteDentry, mp, nil); err != nil {
log.LogErrorf("txDdelete: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("txDdelete: packet(%v) mp(%v) req(%v) ino(%v)", packet, mp, *req, resp.Inode)
return statusOK, resp.Inode, nil
}
func (mw *MetaWrapper) ddelete(mp *MetaPartition, parentID uint64, name string, inodeCreateTime int64, verSeq uint64, fullPath string) (status int, inode uint64, denVer uint64, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("ddelete", err, bgTime, 1)
}()
req := &proto.DeleteDentryRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
ParentID: parentID,
Name: name,
InodeCreateTime: inodeCreateTime,
Verseq: verSeq,
}
req.FullPaths = []string{fullPath}
log.LogDebugf("action[ddelete] %v", req)
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaDeleteDentry
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("ddelete: req(%v) err(%v)", *req, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartitionWithTx(mp, packet)
if err != nil {
log.LogErrorf("ddelete: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("ddelete: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.DeleteDentryResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("ddelete: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
log.LogDebugf("ddelete: packet(%v) mp(%v) req(%v) ino(%v)", packet, mp, *req, resp.Inode)
return statusOK, resp.Inode, packet.VerSeq, nil
}
func (mw *MetaWrapper) canDeleteInode(mp *MetaPartition, info *proto.InodeInfo, ino uint64) (can bool, err error) {
createTime := info.CreateTime.Unix()
deleteLockTime := mw.volDeleteLockTime * 60 * 60
if deleteLockTime > 0 && createTime+deleteLockTime > time.Now().Unix() {
err = errors.NewErrorf("the current Inode[%v] is still locked for deletion", ino)
log.LogWarnf("canDeleteInode: mp(%v) ino(%v) err(%v)", mp, ino, err)
return false, syscall.EPERM
}
return true, nil
}
func (mw *MetaWrapper) ddeletes(mp *MetaPartition, parentID uint64, dentries []proto.Dentry, fullPaths []string) (status int,
resp *proto.BatchDeleteDentryResponse, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("ddeletes", err, bgTime, 1)
}()
req := &proto.BatchDeleteDentryRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
ParentID: parentID,
Dens: dentries,
FullPaths: fullPaths,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaBatchDeleteDentry
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("ddeletes: req(%v) err(%v)", *req, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("ddeletes: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status == statusAgain {
err = errors.New("conflict request")
log.LogErrorf("ddeletes: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("ddeletes: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp = new(proto.BatchDeleteDentryResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("ddeletes: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
log.LogDebugf("ddeletes: packet(%v) mp(%v) req(%v) (%v)", packet, mp, *req, resp.Items)
return statusOK, resp, nil
}
func (mw *MetaWrapper) lookup(mp *MetaPartition, parentID uint64, name string, verSeq uint64) (status int, inode uint64, mode uint32, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("lookup", err, bgTime, 1)
}()
req := &proto.LookupRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
ParentID: parentID,
Name: name,
VerSeq: verSeq,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaLookup
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("lookup: err(%v)", err)
return
}
log.LogDebugf("lookup enter: packet(%v) mp(%v) req(%v)", packet, mp, string(packet.Data))
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("lookup: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
errMetric := exporter.NewCounter("fileOpenFailed")
errMetric.AddWithLabels(1, map[string]string{exporter.Vol: mw.volname, exporter.Err: "EIO"})
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
if status != statusNoent {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("lookup: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
errMetric := exporter.NewCounter("fileOpenFailed")
errMetric.AddWithLabels(1, map[string]string{exporter.Vol: mw.volname, exporter.Err: "EIO"})
} else {
log.LogDebugf("lookup exit: packet(%v) mp(%v) req(%v) NoEntry", packet, mp, *req)
}
return
}
resp := new(proto.LookupResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("lookup: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
errMetric := exporter.NewCounter("fileOpenFailed")
errMetric.AddWithLabels(1, map[string]string{exporter.Vol: mw.volname, exporter.Err: "EIO"})
return
}
log.LogDebugf("lookup exit: packet(%v) mp(%v) req(%v) ino(%v) mode(%v)", packet, mp, *req, resp.Inode, resp.Mode)
return statusOK, resp.Inode, resp.Mode, nil
}
func (mw *MetaWrapper) iget(mp *MetaPartition, inode uint64, verSeq uint64) (status int, info *proto.InodeInfo, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("iget", err, bgTime, 1)
}()
req := &proto.InodeGetRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
VerSeq: verSeq,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaInodeGet
packet.PartitionID = mp.PartitionID
log.LogDebugf("action[iget] pack mp id %v, req %v", mp.PartitionID, req)
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("iget: req(%v) err(%v)", *req, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("iget: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("iget: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.InodeGetResponse)
err = packet.UnmarshalData(resp)
if err != nil || resp.Info == nil {
log.LogErrorf("iget: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
return statusOK, resp.Info, nil
}
func (mw *MetaWrapper) batchIget(wg *sync.WaitGroup, mp *MetaPartition, inodes []uint64, respCh chan []*proto.InodeInfo) {
defer wg.Done()
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("batchIget", err, bgTime, 1)
}()
req := &proto.BatchInodeGetRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inodes: inodes,
VerSeq: mw.VerReadSeq,
}
log.LogDebugf("action[batchIget] req %v", req)
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaBatchInodeGet
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("batchIget: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status := parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("batchIget: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.BatchInodeGetResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("batchIget: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
log.LogDebugf("action[batchIget] resp %v", resp)
if len(resp.Infos) == 0 {
return
}
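// Non-blocking send: if the result channel cannot accept the infos immediately,
// they are dropped rather than blocking this worker.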
select {
case respCh <- resp.Infos:
default:
}
}
func (mw *MetaWrapper) readDir(mp *MetaPartition, parentID uint64) (status int, children []proto.Dentry, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("readDir", err, bgTime, 1)
}()
req := &proto.ReadDirRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
ParentID: parentID,
VerSeq: mw.VerReadSeq,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaReadDir
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("readDir: req(%v) err(%v)", *req, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("readDir: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
children = make([]proto.Dentry, 0)
log.LogErrorf("readDir: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.ReadDirResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("readDir: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
log.LogDebugf("readDir: packet(%v) mp(%v) req(%v)", packet, mp, *req)
return statusOK, resp.Children, nil
}
// readDirLimit reads at most `limit` dentries of directory parentID from the given meta
// partition, starting from the dentry named `from` (an empty marker starts from the beginning).
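//
// A minimal paging sketch (hypothetical caller; variable names are assumptions):
// advance the marker to the last returned name, and note that the marker dentry itself
// is returned again at the head of every follow-up page.
//
//	from := ""
//	for {
//		status, children, err := mw.readDirLimit(mp, parentID, from, 1024, mw.VerReadSeq, 0)
//		if err != nil || status != statusOK || len(children) == 0 || (from != "" && len(children) == 1) {
//			break
//		}
//		// ... consume children (drop children[0] when from != "") ...
//		from = children[len(children)-1].Name
//	}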
func (mw *MetaWrapper) readDirLimit(mp *MetaPartition, parentID uint64, from string, limit uint64, verSeq uint64, verOpt uint8) (status int, children []proto.Dentry, err error) {
req := &proto.ReadDirLimitRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
ParentID: parentID,
Marker: from,
Limit: limit,
VerSeq: verSeq,
VerOpt: verOpt,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaReadDirLimit
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("readDirLimit: req(%v) err(%v)", *req, err)
return
}
log.LogDebugf("action[readDirLimit] mp [%v] parentId %v", mp.PartitionID, parentID)
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("readDirLimit: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
children = make([]proto.Dentry, 0)
log.LogErrorf("readDirLimit: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.ReadDirLimitResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("readDirLimit: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
log.LogDebugf("readDirLimit: packet(%v) mp(%v) req(%v) rsp(%v)", packet, mp, *req, resp.Children)
return statusOK, resp.Children, nil
}
func (mw *MetaWrapper) appendExtentKey(mp *MetaPartition, inode uint64, extent proto.ExtentKey, discard []proto.ExtentKey, isSplit bool) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("appendExtentKey", err, bgTime, 1)
}()
req := &proto.AppendExtentKeyWithCheckRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
Extent: extent,
DiscardExtents: discard,
IsSplit: isSplit,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaExtentAddWithCheck
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("appendExtentKey: req(%v) err(%v)", *req, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("appendExtentKey: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
if status != StatusConflictExtents {
log.LogErrorf("appendExtentKey: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
}
}
return status, err
}
func (mw *MetaWrapper) getExtents(mp *MetaPartition, inode uint64) (resp *proto.GetExtentsResponse, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("getExtents", err, bgTime, 1)
}()
req := &proto.GetExtentsRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
VerSeq: mw.VerReadSeq,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaExtentsList
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("getExtents: req(%v) err(%v)", *req, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("getExtents: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
resp = &proto.GetExtentsResponse{}
resp.Status = parseStatus(packet.ResultCode)
if resp.Status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("getExtents: packet(%v) mp(%v) result(%v)", packet, mp, packet.GetResultMsg())
return
}
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("getExtents: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
return resp, nil
}
func (mw *MetaWrapper) getObjExtents(mp *MetaPartition, inode uint64) (status int, gen, size uint64, extents []proto.ExtentKey, objExtents []proto.ObjExtentKey, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("getObjExtents", err, bgTime, 1)
}()
req := &proto.GetExtentsRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
VerSeq: mw.VerReadSeq,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaObjExtentsList
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("getObjExtents: req(%v) err(%v)", *req, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("getObjExtents: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
extents = make([]proto.ExtentKey, 0)
log.LogErrorf("getObjExtents: packet(%v) mp(%v) result(%v)", packet, mp, packet.GetResultMsg())
return
}
resp := new(proto.GetObjExtentsResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("getObjExtents: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
return statusOK, resp.Generation, resp.Size, resp.Extents, resp.ObjExtents, nil
}
// func (mw *MetaWrapper) delExtentKey(mp *MetaPartition, inode uint64, extents []proto.ExtentKey) (status int, err error) {
// req := &proto.DelExtentKeyRequest{
// VolName: mw.volname,
// PartitionID: mp.PartitionID,
// Inode: inode,
// Extents: extents,
// }
// packet := proto.NewPacketReqID()
// packet.Opcode = proto.OpMetaExtentsDel
// packet.PartitionID = mp.PartitionID
// err = packet.MarshalData(req)
// if err != nil {
// log.LogErrorf("delExtentKey: req(%v) err(%v)", *req, err)
// return
// }
// metric := exporter.NewTPCnt(packet.GetOpMsg())
// defer func() {
// metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
// }()
// packet, err = mw.sendToMetaPartition(mp, packet)
// if err != nil {
// log.LogErrorf("delExtentKey: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
// return
// }
// status = parseStatus(packet.ResultCode)
// if status != statusOK {
// log.LogErrorf("delExtentKey: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
// }
// return status, nil
// }
func (mw *MetaWrapper) truncate(mp *MetaPartition, inode, size uint64, fullPath string) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("truncate", err, bgTime, 1)
}()
req := &proto.TruncateRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
Size: size,
}
req.FullPaths = []string{fullPath}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaTruncate
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("truncate: ino(%v) size(%v) err(%v)", inode, size, err)
return
}
log.LogDebugf("truncate enter: packet(%v) mp(%v) req(%v)", packet, mp, string(packet.Data))
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("truncate: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("truncate: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("truncate exit: packet(%v) mp(%v) req(%v)", packet, mp, *req)
return statusOK, nil
}
func (mw *MetaWrapper) txIlink(tx *Transaction, mp *MetaPartition, inode uint64, fullPath string) (status int, info *proto.InodeInfo, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("txIlink", err, bgTime, 1)
}()
req := &proto.TxLinkInodeRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
TxInfo: tx.txInfo,
}
req.FullPaths = []string{fullPath}
resp := new(proto.TxLinkInodeResponse)
metric := exporter.NewTPCnt("OpMetaTxLinkInode")
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
var packet *proto.Packet
if status, err, packet = mw.SendTxPack(req, resp, proto.OpMetaTxLinkInode, mp, nil); err != nil {
log.LogErrorf("txIlink: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
if log.EnableDebug() {
log.LogDebugf("txIlink exit: packet(%v) mp(%v) req(%v) info(%v)", packet, mp, *req, resp.Info)
}
return statusOK, resp.Info, nil
}
func (mw *MetaWrapper) ilink(mp *MetaPartition, inode uint64, fullPath string) (status int, info *proto.InodeInfo, err error) {
return mw.ilinkWork(mp, inode, proto.OpMetaLinkInode, fullPath)
}
func (mw *MetaWrapper) ilinkWork(mp *MetaPartition, inode uint64, op uint8, fullPath string) (status int, info *proto.InodeInfo, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("ilink", err, bgTime, 1)
}()
// Use a per-partition unique id so the meta server can deduplicate retried requests.
status, uniqID, err := mw.consumeUniqID(mp)
if err != nil || status != statusOK {
err = statusToErrno(status)
return
}
req := &proto.LinkInodeRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
UniqID: uniqID,
}
req.FullPaths = []string{fullPath}
packet := proto.NewPacketReqID()
packet.Opcode = op
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("ilink: req(%v) err(%v)", *req, err)
return
}
log.LogDebugf("ilink enter: packet(%v) mp(%v) req(%v)", packet, mp, string(packet.Data))
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartitionWithTx(mp, packet)
if err != nil {
log.LogErrorf("ilink: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("ilink: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.LinkInodeResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("ilink: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
if resp.Info == nil {
err = errors.New(fmt.Sprintf("ilink: info is nil, packet(%v) mp(%v) req(%v) PacketData(%v)", packet, mp, *req, string(packet.Data)))
log.LogWarn(err)
return
}
log.LogDebugf("ilink exit: packet(%v) mp(%v) req(%v) info(%v)", packet, mp, *req, resp.Info)
return statusOK, resp.Info, nil
}
func (mw *MetaWrapper) setattr(mp *MetaPartition, inode uint64, valid, mode, uid, gid uint32, atime, mtime int64) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("setattr", err, bgTime, 1)
}()
req := &proto.SetAttrRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
Valid: valid,
Mode: mode,
Uid: uid,
Gid: gid,
AccessTime: atime,
ModifyTime: mtime,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaSetattr
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("setattr: err(%v)", err)
return
}
log.LogDebugf("setattr enter: packet(%v) mp(%v) req(%v)", packet, mp, string(packet.Data))
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("setattr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("setattr: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("setattr exit: packet(%v) mp(%v) req(%v)", packet, mp, *req)
return statusOK, nil
}
func (mw *MetaWrapper) createMultipart(mp *MetaPartition, path string, extend map[string]string) (status int, multipartId string, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("createMultipart", err, bgTime, 1)
}()
req := &proto.CreateMultipartRequest{
PartitionId: mp.PartitionID,
VolName: mw.volname,
Path: path,
Extend: extend,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpCreateMultipart
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("createMultipart: err(%v)", err)
return
}
log.LogDebugf("createMultipart enter: packet(%v) mp(%v) req(%v)", packet, mp, string(packet.Data))
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("createMultipart: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("createMultipart: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.CreateMultipartResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("createMultipart: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
return statusOK, resp.Info.ID, nil
}
func (mw *MetaWrapper) getExpiredMultipart(prefix string, days int, mp *MetaPartition) (status int, infos []*proto.ExpiredMultipartInfo, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("getExpiredMultipart", err, bgTime, 1)
}()
req := &proto.GetExpiredMultipartRequest{
PartitionId: mp.PartitionID,
VolName: mw.volname,
Prefix: prefix,
Days: days,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpGetExpiredMultipart
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("get session: err(%v)", err)
return
}
log.LogDebugf("getExpiredMultipart enter: packet(%v) mp(%v) req(%v)", packet, mp, string(packet.Data))
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("getExpiredMultipart: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("getExpiredMultipart: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.GetExpiredMultipartResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("getExpiredMultipart: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
return statusOK, resp.Infos, nil
}
func (mw *MetaWrapper) getMultipart(mp *MetaPartition, path, multipartId string) (status int, info *proto.MultipartInfo, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("getMultipart", err, bgTime, 1)
}()
req := &proto.GetMultipartRequest{
PartitionId: mp.PartitionID,
VolName: mw.volname,
Path: path,
MultipartId: multipartId,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpGetMultipart
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("get session: err(%v)", err)
return
}
log.LogDebugf("getMultipart enter: packet(%v) mp(%v) req(%v)", packet, mp, string(packet.Data))
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("getMultipart: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("getMultipart: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.GetMultipartResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("getMultipart: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
return statusOK, resp.Info, nil
}
func (mw *MetaWrapper) addMultipartPart(mp *MetaPartition, path, multipartId string, partId uint16, size uint64, md5 string, inodeInfo *proto.InodeInfo) (status int, oldNode uint64, updated bool, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("addMultipartPart", err, bgTime, 1)
}()
part := &proto.MultipartPartInfo{
ID: partId,
Inode: inodeInfo.Inode,
MD5: md5,
Size: size,
UploadTime: time.Now(),
}
req := &proto.AddMultipartPartRequest{
PartitionId: mp.PartitionID,
VolName: mw.volname,
Path: path,
MultipartId: multipartId,
Part: part,
}
log.LogDebugf("addMultipartPart: part(%v), req(%v)", part, req)
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpAddMultipartPart
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("addMultipartPart: marshal packet fail, err(%v)", err)
return
}
log.LogDebugf("addMultipartPart entry: packet(%v) mp(%v) req(%v)", packet, mp, string(packet.Data))
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("addMultipartPart: packet(%v) mp(%v) req(%v) part(%v) err(%v)", packet, mp, req, part, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("addMultipartPart: packet(%v) mp(%v) req(%v) part(%v) result(%v)", packet, mp, *req, part, packet.GetResultMsg())
return
}
resp := new(proto.AppendMultipartResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("appendMultipart: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
return status, resp.OldInode, resp.Update, nil
}
func (mw *MetaWrapper) idelete(mp *MetaPartition, inode uint64, fullPath string) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("idelete", err, bgTime, 1)
}()
req := &proto.DeleteInodeRequest{
VolName: mw.volname,
PartitionId: mp.PartitionID,
Inode: inode,
}
req.FullPaths = []string{fullPath}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaDeleteInode
packet.PartitionID = mp.PartitionID
if err = packet.MarshalData(req); err != nil {
log.LogErrorf("delete inode: err[%v]", err)
return
}
log.LogDebugf("delete inode: packet(%v) mp(%v) req(%v)", packet, mp, string(packet.Data))
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartitionWithTx(mp, packet)
if err != nil {
log.LogErrorf("delete inode: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("idelete: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("idelete: packet(%v) mp(%v) req(%v) ino(%v)", packet, mp, *req, inode)
return statusOK, nil
}
func (mw *MetaWrapper) removeMultipart(mp *MetaPartition, path, multipartId string) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("removeMultipart", err, bgTime, 1)
}()
req := &proto.RemoveMultipartRequest{
PartitionId: mp.PartitionID,
VolName: mw.volname,
Path: path,
MultipartId: multipartId,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpRemoveMultipart
packet.PartitionID = mp.PartitionID
if err = packet.MarshalData(req); err != nil {
log.LogErrorf("delete session: err[%v]", err)
return
}
log.LogDebugf("delete session: packet(%v) mp(%v) req(%v)", packet, mp, string(packet.Data))
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("delete session: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("delete session: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("delete session: packet(%v) mp(%v) req(%v) PacketData(%v)", packet, mp, *req, packet.Data)
return statusOK, nil
}
func (mw *MetaWrapper) appendExtentKeys(mp *MetaPartition, inode uint64, extents []proto.ExtentKey) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("appendExtentKeys", err, bgTime, 1)
}()
req := &proto.AppendExtentKeysRequest{
VolName: mw.volname,
PartitionId: mp.PartitionID,
Inode: inode,
Extents: extents,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaBatchExtentsAdd
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("batch append extent: req(%v) err(%v)", *req, err)
return
}
log.LogDebugf("appendExtentKeys: batch append extent: packet(%v) mp(%v) req(%v)", packet, mp, *req)
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("batch append extent: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("batch append extent: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("batch append extent: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
func (mw *MetaWrapper) appendObjExtentKeys(mp *MetaPartition, inode uint64, extents []proto.ObjExtentKey) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("appendObjExtentKeys", err, bgTime, 1)
}()
req := &proto.AppendObjExtentKeysRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Inode: inode,
Extents: extents,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaBatchObjExtentsAdd
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("batch append obj extents: req(%v) err(%v)", *req, err)
return
}
log.LogDebugf("appendObjExtentKeys: batch append obj extents: packet(%v) mp(%v) req(%v)", packet, mp, *req)
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("batch append obj extents: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("batch append obj extents: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("batch append obj extents: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
func (mw *MetaWrapper) batchSetXAttr(mp *MetaPartition, inode uint64, attrs map[string]string) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("batchSetXAttr", err, bgTime, 1)
}()
req := &proto.BatchSetXAttrRequest{
VolName: mw.volname,
PartitionId: mp.PartitionID,
Inode: inode,
Attrs: make(map[string]string),
}
for key, val := range attrs {
req.Attrs[key] = val
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaBatchSetXAttr
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("batchSetXAttr: matshal packet fail, err(%v)", err)
return
}
log.LogDebugf("batchSetXAttr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("batchSetXAttr: send to partition fail, packet(%v) mp(%v) req(%v) err(%v)",
packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("batchSetXAttr: received fail status, packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("batchSetXAttr: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
func (mw *MetaWrapper) setXAttr(mp *MetaPartition, inode uint64, name []byte, value []byte) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("setXAttr", err, bgTime, 1)
}()
req := &proto.SetXAttrRequest{
VolName: mw.volname,
PartitionId: mp.PartitionID,
Inode: inode,
Key: string(name),
Value: string(value),
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaSetXAttr
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("setXAttr: matshal packet fail, err(%v)", err)
return
}
log.LogDebugf("setXAttr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("setXAttr: send to partition fail, packet(%v) mp(%v) req(%v) err(%v)",
packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("setXAttr: received fail status, packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("setXAttr: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
func (mw *MetaWrapper) getAllXAttr(mp *MetaPartition, inode uint64) (attrs map[string]string, status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("getAllXAttr", err, bgTime, 1)
}()
req := &proto.GetAllXAttrRequest{
VolName: mw.volname,
PartitionId: mp.PartitionID,
Inode: inode,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaGetAllXAttr
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("getAllXAttr: req(%v) err(%v)", *req, err)
return
}
log.LogDebugf("getAllXAttr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("getAllXAttr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("getAllXAttr: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.GetAllXAttrResponse)
if err = packet.UnmarshalData(resp); err != nil {
log.LogErrorf("get xattr: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
attrs = resp.Attrs
log.LogDebugf("getAllXAttr: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
func (mw *MetaWrapper) getXAttr(mp *MetaPartition, inode uint64, name string) (value string, status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("getXAttr", err, bgTime, 1)
}()
req := &proto.GetXAttrRequest{
VolName: mw.volname,
PartitionId: mp.PartitionID,
Inode: inode,
Key: name,
VerSeq: mw.VerReadSeq,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaGetXAttr
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("get xattr: req(%v) err(%v)", *req, err)
return
}
log.LogDebugf("get xattr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("get xattr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("get xattr: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.GetXAttrResponse)
if err = packet.UnmarshalData(resp); err != nil {
log.LogErrorf("get xattr: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
value = resp.Value
log.LogDebugf("get xattr: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
func (mw *MetaWrapper) removeXAttr(mp *MetaPartition, inode uint64, name string) (status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("removeXAttr", err, bgTime, 1)
}()
req := &proto.RemoveXAttrRequest{
VolName: mw.volname,
PartitionId: mp.PartitionID,
Inode: inode,
Key: name,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaRemoveXAttr
packet.PartitionID = mp.PartitionID
if err = packet.MarshalData(req); err != nil {
log.LogErrorf("remove xattr: req(%v) err(%v)", *req, err)
return
}
log.LogDebugf("remove xattr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
if packet, err = mw.sendToMetaPartition(mp, packet); err != nil {
log.LogErrorf("remove xattr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("remove xattr: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
log.LogDebugf("remove xattr: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
func (mw *MetaWrapper) listXAttr(mp *MetaPartition, inode uint64) (keys []string, status int, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("listXAttr", err, bgTime, 1)
}()
req := &proto.ListXAttrRequest{
VolName: mw.volname,
PartitionId: mp.PartitionID,
Inode: inode,
VerSeq: mw.VerReadSeq,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaListXAttr
packet.PartitionID = mp.PartitionID
if err = packet.MarshalData(req); err != nil {
log.LogErrorf("list xattr: req(%v) err(%v)", *req, err)
return
}
log.LogDebugf("list xattr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
if packet, err = mw.sendToMetaPartition(mp, packet); err != nil {
log.LogErrorf("list xattr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("list xattr: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.ListXAttrResponse)
if err = packet.UnmarshalData(resp); err != nil {
log.LogErrorf("list xattr: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
keys = resp.XAttrs
log.LogDebugf("list xattr: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
func (mw *MetaWrapper) listMultiparts(mp *MetaPartition, prefix, delimiter, keyMarker string, multipartIdMarker string, maxUploads uint64) (status int, sessions *proto.ListMultipartResponse, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("listMultiparts", err, bgTime, 1)
}()
req := &proto.ListMultipartRequest{
VolName: mw.volname,
PartitionId: mp.PartitionID,
Marker: keyMarker,
MultipartIdMarker: multipartIdMarker,
Max: maxUploads,
Delimiter: delimiter,
Prefix: prefix,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpListMultiparts
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("list sessions : err(%v)", err)
return
}
log.LogDebugf("listMultiparts enter: packet(%v) mp(%v) req(%v)", packet, mp, string(packet.Data))
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("listMultiparts: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("listMultiparts: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.ListMultipartResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("listMultiparts: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
return statusOK, resp, nil
}
func (mw *MetaWrapper) batchGetXAttr(mp *MetaPartition, inodes []uint64, keys []string) ([]*proto.XAttrInfo, error) {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("batchGetXAttr", err, bgTime, 1)
}()
req := &proto.BatchGetXAttrRequest{
VolName: mw.volname,
PartitionId: mp.PartitionID,
Inodes: inodes,
Keys: keys,
VerSeq: mw.VerReadSeq,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaBatchGetXAttr
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
return nil, err
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("batchGetXAttr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return nil, err
}
status := parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("batchIget: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return nil, err
}
resp := new(proto.BatchGetXAttrResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("batchIget: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return nil, err
}
return resp.XAttrs, nil
}
func (mw *MetaWrapper) readdironly(mp *MetaPartition, parentID uint64) (status int, children []proto.Dentry, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("readdironly", err, bgTime, 1)
}()
req := &proto.ReadDirOnlyRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
ParentID: parentID,
VerSeq: mw.VerReadSeq,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaReadDirOnly
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("readDir: req(%v) err(%v)", *req, err)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("readDir: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
children = make([]proto.Dentry, 0)
log.LogErrorf("readDir: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.ReadDirOnlyResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("readDir: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
log.LogDebugf("readDir: packet(%v) mp(%v) req(%v)", packet, mp, *req)
return statusOK, resp.Children, nil
}
func (mw *MetaWrapper) updateXAttrs(mp *MetaPartition, inode uint64, filesInc int64, dirsInc int64, bytesInc int64) error {
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("updateXAttrs", err, bgTime, 1)
}()
value := strconv.FormatInt(filesInc, 10) + "," + strconv.FormatInt(dirsInc, 10) + "," + strconv.FormatInt(bytesInc, 10)
req := &proto.UpdateXAttrRequest{
VolName: mw.volname,
PartitionId: mp.PartitionID,
Inode: inode,
Key: SummaryKey,
Value: value,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaUpdateXAttr
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("updateXAttr: matshal packet fail, err(%v)", err)
return err
}
log.LogDebugf("updateXAttr: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("readdironly: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return err
}
status := parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("readdironly: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return err
}
log.LogDebugf("updateXAttrs: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return nil
}
func (mw *MetaWrapper) batchSetInodeQuota(mp *MetaPartition, inodes []uint64, quotaId uint32,
isRoot bool) (resp *proto.BatchSetMetaserverQuotaResponse, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("batchSetInodeQuota", err, bgTime, 1)
}()
req := &proto.BatchSetMetaserverQuotaReuqest{
PartitionId: mp.PartitionID,
Inodes: inodes,
QuotaId: quotaId,
IsRoot: isRoot,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaBatchSetInodeQuota
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("batchSetInodeQuota MarshalData req [%v] fail.", req)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("batchSetInodeQuota: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status := parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("batchSetInodeQuota: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp = new(proto.BatchSetMetaserverQuotaResponse)
resp.InodeRes = make(map[uint64]uint8, 0)
if err = packet.UnmarshalData(resp); err != nil {
log.LogErrorf("batchSetInodeQuota: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
log.LogInfof("batchSetInodeQuota inodes [%v] quota [%v] resp [%v] success.", inodes, quotaId, resp)
return
}
func (mw *MetaWrapper) batchDeleteInodeQuota(mp *MetaPartition, inodes []uint64,
quotaId uint32) (resp *proto.BatchDeleteMetaserverQuotaResponse, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("batchDeleteInodeQuota", err, bgTime, 1)
}()
req := &proto.BatchDeleteMetaserverQuotaReuqest{
PartitionId: mp.PartitionID,
Inodes: inodes,
QuotaId: quotaId,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaBatchDeleteInodeQuota
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("batchDeleteInodeQuota MarshalData req [%v] fail.", req)
return
}
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("batchDeleteInodeQuota: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status := parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("batchDeleteInodeQuota: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp = new(proto.BatchDeleteMetaserverQuotaResponse)
resp.InodeRes = make(map[uint64]uint8, 0)
if err = packet.UnmarshalData(resp); err != nil {
log.LogErrorf("batchSetInodeQuota: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
log.LogInfof("batchDeleteInodeQuota inodes [%v] quota [%v] resp [%v] success.",
inodes, quotaId, resp)
return
}
func (mw *MetaWrapper) getInodeQuota(mp *MetaPartition, inode uint64) (quotaInfos map[uint32]*proto.MetaQuotaInfo, err error) {
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("getInodeQuota", err, bgTime, 1)
}()
req := &proto.GetInodeQuotaRequest{
PartitionId: mp.PartitionID,
Inode: inode,
}
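// Serve from the local quota cache first; a hit avoids a round trip to the meta partition.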
qcInfo := mw.qc.Get(inode)
if qcInfo != nil {
return qcInfo.quotaInfos, nil
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaGetInodeQuota
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("getInodeQuota: req(%v) err(%v)", *req, err)
return
}
log.LogDebugf("getInodeQuota: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
metric := exporter.NewTPCnt(packet.GetOpMsg())
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("getInodeQuota: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status := parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("getInodeQuota: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.GetInodeQuotaResponse)
if err = packet.UnmarshalData(resp); err != nil {
log.LogErrorf("getInodeQuota: packet(%v) mp(%v) req(%v) err(%v) PacketData(%v)", packet, mp, *req, err, string(packet.Data))
return
}
quotaInfos = resp.MetaQuotaInfoMap
var qinfo QuotaCacheInfo
qinfo.quotaInfos = quotaInfos
qinfo.inode = inode
mw.qc.Put(inode, &qinfo)
log.LogDebugf("getInodeQuota: req(%v) resp(%v) err(%v)", *req, *resp, err)
return
}
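// applyQuota walks the directory tree rooted at parentIno and applies quotaId to every
// inode under it (including parentIno itself when first is true). Discovered inodes are
// accumulated in *inodes and flushed to the meta layer in batches of maxInodes via
// BatchSetInodeQuota_ll; *totalInodeCount and *curInodeCount track overall and in-batch
// progress.
//
// A minimal invocation sketch (hypothetical caller; values are assumptions):
//
//	var total, cur uint64
//	inodes := make([]uint64, 0, 128)
//	err := mw.applyQuota(rootIno, quotaId, &total, &cur, &inodes, 128, true)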
func (mw *MetaWrapper) applyQuota(parentIno uint64, quotaId uint32, totalInodeCount *uint64, curInodeCount *uint64, inodes *[]uint64,
maxInodes uint64, first bool) (err error) {
if first {
var rootInodes []uint64
var ret map[uint64]uint8
rootInodes = append(rootInodes, parentIno)
ret, err = mw.BatchSetInodeQuota_ll(rootInodes, quotaId, true)
if err != nil {
return
}
if status, ok := ret[parentIno]; ok {
if status != proto.OpOk {
if status == proto.OpNotExistErr {
err = fmt.Errorf("apply inode %v is not exist.", parentIno)
} else {
err = fmt.Errorf("apply inode %v failed, status: %v.", parentIno, status)
}
return
}
}
*totalInodeCount = *totalInodeCount + 1
}
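// Page through the directory with ReadDirLimit_ll, batching discovered inodes and
// recursing into sub-directories; the marker entry is skipped on follow-up pages.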
var defaultReaddirLimit uint64 = 1024
noMore := false
from := ""
for !noMore {
entries, err := mw.ReadDirLimit_ll(parentIno, from, defaultReaddirLimit)
if err != nil {
return err
}
entryNum := uint64(len(entries))
if entryNum == 0 || (from != "" && entryNum == 1) {
break
}
if entryNum < defaultReaddirLimit {
noMore = true
}
if from != "" {
entries = entries[1:]
}
for _, entry := range entries {
*inodes = append(*inodes, entry.Inode)
*curInodeCount = *curInodeCount + 1
*totalInodeCount = *totalInodeCount + 1
if *curInodeCount >= maxInodes {
mw.BatchSetInodeQuota_ll(*inodes, quotaId, false)
*curInodeCount = 0
*inodes = (*inodes)[:0]
}
if proto.IsDir(entry.Type) {
err = mw.applyQuota(entry.Inode, quotaId, totalInodeCount, curInodeCount, inodes, maxInodes, false)
if err != nil {
return err
}
}
}
from = entries[len(entries)-1].Name
}
if first && *curInodeCount > 0 {
mw.BatchSetInodeQuota_ll(*inodes, quotaId, false)
*curInodeCount = 0
*inodes = (*inodes)[:0]
}
return
}
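// revokeQuota mirrors applyQuota: it walks the tree rooted at parentIno and removes
// quotaId from every inode under it, flushing batches of maxInodes via BatchDeleteInodeQuota_ll.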
func (mw *MetaWrapper) revokeQuota(parentIno uint64, quotaId uint32, totalInodeCount *uint64, curInodeCount *uint64, inodes *[]uint64,
maxInodes uint64, first bool) (err error) {
if first {
var rootInodes []uint64
rootInodes = append(rootInodes, parentIno)
_, err = mw.BatchDeleteInodeQuota_ll(rootInodes, quotaId)
if err != nil {
return
}
*totalInodeCount = *totalInodeCount + 1
}
var defaultReaddirLimit uint64 = 1024
noMore := false
from := ""
for !noMore {
entries, err := mw.ReadDirLimit_ll(parentIno, from, defaultReaddirLimit)
if err != nil {
return err
}
entryNum := uint64(len(entries))
if entryNum == 0 || (from != "" && entryNum == 1) {
break
}
if entryNum < defaultReaddirLimit {
noMore = true
}
if from != "" {
entries = entries[1:]
}
for _, entry := range entries {
*inodes = append(*inodes, entry.Inode)
*curInodeCount = *curInodeCount + 1
*totalInodeCount = *totalInodeCount + 1
if *curInodeCount >= maxInodes {
mw.BatchDeleteInodeQuota_ll(*inodes, quotaId)
*curInodeCount = 0
*inodes = (*inodes)[:0]
}
if proto.IsDir(entry.Type) {
err = mw.revokeQuota(entry.Inode, quotaId, totalInodeCount, curInodeCount, inodes, maxInodes, false)
if err != nil {
return err
}
}
}
from = entries[len(entries)-1].Name
}
if first && *curInodeCount > 0 {
mw.BatchDeleteInodeQuota_ll(*inodes, quotaId)
*curInodeCount = 0
*inodes = (*inodes)[:0]
}
return
}
func (mw *MetaWrapper) consumeUniqID(mp *MetaPartition) (status int, uniqid uint64, err error) {
pid := mp.PartitionID
mw.uniqidRangeMutex.Lock()
defer mw.uniqidRangeMutex.Unlock()
id, ok := mw.uniqidRangeMap[pid]
if ok {
if id.cur < id.end {
status = statusOK
uniqid = id.cur
id.cur = id.cur + 1
return
}
}
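// Local range exhausted (or not yet allocated): fetch a fresh range of maxUniqID ids
// from the meta partition and hand out its first id.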
status, start, err := mw.getUniqID(mp, maxUniqID)
if err != nil || status != statusOK {
return status, 0, err
}
uniqid = start
if ok {
id.cur = start + 1
id.end = start + maxUniqID
} else {
mw.uniqidRangeMap[pid] = &uniqidRange{start + 1, start + maxUniqID}
}
return
}
func (mw *MetaWrapper) getUniqID(mp *MetaPartition, num uint32) (status int, start uint64, err error) {
req := &proto.GetUniqIDRequest{
VolName: mw.volname,
PartitionID: mp.PartitionID,
Num: num,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpMetaGetUniqID
packet.PartitionID = mp.PartitionID
err = packet.MarshalData(req)
if err != nil {
return
}
packet, err = mw.sendToMetaPartition(mp, packet)
if err != nil {
log.LogErrorf("getUniqID: packet(%v) mp(%v) req(%v) err(%v)", packet, mp, *req, err)
return
}
status = parseStatus(packet.ResultCode)
if status != statusOK {
log.LogErrorf("getUniqID: packet(%v) mp(%v) req(%v) result(%v)", packet, mp, *req, packet.GetResultMsg())
return
}
resp := new(proto.GetUniqIDResponse)
err = packet.UnmarshalData(resp)
if err != nil {
log.LogErrorf("getUniqID: packet(%v) mp(%v) err(%v) PacketData(%v)", packet, mp, err, string(packet.Data))
return
}
start = resp.Start
return
}
func (mw *MetaWrapper) checkVerFromMeta(packet *proto.Packet) {
if packet.VerSeq <= mw.Client.GetLatestVer() {
return
}
log.LogDebugf("checkVerFromMeta.UpdateLatestVer.try update meta wrapper verSeq from %v to %v verlist[%v]", mw.Client.GetLatestVer(), packet.VerSeq, packet.VerList)
mw.Client.UpdateLatestVer(&proto.VolVersionInfoList{VerList: packet.VerList})
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package meta
import (
"fmt"
"github.com/cubefs/cubefs/util/btree"
)
type MetaPartition struct {
PartitionID uint64
Start uint64
End uint64
Members []string
LeaderAddr string
Status int8
}
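// Less orders partitions by their Start inode so that the wrapper's btree of ranges
// can be searched by inode number.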
func (mp *MetaPartition) Less(than btree.Item) bool {
that := than.(*MetaPartition)
return mp.Start < that.Start
}
func (mp *MetaPartition) Copy() btree.Item {
return mp
}
func (mp *MetaPartition) String() string {
return fmt.Sprintf("PartitionID(%v) Start(%v) End(%v) Members(%v) LeaderAddr(%v) Status(%v)", mp.PartitionID, mp.Start, mp.End, mp.Members, mp.LeaderAddr, mp.Status)
}
// Meta partition management
//
func (mw *MetaWrapper) addPartition(mp *MetaPartition) {
mw.partitions[mp.PartitionID] = mp
mw.ranges.ReplaceOrInsert(mp)
}
func (mw *MetaWrapper) deletePartition(mp *MetaPartition) {
delete(mw.partitions, mp.PartitionID)
mw.ranges.Delete(mp)
}
func (mw *MetaWrapper) replaceOrInsertPartition(mp *MetaPartition) {
mw.Lock()
defer mw.Unlock()
found, ok := mw.partitions[mp.PartitionID]
if ok {
mw.deletePartition(found)
}
mw.addPartition(mp)
return
}
func (mw *MetaWrapper) getPartitionByID(id uint64) *MetaPartition {
mw.RLock()
defer mw.RUnlock()
mp, ok := mw.partitions[id]
if !ok {
return nil
}
return mp
}
func (mw *MetaWrapper) getPartitionByInode(ino uint64) *MetaPartition {
var mp *MetaPartition
mw.RLock()
defer mw.RUnlock()
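// Descend from the partition with the largest Start <= ino and verify that ino
// actually falls within its [Start, End] range.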
pivot := &MetaPartition{Start: ino}
mw.ranges.DescendLessOrEqual(pivot, func(i btree.Item) bool {
mp = i.(*MetaPartition)
if ino > mp.End || ino < mp.Start {
mp = nil
}
// Visiting a single item is enough
return false
})
return mp
}
//func (mw *MetaWrapper) getRWPartitions() []*MetaPartition {
// rwPartitions := make([]*MetaPartition, 0)
// mw.RLock()
// defer mw.RUnlock()
// for _, mp := range mw.partitions {
// if mp.Status == proto.ReadWrite {
// rwPartitions = append(rwPartitions, mp)
// }
// }
// return rwPartitions
//}
func (mw *MetaWrapper) getRWPartitions() []*MetaPartition {
mw.RLock()
defer mw.RUnlock()
rwPartitions := mw.rwPartitions
if len(rwPartitions) == 0 {
rwPartitions = make([]*MetaPartition, 0)
for _, mp := range mw.partitions {
rwPartitions = append(rwPartitions, mp)
}
}
return rwPartitions
}
// getNextPartition returns the partition whose Start is larger than ino.
// Returns nil if there is no successive partition.
func (mw *MetaWrapper) getNextPartition(ino uint64) *MetaPartition {
var mp *MetaPartition
mw.RLock()
defer mw.RUnlock()
pivot := &MetaPartition{Start: ino + 1}
mw.ranges.AscendGreaterOrEqual(pivot, func(i btree.Item) bool {
mp = i.(*MetaPartition)
return false
})
return mp
}
func (mw *MetaWrapper) getLatestPartition() *MetaPartition {
mw.RLock()
defer mw.RUnlock()
return mw.ranges.Max().(*MetaPartition)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package meta
import (
"container/list"
"sync"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
const (
MinQuotaCacheEvictNum = 10
)
type QuotaCache struct {
sync.RWMutex
cache map[uint64]*list.Element
lruList *list.List
expiration time.Duration
maxElements int
}
type QuotaCacheInfo struct {
quotaInfos map[uint32]*proto.MetaQuotaInfo
expiration int64
inode uint64
}
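// NewQuotaCache builds an LRU cache of per-inode quota info. Entries expire after exp,
// the cache holds at most maxElements entries, and a background goroutine evicts expired
// entries on every exp tick.
//
// A minimal usage sketch (hypothetical caller; values are assumptions):
//
//	qc := NewQuotaCache(5*time.Minute, 10000)
//	qc.Put(ino, &QuotaCacheInfo{quotaInfos: infos, inode: ino})
//	if cached := qc.Get(ino); cached != nil {
//		// use cached.quotaInfos
//	}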
func NewQuotaCache(exp time.Duration, maxElements int) *QuotaCache {
qc := &QuotaCache{
cache: make(map[uint64]*list.Element),
lruList: list.New(),
expiration: exp,
maxElements: maxElements,
}
go qc.backgroundEviction()
return qc
}
func (qc *QuotaCache) Put(ino uint64, qinfo *QuotaCacheInfo) {
qc.Lock()
defer qc.Unlock()
old, ok := qc.cache[ino]
if ok {
qc.lruList.Remove(old)
delete(qc.cache, ino)
}
if qc.lruList.Len() >= qc.maxElements {
qc.evict(true)
}
qinfo.quotaSetExpiration(qc.expiration)
element := qc.lruList.PushFront(qinfo)
qc.cache[ino] = element
}
func (qc *QuotaCache) Get(ino uint64) *QuotaCacheInfo {
qc.RLock()
defer qc.RUnlock()
element, ok := qc.cache[ino]
if !ok {
return nil
}
info := element.Value.(*QuotaCacheInfo)
if info.quotaExpired() {
return nil
}
return info
}
func (qc *QuotaCache) Delete(ino uint64) {
qc.Lock()
defer qc.Unlock()
element, ok := qc.cache[ino]
if ok {
qc.lruList.Remove(element)
delete(qc.cache, ino)
}
}
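// evict removes entries from the LRU tail. In foreground mode (cache full on Put) it
// unconditionally drops up to MinQuotaCacheEvictNum entries; in background mode it only
// drops expired entries, stopping at the first unexpired one.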
func (qc *QuotaCache) evict(foreground bool) {
for i := 0; i < MinQuotaCacheEvictNum; i++ {
element := qc.lruList.Back()
if element == nil {
return
}
info := element.Value.(*QuotaCacheInfo)
if !foreground && !info.quotaExpired() {
return
}
qc.lruList.Remove(element)
delete(qc.cache, info.inode)
}
// For background eviction, continue evicting all remaining expired items from the cache
if foreground {
return
}
for i := 0; i < qc.maxElements; i++ {
element := qc.lruList.Back()
if element == nil {
break
}
info := element.Value.(*QuotaCacheInfo)
if !info.quotaExpired() {
break
}
qc.lruList.Remove(element)
delete(qc.cache, info.inode)
}
}
func (qc *QuotaCache) backgroundEviction() {
t := time.NewTicker(qc.expiration)
defer t.Stop()
for range t.C {
log.LogInfof("QuotaCache: start BG evict")
qc.Lock()
qc.evict(false)
qc.Unlock()
log.LogInfof("QuotaCache: end BG evict")
}
}
func (qinfo *QuotaCacheInfo) quotaSetExpiration(expiration time.Duration) {
qinfo.expiration = time.Now().Add(expiration).UnixNano()
}
func (qinfo *QuotaCacheInfo) quotaExpired() bool {
return time.Now().UnixNano() > qinfo.expiration
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package meta
import (
"errors"
"fmt"
"sync"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/exporter"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/stat"
)
type Transaction struct {
txInfo *proto.TransactionInfo
Started bool
status int
onCommitFuncs []func()
onRollbackFuncs []func()
sync.RWMutex
}
func (tx *Transaction) SetTxID(clientId uint64) {
tx.txInfo.TxID = genTransactionId(clientId)
}
func (tx *Transaction) GetTxID() string {
tx.RLock()
defer tx.RUnlock()
return tx.txInfo.TxID
}
func (tx *Transaction) SetTmID(tmID uint64) {
tx.txInfo.TmID = int64(tmID)
}
func (tx *Transaction) AddInode(inode *proto.TxInodeInfo) error {
tx.Lock()
defer tx.Unlock()
if tx.Started {
return errors.New("transaction already started")
} else {
tx.txInfo.TxInodeInfos[inode.GetKey()] = inode
}
return nil
}
func (tx *Transaction) AddDentry(dentry *proto.TxDentryInfo) error {
tx.Lock()
defer tx.Unlock()
if tx.Started {
return errors.New("transaction already started")
} else {
tx.txInfo.TxDentryInfos[dentry.GetKey()] = dentry
}
return nil
}
// NewTransaction returns a `Transaction` with a timeout (in seconds) after which the transaction
// will be rolled back if it has not yet completed.
func NewTransaction(timeout int64, txType uint32) (tx *Transaction) {
if timeout == 0 {
timeout = proto.DefaultTransactionTimeout
}
return &Transaction{
onCommitFuncs: make([]func(), 0),
onRollbackFuncs: make([]func(), 0),
txInfo: proto.NewTransactionInfo(timeout, txType),
}
}
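// Usage sketch (illustrative; the host addresses, parentIno, parentMpID and inoMpID
// values are placeholders): a transaction is assembled from the dentry/inode items it
// touches before any request is sent, and callbacks can be attached for both outcomes.
//
//	tx := NewTransaction(0, proto.TxTypeCreate) // 0 falls back to proto.DefaultTransactionTimeout
//	den := proto.NewTxDentryInfo("h1:17210,h2:17210", parentIno, "f1", parentMpID)
//	ino := proto.NewTxInodeInfo("h3:17210", 0, inoMpID)
//	err := tx.AddDentry(den)
//	if err == nil {
//		err = tx.AddInode(ino)
//	}
//	tx.SetOnCommit(func() { /* e.g. invalidate a local dentry cache */ })
//	tx.SetOnRollback(func() { /* undo client-side bookkeeping */ })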
func (tx *Transaction) OnExecuted(status int, respTxInfo *proto.TransactionInfo) {
tx.Lock()
defer tx.Unlock()
tx.status = status
if tx.status == statusOK {
if !tx.Started {
tx.Started = true
}
if tx.txInfo.TxID == "" && respTxInfo != nil {
tx.txInfo = respTxInfo
}
}
}
func (tx *Transaction) SetOnCommit(job func()) {
tx.onCommitFuncs = append(tx.onCommitFuncs, job)
}
func (tx *Transaction) SetOnRollback(job func()) {
tx.onRollbackFuncs = append(tx.onRollbackFuncs, job)
// tx.onRollback = job
}
func (tx *Transaction) OnDone(err error, mw *MetaWrapper) (newErr error) {
// commit or rollback depending on status
newErr = err
if !tx.Started {
return
}
if err != nil {
log.LogDebugf("OnDone: rollback, tx %s", tx.txInfo.TxID)
tx.Rollback(mw)
} else {
log.LogDebugf("OnDone: commit, tx %s", tx.txInfo.TxID)
newErr = tx.Commit(mw)
}
return
}
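// Lifecycle sketch (illustrative; `mw` is an initialized *MetaWrapper and `doTxOp`
// stands for whatever meta operation is executed under the transaction):
//
//	tx, err := NewCreateTransaction(parentMp, inoMp, parentIno, "f1", 0, proto.TxTypeCreate)
//	if err == nil {
//		err = doTxOp(tx)         // RM requests; tx.OnExecuted(...) marks the tx as started
//		err = tx.OnDone(err, mw) // commit on success, rollback on failure
//	}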
// Commit notifies all the RMs (related meta partitions) that the transaction has completed successfully,
// so the corresponding transaction items can be removed.
func (tx *Transaction) Commit(mw *MetaWrapper) (err error) {
tmMP := mw.getPartitionByID(uint64(tx.txInfo.TmID))
if tmMP == nil {
log.LogErrorf("Transaction commit: No TM partition, TmID(%v), txID(%v)", tx.txInfo.TmID, tx.txInfo.TxID)
return fmt.Errorf("transaction commit: can't find target mp for tx, mpId %d", tx.txInfo.TmID)
}
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("txCommit", err, bgTime, 1)
}()
metric := exporter.NewTPCnt("OpTxCommit")
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
req := &proto.TxApplyRequest{
TxID: tx.txInfo.TxID,
TmID: uint64(tx.txInfo.TmID),
TxApplyType: proto.TxCommit,
// TxInfo: tx.txInfo,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpTxCommit
packet.PartitionID = tmMP.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("Transaction commit: TmID(%v), txID(%v), req(%v) err(%v)",
tx.txInfo.TmID, tx.txInfo.TxID, *req, err)
return
}
packet, err = mw.sendToMetaPartition(tmMP, packet)
if err != nil {
log.LogErrorf("Transaction commit: txID(%v), packet(%v) mp(%v) req(%v) err(%v)",
tx.txInfo.TxID, packet, tmMP, *req, err)
return
}
status := parseStatus(packet.ResultCode)
if status != statusOK {
err = errors.New(packet.GetResultMsg())
log.LogErrorf("Transaction commit failed: TmID(%v), txID(%v), packet(%v) mp(%v) req(%v) result(%v)",
tx.txInfo.TmID, tx.txInfo.TxID, packet, tmMP, *req, packet.GetResultMsg())
return
}
for _, job := range tx.onCommitFuncs {
job()
}
if log.EnableDebug() {
log.LogDebugf("Transaction commit succesfully: TmID(%v), txID(%v), packet(%v) mp(%v) req(%v) result(%v)",
tx.txInfo.TmID, tx.txInfo.TxID, packet, tmMP, *req, packet.GetResultMsg())
}
return
}
// Rollback notifies all the RMs (related meta partitions) that the transaction has been cancelled,
// so the corresponding transaction items should be rolled back to their previous (pre-transaction) state.
func (tx *Transaction) Rollback(mw *MetaWrapper) {
tmMP := mw.getPartitionByID(uint64(tx.txInfo.TmID))
if tmMP == nil {
log.LogWarnf("Transaction Rollback: No TM partition, TmID(%v), txID(%v)", tx.txInfo.TmID, tx.txInfo.TxID)
return
}
var err error
bgTime := stat.BeginStat()
defer func() {
stat.EndStat("txRollback", err, bgTime, 1)
}()
req := &proto.TxApplyRequest{
TxID: tx.txInfo.TxID,
TmID: uint64(tx.txInfo.TmID),
TxApplyType: proto.TxRollback,
// TxInfo: tx.txInfo,
}
packet := proto.NewPacketReqID()
packet.Opcode = proto.OpTxRollback
packet.PartitionID = tmMP.PartitionID
err = packet.MarshalData(req)
if err != nil {
log.LogErrorf("Transaction Rollback: TmID(%v), txID(%v), req(%v) err(%v)",
tx.txInfo.TmID, tx.txInfo.TxID, *req, err)
return
}
metric := exporter.NewTPCnt("OpTxRollback")
defer func() {
metric.SetWithLabels(err, map[string]string{exporter.Vol: mw.volname})
}()
packet, err = mw.sendToMetaPartition(tmMP, packet)
if err != nil {
log.LogErrorf("Transaction Rollback: txID(%v), packet(%v) mp(%v) req(%v) err(%v)",
tx.txInfo.TxID, packet, tmMP, *req, err)
return
}
status := parseStatus(packet.ResultCode)
if status != statusOK {
log.LogErrorf("Transaction Rollback failed: TmID(%v), txID(%v), packet(%v) mp(%v) req(%v) result(%v)",
tx.txInfo.TmID, tx.txInfo.TxID, packet, tmMP, *req, packet.GetResultMsg())
return
}
for _, job := range tx.onRollbackFuncs {
job()
}
if log.EnableDebug() {
log.LogDebugf("Transaction Rollback successfully: TmID(%v), txID(%v), packet(%v) mp(%v) req(%v) result(%v)",
tx.txInfo.TmID, tx.txInfo.TxID, packet, tmMP, *req, packet.GetResultMsg())
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package meta
import (
"fmt"
"sync/atomic"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
var txId uint64 = 1
func genTransactionId(clientId uint64) string {
return fmt.Sprintf("%d_%d", clientId, atomic.AddUint64(&txId, 1))
}
func getMembersFromMp(parentMp *MetaPartition) string {
members := parentMp.LeaderAddr
for _, addr := range parentMp.Members {
if addr == parentMp.LeaderAddr {
continue
}
if members == "" {
members += addr
} else {
members += "," + addr
}
}
return members
}
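// Example (illustrative; the addresses are placeholders): the leader address always
// comes first, followed by the remaining members, comma separated.
//
//	mp := &MetaPartition{LeaderAddr: "h1:17210", Members: []string{"h1:17210", "h2:17210"}}
//	getMembersFromMp(mp) // "h1:17210,h2:17210"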
func NewCreateTransaction(parentMp, inoMp *MetaPartition, parentID uint64, name string, txTimeout int64, txType uint32) (tx *Transaction, err error) {
// tx = NewTransaction(txTimeout, proto.TxTypeCreate)
tx = NewTransaction(txTimeout, txType)
members := getMembersFromMp(parentMp)
if members == "" {
return nil, fmt.Errorf("invalid parent metapartition")
}
inoMembers := getMembersFromMp(inoMp)
txDentryInfo := proto.NewTxDentryInfo(members, parentID, name, parentMp.PartitionID)
txParInoInfo := proto.NewTxInodeInfo(inoMembers, 0, inoMp.PartitionID)
if err = tx.AddDentry(txDentryInfo); err != nil {
return nil, err
}
if err = tx.AddInode(txParInoInfo); err != nil {
return nil, err
}
if log.EnableDebug() {
log.LogDebugf("NewCreateTransaction: txInfo(%v) parentMp", tx.txInfo)
}
return tx, nil
}
func NewDeleteTransaction(
denMp *MetaPartition, parentID uint64, name string,
inoMp *MetaPartition, ino uint64, txTimeout int64) (tx *Transaction, err error) {
tx = NewTransaction(txTimeout, proto.TxTypeRemove)
denMembers := getMembersFromMp(denMp)
if denMembers == "" {
return nil, fmt.Errorf("invalid parent metapartition")
}
inoMembers := getMembersFromMp(inoMp)
if inoMembers == "" {
return nil, fmt.Errorf("invalid parent metapartition")
}
txInoInfo := proto.NewTxInodeInfo(inoMembers, ino, inoMp.PartitionID)
txDentryInfo := proto.NewTxDentryInfo(denMembers, parentID, name, denMp.PartitionID)
if err = tx.AddInode(txInoInfo); err != nil {
return nil, err
}
if err = tx.AddDentry(txDentryInfo); err != nil {
return nil, err
}
if log.EnableDebug() {
log.LogDebugf("NewDeleteTransaction: tx(%v)", tx)
}
return tx, nil
}
func NewRenameTransaction(srcMp *MetaPartition, srcDenParentID uint64, srcName string,
dstMp *MetaPartition, dstDenParentID uint64, dstName string, txTimeout int64) (tx *Transaction, err error) {
tx = NewTransaction(txTimeout, proto.TxTypeRename)
srcMembers := getMembersFromMp(srcMp)
if srcMembers == "" {
return nil, fmt.Errorf("invalid parent metapartition")
}
dstMembers := getMembersFromMp(dstMp)
if dstMembers == "" {
return nil, fmt.Errorf("invalid parent metapartition")
}
txSrcDentryInfo := proto.NewTxDentryInfo(srcMembers, srcDenParentID, srcName, srcMp.PartitionID)
txDstDentryInfo := proto.NewTxDentryInfo(dstMembers, dstDenParentID, dstName, dstMp.PartitionID)
if err = tx.AddDentry(txSrcDentryInfo); err != nil {
return nil, err
}
if err = tx.AddDentry(txDstDentryInfo); err != nil {
return nil, err
}
if log.EnableDebug() {
log.LogDebugf("NewRenameTransaction: txInfo(%v)", tx.txInfo)
}
return tx, nil
}
func RenameTxReplaceInode(tx *Transaction, inoMp *MetaPartition, ino uint64) (err error) {
inoMembers := getMembersFromMp(inoMp)
if inoMembers == "" {
return fmt.Errorf("invalid parent metapartition")
}
txInoInfo := proto.NewTxInodeInfo(inoMembers, ino, inoMp.PartitionID)
_ = tx.AddInode(txInoInfo)
log.LogDebugf("RenameTxReplaceInode: txInfo(%v)", tx.txInfo)
return nil
}
func NewLinkTransaction(
denMp *MetaPartition, parentID uint64, name string,
inoMp *MetaPartition, ino uint64, txTimeout int64) (tx *Transaction, err error) {
tx = NewTransaction(txTimeout, proto.TxTypeLink)
denMembers := getMembersFromMp(denMp)
if denMembers == "" {
return nil, fmt.Errorf("invalid parent metapartition")
}
inoMembers := getMembersFromMp(inoMp)
if inoMembers == "" {
return nil, fmt.Errorf("invalid parent metapartition")
}
txInoInfo := proto.NewTxInodeInfo(inoMembers, ino, inoMp.PartitionID)
txDentryInfo := proto.NewTxDentryInfo(denMembers, parentID, name, denMp.PartitionID)
if err = tx.AddInode(txInoInfo); err != nil {
return nil, err
}
if err = tx.AddDentry(txDentryInfo); err != nil {
return nil, err
}
if log.EnableDebug() {
log.LogDebugf("NewLinkTransaction: tx(%v)", tx)
}
return tx, nil
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package meta
import (
"crypto/md5"
"encoding/base64"
"encoding/hex"
"encoding/json"
"fmt"
"os"
"strings"
"sync/atomic"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/sdk/master"
"github.com/cubefs/cubefs/util/cryptoutil"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
"github.com/jacobsa/daemonize"
)
const (
MaxSendToMaster = 3
)
type VolumeView struct {
Name string
Owner string
MetaPartitions []*MetaPartition
OSSSecure *OSSSecure
CreateTime int64
DeleteLockTime int64
}
type OSSSecure struct {
AccessKey string
SecretKey string
}
type VolStatInfo = proto.VolStatInfo
func (mw *MetaWrapper) fetchVolumeView() (view *VolumeView, err error) {
var vv *proto.VolView
if mw.ownerValidation {
var authKey string
if authKey, err = calculateAuthKey(mw.owner); err != nil {
return
}
if mw.authenticate {
var (
tokenMessage string
ts int64
)
mw.accessToken.Type = proto.MsgMasterFetchVolViewReq
if tokenMessage, ts, err = genMasterToken(mw.accessToken, mw.sessionKey); err != nil {
log.LogWarnf("fetchVolumeView generate token failed: err(%v)", err)
return nil, err
}
var decoder master.Decoder = func(raw []byte) ([]byte, error) {
return mw.parseAndVerifyResp(raw, ts)
}
if vv, err = mw.mc.ClientAPI().GetVolumeWithAuthnode(mw.volname, authKey, tokenMessage, decoder); err != nil {
return
}
} else {
if vv, err = mw.mc.ClientAPI().GetVolume(mw.volname, authKey); err != nil {
return
}
}
} else {
if vv, err = mw.mc.ClientAPI().GetVolumeWithoutAuthKey(mw.volname); err != nil {
return
}
}
if vv.Status == 1 {
log.LogErrorf("fetchVolumeView: volume has been marked for deletion: volume(%v) status(%v - 0:normal/1:markDelete)",
vv.Name, vv.Status)
return nil, proto.ErrVolNotExists
}
convert := func(volView *proto.VolView) *VolumeView {
result := &VolumeView{
Name: volView.Name,
Owner: volView.Owner,
MetaPartitions: make([]*MetaPartition, len(volView.MetaPartitions)),
OSSSecure: &OSSSecure{},
CreateTime: volView.CreateTime,
DeleteLockTime: volView.DeleteLockTime,
}
if volView.OSSSecure != nil {
result.OSSSecure.AccessKey = volView.OSSSecure.AccessKey
result.OSSSecure.SecretKey = volView.OSSSecure.SecretKey
}
for i, mp := range volView.MetaPartitions {
result.MetaPartitions[i] = &MetaPartition{
PartitionID: mp.PartitionID,
Start: mp.Start,
End: mp.End,
Members: mp.Members,
LeaderAddr: mp.LeaderAddr,
Status: mp.Status,
}
}
return result
}
view = convert(vv)
return
}
// fetch and update cluster info if successful
func (mw *MetaWrapper) updateClusterInfo() (err error) {
var info *proto.ClusterInfo
if info, err = mw.mc.AdminAPI().GetClusterInfo(); err != nil {
log.LogWarnf("updateClusterInfo: get cluster info fail: err(%v) volume(%v)", err, mw.volname)
return
}
log.LogInfof("updateClusterInfo: get cluster info: cluster(%v) localIP(%v) volume(%v)",
info.Cluster, info.Ip, mw.volname)
mw.cluster = info.Cluster
mw.localIP = info.Ip
return
}
func (mw *MetaWrapper) updateDirChildrenNumLimit() (err error) {
var clusterInfo *proto.ClusterInfo
clusterInfo, err = mw.mc.AdminAPI().GetClusterInfo()
if err != nil {
return
}
if clusterInfo.DirChildrenNumLimit < proto.MinDirChildrenNumLimit {
log.LogWarnf("updateDirChildrenNumLimit: DirChildrenNumLimit probably not enabled on master, set to default value(%v)",
proto.DefaultDirChildrenNumLimit)
atomic.StoreUint32(&mw.DirChildrenNumLimit, proto.DefaultDirChildrenNumLimit)
} else {
atomic.StoreUint32(&mw.DirChildrenNumLimit, clusterInfo.DirChildrenNumLimit)
log.LogInfof("updateDirChildrenNumLimit: DirChildrenNumLimit(%v)", mw.DirChildrenNumLimit)
}
return
}
func (mw *MetaWrapper) updateVolStatInfo() (err error) {
var info *proto.VolStatInfo
if info, err = mw.mc.ClientAPI().GetVolumeStat(mw.volname); err != nil {
log.LogWarnf("updateVolStatInfo: get volume status fail: volume(%v) err(%v)", mw.volname, err)
return
}
if info.UsedSize > info.TotalSize {
log.LogInfof("volume(%v) queried usedSize(%v) is larger than totalSize(%v), force set usedSize as totalSize",
mw.volname, info.UsedSize, info.TotalSize)
info.UsedSize = info.TotalSize
}
atomic.StoreUint64(&mw.totalSize, info.TotalSize)
atomic.StoreUint64(&mw.usedSize, info.UsedSize)
atomic.StoreUint64(&mw.inodeCount, info.InodeCount)
log.LogInfof("VolStatInfo: volume(%v) info(%v)", mw.volname, info)
return
}
func (mw *MetaWrapper) updateMetaPartitions() error {
view, err := mw.fetchVolumeView()
if err != nil {
log.LogInfof("updateMetaPartition volume(%v) error: %v", mw.volname, err.Error())
switch err {
case proto.ErrExpiredTicket:
// TODO: bad logic, remove later (Mofei Zhang)
if e := mw.updateTicket(); e != nil {
log.LogFlush()
daemonize.SignalOutcome(err)
os.Exit(1)
}
log.LogInfof("updateTicket: ok!")
return err
case proto.ErrInvalidTicket:
// TODO: bad logic, remove later (Mofei Zhang)
log.LogFlush()
daemonize.SignalOutcome(err)
os.Exit(1)
default:
return err
}
}
rwPartitions := make([]*MetaPartition, 0)
for _, mp := range view.MetaPartitions {
mw.replaceOrInsertPartition(mp)
log.LogInfof("updateMetaPartition: mp(%v)", mp)
if mp.Status == proto.ReadWrite {
rwPartitions = append(rwPartitions, mp)
}
}
mw.ossSecure = view.OSSSecure
mw.volCreateTime = view.CreateTime
mw.volDeleteLockTime = view.DeleteLockTime
if len(rwPartitions) == 0 {
log.LogInfof("updateMetaPartition: no rw partitions")
return nil
}
mw.Lock()
mw.rwPartitions = rwPartitions
mw.Unlock()
return nil
}
func (mw *MetaWrapper) forceUpdateMetaPartitions() error {
// Only one forceUpdateMetaPartition is allowed in a specific period of time.
if ok := mw.forceUpdateLimit.AllowN(time.Now(), MinForceUpdateMetaPartitionsInterval); !ok {
return errors.New("Force update meta partitions throttled!")
}
return mw.updateMetaPartitions()
}
// Should be protected by partMutex, otherwise the caller might not be signaled.
func (mw *MetaWrapper) triggerAndWaitForceUpdate() {
mw.partMutex.Lock()
select {
case mw.forceUpdate <- struct{}{}:
default:
}
mw.partCond.Wait()
mw.partMutex.Unlock()
}
func (mw *MetaWrapper) refresh() {
var err error
t := time.NewTimer(RefreshMetaPartitionsInterval)
defer t.Stop()
for {
select {
case <-t.C:
if err = mw.updateMetaPartitions(); err != nil {
mw.onAsyncTaskError.OnError(err)
log.LogErrorf("updateMetaPartition fail cause: %v", err)
}
if err = mw.updateVolStatInfo(); err != nil {
mw.onAsyncTaskError.OnError(err)
log.LogErrorf("updateVolStatInfo fail cause: %v", err)
}
if err = mw.updateDirChildrenNumLimit(); err != nil {
mw.onAsyncTaskError.OnError(err)
log.LogErrorf("updateDirChildrenNumLimit fail cause: %v", err)
}
t.Reset(RefreshMetaPartitionsInterval)
case <-mw.forceUpdate:
log.LogInfof("Start forceUpdateMetaPartitions")
mw.partMutex.Lock()
if err = mw.forceUpdateMetaPartitions(); err == nil {
if err = mw.updateVolStatInfo(); err == nil {
t.Reset(RefreshMetaPartitionsInterval)
}
}
mw.partMutex.Unlock()
mw.partCond.Broadcast()
log.LogInfof("End forceUpdateMetaPartitions: err(%v)", err)
case <-mw.closeCh:
return
}
}
}
func calculateAuthKey(key string) (authKey string, err error) {
h := md5.New()
_, err = h.Write([]byte(key))
if err != nil {
log.LogErrorf("action[calculateAuthKey] calculate auth key[%v] failed,err[%v]", key, err)
return
}
cipherStr := h.Sum(nil)
return strings.ToLower(hex.EncodeToString(cipherStr)), nil
}
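// Example (illustrative): the auth key is the lowercase hex MD5 of the volume owner,
// so it is always 32 characters long.
//
//	authKey, _ := calculateAuthKey("") // "d41d8cd98f00b204e9800998ecf8427e"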
func genMasterToken(req proto.APIAccessReq, key string) (message string, ts int64, err error) {
var (
sessionKey []byte
data []byte
)
if sessionKey, err = cryptoutil.Base64Decode(key); err != nil {
return
}
if req.Verifier, ts, err = cryptoutil.GenVerifier(sessionKey); err != nil {
return
}
if data, err = json.Marshal(req); err != nil {
return
}
message = base64.StdEncoding.EncodeToString(data)
return
}
func (mw *MetaWrapper) updateTicket() error {
ticket, err := mw.ac.API().GetTicket(mw.owner, mw.ticketMess.ClientKey, proto.MasterServiceID)
if err != nil {
return errors.Trace(err, "Update ticket from authnode failed!")
}
mw.accessToken.Ticket = ticket.Ticket
mw.sessionKey = ticket.SessionKey
return nil
}
func (mw *MetaWrapper) parseAndVerifyResp(body []byte, ts int64) (dataBody []byte, err error) {
var resp proto.MasterAPIAccessResp
if resp, err = mw.parseRespWithAuth(body); err != nil {
log.LogWarnf("fetchVolumeView parse response failed: err(%v) body(%v)", err, string(body))
return nil, err
}
if err = proto.VerifyAPIRespComm(&(resp.APIResp), mw.accessToken.Type, mw.owner, proto.MasterServiceID, ts); err != nil {
log.LogWarnf("fetchVolumeView verify response: err(%v)", err)
return nil, err
}
viewBody := &struct {
Code int32 `json:"code"`
Msg string `json:"msg"`
Data json.RawMessage
}{}
if err = json.Unmarshal(resp.Data, viewBody); err != nil {
log.LogWarnf("VolViewCache unmarshal: err(%v) body(%v)", err, viewBody)
return nil, err
}
if viewBody.Code != 0 {
return nil, fmt.Errorf("request error, code[%d], msg[%s]", viewBody.Code, viewBody.Msg)
}
return viewBody.Data, err
}
func (mw *MetaWrapper) parseRespWithAuth(body []byte) (resp proto.MasterAPIAccessResp, err error) {
var (
message string
sessionKey []byte
plaintext []byte
)
if err = json.Unmarshal(body, &message); err != nil {
return
}
if sessionKey, err = cryptoutil.Base64Decode(mw.sessionKey); err != nil {
return
}
if plaintext, err = cryptoutil.DecodeMessage(message, sessionKey); err != nil {
return
}
if err = json.Unmarshal(plaintext, &resp); err != nil {
return
}
return
}
func (mw *MetaWrapper) updateQuotaInfoTick() {
mw.updateQuotaInfo()
ticker := time.NewTicker(10 * time.Second)
defer ticker.Stop()
for {
select {
case <-ticker.C:
mw.updateQuotaInfo()
case <-mw.closeCh:
return
}
}
}
func (mw *MetaWrapper) updateQuotaInfo() {
var volumeInfo *proto.SimpleVolView
volumeInfo, err := mw.mc.AdminAPI().GetVolumeSimpleInfo(mw.volname)
if err != nil {
return
}
mw.EnableQuota = volumeInfo.EnableQuota
if !mw.EnableQuota {
return
}
quotaInfos, err := mw.mc.AdminAPI().ListQuota(mw.volname)
if err != nil {
log.LogWarnf("updateQuotaInfo get quota info fail: vol [%v] err [%v]", mw.volname, err)
return
}
mw.QuotaLock.Lock()
defer mw.QuotaLock.Unlock()
mw.QuotaInfoMap = make(map[uint32]*proto.QuotaInfo)
for _, info := range quotaInfos {
mw.QuotaInfoMap[info.QuotaId] = info
log.LogDebugf("updateQuotaInfo quotaInfo [%v]", info)
}
}
func (mw *MetaWrapper) IsQuotaLimited(quotaIds []uint32) bool {
mw.QuotaLock.RLock()
defer mw.QuotaLock.RUnlock()
for _, quotaId := range quotaIds {
if info, isFind := mw.QuotaInfoMap[quotaId]; isFind {
if info.LimitedInfo.LimitedBytes {
log.LogDebugf("IsQuotaLimited quotaId [%v]", quotaId)
return true
}
}
log.LogDebugf("IsQuotaLimited false quota [%v]", quotaId)
}
return false
}
func (mw *MetaWrapper) GetQuotaFullPaths() (fullPaths []string) {
fullPaths = make([]string, 0)
mw.QuotaLock.RLock()
defer mw.QuotaLock.RUnlock()
for _, info := range mw.QuotaInfoMap {
for _, pathInfo := range info.PathInfos {
fullPaths = append(fullPaths, pathInfo.FullPath)
}
}
return fullPaths
}
func (mw *MetaWrapper) IsQuotaLimitedById(inodeId uint64, size bool, files bool) bool {
mp := mw.getPartitionByInode(inodeId)
if mp == nil {
log.LogErrorf("IsQuotaLimitedById: inodeId(%v)", inodeId)
return true
}
quotaInfos, err := mw.getInodeQuota(mp, inodeId)
if err != nil {
log.LogErrorf("IsQuotaLimitedById: get parent quota fail, inodeId(%v) err(%v)", inodeId, err)
return true
}
for quotaId := range quotaInfos {
if info, isFind := mw.QuotaInfoMap[quotaId]; isFind {
if size && info.LimitedInfo.LimitedBytes {
log.LogDebugf("IsQuotaLimitedById quotaId [%v]", quotaId)
return true
}
if files && info.LimitedInfo.LimitedFiles {
log.LogDebugf("IsQuotaLimitedById quotaId [%v]", quotaId)
return true
}
}
log.LogDebugf("IsQuotaLimitedById false quota [%v]", quotaId)
}
return false
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package storage
import (
"errors"
"fmt"
)
var (
ExtentHasBeenDeletedError = errors.New("extent has been deleted")
ParameterMismatchError = errors.New("parameter mismatch error")
NoAvailableExtentError = errors.New("no available extent")
NoBrokenExtentError = errors.New("no unavailable extent")
NoSpaceError = errors.New("no space left on the device")
TryAgainError = errors.New("try again")
CrcMismatchError = errors.New("packet Crc is incorrect")
NoLeaderError = errors.New("no raft leader")
ExtentNotFoundError = errors.New("extent does not exist")
ExtentExistsError = errors.New("extent already exists")
ExtentIsFullError = errors.New("extent is full")
BrokenExtentError = errors.New("extent has been broken")
BrokenDiskError = errors.New("disk has broken")
ForbidWriteError = errors.New("single replica decommission forbid write")
VerNotConsistentError = errors.New("ver not consistent")
SnapshotNeedNewExtentError = errors.New("snapshot need new extent error")
)
func newParameterError(format string, a ...interface{}) error {
return fmt.Errorf("parameter mismatch error: %s", fmt.Sprintf(format, a...))
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package storage
import (
"encoding/binary"
"fmt"
"hash/crc32"
"io"
"math"
"os"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
const (
ExtentOpenOpt = os.O_CREATE | os.O_RDWR | os.O_EXCL
ExtentHasClose = -1
SEEK_DATA = 3
SEEK_HOLE = 4
)
const (
ExtentMaxSize = 1024 * 1024 * 1024 * 1024 * 4 // 4TB
)
type ExtentInfo struct {
FileID uint64 `json:"fileId"`
Size uint64 `json:"size"`
Crc uint32 `json:"Crc"`
IsDeleted bool `json:"deleted"`
ModifyTime int64 `json:"modTime"` // random writes do not update the modify time
AccessTime int64 `json:"accessTime"`
Source string `json:"src"`
SnapshotDataOff uint64 `json:"snapSize"`
SnapPreAllocDataOff uint64 `json:"snapPreAllocSize"`
ApplyID uint64 `json:"applyID"`
}
func (ei *ExtentInfo) TotalSize() uint64 {
if ei.SnapshotDataOff > util.ExtentSize {
return ei.Size + (ei.SnapshotDataOff - util.ExtentSize)
}
return ei.Size
}
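// Worked example (illustrative sizes): only the snapshot bytes written beyond
// util.ExtentSize are added on top of Size.
//
//	ei := &ExtentInfo{Size: 64 << 20, SnapshotDataOff: util.ExtentSize + 4<<20}
//	ei.TotalSize() // 64MB + 4MB = 68MB; with SnapshotDataOff <= util.ExtentSize it is just Size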
func (ei *ExtentInfo) String() (m string) {
source := ei.Source
if source == "" {
source = "none"
}
return fmt.Sprintf("%v_%v_%v_%v_%v_%d_%d_%d", ei.FileID, ei.Size, ei.SnapshotDataOff, ei.IsDeleted, source, ei.ModifyTime, ei.AccessTime, ei.Crc)
}
// SortedExtentInfos defines a slice of ExtentInfo sortable by AccessTime (implements sort.Interface).
type SortedExtentInfos []*ExtentInfo
func (extInfos SortedExtentInfos) Len() int {
return len(extInfos)
}
func (extInfos SortedExtentInfos) Less(i, j int) bool {
return extInfos[i].AccessTime < extInfos[j].AccessTime
}
func (extInfos SortedExtentInfos) Swap(i, j int) {
extInfos[i], extInfos[j] = extInfos[j], extInfos[i]
}
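// Usage sketch (illustrative; `store` is an initialized *ExtentStore):
//
//	infos := store.DumpExtents()
//	sort.Sort(infos) // ascending AccessTime: infos[0] is the least recently accessed extent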
// Extent manages a single extent stored as one regular file on the local filesystem.
// This implementation keeps the header info and the data body in one single entry file.
// The header of an extent holds the inode value of this extent block and the CRCs of its data blocks.
type Extent struct {
file *os.File
filePath string
extentID uint64
modifyTime int64
accessTime int64
dataSize int64
hasClose int32
header []byte
snapshotDataOff uint64
sync.Mutex
}
// NewExtentInCore creates and returns a new extent instance.
func NewExtentInCore(name string, extentID uint64) *Extent {
e := new(Extent)
e.extentID = extentID
e.filePath = name
e.snapshotDataOff = util.ExtentSize
return e
}
func (e *Extent) String() string {
return fmt.Sprintf("%v_%v_%v", e.filePath, e.dataSize, e.snapshotDataOff)
}
func (e *Extent) GetSize() (int64, uint64) {
return e.dataSize, e.snapshotDataOff
}
func (e *Extent) HasClosed() bool {
return atomic.LoadInt32(&e.hasClose) == ExtentHasClose
}
// Close this extent and release FD.
func (e *Extent) Close() (err error) {
if e.HasClosed() {
return
}
if err = e.file.Close(); err != nil {
return
}
return
}
func (e *Extent) Exist() (exist bool) {
_, err := os.Stat(e.filePath)
if err != nil {
return os.IsExist(err)
}
return true
}
func (e *Extent) GetFile() *os.File {
return e.file
}
// InitToFS initializes the extent on the filesystem: it creates the entry file
// (ExtentOpenOpt fails if the file already exists) and resets the extent data size to zero.
func (e *Extent) InitToFS() (err error) {
if e.file, err = os.OpenFile(e.filePath, ExtentOpenOpt, 0o666); err != nil {
return err
}
if IsTinyExtent(e.extentID) {
e.dataSize = 0
return
}
atomic.StoreInt64(&e.modifyTime, time.Now().Unix())
atomic.StoreInt64(&e.accessTime, time.Now().Unix())
e.dataSize = 0
return
}
func (e *Extent) GetDataSize(statSize int64) (dataSize int64) {
var (
dataStart int64
holStart int64
curOff int64
err error
)
for {
// curOff is the hole start and the data end
curOff, err = e.file.Seek(holStart, SEEK_DATA)
if err != nil || curOff >= util.ExtentSize || (holStart > 0 && holStart == curOff) {
log.LogDebugf("GetDataSize statSize %v curOff %v dataStart %v holStart %v, err %v,path %v", statSize, curOff, dataStart, holStart, err, e.filePath)
break
}
log.LogDebugf("GetDataSize statSize %v curOff %v dataStart %v holStart %v, err %v,path %v", statSize, curOff, dataStart, holStart, err, e.filePath)
dataStart = curOff
curOff, err = e.file.Seek(dataStart, SEEK_HOLE)
if err != nil || curOff >= util.ExtentSize || dataStart == curOff {
log.LogDebugf("GetDataSize statSize %v curOff %v dataStart %v holStart %v, err %v,path %v", statSize, curOff, dataStart, holStart, err, e.filePath)
break
}
log.LogDebugf("GetDataSize statSize %v curOff %v dataStart %v holStart %v, err %v,path %v", statSize, curOff, dataStart, holStart, err, e.filePath)
holStart = curOff
}
log.LogDebugf("GetDataSize statSize %v curOff %v dataStart %v holStart %v, err %v,path %v", statSize, curOff, dataStart, holStart, err, e.filePath)
if holStart == 0 {
if statSize > util.ExtentSize {
return util.ExtentSize
}
return statSize
}
return holStart
}
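// Worked example (illustrative layout): for a file whose first util.ExtentSize bytes
// are [data 0..4MB) [hole 4MB..8MB) [data 8MB..12MB), the SEEK_DATA/SEEK_HOLE loop
// ends with holStart = 12MB, so GetDataSize returns 12MB, the end of the last data
// run below util.ExtentSize. If no hole is ever found (holStart stays 0), it returns
// statSize capped at util.ExtentSize.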
// RestoreFromFS restores the entity data and status from the file stored on the filesystem.
func (e *Extent) RestoreFromFS() (err error) {
if e.file, err = os.OpenFile(e.filePath, os.O_RDWR, 0o666); err != nil {
if strings.Contains(err.Error(), syscall.ENOENT.Error()) {
err = ExtentNotFoundError
}
return err
}
var info os.FileInfo
if info, err = e.file.Stat(); err != nil {
err = fmt.Errorf("stat file %v: %v", e.file.Name(), err)
return
}
if IsTinyExtent(e.extentID) {
watermark := info.Size()
if watermark%util.PageSize != 0 {
watermark = watermark + (util.PageSize - watermark%util.PageSize)
}
e.dataSize = watermark
return
}
e.dataSize = e.GetDataSize(info.Size())
e.snapshotDataOff = util.ExtentSize
if info.Size() > util.ExtentSize {
e.snapshotDataOff = uint64(info.Size())
}
atomic.StoreInt64(&e.modifyTime, info.ModTime().Unix())
ts := info.Sys().(*syscall.Stat_t)
atomic.StoreInt64(&e.accessTime, time.Unix(int64(ts.Atim.Sec), int64(ts.Atim.Nsec)).Unix())
return
}
// Size returns length of the extent (not including the header).
func (e *Extent) Size() (size int64) {
return e.dataSize
}
// ModifyTime returns the time when this extent was modified recently.
func (e *Extent) ModifyTime() int64 {
return atomic.LoadInt64(&e.modifyTime)
}
func IsRandomWrite(writeType int) bool {
return writeType == RandomWriteType
}
func IsAppendWrite(writeType int) bool {
return writeType == AppendWriteType
}
func IsAppendRandomWrite(writeType int) bool {
return writeType == AppendRandomWriteType
}
// WriteTiny performs write on a tiny extent.
func (e *Extent) WriteTiny(data []byte, offset, size int64, crc uint32, writeType int, isSync bool) (err error) {
e.Lock()
defer e.Unlock()
index := offset + size
if index >= ExtentMaxSize {
return ExtentIsFullError
}
if IsAppendWrite(writeType) && offset != e.dataSize {
return ParameterMismatchError
}
if _, err = e.file.WriteAt(data[:size], int64(offset)); err != nil {
return
}
if isSync {
if err = e.file.Sync(); err != nil {
return
}
}
if !IsAppendWrite(writeType) {
return
}
if index%util.PageSize != 0 {
index = index + (util.PageSize - index%util.PageSize)
}
e.dataSize = index
return
}
// Write writes data to an extent.
func (e *Extent) Write(data []byte, offset, size int64, crc uint32, writeType int, isSync bool, crcFunc UpdateCrcFunc, ei *ExtentInfo) (status uint8, err error) {
log.LogDebugf("action[Extent.Write] path %v offset %v size %v writeType %v", e.filePath, offset, size, writeType)
status = proto.OpOk
if IsTinyExtent(e.extentID) {
err = e.WriteTiny(data, offset, size, crc, writeType, isSync)
return
}
if err = e.checkWriteOffsetAndSize(writeType, offset, size); err != nil {
log.LogErrorf("action[Extent.Write] checkWriteOffsetAndSize offset %v size %v writeType %v err %v",
offset, size, writeType, err)
err = newParameterError("extent current size=%d write offset=%d write size=%d", e.dataSize, offset, size)
log.LogInfof("action[Extent.Write] newParameterError path %v offset %v size %v writeType %v err %v", e.filePath,
offset, size, writeType, err)
status = proto.OpTryOtherExtent
return
}
log.LogDebugf("action[Extent.Write] path %v offset %v size %v writeType %v", e.filePath, offset, size, writeType)
// Check if extent file size matches the write offset just in case
// multiple clients are writing concurrently.
e.Lock()
defer e.Unlock()
log.LogDebugf("action[Extent.Write] offset %v size %v writeType %v path %v", offset, size, writeType, e.filePath)
if IsAppendWrite(writeType) && e.dataSize != offset {
err = newParameterError("extent current size=%d write offset=%d write size=%d", e.dataSize, offset, size)
log.LogInfof("action[Extent.Write] newParameterError path %v offset %v size %v writeType %v err %v", e.filePath,
offset, size, writeType, err)
status = proto.OpTryOtherExtent
return
}
if IsAppendRandomWrite(writeType) {
if e.snapshotDataOff <= util.ExtentSize {
log.LogInfof("action[Extent.Write] truncate extent %v offset %v size %v writeType %v truncate err %v", e, offset, size, writeType, err)
if err = e.file.Truncate(util.ExtentSize); err != nil {
log.LogErrorf("action[Extent.Write] offset %v size %v writeType %v truncate err %v", offset, size, writeType, err)
return
}
}
}
if _, err = e.file.WriteAt(data[:size], int64(offset)); err != nil {
log.LogErrorf("action[Extent.Write] offset %v size %v writeType %v err %v", offset, size, writeType, err)
return
}
blockNo := offset / util.BlockSize
offsetInBlock := offset % util.BlockSize
defer func() {
log.LogDebugf("action[Extent.Write] offset %v size %v writeType %v path %v", offset, size, writeType, e.filePath)
if IsAppendWrite(writeType) {
atomic.StoreInt64(&e.modifyTime, time.Now().Unix())
e.dataSize = int64(math.Max(float64(e.dataSize), float64(offset+size)))
log.LogDebugf("action[Extent.Write] e %v offset %v size %v writeType %v", e, offset, size, writeType)
} else if IsAppendRandomWrite(writeType) {
atomic.StoreInt64(&e.modifyTime, time.Now().Unix())
e.snapshotDataOff = uint64(math.Max(float64(e.snapshotDataOff), float64(offset+size)))
}
log.LogDebugf("action[Extent.Write] offset %v size %v writeType %v dataSize %v snapshotDataOff %v",
offset, size, writeType, e.dataSize, e.snapshotDataOff)
}()
if isSync {
if err = e.file.Sync(); err != nil {
log.LogDebugf("action[Extent.Write] offset %v size %v writeType %v err %v",
offset, size, writeType, err)
return
}
}
if offsetInBlock == 0 && size == util.BlockSize {
err = crcFunc(e, int(blockNo), crc)
log.LogDebugf("action[Extent.Write] offset %v size %v writeType %v err %v", offset, size, writeType, err)
return
}
if offsetInBlock+size <= util.BlockSize {
err = crcFunc(e, int(blockNo), 0)
log.LogDebugf("action[Extent.Write] offset %v size %v writeType %v err %v", offset, size, writeType, err)
return
}
log.LogDebugf("action[Extent.Write] offset %v size %v writeType %v", offset, size, writeType)
if err = crcFunc(e, int(blockNo), 0); err == nil {
err = crcFunc(e, int(blockNo+1), 0)
}
return
}
// Read reads data from an extent.
func (e *Extent) Read(data []byte, offset, size int64, isRepairRead bool) (crc uint32, err error) {
log.LogDebugf("action[Extent.read] offset %v size %v extent %v", offset, size, e)
if IsTinyExtent(e.extentID) {
return e.ReadTiny(data, offset, size, isRepairRead)
}
if err = e.checkReadOffsetAndSize(offset, size); err != nil {
log.LogErrorf("action[Extent.Read] offset %d size %d err %v", offset, size, err)
return
}
var rSize int
if rSize, err = e.file.ReadAt(data[:size], offset); err != nil {
log.LogErrorf("action[Extent.Read] offset %v size %v err %v realsize %v", offset, size, err, rSize)
return
}
crc = crc32.ChecksumIEEE(data)
return
}
// ReadTiny read data from a tiny extent.
func (e *Extent) ReadTiny(data []byte, offset, size int64, isRepairRead bool) (crc uint32, err error) {
_, err = e.file.ReadAt(data[:size], offset)
if isRepairRead && err == io.EOF {
err = nil
}
crc = crc32.ChecksumIEEE(data[:size])
return
}
func (e *Extent) checkReadOffsetAndSize(offset, size int64) error {
if (e.snapshotDataOff == util.ExtentSize && offset > e.Size()) ||
(e.snapshotDataOff > util.ExtentSize && uint64(offset) > e.snapshotDataOff) {
return newParameterError("offset=%d size=%d snapshotDataOff=%d", offset, size, e.snapshotDataOff)
}
return nil
}
func (e *Extent) checkWriteOffsetAndSize(writeType int, offset, size int64) error {
err := newParameterError("writeType=%d offset=%d size=%d", writeType, offset, size)
if IsAppendWrite(writeType) {
if size == 0 || size > util.BlockSize ||
offset+size > util.ExtentSize || offset >= util.ExtentSize {
return err
}
} else if IsAppendRandomWrite(writeType) {
log.LogDebugf("action[checkOffsetAndSize] offset %v size %v", offset, size)
if offset < util.ExtentSize || size == 0 {
return err
}
}
return nil
}
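// Worked example (illustrative, with E = util.ExtentSize and B = util.BlockSize):
// an AppendWriteType write must satisfy 0 < size <= B and offset+size <= E (offset < E);
// an AppendRandomWriteType write must start at or beyond E, i.e. inside the snapshot
// region, with a non-zero size; RandomWriteType offsets are not constrained here.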
// Flush synchronizes data to the disk.
func (e *Extent) Flush() (err error) {
err = e.file.Sync()
return
}
func (e *Extent) GetCrc(blockNo int64) uint32 {
if int64(len(e.header)) < (blockNo+1)*util.PerBlockCrcSize {
return 0
}
return binary.BigEndian.Uint32(e.header[blockNo*util.PerBlockCrcSize : (blockNo+1)*util.PerBlockCrcSize])
}
func (e *Extent) autoComputeExtentCrc(crcFunc UpdateCrcFunc) (crc uint32, err error) {
var blockCnt int
extSize := e.Size()
if e.snapshotDataOff > util.ExtentSize {
extSize = int64(e.snapshotDataOff)
}
blockCnt = int(extSize / util.BlockSize)
if extSize%util.BlockSize != 0 {
blockCnt += 1
}
log.LogDebugf("autoComputeExtentCrc. path %v extent %v extent size %v,blockCnt %v", e.filePath, e.extentID, extSize, blockCnt)
crcData := make([]byte, blockCnt*util.PerBlockCrcSize)
for blockNo := 0; blockNo < blockCnt; blockNo++ {
blockCrc := binary.BigEndian.Uint32(e.header[blockNo*util.PerBlockCrcSize : (blockNo+1)*util.PerBlockCrcSize])
if blockCrc != 0 {
binary.BigEndian.PutUint32(crcData[blockNo*util.PerBlockCrcSize:(blockNo+1)*util.PerBlockCrcSize], blockCrc)
continue
}
bdata := make([]byte, util.BlockSize)
offset := int64(blockNo * util.BlockSize)
readN, err := e.file.ReadAt(bdata[:util.BlockSize], offset)
if readN == 0 && err != nil {
log.LogErrorf("autoComputeExtentCrc. path %v extent %v blockNo %v, readN %v err %v", e.filePath, e.extentID, blockNo, readN, err)
break
}
blockCrc = crc32.ChecksumIEEE(bdata[:readN])
err = crcFunc(e, blockNo, blockCrc)
if err != nil {
log.LogErrorf("autoComputeExtentCrc. path %v extent %v blockNo %v, err %v", e.filePath, e.extentID, blockNo, err)
return 0, err
}
log.LogDebugf("autoComputeExtentCrc. path %v extent %v blockCrc %v,blockNo %v", e.filePath, e.extentID, blockCrc, blockNo)
binary.BigEndian.PutUint32(crcData[blockNo*util.PerBlockCrcSize:(blockNo+1)*util.PerBlockCrcSize], blockCrc)
}
crc = crc32.ChecksumIEEE(crcData)
log.LogDebugf("autoComputeExtentCrc. path %v extent %v crc %v", e.filePath, e.extentID, crc)
return crc, err
}
// punchDelete punches a hole over [offset, offset+size) to release the underlying blocks.
// offset must be aligned to util.PageSize and size is rounded up to a multiple of it.
// It returns hasDelete=true when the range already contains no data.
func (e *Extent) punchDelete(offset, size int64) (hasDelete bool, err error) {
log.LogDebugf("punchDelete extent %v offset %v, size %v", e, offset, size)
if int(offset)%util.PageSize != 0 {
return false, ParameterMismatchError
}
if int(size)%util.PageSize != 0 {
size += int64(util.PageSize - int(size)%util.PageSize)
}
newOffset, err := e.file.Seek(offset, SEEK_DATA)
if err != nil {
if strings.Contains(err.Error(), syscall.ENXIO.Error()) {
return true, nil
}
return false, err
}
if newOffset-offset >= size {
return true, nil
}
log.LogDebugf("punchDelete offset %v size %v", offset, size)
err = fallocate(int(e.file.Fd()), util.FallocFLPunchHole|util.FallocFLKeepSize, offset, size)
return
}
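// Worked example (illustrative, assuming a 4KB util.PageSize): punchDelete(8192, 10240)
// first rounds size up to 12288, then seeks for data from offset 8192; if no data is
// found within the range it reports hasDelete=true, otherwise it punches a hole with
// fallocate(util.FallocFLPunchHole|util.FallocFLKeepSize) so the blocks are released
// without changing the file size.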
func (e *Extent) getRealBlockCnt() (blockNum int64) {
stat := new(syscall.Stat_t)
syscall.Stat(e.filePath, stat)
return stat.Blocks
}
func (e *Extent) TinyExtentRecover(data []byte, offset, size int64, crc uint32, isEmptyPacket bool) (err error) {
e.Lock()
defer e.Unlock()
if !IsTinyExtent(e.extentID) {
return ParameterMismatchError
}
if offset%util.PageSize != 0 || offset != e.dataSize {
return fmt.Errorf("error empty packet on (%v) offset(%v) size(%v)"+
" isEmptyPacket(%v) e.dataSize(%v)", e.file.Name(), offset, size, isEmptyPacket, e.dataSize)
}
log.LogDebugf("before file (%v) getRealBlockNo (%v) isEmptyPacket(%v)"+
"offset(%v) size(%v) e.datasize(%v)", e.filePath, e.getRealBlockCnt(), isEmptyPacket, offset, size, e.dataSize)
if isEmptyPacket {
var finfo os.FileInfo
finfo, err = e.file.Stat()
if err != nil {
return err
}
if offset < finfo.Size() {
return fmt.Errorf("error empty packet on (%v) offset(%v) size(%v)"+
" isEmptyPacket(%v) filesize(%v) e.dataSize(%v)", e.file.Name(), offset, size, isEmptyPacket, finfo.Size(), e.dataSize)
}
if err = syscall.Ftruncate(int(e.file.Fd()), offset+size); err != nil {
return err
}
err = fallocate(int(e.file.Fd()), util.FallocFLPunchHole|util.FallocFLKeepSize, offset, size)
} else {
_, err = e.file.WriteAt(data[:size], int64(offset))
}
if err != nil {
return
}
watermark := offset + size
if watermark%util.PageSize != 0 {
watermark = watermark + (util.PageSize - watermark%util.PageSize)
}
e.dataSize = watermark
log.LogDebugf("after file (%v) getRealBlockNo (%v) isEmptyPacket(%v)"+
"offset(%v) size(%v) e.datasize(%v)", e.filePath, e.getRealBlockCnt(), isEmptyPacket, offset, size, e.dataSize)
return
}
func (e *Extent) tinyExtentAvaliOffset(offset int64) (newOffset, newEnd int64, err error) {
e.Lock()
defer e.Unlock()
newOffset, err = e.file.Seek(int64(offset), SEEK_DATA)
if err != nil {
return
}
newEnd, err = e.file.Seek(int64(newOffset), SEEK_HOLE)
if err != nil {
return
}
if newOffset-offset > util.BlockSize {
newOffset = offset + util.BlockSize
}
if newEnd-newOffset > util.BlockSize {
newEnd = newOffset + util.BlockSize
}
if newEnd < newOffset {
err = fmt.Errorf("unavali TinyExtentAvaliOffset on SEEK_DATA or SEEK_HOLE (%v) offset(%v) "+
"newEnd(%v) newOffset(%v)", e.extentID, offset, newEnd, newOffset)
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package storage
import (
"container/list"
"sync"
)
// ExtentMapItem stores the extent entity pointer and the element
// pointer of the extent entity in a cache list.
type ExtentMapItem struct {
e *Extent
element *list.Element
}
// ExtentCache is an LRU cache of open extents. Tiny extents are tracked in a separate map and are never evicted.
type ExtentCache struct {
extentMap map[uint64]*ExtentMapItem
extentList *list.List
tinyExtents map[uint64]*Extent
tinyLock sync.RWMutex
lock sync.RWMutex
capacity int
}
// NewExtentCache creates and returns a new ExtentCache instance.
func NewExtentCache(capacity int) *ExtentCache {
return &ExtentCache{
extentMap: make(map[uint64]*ExtentMapItem),
extentList: list.New(),
capacity: capacity,
tinyExtents: make(map[uint64]*Extent),
}
}
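// Usage sketch (illustrative; the path and extent ID are placeholders, and in real use
// the extent is opened via InitToFS or RestoreFromFS before it is cached):
//
//	cache := NewExtentCache(64)
//	e := NewExtentInCore("/data/partition_1/1025", 1025)
//	cache.Put(e) // normal extents join the LRU list; tiny extents go to a separate map
//	if cached, ok := cache.Get(1025); ok {
//		_ = cached
//	}
//	cache.Del(1025) // removes the entry and closes the extent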
// Put puts an extent object into the cache.
func (cache *ExtentCache) Put(e *Extent) {
if IsTinyExtent(e.extentID) {
cache.tinyLock.Lock()
cache.tinyExtents[e.extentID] = e
cache.tinyLock.Unlock()
return
}
cache.lock.Lock()
defer cache.lock.Unlock()
item := &ExtentMapItem{
e: e,
element: cache.extentList.PushBack(e),
}
cache.extentMap[e.extentID] = item
cache.evict()
}
// Get gets the extent from the cache.
func (cache *ExtentCache) Get(extentID uint64) (e *Extent, ok bool) {
if IsTinyExtent(extentID) {
cache.tinyLock.RLock()
e, ok = cache.tinyExtents[extentID]
cache.tinyLock.RUnlock()
return
}
cache.lock.Lock()
defer cache.lock.Unlock()
var item *ExtentMapItem
if item, ok = cache.extentMap[extentID]; ok {
if !IsTinyExtent(extentID) {
cache.extentList.MoveToBack(item.element)
}
e = item.e
}
return
}
// Del deletes the extent stored in the cache.
func (cache *ExtentCache) Del(extentID uint64) {
if IsTinyExtent(extentID) {
return
}
cache.lock.Lock()
defer cache.lock.Unlock()
var (
item *ExtentMapItem
ok bool
)
if item, ok = cache.extentMap[extentID]; ok {
delete(cache.extentMap, extentID)
cache.extentList.Remove(item.element)
item.e.Close()
}
}
// Clear closes all the extents stored in the cache.
func (cache *ExtentCache) Clear() {
cache.tinyLock.RLock()
for _, extent := range cache.tinyExtents {
extent.Close()
}
cache.tinyLock.RUnlock()
cache.lock.Lock()
defer cache.lock.Unlock()
for e := cache.extentList.Front(); e != nil; {
curr := e
e = e.Next()
ec := curr.Value.(*Extent)
delete(cache.extentMap, ec.extentID)
ec.Close()
cache.extentList.Remove(curr)
}
cache.extentList = list.New()
cache.extentMap = make(map[uint64]*ExtentMapItem)
}
// Size returns the number of normal extents stored in the cache (tiny extents are not counted).
func (cache *ExtentCache) Size() int {
cache.lock.RLock()
defer cache.lock.RUnlock()
return cache.extentList.Len()
}
func (cache *ExtentCache) evict() {
if cache.capacity <= 0 {
return
}
needRemove := cache.extentList.Len() - cache.capacity
for i := 0; i < needRemove; i++ {
if e := cache.extentList.Front(); e != nil {
front := e.Value.(*Extent)
if IsTinyExtent(front.extentID) {
continue
}
delete(cache.extentMap, front.extentID)
cache.extentList.Remove(e)
front.Close()
}
}
}
// Flush synchronizes all the extents stored in the cache to the disk.
func (cache *ExtentCache) Flush() {
cache.tinyLock.RLock()
for _, extent := range cache.tinyExtents {
extent.Flush()
}
cache.tinyLock.RUnlock()
cache.lock.RLock()
defer cache.lock.RUnlock()
for _, item := range cache.extentMap {
item.e.Flush()
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package storage
import (
"bytes"
"encoding/binary"
"fmt"
"hash/crc32"
"io"
"os"
"path"
"regexp"
"runtime"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/errors"
"github.com/cubefs/cubefs/util/log"
)
//TODO: remove this later.
//go:generate golangci-lint run --issues-exit-code=1 -D errcheck -E bodyclose ./...
const (
ExtCrcHeaderFileName = "EXTENT_CRC"
ExtBaseExtentIDFileName = "EXTENT_META"
TinyDeleteFileOpt = os.O_CREATE | os.O_RDWR | os.O_APPEND
TinyExtDeletedFileName = "TINYEXTENT_DELETE"
NormalExtDeletedFileName = "NORMALEXTENT_DELETE"
MaxExtentCount = 20000
TinyExtentCount = 64
TinyExtentStartID = 1
MinExtentID = 1024
DeleteTinyRecordSize = 24
UpdateCrcInterval = 600
RepairInterval = 60
RandomWriteType = 2
AppendWriteType = 1
AppendRandomWriteType = 4
NormalExtentDeleteRetainTime = 3600 * 4
StaleExtStoreBackupSuffix = ".old"
StaleExtStoreTimeFormat = "20060102150405.000000000"
)
var (
RegexpExtentFile, _ = regexp.Compile(`^(\d)+$`)
SnapShotFilePool = &sync.Pool{New: func() interface{} {
return new(proto.File)
}}
)
func GetSnapShotFileFromPool() (f *proto.File) {
f = SnapShotFilePool.Get().(*proto.File)
return
}
func PutSnapShotFileToPool(f *proto.File) {
SnapShotFilePool.Put(f)
}
type ExtentFilter func(info *ExtentInfo) bool
// Filters
var (
NormalExtentFilter = func() ExtentFilter {
now := time.Now()
return func(ei *ExtentInfo) bool {
return !IsTinyExtent(ei.FileID) && now.Unix()-ei.ModifyTime > RepairInterval && !ei.IsDeleted
}
}
TinyExtentFilter = func(filters []uint64) ExtentFilter {
return func(ei *ExtentInfo) bool {
if !IsTinyExtent(ei.FileID) {
return false
}
for _, filterID := range filters {
if filterID == ei.FileID {
return true
}
}
return false
}
}
)
// ExtentStore defines fields used in the storage engine.
// Packets smaller than 128K are stored in a "tinyExtent", a place to persist the small files.
// Packets larger than or equal to 128K are stored in a normal "extent", a place to persist large files.
// The difference between them is the ID range: tiny extent IDs run from TinyExtentStartID to
// TinyExtentStartID+TinyExtentCount-1, while normal extent IDs start at MinExtentID.
// Multiple small files can be appended to the same tinyExtent.
// In addition, the deletion of small files is implemented by punching holes in the underlying file system.
type ExtentStore struct {
dataPath string
baseExtentID uint64 // highest normal extent ID allocated so far; new extent IDs are allocated above it
extentInfoMap map[uint64]*ExtentInfo // map that stores all the extent information
eiMutex sync.RWMutex // mutex for extent info
cache *ExtentCache // extent cache
mutex sync.Mutex
storeSize int // size of the extent store
metadataFp *os.File // metadata file pointer?
tinyExtentDeleteFp *os.File
normalExtentDeleteFp *os.File
closeC chan bool
closed bool
availableTinyExtentC chan uint64 // available tinyExtent channel
availableTinyExtentMap sync.Map
brokenTinyExtentC chan uint64 // broken tinyExtent channel
brokenTinyExtentMap sync.Map
// blockSize int
partitionID uint64
verifyExtentFp *os.File
verifyExtentFpAppend []*os.File
hasAllocSpaceExtentIDOnVerfiyFile uint64
hasDeleteNormalExtentsCache sync.Map
partitionType int
ApplyId uint64
ApplyIdMutex sync.RWMutex
}
func MkdirAll(name string) (err error) {
return os.MkdirAll(name, 0o755)
}
func NewExtentStore(dataDir string, partitionID uint64, storeSize, dpType int, isCreate bool) (s *ExtentStore, err error) {
s = new(ExtentStore)
s.dataPath = dataDir
s.partitionType = dpType
s.partitionID = partitionID
if isCreate {
if err = s.renameStaleExtentStore(); err != nil {
return
}
if err = MkdirAll(dataDir); err != nil {
return nil, fmt.Errorf("NewExtentStore [%v] err[%v]", dataDir, err)
}
if s.tinyExtentDeleteFp, err = os.OpenFile(path.Join(s.dataPath, TinyExtDeletedFileName), TinyDeleteFileOpt, 0o666); err != nil {
return
}
if s.verifyExtentFp, err = os.OpenFile(path.Join(s.dataPath, ExtCrcHeaderFileName), os.O_CREATE|os.O_RDWR, 0o666); err != nil {
return
}
if s.metadataFp, err = os.OpenFile(path.Join(s.dataPath, ExtBaseExtentIDFileName), os.O_CREATE|os.O_RDWR, 0o666); err != nil {
return
}
if s.normalExtentDeleteFp, err = os.OpenFile(path.Join(s.dataPath, NormalExtDeletedFileName), os.O_CREATE|os.O_RDWR|os.O_APPEND, 0o666); err != nil {
return
}
} else {
if err = MkdirAll(dataDir); err != nil {
return nil, fmt.Errorf("NewExtentStore [%v] err[%v]", dataDir, err)
}
if s.tinyExtentDeleteFp, err = os.OpenFile(path.Join(s.dataPath, TinyExtDeletedFileName), os.O_RDWR|os.O_APPEND, 0o666); err != nil {
return
}
if s.verifyExtentFp, err = os.OpenFile(path.Join(s.dataPath, ExtCrcHeaderFileName), os.O_RDWR, 0o666); err != nil {
return
}
if s.metadataFp, err = os.OpenFile(path.Join(s.dataPath, ExtBaseExtentIDFileName), os.O_RDWR, 0o666); err != nil {
return
}
if s.normalExtentDeleteFp, err = os.OpenFile(path.Join(s.dataPath, NormalExtDeletedFileName), os.O_RDWR|os.O_APPEND, 0o666); err != nil {
return
}
}
stat, err := s.tinyExtentDeleteFp.Stat()
if err != nil {
return
}
if stat.Size()%DeleteTinyRecordSize != 0 {
needWriteEmpty := DeleteTinyRecordSize - (stat.Size() % DeleteTinyRecordSize)
data := make([]byte, needWriteEmpty)
s.tinyExtentDeleteFp.Write(data)
}
log.LogDebugf("NewExtentStore.partitionID [%v] dataPath %v verifyExtentFp init", partitionID, s.dataPath)
if s.verifyExtentFp, err = os.OpenFile(path.Join(s.dataPath, ExtCrcHeaderFileName), os.O_CREATE|os.O_RDWR, 0o666); err != nil {
return
}
aId := 0
var vFp *os.File
for {
dataPath := path.Join(s.dataPath, ExtCrcHeaderFileName+"_"+strconv.Itoa(aId))
if _, err = os.Stat(dataPath); err != nil {
log.LogDebugf("NewExtentStore. partitionID [%v] dataPath not exist err %v. verifyExtentFpAppend init return", partitionID, err)
break
}
if vFp, err = os.OpenFile(dataPath, os.O_CREATE|os.O_RDWR, 0o666); err != nil {
log.LogErrorf("NewExtentStore. partitionID [%v] dataPath exist but open err %v. verifyExtentFpAppend init return", partitionID, err)
return
}
log.LogDebugf("NewExtentStore. partitionID [%v] dataPath exist and opened id %v", partitionID, aId)
s.verifyExtentFpAppend = append(s.verifyExtentFpAppend, vFp)
aId++
}
if s.metadataFp, err = os.OpenFile(path.Join(s.dataPath, ExtBaseExtentIDFileName), os.O_CREATE|os.O_RDWR, 0o666); err != nil {
return
}
if s.normalExtentDeleteFp, err = os.OpenFile(path.Join(s.dataPath, NormalExtDeletedFileName), os.O_CREATE|os.O_RDWR|os.O_APPEND, 0o666); err != nil {
return
}
s.extentInfoMap = make(map[uint64]*ExtentInfo)
s.cache = NewExtentCache(100)
if err = s.initBaseFileID(); err != nil {
err = fmt.Errorf("init base field ID: %v", err)
return
}
s.hasAllocSpaceExtentIDOnVerfiyFile = s.GetPreAllocSpaceExtentIDOnVerifyFile()
s.storeSize = storeSize
s.closeC = make(chan bool, 1)
s.closed = false
err = s.initTinyExtent()
if err != nil {
return
}
return
}
func (ei *ExtentInfo) UpdateExtentInfo(extent *Extent, crc uint32) {
extent.Lock()
defer extent.Unlock()
if time.Now().Unix()-extent.ModifyTime() <= UpdateCrcInterval {
crc = 0
}
ei.Size = uint64(extent.dataSize)
ei.SnapshotDataOff = extent.snapshotDataOff
log.LogInfof("action[ExtentInfo.UpdateExtentInfo] ei info [%v]", ei.String())
if !IsTinyExtent(ei.FileID) {
atomic.StoreUint32(&ei.Crc, crc)
ei.ModifyTime = extent.ModifyTime()
}
}
// SnapShot returns the information of all the extents on the current data partition.
// When the master sends the loadDataPartition request, the snapshot is used to compare the replicas.
func (s *ExtentStore) SnapShot() (files []*proto.File, err error) {
var normalExtentSnapshot, tinyExtentSnapshot []*ExtentInfo
// compute the crc again to guarantee that the crc and applyID are the newest
s.autoComputeExtentCrc()
if normalExtentSnapshot, _, err = s.GetAllWatermarks(NormalExtentFilter()); err != nil {
log.LogErrorf("SnapShot GetAllWatermarks err %v", err)
return
}
files = make([]*proto.File, 0, len(normalExtentSnapshot))
for _, ei := range normalExtentSnapshot {
file := GetSnapShotFileFromPool()
file.Name = strconv.FormatUint(ei.FileID, 10)
file.Size = uint32(ei.Size)
file.Modified = ei.ModifyTime
file.Crc = atomic.LoadUint32(&ei.Crc)
file.ApplyID = ei.ApplyID
log.LogDebugf("partitionID %v ExtentStore set applyid %v partition %v", s.partitionID, s.ApplyId, s.partitionID)
files = append(files, file)
}
tinyExtentSnapshot = s.getTinyExtentInfo()
for _, ei := range tinyExtentSnapshot {
file := GetSnapShotFileFromPool()
file.Name = strconv.FormatUint(ei.FileID, 10)
file.Size = uint32(ei.Size)
file.Modified = ei.ModifyTime
file.Crc = 0
files = append(files, file)
}
return
}
// Create creates an extent.
func (s *ExtentStore) Create(extentID uint64) (err error) {
var e *Extent
name := path.Join(s.dataPath, strconv.Itoa(int(extentID)))
if s.HasExtent(extentID) {
err = ExtentExistsError
return err
}
e = NewExtentInCore(name, extentID)
e.header = make([]byte, util.BlockHeaderSize)
err = e.InitToFS()
if err != nil {
return err
}
s.cache.Put(e)
extInfo := &ExtentInfo{FileID: extentID}
extInfo.UpdateExtentInfo(e, 0)
atomic.StoreInt64(&extInfo.AccessTime, e.accessTime)
s.eiMutex.Lock()
s.extentInfoMap[extentID] = extInfo
s.eiMutex.Unlock()
s.UpdateBaseExtentID(extentID)
return
}
func (s *ExtentStore) initBaseFileID() error {
var baseFileID uint64
baseFileID, _ = s.GetPersistenceBaseExtentID()
files, err := os.ReadDir(s.dataPath)
if err != nil {
return err
}
var (
extentID uint64
isExtent bool
e *Extent
ei *ExtentInfo
loadErr error
)
for _, f := range files {
if extentID, isExtent = s.ExtentID(f.Name()); !isExtent {
continue
}
if e, loadErr = s.extent(extentID); loadErr != nil {
log.LogError("[initBaseFileID] load extent error", loadErr)
continue
}
ei = &ExtentInfo{FileID: extentID}
ei.UpdateExtentInfo(e, 0)
atomic.StoreInt64(&ei.AccessTime, e.accessTime)
s.eiMutex.Lock()
s.extentInfoMap[extentID] = ei
s.eiMutex.Unlock()
e.Close()
if !IsTinyExtent(extentID) && extentID > baseFileID {
baseFileID = extentID
}
}
if baseFileID < MinExtentID {
baseFileID = MinExtentID
}
atomic.StoreUint64(&s.baseExtentID, baseFileID)
log.LogInfof("datadir(%v) maxBaseId(%v)", s.dataPath, baseFileID)
runtime.GC()
return nil
}
// Write writes the given extent to the disk.
func (s *ExtentStore) Write(extentID uint64, offset, size int64, data []byte, crc uint32, writeType int, isSync bool) (status uint8, err error) {
var (
e *Extent
ei *ExtentInfo
)
status = proto.OpOk
s.eiMutex.Lock()
ei = s.extentInfoMap[extentID]
e, err = s.extentWithHeader(ei)
s.eiMutex.Unlock()
if err != nil {
return status, err
}
// update access time
atomic.StoreInt64(&ei.AccessTime, time.Now().Unix())
log.LogDebugf("action[Write] dp %v extentID %v offset %v size %v writeTYPE %v", s.partitionID, extentID, offset, size, writeType)
if err = s.checkOffsetAndSize(extentID, offset, size, writeType); err != nil {
log.LogInfof("action[Write] path %v err %v", e.filePath, err)
return status, err
}
status, err = e.Write(data, offset, size, crc, writeType, isSync, s.PersistenceBlockCrc, ei)
if err != nil {
log.LogInfof("action[Write] path %v err %v", e.filePath, err)
return status, err
}
ei.UpdateExtentInfo(e, 0)
return status, nil
}
func (s *ExtentStore) checkOffsetAndSize(extentID uint64, offset, size int64, writeType int) error {
if IsTinyExtent(extentID) {
return nil
}
// A random write position can occur on the modAppend portion of an extent.
if writeType == RandomWriteType {
return nil
}
if writeType == AppendRandomWriteType {
if offset < util.ExtentSize {
return newParameterError("writeType=%d offset=%d size=%d", writeType, offset, size)
}
return nil
}
if size == 0 || size > util.BlockSize ||
offset >= util.BlockCount*util.BlockSize ||
offset+size > util.BlockCount*util.BlockSize {
return newParameterError("offset=%d size=%d", offset, size)
}
return nil
}
// IsTinyExtent checks whether the given extent ID falls in the tiny-extent range.
func IsTinyExtent(extentID uint64) bool {
return extentID >= TinyExtentStartID && extentID < TinyExtentStartID+TinyExtentCount
}
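// Example (illustrative sketch): extent IDs are partitioned into a tiny range and a
// normal range purely by ID; the check above is a half-open interval test on the
// constants defined elsewhere in this package:
//
//	ok := IsTinyExtent(TinyExtentStartID)                  // true: first tiny extent
//	ok = IsTinyExtent(TinyExtentStartID + TinyExtentCount) // false: just past the tiny range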
// Read reads the extent based on the given id.
func (s *ExtentStore) Read(extentID uint64, offset, size int64, nbuf []byte, isRepairRead bool) (crc uint32, err error) {
var e *Extent
s.eiMutex.RLock()
ei := s.extentInfoMap[extentID]
s.eiMutex.RUnlock()
if ei == nil {
return 0, errors.Trace(ExtentHasBeenDeletedError, "[Read] extent[%d] has already been deleted", extentID)
}
// update extent access time
atomic.StoreInt64(&ei.AccessTime, time.Now().Unix())
if e, err = s.extentWithHeader(ei); err != nil {
return
}
//if err = s.checkOffsetAndSize(extentID, offset, size); err != nil {
// return
//}
crc, err = e.Read(nbuf, offset, size, isRepairRead)
return
}
func (s *ExtentStore) DumpExtents() (extInfos SortedExtentInfos) {
s.eiMutex.RLock()
for _, v := range s.extentInfoMap {
extInfos = append(extInfos, v)
}
s.eiMutex.RUnlock()
return
}
func (s *ExtentStore) punchDelete(extentID uint64, offset, size int64) (err error) {
e, err := s.extentWithHeaderByExtentID(extentID)
if err != nil {
return nil
}
if offset+size > e.dataSize {
return
}
var hasDelete bool
if hasDelete, err = e.punchDelete(offset, size); err != nil {
return
}
if hasDelete {
return
}
if err = s.RecordTinyDelete(e.extentID, offset, size); err != nil {
return
}
return
}
// MarkDelete marks the given extent as deleted.
func (s *ExtentStore) MarkDelete(extentID uint64, offset, size int64) (err error) {
var ei *ExtentInfo
s.eiMutex.RLock()
ei = s.extentInfoMap[extentID]
s.eiMutex.RUnlock()
if ei == nil || ei.IsDeleted {
return
}
log.LogDebugf("action[MarkDelete] extentID %v offset %v size %v ei(size %v snapshotSize %v)",
extentID, offset, size, ei.Size, ei.SnapshotDataOff)
funcNeedPunchDel := func() bool {
return offset != 0 || (size != 0 && ((ei.Size != uint64(size) && ei.SnapshotDataOff == util.ExtentSize) ||
(ei.SnapshotDataOff != uint64(size) && ei.SnapshotDataOff > util.ExtentSize)))
}
if IsTinyExtent(extentID) || funcNeedPunchDel() {
log.LogDebugf("action[MarkDelete] extentID %v offset %v size %v ei(size %v snapshotSize %v)",
extentID, offset, size, ei.Size, ei.SnapshotDataOff)
return s.punchDelete(extentID, offset, size)
}
extentFilePath := path.Join(s.dataPath, strconv.FormatUint(extentID, 10))
log.LogDebugf("action[MarkDelete] extentID %v offset %v size %v ei(size %v extentFilePath %v)",
extentID, offset, size, ei.Size, extentFilePath)
if err = os.Remove(extentFilePath); err != nil && !os.IsNotExist(err) {
// NOTE: if the remove failed for any reason other than the file not existing,
// treat it as a broken disk.
err = BrokenDiskError
return
}
if err = s.PersistenceHasDeleteExtent(extentID); err != nil {
err = BrokenDiskError
return
}
ei.IsDeleted = true
ei.ModifyTime = time.Now().Unix()
s.cache.Del(extentID)
if err = s.DeleteBlockCrc(extentID); err != nil {
err = BrokenDiskError
return
}
s.PutNormalExtentToDeleteCache(extentID)
s.eiMutex.Lock()
delete(s.extentInfoMap, extentID)
s.eiMutex.Unlock()
return
}
func (s *ExtentStore) PutNormalExtentToDeleteCache(extentID uint64) {
s.hasDeleteNormalExtentsCache.Store(extentID, time.Now().Unix())
}
func (s *ExtentStore) IsDeletedNormalExtent(extentID uint64) (ok bool) {
_, ok = s.hasDeleteNormalExtentsCache.Load(extentID)
return
}
// Close closes the extent store.
func (s *ExtentStore) Close() {
s.mutex.Lock()
defer s.mutex.Unlock()
if s.closed {
return
}
// Release cache
s.cache.Flush()
s.cache.Clear()
s.tinyExtentDeleteFp.Sync()
s.tinyExtentDeleteFp.Close()
s.normalExtentDeleteFp.Sync()
s.normalExtentDeleteFp.Close()
s.verifyExtentFp.Sync()
s.verifyExtentFp.Close()
for _, vFp := range s.verifyExtentFpAppend {
if vFp != nil {
vFp.Sync()
vFp.Close()
}
}
s.closed = true
}
// Watermark returns the extent info of the given extent on the record.
func (s *ExtentStore) Watermark(extentID uint64) (ei *ExtentInfo, err error) {
var has bool
s.eiMutex.RLock()
ei, has = s.extentInfoMap[extentID]
s.eiMutex.RUnlock()
if !has {
err = fmt.Errorf("e %v not exist", s.getExtentKey(extentID))
return
}
return
}
// GetTinyExtentOffset returns the current write offset (watermark) of the given tiny extent, rounded up to util.PageSize.
func (s *ExtentStore) GetTinyExtentOffset(extentID uint64) (watermark int64, err error) {
einfo, err := s.Watermark(extentID)
if err != nil {
return
}
watermark = int64(einfo.Size)
if watermark%util.PageSize != 0 {
watermark = watermark + (util.PageSize - watermark%util.PageSize)
}
return
}
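// Example (illustrative sketch): the returned offset is the extent size rounded up to
// the next util.PageSize boundary. With a hypothetical page size of 4096 bytes and a
// tiny extent of 10000 bytes, the watermark becomes 12288:
//
//	watermark := int64(10000)
//	if watermark%4096 != 0 {
//		watermark += 4096 - watermark%4096 // 10000 -> 12288
//	}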
// GetExtentSnapshotModOffset returns the snapshot pre-allocation write offset of the given extent and advances it by allocSize.
func (s *ExtentStore) GetExtentSnapshotModOffset(extentID uint64, allocSize uint32) (watermark int64, err error) {
einfo, err := s.Watermark(extentID)
if err != nil {
return
}
log.LogDebugf("action[ExtentStore.GetExtentSnapshotModOffset] extId %v SnapshotDataOff %v SnapPreAllocDataOff %v allocSize %v",
extentID, einfo.SnapshotDataOff, einfo.SnapPreAllocDataOff, allocSize)
if einfo.SnapPreAllocDataOff == 0 {
einfo.SnapPreAllocDataOff = einfo.SnapshotDataOff
}
watermark = int64(einfo.SnapPreAllocDataOff)
//if watermark%util.PageSize != 0 {
// watermark = watermark + (util.PageSize - watermark%util.PageSize)
//}
einfo.SnapPreAllocDataOff += uint64(allocSize)
return
}
// Sector size
const (
DiskSectorSize = 512
)
func (s *ExtentStore) GetStoreUsedSize() (used int64) {
extentInfoSlice := make([]*ExtentInfo, 0, s.GetExtentCount())
s.eiMutex.RLock()
for _, extentID := range s.extentInfoMap {
extentInfoSlice = append(extentInfoSlice, extentID)
}
s.eiMutex.RUnlock()
for _, einfo := range extentInfoSlice {
if einfo.IsDeleted {
continue
}
if IsTinyExtent(einfo.FileID) {
stat := new(syscall.Stat_t)
err := syscall.Stat(fmt.Sprintf("%v/%v", s.dataPath, einfo.FileID), stat)
if err != nil {
continue
}
used += stat.Blocks * DiskSectorSize
} else {
used += int64(einfo.Size + (einfo.SnapshotDataOff - util.ExtentSize))
}
}
return
}
// GetAllWatermarks returns all the watermarks.
func (s *ExtentStore) GetAllWatermarks(filter ExtentFilter) (extents []*ExtentInfo, tinyDeleteFileSize int64, err error) {
extents = make([]*ExtentInfo, 0, len(s.extentInfoMap))
extentInfoSlice := make([]*ExtentInfo, 0, len(s.extentInfoMap))
s.eiMutex.RLock()
for _, extentID := range s.extentInfoMap {
extentInfoSlice = append(extentInfoSlice, extentID)
}
s.eiMutex.RUnlock()
for _, extentInfo := range extentInfoSlice {
if filter != nil && !filter(extentInfo) {
continue
}
if extentInfo.IsDeleted {
continue
}
extents = append(extents, extentInfo)
}
tinyDeleteFileSize, err = s.LoadTinyDeleteFileOffset()
return
}
func (s *ExtentStore) getTinyExtentInfo() (extents []*ExtentInfo) {
extents = make([]*ExtentInfo, 0)
s.eiMutex.RLock()
var extentID uint64
for extentID = TinyExtentStartID; extentID < TinyExtentCount+TinyExtentStartID; extentID++ {
ei := s.extentInfoMap[extentID]
if ei == nil {
continue
}
extents = append(extents, ei)
}
s.eiMutex.RUnlock()
return
}
// ExtentID parses the extent ID from the given file name and reports whether the file is an extent file.
func (s *ExtentStore) ExtentID(filename string) (extentID uint64, isExtent bool) {
if isExtent = RegexpExtentFile.MatchString(filename); !isExtent {
return
}
var err error
if extentID, err = strconv.ParseUint(filename, 10, 64); err != nil {
isExtent = false
return
}
isExtent = true
return
}
func (s *ExtentStore) initTinyExtent() (err error) {
s.availableTinyExtentC = make(chan uint64, TinyExtentCount)
s.brokenTinyExtentC = make(chan uint64, TinyExtentCount)
var extentID uint64
for extentID = TinyExtentStartID; extentID < TinyExtentStartID+TinyExtentCount; extentID++ {
err = s.Create(extentID)
if err == nil || strings.Contains(err.Error(), syscall.EEXIST.Error()) || err == ExtentExistsError {
err = nil
s.brokenTinyExtentC <- extentID
s.brokenTinyExtentMap.Store(extentID, true)
continue
}
return err
}
return
}
// GetAvailableTinyExtent returns the available tiny extent from the channel.
func (s *ExtentStore) GetAvailableTinyExtent() (extentID uint64, err error) {
select {
case extentID = <-s.availableTinyExtentC:
log.LogDebugf("dp %v GetAvailableTinyExtent. extentID %v", s.partitionID, extentID)
s.availableTinyExtentMap.Delete(extentID)
return
default:
log.LogDebugf("dp %v GetAvailableTinyExtent not found", s.partitionID)
return 0, NoAvailableExtentError
}
}
// SendToAvailableTinyExtentC sends the extent to the channel that stores the available tiny extents.
func (s *ExtentStore) SendToAvailableTinyExtentC(extentID uint64) {
log.LogDebugf("dp %v action[SendToAvailableTinyExtentC] extentid %v", s.partitionID, extentID)
if _, ok := s.availableTinyExtentMap.Load(extentID); !ok {
log.LogDebugf("dp %v SendToAvailableTinyExtentC. extentID %v", s.partitionID, extentID)
s.availableTinyExtentC <- extentID
s.availableTinyExtentMap.Store(extentID, true)
} else {
log.LogDebugf("dp %v action[SendToAvailableTinyExtentC] extentid %v already exist", s.partitionID, extentID)
}
}
// SendAllToBrokenTinyExtentC sends all the extents to the channel that stores the broken extents.
func (s *ExtentStore) SendAllToBrokenTinyExtentC(extentIds []uint64) {
for _, extentID := range extentIds {
if _, ok := s.brokenTinyExtentMap.Load(extentID); !ok {
s.brokenTinyExtentC <- extentID
s.brokenTinyExtentMap.Store(extentID, true)
}
}
}
// AvailableTinyExtentCnt returns the count of the available tiny extents.
func (s *ExtentStore) AvailableTinyExtentCnt() int {
return len(s.availableTinyExtentC)
}
// BrokenTinyExtentCnt returns the count of the broken tiny extents.
func (s *ExtentStore) BrokenTinyExtentCnt() int {
return len(s.brokenTinyExtentC)
}
// MoveAllToBrokenTinyExtentC moves up to cnt available tiny extents to the channel that stores the broken extents.
func (s *ExtentStore) MoveAllToBrokenTinyExtentC(cnt int) {
for i := 0; i < cnt; i++ {
extentID, err := s.GetAvailableTinyExtent()
if err != nil {
return
}
s.SendToBrokenTinyExtentC(extentID)
}
}
// SendToBrokenTinyExtentC sends the given extent id to the channel.
func (s *ExtentStore) SendToBrokenTinyExtentC(extentID uint64) {
if _, ok := s.brokenTinyExtentMap.Load(extentID); !ok {
s.brokenTinyExtentC <- extentID
s.brokenTinyExtentMap.Store(extentID, true)
}
}
// GetBrokenTinyExtent returns the first broken extent in the channel.
func (s *ExtentStore) GetBrokenTinyExtent() (extentID uint64, err error) {
select {
case extentID = <-s.brokenTinyExtentC:
s.brokenTinyExtentMap.Delete(extentID)
return
default:
return 0, NoBrokenExtentError
}
}
// StoreSizeExtentID returns the total size of all extents whose IDs are not greater than maxExtentID.
func (s *ExtentStore) StoreSizeExtentID(maxExtentID uint64) (totalSize uint64) {
extentInfos := make([]*ExtentInfo, 0)
s.eiMutex.RLock()
for _, extentInfo := range s.extentInfoMap {
if extentInfo.FileID <= maxExtentID {
extentInfos = append(extentInfos, extentInfo)
}
}
s.eiMutex.RUnlock()
for _, extentInfo := range extentInfos {
totalSize += extentInfo.TotalSize()
log.LogDebugf("ExtentStore.StoreSizeExtentID dp %v extentInfo %v totalSize %v", s.partitionID, extentInfo, extentInfo.TotalSize())
}
return totalSize
}
// GetMaxExtentIDAndPartitionSize returns the maximum extent ID and the total size of all extents in the partition.
func (s *ExtentStore) GetMaxExtentIDAndPartitionSize() (maxExtentID, totalSize uint64) {
extentInfos := make([]*ExtentInfo, 0)
s.eiMutex.RLock()
for _, extentInfo := range s.extentInfoMap {
extentInfos = append(extentInfos, extentInfo)
}
s.eiMutex.RUnlock()
for _, extentInfo := range extentInfos {
if extentInfo.FileID > maxExtentID {
maxExtentID = extentInfo.FileID
}
totalSize += extentInfo.TotalSize()
}
return maxExtentID, totalSize
}
func MarshalTinyExtent(extentID uint64, offset, size int64) (data []byte) {
data = make([]byte, DeleteTinyRecordSize)
binary.BigEndian.PutUint64(data[0:8], extentID)
binary.BigEndian.PutUint64(data[8:16], uint64(offset))
binary.BigEndian.PutUint64(data[16:DeleteTinyRecordSize], uint64(size))
return data
}
func UnMarshalTinyExtent(data []byte) (extentID, offset, size uint64) {
extentID = binary.BigEndian.Uint64(data[0:8])
offset = binary.BigEndian.Uint64(data[8:16])
size = binary.BigEndian.Uint64(data[16:DeleteTinyRecordSize])
return
}
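// Example (illustrative sketch): a tiny-delete record is a fixed-size big-endian triple
// of (extentID, offset, size); DeleteTinyRecordSize is defined elsewhere in this package
// and is assumed here to be 24 bytes. The IDs and sizes below are arbitrary:
//
//	record := MarshalTinyExtent(1025, 0, 4096)   // len(record) == DeleteTinyRecordSize
//	id, off, size := UnMarshalTinyExtent(record) // id == 1025, off == 0, size == 4096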
func (s *ExtentStore) RecordTinyDelete(extentID uint64, offset, size int64) (err error) {
record := MarshalTinyExtent(extentID, offset, size)
stat, err := s.tinyExtentDeleteFp.Stat()
if err != nil {
return
}
if stat.Size()%DeleteTinyRecordSize != 0 {
needWriteEmpty := DeleteTinyRecordSize - (stat.Size() % DeleteTinyRecordSize)
data := make([]byte, needWriteEmpty)
s.tinyExtentDeleteFp.Write(data)
}
_, err = s.tinyExtentDeleteFp.Write(record)
if err != nil {
return
}
return
}
func (s *ExtentStore) ReadTinyDeleteRecords(offset, size int64, data []byte) (crc uint32, err error) {
_, err = s.tinyExtentDeleteFp.ReadAt(data[:size], offset)
if err == nil || err == io.EOF {
err = nil
crc = crc32.ChecksumIEEE(data[:size])
}
return
}
type ExtentDeleted struct {
ExtentID uint64 `json:"extentID"`
Offset uint64 `json:"offset"`
Size uint64 `json:"size"`
}
func (s *ExtentStore) GetHasDeleteTinyRecords() (extentDes []ExtentDeleted, err error) {
data := make([]byte, DeleteTinyRecordSize)
offset := int64(0)
for {
_, err = s.tinyExtentDeleteFp.ReadAt(data, offset)
if err != nil {
if err == io.EOF {
err = nil
}
return
}
extent := ExtentDeleted{}
extent.ExtentID, extent.Offset, extent.Size = UnMarshalTinyExtent(data)
extentDes = append(extentDes, extent)
offset += DeleteTinyRecordSize
}
}
// NextExtentID returns the next extentID. When the client sends the request to create an extent,
// this function generates a unique extentID within the current partition.
// This function can only be called by the leader.
func (s *ExtentStore) NextExtentID() (extentID uint64, err error) {
extentID = atomic.AddUint64(&s.baseExtentID, 1)
err = s.PersistenceBaseExtentID(extentID)
return
}
func (s *ExtentStore) LoadTinyDeleteFileOffset() (offset int64, err error) {
stat, err := s.tinyExtentDeleteFp.Stat()
if err == nil {
offset = stat.Size()
}
return
}
func (s *ExtentStore) getExtentKey(extent uint64) string {
return fmt.Sprintf("extent %v_%v", s.partitionID, extent)
}
// UpdateBaseExtentID updates the base extent ID.
func (s *ExtentStore) UpdateBaseExtentID(id uint64) (err error) {
if IsTinyExtent(id) {
return
}
if id > atomic.LoadUint64(&s.baseExtentID) {
atomic.StoreUint64(&s.baseExtentID, id)
err = s.PersistenceBaseExtentID(atomic.LoadUint64(&s.baseExtentID))
}
s.PreAllocSpaceOnVerfiyFile(atomic.LoadUint64(&s.baseExtentID))
return
}
func (s *ExtentStore) extent(extentID uint64) (e *Extent, err error) {
if e, err = s.LoadExtentFromDisk(extentID, false); err != nil {
err = fmt.Errorf("load extent from disk: %v", err)
return nil, err
}
return
}
func (s *ExtentStore) extentWithHeader(ei *ExtentInfo) (e *Extent, err error) {
var ok bool
if ei == nil || ei.IsDeleted {
err = ExtentNotFoundError
return
}
if e, ok = s.cache.Get(ei.FileID); !ok {
if e, err = s.LoadExtentFromDisk(ei.FileID, true); err != nil {
err = fmt.Errorf("load %v from disk: %v", s.getExtentKey(ei.FileID), err)
return nil, err
}
}
return
}
func (s *ExtentStore) extentWithHeaderByExtentID(extentID uint64) (e *Extent, err error) {
var ok bool
if e, ok = s.cache.Get(extentID); !ok {
if e, err = s.LoadExtentFromDisk(extentID, true); err != nil {
err = fmt.Errorf("load %v from disk: %v", s.getExtentKey(extentID), err)
return nil, err
}
}
return
}
// HasExtent tells if the extent store has the extent with the given ID
func (s *ExtentStore) HasExtent(extentID uint64) (exist bool) {
s.eiMutex.RLock()
defer s.eiMutex.RUnlock()
_, exist = s.extentInfoMap[extentID]
return
}
// GetExtentCount returns the number of extents in the extentInfoMap
func (s *ExtentStore) GetExtentCount() (count int) {
s.eiMutex.RLock()
defer s.eiMutex.RUnlock()
return len(s.extentInfoMap)
}
func (s *ExtentStore) LoadExtentFromDisk(extentID uint64, putCache bool) (e *Extent, err error) {
name := path.Join(s.dataPath, fmt.Sprintf("%v", extentID))
e = NewExtentInCore(name, extentID)
if err = e.RestoreFromFS(); err != nil {
err = fmt.Errorf("restore from file %v putCache %v system: %v", name, putCache, err)
return
}
if !putCache {
return
}
if !IsTinyExtent(extentID) && proto.IsNormalDp(s.partitionType) {
e.header = make([]byte, util.BlockHeaderSize)
if _, err = s.verifyExtentFp.ReadAt(e.header, int64(extentID*util.BlockHeaderSize)); err != nil && err != io.EOF {
return
}
emptyHeader := make([]byte, util.BlockHeaderSize)
log.LogDebugf("LoadExtentFromDisk. partition id %v extentId %v, snapshotOff %v, append fp cnt %v",
s.partitionID, extentID, e.snapshotDataOff, len(s.verifyExtentFpAppend))
if e.snapshotDataOff > util.ExtentSize {
for id, vFp := range s.verifyExtentFpAppend {
if uint64(id) > (e.snapshotDataOff-util.ExtentSize)/util.ExtentSize {
log.LogDebugf("LoadExtentFromDisk. partition id %v extentId %v, snapshotOff %v id %v out of extent range",
s.partitionID, extentID, e.snapshotDataOff, id)
break
}
log.LogDebugf("LoadExtentFromDisk. partition id %v extentId %v, snapshotOff %v id %v", s.partitionID, extentID, e.snapshotDataOff, id)
header := make([]byte, util.BlockHeaderSize)
if _, err = vFp.ReadAt(header, int64(extentID*util.BlockHeaderSize)); err != nil && err != io.EOF {
log.LogDebugf("LoadExtentFromDisk. partition id %v extentId %v, read at %v err %v",
s.partitionID, extentID, extentID*util.BlockHeaderSize, err)
return
}
if bytes.Equal(emptyHeader, header) {
log.LogErrorf("LoadExtentFromDisk. partition id %v extent %v hole at id %v", s.partitionID, e, id)
}
e.header = append(e.header, header...)
}
if len(s.verifyExtentFpAppend) < int(e.snapshotDataOff-1)/util.ExtentSize {
log.LogErrorf("LoadExtentFromDisk. extent %v need fp %v out of range %v", e, int(e.snapshotDataOff-1)/util.ExtentSize, len(s.verifyExtentFpAppend))
}
}
}
err = nil
s.cache.Put(e)
return
}
func (s *ExtentStore) ScanBlocks(extentID uint64) (bcs []*BlockCrc, err error) {
if !proto.IsNormalDp(s.partitionType) {
return
}
var blockCnt int
bcs = make([]*BlockCrc, 0)
ei := s.extentInfoMap[extentID]
e, err := s.extentWithHeader(ei)
if err != nil {
return bcs, err
}
extSize := e.Size()
if e.snapshotDataOff > util.ExtentSize {
extSize = int64(e.snapshotDataOff)
}
blockCnt = int(extSize / util.BlockSize)
if e.Size()%util.BlockSize != 0 {
blockCnt += 1
}
for blockNo := 0; blockNo < blockCnt; blockNo++ {
blockCrc := binary.BigEndian.Uint32(e.header[blockNo*util.PerBlockCrcSize : (blockNo+1)*util.PerBlockCrcSize])
bcs = append(bcs, &BlockCrc{BlockNo: blockNo, Crc: blockCrc})
}
sort.Sort(BlockCrcArr(bcs))
return
}
type ExtentInfoArr []*ExtentInfo
func (arr ExtentInfoArr) Len() int { return len(arr) }
func (arr ExtentInfoArr) Less(i, j int) bool { return arr[i].FileID < arr[j].FileID }
func (arr ExtentInfoArr) Swap(i, j int) { arr[i], arr[j] = arr[j], arr[i] }
func (s *ExtentStore) BackendTask() {
s.autoComputeExtentCrc()
s.cleanExpiredNormalExtentDeleteCache()
}
func (s *ExtentStore) cleanExpiredNormalExtentDeleteCache() {
s.hasDeleteNormalExtentsCache.Range(func(key, value interface{}) bool {
deleteTime := value.(int64)
extentID := key.(uint64)
if time.Now().Unix()-deleteTime > NormalExtentDeleteRetainTime {
s.hasDeleteNormalExtentsCache.Delete(extentID)
}
return true
})
}
func (s *ExtentStore) autoComputeExtentCrc() {
if !proto.IsNormalDp(s.partitionType) {
return
}
defer func() {
if r := recover(); r != nil {
return
}
}()
extentInfos := make([]*ExtentInfo, 0)
deleteExtents := make([]*ExtentInfo, 0)
s.eiMutex.RLock()
for _, ei := range s.extentInfoMap {
extentInfos = append(extentInfos, ei)
if ei.IsDeleted && time.Now().Unix()-ei.ModifyTime > UpdateCrcInterval {
deleteExtents = append(deleteExtents, ei)
}
}
s.eiMutex.RUnlock()
if len(deleteExtents) > 0 {
s.eiMutex.Lock()
for _, ei := range deleteExtents {
delete(s.extentInfoMap, ei.FileID)
}
s.eiMutex.Unlock()
}
sort.Sort(ExtentInfoArr(extentInfos))
for _, ei := range extentInfos {
s.ApplyIdMutex.RLock()
if ei == nil {
s.ApplyIdMutex.RUnlock()
continue
}
if !IsTinyExtent(ei.FileID) && time.Now().Unix()-ei.ModifyTime > UpdateCrcInterval &&
!ei.IsDeleted && ei.Size > 0 && ei.Crc == 0 {
e, err := s.extentWithHeader(ei)
if err != nil {
log.LogError("[autoComputeExtentCrc] get extent error", err)
s.ApplyIdMutex.RUnlock()
continue
}
extentCrc, err := e.autoComputeExtentCrc(s.PersistenceBlockCrc)
if err != nil {
log.LogError("[autoComputeExtentCrc] compute crc fail", err)
s.ApplyIdMutex.RUnlock()
continue
}
ei.UpdateExtentInfo(e, extentCrc)
ei.ApplyID = s.ApplyId
time.Sleep(time.Millisecond * 100)
}
s.ApplyIdMutex.RUnlock()
}
time.Sleep(time.Second)
}
func (s *ExtentStore) TinyExtentRecover(extentID uint64, offset, size int64, data []byte, crc uint32, isEmptyPacket bool) (err error) {
if !IsTinyExtent(extentID) {
return fmt.Errorf("extent %v not tinyExtent", extentID)
}
var (
e *Extent
ei *ExtentInfo
)
s.eiMutex.RLock()
ei = s.extentInfoMap[extentID]
s.eiMutex.RUnlock()
if e, err = s.extentWithHeader(ei); err != nil {
return nil
}
if err = e.TinyExtentRecover(data, offset, size, crc, isEmptyPacket); err != nil {
return err
}
ei.UpdateExtentInfo(e, 0)
return nil
}
func (s *ExtentStore) TinyExtentGetFinfoSize(extentID uint64) (size uint64, err error) {
var e *Extent
if !IsTinyExtent(extentID) {
return 0, fmt.Errorf("unavali extent id (%v)", extentID)
}
s.eiMutex.RLock()
ei := s.extentInfoMap[extentID]
s.eiMutex.RUnlock()
if e, err = s.extentWithHeader(ei); err != nil {
return
}
finfo, err := e.file.Stat()
if err != nil {
return 0, err
}
size = uint64(finfo.Size())
return
}
func (s *ExtentStore) TinyExtentAvaliOffset(extentID uint64, offset int64) (newOffset, newEnd int64, err error) {
var e *Extent
if !IsTinyExtent(extentID) {
return 0, 0, fmt.Errorf("unavali extent(%v)", extentID)
}
s.eiMutex.RLock()
ei := s.extentInfoMap[extentID]
s.eiMutex.RUnlock()
if e, err = s.extentWithHeader(ei); err != nil {
return
}
defer func() {
if err != nil && strings.Contains(err.Error(), syscall.ENXIO.Error()) {
newOffset = e.dataSize
newEnd = e.dataSize
err = nil
}
}()
newOffset, newEnd, err = e.tinyExtentAvaliOffset(offset)
return
}
func (s *ExtentStore) renameStaleExtentStore() (err error) {
// Rename the current data folder to a timestamped backup so a fresh one can be created.
if _, err = os.Stat(s.dataPath); err != nil {
if os.IsNotExist(err) {
return nil
}
}
curTime := time.Now().Format(StaleExtStoreTimeFormat)
staleExtStoreDirName := s.dataPath + "_" + curTime + StaleExtStoreBackupSuffix
if err = os.Rename(s.dataPath, staleExtStoreDirName); err != nil {
return
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package storage
import (
"syscall"
"github.com/cubefs/cubefs/util"
)
func fallocate(fd int, mode uint32, off int64, len int64) (err error) {
var tryCnt int
for {
err = syscall.Fallocate(fd, mode, off, len)
if err == syscall.EINTR {
tryCnt++
if tryCnt >= util.SyscallTryMaxTimes {
return
}
continue
}
return
}
}
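// Example (illustrative sketch): the wrapper only retries Fallocate when it is
// interrupted by a signal (EINTR). Punching a hole in an already-open file, as
// DeleteBlockCrc in this package does, would look like (fp is an *os.File opened
// elsewhere; offset and length are illustrative):
//
//	err := fallocate(int(fp.Fd()), util.FallocFLPunchHole|util.FallocFLKeepSize, 0, 4096)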
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package storage
import (
"encoding/binary"
"io"
"os"
"path"
"strconv"
"sync/atomic"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
)
type BlockCrc struct {
BlockNo int
Crc uint32
}
type BlockCrcArr []*BlockCrc
const (
BaseExtentIDOffset = 0
)
func (arr BlockCrcArr) Len() int { return len(arr) }
func (arr BlockCrcArr) Less(i, j int) bool { return arr[i].BlockNo < arr[j].BlockNo }
func (arr BlockCrcArr) Swap(i, j int) { arr[i], arr[j] = arr[j], arr[i] }
type (
UpdateCrcFunc func(e *Extent, blockNo int, crc uint32) (err error)
GetExtentCrcFunc func(extentID uint64) (crc uint32, err error)
)
func (s *ExtentStore) BuildSnapshotExtentCrcMetaFile(blockNo int) (fp *os.File, err error) {
fIdx := blockNo * util.PerBlockCrcSize / util.BlockHeaderSize
if fIdx > 0 {
gap := fIdx - len(s.verifyExtentFpAppend)
log.LogDebugf("PersistenceBlockCrc. idx %v gap %v", fIdx, gap)
if gap > 0 {
appendFpArr := make([]*os.File, fIdx-len(s.verifyExtentFpAppend))
s.verifyExtentFpAppend = append(s.verifyExtentFpAppend, appendFpArr...)
for i := gap; i > 0; i-- {
suffix := fIdx - i
dataPath := path.Join(s.dataPath, ExtCrcHeaderFileName+"_"+strconv.Itoa(suffix))
log.LogDebugf("PersistenceBlockCrc. idx %v try create path %v", fIdx-1, dataPath)
if fp, err = os.OpenFile(dataPath, os.O_CREATE|os.O_RDWR, 0o666); err != nil {
log.LogDebugf("PersistenceBlockCrc. idx %v try create path %v err %v", fIdx, dataPath, err)
return
}
log.LogDebugf("PersistenceBlockCrc. idx %v try create path %v success", fIdx, dataPath)
s.verifyExtentFpAppend[suffix] = fp
s.PreAllocSpaceOnVerfiyFileForAppend(suffix)
}
}
if s.verifyExtentFpAppend[fIdx-1] == nil {
dataPath := path.Join(s.dataPath, ExtCrcHeaderFileName+"_"+strconv.Itoa(fIdx-1))
if fp, err = os.OpenFile(dataPath, os.O_CREATE|os.O_RDWR, 0o666); err != nil {
return
}
s.verifyExtentFpAppend[fIdx-1] = fp
}
fp = s.verifyExtentFpAppend[fIdx-1]
}
return
}
func (s *ExtentStore) PersistenceBlockCrc(e *Extent, blockNo int, blockCrc uint32) (err error) {
log.LogDebugf("PersistenceBlockCrc. extent id %v blockNo %v blockCrc %v data path %v", e.extentID, blockNo, blockCrc, s.dataPath)
if !proto.IsNormalDp(s.partitionType) {
return
}
if blockNo >= len(e.header)/util.PerBlockCrcSize {
exp := make([]byte, util.BlockHeaderSize*(1+(blockNo*util.PerBlockCrcSize-len(e.header))/util.BlockHeaderSize))
e.header = append(e.header, exp...)
}
fIdx := blockNo * util.PerBlockCrcSize / util.BlockHeaderSize
log.LogDebugf("PersistenceBlockCrc. idx %v", fIdx)
fp := s.verifyExtentFp
if fIdx > 0 {
if fp, err = s.BuildSnapshotExtentCrcMetaFile(blockNo); err != nil {
return
}
}
startIdx := blockNo * util.PerBlockCrcSize % util.BlockHeaderSize
verifyStart := startIdx + int(util.BlockHeaderSize*e.extentID)
log.LogDebugf("PersistenceBlockCrc. dp %v write at start %v name %v", s.partitionID, startIdx, fp.Name())
headerOff := blockNo*util.PerBlockCrcSize%util.BlockHeaderSize + fIdx*util.BlockHeaderSize
headerEnd := startIdx + util.PerBlockCrcSize%util.BlockHeaderSize + fIdx*util.BlockHeaderSize
binary.BigEndian.PutUint32(e.header[headerOff:headerEnd], blockCrc)
if _, err = fp.WriteAt(e.header[headerOff:headerEnd], int64(verifyStart)); err != nil {
return
}
return
}
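// Worked example (illustrative, using hypothetical sizes PerBlockCrcSize=4 and
// BlockHeaderSize=4096, i.e. 1024 block CRCs per header segment): for blockNo=1030,
//
//	fIdx      = 1030*4/4096          = 1    // CRC lives in the first append verify file
//	startIdx  = 1030*4%4096          = 24   // byte offset within that segment
//	headerOff = startIdx + fIdx*4096 = 4120 // byte offset within e.header
//
// and the 4-byte CRC is written to the selected verify file at offset
// startIdx + util.BlockHeaderSize*e.extentID.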
func (s *ExtentStore) DeleteBlockCrc(extentID uint64) (err error) {
if !proto.IsNormalDp(s.partitionType) {
return
}
if err = fallocate(int(s.verifyExtentFp.Fd()), util.FallocFLPunchHole|util.FallocFLKeepSize,
int64(util.BlockHeaderSize*extentID), util.BlockHeaderSize); err != nil {
return
}
for idx, fp := range s.verifyExtentFpAppend {
if fp == nil {
log.LogErrorf("DeleteBlockCrc. idx %v append fp is nil", idx)
return
}
log.LogDebugf("DeleteBlockCrc. dp %v idx %v extentID %v offset %v", s.partitionID, idx, extentID, int64(util.BlockHeaderSize*extentID))
if err = fallocate(int(fp.Fd()), util.FallocFLPunchHole|util.FallocFLKeepSize,
int64(util.BlockHeaderSize*extentID), util.BlockHeaderSize); err != nil {
return
}
}
return
}
func (s *ExtentStore) PersistenceBaseExtentID(extentID uint64) (err error) {
value := make([]byte, 8)
binary.BigEndian.PutUint64(value, extentID)
_, err = s.metadataFp.WriteAt(value, BaseExtentIDOffset)
return
}
func (s *ExtentStore) GetPreAllocSpaceExtentIDOnVerifyFile() (extentID uint64) {
value := make([]byte, 8)
_, err := s.metadataFp.ReadAt(value, 8)
if err != nil {
return
}
extentID = binary.BigEndian.Uint64(value)
return
}
func (s *ExtentStore) PreAllocSpaceOnVerfiyFileForAppend(idx int) {
if !proto.IsNormalDp(s.partitionType) {
return
}
log.LogDebugf("PreAllocSpaceOnVerfiyFileForAppend. idx %v end %v", idx, len(s.verifyExtentFpAppend))
if idx >= len(s.verifyExtentFpAppend) {
log.LogErrorf("PreAllocSpaceOnVerfiyFileForAppend. idx %v end %v", idx, len(s.verifyExtentFpAppend))
return
}
prevAllocSpaceExtentID := int64(atomic.LoadUint64(&s.hasAllocSpaceExtentIDOnVerfiyFile))
log.LogDebugf("PreAllocSpaceOnVerfiyFileForAppend. idx %v size %v", idx, prevAllocSpaceExtentID*util.BlockHeaderSize)
err := fallocate(int(s.verifyExtentFpAppend[idx].Fd()), 1, 0, prevAllocSpaceExtentID*util.BlockHeaderSize)
if err != nil {
log.LogErrorf("PreAllocSpaceOnVerfiyFileForAppend. idx %v size %v err %v", idx, prevAllocSpaceExtentID*util.BlockHeaderSize, err)
return
}
}
func (s *ExtentStore) PreAllocSpaceOnVerfiyFile(currExtentID uint64) {
if !proto.IsNormalDp(s.partitionType) {
return
}
if currExtentID > atomic.LoadUint64(&s.hasAllocSpaceExtentIDOnVerfiyFile) {
prevAllocSpaceExtentID := int64(atomic.LoadUint64(&s.hasAllocSpaceExtentIDOnVerfiyFile))
endAllocSpaceExtentID := int64(prevAllocSpaceExtentID + 1000)
size := int64(1000 * util.BlockHeaderSize)
err := fallocate(int(s.verifyExtentFp.Fd()), 1, prevAllocSpaceExtentID*util.BlockHeaderSize, size)
if err != nil {
return
}
for id, fp := range s.verifyExtentFpAppend {
stat, _ := fp.Stat()
log.LogDebugf("PreAllocSpaceOnVerfiyFile. id %v name %v size %v", id, fp.Name(), stat.Size())
err = fallocate(int(fp.Fd()), 1, prevAllocSpaceExtentID*util.BlockHeaderSize, size)
if err != nil {
log.LogErrorf("PreAllocSpaceOnVerfiyFile. id %v name %v err %v", id, fp.Name(), err)
return
}
}
data := make([]byte, 8)
binary.BigEndian.PutUint64(data, uint64(endAllocSpaceExtentID))
if _, err = s.metadataFp.WriteAt(data, 8); err != nil {
return
}
atomic.StoreUint64(&s.hasAllocSpaceExtentIDOnVerfiyFile, uint64(endAllocSpaceExtentID))
log.LogInfof("Action(PreAllocSpaceOnVerifyFile) PartitionID(%v) currentExtent(%v)"+
"PrevAllocSpaceExtentIDOnVerifyFile(%v) EndAllocSpaceExtentIDOnVerifyFile(%v)"+
" has allocSpaceOnVerifyFile to (%v)", s.partitionID, currExtentID, prevAllocSpaceExtentID, endAllocSpaceExtentID,
prevAllocSpaceExtentID*util.BlockHeaderSize+size)
}
}
func (s *ExtentStore) GetPersistenceBaseExtentID() (extentID uint64, err error) {
data := make([]byte, 8)
_, err = s.metadataFp.ReadAt(data, 0)
if err != nil {
return
}
extentID = binary.BigEndian.Uint64(data)
return
}
func (s *ExtentStore) PersistenceHasDeleteExtent(extentID uint64) (err error) {
data := make([]byte, 8)
binary.BigEndian.PutUint64(data, extentID)
if _, err = s.normalExtentDeleteFp.Write(data); err != nil {
return
}
return
}
func (s *ExtentStore) GetHasDeleteExtent() (extentDes []ExtentDeleted, err error) {
data := make([]byte, 8)
offset := int64(0)
for {
_, err = s.normalExtentDeleteFp.ReadAt(data, offset)
if err != nil {
if err == io.EOF {
err = nil
}
return
}
extent := ExtentDeleted{}
extent.ExtentID = binary.BigEndian.Uint64(data)
extentDes = append(extentDes, extent)
offset += 8
}
}
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package atomicutil
import (
"math"
"sync/atomic"
)
type Float64 struct {
val uint64
}
func (f *Float64) Load() float64 {
return math.Float64frombits(atomic.LoadUint64(&f.val))
}
func (f *Float64) Store(val float64) {
atomic.StoreUint64(&f.val, math.Float64bits(val))
}
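// Example (illustrative sketch): Float64 keeps the IEEE-754 bit pattern in a uint64 so
// that concurrent loads and stores need no mutex:
//
//	var ratio Float64
//	ratio.Store(0.75)
//	v := ratio.Load() // v == 0.75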
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package auditlog
import (
"bufio"
"encoding/json"
"errors"
"fmt"
"io/ioutil"
"net"
"net/http"
"os"
"path"
"path/filepath"
"regexp"
"sort"
"strconv"
"strings"
"sync"
"syscall"
"time"
"github.com/cubefs/cubefs/util/log"
)
const (
Audit_Module = "audit"
FileNameDateFormat = "20060102150405"
ShiftedExtension = ".old"
DefaultAuditLogBufSize = 0
F_OK = 0
DefaultCleanInterval = 1 * time.Hour
DefaultAuditLogSize = 200 * 1024 * 1024 // 200M
DefaultHeadRoom = 50 * 1024 // 50GB of head room; the value is in MB (see removeLogFile)
MaxReservedDays = 7 * 24 * time.Hour
)
const (
EnableAuditLogReqPath = "/auditlog/enable"
DisableAuditLogReqPath = "/auditlog/disable"
SetAuditLogBufSizeReqPath = "/auditlog/setbufsize"
)
const auditFullPathUnsupported = "(Audit full path unsupported)"
var DefaultTimeOutUs = [3]uint32{100000, 500000, 1000000}
type ShiftedFile []os.FileInfo
func (f ShiftedFile) Less(i, j int) bool {
return f[i].ModTime().Before(f[j].ModTime())
}
func (f ShiftedFile) Len() int {
return len(f)
}
func (f ShiftedFile) Swap(i, j int) {
f[i], f[j] = f[j], f[i]
}
//type typeInfo struct {
// typeName string
// allCount uint32
// failCount uint32
// maxTime time.Duration
// minTime time.Duration
// allTimeUs time.Duration
// timeOut [MaxTimeoutLevel]uint32
//}
type AuditPrefix struct {
prefixes []string
}
func NewAuditPrefix(p ...string) *AuditPrefix {
return &AuditPrefix{
prefixes: p,
}
}
func (a *AuditPrefix) String() string {
builder := strings.Builder{}
for _, p := range a.prefixes {
builder.WriteString(p)
builder.WriteString(", ")
}
return builder.String()
}
type Audit struct {
hostName string
ipAddr string
logDir string
logModule string
logMaxSize int64
logFileName string
logFile *os.File
writer *bufio.Writer
writerBufSize int
prefix *AuditPrefix
bufferC chan string
stopC chan struct{}
resetWriterBuffC chan int
pid int
lock sync.Mutex
}
var (
gAdt *Audit = nil
gAdtMutex sync.RWMutex
)
func getAddr() (HostName, IPAddr string) {
hostName, err := os.Hostname()
if err != nil {
HostName = "Unknown"
log.LogWarnf("Get host name failed, replaced by unknown. err(%v)", err)
} else {
HostName = hostName
}
addrs, err := net.InterfaceAddrs()
if err != nil {
IPAddr = "Unknown"
log.LogWarnf("Get ip address failed, replaced by unknown. err(%v)", err)
} else {
var ip_addrs []string
for _, addr := range addrs {
if ipnet, ok := addr.(*net.IPNet); ok && !ipnet.IP.IsLoopback() && ipnet.IP.To4() != nil {
ip_addrs = append(ip_addrs, ipnet.IP.String())
}
}
if len(ip_addrs) > 0 {
IPAddr = strings.Join(ip_addrs, ",")
} else {
IPAddr = "Unknown"
log.LogWarnf("Get ip address failed, replaced by unknown. err(%v)", err)
}
}
return
}
// NOTE: for client http apis
func ResetWriterBuffSize(w http.ResponseWriter, r *http.Request) {
var err error
if err = r.ParseForm(); err != nil {
BuildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
size := int(DefaultAuditLogBufSize)
if sizeStr := r.FormValue("size"); sizeStr != "" {
val, err := strconv.Atoi(sizeStr)
if err != nil {
err = fmt.Errorf("size error")
BuildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
size = val
}
ResetWriterBufferSize(size)
BuildSuccessResp(w, "set audit log buffer size success")
}
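// Example (illustrative sketch): assuming this handler is registered at
// SetAuditLogBufSizeReqPath ("/auditlog/setbufsize") on a node's HTTP server,
// the buffer size can be changed with a plain GET; the host and port below are
// hypothetical:
//
//	resp, err := http.Get("http://127.0.0.1:17010/auditlog/setbufsize?size=65536")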
func DisableAuditLog(w http.ResponseWriter, r *http.Request) {
StopAudit()
BuildSuccessResp(w, "disable audit log success")
}
func BuildSuccessResp(w http.ResponseWriter, data interface{}) {
buildJSONResp(w, http.StatusOK, data, "")
}
func BuildFailureResp(w http.ResponseWriter, code int, msg string) {
buildJSONResp(w, code, nil, msg)
}
// Create response for the API request.
func buildJSONResp(w http.ResponseWriter, code int, data interface{}, msg string) {
var (
jsonBody []byte
err error
)
// Set headers before WriteHeader; header changes made after WriteHeader are ignored by net/http.
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(code)
body := struct {
Code int `json:"code"`
Data interface{} `json:"data"`
Msg string `json:"msg"`
}{
Code: code,
Data: data,
Msg: msg,
}
if jsonBody, err = json.Marshal(body); err != nil {
return
}
w.Write(jsonBody)
}
func (a *Audit) GetInfo() (dir, logModule string, logMaxSize int64) {
return a.logDir, a.logModule, a.logMaxSize
}
func NewAuditWithPrefix(dir, logModule string, logMaxSize int64, prefix *AuditPrefix) (a *Audit, err error) {
a, err = NewAudit(dir, logModule, logMaxSize)
if err != nil {
return nil, err
}
a.prefix = prefix
return a, nil
}
func NewAudit(dir, logModule string, logMaxSize int64) (*Audit, error) {
absPath, err := filepath.Abs(dir)
if err != nil {
return nil, err
}
host, ip := getAddr()
absPath = path.Join(absPath, logModule)
if !isPathSafe(absPath) {
return nil, errors.New("invalid file path")
}
fi, err := os.Stat(absPath)
if err != nil {
os.MkdirAll(absPath, 0o755)
} else {
if !fi.IsDir() {
return nil, errors.New(absPath + " is not a directory")
}
}
_ = os.Chmod(absPath, 0o755)
logName := path.Join(absPath, Audit_Module) + ".log"
audit := &Audit{
hostName: host,
ipAddr: ip,
logDir: absPath,
logModule: logModule,
logMaxSize: logMaxSize,
logFileName: logName,
writerBufSize: DefaultAuditLogBufSize,
bufferC: make(chan string, 1000),
prefix: nil,
stopC: make(chan struct{}),
resetWriterBuffC: make(chan int),
pid: os.Getpid(),
}
err = audit.newWriterSize(audit.writerBufSize)
if err != nil {
return nil, err
}
go audit.flushAuditLog()
return audit, nil
}
// NOTE:
// common header:
// [PREFIX] CURRENT_TIME TIME_ZONE
// format for client:
// [COMMON HEADER] IP_ADDR HOSTNAME OP SRC DST(Rename) ERR LATENCY SRC_INODE DST_INODE(Rename)
// format for server(inode):
// [COMMON HEADER] CLIENT_ADDR VOLUME OP ("nil") FULL_PATH ERR LATENCY INODE FILE_SIZE(Trunc)
// format for server(dentry):
// [COMMON HEADER] CLIENT_ADDR VOLUME OP NAME FULL_PATH ERR LATENCY INODE PARENT_INODE
// format for server(transaction):
// [COMMON HEADER] CLIENT_ADDR VOLUME OP TX_ID ("nil") ERR LATENCY TM_ID (0)
func (a *Audit) formatAuditEntry(ipAddr, hostName, op, src, dst string, err error, latency int64, srcInode, dstInode uint64) (entry string) {
var errStr string
if err != nil {
errStr = err.Error()
} else {
errStr = "nil"
}
curTime := time.Now()
curTimeStr := curTime.Format("2006-01-02 15:04:05")
timeZone, _ := curTime.Zone()
latencyStr := strconv.FormatInt(latency, 10) + " us"
srcInodeStr := strconv.FormatUint(srcInode, 10)
dstInodeStr := strconv.FormatUint(dstInode, 10)
entry = fmt.Sprintf("%s %s, %s, %s, %s, %s, %s, %s, %s, %s, %s",
curTimeStr, timeZone, ipAddr, hostName, op, src, dst, errStr, latencyStr, srcInodeStr, dstInodeStr)
return
}
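// Example (illustrative sketch): with hypothetical values plugged into the format
// string above, a client-side entry would look like:
//
//	2006-01-02 15:04:05 CST, 192.168.0.1, host-1, Rename, /a/b, /a/c, nil, 120 us, 1001, 1002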
func (a *Audit) LogClientOp(op, src, dst string, err error, latency int64, srcInode, dstInode uint64) {
a.formatLog(a.ipAddr, a.hostName, op, src, dst, err, latency, srcInode, dstInode)
}
func (a *Audit) LogDentryOp(clientAddr, volume, op, name, fullPath string, err error, latency int64, ino, parentIno uint64) {
if fullPath == "" {
fullPath = auditFullPathUnsupported
}
a.formatLog(clientAddr, volume, op, name, fullPath, err, latency, ino, parentIno)
}
func (a *Audit) LogInodeOp(clientAddr, volume, op, fullPath string, err error, latency int64, ino uint64, fileSize uint64) {
if fullPath == "" {
fullPath = auditFullPathUnsupported
}
a.formatLog(clientAddr, volume, op, "nil", fullPath, err, latency, ino, fileSize)
}
func (a *Audit) LogTxOp(clientAddr, volume, op, txId string, err error, latency int64) {
a.formatLog(clientAddr, volume, op, txId, "nil", err, latency, 0, 0)
}
func (a *Audit) formatLog(ipAddr, hostName, op, src, dst string, err error, latency int64, srcInode, dstInode uint64) {
if entry := a.formatAuditEntry(ipAddr, hostName, op, src, dst, err, latency, srcInode, dstInode); entry != "" {
if a.prefix != nil {
entry = fmt.Sprintf("%s%s", a.prefix.String(), entry)
}
a.AddLog(entry)
}
}
func (a *Audit) ResetWriterBufferSize(size int) {
a.lock.Lock()
defer a.lock.Unlock()
a.resetWriterBuffC <- size
}
func (a *Audit) AddLog(content string) {
a.lock.Lock()
defer a.lock.Unlock()
select {
case a.bufferC <- content:
return
default:
log.LogErrorf("async audit log failed, audit:[%s]", content)
}
}
// NOTE: global functions
func GetAuditLogInfo() (dir, logModule string, logMaxSize int64, err error) {
gAdtMutex.RLock()
defer gAdtMutex.RUnlock()
if gAdt != nil {
dir, logModule, logMaxSize = gAdt.GetInfo()
return
} else {
return "", "", 0, errors.New("audit log is not initialized yet")
}
}
func InitAuditWithPrefix(dir, logModule string, logMaxSize int64, prefix *AuditPrefix) (a *Audit, err error) {
a, err = InitAudit(dir, logModule, logMaxSize)
if err != nil {
return nil, err
}
a.prefix = prefix
return a, nil
}
func InitAudit(dir, logModule string, logMaxSize int64) (*Audit, error) {
gAdtMutex.Lock()
defer gAdtMutex.Unlock()
if gAdt == nil {
adt, err := NewAudit(dir, logModule, logMaxSize)
if err != nil {
return nil, err
}
gAdt = adt
}
return gAdt, nil
}
func LogClientOp(op, src, dst string, err error, latency int64, srcInode, dstInode uint64) {
gAdtMutex.RLock()
defer gAdtMutex.RUnlock()
if gAdt == nil {
return
}
gAdt.LogClientOp(op, src, dst, err, latency, srcInode, dstInode)
}
func LogDentryOp(clientAddr, volume, op, name, fullPath string, err error, latency int64, ino, parentIno uint64) {
gAdtMutex.RLock()
defer gAdtMutex.RUnlock()
if gAdt == nil {
return
}
gAdt.LogDentryOp(clientAddr, volume, op, name, fullPath, err, latency, ino, parentIno)
}
func LogInodeOp(clientAddr, volume, op, fullPath string, err error, latency int64, ino uint64, fileSize uint64) {
gAdtMutex.RLock()
defer gAdtMutex.RUnlock()
if gAdt == nil {
return
}
gAdt.LogInodeOp(clientAddr, volume, op, fullPath, err, latency, ino, fileSize)
}
func LogTxOp(clientAddr, volume, op, txId string, err error, latency int64) {
gAdtMutex.RLock()
defer gAdtMutex.RUnlock()
if gAdt == nil {
return
}
gAdt.LogTxOp(clientAddr, volume, op, txId, err, latency)
}
func ResetWriterBufferSize(size int) {
gAdtMutex.Lock()
defer gAdtMutex.Unlock()
if gAdt == nil {
return
}
gAdt.ResetWriterBufferSize(size)
}
func AddLog(content string) {
gAdtMutex.Lock()
defer gAdtMutex.Unlock()
if gAdt == nil {
return
}
gAdt.AddLog(content)
}
func StopAudit() {
gAdtMutex.Lock()
defer gAdtMutex.Unlock()
if gAdt == nil {
return
}
gAdt.Stop()
gAdt = nil
}
// NOTE: implementation details
func (a *Audit) flushAuditLog() {
cleanTimer := time.NewTimer(DefaultCleanInterval)
for {
select {
case <-a.stopC:
return
case bufSize := <-a.resetWriterBuffC:
a.writerBufSize = bufSize
a.newWriterSize(bufSize)
case aLog := <-a.bufferC:
a.logAudit(aLog)
case <-cleanTimer.C:
a.removeLogFile()
cleanTimer.Reset(DefaultCleanInterval)
}
}
}
func (a *Audit) newWriterSize(size int) error {
a.writerBufSize = size
if a.writer != nil {
a.writer.Flush()
}
if a.logFile == nil {
logFile, err := os.OpenFile(a.logFileName, os.O_RDWR|os.O_APPEND|os.O_CREATE, 0o666)
if err != nil {
log.LogErrorf("newWriterSize failed, logFileName: %s, err: %v\n", a.logFileName, err)
return fmt.Errorf("OpenLogFile failed, logFileName %s", a.logFileName)
}
a.logFile = logFile
if size <= 0 {
log.LogDebugf("newWriterSize : buffer for logFileName: %s is disabled", a.logFileName)
a.writer = bufio.NewWriter(logFile)
} else {
a.writer = bufio.NewWriterSize(logFile, size)
}
} else {
_, err := a.logFile.Stat()
if err == nil {
if size <= 0 {
log.LogErrorf("newWriterSize : buffer for logFileName is disabled")
a.writer = bufio.NewWriter(a.logFile)
} else {
a.writer = bufio.NewWriterSize(a.logFile, size)
}
} else {
a.logFile.Close()
a.logFile = nil
return a.newWriterSize(size)
}
}
return nil
}
func (a *Audit) removeLogFile() {
fs := syscall.Statfs_t{}
if err := syscall.Statfs(a.logDir, &fs); err != nil {
log.LogErrorf("Get fs stat failed, err: %v", err)
return
}
diskSpaceLeft := int64(fs.Bavail * uint64(fs.Bsize))
diskSpaceLeft -= DefaultHeadRoom * 1024 * 1024
fInfos, err := ioutil.ReadDir(a.logDir)
if err != nil {
log.LogErrorf("ReadDir failed, logDir: %s, err: %v", a.logDir, err)
return
}
var needDelFiles ShiftedFile
for _, info := range fInfos {
if a.shouldDelete(info, diskSpaceLeft, Audit_Module) {
needDelFiles = append(needDelFiles, info)
}
}
sort.Sort(needDelFiles)
for _, info := range needDelFiles {
if err = os.Remove(path.Join(a.logDir, info.Name())); err != nil {
log.LogErrorf("Remove log file failed, logFileName: %s, err: %v", info.Name(), err)
continue
}
diskSpaceLeft += info.Size()
if diskSpaceLeft > 0 && time.Since(info.ModTime()) < MaxReservedDays {
break
}
}
}
func (a *Audit) shouldDelete(info os.FileInfo, diskSpaceLeft int64, module string) bool {
isOldAuditLogFile := info.Mode().IsRegular() && strings.HasSuffix(info.Name(), ShiftedExtension) && strings.HasPrefix(info.Name(), module)
if diskSpaceLeft <= 0 {
return isOldAuditLogFile
}
return time.Since(info.ModTime()) > MaxReservedDays && isOldAuditLogFile
}
func (a *Audit) Stop() {
a.lock.Lock()
defer a.lock.Unlock()
close(a.stopC)
a.writer.Flush()
a.logFile.Close()
}
func (a *Audit) logAudit(content string) error {
a.shiftFiles()
fmt.Fprintf(a.writer, "%s\n", content)
if a.writerBufSize <= 0 {
a.writer.Flush()
}
return nil
}
func (a *Audit) shiftFiles() error {
fileInfo, err := os.Stat(a.logFileName)
if err != nil {
return err
}
if fileInfo.Size() < a.logMaxSize {
return nil
}
if syscall.Access(a.logFileName, F_OK) == nil {
logNewFileName := a.logFileName + "." + time.Now().Format(FileNameDateFormat) + ShiftedExtension
a.writer.Flush()
a.logFile.Close()
a.writer = nil
a.logFile = nil
if err = os.Rename(a.logFileName, logNewFileName); err != nil {
log.LogErrorf("RenameFile failed, logFileName: %s, logNewFileName: %s, err: %v\n",
a.logFileName, logNewFileName, err)
return fmt.Errorf("action[shiftFiles] renameFile failed, logFileName %s, logNewFileName %s",
a.logFileName, logNewFileName)
}
}
// NOTE: try to recycle space when shift file
a.removeLogFile()
return a.newWriterSize(a.writerBufSize)
}
func isPathSafe(filePath string) bool {
safePattern := `^[a-zA-Z0-9\-_/]+$`
match, _ := regexp.MatchString(safePattern, filePath)
return match
}
// Copyright 2014 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package btree implements in-memory B-Trees of arbitrary degree.
//
// btree implements an in-memory B-Tree for use as an ordered data structure.
// It is not meant for persistent storage solutions.
//
// It has a flatter structure than an equivalent red-black or other binary tree,
// which in some cases yields better memory usage and/or performance.
// See some discussion on the matter here:
// http://google-opensource.blogspot.com/2013/01/c-containers-that-save-memory-and-time.html
// Note, though, that this project is in no way related to the C++ B-Tree
// implementation written about there.
//
// Within this tree, each node contains a slice of items and a (possibly nil)
// slice of children. For basic numeric values or raw structs, this can cause
// efficiency differences when compared to equivalent C++ template code that
// stores values in arrays within the node:
// * Due to the overhead of storing values as interfaces (each
// value needs to be stored as the value itself, then 2 words for the
// interface pointing to that value and its type), resulting in higher
// memory use.
// * Since interfaces can point to values anywhere in memory, values are
// most likely not stored in contiguous blocks, resulting in a higher
// number of cache misses.
// These issues don't tend to matter, though, when working with strings or other
// heap-allocated structures, since C++-equivalent structures also must store
// pointers and also distribute their values across the heap.
//
// This implementation is designed to be a drop-in replacement to gollrb.LLRB
// trees, (http://github.com/petar/gollrb), an excellent and probably the most
// widely used ordered tree implementation in the Go ecosystem currently.
// Its functions, therefore, exactly mirror those of
// llrb.LLRB where possible. Unlike gollrb, though, we currently don't
// support storing multiple equivalent values.
package btree
import (
"fmt"
"io"
"sort"
"strings"
"sync"
)
// Item represents a single object in the tree.
type Item interface {
// Less tests whether the current item is less than the given argument.
//
// This must provide a strict weak ordering.
// If !a.Less(b) && !b.Less(a), we treat this to mean a == b (i.e. we can only
// hold one of either a or b in the tree).
Less(than Item) bool
Copy() Item
}
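// Example (illustrative sketch): a minimal Item implementation. The Int type below is
// not part of this package; it only shows the expected behavior of Less and Copy for a
// simple value type:
//
//	type Int int
//
//	func (a Int) Less(b Item) bool { return a < b.(Int) }
//	func (a Int) Copy() Item       { return a }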
const (
DefaultFreeListSize = 32
)
var (
nilItems = make(items, 16)
nilChildren = make(children, 16)
)
// FreeList represents a free list of btree nodes. By default each
// BTree has its own FreeList, but multiple BTrees can share the same
// FreeList.
// Two Btrees using the same freelist are safe for concurrent write access.
type FreeList struct {
mu sync.Mutex
freelist []*node
}
// NewFreeList creates a new free list.
// size is the maximum size of the returned free list.
func NewFreeList(size int) *FreeList {
return &FreeList{freelist: make([]*node, 0, size)}
}
func (f *FreeList) newNode() (n *node) {
f.mu.Lock()
index := len(f.freelist) - 1
if index < 0 {
f.mu.Unlock()
return new(node)
}
n = f.freelist[index]
f.freelist[index] = nil
f.freelist = f.freelist[:index]
f.mu.Unlock()
return
}
// freeNode adds the given node to the list, returning true if it was added
// and false if it was discarded.
func (f *FreeList) freeNode(n *node) (out bool) {
f.mu.Lock()
if len(f.freelist) < cap(f.freelist) {
f.freelist = append(f.freelist, n)
out = true
}
f.mu.Unlock()
return
}
// ItemIterator allows callers of Ascend* to iterate in-order over portions of
// the tree. When this function returns false, iteration will stop and the
// associated Ascend* function will immediately return.
type ItemIterator func(i Item) bool
// New creates a new B-Tree with the given degree.
//
// New(2), for example, will create a 2-3-4 tree (each node contains 1-3 items
// and 2-4 children).
func New(degree int) *BTree {
return NewWithFreeList(degree, NewFreeList(DefaultFreeListSize))
}
func NewWithSize(degree, initSize int) *BTree {
return NewWithFreeList(degree, NewFreeList(initSize))
}
// NewWithFreeList creates a new B-Tree that uses the given node free list.
func NewWithFreeList(degree int, f *FreeList) *BTree {
if degree <= 1 {
panic("bad degree")
}
return &BTree{
degree: degree,
cow: &copyOnWriteContext{freelist: f},
}
}
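// Example (illustrative sketch): two trees sharing a single FreeList, which the FreeList
// documentation above states is safe for concurrent write access; the degree is arbitrary:
//
//	fl := NewFreeList(DefaultFreeListSize)
//	t1 := NewWithFreeList(8, fl)
//	t2 := NewWithFreeList(8, fl)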
// items stores items in a node.
type items []Item
// insertAt inserts a value into the given index, pushing all subsequent values
// forward.
func (s *items) insertAt(index int, item Item) {
*s = append(*s, nil)
if index < len(*s) {
copy((*s)[index+1:], (*s)[index:])
}
(*s)[index] = item
}
// removeAt removes a value at a given index, pulling all subsequent values
// back.
func (s *items) removeAt(index int) Item {
item := (*s)[index]
copy((*s)[index:], (*s)[index+1:])
(*s)[len(*s)-1] = nil
*s = (*s)[:len(*s)-1]
return item
}
// pop removes and returns the last element in the list.
func (s *items) pop() (out Item) {
index := len(*s) - 1
out = (*s)[index]
(*s)[index] = nil
*s = (*s)[:index]
return
}
// copy returns a new items slice containing a deep copy of every element.
func (s *items) copy() items {
nItems := make(items, 0, len(*s))
for _, v := range *s {
nItems = append(nItems, v.Copy())
}
return nItems
}
// truncate truncates this instance at index so that it contains only the
// first index items. index must be less than or equal to length.
func (s *items) truncate(index int) {
var toClear items
*s, toClear = (*s)[:index], (*s)[index:]
for len(toClear) > 0 {
toClear = toClear[copy(toClear, nilItems):]
}
}
// find returns the index where the given item should be inserted into this
// list. 'found' is true if the item already exists in the list at the given
// index.
func (s items) find(item Item) (index int, found bool) {
i := sort.Search(len(s), func(i int) bool {
return item.Less(s[i])
})
if i > 0 && !s[i-1].Less(item) {
return i - 1, true
}
return i, false
}
// children stores child nodes in a node.
type children []*node
// insertAt inserts a value into the given index, pushing all subsequent values
// forward.
func (s *children) insertAt(index int, n *node) {
*s = append(*s, nil)
if index < len(*s) {
copy((*s)[index+1:], (*s)[index:])
}
(*s)[index] = n
}
// removeAt removes a value at a given index, pulling all subsequent values
// back.
func (s *children) removeAt(index int) *node {
n := (*s)[index]
copy((*s)[index:], (*s)[index+1:])
(*s)[len(*s)-1] = nil
*s = (*s)[:len(*s)-1]
return n
}
// pop removes and returns the last element in the list.
func (s *children) pop() (out *node) {
index := len(*s) - 1
out = (*s)[index]
(*s)[index] = nil
*s = (*s)[:index]
return
}
// truncate truncates this instance at index so that it contains only the
// first index children. index must be less than or equal to length.
func (s *children) truncate(index int) {
var toClear children
*s, toClear = (*s)[:index], (*s)[index:]
for len(toClear) > 0 {
toClear = toClear[copy(toClear, nilChildren):]
}
}
// node is an internal node in a tree.
//
// It must at all times maintain the invariant that either
// * len(children) == 0, len(items) unconstrained
// * len(children) == len(items) + 1
type node struct {
items items
children children
cow *copyOnWriteContext
}
func (n *node) mutableFor(cow *copyOnWriteContext) *node {
if n.cow == cow {
return n
}
out := cow.newNode()
if cap(out.items) >= len(n.items) {
out.items = out.items[:len(n.items)]
} else {
out.items = make(items, len(n.items), cap(n.items))
}
copy(out.items, n.items.copy())
// Copy children
if cap(out.children) >= len(n.children) {
out.children = out.children[:len(n.children)]
} else {
out.children = make(children, len(n.children), cap(n.children))
}
copy(out.children, n.children)
return out
}
func (n *node) mutableChild(i int) *node {
c := n.children[i].mutableFor(n.cow)
n.children[i] = c
return c
}
// split splits the given node at the given index. The current node shrinks,
// and this function returns the item that existed at that index and a new node
// containing all items/children after it.
func (n *node) split(i int) (Item, *node) {
item := n.items[i]
next := n.cow.newNode()
next.items = append(next.items, n.items[i+1:]...)
n.items.truncate(i)
if len(n.children) > 0 {
next.children = append(next.children, n.children[i+1:]...)
n.children.truncate(i + 1)
}
return item, next
}
// maybeSplitChild checks if a child should be split, and if so splits it.
// Returns whether or not a split occurred.
func (n *node) maybeSplitChild(i, maxItems int) bool {
if len(n.children[i].items) < maxItems {
return false
}
first := n.mutableChild(i)
item, second := first.split(maxItems / 2)
n.items.insertAt(i, item)
n.children.insertAt(i+1, second)
return true
}
// insert inserts an item into the subtree rooted at this node, making sure
// no nodes in the subtree exceed maxItems items. Should an equivalent item
// be found/replaced by insert, it will be returned.
func (n *node) insert(item Item, maxItems int) Item {
i, found := n.items.find(item)
if found {
out := n.items[i]
n.items[i] = item
return out
}
if len(n.children) == 0 {
n.items.insertAt(i, item)
return nil
}
if n.maybeSplitChild(i, maxItems) {
inTree := n.items[i]
switch {
case item.Less(inTree):
// no change, we want first split node
case inTree.Less(item):
i++ // we want second split node
default:
out := n.items[i]
n.items[i] = item
return out
}
}
return n.mutableChild(i).insert(item, maxItems)
}
// get finds the given key in the subtree and returns it.
func (n *node) get(key Item) Item {
i, found := n.items.find(key)
if found {
return n.items[i]
} else if len(n.children) > 0 {
return n.children[i].get(key)
}
return nil
}
func (n *node) copyGet(key Item, cow *copyOnWriteContext) Item {
i, found := n.items.find(key)
if found {
return n.items[i]
} else if len(n.children) > 0 {
child := n.mutableChild(i)
return child.copyGet(key, cow)
}
return nil
}
// min returns the first item in the subtree.
func min(n *node) Item {
for len(n.children) > 0 {
n = n.children[0]
}
if len(n.items) == 0 {
return nil
}
return n.items[0]
}
// max returns the last item in the subtree.
func max(n *node) Item {
for len(n.children) > 0 {
n = n.children[len(n.children)-1]
}
if len(n.items) == 0 {
return nil
}
return n.items[len(n.items)-1]
}
// toRemove details what item to remove in a node.remove call.
type toRemove int
const (
removeItem toRemove = iota // removes the given item
removeMin // removes smallest item in the subtree
removeMax // removes largest item in the subtree
)
// remove removes an item from the subtree rooted at this node.
func (n *node) remove(item Item, minItems int, typ toRemove) Item {
var i int
var found bool
switch typ {
case removeMax:
if len(n.children) == 0 {
return n.items.pop()
}
i = len(n.items)
case removeMin:
if len(n.children) == 0 {
return n.items.removeAt(0)
}
i = 0
case removeItem:
i, found = n.items.find(item)
if len(n.children) == 0 {
if found {
return n.items.removeAt(i)
}
return nil
}
default:
panic("invalid type")
}
// If we get to here, we have children.
if len(n.children[i].items) <= minItems {
return n.growChildAndRemove(i, item, minItems, typ)
}
child := n.mutableChild(i)
// Either we had enough items to begin with, or we've done some
// merging/stealing, because we've got enough now and we're ready to return
// stuff.
if found {
// The item exists at index 'i', and the child we've selected can give us a
// predecessor, since if we've gotten here it's got > minItems items in it.
out := n.items[i]
// We use our special-case 'remove' call with typ=removeMax to pull the
// predecessor of item i (the rightmost leaf of our immediate left child)
// and set it into where we pulled the item from.
n.items[i] = child.remove(nil, minItems, removeMax)
return out
}
// Final recursive call. Once we're here, we know that the item isn't in this
// node and that the child is big enough to remove from.
return child.remove(item, minItems, typ)
}
// growChildAndRemove grows child 'i' to make sure it's possible to remove an
// item from it while keeping it at minItems, then calls remove to actually
// remove it.
//
// Most documentation says we have to do two sets of special casing:
// 1) item is in this node
// 2) item is in child
// In both cases, we need to handle the two subcases:
// A) node has enough values that it can spare one
// B) node doesn't have enough values
// For the latter, we have to check:
// a) left sibling has node to spare
// b) right sibling has node to spare
// c) we must merge
// To simplify our code here, we handle cases #1 and #2 the same:
// If a node doesn't have enough items, we make sure it does (using a,b,c).
// We then simply redo our remove call, and the second time (regardless of
// whether we're in case 1 or 2), we'll have enough items and can guarantee
// that we hit case A.
func (n *node) growChildAndRemove(i int, item Item, minItems int, typ toRemove) Item {
if i > 0 && len(n.children[i-1].items) > minItems {
// Steal from left child
child := n.mutableChild(i)
stealFrom := n.mutableChild(i - 1)
stolenItem := stealFrom.items.pop()
child.items.insertAt(0, n.items[i-1])
n.items[i-1] = stolenItem
if len(stealFrom.children) > 0 {
child.children.insertAt(0, stealFrom.children.pop())
}
} else if i < len(n.items) && len(n.children[i+1].items) > minItems {
// steal from right child
child := n.mutableChild(i)
stealFrom := n.mutableChild(i + 1)
stolenItem := stealFrom.items.removeAt(0)
child.items = append(child.items, n.items[i])
n.items[i] = stolenItem
if len(stealFrom.children) > 0 {
child.children = append(child.children, stealFrom.children.removeAt(0))
}
} else {
if i >= len(n.items) {
i--
}
child := n.mutableChild(i)
// merge with right child
mergeItem := n.items.removeAt(i)
mergeChild := n.children.removeAt(i + 1)
child.items = append(child.items, mergeItem)
child.items = append(child.items, mergeChild.items...)
child.children = append(child.children, mergeChild.children...)
n.cow.freeNode(mergeChild)
}
return n.remove(item, minItems, typ)
}
type direction int
const (
descend = direction(-1)
ascend = direction(+1)
)
// iterate provides a simple method for iterating over elements in the tree.
//
// When ascending, the 'start' should be less than 'stop' and when descending,
// the 'start' should be greater than 'stop'. Setting 'includeStart' to true
// will force the iterator to include the first item when it equals 'start',
// thus creating a "greaterOrEqual" or "lessThanEqual" query rather than just
// a "greaterThan" or "lessThan" query.
func (n *node) iterate(dir direction, start, stop Item, includeStart bool, hit bool, iter ItemIterator) (bool, bool) {
var ok, found bool
var index int
switch dir {
case ascend:
if start != nil {
index, _ = n.items.find(start)
}
for i := index; i < len(n.items); i++ {
if len(n.children) > 0 {
if hit, ok = n.children[i].iterate(dir, start, stop, includeStart, hit, iter); !ok {
return hit, false
}
}
if !includeStart && !hit && start != nil && !start.Less(n.items[i]) {
hit = true
continue
}
hit = true
if stop != nil && !n.items[i].Less(stop) {
return hit, false
}
if !iter(n.items[i]) {
return hit, false
}
}
if len(n.children) > 0 {
if hit, ok = n.children[len(n.children)-1].iterate(dir, start, stop, includeStart, hit, iter); !ok {
return hit, false
}
}
case descend:
if start != nil {
index, found = n.items.find(start)
if !found {
index = index - 1
}
} else {
index = len(n.items) - 1
}
for i := index; i >= 0; i-- {
if start != nil && !n.items[i].Less(start) {
if !includeStart || hit || start.Less(n.items[i]) {
continue
}
}
if len(n.children) > 0 {
if hit, ok = n.children[i+1].iterate(dir, start, stop, includeStart, hit, iter); !ok {
return hit, false
}
}
if stop != nil && !stop.Less(n.items[i]) {
return hit, false // continue
}
hit = true
if !iter(n.items[i]) {
return hit, false
}
}
if len(n.children) > 0 {
if hit, ok = n.children[0].iterate(dir, start, stop, includeStart, hit, iter); !ok {
return hit, false
}
}
default:
// do nothing
}
return hit, true
}
// print is used for testing/debugging purposes.
func (n *node) print(w io.Writer, level int) {
fmt.Fprintf(w, "%sNODE:%v\n", strings.Repeat(" ", level), n.items)
for _, c := range n.children {
c.print(w, level+1)
}
}
// BTree is an implementation of a B-Tree.
//
// BTree stores Item instances in an ordered structure, allowing easy insertion,
// removal, and iteration.
//
// Write operations are not safe for concurrent mutation by multiple
// goroutines, but Read operations are.
type BTree struct {
degree int
length int
root *node
cow *copyOnWriteContext
}
// copyOnWriteContext pointers determine node ownership... a tree with a write
// context equivalent to a node's write context is allowed to modify that node.
// A tree whose write context does not match a node's is not allowed to modify
// it, and must create a new, writable copy (i.e., it's a Clone).
//
// When doing any write operation, we maintain the invariant that the current
// node's context is equal to the context of the tree that requested the write.
// We do this by, before we descend into any node, creating a copy with the
// correct context if the contexts don't match.
//
// Since the node we're currently visiting on any write has the requesting
// tree's context, that node is modifiable in place. Children of that node may
// not share context, but before we descend into them, we'll make a mutable
// copy.
type copyOnWriteContext struct {
freelist *FreeList
}
// Clone clones the btree, lazily. Clone should not be called concurrently,
// but the original tree (t) and the new tree (t2) can be used concurrently
// once the Clone call completes.
//
// The internal tree structure of t is marked read-only and shared between t and
// t2. Writes to both t and t2 use copy-on-write logic, creating new nodes
// whenever one of t's original nodes would have been modified. Read operations
// should have no performance degradation. Write operations for both t and t2
// will initially experience minor slow-downs caused by additional allocs and
// copies due to the aforementioned copy-on-write logic, but should converge to
// the original performance characteristics of the original tree.
func (t *BTree) Clone() (t2 *BTree) {
// Create two entirely new copy-on-write contexts.
// This operation effectively creates three trees:
// the original, shared nodes (old t.cow)
// the new t.cow nodes
// the new out.cow nodes
cow1, cow2 := *t.cow, *t.cow
out := *t
t.cow = &cow1
out.cow = &cow2
return &out
}
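// Illustrative usage sketch (added for clarity; not part of the original
// source). It assumes the package's New(degree) constructor, as in the
// upstream google/btree API this implementation derives from:
//
//	t := New(2)
//	t.ReplaceOrInsert(Int(1))
//	t2 := t.Clone()
//	t2.ReplaceOrInsert(Int(2))
//	// t.Len() == 1 and t2.Len() == 2; the node holding Int(1) stays shared
//	// until one of the trees writes to it, at which point it is copied.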
// maxItems returns the max number of items to allow per node.
func (t *BTree) maxItems() int {
return t.degree*2 - 1
}
// minItems returns the min number of items to allow per node (ignored for the
// root node).
func (t *BTree) minItems() int {
return t.degree - 1
}
func (c *copyOnWriteContext) newNode() (n *node) {
n = c.freelist.newNode()
n.cow = c
return
}
type freeType int
const (
ftFreelistFull freeType = iota // node was freed (available for GC, not stored in freelist)
ftStored // node was stored in the freelist for later use
ftNotOwned // node was ignored by COW, since it's owned by another context
)
// freeNode frees a node within a given COW context, if it's owned by that
// context. It returns what happened to the node (see freeType const
// documentation).
func (c *copyOnWriteContext) freeNode(n *node) freeType {
if n.cow == c {
// clear to allow GC
n.items.truncate(0)
n.children.truncate(0)
n.cow = nil
if c.freelist.freeNode(n) {
return ftStored
} else {
return ftFreelistFull
}
} else {
return ftNotOwned
}
}
// ReplaceOrInsert adds the given item to the tree. If an item in the tree
// already equals the given one, it is removed from the tree and returned.
// Otherwise, nil is returned.
//
// nil cannot be added to the tree (will panic).
func (t *BTree) ReplaceOrInsert(item Item) Item {
if item == nil {
panic("nil item being added to BTree")
}
if t.root == nil {
t.root = t.cow.newNode()
t.root.items = append(t.root.items, item)
t.length++
return nil
} else {
t.root = t.root.mutableFor(t.cow)
if len(t.root.items) >= t.maxItems() {
item2, second := t.root.split(t.maxItems() / 2)
oldroot := t.root
t.root = t.cow.newNode()
t.root.items = append(t.root.items, item2)
t.root.children = append(t.root.children, oldroot, second)
}
}
out := t.root.insert(item, t.maxItems())
if out == nil {
t.length++
}
return out
}
// Delete removes an item equal to the passed in item from the tree, returning
// it. If no such item exists, returns nil.
func (t *BTree) Delete(item Item) Item {
return t.deleteItem(item, removeItem)
}
// DeleteMin removes the smallest item in the tree and returns it.
// If no such item exists, returns nil.
func (t *BTree) DeleteMin() Item {
return t.deleteItem(nil, removeMin)
}
// DeleteMax removes the largest item in the tree and returns it.
// If no such item exists, returns nil.
func (t *BTree) DeleteMax() Item {
return t.deleteItem(nil, removeMax)
}
func (t *BTree) deleteItem(item Item, typ toRemove) Item {
if t.root == nil || len(t.root.items) == 0 {
return nil
}
t.root = t.root.mutableFor(t.cow)
out := t.root.remove(item, t.minItems(), typ)
if len(t.root.items) == 0 && len(t.root.children) > 0 {
oldroot := t.root
t.root = t.root.children[0]
t.cow.freeNode(oldroot)
}
if out != nil {
t.length--
}
return out
}
// AscendRange calls the iterator for every value in the tree within the range
// [greaterOrEqual, lessThan), until iterator returns false.
func (t *BTree) AscendRange(greaterOrEqual, lessThan Item, iterator ItemIterator) {
if t.root == nil {
return
}
t.root.iterate(ascend, greaterOrEqual, lessThan, true, false, iterator)
}
// AscendLessThan calls the iterator for every value in the tree within the range
// [first, pivot), until iterator returns false.
func (t *BTree) AscendLessThan(pivot Item, iterator ItemIterator) {
if t.root == nil {
return
}
t.root.iterate(ascend, nil, pivot, false, false, iterator)
}
// AscendGreaterOrEqual calls the iterator for every value in the tree within
// the range [pivot, last], until iterator returns false.
func (t *BTree) AscendGreaterOrEqual(pivot Item, iterator ItemIterator) {
if t.root == nil {
return
}
t.root.iterate(ascend, pivot, nil, true, false, iterator)
}
// AscendGreaterThan calls the iterator for every value in the tree within
// the range (pivot, last], until iterator returns false.
func (t *BTree) AscendGreaterThan(pivot Item, iterator ItemIterator) {
if t.root == nil {
return
}
t.root.iterate(ascend, pivot, nil, false, false, iterator)
}
// Ascend calls the iterator for every value in the tree within the range
// [first, last], until iterator returns false.
func (t *BTree) Ascend(iterator ItemIterator) {
if t.root == nil {
return
}
t.root.iterate(ascend, nil, nil, false, false, iterator)
}
// DescendRange calls the iterator for every value in the tree within the range
// [lessOrEqual, greaterThan), until iterator returns false.
func (t *BTree) DescendRange(lessOrEqual, greaterThan Item, iterator ItemIterator) {
if t.root == nil {
return
}
t.root.iterate(descend, lessOrEqual, greaterThan, true, false, iterator)
}
// DescendLessOrEqual calls the iterator for every value in the tree within the range
// [pivot, first], until iterator returns false.
func (t *BTree) DescendLessOrEqual(pivot Item, iterator ItemIterator) {
if t.root == nil {
return
}
t.root.iterate(descend, pivot, nil, true, false, iterator)
}
// DescendGreaterThan calls the iterator for every value in the tree within
// the range (pivot, last], until iterator returns false.
func (t *BTree) DescendGreaterThan(pivot Item, iterator ItemIterator) {
if t.root == nil {
return
}
t.root.iterate(descend, nil, pivot, false, false, iterator)
}
// Descend calls the iterator for every value in the tree within the range
// [last, first], until iterator returns false.
func (t *BTree) Descend(iterator ItemIterator) {
if t.root == nil {
return
}
t.root.iterate(descend, nil, nil, false, false, iterator)
}
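// Illustrative usage sketch (added for clarity; not part of the original
// source), using the Int item type defined below and assuming the iterator
// signature func(Item) bool from the upstream google/btree API. For a tree t
// holding Int(1)..Int(5):
//
//	t.AscendRange(Int(2), Int(4), func(i Item) bool {
//		fmt.Println(i) // prints 2, then 3; the range is half-open, so 4 is excluded
//		return true    // returning false stops the iteration early
//	})
//	t.Descend(func(i Item) bool {
//		fmt.Println(i) // prints 5, 4, 3, 2, 1
//		return true
//	})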
// Get looks for the key item in the tree, returning it. It returns nil if
// unable to find that item.
func (t *BTree) Get(key Item) Item {
if t.root == nil {
return nil
}
return t.root.get(key)
}
func (t *BTree) CopyGet(key Item) Item {
if t.root == nil {
return nil
}
t.root = t.root.mutableFor(t.cow)
item := t.root.copyGet(key, t.cow)
return item
}
// Min returns the smallest item in the tree, or nil if the tree is empty.
func (t *BTree) Min() Item {
if t.root == nil {
return nil
}
return min(t.root)
}
// Max returns the largest item in the tree, or nil if the tree is empty.
func (t *BTree) Max() Item {
if t.root == nil {
return nil
}
return max(t.root)
}
// Has returns true if the given key is in the tree.
func (t *BTree) Has(key Item) bool {
return t.Get(key) != nil
}
// Len returns the number of items currently in the tree.
func (t *BTree) Len() int {
return t.length
}
// Clear removes all items from the btree. If addNodesToFreelist is true,
// t's nodes are added to its freelist as part of this call, until the freelist
// is full. Otherwise, the root node is simply dereferenced and the subtree
// left to Go's normal GC processes.
//
// This can be much faster
// than calling Delete on all elements, because that requires finding/removing
// each element in the tree and updating the tree accordingly. It also is
// somewhat faster than creating a new tree to replace the old one, because
// nodes from the old tree are reclaimed into the freelist for use by the new
// one, instead of being lost to the garbage collector.
//
// This call takes:
// O(1): when addNodesToFreelist is false, this is a single operation.
// O(1): when the freelist is already full, it breaks out immediately
// O(freelist size): when the freelist is empty and the nodes are all owned
// by this tree, nodes are added to the freelist until full.
// O(tree size): when all nodes are owned by another tree, all nodes are
// iterated over looking for nodes to add to the freelist, and due to
// ownership, none are.
func (t *BTree) Clear(addNodesToFreelist bool) {
if t.root != nil && addNodesToFreelist {
t.root.reset(t.cow)
}
t.root, t.length = nil, 0
}
// reset returns a subtree to the freelist. It breaks out immediately if the
// freelist is full, since the only benefit of iterating is to fill that
// freelist up. Returns true if parent reset call should continue.
func (n *node) reset(c *copyOnWriteContext) bool {
for _, child := range n.children {
if !child.reset(c) {
return false
}
}
return c.freeNode(n) != ftFreelistFull
}
// Int implements the Item interface for integers.
type Int int
func (a Int) Copy() Item {
return a
}
// Less returns true if int(a) < int(b).
func (a Int) Less(b Item) bool {
return a < b.(Int)
}
package buf
import (
"context"
"sync"
"sync/atomic"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
"golang.org/x/time/rate"
)
var (
bcacheTotalLimit int64
bcacheRateLimit = rate.NewLimiter(rate.Limit(1), 16)
bcacheCount int64
BCachePool *FileBCachePool
)
func newBlockCachePool(blockSize int) *sync.Pool {
return &sync.Pool{
New: func() interface{} {
if atomic.LoadInt64(&bcacheCount) >= bcacheTotalLimit {
log.LogWarnf("FileBCachePool: bcacheCount=(%v),bcacheTotalLimit=(%v)", atomic.LoadInt64(&bcacheCount), bcacheTotalLimit)
ctx := context.Background()
bcacheRateLimit.Wait(ctx)
}
return make([]byte, blockSize)
},
}
}
type FileBCachePool struct {
pool *sync.Pool
}
func InitbCachePool(blockSize int) {
if blockSize == 0 {
return
}
BCachePool = &FileBCachePool{}
bcacheTotalLimit = int64((4 * util.GB) / blockSize)
BCachePool.pool = newBlockCachePool(blockSize)
}
func (fileCachePool *FileBCachePool) Get() []byte {
atomic.AddInt64(&bcacheCount, 1)
return fileCachePool.pool.Get().([]byte)
}
func (fileCachePool *FileBCachePool) Put(data []byte) {
atomic.AddInt64(&bcacheCount, -1)
fileCachePool.pool.Put(data)
}
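// Illustrative usage sketch (added for clarity; not part of the original
// source): the pool must be initialized once with the cache block size before
// Get/Put are used. The block size below is hypothetical.
//
//	InitbCachePool(128 * 1024)
//	block := BCachePool.Get()
//	// ... fill block with cached file data ...
//	BCachePool.Put(block)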
package buf
import (
"context"
"fmt"
"sync"
"sync/atomic"
"github.com/cubefs/cubefs/util"
"golang.org/x/time/rate"
)
const (
HeaderBufferPoolSize = 8192
InvalidLimit = 0
)
var ReadBufPool = sync.Pool{
New: func() interface{} {
b := make([]byte, 32*1024)
return b
},
}
const (
BufferTypeHeader = 0
BufferTypeNormal = 1
BufferTypeHeaderVer = 2
)
var (
tinyBuffersTotalLimit int64 = 4096
NormalBuffersTotalLimit int64
HeadBuffersTotalLimit int64
HeadVerBuffersTotalLimit int64
)
var (
tinyBuffersCount int64
normalBuffersCount int64
headBuffersCount int64
headVerBuffersCount int64
)
var (
normalBufAllocId uint64
headBufAllocId uint64
headBufVerAllocId uint64
)
var (
normalBufFreeId uint64
headBufFreeId uint64
headBufVerFreeId uint64
)
var (
buffersRateLimit = rate.NewLimiter(rate.Limit(16), 16)
normalBuffersRateLimit = rate.NewLimiter(rate.Limit(16), 16)
headBuffersRateLimit = rate.NewLimiter(rate.Limit(16), 16)
headVerBuffersRateLimit = rate.NewLimiter(rate.Limit(16), 16)
)
func NewTinyBufferPool() *sync.Pool {
return &sync.Pool{
New: func() interface{} {
if atomic.LoadInt64(&tinyBuffersCount) >= tinyBuffersTotalLimit {
ctx := context.Background()
buffersRateLimit.Wait(ctx)
}
return make([]byte, util.DefaultTinySizeLimit)
},
}
}
func NewHeadVerBufferPool() *sync.Pool {
return &sync.Pool{
New: func() interface{} {
if HeadVerBuffersTotalLimit != InvalidLimit && atomic.LoadInt64(&headVerBuffersCount) >= HeadVerBuffersTotalLimit {
ctx := context.Background()
headVerBuffersRateLimit.Wait(ctx)
}
return make([]byte, util.PacketHeaderVerSize)
},
}
}
func NewHeadBufferPool() *sync.Pool {
return &sync.Pool{
New: func() interface{} {
if HeadBuffersTotalLimit != InvalidLimit && atomic.LoadInt64(&headBuffersCount) >= HeadBuffersTotalLimit {
ctx := context.Background()
headBuffersRateLimit.Wait(ctx)
}
return make([]byte, util.PacketHeaderSize)
},
}
}
func NewNormalBufferPool() *sync.Pool {
return &sync.Pool{
New: func() interface{} {
if NormalBuffersTotalLimit != InvalidLimit && atomic.LoadInt64(&normalBuffersCount) >= NormalBuffersTotalLimit {
ctx := context.Background()
normalBuffersRateLimit.Wait(ctx)
}
return make([]byte, util.BlockSize)
},
}
}
// BufferPool defines the struct of a buffered pool with four kinds of buffers:
// tiny, header, header-with-version, and normal (block-sized).
type BufferPool struct {
headPools []chan []byte
headVerPools []chan []byte
normalPools []chan []byte
tinyPool *sync.Pool
headPool *sync.Pool
normalPool *sync.Pool
headVerPool *sync.Pool
}
var slotCnt = uint64(16)
// NewBufferPool returns a new buffered pool.
func NewBufferPool() (bufferP *BufferPool) {
bufferP = &BufferPool{}
bufferP.headPools = make([]chan []byte, slotCnt)
bufferP.normalPools = make([]chan []byte, slotCnt)
bufferP.headVerPools = make([]chan []byte, slotCnt)
for i := 0; i < int(slotCnt); i++ {
bufferP.headPools[i] = make(chan []byte, HeaderBufferPoolSize/slotCnt)
bufferP.headVerPools[i] = make(chan []byte, HeaderBufferPoolSize/slotCnt)
bufferP.normalPools[i] = make(chan []byte, HeaderBufferPoolSize/slotCnt)
}
bufferP.tinyPool = NewTinyBufferPool()
bufferP.headPool = NewHeadBufferPool()
bufferP.headVerPool = NewHeadVerBufferPool()
bufferP.normalPool = NewNormalBufferPool()
return bufferP
}
func (bufferP *BufferPool) getHead(id uint64) (data []byte) {
select {
case data = <-bufferP.headPools[id%slotCnt]:
return
default:
return bufferP.headPool.Get().([]byte)
}
}
func (bufferP *BufferPool) getHeadVer(id uint64) (data []byte) {
select {
case data = <-bufferP.headVerPools[id%slotCnt]:
return
default:
return bufferP.headVerPool.Get().([]byte)
}
}
func (bufferP *BufferPool) getNormal(id uint64) (data []byte) {
select {
case data = <-bufferP.normalPools[id%slotCnt]:
return
default:
return bufferP.normalPool.Get().([]byte)
}
}
// Get returns a buffer of the given size. Each supported size corresponds to a different object pool.
func (bufferP *BufferPool) Get(size int) (data []byte, err error) {
if size == util.PacketHeaderSize {
atomic.AddInt64(&headBuffersCount, 1)
id := atomic.AddUint64(&headBufAllocId, 1)
return bufferP.getHead(id), nil
} else if size == util.PacketHeaderVerSize {
atomic.AddInt64(&headVerBuffersCount, 1)
id := atomic.AddUint64(&headBufVerAllocId, 1)
return bufferP.getHeadVer(id), nil
} else if size == util.BlockSize {
atomic.AddInt64(&normalBuffersCount, 1)
id := atomic.AddUint64(&normalBufAllocId, 1)
return bufferP.getNormal(id), nil
} else if size == util.DefaultTinySizeLimit {
atomic.AddInt64(&tinyBuffersCount, 1)
return bufferP.tinyPool.Get().([]byte), nil
}
return nil, fmt.Errorf("can only support 45 or 65536 bytes")
}
func (bufferP *BufferPool) putHead(index int, data []byte) {
select {
case bufferP.headPools[index] <- data:
return
default:
bufferP.headPool.Put(data)
}
}
func (bufferP *BufferPool) putHeadVer(index int, data []byte) {
select {
case bufferP.headVerPools[index] <- data:
return
default:
bufferP.headVerPool.Put(data)
}
}
func (bufferP *BufferPool) putNormal(index int, data []byte) {
select {
case bufferP.normalPools[index] <- data:
return
default:
bufferP.normalPool.Put(data)
}
}
// Put puts the given data into the buffer pool.
func (bufferP *BufferPool) Put(data []byte) {
if data == nil {
return
}
size := len(data)
if size == util.PacketHeaderSize {
atomic.AddInt64(&headBuffersCount, -1)
id := atomic.AddUint64(&headBufFreeId, 1)
bufferP.putHead(int(id%slotCnt), data)
} else if size == util.PacketHeaderVerSize {
atomic.AddInt64(&headVerBuffersCount, -1)
id := atomic.AddUint64(&headBufVerFreeId, 1)
bufferP.putHeadVer(int(id%slotCnt), data)
} else if size == util.BlockSize {
atomic.AddInt64(&normalBuffersCount, -1)
id := atomic.AddUint64(&normalBufFreecId, 1)
bufferP.putNormal(int(id%slotCnt), data)
} else if size == util.DefaultTinySizeLimit {
bufferP.tinyPool.Put(data)
atomic.AddInt64(&tinyBuffersCount, -1)
}
}
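// Illustrative usage sketch (added for clarity; not part of the original
// source): buffers are requested by exact size and must be returned with Put
// so the per-size counters stay balanced.
//
//	pool := NewBufferPool()
//	data, err := pool.Get(util.BlockSize)
//	if err != nil {
//		// only the header, header-with-version, block and tiny sizes are supported
//	}
//	// ... use data as a packet payload buffer ...
//	pool.Put(data)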
package buf
import (
"context"
"sync"
"sync/atomic"
"github.com/cubefs/cubefs/util"
"github.com/cubefs/cubefs/util/log"
"golang.org/x/time/rate"
)
var (
cacheTotalLimit int64
cacheRateLimit = rate.NewLimiter(rate.Limit(16), 16)
cacheCount int64
CachePool *FileCachePool
)
func newWriterCachePool(blockSize int) *sync.Pool {
return &sync.Pool{
New: func() interface{} {
if atomic.LoadInt64(&cacheCount) >= cacheTotalLimit {
ctx := context.Background()
cacheRateLimit.Wait(ctx)
}
return make([]byte, blockSize)
},
}
}
type FileCachePool struct {
pool *sync.Pool
}
func InitCachePool(blockSize int) {
if blockSize == 0 {
return
}
CachePool = &FileCachePool{}
cacheTotalLimit = int64((4 * util.GB) / blockSize)
CachePool.pool = newWriterCachePool(blockSize)
}
func (fileCachePool *FileCachePool) Get() []byte {
atomic.AddInt64(&cacheCount, 1)
return fileCachePool.pool.Get().([]byte)
}
func (fileCachePool *FileCachePool) Put(data []byte) {
log.LogInfof("action[FileCachePool.put] %v", fileCachePool)
log.LogInfof("action[FileCachePool.put] pool %v", fileCachePool.pool)
atomic.AddInt64(&cacheCount, -1)
fileCachePool.pool.Put(data)
}
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package caps
import (
"encoding/json"
"fmt"
"regexp"
"strings"
)
// Caps defines the capability type
type Caps struct {
API []string
OwnerVOL []string
NoneOwnerVOL []string
}
// ContainCaps reports whether the category cat contains the capability cap.
func (c *Caps) ContainCaps(cat string, cap string) (r bool) {
if cat == "API" {
return traversalCaps(c.API, cap)
} else if cat == "OwnerVOL" {
return traversalCaps(c.OwnerVOL, cap)
} else if cat == "NoneOwnerVOL" {
return traversalCaps(c.NoneOwnerVOL, cap)
}
return false
}
func traversalCaps(caps []string, cap string) (r bool) {
r = false
for _, s := range caps {
a := strings.Split(s, ":")
b := strings.Split(cap, ":")
i := 0
for ; i < 3; i++ {
if a[i] != "*" && a[i] != b[i] {
break
}
}
if i == 3 {
r = true
break
}
}
return
}
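// Illustrative sketch (added for clarity; not part of the original source):
// each capability is three colon-separated segments and "*" matches any
// segment, so a single wildcard entry can cover many concrete capabilities.
//
//	c := &Caps{API: []string{"*:vol:*"}}
//	c.ContainCaps("API", "clusterX:vol:create")  // true: "*" matches both ends
//	c.ContainCaps("API", "clusterX:user:create") // false: "vol" != "user"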
// Init initializes a Caps instance from JSON-encoded bytes.
func (c *Caps) Init(b []byte) (err error) {
if err = json.Unmarshal(b, c); err != nil {
return
}
if err = c.check(); err != nil {
return
}
c.cleanDup()
return
}
// Dump returns a string representation of the Caps content.
func (c *Caps) Dump() (d string) {
for _, s := range c.API {
d += fmt.Sprintf("API:%s,", s)
}
// TODO c.vol (no usage?)
return
}
// Union merges the given caps into c, removing duplicates.
func (c *Caps) Union(caps *Caps) {
c.API = append(c.API, caps.API...)
c.OwnerVOL = append(c.OwnerVOL, caps.OwnerVOL...)
c.NoneOwnerVOL = append(c.NoneOwnerVOL, caps.NoneOwnerVOL...)
c.cleanDup()
}
func (c *Caps) check() (err error) {
apiRe := regexp.MustCompile("^[A-Za-z0-9*]{1,20}:[A-Za-z0-9*]{1,20}:[A-Za-z0-9*]{1,20}$")
volRe := regexp.MustCompile("^[A-Za-z0-9*]{1,20}:[a-zA-Z0-9_-]{3,256}:[A-Za-z0-9*]{1,20}$")
if err = checkRegexp(apiRe, c.API); err != nil {
return
}
if err = checkRegexp(volRe, c.OwnerVOL); err != nil {
return
}
if err = checkRegexp(volRe, c.NoneOwnerVOL); err != nil {
return
}
return
}
func checkRegexp(re *regexp.Regexp, caps []string) (err error) {
for _, cap := range caps {
if !re.MatchString(cap) {
err = fmt.Errorf("invalid cap [%s]", cap)
return
}
}
return
}
// Delete removes the given caps from c.
func (c *Caps) Delete(caps *Caps) {
c.API = deleteCaps(c.API, caps.API)
c.OwnerVOL = deleteCaps(c.OwnerVOL, caps.OwnerVOL)
c.NoneOwnerVOL = deleteCaps(c.NoneOwnerVOL, caps.NoneOwnerVOL)
}
func deleteCaps(caps []string, deleteCaps []string) []string {
m := make(map[string]bool)
for _, item := range caps {
m[item] = true
}
caps = []string{}
for _, item := range deleteCaps {
delete(m, item)
}
for k := range m {
caps = append(caps, k)
}
return caps
}
func (c *Caps) cleanDup() {
c.API = cleanCaps(c.API)
c.OwnerVOL = cleanCaps(c.OwnerVOL)
c.NoneOwnerVOL = cleanCaps(c.NoneOwnerVOL)
}
func cleanCaps(caps []string) []string {
newCaps := make([]string, 0)
m := make(map[string]map[string]bool)
for _, cap := range caps {
a := strings.Split(cap, ":")
key1 := a[0]
key2 := a[1] + ":" + a[2]
if _, ok := m[key1]; !ok {
m[key1] = make(map[string]bool)
}
if _, ok := m[key1][key2]; !ok {
newCaps = append(newCaps, cap)
m[key1][key2] = true
}
}
return newCaps
}
package util
/*
* Copyright 2016, 2017 SUSE LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import (
"fmt"
"os"
"golang.org/x/sys/unix"
)
// MaxNameLen is the maximum length of the name of a file descriptor being
// sent using SendFd. The name of the file handle returned by RecvFd will never
// be larger than this value.
const MaxNameLen = 4096
// oobSpace is the size of the oob slice required to store a single FD. Note
// that unix.UnixRights appears to make the assumption that fd is always int32,
// so sizeof(fd) = 4.
var oobSpace = unix.CmsgSpace(4)
// RecvFd waits for a file descriptor to be sent over the given AF_UNIX
// socket. The file name of the remote file descriptor will be recreated
// locally (it is sent as non-auxiliary data in the same payload).
func RecvFd(socket *os.File) (*os.File, error) {
// For some reason, unix.Recvmsg uses the length rather than the capacity
// when passing the msg_controllen and other attributes to recvmsg. So we
// have to actually set the length.
name := make([]byte, MaxNameLen)
oob := make([]byte, oobSpace)
sockfd := socket.Fd()
n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0)
if err != nil {
return nil, err
}
if n >= MaxNameLen || oobn != oobSpace {
return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
}
// Truncate.
name = name[:n]
oob = oob[:oobn]
scms, err := unix.ParseSocketControlMessage(oob)
if err != nil {
return nil, err
}
if len(scms) != 1 {
return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
}
scm := scms[0]
fds, err := unix.ParseUnixRights(&scm)
if err != nil {
return nil, err
}
if len(fds) != 1 {
return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds))
}
fd := uintptr(fds[0])
return os.NewFile(fd, string(name)), nil
}
// SendFd sends a file descriptor over the given AF_UNIX socket. In
// addition, the file.Name() of the given file will also be sent as
// non-auxiliary data in the same payload (allowing to send contextual
// information for a file descriptor).
func SendFd(socket *os.File, name string, fd uintptr) error {
if len(name) >= MaxNameLen {
return fmt.Errorf("sendfd: filename too long: %s", name)
}
return SendFds(socket, []byte(name), int(fd))
}
// SendFds sends a list of files descriptor and msg over the given AF_UNIX socket.
func SendFds(socket *os.File, msg []byte, fds ...int) error {
oob := unix.UnixRights(fds...)
return unix.Sendmsg(int(socket.Fd()), msg, oob, nil, 0)
}
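// Illustrative usage sketch (added for clarity; not part of the original
// source): passing an open file descriptor between the two ends of an AF_UNIX
// socket pair. The file path is hypothetical.
//
//	pair, _ := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM, 0)
//	sender := os.NewFile(uintptr(pair[0]), "sender")
//	receiver := os.NewFile(uintptr(pair[1]), "receiver")
//	f, _ := os.Open("/tmp/data")
//	_ = SendFd(sender, f.Name(), f.Fd())
//	dup, _ := RecvFd(receiver) // dup refers to the same open file description as f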
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package compressor
const EncodingGzip = "gzip"
// Compressor compresses and decompresses byte slices.
// TODO: add stream Compressor.
type Compressor interface {
Compress([]byte) ([]byte, error)
Decompress([]byte) ([]byte, error)
}
type none struct{}
func (none) Compress(pb []byte) ([]byte, error) { return pb, nil }
func (none) Decompress(cb []byte) ([]byte, error) { return cb, nil }
var compressors = make(map[string]func() Compressor)
func init() {
compressors[""] = func() Compressor { return none{} }
compressors[EncodingGzip] = func() Compressor { return gzipCompressor{} }
}
func New(encoding string) Compressor {
if newCompressor, ok := compressors[encoding]; ok {
return newCompressor()
}
return compressors[""]()
}
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package compressor
import (
"bytes"
"compress/gzip"
"io"
)
// TODO: reuse bytes.Buffer
type gzipCompressor struct{}
func (gzipCompressor) Compress(pb []byte) ([]byte, error) {
buffer := new(bytes.Buffer)
gw := gzip.NewWriter(buffer)
if _, err := gw.Write(pb); err != nil {
return nil, err
}
if err := gw.Close(); err != nil {
return nil, err
}
return buffer.Bytes(), nil
}
func (gzipCompressor) Decompress(cb []byte) ([]byte, error) {
gr, err := gzip.NewReader(bytes.NewBuffer(cb))
if err != nil {
return nil, err
}
buffer := new(bytes.Buffer)
if _, err := io.Copy(buffer, gr); err != nil {
return nil, err
}
if err := gr.Close(); err != nil {
return nil, err
}
return buffer.Bytes(), nil
}
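// Illustrative usage sketch (added for clarity; not part of the original
// source): the encoding string selects the implementation and the empty
// string is a pass-through.
//
//	c := New(EncodingGzip)
//	compressed, err := c.Compress([]byte("hello"))
//	if err == nil {
//		original, _ := c.Decompress(compressed)
//		// bytes.Equal(original, []byte("hello")) == true
//	}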
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package config
import (
"encoding/json"
"fmt"
"log"
"os"
"path"
"strconv"
"strings"
)
const (
DefaultConstConfigFile = "constcfg"
ClusterVersionFile = "CLUSTER-VERSION"
ClusterUUID = "ClusterUUID"
)
// Config defines the struct of a configuration in general.
type Config struct {
data map[string]interface{}
Raw []byte
}
func newConfig() *Config {
result := new(Config)
result.data = make(map[string]interface{})
return result
}
// LoadConfigFile loads config information from a JSON file.
func LoadConfigFile(filename string) (*Config, error) {
result := newConfig()
err := result.parse(filename)
if err != nil {
log.Printf("error loading config file %s: %s", filename, err)
}
return result, err
}
// LoadConfigString loads config information from a JSON string.
func LoadConfigString(s string) *Config {
result := newConfig()
decoder := json.NewDecoder(strings.NewReader(s))
decoder.UseNumber()
err := decoder.Decode(&result.data)
if err != nil {
log.Fatalf("error parsing config string %s: %s", s, err)
}
return result
}
func (c *Config) parse(fileName string) error {
jsonFileBytes, err := os.ReadFile(fileName)
c.Raw = jsonFileBytes
if err == nil {
decoder := json.NewDecoder(strings.NewReader(string(jsonFileBytes)))
decoder.UseNumber()
err = decoder.Decode(&c.data)
}
return err
}
// GetValue returns the raw data for the config key.
func (c *Config) GetValue(key string) interface{} {
return c.data[key]
}
// GetString returns a string for the config key.
func (c *Config) GetString(key string) string {
x, present := c.data[key]
if !present {
return ""
}
if result, isString := x.(string); isString {
return result
}
return ""
}
// SetString sets a string value for the config key.
func (c *Config) SetString(key, val string) {
c.data[key] = val
}
// GetFloat returns a float value for the config key.
func (c *Config) GetFloat(key string) float64 {
x, present := c.data[key]
if !present {
return -1
}
if result, isNumber := x.(json.Number); isNumber {
number, err := result.Float64()
if err != nil {
return 0
}
return number
}
return 0
}
// GetBoolWithDefault returns a bool value for the config key, or defval when the key is not present.
func (c *Config) GetBoolWithDefault(key string, defval bool) bool {
_, present := c.data[key]
if !present {
return defval
}
return c.GetBool(key)
}
// GetBool returns a bool value for the config key.
func (c *Config) GetBool(key string) bool {
x, present := c.data[key]
if !present {
return false
}
if result, isBool := x.(bool); isBool {
return result
}
if result, isString := x.(string); isString {
if result == "true" {
return true
}
}
return false
}
// GetInt returns an int value for the config key.
func (c *Config) GetInt(key string) int {
return int(c.GetInt64(key))
}
// GetInt64 returns an int64 value for the config key.
func (c *Config) GetInt64(key string) int64 {
x, present := c.data[key]
if !present {
return 0
}
if result, isNumber := x.(json.Number); isNumber {
number, err := result.Int64()
if err != nil {
return 0
}
return number
}
// TODO: change all int64 setting with string configurations to int64
// try parse int64 from string
if numStr, isString := x.(string); isString {
number, err := strconv.ParseInt(numStr, 10, 64)
if err == nil {
return number
}
}
return 0
}
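// Illustrative sketch (added for clarity; not part of the original source):
// GetInt64 accepts both JSON numbers and numeric strings, so the two
// hypothetical keys below read back the same kind of value.
//
//	cfg := LoadConfigString(`{"listen": "17010", "retainLogs": 2000}`)
//	cfg.GetInt64("listen")     // 17010, parsed from the string form
//	cfg.GetInt64("retainLogs") // 2000, decoded as a json.Number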
func (c *Config) HasKey(key string) bool {
_, present := c.data[key]
return present
}
// GetInt64WithDefault returns an int64 value for the config key, or defaultVal when the stored value is absent or zero.
func (c *Config) GetInt64WithDefault(key string, defaultVal int64) int64 {
if val := c.GetInt64(key); val == 0 {
return defaultVal
} else {
return val
}
}
// GetIntWithDefault returns an int value for the config key, or defaultVal when the stored value is absent or zero.
func (c *Config) GetIntWithDefault(key string, defaultVal int) int {
val := int(c.GetInt64(key))
if val == 0 {
return defaultVal
}
return val
}
// GetSlice returns an array for the config key.
func (c *Config) GetSlice(key string) []interface{} {
result, present := c.data[key]
if !present {
return []interface{}(nil)
}
return result.([]interface{})
}
func (c *Config) GetStringSlice(key string) []string {
s := c.GetSlice(key)
result := make([]string, 0, len(s))
for _, item := range s {
result = append(result, item.(string))
}
return result
}
// CheckAndGetString returns the string value for the config key and whether it is present as a string.
func (c *Config) CheckAndGetString(key string) (string, bool) {
x, present := c.data[key]
if !present {
return "", false
}
if result, isString := x.(string); isString {
return result, true
}
return "", false
}
// CheckAndGetBool returns the bool value for the config key and whether a valid bool was found.
func (c *Config) CheckAndGetBool(key string) (bool, bool) {
x, present := c.data[key]
if !present {
return false, false
}
if result, isBool := x.(bool); isBool {
return result, true
}
// Take string value "true" and "false" as well.
if result, isString := x.(string); isString {
if result == "true" {
return true, true
}
if result == "false" {
return false, true
}
}
return false, false
}
func NewIllegalConfigError(configKey string) error {
return fmt.Errorf("illegal config %s", configKey)
}
type ConstConfig struct {
Listen string `json:"listen"`
RaftReplicaPort string `json:"raftReplicaPort"`
RaftHeartbetPort string `json:"raftHeartbetPort"`
}
func (ccfg *ConstConfig) Equals(cfg *ConstConfig) bool {
return (ccfg.Listen == cfg.Listen &&
ccfg.RaftHeartbetPort == cfg.RaftHeartbetPort &&
ccfg.RaftReplicaPort == cfg.RaftReplicaPort)
}
// check listen port, raft replica port and raft heartbeat port
func CheckOrStoreConstCfg(fileDir, fileName string, cfg *ConstConfig) (ok bool, err error) {
filePath := path.Join(fileDir, fileName)
var buf []byte
buf, err = os.ReadFile(filePath)
if err != nil && !os.IsNotExist(err) {
return false, fmt.Errorf("read config file %v failed: %v", filePath, err)
}
if os.IsNotExist(err) || len(buf) == 0 {
// Persist configuration to disk
if buf, err = json.Marshal(cfg); err != nil {
return false, fmt.Errorf("marshal const config failed: %v", err)
}
if err = os.MkdirAll(fileDir, 0o755); err != nil {
return false, fmt.Errorf("make directory %v filed: %v", fileDir, err)
}
var file *os.File
if file, err = os.OpenFile(filePath, os.O_CREATE|os.O_RDWR, 0o755); err != nil {
return false, fmt.Errorf("create config file %v failed: %v", filePath, err)
}
defer func() {
_ = file.Close()
if err != nil {
_ = os.Remove(filePath)
}
}()
if _, err = file.Write(buf); err != nil {
return false, fmt.Errorf("write config file %v failed: %v", filePath, err)
}
if err = file.Sync(); err != nil {
return false, fmt.Errorf("sync config file %v failed: %v", filePath, err)
}
return true, nil
}
// Load and check stored const configuration
storedConstCfg := new(ConstConfig)
if err = json.Unmarshal(buf, storedConstCfg); err != nil {
return false, fmt.Errorf("unmarshal const config %v failed: %v", filePath, err)
}
if ok := storedConstCfg.Equals(cfg); !ok {
return false, fmt.Errorf("compare const config %v and %v failed: %v", storedConstCfg, cfg, err)
}
return true, nil
}
func CheckOrStoreClusterUuid(dirPath, id string, force bool) (err error) {
dir, err := os.ReadDir(dirPath)
if err != nil {
return fmt.Errorf("read dir %v failed: %v", dirPath, err.Error())
}
versionFile := path.Join(dirPath, ClusterVersionFile)
if len(dir) == 0 || force {
// store clusterUUID
ClusterMap := map[string]interface{}{"ClusterUUID": id}
data, err := json.Marshal(ClusterMap)
if err != nil {
return fmt.Errorf("json marshal failed: %v", err.Error())
}
if err = os.WriteFile(versionFile, data, 0o755); err != nil {
return fmt.Errorf("write file %v failed: %v", versionFile, err.Error())
}
} else {
// check clusterUUID
cfg, err := LoadConfigFile(versionFile)
if err != nil {
return fmt.Errorf("read file %v failed: %v\n", versionFile, err.Error())
}
clusterUuId := cfg.GetString(ClusterUUID)
if clusterUuId != id {
return fmt.Errorf("file %v ClusterUUID %v not equal to %v\n",
versionFile, clusterUuId, id)
}
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package util
import (
"net"
"sync"
"time"
)
type Object struct {
conn *net.TCPConn
idle int64
}
const (
ConnectIdleTime = 30
defaultConnectTimeout = 1
)
type ConnectPool struct {
sync.RWMutex
pools map[string]*Pool
mincap int
maxcap int
timeout int64
connectTimeout int64
closeCh chan struct{}
closeOnce sync.Once
}
func NewConnectPool() (cp *ConnectPool) {
cp = &ConnectPool{
pools: make(map[string]*Pool),
mincap: 5,
maxcap: 500,
timeout: int64(time.Second * ConnectIdleTime),
connectTimeout: defaultConnectTimeout,
closeCh: make(chan struct{}),
}
go cp.autoRelease()
return cp
}
func NewConnectPoolWithTimeout(idleConnTimeout time.Duration, connectTimeout int64) (cp *ConnectPool) {
cp = &ConnectPool{
pools: make(map[string]*Pool),
mincap: 5,
maxcap: 80,
timeout: int64(idleConnTimeout * time.Second),
connectTimeout: connectTimeout,
closeCh: make(chan struct{}),
}
go cp.autoRelease()
return cp
}
func DailTimeOut(target string, timeout time.Duration) (c *net.TCPConn, err error) {
var connect net.Conn
connect, err = net.DialTimeout("tcp", target, timeout)
if err == nil {
conn := connect.(*net.TCPConn)
conn.SetKeepAlive(true)
conn.SetNoDelay(true)
c = conn
}
return
}
func (cp *ConnectPool) GetConnect(targetAddr string) (c *net.TCPConn, err error) {
cp.RLock()
pool, ok := cp.pools[targetAddr]
cp.RUnlock()
if !ok {
newPool := NewPool(cp.mincap, cp.maxcap, cp.timeout, cp.connectTimeout, targetAddr)
cp.Lock()
pool, ok = cp.pools[targetAddr]
if !ok {
// pool = NewPool(cp.mincap, cp.maxcap, cp.timeout, cp.connectTimeout, targetAddr)
pool = newPool
cp.pools[targetAddr] = pool
}
cp.Unlock()
}
return pool.GetConnectFromPool()
}
func (cp *ConnectPool) PutConnect(c *net.TCPConn, forceClose bool) {
if c == nil {
return
}
if forceClose {
_ = c.Close()
return
}
select {
case <-cp.closeCh:
_ = c.Close()
return
default:
}
addr := c.RemoteAddr().String()
cp.RLock()
pool, ok := cp.pools[addr]
cp.RUnlock()
if !ok {
c.Close()
return
}
object := &Object{conn: c, idle: time.Now().UnixNano()}
pool.PutConnectObjectToPool(object)
}
func (cp *ConnectPool) autoRelease() {
timer := time.NewTimer(time.Second)
for {
select {
case <-cp.closeCh:
timer.Stop()
return
case <-timer.C:
}
pools := make([]*Pool, 0)
cp.RLock()
for _, pool := range cp.pools {
pools = append(pools, pool)
}
cp.RUnlock()
for _, pool := range pools {
pool.autoRelease()
}
timer.Reset(time.Second)
}
}
func (cp *ConnectPool) releaseAll() {
pools := make([]*Pool, 0)
cp.RLock()
for _, pool := range cp.pools {
pools = append(pools, pool)
}
cp.RUnlock()
for _, pool := range pools {
pool.ReleaseAll()
}
}
func (cp *ConnectPool) Close() {
cp.closeOnce.Do(func() {
close(cp.closeCh)
cp.releaseAll()
})
}
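// Illustrative usage sketch (added for clarity; not part of the original
// source): connections are checked out per target address and handed back
// with PutConnect; forceClose=true drops a connection that hit an error.
// The address and payload are hypothetical.
//
//	cp := NewConnectPool()
//	conn, err := cp.GetConnect("192.168.0.11:17030")
//	if err == nil {
//		_, writeErr := conn.Write(payload)
//		cp.PutConnect(conn, writeErr != nil)
//	}
//	cp.Close()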
type Pool struct {
objects chan *Object
mincap int
maxcap int
target string
timeout int64
connectTimeout int64
}
func NewPool(min, max int, timeout, connectTimeout int64, target string) (p *Pool) {
p = new(Pool)
p.mincap = min
p.maxcap = max
p.target = target
p.objects = make(chan *Object, max)
p.timeout = timeout
p.connectTimeout = connectTimeout
p.initAllConnect()
return p
}
func (p *Pool) initAllConnect() {
for i := 0; i < p.mincap; i++ {
c, err := net.Dial("tcp", p.target)
if err == nil {
conn := c.(*net.TCPConn)
conn.SetKeepAlive(true)
conn.SetNoDelay(true)
o := &Object{conn: conn, idle: time.Now().UnixNano()}
p.PutConnectObjectToPool(o)
}
}
}
func (p *Pool) PutConnectObjectToPool(o *Object) {
select {
case p.objects <- o:
return
default:
if o.conn != nil {
o.conn.Close()
}
return
}
}
func (p *Pool) autoRelease() {
connectLen := len(p.objects)
for i := 0; i < connectLen; i++ {
select {
case o := <-p.objects:
if time.Now().UnixNano()-int64(o.idle) > p.timeout {
o.conn.Close()
} else {
p.PutConnectObjectToPool(o)
}
default:
return
}
}
}
func (p *Pool) ReleaseAll() {
connectLen := len(p.objects)
for i := 0; i < connectLen; i++ {
select {
case o := <-p.objects:
o.conn.Close()
default:
return
}
}
}
func (p *Pool) NewConnect(target string) (c *net.TCPConn, err error) {
var connect net.Conn
connect, err = net.DialTimeout("tcp", p.target, time.Duration(p.connectTimeout)*time.Second)
if err == nil {
conn := connect.(*net.TCPConn)
conn.SetKeepAlive(true)
conn.SetNoDelay(true)
c = conn
}
return
}
func (p *Pool) GetConnectFromPool() (c *net.TCPConn, err error) {
var o *Object
for {
select {
case o = <-p.objects:
default:
return p.NewConnect(p.target)
}
if time.Now().UnixNano()-int64(o.idle) > p.timeout {
_ = o.conn.Close()
o = nil
continue
}
return o.conn, nil
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package cryptoutil
import (
"bytes"
"crypto/aes"
"crypto/cipher"
"crypto/hmac"
"crypto/md5"
"crypto/rand"
"crypto/sha256"
"crypto/tls"
"crypto/x509"
"encoding/base64"
"encoding/binary"
"fmt"
"io"
rand2 "math/rand"
"net/http"
"strconv"
"time"
"unsafe"
)
func pad(src []byte) []byte {
padding := aes.BlockSize - len(src)%aes.BlockSize
padtext := bytes.Repeat([]byte{byte(padding)}, padding)
return append(src, padtext...)
}
func unpad(src []byte) []byte {
length := len(src)
unpadding := int(src[length-1])
return src[:(length - unpadding)]
}
// AesEncryptCBC defines aes encryption with CBC
func AesEncryptCBC(key, plaintext []byte) (ciphertext []byte, err error) {
var block cipher.Block
if len(plaintext) == 0 {
err = fmt.Errorf("input for encryption is invalid")
return
}
paddedText := pad(plaintext)
if len(paddedText)%aes.BlockSize != 0 {
err = fmt.Errorf("paddedText [len=%d] is not a multiple of the block size", len(paddedText))
return
}
block, err = aes.NewCipher(key)
if err != nil {
return
}
ciphertext = make([]byte, aes.BlockSize+len(paddedText))
iv := ciphertext[:aes.BlockSize]
if _, err = io.ReadFull(rand.Reader, iv); err != nil {
return
}
cbc := cipher.NewCBCEncrypter(block, iv)
cbc.CryptBlocks(ciphertext[aes.BlockSize:], paddedText)
return
}
// AesDecryptCBC defines aes decryption with CBC
func AesDecryptCBC(key, ciphertext []byte) (plaintext []byte, err error) {
var block cipher.Block
if block, err = aes.NewCipher(key); err != nil {
return
}
if len(ciphertext) < aes.BlockSize {
err = fmt.Errorf("ciphertext [len=%d] too short; should greater than blocksize", len(ciphertext))
return
}
iv := ciphertext[:aes.BlockSize]
ciphertext = ciphertext[aes.BlockSize:]
cbc := cipher.NewCBCDecrypter(block, iv)
cbc.CryptBlocks(ciphertext, ciphertext)
plaintext = unpad(ciphertext)
return
}
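// Illustrative sketch (added for clarity; not part of the original source):
// a CBC round trip with a 256-bit key. AesEncryptCBC prepends the random IV
// to the ciphertext and AesDecryptCBC consumes it again. The all-zero key is
// for illustration only.
//
//	key := make([]byte, 32)
//	ciphertext, _ := AesEncryptCBC(key, []byte("secret"))
//	plaintext, _ := AesDecryptCBC(key, ciphertext)
//	// bytes.Equal(plaintext, []byte("secret")) == true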
// GenSecretKey generates a secret key according to the pair {ts, id}
func GenSecretKey(key []byte, ts int64, id string) (secretKey []byte) {
b := make([]byte, 8)
binary.LittleEndian.PutUint64(b, uint64(ts))
data := append(b, []byte(id)...)
secretKey = genKey(key, data)
return
}
func genKey(key []byte, data []byte) (sessionKey []byte) {
h := hmac.New(sha256.New, []byte(key))
h.Write([]byte(data))
sessionKey = h.Sum(nil)
return
}
// AuthGenSessionKeyTS authnode generates a session key according to its master key and current timestamp
func AuthGenSessionKeyTS(key []byte) (sessionKey []byte) {
data := []byte(strconv.FormatInt(int64(time.Now().Unix()), 10))
sessionKey = genKey(key, data)
return
}
// Base64Encode encodes text using base64.
func Base64Encode(text []byte) (encodedText string) {
encodedText = base64.StdEncoding.EncodeToString(text)
return
}
// Base64Decode decodes text using base64.
func Base64Decode(encodedText string) (text []byte, err error) {
text, err = base64.StdEncoding.DecodeString(encodedText)
return
}
// EncodeMessage encodes a message with AES encryption and an MD5 checksum.
func EncodeMessage(plaintext []byte, key []byte) (message string, err error) {
var cipher []byte
if len(plaintext) > MaxAllocSize {
return "too max packet", fmt.Errorf("too max packet len %v", len(plaintext))
}
// 8 for random number; 16 for md5 hash
buffer := make([]byte, RandomNumberSize+CheckSumSize+len(plaintext))
// add random
random := rand2.Uint64()
binary.LittleEndian.PutUint64(buffer[RandomNumberOffset:], random)
// add request body
copy(buffer[MessageOffset:], plaintext)
// calculate and add checksum
checksum := md5.Sum(buffer)
copy(buffer[CheckSumOffset:], checksum[:])
// encryption with aes CBC with keysize of 256-bit
if cipher, err = AesEncryptCBC(key, buffer); err != nil {
return
}
// base64 encoding
message = base64.StdEncoding.EncodeToString(cipher)
return
}
// DecodeMessage decodes a message and verifies its validity.
func DecodeMessage(message string, key []byte) (plaintext []byte, err error) {
var (
cipher []byte
decodedText []byte
)
if cipher, err = base64.StdEncoding.DecodeString(message); err != nil {
return
}
if decodedText, err = AesDecryptCBC(key, cipher); err != nil {
return
}
if len(decodedText) <= MessageMetaDataSize {
err = fmt.Errorf("invalid json format with size [%d] less than message meta data size", len(decodedText))
return
}
msgChecksum := make([]byte, CheckSumSize)
copy(msgChecksum, decodedText[CheckSumOffset:CheckSumOffset+CheckSumSize])
// calculate checksum
filltext := bytes.Repeat([]byte{byte(0)}, CheckSumSize)
copy(decodedText[CheckSumOffset:], filltext[:])
newChecksum := md5.Sum(decodedText)
// verify checksum
if !bytes.Equal(msgChecksum, newChecksum[:]) {
err = fmt.Errorf("checksum not match")
}
plaintext = decodedText[MessageOffset:]
// fmt.Printf("DecodeMessage CBC: %s\n", plaintext)
return
}
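// Illustrative sketch (added for clarity; not part of the original source):
// EncodeMessage and DecodeMessage form a round trip; the random prefix and
// MD5 checksum are stripped on decode, leaving only the original payload.
// The all-zero key is for illustration only.
//
//	key := make([]byte, 32)
//	msg, _ := EncodeMessage([]byte(`{"ts":1}`), key)
//	payload, err := DecodeMessage(msg, key)
//	// err == nil and payload holds the original `{"ts":1}` bytes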
// GenVerifier generates a verifier for replay mitigation in HTTP requests.
func GenVerifier(key []byte) (v string, ts int64, err error) {
ts = time.Now().Unix()
tsbuf := make([]byte, unsafe.Sizeof(ts))
binary.LittleEndian.PutUint64(tsbuf, uint64(ts))
if v, err = EncodeMessage(tsbuf, key); err != nil {
panic(err)
}
return
}
// CreateClientX creates an HTTPS client.
func CreateClientX(cert *[]byte) (client *http.Client, err error) {
caCertPool := x509.NewCertPool()
ok := caCertPool.AppendCertsFromPEM(*cert)
if !ok {
err = fmt.Errorf("CreateClientX AppendCertsFromPEM fails")
return
}
// We don't use PKI to verify client since we have secret key for authentication
client = &http.Client{
Transport: &http.Transport{
TLSClientConfig: &tls.Config{
MinVersion: tls.VersionTLS12,
RootCAs: caCertPool,
InsecureSkipVerify: false,
},
},
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package errors
import (
"fmt"
"path"
"runtime"
"strings"
)
type ErrorTrace struct {
msg string
}
func New(msg string) error {
return &ErrorTrace{msg: msg}
}
func NewError(err error) error {
if err == nil {
return nil
}
_, file, line, _ := runtime.Caller(1)
_, fileName := path.Split(file)
return &ErrorTrace{
msg: fmt.Sprintf("[%v %v] %v", fileName, line, err.Error()),
}
}
func NewErrorf(format string, a ...interface{}) error {
msg := fmt.Sprintf(format, a...)
_, file, line, _ := runtime.Caller(1)
_, fileName := path.Split(file)
return &ErrorTrace{
msg: fmt.Sprintf("[%v %v] %v", fileName, line, msg),
}
}
func (e *ErrorTrace) Error() string {
return e.msg
}
func Trace(err error, format string, a ...interface{}) error {
msg := fmt.Sprintf(format, a...)
_, file, line, _ := runtime.Caller(1)
_, fileName := path.Split(file)
if err == nil {
return &ErrorTrace{
msg: fmt.Sprintf("[%v %v] %v", fileName, line, msg),
}
}
return &ErrorTrace{
msg: fmt.Sprintf("[%v %v] %v :: %v", fileName, line, msg, err),
}
}
func Stack(err error) string {
e, ok := err.(*ErrorTrace)
if !ok {
return err.Error()
}
var msg string
stack := strings.Split(e.msg, "::")
for _, s := range stack {
msg = fmt.Sprintf("%v\n%v", msg, strings.TrimPrefix(s, " "))
}
return msg
}
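// Illustrative sketch (added for clarity; not part of the original source):
// Trace prepends a "[file line] message" layer joined by "::", and Stack
// expands the chain into one line per layer.
//
//	err := New("disk offline")
//	err = Trace(err, "load partition %v", 7)
//	err = Trace(err, "start datanode")
//	fmt.Println(Stack(err))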
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package errors
import (
"runtime"
"sync"
_ "unsafe"
"github.com/brahma-adshonor/gohook"
)
var ErrUnsupportedArch = New("Unsupported arch")
//go:linkname gopanic runtime.gopanic
func gopanic(e interface{})
var panicHook func()
// NOTE: the trampoline approach does not work here
var mu sync.Mutex
func hookedPanic(e interface{}) {
mu.Lock()
defer mu.Unlock()
// NOTE: unhook before invoking the hook function
gohook.UnHook(gopanic)
defer gohook.Hook(gopanic, hookedPanic, nil)
panicHook()
gopanic(e)
}
func AtPanic(hook func()) error {
if !SupportPanicHook() {
return ErrUnsupportedArch
}
panicHook = hook
return gohook.Hook(gopanic, hookedPanic, nil)
}
var (
oldToken = false
newToken = false
)
//go:noinline
func setOldToken() {
oldToken = true
}
//go:noinline
func setNewToken() {
newToken = true
}
func supportTest() (ok bool) {
err := gohook.Hook(setOldToken, setNewToken, nil)
if err != nil {
return
}
setOldToken()
err = gohook.UnHook(setOldToken)
if err != nil {
return
}
setOldToken()
ok = oldToken && newToken
return
}
func SupportPanicHook() (ok bool) {
switch runtime.GOARCH {
case "amd64", "386":
ok = supportTest()
}
return
}
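// Usage sketch (illustrative, not part of the original sources): install a
// best-effort hook, e.g. to flush logs before the process unwinds on panic.
func installPanicHook(flush func()) error {
if !SupportPanicHook() {
return ErrUnsupportedArch
}
return AtPanic(flush)
}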
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package exporter
import (
"fmt"
"sync"
"github.com/cubefs/cubefs/util/log"
"github.com/cubefs/cubefs/util/ump"
)
var (
AlarmPool = &sync.Pool{New: func() interface{} {
return new(Alarm)
}}
// AlarmGroup sync.Map
AlarmCh chan *Alarm
)
func collectAlarm() {
AlarmCh = make(chan *Alarm, ChSize)
for {
m := <-AlarmCh
AlarmPool.Put(m)
}
}
type Alarm struct {
Counter
}
func Warning(detail string) (a *Alarm) {
key := fmt.Sprintf("%v_%v_warning", clustername, modulename)
ump.Alarm(key, detail)
log.LogCritical(key, detail)
if !enabledPrometheus {
return
}
a = AlarmPool.Get().(*Alarm)
a.name = metricsName(key)
a.Add(1)
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package exporter
import (
"bytes"
"encoding/json"
"fmt"
"io"
"net"
"net/http"
"regexp"
"strings"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/log"
)
const (
RegisterPeriod = time.Duration(10) * time.Minute
RegisterPath = "/v1/agent/service/register"
)
// ConsulRegisterInfo is the consul registration info exposed for prometheus.
// It is optional and only used when the prometheus exporter is enabled.
type ConsulRegisterInfo struct {
Name string `json:"Name"`
ID string `json:"ID"`
Address string `json:"Address"`
Port int64 `json:"Port"`
Tags []string `json:"Tags"`
Meta map[string]string `json:",omitempty"`
}
// GetConsulId builds the consul service id from app, role, host and port
func GetConsulId(app string, role string, host string, port int64) string {
return fmt.Sprintf("%s_%s_%s_%d", app, role, host, port)
}
// DoConsulRegisterProc registers the service with consul and re-registers it periodically
func DoConsulRegisterProc(addr, app, role, cluster, meta, host string, port int64) {
if len(addr) <= 0 {
return
}
log.LogInfof("metrics consul register %v %v %v", addr, cluster, port)
ticker := time.NewTicker(RegisterPeriod)
defer func() {
if err := recover(); err != nil {
log.LogErrorf("RegisterConsul panic,err[%v]", err)
}
ticker.Stop()
}()
client := &http.Client{}
req := makeRegisterReq(host, addr, app, role, cluster, meta, port)
if req == nil {
log.LogErrorf("make register req error")
return
}
if resp, _ := client.Do(req); resp != nil {
io.ReadAll(resp.Body)
resp.Body.Close()
}
for range ticker.C {
req := makeRegisterReq(host, addr, app, role, cluster, meta, port)
if req == nil {
log.LogErrorf("make register req error")
return
}
if resp, _ := client.Do(req); resp != nil {
io.ReadAll(resp.Body)
resp.Body.Close()
}
}
}
// GetLocalIpAddr returns the local IP address.
func GetLocalIpAddr(filter string) (ipaddr string, err error) {
addrs, err := net.InterfaceAddrs()
if err != nil {
log.LogError("consul register get local ip failed, ", err)
return
}
for _, addr := range addrs {
if ipnet, ok := addr.(*net.IPNet); ok && !ipnet.IP.IsLoopback() {
if ipnet.IP.To4() != nil {
ip := ipnet.IP.String()
if filter != "" {
match, err := doFilter(filter, ip)
if err != nil {
return "", fmt.Errorf("regex match err, err %s", err.Error())
}
if !match {
continue
}
}
return ip, nil
}
}
}
return "", fmt.Errorf("cannot get local ip")
}
// a leading '!' in the filter marks it as a negative (exclusion) match
func doFilter(filter, ip string) (ok bool, err error) {
// negative filter
if strings.HasPrefix(filter, "!") {
filter = filter[1:]
ok, err := regexp.MatchString(filter, ip)
return !ok, err
}
ok, err = regexp.MatchString(filter, ip)
return ok, err
}
// makeRegisterReq builds the consul registration request
func makeRegisterReq(host, addr, app, role, cluster, meta string, port int64) (req *http.Request) {
id := GetConsulId(app, role, host, port)
url := addr + RegisterPath
cInfo := &ConsulRegisterInfo{
Name: app,
ID: id,
Address: host,
Port: port,
Tags: []string{
"app=" + app,
"role=" + role,
"cluster=" + cluster,
},
}
ok, metas := parseMetaStr(meta)
if ok {
cInfo.Meta = metas
cInfo.Meta["cluster"] = cluster
cInfo.Meta["commit"] = proto.CommitID
if len(cInfo.Meta["metric_path"]) == 0 {
cInfo.Meta["metric_path"] = "/metrics"
log.LogInfo("metric_path is empty, use default /metrics")
}
}
cInfoBytes, err := json.Marshal(cInfo)
if err != nil {
log.LogErrorf("marshal error, %v", err.Error())
return nil
}
req, err = http.NewRequest(http.MethodPut, url, bytes.NewBuffer(cInfoBytes))
if err != nil {
log.LogErrorf("new request error, %v", err.Error())
return nil
}
req.Header.Set("Content-Type", "application/json; charset=utf-8")
req.Close = true
return
}
// parse k1=v1;k2=v2 as a map
func parseMetaStr(meta string) (bool, map[string]string) {
if len(meta) == 0 {
log.LogInfo("meta is empty, use default")
meta = "dataset=custom;category=custom;app=cfs;role=fuseclient;metric_path=/metrics"
}
m := map[string]string{}
kvs := strings.Split(meta, ";")
for _, kv := range kvs {
arr := strings.Split(kv, "=")
if len(arr) != 2 {
log.LogInfof("meta is invalid, can't use %s", meta)
return false, m
}
m[arr[0]] = arr[1]
}
return true, m
}
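// Usage sketch (illustrative values only, not part of the original sources):
// how the registration helpers compose into a ConsulRegisterInfo.
func exampleRegisterInfo() *ConsulRegisterInfo {
id := GetConsulId("cfs", "master", "192.168.0.10", 17010)
info := &ConsulRegisterInfo{Name: "cfs", ID: id, Address: "192.168.0.10", Port: 17010}
if ok, meta := parseMetaStr("dataset=custom;metric_path=/metrics"); ok {
info.Meta = meta
}
return info
}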
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package exporter
import (
"sync"
"github.com/cubefs/cubefs/util/log"
"github.com/prometheus/client_golang/prometheus"
)
var (
CounterGroup sync.Map
CounterPool = &sync.Pool{New: func() interface{} {
return new(Counter)
}}
CounterCh chan *Counter
)
func collectCounter() {
CounterCh = make(chan *Counter, ChSize)
for {
m := <-CounterCh
metric := m.Metric()
metric.Add(float64(m.val))
}
}
type Counter struct {
Gauge
}
func NewCounter(name string) (c *Counter) {
c = new(Counter)
c.name = metricsName(name)
return
}
func (c *Counter) Add(val int64) {
if !enabledPrometheus {
return
}
c.val = float64(val)
c.publish()
}
func (c *Counter) publish() {
select {
case CounterCh <- c:
default:
}
}
func (c *Counter) AddWithLabels(val int64, labels map[string]string) {
if !enabledPrometheus {
return
}
c.labels = labels
c.Add(val)
}
func (c *Counter) Metric() prometheus.Counter {
metric := prometheus.NewCounter(
prometheus.CounterOpts{
Name: c.name,
ConstLabels: c.labels,
})
key := c.Key()
actualMetric, load := CounterGroup.LoadOrStore(key, metric)
if load {
return actualMetric.(prometheus.Counter)
}
if enablePush {
registry.MustRegister(actualMetric.(prometheus.Collector))
return actualMetric.(prometheus.Counter)
}
err := prometheus.Register(actualMetric.(prometheus.Collector))
if err == nil {
log.LogInfo("register metric ", c.name)
}
return actualMetric.(prometheus.Counter)
}
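// Usage sketch (not part of the original sources): count an operation with a
// volume label. The metric name "op_count" and the label value are examples.
func exampleCountOp(vol string) {
c := NewCounter("op_count")
c.AddWithLabels(1, map[string]string{Vol: vol})
}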
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package exporter
import (
"fmt"
"net"
"net/http"
"os"
"path/filepath"
"strconv"
"strings"
"time"
"github.com/cubefs/cubefs/proto"
"github.com/cubefs/cubefs/util/config"
"github.com/cubefs/cubefs/util/log"
"github.com/gorilla/mux"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/prometheus/client_golang/prometheus/push"
)
const (
PromHandlerPattern = "/metrics" // prometheus handler
AppName = "cfs" // app name
ConfigKeyExporterEnable = "exporterEnable" // exporter enable
ConfigKeyExporterPort = "exporterPort" // exporter port
ConfigKeyConsulAddr = "consulAddr" // consul addr
ConfigKeyConsulMeta = "consulMeta" // consul meta
ConfigKeyIpFilter = "ipFilter" // add ip filter
ConfigKeyEnablePid = "enablePid" // enable report partition id
ConfigKeyPushAddr = "pushAddr" // enable push data to gateway
ChSize = 1024 * 10 // collect chan size
// monitor label name
Vol = "vol"
Disk = "disk"
PartId = "partid"
Op = "op"
Type = "type"
Err = "err"
)
var (
namespace string
clustername string
modulename string
pushAddr string
exporterPort int64
enabledPrometheus = false
enablePush = false
EnablePid = false
replacer = strings.NewReplacer("-", "_", ".", "_", " ", "_", ",", "_", ":", "_")
registry = prometheus.NewRegistry()
)
func metricsName(name string) string {
return replacer.Replace(fmt.Sprintf("%s_%s", namespace, name))
}
// Init initializes the exporter.
func Init(role string, cfg *config.Config) {
modulename = role
if !cfg.GetBoolWithDefault(ConfigKeyExporterEnable, true) {
log.LogInfof("%v exporter disabled", role)
return
}
EnablePid = cfg.GetBoolWithDefault(ConfigKeyEnablePid, false)
log.LogInfo("enable report partition id info? ", EnablePid)
port := cfg.GetInt64(ConfigKeyExporterPort)
if port < 0 {
log.LogInfof("%v exporter port set random default", port)
}
exporterPort = port
enabledPrometheus = true
pushAddr = cfg.GetString(ConfigKeyPushAddr)
log.LogInfof("pushAddr %v ", pushAddr)
if pushAddr != "" {
enablePush = true
}
http.Handle(PromHandlerPattern, promhttp.HandlerFor(prometheus.DefaultGatherer, promhttp.HandlerOpts{
Timeout: 60 * time.Second,
}))
namespace = AppName + "_" + role
addr := fmt.Sprintf(":%d", port)
l, err := net.Listen("tcp", addr)
if err != nil {
log.LogError("exporter tcp listen error: ", err)
return
}
exporterPort = int64(l.Addr().(*net.TCPAddr).Port)
go func() {
err = http.Serve(l, nil)
if err != nil {
log.LogError("exporter http serve error: ", err)
return
}
}()
collect()
m := NewGauge("start_time")
m.Set(float64(time.Now().Unix() * 1000))
log.LogInfof("exporter Start: %v", exporterPort)
}
// InitWithRouter initializes the exporter and mounts the metrics handler on the given router.
func InitWithRouter(role string, cfg *config.Config, router *mux.Router, exPort string) {
modulename = role
if !cfg.GetBoolWithDefault(ConfigKeyExporterEnable, true) {
log.LogInfof("%v metrics exporter disabled", role)
return
}
exporterPort, _ = strconv.ParseInt(exPort, 10, 64)
enabledPrometheus = true
router.NewRoute().Name("metrics").
Methods(http.MethodGet).
Path(PromHandlerPattern).
Handler(promhttp.HandlerFor(prometheus.DefaultGatherer, promhttp.HandlerOpts{
Timeout: 5 * time.Second,
}))
namespace = AppName + "_" + role
collect()
m := NewGauge("start_time")
m.Set(float64(time.Now().Unix() * 1000))
log.LogInfof("exporter Start: %v %v", exporterPort, m)
}
func RegistConsul(cluster string, role string, cfg *config.Config) {
ipFilter := cfg.GetString(ConfigKeyIpFilter)
host, err := GetLocalIpAddr(ipFilter)
if err != nil {
log.LogErrorf("get local ip error, %v", err.Error())
return
}
rawmnt := cfg.GetString("subdir")
if rawmnt == "" {
rawmnt = "/"
}
mountPoint, _ := filepath.Abs(rawmnt)
log.LogInfof("RegistConsul:%v", enablePush)
if enablePush {
log.LogWarnf("[RegisterConsul] use auto push data strategy, not register consul")
autoPush(pushAddr, role, cluster, host, mountPoint)
return
}
clustername = replacer.Replace(cluster)
consulAddr := cfg.GetString(ConfigKeyConsulAddr)
consulMeta := cfg.GetString(ConfigKeyConsulMeta)
if exporterPort == int64(0) {
exporterPort = cfg.GetInt64(ConfigKeyExporterPort)
}
if exporterPort == 0 {
log.LogInfo("config export port is 0, use default 17510")
exporterPort = 17510
}
if exporterPort != int64(0) && len(consulAddr) > 0 {
if ok := strings.HasPrefix(consulAddr, "http"); !ok {
consulAddr = "http://" + consulAddr
}
go DoConsulRegisterProc(consulAddr, AppName, role, cluster, consulMeta, host, exporterPort)
}
}
func autoPush(pushAddr, role, cluster, ip, mountPoint string) {
pid := os.Getpid()
client := &http.Client{
Timeout: time.Second * 10,
}
hostname, err := os.Hostname()
if err != nil {
log.LogWarnf("get host name failed %v", err)
}
pusher := push.New(pushAddr, "cbfs").
Client(client).
Gatherer(registry).
Grouping("cip", ip).
Grouping("role", role).
Grouping("cluster", cluster).
Grouping("pid", strconv.Itoa(pid)).
Grouping("commit", proto.CommitID).
Grouping("app", AppName).
Grouping("mountPoint", mountPoint).
Grouping("hostName", hostname)
log.LogInfof("start push data, ip %s, addr %s, role %s, cluster %s, mountPoint %s, hostName %s",
ip, pushAddr, role, cluster, mountPoint, hostname)
ticker := time.NewTicker(time.Second * 15)
go func() {
for range ticker.C {
if err := pusher.Push(); err != nil {
log.LogWarnf("push monitor data to %s err, %s", pushAddr, err.Error())
}
}
}()
}
func collect() {
if !enabledPrometheus {
return
}
go collectCounter()
go collectGauge()
go collectHistogram()
go collectAlarm()
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package exporter
import (
"fmt"
"sync"
"github.com/cubefs/cubefs/util/log"
"github.com/prometheus/client_golang/prometheus"
)
var (
GaugeGroup sync.Map
GaugeCh chan *Gauge
)
func collectGauge() {
GaugeCh = make(chan *Gauge, ChSize)
for {
m := <-GaugeCh
metric := m.Metric()
metric.Set(m.val)
// log.LogDebugf("collect metric %v", m)
}
}
type Gauge struct {
name string
labels map[string]string
val float64
}
func NewGauge(name string) (g *Gauge) {
g = new(Gauge)
g.name = metricsName(name)
return
}
func (c *Gauge) Key() (key string) {
return stringMD5(c.Name())
}
func (g *Gauge) Name() string {
return fmt.Sprintf("{%s: %s}", g.name, stringMapToString(g.labels))
}
func (g *Gauge) String() string {
return fmt.Sprintf("{name: %s, labels: %s, val: %v}", g.name, stringMapToString(g.labels), g.val)
}
func (c *Gauge) Metric() prometheus.Gauge {
metric := prometheus.NewGauge(
prometheus.GaugeOpts{
Name: c.name,
ConstLabels: c.labels,
})
key := c.Key()
actualMetric, load := GaugeGroup.LoadOrStore(key, metric)
if load {
return actualMetric.(prometheus.Gauge)
}
if enablePush {
registry.MustRegister(actualMetric.(prometheus.Collector))
return actualMetric.(prometheus.Gauge)
}
err := prometheus.Register(actualMetric.(prometheus.Collector))
if err == nil {
log.LogInfof("register metric %v", c.Name())
} else {
log.LogErrorf("register metric %v, %v", c.Name(), err)
}
return actualMetric.(prometheus.Gauge)
}
func (g *Gauge) Set(val float64) {
if !enabledPrometheus {
return
}
g.val = val
g.publish()
}
func (c *Gauge) publish() {
select {
case GaugeCh <- c:
default:
}
}
func (g *Gauge) SetWithLabels(val float64, labels map[string]string) {
if !enabledPrometheus {
return
}
g.labels = labels
g.Set(val)
}
type GaugeVec struct {
*prometheus.GaugeVec
}
func NewGaugeVec(name, help string, labels []string) *GaugeVec {
if !enabledPrometheus {
return nil
}
v := prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: metricsName(name),
Help: help,
},
labels,
)
if err := prometheus.Register(v); err != nil {
log.LogErrorf("prometheus register gaugevec name:%v, labels:{%v} error: %v", name, labels, err)
return nil
}
return &GaugeVec{GaugeVec: v}
}
func (v *GaugeVec) SetWithLabelValues(val float64, lvs ...string) {
if m, err := v.GetMetricWithLabelValues(lvs...); err == nil {
m.Set(val)
}
}
func (v *GaugeVec) SetBoolWithLabelValues(val bool, lvs ...string) {
if val {
v.SetWithLabelValues(float64(1), lvs...)
} else {
v.SetWithLabelValues(0, lvs...)
}
}
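// Usage sketch (not part of the original sources): a labeled gauge held in a
// long-lived variable; the metric name and help text are illustrative.
var exampleDiskUsageVec = NewGaugeVec("disk_usage_ratio", "disk used ratio", []string{Disk})

func exampleReportDiskUsage(diskName string, ratio float64) {
// nil when prometheus is disabled or registration failed
if exampleDiskUsageVec != nil {
exampleDiskUsageVec.SetWithLabelValues(ratio, diskName)
}
}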
package exporter
import (
"fmt"
"sync"
"github.com/cubefs/cubefs/util/log"
"github.com/prometheus/client_golang/prometheus"
)
var (
// bucket upper bounds in microseconds: 1us, 50us, 250us, 500us, 2.5ms, 5ms, 25ms, 50ms, 250ms, 500ms, 2.5s, 5s
buckets = []float64{1, 50, 250, 500, 2500, 5000, 25000, 50000, 250000, 500000, 2500000, 5000000}
HistogramGroup sync.Map
HistogramCh chan *Histogram
once = sync.Once{}
)
func collectHistogram() {
HistogramCh = make(chan *Histogram, ChSize)
for {
m := <-HistogramCh
metric := m.Metric()
metric.Observe(m.val / 1000)
}
}
type Histogram struct {
name string
labels map[string]string
val float64
}
func (c *Histogram) Key() (key string) {
return stringMD5(c.Name())
}
func (g *Histogram) Name() string {
return fmt.Sprintf("{%s: %s}", g.name, stringMapToString(g.labels))
}
func (g *Histogram) String() string {
return fmt.Sprintf("{name: %s, labels: %s, val: %v}", g.name, stringMapToString(g.labels), g.val)
}
func (c *Histogram) Metric() prometheus.Histogram {
if enablePush {
once.Do(func() {
buckets = []float64{1, 300, 1000, 5000, 500000, 2500000}
})
}
metric := prometheus.NewHistogram(
prometheus.HistogramOpts{
Name: c.name,
ConstLabels: c.labels,
Buckets: buckets,
})
key := c.Key()
actualMetric, load := HistogramGroup.LoadOrStore(key, metric)
if load {
return actualMetric.(prometheus.Histogram)
}
if enablePush {
registry.MustRegister(actualMetric.(prometheus.Collector))
return actualMetric.(prometheus.Histogram)
}
err := prometheus.Register(actualMetric.(prometheus.Collector))
if err == nil {
log.LogInfof("register metric %v", c.Name())
} else {
log.LogErrorf("register metric %v, %v", c.Name(), err)
}
return actualMetric.(prometheus.Histogram)
}
func (h *Histogram) publish() {
select {
case HistogramCh <- h:
default:
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package exporter
import (
"fmt"
"time"
"github.com/cubefs/cubefs/util/ump"
)
type TimePoint struct {
Histogram
startTime time.Time
}
func NewTP(name string) (tp *TimePoint) {
tp = new(TimePoint)
tp.name = fmt.Sprintf("%s_hist", metricsName(name))
tp.labels = make(map[string]string)
tp.val = 0
tp.startTime = time.Now()
return
}
func (tp *TimePoint) Set() {
if !enabledPrometheus {
return
}
val := time.Since(tp.startTime).Nanoseconds()
tp.val = float64(val)
tp.publish()
}
func (tp *TimePoint) SetWithLabels(labels map[string]string) {
if !enabledPrometheus {
return
}
tp.labels = labels
tp.Set()
}
func (tp *TimePoint) GetStartTime() time.Time {
return tp.startTime
}
type TimePointCount struct {
tp *TimePoint
cnt *Counter
to *ump.TpObject
}
func NewTPCnt(name string) (tpc *TimePointCount) {
tpc = new(TimePointCount)
tpc.to = ump.BeforeTP(fmt.Sprintf("%v_%v_%v", clustername, modulename, name))
tpc.tp = NewTP(name)
tpc.cnt = NewCounter(fmt.Sprintf("%s_count", name))
return
}
// Set should be invoked via defer, e.g. defer func() { tpc.Set(err) }()
func (tpc *TimePointCount) Set(err error) {
ump.AfterTP(tpc.to, err)
tpc.tp.Set()
tpc.cnt.Add(1)
}
func (tpc *TimePointCount) SetWithLabels(err error, labels map[string]string) {
ump.AfterTP(tpc.to, err)
if !enabledPrometheus {
return
}
tpc.tp.SetWithLabels(labels)
tpc.cnt.AddWithLabels(1, labels)
}
func (tpc *TimePointCount) GetStartTime() time.Time {
return tpc.tp.GetStartTime()
}
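// Usage sketch (not part of the original sources): the intended defer pattern,
// recording both latency and an error-aware ump alarm for an operation.
// The metric name "example_op" is illustrative.
func exampleTimedOperation(do func() error) (err error) {
tpc := NewTPCnt("example_op")
defer func() { tpc.Set(err) }()
err = do()
return
}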
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package exporter
import (
"crypto/md5"
"encoding/json"
"fmt"
"io"
)
func stringMD5(str string) string {
h := md5.New()
_, err := io.WriteString(h, str)
if err != nil {
return ""
}
return fmt.Sprintf("%x", h.Sum(nil))
}
func stringMapToString(m map[string]string) string {
mjson, err := json.Marshal(m)
if err != nil {
return "{}"
}
return string(mjson)
}
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fileutil
import "os"
func Exist(path string) bool {
_, err := os.Stat(path)
return err == nil || !os.IsNotExist(err)
}
func ExistDir(path string) bool {
state, err := os.Stat(path)
if err != nil {
// avoid dereferencing a nil FileInfo when Stat fails
return false
}
return state.IsDir()
}
// Copyright 2024 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fileutil
import (
"os"
"syscall"
)
func Stat(name string) (stat *syscall.Stat_t, err error) {
info, err := os.Stat(name)
if err != nil {
return
}
stat = info.Sys().(*syscall.Stat_t)
return
}
// Copyright 2024 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package fileutil
import "syscall"
type FilesystemInfo = syscall.Statfs_t
func Statfs(name string) (stat *FilesystemInfo, err error) {
stat = &FilesystemInfo{}
err = syscall.Statfs(name, stat)
return
}
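// Usage sketch (not part of the original sources): derive the available byte
// count of a filesystem from the raw statfs counters.
func exampleAvailableBytes(path string) (uint64, error) {
stat, err := Statfs(path)
if err != nil {
return 0, err
}
return stat.Bavail * uint64(stat.Bsize), nil
}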
package util
type respErr struct {
errCh chan error
}
func (e *respErr) init() {
e.errCh = make(chan error, 1)
}
func (e *respErr) respond(err error) {
e.errCh <- err
close(e.errCh)
}
func (e *respErr) error() <-chan error {
return e.errCh
}
// Future the future
type Future struct {
respErr
respCh chan interface{}
}
func NewFuture() *Future {
f := &Future{
respCh: make(chan interface{}, 1),
}
f.init()
return f
}
func (f *Future) Respond(resp interface{}, err error) {
if err == nil {
f.respCh <- resp
close(f.respCh)
} else {
f.respErr.respond(err)
}
}
// Response wait response
func (f *Future) Response() (resp interface{}, err error) {
select {
case err = <-f.error():
return
case resp = <-f.respCh:
return
}
}
// AsyncResponse export channels
func (f *Future) AsyncResponse() (respCh <-chan interface{}, errCh <-chan error) {
return f.respCh, f.errCh
}
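// Usage sketch (not part of the original sources): a producer responds once
// with either a value or an error, while the consumer blocks on Response.
func exampleFuture(do func() (interface{}, error)) (interface{}, error) {
f := NewFuture()
go func() {
f.Respond(do())
}()
return f.Response()
}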
// Copyright 2015 The Go Authors. All rights reserved.
//
// Modified by 2020 The CubeFS Authors.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Minimal RFC 6724 address selection.
package iputil
import (
"net"
)
// commonPrefixLen reports the length of the longest prefix (looking
// at the most significant, or leftmost, bits) that the
// two addresses have in common, up to the length of a's prefix (i.e.,
// the portion of the address not including the interface ID).
//
// If a or b is an IPv4 address represented as an IPv6 address, the IPv4
// addresses are compared (with a maximum common prefix length of 32).
// If a and b are different IP versions, 0 is returned.
//
// See https://tools.ietf.org/html/rfc6724#section-2.2
func commonPrefixLen(a, b net.IP) (cpl int) {
if a4 := a.To4(); a4 != nil {
a = a4
}
if b4 := b.To4(); b4 != nil {
b = b4
}
if len(a) != len(b) {
return 0
}
// If IPv6, only up to the prefix (first 64 bits)
if len(a) > 8 {
a = a[:8]
b = b[:8]
}
for len(a) > 0 {
if a[0] == b[0] {
cpl += 8
a = a[1:]
b = b[1:]
continue
}
bits := 8
ab, bb := a[0], b[0]
for {
ab >>= 1
bb >>= 1
bits--
if ab == bb {
cpl += bits
return
}
}
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package iputil
import (
"errors"
"net"
"net/http"
"strings"
)
var cidrs []*net.IPNet
func init() {
maxCidrBlocks := []string{
"127.0.0.1/8", // localhost
"10.0.0.0/8", // 24-bit block
"172.16.0.0/12", // 20-bit block
"192.168.0.0/16", // 16-bit block
"169.254.0.0/16", // link local address
"::1/128", // localhost IPv6
"fc00::/7", // unique local address IPv6
"fe80::/10", // link local address IPv6
}
cidrs = make([]*net.IPNet, len(maxCidrBlocks))
for i, maxCidrBlock := range maxCidrBlocks {
_, cidr, _ := net.ParseCIDR(maxCidrBlock)
cidrs[i] = cidr
}
}
// isPrivateAddress works by checking if the address is under private CIDR blocks.
// List of private CIDR blocks can be seen on :
//
// https://en.wikipedia.org/wiki/Private_network
//
// https://en.wikipedia.org/wiki/Link-local_address
func isPrivateAddress(address string) (bool, error) {
ipAddress := net.ParseIP(address)
if ipAddress == nil {
return false, errors.New("address is not valid")
}
for i := range cidrs {
if cidrs[i].Contains(ipAddress) {
return true, nil
}
}
return false, nil
}
// FromRequest return client's real public IP address from http request headers.
func FromRequest(r *http.Request) string {
// Fetch header value
xRealIP := r.Header.Get("X-Real-Ip")
xForwardedFor := r.Header.Get("X-Forwarded-For")
// If both empty, return IP from remote address
if xRealIP == "" && xForwardedFor == "" {
var remoteIP string
// If there is a colon in the remote address, strip the port number;
// otherwise, return the remote address as is
if strings.ContainsRune(r.RemoteAddr, ':') {
remoteIP, _, _ = net.SplitHostPort(r.RemoteAddr)
} else {
remoteIP = r.RemoteAddr
}
return remoteIP
}
// Check list of IP in X-Forwarded-For and return the first global address
for _, address := range strings.Split(xForwardedFor, ",") {
address = strings.TrimSpace(address)
isPrivate, err := isPrivateAddress(address)
if !isPrivate && err == nil {
return address
}
}
// If nothing succeeds, fall back to X-Real-IP
return xRealIP
}
// Deprecated: RealIP is kept for compatibility; use FromRequest instead.
func RealIP(r *http.Request) string {
return FromRequest(r)
}
// DEFAULT_MAX_DISTANCE is the maximum distance between two IPs, i.e. the bit length of an IPv6 address
const DEFAULT_MAX_DISTANCE = 128
func GetDistance(a, b net.IP) int {
return DEFAULT_MAX_DISTANCE - commonPrefixLen(a, b)
}
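// Usage sketch (not part of the original sources): prefer the candidate host
// that shares the longer common prefix with the local address.
func exampleCloserHost(local, a, b net.IP) net.IP {
if GetDistance(local, a) <= GetDistance(local, b) {
return a
}
return b
}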
package keystore
import (
"encoding/json"
"fmt"
"regexp"
"github.com/cubefs/cubefs/util/caps"
)
type AccessKeyInfo struct {
AccessKey string `json:"access_key"`
ID string `json:"id"`
}
type AccessKeyCaps struct {
AccessKey string `json:"access_key"`
SecretKey string `json:"secret_key"`
Caps []byte `json:"caps"`
ID string `json:"user_id"`
}
func (u *AccessKeyCaps) IsValidCaps() (err error) {
cap := new(caps.Caps)
if err = cap.Init(u.Caps); err != nil {
err = fmt.Errorf("Invalid caps [%s] %s", u.Caps, err.Error())
}
return
}
func (u *AccessKeyCaps) IsValidAK() (err error) {
re := regexp.MustCompile("^[A-Za-z0-9]{16}$")
if !re.MatchString(u.AccessKey) {
err = fmt.Errorf("invalid AccessKey [%s]", u.AccessKey)
return
}
return
}
func (u *AccessKeyCaps) DumpJSONStr() (r string, err error) {
dumpInfo := struct {
AccessKey string `json:"access_key"`
SecretKey string `json:"secret_key"`
Caps string `json:"caps"`
ID string `json:"id"`
}{
u.AccessKey,
u.SecretKey,
string(u.Caps),
u.ID,
}
data, err := json.MarshalIndent(dumpInfo, "", " ")
if err != nil {
return
}
r = string(data)
return
}
package keystore
import (
"encoding/json"
"fmt"
"io"
"os"
"regexp"
"github.com/cubefs/cubefs/util/caps"
)
var roleSet = map[string]bool{
"client": true,
"service": true,
}
// KeyInfo defines the key info structure in key store
type KeyInfo struct {
ID string `json:"id"`
AuthKey []byte `json:"auth_key"`
AccessKey string `json:"access_key"`
SecretKey string `json:"secret_key"`
Ts int64 `json:"create_ts"`
Role string `json:"role"`
Caps []byte `json:"caps"`
}
// DumpJSONFile dump KeyInfo to file in json format
func (u *KeyInfo) DumpJSONFile(filename string, authIdKey string) (err error) {
var data string
if data, err = u.DumpJSONStr(authIdKey); err != nil {
return
}
file, err := os.Create(filename)
if err != nil {
return
}
defer file.Close()
_, err = io.WriteString(file, data)
if err != nil {
return
}
return
}
// DumpJSONStr dump KeyInfo to string in json format
func (u *KeyInfo) DumpJSONStr(authIdKey string) (r string, err error) {
dumpInfo := struct {
ID string `json:"id"`
AuthKey []byte `json:"auth_key"`
AccessKey string `json:"access_key"`
SecretKey string `json:"secret_key"`
Ts int64 `json:"create_ts"`
Role string `json:"role"`
Caps string `json:"caps"`
AuthIdKey string `json:"auth_id_key"`
}{
u.ID,
u.AuthKey,
u.AccessKey,
u.SecretKey,
u.Ts,
u.Role,
string(u.Caps),
authIdKey,
}
data, err := json.MarshalIndent(dumpInfo, "", " ")
if err != nil {
return
}
r = string(data)
return
}
// IsValidID check the validity of ID
func (u *KeyInfo) IsValidID() (err error) {
re := regexp.MustCompile("^[A-Za-z]{1,1}[A-Za-z0-9_]{0,20}$")
if !re.MatchString(u.ID) {
err = fmt.Errorf("invalid ID [%s]", u.ID)
return
}
return
}
// IsValidRole check the validity of role
func (u *KeyInfo) IsValidRole() (err error) {
if _, ok := roleSet[u.Role]; !ok {
err = fmt.Errorf("invalid Role [%s]", u.Role)
return
}
return
}
// IsValidCaps check the validity of caps
func (u *KeyInfo) IsValidCaps() (err error) {
cap := new(caps.Caps)
if err = cap.Init(u.Caps); err != nil {
err = fmt.Errorf("Invalid caps [%s] %s", u.Caps, err.Error())
}
return
}
// IsValidKeyInfo is a valid of KeyInfo
func (u *KeyInfo) IsValidKeyInfo() (err error) {
if err = u.IsValidID(); err != nil {
return
}
if err = u.IsValidRole(); err != nil {
return
}
if err = u.IsValidCaps(); err != nil {
return
}
return
}
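// Usage sketch (not part of the original sources): validate a key record
// before persisting it; the file name and authIdKey come from the caller.
func exampleStoreKey(filename string, info *KeyInfo, authIdKey string) error {
if err := info.IsValidKeyInfo(); err != nil {
return err
}
return info.DumpJSONFile(filename, authIdKey)
}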
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package loadutil
import (
"fmt"
"time"
"github.com/shirou/gopsutil/cpu"
"github.com/cubefs/cubefs/util/log"
)
func GetCpuUtilPercent(sampleDuration time.Duration) (used float64, err error) {
utils, err := cpu.Percent(sampleDuration, false)
if err != nil {
log.LogErrorf("[GetCpuUtilPercent] err: %v", err.Error())
return
}
if utils == nil {
err = fmt.Errorf("got nil result")
log.LogErrorf("[GetCpuUtilPercent] err: %v", err.Error())
return
}
if len(utils) == 0 {
err = fmt.Errorf("got result len is 0")
log.LogErrorf("[GetCpuUtilPercent] err: %v", err.Error())
return
}
used = utils[0]
return
}
// Copyright 2023 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package loadutil
import (
"errors"
"fmt"
"time"
"github.com/shirou/gopsutil/disk"
)
// getMatchCount returns the length of the common prefix of lhs and rhs.
func getMatchCount(lhs string, rhs string) int {
count := len(lhs)
if count > len(rhs) {
count = len(rhs)
}
for i := 0; i < count; i++ {
if lhs[i] != rhs[i] {
// the common prefix ends at the first mismatching byte
return i
}
}
return count
}
// GetMatchParation returns the partition whose mount point is the longest
// prefix of the given path.
func GetMatchParation(path string) (*disk.PartitionStat, error) {
partitions, err := disk.Partitions(true)
if err != nil {
return nil, err
}
maxMatch := 0
matchParation := disk.PartitionStat{}
for _, partition := range partitions {
match := getMatchCount(path, partition.Mountpoint)
if match == len(partition.Mountpoint) && match > maxMatch {
// remember the longest mount-point match seen so far
maxMatch = match
matchParation = partition
}
}
return &matchParation, nil
}
var (
ErrInvalidDiskPartition = errors.New("invalid disk partiton")
ErrFailedToGetIoCounter = errors.New("failed to get io counter")
)
func getDeviceNameFromPartition(partition *disk.PartitionStat) (string, error) {
var name string
if n, err := fmt.Sscanf(partition.Device, "/dev/%s", &name); n != 1 || err != nil {
return "", ErrInvalidDiskPartition
}
return name, nil
}
func GetIoCounter(partition *disk.PartitionStat) (*disk.IOCountersStat, error) {
name, err := getDeviceNameFromPartition(partition)
if err != nil {
return nil, err
}
counters, err := disk.IOCounters(name)
if err != nil {
return nil, err
}
counter, exist := counters[name]
if !exist {
return nil, ErrFailedToGetIoCounter
}
return &counter, nil
}
type DiskIoSampleItem struct {
time time.Time
ioCounter *disk.IOCountersStat
}
func getDiskIoSampleItem(partition *disk.PartitionStat) (*DiskIoSampleItem, error) {
ioCounter, err := GetIoCounter(partition)
if err != nil {
return nil, err
}
// avoid shadowing the time package
now := time.Now()
return &DiskIoSampleItem{
time: now,
ioCounter: ioCounter,
}, nil
}
func getReadFlow(first *DiskIoSampleItem, second *DiskIoSampleItem) uint64 {
ms := uint64(second.time.Sub(first.time).Milliseconds())
if ms == 0 {
// avoid a divide-by-zero for sub-millisecond sampling windows
return 0
}
bytes := second.ioCounter.ReadBytes - first.ioCounter.ReadBytes
return bytes * 1000 / ms
}
func getWriteFlow(first *DiskIoSampleItem, second *DiskIoSampleItem) uint64 {
ms := uint64(second.time.Sub(first.time).Milliseconds())
if ms == 0 {
return 0
}
bytes := second.ioCounter.WriteBytes - first.ioCounter.WriteBytes
return bytes * 1000 / ms
}
func getIoCount(first *DiskIoSampleItem, second *DiskIoSampleItem) uint64 {
count := second.ioCounter.ReadCount - first.ioCounter.ReadCount + second.ioCounter.WriteCount - first.ioCounter.WriteCount
return count
}
func getTotalReadWaitTime(first *DiskIoSampleItem, second *DiskIoSampleItem) uint64 {
count := second.ioCounter.ReadTime - first.ioCounter.ReadTime
return count
}
func getTotalWriteWaitTime(first *DiskIoSampleItem, second *DiskIoSampleItem) uint64 {
count := second.ioCounter.WriteTime - first.ioCounter.WriteTime
return count
}
func getIoTotalWaitTime(first *DiskIoSampleItem, second *DiskIoSampleItem) uint64 {
count := second.ioCounter.IoTime - first.ioCounter.IoTime
return count
}
func getIoTotalWeightedWaitTime(first *DiskIoSampleItem, second *DiskIoSampleItem) uint64 {
count := second.ioCounter.WeightedIO - first.ioCounter.WeightedIO
return count
}
type DiskIoSample struct {
partition *disk.PartitionStat
firstItem *DiskIoSampleItem
secondItem *DiskIoSampleItem
}
func (sample *DiskIoSample) GetReadCount() uint64 {
return sample.secondItem.ioCounter.ReadCount - sample.firstItem.ioCounter.ReadCount
}
func (sample *DiskIoSample) GetReadFlow() uint64 {
return getReadFlow(sample.firstItem, sample.secondItem)
}
func (sample *DiskIoSample) GetReadBytes() uint64 {
return sample.secondItem.ioCounter.ReadBytes - sample.firstItem.ioCounter.ReadBytes
}
func (sample *DiskIoSample) GetReadTotalWaitTime() time.Duration {
return time.Duration(getTotalReadWaitTime(sample.firstItem, sample.secondItem)) * time.Millisecond
}
func (sample *DiskIoSample) GetReadAvgWaitTime() time.Duration {
if sample.GetReadCount() == 0 {
return 0
}
return sample.GetReadTotalWaitTime() / time.Duration(sample.GetReadCount())
}
func (sample *DiskIoSample) GetMergedReadCount() uint64 {
return sample.secondItem.ioCounter.MergedReadCount - sample.firstItem.ioCounter.MergedReadCount
}
func (sample *DiskIoSample) GetWriteCount() uint64 {
return sample.secondItem.ioCounter.WriteCount - sample.firstItem.ioCounter.WriteCount
}
func (sample *DiskIoSample) GetWriteFlow() uint64 {
return getWriteFlow(sample.firstItem, sample.secondItem)
}
func (sample *DiskIoSample) GetWriteBytes() uint64 {
return sample.secondItem.ioCounter.WriteBytes - sample.firstItem.ioCounter.WriteBytes
}
func (sample *DiskIoSample) GetWriteTotalWaitTime() time.Duration {
return time.Duration(getTotalWriteWaitTime(sample.firstItem, sample.secondItem)) * time.Millisecond
}
func (sample *DiskIoSample) GetWriteAvgWaitTime() time.Duration {
if sample.GetWriteCount() == 0 {
return 0
}
return sample.GetWriteTotalWaitTime() / time.Duration(sample.GetWriteCount())
}
func (sample *DiskIoSample) GetMergedWriteCount() uint64 {
return sample.secondItem.ioCounter.MergedWriteCount - sample.firstItem.ioCounter.MergedWriteCount
}
func (sample *DiskIoSample) GetIoCount() uint64 {
return getIoCount(sample.firstItem, sample.secondItem)
}
func (sample *DiskIoSample) GetIoTotalWaitTime() time.Duration {
return time.Duration(getIoTotalWaitTime(sample.firstItem, sample.secondItem)) * time.Millisecond
}
func (sample *DiskIoSample) GetIoAvgWaitTime() time.Duration {
if sample.GetIoCount() == 0 {
return 0
}
return sample.GetIoTotalWaitTime() / time.Duration(sample.GetIoCount())
}
func (sample *DiskIoSample) GetWeightedTotalWaitTime() time.Duration {
return time.Duration(getIoTotalWeightedWaitTime(sample.firstItem, sample.secondItem)) * time.Millisecond
}
func (sample *DiskIoSample) GetWeightedAvgWaitTime() time.Duration {
if sample.GetIoCount() == 0 {
return 0
}
return sample.GetWeightedTotalWaitTime() / time.Duration(sample.GetIoCount())
}
func (sample *DiskIoSample) GetIopsInProgress() uint64 {
return sample.secondItem.ioCounter.IopsInProgress
}
func (sample *DiskIoSample) GetIoUtilPercent() float64 {
return float64(sample.GetIoTotalWaitTime()) / float64(sample.GetSampleDuration()) * 100
}
func (sample *DiskIoSample) GetSampleDuration() time.Duration {
return sample.secondItem.time.Sub(sample.firstItem.time)
}
func (sample *DiskIoSample) GetPartition() *disk.PartitionStat {
return sample.partition
}
func GetDiskIoSample(partition *disk.PartitionStat, duration time.Duration) (DiskIoSample, error) {
var sample DiskIoSample
first, err := getDiskIoSampleItem(partition)
if err != nil {
return sample, err
}
time.Sleep(duration)
second, err := getDiskIoSampleItem(partition)
if err != nil {
return sample, err
}
sample.partition = partition
sample.firstItem = first
sample.secondItem = second
return sample, nil
}
func GetDisksIoSample(partitions []*disk.PartitionStat, duration time.Duration) (map[string]DiskIoSample, error) {
count := len(partitions)
samples := make(map[string]DiskIoSample)
if count != 0 {
firstItems := make([]*DiskIoSampleItem, 0, count)
for i := 0; i < count; i++ {
first, err := getDiskIoSampleItem(partitions[i])
if err != nil {
return nil, err
}
firstItems = append(firstItems, first)
}
time.Sleep(duration)
for i := 0; i < count; i++ {
var sample DiskIoSample
first := firstItems[i]
second, err := getDiskIoSampleItem(partitions[i])
if err != nil {
return nil, err
}
sample.partition = partitions[i]
sample.firstItem = first
sample.secondItem = second
samples[partitions[i].Device] = sample
}
}
return samples, nil
}
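// Usage sketch (not part of the original sources): sample the partition
// backing a path for one second and report its IO utilisation percentage.
// Sampling windows should be at least one millisecond for meaningful flows.
func exampleSampleUtil(path string) (float64, error) {
partition, err := GetMatchParation(path)
if err != nil {
return 0, err
}
sample, err := GetDiskIoSample(partition, time.Second)
if err != nil {
return 0, err
}
return sample.GetIoUtilPercent(), nil
}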
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package loadutil
import "github.com/shirou/gopsutil/mem"
func GetMemUsedPercent() float64 {
memInfo, err := mem.VirtualMemory()
if err != nil || memInfo == nil {
// avoid dereferencing a nil result when the sample fails
return 0
}
return memInfo.UsedPercent
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package log
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"io/ioutil"
"log"
"math"
"net/http"
"os"
"path"
"runtime"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
blog "github.com/cubefs/cubefs/blobstore/util/log"
syslog "log"
)
type Level uint8
const (
DebugLevel Level = 1
InfoLevel = DebugLevel<<1 + 1
WarnLevel = InfoLevel<<1 + 1
ErrorLevel = WarnLevel<<1 + 1
FatalLevel = ErrorLevel<<1 + 1
CriticalLevel = FatalLevel << +1
ReadLevel = InfoLevel
UpdateLevel = InfoLevel
)
const (
FileNameDateFormat = "20060102150405"
FileOpt = os.O_RDWR | os.O_CREATE | os.O_APPEND
WriterBufferInitSize = 4 * 1024 * 1024
WriterBufferLenLimit = 4 * 1024 * 1024
DefaultRotateInterval = 1 * time.Second
RotatedExtension = ".old"
MaxReservedDays = 7 * 24 * time.Hour
)
var levelPrefixes = []string{
"[DEBUG]",
"[INFO ]",
"[WARN ]",
"[ERROR]",
"[FATAL]",
"[READ ]",
"[WRITE]",
"[Critical]",
}
type RotatedFile []os.FileInfo
func (f RotatedFile) Less(i, j int) bool {
return f[i].ModTime().Before(f[j].ModTime())
}
func (f RotatedFile) Len() int {
return len(f)
}
func (f RotatedFile) Swap(i, j int) {
f[i], f[j] = f[j], f[i]
}
func setBlobLogLevel(loglevel Level) {
blevel := blog.Lwarn
switch loglevel {
case DebugLevel:
blevel = blog.Ldebug
case InfoLevel:
blevel = blog.Linfo
case WarnLevel:
blevel = blog.Lwarn
case ErrorLevel:
blevel = blog.Lerror
default:
blevel = blog.Lwarn
}
blog.SetOutputLevel(blevel)
}
type asyncWriter struct {
file *os.File
fileName string
logSize int64
rotateSize int64
buffer *bytes.Buffer
flushTmp *bytes.Buffer
flushC chan bool
rotateDay chan struct{} // TODO rotateTime?
mu sync.Mutex
rotateMu sync.Mutex
}
func (writer *asyncWriter) flushScheduler() {
ticker := time.NewTicker(1 * time.Second)
for {
select {
case <-ticker.C:
writer.flushToFile()
case _, open := <-writer.flushC:
writer.flushToFile()
if !open {
ticker.Stop()
// TODO Unhandled errors
writer.file.Close()
return
}
}
}
}
// Write buffers the log data and triggers an asynchronous flush once the
// buffer exceeds WriterBufferLenLimit.
func (writer *asyncWriter) Write(p []byte) (n int, err error) {
writer.mu.Lock()
writer.buffer.Write(p)
bufLen := writer.buffer.Len()
writer.mu.Unlock()
n = len(p)
if bufLen > WriterBufferLenLimit {
select {
case writer.flushC <- true:
default:
}
}
return
}
// Close closes the writer.
func (writer *asyncWriter) Close() (err error) {
writer.mu.Lock()
defer writer.mu.Unlock()
close(writer.flushC)
return
}
// Flush flushes the write.
func (writer *asyncWriter) Flush() {
writer.flushToFile()
// TODO Unhandled errors
writer.file.Sync()
}
func (writer *asyncWriter) flushToFile() {
writer.mu.Lock()
writer.buffer, writer.flushTmp = writer.flushTmp, writer.buffer
writer.mu.Unlock()
isRotateDay := false
select {
case <-writer.rotateDay:
isRotateDay = true
default:
}
flushLength := writer.flushTmp.Len()
writer.rotateMu.Lock()
if (writer.logSize+int64(flushLength)) >= writer.rotateSize || isRotateDay {
oldFile := writer.fileName + "." + time.Now().Format(FileNameDateFormat) + RotatedExtension
if _, err := os.Lstat(oldFile); err != nil {
if err := writer.rename(oldFile); err == nil {
if fp, err := os.OpenFile(writer.fileName, FileOpt, 0o666); err == nil {
writer.file.Close()
writer.file = fp
writer.logSize = 0
_ = os.Chmod(writer.fileName, 0o666)
} else {
syslog.Printf("log rotate: openFile %v error: %v", writer.fileName, err)
}
} else {
syslog.Printf("log rotate: rename %v error: %v ", oldFile, err)
}
} else {
syslog.Printf("log rotate: lstat error: %v already exists", oldFile)
}
}
writer.rotateMu.Unlock()
writer.logSize += int64(flushLength)
// TODO Unhandled errors
writer.file.Write(writer.flushTmp.Bytes())
writer.flushTmp.Reset()
}
func (writer *asyncWriter) rename(newName string) error {
if err := os.Rename(writer.fileName, newName); err != nil {
return err
}
return nil
}
func newAsyncWriter(fileName string, rotateSize int64) (*asyncWriter, error) {
fp, err := os.OpenFile(fileName, FileOpt, 0o666)
if err != nil {
return nil, err
}
fInfo, err := fp.Stat()
if err != nil {
return nil, err
}
_ = os.Chmod(fileName, 0o666)
w := &asyncWriter{
file: fp,
fileName: fileName,
rotateSize: rotateSize,
logSize: fInfo.Size(),
buffer: bytes.NewBuffer(make([]byte, 0, WriterBufferInitSize)),
flushTmp: bytes.NewBuffer(make([]byte, 0, WriterBufferInitSize)),
flushC: make(chan bool, 1000),
rotateDay: make(chan struct{}, 1),
}
go w.flushScheduler()
return w, nil
}
// LogObject defines the log object.
type LogObject struct {
*log.Logger
object *asyncWriter
}
// Flush flushes the log object.
func (ob *LogObject) Flush() {
if ob.object != nil {
ob.object.Flush()
}
}
func (ob *LogObject) SetRotation() {
ob.object.rotateDay <- struct{}{}
}
func newLogObject(writer *asyncWriter, prefix string, flag int) *LogObject {
return &LogObject{
Logger: log.New(writer, prefix, flag),
object: writer,
}
}
// Log defines the log struct.
type Log struct {
dir string
errorLogger *LogObject
warnLogger *LogObject
debugLogger *LogObject
infoLogger *LogObject
readLogger *LogObject
updateLogger *LogObject
criticalLogger *LogObject
qosLogger *LogObject
level Level
rotate *LogRotate
lastRolledTime time.Time
printStderr int32
}
var (
ErrLogFileName = "_error.log"
WarnLogFileName = "_warn.log"
InfoLogFileName = "_info.log"
DebugLogFileName = "_debug.log"
ReadLogFileName = "_read.log"
UpdateLogFileName = "_write.log"
CriticalLogFileName = "_critical.log"
QoSLogFileName = "_qos.log"
)
var gLog *Log = nil
var LogDir string
func (l *Log) DisableStderrOutput() {
atomic.StoreInt32(&l.printStderr, 0)
}
func (l *Log) outputStderr(calldepth int, s string) {
if atomic.LoadInt32(&l.printStderr) != 0 {
log.Output(calldepth+1, s)
}
}
// InitLog initializes the log.
func InitLog(dir, module string, level Level, rotate *LogRotate, logLeftSpaceLimit int64) (*Log, error) {
l := new(Log)
l.printStderr = 1
dir = path.Join(dir, module)
l.dir = dir
LogDir = dir
fi, err := os.Stat(dir)
if err != nil {
os.MkdirAll(dir, 0o755)
} else {
if !fi.IsDir() {
return nil, errors.New(dir + " is not a directory")
}
}
_ = os.Chmod(dir, 0o755)
fs := syscall.Statfs_t{}
if err := syscall.Statfs(dir, &fs); err != nil {
return nil, fmt.Errorf("[InitLog] stats disk space: %s", err.Error())
}
if rotate == nil {
rotate = NewLogRotate()
}
if rotate.headRoom == 0 {
var minLogLeftSpaceLimit float64
if float64(fs.Bavail*uint64(fs.Bsize)) < float64(fs.Blocks*uint64(fs.Bsize))*DefaultHeadRatio {
minLogLeftSpaceLimit = float64(fs.Bavail*uint64(fs.Bsize)) * DefaultHeadRatio / 1024 / 1024
} else {
minLogLeftSpaceLimit = float64(fs.Blocks*uint64(fs.Bsize)) * DefaultHeadRatio / 1024 / 1024
}
minLogLeftSpaceLimit = math.Max(minLogLeftSpaceLimit, float64(logLeftSpaceLimit))
rotate.SetHeadRoomMb(int64(math.Min(minLogLeftSpaceLimit, DefaultHeadRoom)))
}
if rotate.rotateSize == 0 {
minRotateSize := int64(fs.Bavail * uint64(fs.Bsize) / uint64(len(levelPrefixes)))
if minRotateSize < DefaultMinRotateSize {
minRotateSize = DefaultMinRotateSize
}
rotate.SetRotateSizeMb(int64(math.Min(float64(minRotateSize), float64(DefaultRotateSize))))
}
l.rotate = rotate
err = l.initLog(dir, module, level)
if err != nil {
return nil, err
}
l.lastRolledTime = time.Now()
go l.checkLogRotation(dir, module)
gLog = l
setBlobLogLevel(level)
return l, nil
}
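// Usage sketch (not part of the original sources): initialise the module
// logger with default rotation; the directory and module name are examples.
func exampleInitLog() (*Log, error) {
return InitLog("/var/logs/cubefs", "master", InfoLevel, nil, 0)
}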
func TruncMsg(msg string) string {
return TruncMsgWith(msg, 100)
}
func TruncMsgWith(msg string, size int) string {
if len(msg) < size {
return msg
}
return msg[0:size]
}
func OutputPid(logDir, role string) error {
pidFile := path.Join(logDir, fmt.Sprintf("%s.pid", role))
file, err := os.Create(pidFile)
if err != nil {
return fmt.Errorf("open pid file %s error %s", pidFile, err.Error())
}
pid := os.Getpid()
_, err = file.Write([]byte(fmt.Sprintf("%d", pid)))
if err != nil {
return fmt.Errorf("write pid failed, pid %d, file %s, err %s", pid, pidFile, err.Error())
}
file.Close()
return nil
}
func (l *Log) initLog(logDir, module string, level Level) error {
logOpt := log.LstdFlags | log.Lmicroseconds
newLog := func(logFileName string) (newLogger *LogObject, err error) {
logName := path.Join(logDir, module+logFileName)
w, err := newAsyncWriter(logName, l.rotate.rotateSize)
if err != nil {
return
}
newLogger = newLogObject(w, "", logOpt)
return
}
var err error
logHandles := [...]**LogObject{&l.debugLogger, &l.infoLogger, &l.warnLogger, &l.errorLogger, &l.readLogger, &l.updateLogger, &l.criticalLogger, &l.qosLogger}
logNames := [...]string{DebugLogFileName, InfoLogFileName, WarnLogFileName, ErrLogFileName, ReadLogFileName, UpdateLogFileName, CriticalLogFileName, QoSLogFileName}
for i := range logHandles {
if *logHandles[i], err = newLog(logNames[i]); err != nil {
return err
}
}
l.level = level
return nil
}
// SetPrefix sets the log prefix.
func (l *Log) SetPrefix(s, level string) string {
_, file, line, ok := runtime.Caller(2)
if !ok {
line = 0
}
short := file
for i := len(file) - 1; i > 0; i-- {
if file[i] == '/' {
short = file[i+1:]
break
}
}
file = short
return level + " " + file + ":" + strconv.Itoa(line) + ": " + s
}
// Flush flushes the log.
func (l *Log) Flush() {
loggers := []*LogObject{
l.debugLogger,
l.infoLogger,
l.warnLogger,
l.errorLogger,
l.readLogger,
l.updateLogger,
l.criticalLogger,
}
for _, logger := range loggers {
if logger != nil {
logger.Flush()
}
}
}
const (
SetLogLevelPath = "/loglevel/set"
)
func SetLogLevel(w http.ResponseWriter, r *http.Request) {
var err error
if err = r.ParseForm(); err != nil {
buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
levelStr := r.FormValue("level")
var level Level
switch strings.ToLower(levelStr) {
case "debug":
level = DebugLevel
case "info", "read", "write":
level = InfoLevel
case "warn":
level = WarnLevel
case "error":
level = ErrorLevel
case "critical":
level = CriticalLevel
case "fatal":
level = FatalLevel
default:
err = fmt.Errorf("level only can be set :debug,info,warn,error,critical,read,write,fatal")
buildFailureResp(w, http.StatusBadRequest, err.Error())
return
}
gLog.level = Level(level)
setBlobLogLevel(level)
buildSuccessResp(w, "set log level success")
}
func buildSuccessResp(w http.ResponseWriter, data interface{}) {
buildJSONResp(w, http.StatusOK, data, "")
}
func buildFailureResp(w http.ResponseWriter, code int, msg string) {
buildJSONResp(w, code, nil, msg)
}
// buildJSONResp creates the JSON response for the API request.
func buildJSONResp(w http.ResponseWriter, code int, data interface{}, msg string) {
var (
jsonBody []byte
err error
)
// headers must be set before WriteHeader, otherwise they are silently ignored
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(code)
body := struct {
Code int `json:"code"`
Data interface{} `json:"data"`
Msg string `json:"msg"`
}{
Code: code,
Data: data,
Msg: msg,
}
if jsonBody, err = json.Marshal(body); err != nil {
return
}
w.Write(jsonBody)
}
// LogWarn indicates the warnings.
func LogWarn(v ...interface{}) {
if gLog == nil {
return
}
if WarnLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintln(v...)
s = gLog.SetPrefix(s, levelPrefixes[2])
gLog.warnLogger.Output(2, s)
}
// LogWarnf indicates the warnings with specific format.
func LogWarnf(format string, v ...interface{}) {
if gLog == nil {
return
}
if WarnLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintf(format, v...)
s = gLog.SetPrefix(s, levelPrefixes[2])
gLog.warnLogger.Output(2, s)
}
// LogInfo logs at the info level.
func LogInfo(v ...interface{}) {
if gLog == nil {
return
}
if InfoLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintln(v...)
s = gLog.SetPrefix(s, levelPrefixes[1])
gLog.infoLogger.Output(2, s)
}
// LogInfof logs at the info level with the specified format.
func LogInfof(format string, v ...interface{}) {
if gLog == nil {
return
}
if InfoLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintf(format, v...)
s = gLog.SetPrefix(s, levelPrefixes[1])
gLog.infoLogger.Output(2, s)
}
func EnableInfo() bool {
if gLog == nil {
return false
}
return InfoLevel&gLog.level == gLog.level
}
// LogError logs the errors.
func LogError(v ...interface{}) {
if gLog == nil {
return
}
if ErrorLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintln(v...)
s = gLog.SetPrefix(s, levelPrefixes[3])
gLog.errorLogger.Output(2, s)
}
// LogErrorf logs the errors with the specified format.
func LogErrorf(format string, v ...interface{}) {
if gLog == nil {
return
}
if ErrorLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintf(format, v...)
s = gLog.SetPrefix(s, levelPrefixes[3])
gLog.errorLogger.Print(s)
}
// LogDebug logs the debug information.
func LogDebug(v ...interface{}) {
if gLog == nil {
return
}
if DebugLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintln(v...)
s = gLog.SetPrefix(s, levelPrefixes[0])
gLog.debugLogger.Print(s)
}
// LogDebugf logs the debug information with specified format.
func LogDebugf(format string, v ...interface{}) {
if gLog == nil {
return
}
if DebugLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintf(format, v...)
s = gLog.SetPrefix(s, levelPrefixes[0])
gLog.debugLogger.Output(2, s)
}
func EnableDebug() bool {
if gLog == nil {
return false
}
return DebugLevel&gLog.level == gLog.level
}
// LogFatal logs the fatal errors.
func LogFatal(v ...interface{}) {
if gLog == nil {
return
}
s := fmt.Sprintln(v...)
s = gLog.SetPrefix(s, levelPrefixes[4])
gLog.errorLogger.Output(2, s)
gLog.Flush()
os.Exit(1)
}
// LogFatalf logs the fatal errors with specified format.
func LogFatalf(format string, v ...interface{}) {
if gLog == nil {
return
}
s := fmt.Sprintf(format, v...)
s = gLog.SetPrefix(s, levelPrefixes[4])
gLog.errorLogger.Output(2, s)
gLog.Flush()
os.Exit(1)
}
// LogCritical logs the critical errors.
func LogCritical(v ...interface{}) {
if gLog == nil {
return
}
s := fmt.Sprintln(v...)
s = gLog.SetPrefix(s, levelPrefixes[4])
gLog.criticalLogger.Output(2, s)
gLog.outputStderr(2, s)
}
// LogCriticalf logs the critical errors with the specified format.
func LogCriticalf(format string, v ...interface{}) {
if gLog == nil {
return
}
s := fmt.Sprintf(format, v...)
s = gLog.SetPrefix(s, levelPrefixes[4])
gLog.criticalLogger.Output(2, s)
gLog.outputStderr(2, s)
}
// LogRead logs the read operations.
func LogRead(v ...interface{}) {
if gLog == nil {
return
}
if ReadLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintln(v...)
s = gLog.SetPrefix(s, levelPrefixes[5])
gLog.readLogger.Output(2, s)
}
// LogReadf logs the read operations with the specified format. TODO: not used?
func LogReadf(format string, v ...interface{}) {
if gLog == nil {
return
}
if ReadLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintf(format, v...)
s = gLog.SetPrefix(s, levelPrefixes[5])
gLog.readLogger.Output(2, s)
}
// QosWrite logs the QoS write information.
func QosWrite(v ...interface{}) {
if gLog == nil {
return
}
if UpdateLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintln(v...)
s = gLog.SetPrefix(s, levelPrefixes[0])
gLog.qosLogger.Output(2, s)
}
// QosWriteDebugf logs the QoS debug information with the specified format. TODO: not used?
func QosWriteDebugf(format string, v ...interface{}) {
if gLog == nil {
return
}
if DebugLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintf(format, v...)
s = gLog.SetPrefix(s, levelPrefixes[0])
gLog.qosLogger.Output(2, s)
}
// LogWrite logs the write (update) operations.
func LogWrite(v ...interface{}) {
if gLog == nil {
return
}
if UpdateLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintln(v...)
s = gLog.SetPrefix(s, levelPrefixes[6])
gLog.updateLogger.Output(2, s)
}
// LogWritef logs the write (update) operations with the specified format. TODO: not used?
func LogWritef(format string, v ...interface{}) {
if gLog == nil {
return
}
if UpdateLevel&gLog.level != gLog.level {
return
}
s := fmt.Sprintf(format, v...)
s = gLog.SetPrefix(s, levelPrefixes[6])
gLog.updateLogger.Output(2, s)
}
// LogFlush flushes the log.
func LogFlush() {
if gLog != nil {
gLog.Flush()
}
}
func LogDisableStderrOutput() {
if gLog != nil {
gLog.DisableStderrOutput()
}
}
func (l *Log) checkLogRotation(logDir, module string) {
var needDelFiles RotatedFile
for {
needDelFiles = needDelFiles[:0]
// check disk space
fs := syscall.Statfs_t{}
if err := syscall.Statfs(logDir, &fs); err != nil {
LogErrorf("check disk space: %s", err.Error())
time.Sleep(DefaultRotateInterval)
continue
}
diskSpaceLeft := int64(fs.Bavail * uint64(fs.Bsize))
diskSpaceLeft -= l.rotate.headRoom * 1024 * 1024
if diskSpaceLeft <= 0 {
LogDebugf("logLeftSpaceLimit has been reached, need to clear %v Mb of Space", (-diskSpaceLeft)/1024/1024)
}
err := l.removeLogFile(logDir, diskSpaceLeft, module)
if err != nil {
time.Sleep(DefaultRotateInterval)
continue
}
// check if it is time to rotate
now := time.Now()
if now.Day() == l.lastRolledTime.Day() {
time.Sleep(DefaultRotateInterval)
continue
}
// rotate log files
l.debugLogger.SetRotation()
l.infoLogger.SetRotation()
l.warnLogger.SetRotation()
l.errorLogger.SetRotation()
l.readLogger.SetRotation()
l.updateLogger.SetRotation()
l.criticalLogger.SetRotation()
l.lastRolledTime = now
}
}
func DeleteFileFilter(info os.FileInfo, diskSpaceLeft int64, module string) bool {
if diskSpaceLeft <= 0 {
return info.Mode().IsRegular() && strings.HasSuffix(info.Name(), RotatedExtension) && strings.HasPrefix(info.Name(), module)
}
return time.Since(info.ModTime()) > MaxReservedDays && strings.HasSuffix(info.Name(), RotatedExtension) && strings.HasPrefix(info.Name(), module)
}
func (l *Log) removeLogFile(logDir string, diskSpaceLeft int64, module string) (err error) {
// collect the log files to be deleted
fInfos, err := ioutil.ReadDir(logDir)
if err != nil {
LogErrorf("error read log directory files: %s", err.Error())
return
}
var needDelFiles RotatedFile
for _, info := range fInfos {
if DeleteFileFilter(info, diskSpaceLeft, module) {
LogDebugf("%v will be put into needDelFiles", info.Name())
needDelFiles = append(needDelFiles, info)
}
}
sort.Sort(needDelFiles)
// delete old file
for _, info := range needDelFiles {
if err = os.Remove(path.Join(logDir, info.Name())); err != nil {
LogErrorf("failed delete log file %s", info.Name())
continue
}
diskSpaceLeft += info.Size()
if diskSpaceLeft > 0 && time.Since(info.ModTime()) < MaxReservedDays {
break
}
}
err = nil
return
}
// Copyright 2020 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package log
import (
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"strconv"
"strings"
)
const (
InvalidLogLevel = "Invalid log level, only supports [error, warn, debug, info, read, update, critical]"
OpenLogFileFailed = "Failed to open log file"
GetLogNumFailed = "Failed to get param num"
TailLogFileFailed = "Failed to tail log file"
InvaildLogNum = ", invalid num param, use default num"
TooBigNum = ", param num is too big, use default max num"
LossNum = ", can't find num param, use default num"
GetLogPath = "/log/get"
buffSize = int64(4096)
maxLogLine = 10000
defaultLogLine = 100
)
// HTTPReply uniform response structure
type HTTPReply struct {
Code int32 `json:"code"`
Msg string `json:"msg"`
Data interface{} `json:"data"`
}
func GetLog(w http.ResponseWriter, r *http.Request) {
query := r.URL.Query()
levelStr := query.Get("level")
var fileName string
switch strings.ToLower(levelStr) {
case "error":
fileName = gLog.errorLogger.object.fileName
case "warn":
fileName = gLog.warnLogger.object.fileName
case "debug":
fileName = gLog.debugLogger.object.fileName
case "info":
fileName = gLog.infoLogger.object.fileName
case "read":
fileName = gLog.readLogger.object.fileName
case "update":
fileName = gLog.updateLogger.object.fileName
case "critical":
fileName = gLog.criticalLogger.object.fileName
default:
buildFailureResp(w, http.StatusBadRequest, InvalidLogLevel)
return
}
file, err := os.Open(fileName)
if err != nil {
buildFailureResp(w, http.StatusBadRequest, fmt.Sprintf("%s, err is [%v]", OpenLogFileFailed, err))
return
}
defer file.Close()
var msg string
var num int
numStr := query.Get("num")
if numStr == "" {
num = defaultLogLine
msg = fmt.Sprintf("%s(%d)", LossNum, defaultLogLine)
} else {
num, err = strconv.Atoi(numStr)
if err != nil {
buildFailureResp(w, http.StatusBadRequest, fmt.Sprintf("%s, err is [%v]", GetLogNumFailed, err))
return
}
}
if num <= 0 {
num = defaultLogLine
msg = fmt.Sprintf("%s(%d)", InvaildLogNum, defaultLogLine)
} else if num > maxLogLine {
num = maxLogLine
msg = fmt.Sprintf("%s(%d)", TooBigNum, maxLogLine)
}
data, err := tailn(num, file)
if err != nil {
buildFailureResp(w, http.StatusBadRequest, fmt.Sprintf("%s, err is [%v]", TailLogFileFailed, err))
return
}
sendOKReply(w, r, msg, data)
}
func tailn(line int, file *os.File) (data []string, err error) {
fileLen, err := file.Seek(0, io.SeekEnd)
if err != nil {
return
}
var dataLen int
var currNum int
var lastStr string
data = make([]string, line)
for {
currSize := buffSize
if currSize > fileLen {
currSize = fileLen
}
_, err = file.Seek(-currSize, io.SeekCurrent)
if err != nil {
return
}
buff := make([]byte, currSize)
dataLen, err = file.Read(buff)
if err != nil {
return
}
last := dataLen
for i := dataLen - 1; i >= 0; i-- {
if buff[i] == '\n' {
if i == dataLen-1 {
if lastStr != "" {
data[line-currNum] = lastStr
lastStr = ""
currNum++
if currNum >= line {
return
}
}
last = i
continue
}
currNum++
data[line-currNum] = string(buff[i+1:last]) + lastStr
lastStr = ""
if currNum >= line {
return
}
last = i
}
}
lastStr = string(buff[:last])
fileLen, err = file.Seek(-currSize, io.SeekCurrent)
if err != nil {
return
}
if fileLen <= 0 {
break
}
}
// the first line of the file has no preceding '\n'; keep it if more lines are still wanted
if lastStr != "" && currNum < line {
currNum++
data[line-currNum] = lastStr
}
if currNum < line {
data = data[line-currNum:]
}
return
}
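// exampleTailn is an illustrative sketch, not part of the original source: it
// shows how tailn returns up to the last n lines of an open log file. The
// file path below is an assumption for demonstration only.
func exampleTailn() {
file, err := os.Open("/tmp/cubefs/log/example/error.log")
if err != nil {
return
}
defer file.Close()
if lines, err := tailn(20, file); err == nil {
for _, line := range lines {
fmt.Println(line)
}
}
}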
func sendOKReply(w http.ResponseWriter, r *http.Request, msg string, data interface{}) {
reply := &HTTPReply{
Code: http.StatusOK,
Msg: "Success" + msg,
Data: data,
}
httpReply, err := json.Marshal(reply)
if err != nil {
buildFailureResp(w, http.StatusBadRequest, fmt.Sprintf("%s, err is [%v]", "", err))
return
}
send(w, r, httpReply)
}
func send(w http.ResponseWriter, r *http.Request, reply []byte) {
w.Header().Set("content-type", "application/json")
w.Header().Set("Content-Length", strconv.Itoa(len(reply)))
w.Write(reply)
}
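// exampleServeGetLog is an illustrative sketch, not part of the original
// source: it shows how the GetLog handler could be registered under
// GetLogPath. The listen address and the curl query are assumptions for
// demonstration only.
func exampleServeGetLog() {
http.HandleFunc(GetLogPath, GetLog)
// e.g. curl "http://127.0.0.1:17010/log/get?level=error&num=200"
_ = http.ListenAndServe("127.0.0.1:17010", nil)
}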
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package log
const (
// DefaultRotateSize specifies the size at which the output log is rotated.
// Units: bytes
DefaultRotateSize = 1 * 1024 * 1024 * 1024
DefaultMinRotateSize = 200 * 1024 * 1024
// DefaultHeadRoom The tolerance for the log space limit (in megabytes)
DefaultHeadRoom = 50 * 1024
// DefaultHeadRatio The disk reserve space ratio
DefaultHeadRatio = 0.2
DefaultLogLeftSpaceLimit = 5 * 1024
)
// LogRotate rotates a log by size or time.
type LogRotate struct {
rotateSize int64 // the size of the rotated log
headRoom int64 // capacity reserved for writing the next log on the disk
}
// NewLogRotate returns a new LogRotate instance.
func NewLogRotate() *LogRotate {
return &LogRotate{}
}
// SetRotateSizeMb sets the rotate size in terms of MB.
func (r *LogRotate) SetRotateSizeMb(size int64) {
r.rotateSize = size
}
// SetHeadRoomMb sets the headroom in terms of MB.
func (r *LogRotate) SetHeadRoomMb(size int64) {
r.headRoom = size
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package util
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"strings"
"sync"
"time"
"github.com/cubefs/cubefs/util/log"
)
const (
requestTimeout = 30 * time.Second
)
var ErrNoValidMaster = errors.New("no valid master")
// MasterHelper defines the helper struct to manage the master.
type MasterHelper interface {
AddNode(address string)
Nodes() []string
Leader() string
Request(method, path string, param, header map[string]string, body []byte) (data []byte, err error)
}
type masterHelper struct {
sync.RWMutex
masters []string
leaderAddr string
}
// AddNode adds the given address to the list of master addresses.
func (helper *masterHelper) AddNode(address string) {
helper.Lock()
helper.updateMaster(address)
helper.Unlock()
}
// Leader returns the current leader address.
func (helper *masterHelper) Leader() (addr string) {
helper.RLock()
addr = helper.leaderAddr
helper.RUnlock()
return
}
// Change the leader address.
func (helper *masterHelper) setLeader(addr string) {
helper.Lock()
helper.leaderAddr = addr
helper.Unlock()
}
// Request sends out the request through the helper.
func (helper *masterHelper) Request(method, path string, param, header map[string]string, reqData []byte) (respData []byte, err error) {
respData, err = helper.request(method, path, param, header, reqData)
return
}
func (helper *masterHelper) request(method, path string, param, header map[string]string, reqData []byte) (repsData []byte, err error) {
leaderAddr, nodes := helper.prepareRequest()
host := leaderAddr
for i := -1; i < len(nodes); i++ {
if i == -1 {
if host == "" {
continue
}
} else {
host = nodes[i]
}
var resp *http.Response
resp, err = helper.httpRequest(method, fmt.Sprintf("http://%s%s", host,
path), param, header, reqData)
if err != nil {
log.LogErrorf("[masterHelper] %s", err)
continue
}
stateCode := resp.StatusCode
repsData, err = io.ReadAll(resp.Body)
resp.Body.Close()
if err != nil {
log.LogErrorf("[masterHelper] %s", err)
continue
}
switch stateCode {
case http.StatusForbidden:
curMasterAddr := strings.TrimSpace(string(repsData))
curMasterAddr = strings.Replace(curMasterAddr, "\n", "", -1)
if len(curMasterAddr) == 0 {
log.LogErrorf("[masterHelper] request[%s] response statudCode"+
"[403], respBody is empty", host)
err = ErrNoValidMaster
return
}
repsData, err = helper.request(method, path, param, header, reqData)
return
case http.StatusOK:
if leaderAddr != host {
helper.setLeader(host)
}
body := &struct {
Code int32 `json:"code"`
Msg string `json:"msg"`
Data json.RawMessage `json:"data"`
}{}
if err := json.Unmarshal(repsData, body); err != nil {
return nil, fmt.Errorf("unmarshal response body err:%v", err)
}
// 0 represents proto.ErrCodeSuccess
if body.Code != 0 {
return nil, fmt.Errorf("request error, code[%d], msg[%s]", body.Code, body.Msg)
}
return []byte(body.Data), nil
default:
log.LogErrorf("[masterHelper] master[%v] uri[%v] statusCode[%v] respBody[%v].",
resp.Request.URL.String(), host, stateCode, string(repsData))
continue
}
}
err = ErrNoValidMaster
return
}
// Nodes returns all master addresses.
func (helper *masterHelper) Nodes() (nodes []string) {
helper.RLock()
nodes = helper.masters
helper.RUnlock()
return
}
// prepareRequest returns the leader address and all master addresses.
func (helper *masterHelper) prepareRequest() (addr string, nodes []string) {
helper.RLock()
addr = helper.leaderAddr
nodes = helper.masters
helper.RUnlock()
return
}
func (helper *masterHelper) httpRequest(method, url string, param, header map[string]string, reqData []byte) (resp *http.Response, err error) {
client := &http.Client{}
reader := bytes.NewReader(reqData)
client.Timeout = requestTimeout
var req *http.Request
fullUrl := helper.mergeRequestUrl(url, param)
log.LogDebugf("action[httpRequest] method[%v] url[%v] reqBodyLen[%v].", method, fullUrl, len(reqData))
if req, err = http.NewRequest(method, fullUrl, reader); err != nil {
return
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Connection", "close")
for k, v := range header {
req.Header.Set(k, v)
}
resp, err = client.Do(req)
return
}
func (helper *masterHelper) updateMaster(address string) {
contains := false
for _, master := range helper.masters {
if master == address {
contains = true
break
}
}
if !contains {
helper.masters = append(helper.masters, address)
}
helper.leaderAddr = address
}
func (helper *masterHelper) mergeRequestUrl(url string, params map[string]string) string {
if len(params) > 0 {
buff := bytes.NewBuffer([]byte(url))
isFirstParam := true
for k, v := range params {
if isFirstParam {
buff.WriteString("?")
isFirstParam = false
} else {
buff.WriteString("&")
}
buff.WriteString(k)
buff.WriteString("=")
buff.WriteString(v)
}
return buff.String()
}
return url
}
// NewMasterHelper returns a new MasterHelper instance.
func NewMasterHelper() MasterHelper {
return &masterHelper{}
}
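// exampleMasterRequest is an illustrative sketch, not part of the original
// source: it shows the intended MasterHelper flow of registering master
// addresses and issuing a request that follows leader redirection. The
// addresses and URI below are assumptions for demonstration only.
func exampleMasterRequest() {
helper := NewMasterHelper()
helper.AddNode("192.168.0.11:17010")
helper.AddNode("192.168.0.12:17010")
data, err := helper.Request(http.MethodGet, "/admin/getCluster", nil, nil, nil)
if err != nil {
log.LogErrorf("master request failed: %v", err)
return
}
log.LogInfof("master response: %s", string(data))
}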
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package util
import (
"bufio"
"fmt"
"os"
"strconv"
"strings"
)
const (
MEMINFO = "/proc/meminfo"
PRO_MEM = "/proc/%d/status"
)
// GetMemInfo returns the memory information.
func GetMemInfo() (total, used uint64, err error) {
fp, err := os.Open(MEMINFO)
if err != nil {
return
}
// TODO Unhandled errors
defer fp.Close()
var (
val uint64
free uint64
buffer uint64
cached uint64
)
scan := bufio.NewScanner(fp)
for scan.Scan() {
line := scan.Text()
fields := strings.Split(line, ":")
if len(fields) != 2 {
continue
}
key := fields[0]
value := strings.TrimSpace(fields[1])
value = strings.Replace(value, " kB", "", -1)
val, err = strconv.ParseUint(value, 10, 64)
if err != nil {
return
}
switch key {
case "MemTotal":
total = val * KB
case "MemFree":
free = val * KB
case "Buffers":
buffer = val * KB
case "Cached":
cached = val * KB
default:
// do nothing
}
}
used = total - free - buffer - cached
return
}
func GetProcessMemory(pid int) (used uint64, err error) {
proFileName := fmt.Sprintf(PRO_MEM, pid)
fp, err := os.Open(proFileName)
if err != nil {
return
}
defer fp.Close()
scan := bufio.NewScanner(fp)
for scan.Scan() {
line := scan.Text()
fields := strings.Split(line, ":")
key := fields[0]
if key != "VmRSS" {
continue
}
value := strings.TrimSpace(fields[1])
value = strings.Replace(value, " kB", "", -1)
used, err = strconv.ParseUint(value, 10, 64)
if err != nil {
return
}
used = used * KB
break
}
return
}
// Copyright 2020 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package util
import (
"fmt"
"strconv"
"strings"
"github.com/google/uuid"
)
const (
multipartIDMetaLength = 25
multipartIDFlagLength = 2
multipartIDDelimiter = "x"
)
type MultipartID string
func (id MultipartID) String() string {
return string(id)
}
func (id MultipartID) PartitionID() (pID uint64, found bool) {
if len(id) < multipartIDMetaLength {
return
}
var (
mpStart int
mpEnd int
flag string
length int64
appendInfo []rune
mpIdString string
delimiterIndex int
err error
)
delimiterIndex = len(id) - multipartIDMetaLength
appendInfo = []rune(id)[delimiterIndex:]
if string(appendInfo[0]) != multipartIDDelimiter {
return
}
flag = string(appendInfo[1 : multipartIDFlagLength+1])
length, err = strconv.ParseInt(flag, 10, 32)
if err != nil {
return 0, false
}
mpStart = 1 + multipartIDFlagLength
mpEnd = mpStart + int(length)
mpIdString = string(appendInfo[mpStart:mpEnd])
pID, err = strconv.ParseUint(mpIdString, 10, 64)
found = err == nil
return
}
func MultipartIDFromString(src string) MultipartID {
return MultipartID(src)
}
func CreateMultipartID(mpId uint64) MultipartID {
var (
mpIdLength string
multipartId string
)
// Append the special char 'x' and the meta partition id after the generated multipart id.
// If the appended string is shorter than 25 characters, pad it with a random string.
tempLength := len(strconv.FormatUint(mpId, 10))
// The length flag is fixed at two digits; if the digit count of the meta
// partition id needs fewer characters, pad the flag with leading '0'.
if len(strconv.Itoa(tempLength)) < multipartIDFlagLength {
for i := 0; i < multipartIDFlagLength-len(strconv.Itoa(tempLength)); i++ {
mpIdLength += "0"
}
mpIdLength += strconv.Itoa(tempLength)
}
appendMultipart := fmt.Sprintf("%s%d", mpIdLength, mpId)
nextId := strings.ReplaceAll(uuid.New().String(), "-", "")
if len(appendMultipart) < multipartIDMetaLength-1 {
l := multipartIDMetaLength - 1 - len(appendMultipart)
t := strings.ReplaceAll(uuid.New().String(), "-", "")
r := string([]rune(t)[:l])
multipartId = fmt.Sprintf("%s%s%s%s", nextId, multipartIDDelimiter, appendMultipart, r)
} else {
multipartId = fmt.Sprintf("%s%s%s", nextId, multipartIDDelimiter, appendMultipart)
}
return MultipartID(multipartId)
}
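// exampleMultipartID is an illustrative sketch, not part of the original
// source: CreateMultipartID embeds the meta partition id inside the
// 25-character suffix, and PartitionID recovers it from the string form.
func exampleMultipartID() {
id := CreateMultipartID(42)
if pid, found := id.PartitionID(); found {
fmt.Printf("multipart id %s belongs to meta partition %d\n", id, pid) // pid == 42
}
}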
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package util
import "sync"
type (
Null struct{}
Set struct {
sync.RWMutex
m map[string]Null
}
)
func NewSet() *Set {
return &Set{
m: map[string]Null{},
}
}
func (s *Set) Add(val string) {
s.Lock()
defer s.Unlock()
s.m[val] = Null{}
}
func (s *Set) Remove(val string) {
s.Lock()
defer s.Unlock()
delete(s.m, val)
}
func (s *Set) Has(key string) bool {
s.RLock()
defer s.RUnlock()
_, ok := s.m[key]
return ok
}
func (s *Set) Len() int {
s.RLock()
defer s.RUnlock()
return len(s.m)
}
func (s *Set) Clear() {
s.Lock()
defer s.Unlock()
s.m = make(map[string]Null)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package util
import (
"fmt"
"io"
"net"
"strconv"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"unsafe"
"github.com/cubefs/cubefs/util/errors"
"github.com/xtaci/smux"
)
const (
DefaultSmuxPortShift = 500
)
const (
defaultCreateInterval = int64(time.Microsecond * 200)
)
var ErrTooMuchSmuxStreams = errors.New("too many smux streams")
// ShiftAddrPort changes the addr(ip:port) to afterShift(ip:(port+shift)).
func ShiftAddrPort(addr string, shift int) (afterShift string) {
pars := strings.Split(addr, ":")
if len(pars) != 2 {
return
}
ip, port := pars[0], pars[1]
portNum, err := strconv.Atoi(port)
if err != nil {
return
}
afterShift = fmt.Sprintf("%s:%d", ip, portNum+shift)
return
}
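// exampleShiftAddrPort is an illustrative sketch, not part of the original
// source: the smux address is derived from the TCP address by shifting the
// port by DefaultSmuxPortShift. The address below is an assumption for
// demonstration only.
func exampleShiftAddrPort() {
smuxAddr := ShiftAddrPort("192.168.0.21:17030", DefaultSmuxPortShift)
fmt.Println(smuxAddr) // "192.168.0.21:17530"
}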
// FilterSmuxAcceptError filters out the smux accept errors that can be safely ignored.
func FilterSmuxAcceptError(err error) error {
if err == nil {
return nil
}
if err.Error() == io.EOF.Error() {
return nil
}
if operr, ok := err.(*net.OpError); ok {
if operr.Err == syscall.ECONNRESET {
return nil
}
}
return err
}
const (
streamPreAlloc = 1
connPreAlloc = 1
)
type SmuxConnPoolConfig struct {
*smux.Config
TotalStreams int
StreamsPerConn int
ConnsPerAddr int
PoolCapacity int
DialTimeout time.Duration
StreamIdleTimeout int64
}
func DefaultSmuxConnPoolConfig() *SmuxConnPoolConfig {
return &SmuxConnPoolConfig{
Config: DefaultSmuxConfig(),
TotalStreams: 1000000,
StreamsPerConn: 1,
ConnsPerAddr: 16,
PoolCapacity: 64,
DialTimeout: time.Second * 10,
StreamIdleTimeout: int64(time.Second * 60),
}
}
func VerifySmuxPoolConfig(cfg *SmuxConnPoolConfig) error {
if err := smux.VerifyConfig(cfg.Config); err != nil {
return err
}
if cfg.ConnsPerAddr <= 0 {
return errors.New("cfg.ConnsPerAddr must be larger than 0")
}
if cfg.PoolCapacity <= 0 {
return errors.New("cfg.PoolCapacity must be larger than 0")
}
if cfg.StreamsPerConn <= 0 {
return errors.New("cfg.StreamsPerConn must be larger than 0")
}
if cfg.StreamIdleTimeout < int64(10*time.Millisecond) {
return errors.New("cfg.StreamIdleTimeout too small, must be larger than 10ms")
}
if cfg.TotalStreams <= 0 {
return errors.New("cfg.TotalStreams must be larger than 0")
}
return nil
}
func DefaultSmuxConfig() *smux.Config {
return smux.DefaultConfig()
}
var gConfig = DefaultSmuxConnPoolConfig()
type SmuxConnPoolStat struct {
TotalStreams int `json:"totalStreams"`
TotalStreamsReported int `json:"totalStreamsInflight"`
Pools map[string]*SmuxPoolStat `json:"pools"`
TotalSessions int `json:"totalSessions"`
Bucket int `json:"bucket"`
}
// simpleTokenBucket is a simple token bucket that limits the total number of streams.
type simpleTokenBucket struct {
bucket int64
notify chan struct{}
blocked bool
}
func newSimpleTokenBucket(n int64, blocked bool) *simpleTokenBucket {
return &simpleTokenBucket{
bucket: n,
notify: make(chan struct{}, 1),
blocked: blocked,
}
}
func (b *simpleTokenBucket) consumeTokens(n int) bool {
if atomic.AddInt64(&b.bucket, int64(-n)) < 0 {
if b.blocked {
<-b.notify
} else {
atomic.AddInt64(&b.bucket, int64(n))
return false
}
}
return true
}
func (b *simpleTokenBucket) returnTokens(n int) {
if atomic.AddInt64(&b.bucket, int64(n)) > 0 {
if b.blocked {
select {
case b.notify <- struct{}{}:
default:
}
}
}
}
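// exampleTokenBucket is an illustrative sketch, not part of the original
// source, of the non-blocking bucket used for stream accounting: consumeTokens
// fails fast once the bucket is exhausted, and returnTokens gives the capacity
// back to later callers.
func exampleTokenBucket() {
bucket := newSimpleTokenBucket(2, false)
_ = bucket.consumeTokens(1) // true: one token remains
_ = bucket.consumeTokens(2) // false: a non-blocking bucket refuses to overdraw
bucket.returnTokens(1)      // the consumed token is handed back
}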
type SmuxConnectPool struct {
sync.RWMutex
streamBucket *simpleTokenBucket
cfg *SmuxConnPoolConfig
pools map[string]*SmuxPool
closeCh chan struct{}
closeOnce sync.Once
}
func NewSmuxConnectPool(cfg *SmuxConnPoolConfig) (cp *SmuxConnectPool) {
if cfg == nil {
cfg = gConfig
}
cp = &SmuxConnectPool{
streamBucket: newSimpleTokenBucket(int64(cfg.TotalStreams), false),
cfg: cfg,
pools: make(map[string]*SmuxPool),
closeCh: make(chan struct{}),
closeOnce: sync.Once{},
}
go cp.autoRelease()
return cp
}
func (cp *SmuxConnectPool) GetConnect(targetAddr string) (c *smux.Stream, err error) {
cp.RLock()
pool, ok := cp.pools[targetAddr]
cp.RUnlock()
if !ok {
cp.Lock()
pool, ok = cp.pools[targetAddr]
if !ok {
pool = NewSmuxPool(cp.cfg, targetAddr, cp.streamBucket)
cp.pools[targetAddr] = pool
}
cp.Unlock()
}
return pool.GetConnect()
}
func (cp *SmuxConnectPool) PutConnect(stream *smux.Stream, forceClose bool) {
if stream == nil {
return
}
select {
case <-cp.closeCh:
return
default:
}
addr := stream.RemoteAddr().String()
cp.RLock()
pool, ok := cp.pools[addr]
cp.RUnlock()
if !ok {
return
}
if forceClose {
pool.MarkClosed(stream)
return
}
pool.PutStreamObjectToPool(&streamObject{stream: stream, idle: time.Now().UnixNano()})
}
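// exampleSmuxPoolUsage is an illustrative sketch, not part of the original
// source, of the borrow/return cycle around GetConnect and PutConnect. The
// target address is an assumption for demonstration only.
func exampleSmuxPoolUsage() {
pool := NewSmuxConnectPool(DefaultSmuxConnPoolConfig())
defer pool.Close()
stream, err := pool.GetConnect("192.168.0.21:17530")
if err != nil {
fmt.Println("get smux stream failed:", err)
return
}
_, err = stream.Write([]byte("ping"))
// return the stream to the pool; force-close it when the request failed
pool.PutConnect(stream, err != nil)
}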
func (cp *SmuxConnectPool) autoRelease() {
timer := time.NewTimer(time.Duration(cp.cfg.StreamIdleTimeout))
for {
select {
case <-cp.closeCh:
timer.Stop()
return
case <-timer.C:
}
pools := make([]*SmuxPool, 0)
cp.RLock()
for _, pool := range cp.pools {
pools = append(pools, pool)
}
cp.RUnlock()
for _, pool := range pools {
pool.autoRelease()
}
timer.Reset(time.Duration(cp.cfg.StreamIdleTimeout))
}
}
func (cp *SmuxConnectPool) releaseAll() {
pools := make([]*SmuxPool, 0)
cp.RLock()
for _, pool := range cp.pools {
pools = append(pools, pool)
}
cp.RUnlock()
for _, pool := range pools {
pool.ReleaseAll()
}
}
func (cp *SmuxConnectPool) Close() {
cp.closeOnce.Do(func() {
close(cp.closeCh)
cp.releaseAll()
})
}
func (cp *SmuxConnectPool) GetStat() *SmuxConnPoolStat {
stat := &SmuxConnPoolStat{
TotalStreams: 0,
TotalStreamsReported: 0,
Pools: make(map[string]*SmuxPoolStat),
TotalSessions: 0,
}
cp.RLock()
for remote, pool := range cp.pools {
stat.Pools[remote] = pool.GetStat()
}
cp.RUnlock()
for _, poolStat := range stat.Pools {
stat.TotalSessions += poolStat.TotalSessions
stat.TotalStreams += poolStat.InflightStreams
stat.TotalStreamsReported += poolStat.InflightStreamsReported
}
stat.Bucket = int(atomic.LoadInt64(&cp.streamBucket.bucket))
return stat
}
type createSessCall struct {
idle int64
notify chan struct{}
sess *smux.Session
err error
}
type streamObject struct {
stream *smux.Stream
idle int64
}
type SmuxPool struct {
target string
sessionsLock sync.RWMutex
sessionsIter int64
sessions []*smux.Session
cfg *SmuxConnPoolConfig
objects chan *streamObject
inflightStreams int64
createSessCall *createSessCall
streamBucket *simpleTokenBucket
}
type SmuxPoolStat struct {
Addr string `json:"addr"`
InflightStreams int `json:"inflightStreams"`
InflightStreamsReported int `json:"inflightStreamReported"`
TotalSessions int `json:"totalSessions"`
StreamsPerSession map[string]int `json:"streamsPerSession"`
}
func NewSmuxPool(cfg *SmuxConnPoolConfig, target string, streamBucket *simpleTokenBucket) (p *SmuxPool) {
if cfg == nil {
cfg = gConfig
}
p = &SmuxPool{
target: target,
sessions: make([]*smux.Session, 0, cfg.ConnsPerAddr),
cfg: cfg,
streamBucket: streamBucket,
objects: make(chan *streamObject, cfg.PoolCapacity),
}
p.initSessions()
p.initStreams()
return p
}
func (p *SmuxPool) initSessions() {
p.sessionsLock.Lock()
defer p.sessionsLock.Unlock()
for i := 0; i < connPreAlloc; i++ {
conn, err := net.DialTimeout("tcp", p.target, p.cfg.DialTimeout)
if err != nil {
continue
}
sess, err := smux.Client(conn, p.cfg.Config)
if err != nil {
conn.Close()
continue
}
p.sessions = append(p.sessions, sess)
}
}
func (p *SmuxPool) initStreams() {
for i := 0; i < streamPreAlloc; i++ {
stream, err := p.NewStream()
if err == nil {
p.PutStreamObjectToPool(&streamObject{
stream: stream,
idle: time.Now().UnixNano(),
})
}
}
}
func (p *SmuxPool) callCreate() (createCall *createSessCall) {
createCall = p.loadCreateCall()
if createCall == nil {
goto tryCreateNewSess
}
select {
case <-createCall.notify:
if time.Now().UnixNano()-createCall.idle > defaultCreateInterval {
goto tryCreateNewSess
} else {
return
}
// default:
}
tryCreateNewSess:
prev := createCall
createCall = &createSessCall{
idle: time.Now().UnixNano(),
notify: make(chan struct{}),
}
if p.casCreateCall(prev, createCall) {
go p.handleCreateCall(createCall)
return createCall
} else {
return p.loadCreateCall()
}
}
func (p *SmuxPool) autoRelease() {
poolLen := len(p.objects)
getFromPool:
for i := 0; i < poolLen; i++ {
select {
case obj := <-p.objects:
if streamClosed(obj.stream) {
p.MarkClosed(obj.stream)
} else if time.Now().UnixNano()-obj.idle > p.cfg.StreamIdleTimeout {
obj.stream.Close()
p.MarkClosed(obj.stream)
} else {
p.PutStreamObjectToPool(obj)
}
default:
break getFromPool
}
}
p.sessionsLock.Lock()
defer p.sessionsLock.Unlock()
sessionsLen := len(p.sessions)
hole := 0
for i := 0; i+hole < sessionsLen; {
o := p.sessions[i]
if o.IsClosed() {
p.sessions[i] = nil
hole++
} else if o.NumStreams() == 0 {
o.Close()
p.sessions[i] = nil
hole++
} else {
i++
}
if hole > 0 && i+hole < sessionsLen {
p.sessions[i] = p.sessions[i+hole]
}
}
if hole > 0 {
p.sessions = p.sessions[:sessionsLen-hole]
}
}
func streamClosed(stream *smux.Stream) bool {
select {
case <-stream.GetDieCh():
return true
default:
return false
}
}
func (p *SmuxPool) canUse(sess *smux.Session) bool {
if sess == nil || sess.IsClosed() {
return false
}
streamNum := sess.NumStreams()
if streamNum > 0 {
if streamNum < p.cfg.StreamsPerConn {
return true
}
maxStreams := p.cfg.StreamsPerConn * p.cfg.ConnsPerAddr
inflight := p.inflightStreamNum()
if inflight >= maxStreams {
// oversold
return streamNum <= ((inflight / p.cfg.ConnsPerAddr) + 1)
} else {
return false
}
} else {
return true
}
}
func (p *SmuxPool) ReleaseAll() {
p.sessionsLock.Lock()
defer p.sessionsLock.Unlock()
sessionsLen := len(p.sessions)
for i := 0; i < sessionsLen; i++ {
o := p.sessions[i]
if o != nil {
o.Close()
p.sessions[i] = nil
}
}
p.sessions = p.sessions[:0]
}
func (p *SmuxPool) getAvailSess() (sess *smux.Session) {
// every time start from different pos
iter := atomic.AddInt64(&p.sessionsIter, 1) - 1
p.sessionsLock.RLock()
sessionsLen := len(p.sessions)
for i := 0; i < sessionsLen; i++ {
o := p.sessions[(int64(i)+iter)%int64(sessionsLen)]
if p.canUse(o) {
sess = o
break
}
}
p.sessionsLock.RUnlock()
return
}
func (p *SmuxPool) insertSession(sess *smux.Session) {
p.sessionsLock.Lock()
// replace
for i, o := range p.sessions {
if o == nil || o.IsClosed() {
p.sessions[i] = sess
p.sessionsLock.Unlock()
return
}
}
// or append
p.sessions = append(p.sessions, sess)
p.sessionsLock.Unlock()
}
func (p *SmuxPool) GetConnect() (*smux.Stream, error) {
poolLen := len(p.objects)
getFromPool:
for i := 0; i < poolLen; i++ {
select {
case obj := <-p.objects:
if obj != nil {
select {
case <-obj.stream.GetDieCh():
p.MarkClosed(obj.stream)
continue getFromPool
default:
return obj.stream, nil
}
}
default:
break getFromPool
}
}
return p.NewStream()
}
func (p *SmuxPool) NewStream() (stream *smux.Stream, err error) {
sess := p.getAvailSess()
if sess != nil {
stream, err = p.openStream(sess)
if err != nil {
goto createNewSession
} else {
return
}
}
createNewSession:
call := p.callCreate()
<-call.notify
if call.err != nil {
return nil, call.err
} else {
return p.openStream(call.sess)
}
}
func (p *SmuxPool) MarkClosed(s *smux.Stream) {
s.Close()
p.addInflightStream(-1)
p.streamBucket.returnTokens(1)
}
func (p *SmuxPool) addInflightStream(n int) int {
return int(atomic.AddInt64(&p.inflightStreams, int64(n)))
}
func (p *SmuxPool) inflightStreamNum() int {
return int(atomic.LoadInt64(&p.inflightStreams))
}
func (p *SmuxPool) loadCreateCall() *createSessCall {
return (*createSessCall)(atomic.LoadPointer((*unsafe.Pointer)(unsafe.Pointer(&p.createSessCall))))
}
func (p *SmuxPool) casCreateCall(prev *createSessCall, new *createSessCall) bool {
return atomic.CompareAndSwapPointer((*unsafe.Pointer)(unsafe.Pointer(&p.createSessCall)),
unsafe.Pointer(prev), unsafe.Pointer(new))
}
func (p *SmuxPool) handleCreateCall(call *createSessCall) {
var conn net.Conn
defer close(call.notify)
conn, call.err = net.DialTimeout("tcp", p.target, p.cfg.DialTimeout)
if call.err != nil {
return
}
c := conn.(*net.TCPConn)
c.SetKeepAlive(true)
c.SetNoDelay(true)
call.sess, call.err = smux.Client(conn, p.cfg.Config)
if call.err != nil {
c.Close()
return
}
p.insertSession(call.sess)
}
func (p *SmuxPool) openStream(sess *smux.Session) (stream *smux.Stream, err error) {
if !p.streamBucket.consumeTokens(1) {
return nil, ErrTooMuchSmuxStreams
}
stream, err = sess.OpenStream()
if err == nil {
p.addInflightStream(1)
} else {
p.streamBucket.returnTokens(1)
}
return
}
func (p *SmuxPool) PutStreamObjectToPool(obj *streamObject) {
if streamClosed(obj.stream) {
p.MarkClosed(obj.stream)
return
}
select {
case p.objects <- obj:
return
default:
p.MarkClosed(obj.stream)
}
}
func (p *SmuxPool) GetStat() *SmuxPoolStat {
stat := &SmuxPoolStat{
Addr: p.target,
InflightStreams: 0,
InflightStreamsReported: 0,
TotalSessions: 0,
StreamsPerSession: make(map[string]int, p.cfg.ConnsPerAddr),
}
p.sessionsLock.RLock()
stat.TotalSessions = len(p.sessions)
stat.InflightStreamsReported = p.inflightStreamNum()
for _, sess := range p.sessions {
streams := sess.NumStreams()
stat.InflightStreams += streams
stat.StreamsPerSession[sess.LocalAddr().String()] += streams
}
p.sessionsLock.RUnlock()
return stat
}
// Copyright 2022 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package stat
import (
"bufio"
"errors"
"fmt"
"io/ioutil"
"os"
"path"
"regexp"
"sort"
"strconv"
"strings"
"sync"
"syscall"
"time"
"github.com/cubefs/cubefs/util/log"
)
const (
Stat_Module = "mem_stat"
FileNameDateFormat = "20060102150405"
ShiftedExtension = ".old"
PRO_MEM = "/proc/%d/status"
F_OK = 0
MaxTimeoutLevel = 3
DefaultStatLogSize = 200 * 1024 * 1024 // 200M
DefaultHeadRoom = 50 * 1024 // 50G
MaxReservedDays = 7 * 24 * time.Hour
)
var DefaultTimeOutUs = [MaxTimeoutLevel]uint32{100000, 500000, 1000000}
var DefaultStatInterval = 60 * time.Second // 60 seconds
var re = regexp.MustCompile(`\([0-9]*\)`)
type ShiftedFile []os.FileInfo
func (f ShiftedFile) Less(i, j int) bool {
return f[i].ModTime().Before(f[j].ModTime())
}
func (f ShiftedFile) Len() int {
return len(f)
}
func (f ShiftedFile) Swap(i, j int) {
f[i], f[j] = f[j], f[i]
}
type typeInfo struct {
typeName string
allCount uint32
failCount uint32
maxTime time.Duration
minTime time.Duration
allTimeUs time.Duration
timeOut [MaxTimeoutLevel]uint32
}
type Statistic struct {
logDir string
logMaxSize int64
logBaseName string
pid int
lastClearTime time.Time
timeOutUs [MaxTimeoutLevel]uint32
typeInfoMap map[string]*typeInfo
closeStat bool
useMutex bool
sync.Mutex
}
var gSt *Statistic = nil
func NewStatistic(dir, logModule string, logMaxSize int64, timeOutUs [MaxTimeoutLevel]uint32, useMutex bool) (*Statistic, error) {
dir = path.Join(dir, logModule)
fi, err := os.Stat(dir)
if err != nil {
os.MkdirAll(dir, 0o755)
} else {
if !fi.IsDir() {
return nil, errors.New(dir + " is not a directory")
}
}
_ = os.Chmod(dir, 0o755)
logName := path.Join(dir, Stat_Module)
st := &Statistic{
logDir: dir,
logMaxSize: logMaxSize,
logBaseName: logName,
pid: os.Getpid(),
lastClearTime: time.Time{},
timeOutUs: timeOutUs,
typeInfoMap: make(map[string]*typeInfo),
closeStat: false,
useMutex: useMutex,
Mutex: sync.Mutex{},
}
gSt = st
go st.flushScheduler()
return st, nil
}
// TODO: how to close?
func (st *Statistic) flushScheduler() {
timer := time.NewTimer(DefaultStatInterval)
defer timer.Stop()
for {
<-timer.C
err := WriteStat()
if err != nil {
log.LogErrorf("WriteStat error: %v", err)
}
timer.Reset(DefaultStatInterval)
fs := syscall.Statfs_t{}
if err := syscall.Statfs(st.logDir, &fs); err != nil {
log.LogErrorf("Get fs stat failed, err: %v", err)
continue
}
diskSpaceLeft := int64(fs.Bavail * uint64(fs.Bsize))
diskSpaceLeft -= DefaultHeadRoom * 1024 * 1024
removeLogFile(diskSpaceLeft, Stat_Module)
}
}
func removeLogFile(diskSpaceLeft int64, module string) {
fInfos, err := ioutil.ReadDir(gSt.logDir)
if err != nil {
log.LogErrorf("ReadDir failed, logDir: %s, err: %v", gSt.logDir, err)
return
}
var needDelFiles ShiftedFile
for _, info := range fInfos {
if deleteFileFilter(info, diskSpaceLeft, module) {
needDelFiles = append(needDelFiles, info)
}
}
sort.Sort(needDelFiles)
for _, info := range needDelFiles {
if err = os.Remove(path.Join(gSt.logDir, info.Name())); err != nil {
log.LogErrorf("Remove log file failed, logFileName: %s, err: %v", info.Name(), err)
continue
}
diskSpaceLeft += info.Size()
if diskSpaceLeft > 0 && time.Since(info.ModTime()) < MaxReservedDays {
break
}
}
}
func deleteFileFilter(info os.FileInfo, diskSpaceLeft int64, module string) bool {
if diskSpaceLeft <= 0 {
return info.Mode().IsRegular() && strings.HasSuffix(info.Name(), ShiftedExtension) && strings.HasPrefix(info.Name(), module)
}
return time.Since(info.ModTime()) > MaxReservedDays && strings.HasSuffix(info.Name(), ShiftedExtension) && strings.HasPrefix(info.Name(), module)
}
func CloseStat() {
if gSt == nil {
return
}
gSt.closeStat = true
}
func BeginStat() (bgTime *time.Time) {
bg := time.Now()
return &bg
}
func EndStat(typeName string, err error, bgTime *time.Time, statCount uint32) error {
if gSt == nil {
return nil
}
if gSt.closeStat {
return nil
}
if gSt.useMutex {
gSt.Lock()
defer gSt.Unlock()
}
if err != nil {
newErrStr := string(re.ReplaceAll([]byte(err.Error()), []byte("(xxx)")))
baseLen := len(typeName) + 2
if len(newErrStr)+baseLen > 41 {
typeName = typeName + "[" + newErrStr[:41-baseLen] + "]"
} else {
typeName = typeName + "[" + newErrStr + "]"
}
}
return addStat(typeName, err, bgTime, statCount)
}
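// exampleEndStat is an illustrative sketch, not part of the original source,
// of the intended measurement pattern: record the begin time, run the
// operation, and report the result under a type name when it finishes. The
// type name "ExampleOp" is an assumption for demonstration only.
func exampleEndStat() (err error) {
bgTime := BeginStat()
defer func() {
EndStat("ExampleOp", err, bgTime, 1)
}()
// ... the operation being measured ...
return nil
}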
func WriteStat() error {
if gSt == nil {
return nil
}
if gSt.useMutex {
gSt.Lock()
defer gSt.Unlock()
}
logFileName := gSt.logBaseName + ".log"
statFile, err := os.OpenFile(logFileName, os.O_RDWR|os.O_APPEND|os.O_CREATE, 0o666)
if err != nil {
log.LogErrorf("OpenLogFile failed, logFileName: %s, err: %v\n", logFileName, err)
return fmt.Errorf("OpenLogFile failed, logFileName %s\n", logFileName)
}
defer statFile.Close()
statSpan := time.Since(gSt.lastClearTime) / 1e9
ioStream := bufio.NewWriter(statFile)
defer ioStream.Flush()
fmt.Fprintf(ioStream, "\n=============== Statistic in %ds, %s =====================\n",
statSpan, time.Now().Format("2006-01-02 15:04:05"))
if virt, res, err := GetProcessMemory(gSt.pid); err != nil {
log.LogErrorf("Get process memory failed, err: %v", err)
fmt.Fprintf(ioStream, "Get Mem Failed.\n")
} else {
fmt.Fprintf(ioStream, "Mem Allocated(kB): VIRT %-10d RES %-10d\n", virt, res)
}
fmt.Fprintf(ioStream, "%-42s|%10s|%8s|%8s|%8s|%8s|%8s|%8s|%8s|\n",
"", "TOTAL", "FAILED", "AVG(ms)", "MAX(ms)", "MIN(ms)",
">"+strconv.Itoa(int(gSt.timeOutUs[0])/1000)+"ms",
">"+strconv.Itoa(int(gSt.timeOutUs[1])/1000)+"ms",
">"+strconv.Itoa(int(gSt.timeOutUs[2])/1000)+"ms")
typeNames := make([]string, 0)
for typeName := range gSt.typeInfoMap {
typeNames = append(typeNames, typeName)
}
sort.Strings(typeNames)
for _, typeName := range typeNames {
typeInfo := gSt.typeInfoMap[typeName]
avgUs := int32(0)
if typeInfo.allCount > 0 {
avgUs = int32(typeInfo.allTimeUs / time.Duration(typeInfo.allCount))
}
fmt.Fprintf(ioStream, "%-42s|%10d|%8d|%8.2f|%8.2f|%8.2f|%8d|%8d|%8d|\n",
typeInfo.typeName, typeInfo.allCount, typeInfo.failCount,
float32(avgUs)/1000, float32(typeInfo.maxTime)/1000, float32(typeInfo.minTime)/1000,
typeInfo.timeOut[0], typeInfo.timeOut[1], typeInfo.timeOut[2])
}
fmt.Fprintf(ioStream, "-------------------------------------------------------------------"+
"--------------------------------------------------\n")
// clear stat
gSt.lastClearTime = time.Now()
gSt.typeInfoMap = make(map[string]*typeInfo)
shiftFiles()
return nil
}
func ClearStat() {
if gSt == nil {
return
}
if gSt.useMutex {
gSt.Lock()
defer gSt.Unlock()
}
gSt.lastClearTime = time.Now()
gSt.typeInfoMap = make(map[string]*typeInfo)
}
func addStat(typeName string, err error, bgTime *time.Time, statCount uint32) error {
if gSt == nil {
return nil
}
if len(typeName) == 0 {
return fmt.Errorf("AddStat fail, typeName %s\n", typeName)
}
if typeInfo, ok := gSt.typeInfoMap[typeName]; ok {
typeInfo.allCount += statCount
if err != nil {
typeInfo.failCount += statCount
}
addTime(typeInfo, bgTime)
return nil
}
typeInfo := &typeInfo{
typeName: typeName,
allCount: statCount,
failCount: 0,
maxTime: 0,
minTime: 0,
allTimeUs: 0,
timeOut: [3]uint32{},
}
if err != nil {
typeInfo.failCount = statCount
}
gSt.typeInfoMap[typeName] = typeInfo
addTime(typeInfo, bgTime)
return nil
}
func addTime(typeInfo *typeInfo, bgTime *time.Time) {
if bgTime == nil {
return
}
timeCostUs := time.Since(*bgTime) / 1e3
if timeCostUs == 0 {
return
}
if timeCostUs >= time.Duration(gSt.timeOutUs[0]) && timeCostUs < time.Duration(gSt.timeOutUs[1]) {
typeInfo.timeOut[0]++
} else if timeCostUs >= time.Duration(gSt.timeOutUs[1]) && timeCostUs < time.Duration(gSt.timeOutUs[2]) {
typeInfo.timeOut[1]++
} else if timeCostUs > time.Duration(gSt.timeOutUs[2]) {
typeInfo.timeOut[2]++
}
if timeCostUs > typeInfo.maxTime {
typeInfo.maxTime = timeCostUs
}
if typeInfo.minTime == 0 || timeCostUs < typeInfo.minTime {
typeInfo.minTime = timeCostUs
}
typeInfo.allTimeUs += timeCostUs
}
func shiftFiles() error {
logFileName := gSt.logBaseName + ".log"
fileInfo, err := os.Stat(logFileName)
if err != nil {
return err
}
if fileInfo.Size() < gSt.logMaxSize {
return nil
}
if syscall.Access(logFileName, F_OK) == nil {
logNewFileName := logFileName + "." + time.Now().Format(
FileNameDateFormat) + ShiftedExtension
if err = syscall.Rename(logFileName, logNewFileName); err != nil {
log.LogErrorf("RenameFile failed, logFileName: %s, logNewFileName: %s, err: %v\n",
logFileName, logNewFileName, err)
return fmt.Errorf("RenameFile failed, logFileName %s, logNewFileName %s\n",
logFileName, logNewFileName)
}
}
return nil
}
func StatBandWidth(typeName string, Size uint32) {
EndStat(typeName+"[FLOW_KB]", nil, nil, Size/1024)
}
func GetMememory() (Virt, Res uint64, err error) {
return GetProcessMemory(gSt.pid)
}
func GetProcessMemory(pid int) (Virt, Res uint64, err error) {
proFileName := fmt.Sprintf(PRO_MEM, pid)
fp, err := os.Open(proFileName)
if err != nil {
return
}
defer fp.Close()
scan := bufio.NewScanner(fp)
for scan.Scan() {
line := scan.Text()
fields := strings.Split(line, ":")
key := fields[0]
if key == "VmRSS" {
value := strings.TrimSpace(fields[1])
value = strings.Replace(value, " kB", "", -1)
Res, err = strconv.ParseUint(value, 10, 64)
if err != nil {
return
}
} else if key == "VmSize" {
value := strings.TrimSpace(fields[1])
value = strings.Replace(value, " kB", "", -1)
Virt, err = strconv.ParseUint(value, 10, 64)
if err != nil {
return
}
} else {
continue
}
}
return
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package util
import (
"crypto/rand"
"fmt"
"math/big"
"strconv"
"strings"
)
func SubString(sourceString string, begin, end int) string {
bytes := []byte(sourceString)
stringLength := len(bytes)
if begin < 0 {
begin = 0
}
if end > stringLength {
end = stringLength
}
return string(bytes[begin:end])
}
type RandomSeed byte
func (s RandomSeed) Runes() []rune {
sourceBuilder := strings.Builder{}
if s&Numeric > 0 {
sourceBuilder.WriteString("0123456789")
}
if s&LowerLetter > 0 {
sourceBuilder.WriteString("abcdefghijklmnopqrstuvwxyz")
}
if s&UpperLetter > 0 {
sourceBuilder.WriteString("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
}
return []rune(sourceBuilder.String())
}
const (
Numeric RandomSeed = 1 << iota
LowerLetter
UpperLetter
)
func RandomString(length int, seed RandomSeed) string {
runs := seed.Runes()
result := ""
for i := 0; i < length; i++ {
lenInt64 := int64(len(runs))
randNumber, _ := rand.Int(rand.Reader, big.NewInt(lenInt64))
result += string(runs[randNumber.Uint64()])
}
return result
}
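// exampleRandomString is an illustrative sketch, not part of the original
// source: RandomSeed values can be combined to control the character set of
// the generated string.
func exampleRandomString() {
token := RandomString(16, Numeric|LowerLetter|UpperLetter)
fmt.Println(len(token)) // 16
}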
// Any2String formats any value as a string.
func Any2String(value interface{}) string {
var val string
switch v := value.(type) {
case string:
val = v
case *string:
val = *v
case bool:
val = strconv.FormatBool(v)
case *bool:
val = strconv.FormatBool(*v)
case int:
val = strconv.FormatInt(int64(v), 10)
case int8:
val = strconv.FormatInt(int64(v), 10)
case int16:
val = strconv.FormatInt(int64(v), 10)
case int32:
val = strconv.FormatInt(int64(v), 10)
case int64:
val = strconv.FormatInt(int64(v), 10)
case *int:
val = strconv.FormatInt(int64(*v), 10)
case *int8:
val = strconv.FormatInt(int64(*v), 10)
case *int16:
val = strconv.FormatInt(int64(*v), 10)
case *int32:
val = strconv.FormatInt(int64(*v), 10)
case *int64:
val = strconv.FormatInt(int64(*v), 10)
case uint:
val = strconv.FormatUint(uint64(v), 10)
case uint8:
val = strconv.FormatUint(uint64(v), 10)
case uint16:
val = strconv.FormatUint(uint64(v), 10)
case uint32:
val = strconv.FormatUint(uint64(v), 10)
case uint64:
val = strconv.FormatUint(uint64(v), 10)
case *uint:
val = strconv.FormatUint(uint64(*v), 10)
case *uint8:
val = strconv.FormatUint(uint64(*v), 10)
case *uint16:
val = strconv.FormatUint(uint64(*v), 10)
case *uint32:
val = strconv.FormatUint(uint64(*v), 10)
case *uint64:
val = strconv.FormatUint(uint64(*v), 10)
case float32:
val = strconv.FormatFloat(float64(v), 'f', 6, 64)
case float64:
val = strconv.FormatFloat(float64(v), 'f', 6, 64)
case *float32:
val = strconv.FormatFloat(float64(*v), 'f', 6, 64)
case *float64:
val = strconv.FormatFloat(float64(*v), 'f', 6, 64)
case complex64:
val = strconv.FormatComplex(complex128(v), 'f', 6, 64)
case complex128:
val = strconv.FormatComplex(complex128(v), 'f', 6, 64)
case *complex64:
val = strconv.FormatComplex(complex128(*v), 'f', 6, 64)
case *complex128:
val = strconv.FormatComplex(complex128(*v), 'f', 6, 64)
default:
val = fmt.Sprintf("%v", value)
}
return val
}
// String2Any parses the string into the value pointed to by pvalue.
func String2Any(str string, pvalue interface{}) error {
var val interface{}
var err error
switch v := pvalue.(type) {
case *string:
val = str
case *bool:
val, err = strconv.ParseBool(str)
case *int:
val, err = strconv.ParseInt(str, 10, 0)
case *int8:
val, err = strconv.ParseInt(str, 10, 8)
case *int16:
val, err = strconv.ParseInt(str, 10, 16)
case *int32:
val, err = strconv.ParseInt(str, 10, 32)
case *int64:
val, err = strconv.ParseInt(str, 10, 64)
case *uint:
val, err = strconv.ParseUint(str, 10, 0)
case *uint8:
val, err = strconv.ParseUint(str, 10, 8)
case *uint16:
val, err = strconv.ParseUint(str, 10, 16)
case *uint32:
val, err = strconv.ParseUint(str, 10, 32)
case *uint64:
val, err = strconv.ParseUint(str, 10, 64)
case *float32:
val, err = strconv.ParseFloat(str, 32)
case *float64:
val, err = strconv.ParseFloat(str, 64)
case *complex64:
val, err = strconv.ParseComplex(str, 64)
case *complex128:
val, err = strconv.ParseComplex(str, 128)
default:
return fmt.Errorf("unknown type %v of %s %v", v, str, pvalue)
}
if err != nil {
return err
}
switch v := pvalue.(type) {
case *string:
*v = val.(string)
case *bool:
*v = val.(bool)
case *int:
*v = int(val.(int64))
case *int8:
*v = int8(val.(int64))
case *int16:
*v = int16(val.(int64))
case *int32:
*v = int32(val.(int64))
case *int64:
*v = int64(val.(int64))
case *uint:
*v = uint(val.(uint64))
case *uint8:
*v = uint8(val.(uint64))
case *uint16:
*v = uint16(val.(uint64))
case *uint32:
*v = uint32(val.(uint64))
case *uint64:
*v = uint64(val.(uint64))
case *float32:
*v = float32(val.(float64))
case *float64:
*v = float64(val.(float64))
case *complex64:
*v = complex64(val.(complex128))
case *complex128:
*v = complex128(val.(complex128))
}
return nil
}
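// exampleString2Any is an illustrative sketch, not part of the original
// source: Any2String and String2Any round-trip a numeric value through its
// string form.
func exampleString2Any() {
s := Any2String(uint32(512)) // "512"
var v uint32
if err := String2Any(s, &v); err == nil {
fmt.Println(v) // 512
}
}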
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package synclist
import (
"container/list"
"sync"
)
type SyncList struct {
list.List
mu sync.RWMutex
}
func New() *SyncList {
l := new(SyncList)
l.Init()
return l
}
func (l *SyncList) Init() *SyncList {
l.mu.Lock()
l.List.Init()
l.mu.Unlock()
return l
}
func (l *SyncList) Remove(e *list.Element) interface{} {
l.mu.Lock()
defer l.mu.Unlock()
return l.List.Remove(e)
}
func (l *SyncList) PushFront(v interface{}) *list.Element {
l.mu.Lock()
defer l.mu.Unlock()
return l.List.PushFront(v)
}
func (l *SyncList) Back() *list.Element {
l.mu.RLock()
defer l.mu.RUnlock()
return l.List.Back()
}
func (l *SyncList) PushBack(v interface{}) *list.Element {
l.mu.Lock()
defer l.mu.Unlock()
return l.List.PushBack(v)
}
func (l *SyncList) InsertBefore(v interface{}, mark *list.Element) *list.Element {
l.mu.Lock()
defer l.mu.Unlock()
return l.List.InsertBefore(v, mark)
}
func (l *SyncList) InsertAfter(v interface{}, mark *list.Element) *list.Element {
l.mu.Lock()
defer l.mu.Unlock()
return l.List.InsertAfter(v, mark)
}
func (l *SyncList) Len() int {
l.mu.RLock()
defer l.mu.RUnlock()
return l.List.Len()
}
func (l *SyncList) Front() *list.Element {
l.mu.RLock()
defer l.mu.RUnlock()
return l.List.Front()
}
func (l *SyncList) MoveToFront(e *list.Element) {
l.mu.Lock()
l.List.MoveToFront(e)
l.mu.Unlock()
}
func (l *SyncList) MoveToBack(e *list.Element) {
l.mu.Lock()
l.List.MoveToBack(e)
l.mu.Unlock()
}
func (l *SyncList) MoveBefore(e, mark *list.Element) {
l.mu.Lock()
l.List.MoveBefore(e, mark)
l.mu.Unlock()
}
func (l *SyncList) MoveAfter(e, mark *list.Element) {
l.mu.Lock()
l.List.MoveAfter(e, mark)
l.mu.Unlock()
}
func (l *SyncList) PushBackList(other *SyncList) {
l.mu.Lock()
l.List.PushBackList(&other.List)
l.mu.Unlock()
}
func (l *SyncList) PushFrontList(other *SyncList) {
l.mu.Lock()
l.List.PushFrontList(&other.List)
l.mu.Unlock()
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package timeutil
import (
"sync/atomic"
"time"
)
// GetCurrentTime returns the current time.
func GetCurrentTime() time.Time {
return now.time.Load().(time.Time)
}
// GetCurrentTimeUnix returns the current time unix.
func GetCurrentTimeUnix() int64 {
return atomic.LoadInt64(&now.timeUnix)
}
var now = newNowTime()
// nowTime defines the current time.
type nowTime struct {
time atomic.Value // store time.Time
timeUnix int64
}
// newNowTime returns a new nowTime.
func newNowTime() *nowTime {
n := time.Now()
t := &nowTime{timeUnix: n.Unix()}
t.time.Store(n)
go func() {
ticker := time.NewTicker(time.Second)
defer ticker.Stop()
for {
<-ticker.C
n := time.Now()
t.time.Store(n)
atomic.StoreInt64(&t.timeUnix, n.Unix())
}
}()
return t
}
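// exampleCurrentTime is an illustrative sketch, not part of the original
// source: hot paths read the second-granularity cached clock instead of
// calling time.Now for every operation.
func exampleCurrentTime() (sec int64, wall time.Time) {
sec = GetCurrentTimeUnix() // cheap atomic load, refreshed once per second
wall = GetCurrentTime()    // cached time.Time value
return
}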
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package ump
import (
"os"
"strconv"
"sync"
"time"
)
type TpObject struct {
StartTime time.Time
EndTime time.Time
UmpType interface{}
}
func NewTpObject() (o *TpObject) {
o = new(TpObject)
o.StartTime = time.Now()
return
}
const (
TpMethod = "TP"
HeartbeatMethod = "Heartbeat"
FunctionError = "FunctionError"
)
var (
HostName string
LogTimeForMat = "20060102150405000"
AlarmPool = &sync.Pool{New: func() interface{} {
return new(BusinessAlarm)
}}
TpObjectPool = &sync.Pool{New: func() interface{} {
return new(TpObject)
}}
SystemAlivePool = &sync.Pool{New: func() interface{} {
return new(SystemAlive)
}}
FunctionTpPool = &sync.Pool{New: func() interface{} {
return new(FunctionTp)
}}
enableUmp = true
)
func InitUmp(module, dataDir string) (err error) {
if _, err = os.Stat(dataDir); err != nil {
enableUmp = false
err = nil
return
}
if err = initLogName(module, dataDir); err != nil {
enableUmp = false
err = nil
return
}
backGroudWrite()
return nil
}
func BeforeTP(key string) (o *TpObject) {
if !enableUmp {
return
}
o = TpObjectPool.Get().(*TpObject)
o.StartTime = time.Now()
tp := FunctionTpPool.Get().(*FunctionTp)
tp.HostName = HostName
tp.Time = time.Now().Format(LogTimeForMat)
tp.Key = key
tp.ProcessState = "0"
o.UmpType = tp
return
}
func AfterTP(o *TpObject, err error) {
if !enableUmp {
return
}
tp := o.UmpType.(*FunctionTp)
tp.ElapsedTime = strconv.FormatInt((int64)(time.Since(o.StartTime)/1e6), 10)
TpObjectPool.Put(o)
tp.ProcessState = "0"
if err != nil {
tp.ProcessState = "1"
}
select {
case FunctionTpLogWrite.logCh <- tp:
default:
}
}
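// exampleTP is an illustrative sketch, not part of the original source, of the
// tracking-point pattern: BeforeTP takes a TpObject from the pool and AfterTP
// reports the elapsed time and whether the call failed. The key "example_op"
// is an assumption for demonstration only.
func exampleTP(doWork func() error) {
tpObject := BeforeTP("example_op")
err := doWork()
AfterTP(tpObject, err)
}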
func Alive(key string) {
if !enableUmp {
return
}
alive := SystemAlivePool.Get().(*SystemAlive)
alive.HostName = HostName
alive.Key = key
alive.Time = time.Now().Format(LogTimeForMat)
select {
case SystemAliveLogWrite.logCh <- alive:
default:
}
}
func Alarm(key, detail string) {
if !enableUmp {
return
}
alarm := AlarmPool.Get().(*BusinessAlarm)
alarm.Time = time.Now().Format(LogTimeForMat)
alarm.Key = key
alarm.HostName = HostName
alarm.BusinessType = "0"
alarm.Value = "0"
alarm.Detail = detail
if len(alarm.Detail) > 512 {
rs := []rune(detail)
alarm.Detail = string(rs[0:510])
}
select {
case BusinessAlarmLogWrite.logCh <- alarm:
default:
}
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package ump
import (
"bytes"
"encoding/json"
"fmt"
"net"
"os"
"strings"
)
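// FunctionTp is the per-call latency record written to the tp log.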
type FunctionTp struct {
Time string `json:"time"`
Key string `json:"key"`
HostName string `json:"hostname"`
ProcessState string `json:"processState"`
ElapsedTime string `json:"elapsedTime"`
}
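// SystemAlive is the heartbeat record written to the alive log.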
type SystemAlive struct {
Key string `json:"key"`
HostName string `json:"hostname"`
Time string `json:"time"`
}
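// BusinessAlarm is the alarm record written to the business log.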
type BusinessAlarm struct {
Time string `json:"time"`
Key string `json:"key"`
HostName string `json:"hostname"`
BusinessType string `json:"type"`
Value string `json:"value"`
Detail string `json:"detail"`
}
const (
FunctionTpSufixx = "tp.log"
SystemAliveSufixx = "alive.log"
BusinessAlarmSufixx = "business.log"
LogFileOpt = os.O_RDWR | os.O_CREATE | os.O_APPEND
ChSize = 102400
BusinessAlarmType = "BusinessAlarm"
SystemAliveType = "SystemAlive"
FunctionTpType = "FunctionTp"
HostNameFile = "/proc/sys/kernel/hostname"
MaxLogSize = 1024 * 1024 * 10
)
var (
FunctionTpLogWrite = &LogWrite{logCh: make(chan interface{}, ChSize)}
SystemAliveLogWrite = &LogWrite{logCh: make(chan interface{}, ChSize)}
BusinessAlarmLogWrite = &LogWrite{logCh: make(chan interface{}, ChSize)}
UmpDataDir = "/export/home/tomcat/UMP-Monitor/logs/"
)
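// LogWrite drains one record channel, JSON-encodes the records and appends
// them to a dedicated log file, rotating the file once it exceeds MaxLogSize.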
type LogWrite struct {
logCh chan interface{}
logName string
logSize int64
seq int
logSufixx string
logFp *os.File
sigCh chan bool
bf *bytes.Buffer
jsonEncoder *json.Encoder
}
func (lw *LogWrite) initLogFp(sufixx string) (err error) {
var fi os.FileInfo
lw.seq = 0
lw.sigCh = make(chan bool, 1)
lw.logSufixx = sufixx
lw.logName = fmt.Sprintf("%s%s%s", UmpDataDir, "ump_", lw.logSufixx)
lw.bf = bytes.NewBuffer([]byte{})
lw.jsonEncoder = json.NewEncoder(lw.bf)
lw.jsonEncoder.SetEscapeHTML(false)
if lw.logFp, err = os.OpenFile(lw.logName, LogFileOpt, 0o666); err != nil {
return
}
if fi, err = lw.logFp.Stat(); err != nil {
return
}
lw.logSize = fi.Size()
return
}
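// backGroundCheckFile rotates the log file when it exceeds MaxLogSize,
// keeping at most three rotated files (suffixes .1 to .3) and reopening an
// empty file under the original name.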
func (lw *LogWrite) backGroundCheckFile() (err error) {
if lw.logSize <= MaxLogSize {
return
}
lw.logFp.Close()
lw.seq++
if lw.seq > 3 {
lw.seq = 1
}
name := fmt.Sprintf("%s%s%s.%d", UmpDataDir, "ump_", lw.logSufixx, lw.seq)
if _, err = os.Stat(name); err == nil {
os.Remove(name)
}
os.Rename(lw.logName, name)
if lw.logFp, err = os.OpenFile(lw.logName, LogFileOpt, 0o666); err != nil {
lw.seq--
return
}
if err = os.Truncate(lw.logName, 0); err != nil {
lw.seq--
return
}
lw.logSize = 0
return
}
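// backGroundWrite loops forever: it takes a record from the channel, encodes
// it as JSON according to umpType, returns it to its pool and appends the
// encoded bytes to the log file.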
func (lw *LogWrite) backGroundWrite(umpType string) {
for {
var body []byte
obj := <-lw.logCh
switch umpType {
case FunctionTpType:
tp := obj.(*FunctionTp)
lw.jsonEncoder.Encode(tp)
body = append(body, lw.bf.Bytes()...)
lw.bf.Reset()
FunctionTpPool.Put(tp)
case SystemAliveType:
alive := obj.(*SystemAlive)
lw.jsonEncoder.Encode(alive)
body = append(body, lw.bf.Bytes()...)
lw.bf.Reset()
SystemAlivePool.Put(alive)
case BusinessAlarmType:
alarm := obj.(*BusinessAlarm)
lw.jsonEncoder.Encode(alarm)
body = append(body, lw.bf.Bytes()...)
lw.bf.Reset()
AlarmPool.Put(alarm)
default:
// do nothing
}
if lw.backGroundCheckFile() != nil {
continue
}
body = append(body, []byte("\n")...)
lw.logFp.Write(body)
lw.logSize += (int64)(len(body))
}
}
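// initLogName resolves the ump data directory and the host identity, then
// opens the tp, alive and alarm log files for the given module.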
func initLogName(module, dataDir string) (err error) {
if dataDir != "" {
UmpDataDir = dataDir
if !strings.HasSuffix(UmpDataDir, "/") {
UmpDataDir += "/"
}
} else {
return fmt.Errorf("warnLogDir dir not config")
}
if err = os.MkdirAll(UmpDataDir, 0o755); err != nil {
return
}
if HostName, err = GetLocalIpAddr(); err != nil {
return
}
if err = FunctionTpLogWrite.initLogFp(module + "_" + FunctionTpSufixx); err != nil {
return
}
if err = SystemAliveLogWrite.initLogFp(module + "_" + SystemAliveSufixx); err != nil {
return
}
if err = BusinessAlarmLogWrite.initLogFp(module + "_" + BusinessAlarmSufixx); err != nil {
return
}
return
}
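// GetLocalIpAddr returns the first non-loopback IPv4 address of the host, or
// an error if none is found.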
func GetLocalIpAddr() (localAddr string, err error) {
addrs, err := net.InterfaceAddrs()
if err != nil {
return
}
for _, addr := range addrs {
if ipNet, isIpNet := addr.(*net.IPNet); isIpNet && !ipNet.IP.IsLoopback() {
if ipv4 := ipNet.IP.To4(); ipv4 != nil {
localAddr = ipv4.String()
return
}
}
}
err = fmt.Errorf("cannot get local ip")
return
}
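// backGroudWrite starts one background writer goroutine per record type.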
func backGroudWrite() {
go FunctionTpLogWrite.backGroundWrite(FunctionTpType)
go SystemAliveLogWrite.backGroundWrite(SystemAliveType)
go BusinessAlarmLogWrite.backGroundWrite(BusinessAlarmType)
}
// Copyright 2018 The CubeFS Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
package util
import (
"crypto/md5"
"encoding/hex"
"fmt"
"net"
"regexp"
"strings"
"github.com/cubefs/cubefs/depends/tiglabs/raft/util"
"github.com/cubefs/cubefs/util/log"
)
const (
_ = iota
KB = 1 << (10 * iota)
MB
GB
TB
PB
DefaultDataPartitionSize = 120 * GB
TaskWorkerInterval = 1
)
const (
BlockCount = 1024
BlockSize = 65536 * 2
ReadBlockSize = BlockSize
PerBlockCrcSize = 4
ExtentSize = BlockCount * BlockSize
PacketHeaderSize = 57
BlockHeaderSize = 4096
SyscallTryMaxTimes = 3
PacketHeaderVerSize = 65
)
const (
PageSize = 4 * util.KB
FallocFLKeepSize = 1
FallocFLPunchHole = 2
)
const (
AclListIP = 0
AclAddIP = 1
AclDelIP = 2
AclCheckIP = 3
)
const (
UidLimitList = 0
UidAddLimit = 1
UidDel = 2
UidGetLimit = 3
)
const (
DefaultTinySizeLimit = 1 * MB // TODO explain tiny extent?
)
type MultiVersionSeq uint64
func Min(a, b int) int {
if a > b {
return b
}
return a
}
func Max(a, b int) int {
if a > b {
return a
}
return b
}
// IsIPV4 reports whether val is an IPv4 address.
func IsIPV4(val interface{}) bool {
ip4Pattern := `((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)`
ip4 := regexpCompile(ip4Pattern)
return isMatch(ip4, val)
}
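// GetIp returns the ip part of an "ip:port" address, or the empty string if
// addr contains no port separator.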
func GetIp(addr string) (ip string) {
var arr []string
if arr = strings.Split(addr, ":"); len(arr) < 2 {
return
}
ip = strings.Trim(arr[0], " ")
return ip
}
func getIpAndPort(ipAddr string) (ip string, port string, success bool) {
success = false
arr := strings.Split(ipAddr, ":")
if len(arr) != 2 {
log.LogWarnf("action[GetIpAndPort] ipAddr[%v] invalid", ipAddr)
return
}
ip = strings.Trim(arr[0], " ")
port = strings.Trim(arr[1], " ")
success = true
return
}
func getDomainAndPort(domainAddr string) (domain string, port string, success bool) {
success = false
arr := strings.Split(domainAddr, ":")
if len(arr) != 2 {
log.LogWarnf("action[GetDomainAndPort] domainAddr[%v] invalid", domainAddr)
return
}
domain = strings.Trim(arr[0], " ")
port = strings.Trim(arr[1], " ")
success = true
return
}
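// IsIPV4Addr reports whether ipAddr is of the form "ipv4:port".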
func IsIPV4Addr(ipAddr string) bool {
ip, _, ok := getIpAndPort(ipAddr)
if !ok {
return false
}
return IsIPV4(ip)
}
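// ParseIpAddrToDomainAddr reverse-resolves the ip in an "ip:port" address and
// returns a comma-separated list of "domain:port" entries, or the empty
// string if the lookup fails.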
func ParseIpAddrToDomainAddr(ipAddr string) (domainAddr string) {
ip, port, ok := getIpAndPort(ipAddr)
if !ok {
return
}
domains, err := net.LookupAddr(ip)
if err != nil {
log.LogWarnf("action[ParseIpAddrToDomainAddr] failed, ipAddr[%v], ip[%v], err[%v]", ipAddr, ip, err)
return
}
for _, v := range domains {
domain := v
if domain[len(domain)-1] == '.' {
domain = domain[0 : len(domain)-1]
}
if len(domainAddr) != 0 {
domainAddr += ","
}
domainAddr += fmt.Sprintf("%s:%v", domain, port)
}
return
}
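// ParseAddrToIpAddr normalizes addr to "ip:port" form, resolving the host
// part when it is a domain name.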
func ParseAddrToIpAddr(addr string) (ipAddr string, success bool) {
success = true
if IsIPV4Addr(addr) {
ipAddr = addr
return
}
if parsedAddr, ok := ParseDomainAddrToIpAddr(addr); ok {
ipAddr = parsedAddr
return
}
success = false
return
}
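// ParseDomainAddrToIpAddr resolves a "domain:port" address to "ip:port"; it
// fails if the domain resolves to more than one distinct IP.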
func ParseDomainAddrToIpAddr(domainAddr string) (ipAddr string, success bool) {
success = false
domain, port, ok := getDomainAndPort(domainAddr)
if !ok {
return
}
ips, err := net.LookupHost(domain)
if err != nil {
log.LogWarnf("action[ParseDomainAddrToIpAddr] failed, domainAddr[%v], domain[%v], err[%v]",
domainAddr, domain, err)
return
}
if len(ips) == 0 {
log.LogWarnf("action[ParseDomainAddrToIpAddr] ips is null, domainAddr[%v], domain[%v]",
domainAddr, domain)
return
}
for i := 0; i < len(ips); i += 1 {
if ips[i] != ips[0] {
log.LogWarnf("action[ParseDomainAddrToIpAddr] the number of ips is not one,"+
"domainAddr[%v], domain[%v], ips[%v], err[%v]", domainAddr, domain, ips, err)
return
}
}
ipAddr = fmt.Sprintf("%s:%v", ips[0], port)
success = true
return
}
func regexpCompile(str string) *regexp.Regexp {
return regexp.MustCompile("^" + str + "$")
}
func isMatch(exp *regexp.Regexp, val interface{}) bool {
switch v := val.(type) {
case []rune:
return exp.MatchString(string(v))
case []byte:
return exp.Match(v)
case string:
return exp.MatchString(v)
default:
return false
}
}
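// GenerateKey builds a key of the form "volume_inode_offset" with the offset
// zero-padded to 16 hex digits.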
func GenerateKey(volName string, ino uint64, offset uint64) string {
return fmt.Sprintf("%v_%v_%016x", volName, ino, offset)
}
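// GenerateRepVolKey builds a key of the form
// "volume_inode_dataPartition_extent_offset" with the offset zero-padded to
// 16 hex digits.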
func GenerateRepVolKey(volName string, ino uint64, dpId uint64, extentId uint64, offset uint64) string {
return fmt.Sprintf("%v_%v_%v_%v_%016x", volName, ino, dpId, extentId, offset)
}
func OneDaySec() int64 {
return 60 * 60 * 24
}
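// CalcAuthKey returns the lower-case hex MD5 digest of key.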
func CalcAuthKey(key string) (authKey string) {
h := md5.New()
_, _ = h.Write([]byte(key))
cipherStr := h.Sum(nil)
return strings.ToLower(hex.EncodeToString(cipherStr))
}
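// A small illustrative sketch of the helpers above (the argument values are
// arbitrary examples, not defaults used anywhere in CubeFS):
//
//	key := GenerateKey("vol1", 8193, 0) // "vol1_8193_0000000000000000"
//	authKey := CalcAuthKey("owner")     // 32-character lower-case md5 hex
//	daySec := OneDaySec()               // 86400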